Extract Haplotype sequences from bam file¶

import pysam

from allele_functions.cigar_funcs import *

Get variables needed (use Argparse)¶

hapFileName = '/Users/hughcross/Analysis/repos/meta_tools/example_haplotypes.txt'
posFileName = '/Users/hughcross/Analysis/repos/meta_tools/example_position_list.txt'
bamFileName = '/Users/hughcross/Analysis/Paua/sort_allBucks.bam'
refName = 'CCB11_Haplotype1'

bamNameNoPath = bamFileName.split('/')[-1]
sampleName = bamNameNoPath.split('.')[0]
print(sampleName)

sort_allBucks

logFileName = sampleName+'.log'
print(logFileName)

sort_allBucks.log

Get dict of haplotypes and list of positions from files¶

hapFile = open(hapFileName, 'r')

hapDict = {}
hList = []
for line in hapFile:
    line = line.strip('\n')
    parts = line.split('\t')
    hapDict[parts[1]]=parts[0]
    hList.append(parts[0])
hapFile.close()
print(len(hapDict))
print(hapDict)
print(hList)

7
{'CAGGAGT': '1', 'TAGGGGT': '2', 'CAGGAAT': '3', 'CAAAAGT': '4', 'TGGGAGT': '5', 'CAAGAGT': '6', 'TAGGGGC': '7'}
['1', '2', '3', '4', '5', '6', '7']

posFile = open(posFileName, 'r')

posList = []
for line in posFile:
    line = line.strip('\n')
    posList.append(int(line))
posFile.close()
positions = tuple(posList)
print(posList)
print(positions)

[10, 25, 139, 160, 220, 304, 343]
(10, 25, 139, 160, 220, 304, 343)

Use pysam to parse the file¶

bamfileE = pysam.AlignmentFile(bamFileName, "rb")

hap1 = bamfileE.fetch(refName)

# more complicated
cigD = {}
seqD = {}
for c in hap1:
    qy = c.query_name
    cig = c.cigarstring
    cigRef = cigar_ref(cig)
    # make seqdict with quality
    seq = c.query_sequence
    qual = c.query_qualities
    seqD.setdefault(qy, {})['seq']=seq
    seqD.setdefault(qy, {})['qual']=qual
    # get haplotype for each
    hapStr = ''
    for p in positions:
        posi = posi_finder(cigRef, p)
        if len(seq) > posi:
            base = seq[posi]
            hapStr = hapStr + base
    cigD[qy]=hapStr
print(len(cigD))
print(len(seqD))

/Users/hughcross/anaconda/envs/py3/lib/python3.6/re.py:212: FutureWarning: split() requires a non-empty pattern match.
  return _compile(pattern, flags).split(string, maxsplit)

157698
157698

hapLists = {}
notFound = []
for k,v in cigD.items():
    if v in hapDict:
        htype = hapDict[v]
        hapLists.setdefault(htype, []).append(k)
    else:
        notFound.append(k)
        
print(len(notFound))

13531

Output to sequence files¶

# for fasta file
for hap in hList:
    newFileName = sampleName+'_haplotype'+hap+'.fasta'
    output = open(newFileName, 'w')
    readlist = hapLists[hap]
    for read in readlist:
        output.write('>'+read+'\n'+seqD[read]['seq']+'\n')
    output.close()

# for fastq file
for hap in hList:
    newFileName = sampleName+'_haplotype'+hap+'.fasta'
    output = open(newFileName, 'w')
    readlist = hapLists[hap]
    for read in readlist:
        output.write('@'+read+'\n'+seqD[read]['seq']+'\n+\n'+seqD[read]['qual']+'\n')
    output.close()

Output logfile¶

logout = open(logFileName,'w')

logout.write('Haplotype\tNumber of reads\n')

26

for h in hList:
    numReads = str(len(hapLists[h]))
    logout.write(h+'\t'+numReads+'\n')
logout.close()