Extract Haplotype sequences from bam file

In [1]:
import pysam
In [3]:
from allele_functions.cigar_funcs import *

Get variables needed (use Argparse)

In [4]:
hapFileName = '/Users/hughcross/Analysis/repos/meta_tools/example_haplotypes.txt'
posFileName = '/Users/hughcross/Analysis/repos/meta_tools/example_position_list.txt'
bamFileName = '/Users/hughcross/Analysis/Paua/sort_allBucks.bam'
refName = 'CCB11_Haplotype1'
In [5]:
bamNameNoPath = bamFileName.split('/')[-1]
sampleName = bamNameNoPath.split('.')[0]
print(sampleName)
sort_allBucks
In [6]:
logFileName = sampleName+'.log'
print(logFileName)
sort_allBucks.log

Get dict of haplotypes and list of positions from files

In [7]:
hapFile = open(hapFileName, 'r')
In [8]:
hapDict = {}
hList = []
for line in hapFile:
    line = line.strip('\n')
    parts = line.split('\t')
    hapDict[parts[1]]=parts[0]
    hList.append(parts[0])
hapFile.close()
print(len(hapDict))
print(hapDict)
print(hList)
7
{'CAGGAGT': '1', 'TAGGGGT': '2', 'CAGGAAT': '3', 'CAAAAGT': '4', 'TGGGAGT': '5', 'CAAGAGT': '6', 'TAGGGGC': '7'}
['1', '2', '3', '4', '5', '6', '7']
In [14]:
posFile = open(posFileName, 'r')
In [15]:
posList = []
for line in posFile:
    line = line.strip('\n')
    posList.append(int(line))
posFile.close()
positions = tuple(posList)
print(posList)
print(positions)
[10, 25, 139, 160, 220, 304, 343]
(10, 25, 139, 160, 220, 304, 343)

Use pysam to parse the file

In [16]:
bamfileE = pysam.AlignmentFile(bamFileName, "rb")
In [17]:
hap1 = bamfileE.fetch(refName)
In [18]:
# more complicated
cigD = {}
seqD = {}
for c in hap1:
    qy = c.query_name
    cig = c.cigarstring
    cigRef = cigar_ref(cig)
    # make seqdict with quality
    seq = c.query_sequence
    qual = c.query_qualities
    seqD.setdefault(qy, {})['seq']=seq
    seqD.setdefault(qy, {})['qual']=qual
    # get haplotype for each
    hapStr = ''
    for p in positions:
        posi = posi_finder(cigRef, p)
        if len(seq) > posi:
            base = seq[posi]
            hapStr = hapStr + base
    cigD[qy]=hapStr
print(len(cigD))
print(len(seqD))
/Users/hughcross/anaconda/envs/py3/lib/python3.6/re.py:212: FutureWarning: split() requires a non-empty pattern match.
  return _compile(pattern, flags).split(string, maxsplit)
157698
157698
In [19]:
hapLists = {}
notFound = []
for k,v in cigD.items():
    if v in hapDict:
        htype = hapDict[v]
        hapLists.setdefault(htype, []).append(k)
    else:
        notFound.append(k)
        
print(len(notFound))
13531

Output to sequence files

In [20]:
# for fasta file
for hap in hList:
    newFileName = sampleName+'_haplotype'+hap+'.fasta'
    output = open(newFileName, 'w')
    readlist = hapLists[hap]
    for read in readlist:
        output.write('>'+read+'\n'+seqD[read]['seq']+'\n')
    output.close()
In [20]:
# for fastq file
for hap in hList:
    newFileName = sampleName+'_haplotype'+hap+'.fasta'
    output = open(newFileName, 'w')
    readlist = hapLists[hap]
    for read in readlist:
        output.write('@'+read+'\n'+seqD[read]['seq']+'\n+\n'+seqD[read]['qual']+'\n')
    output.close()

Output logfile

In [21]:
logout = open(logFileName,'w')
In [22]:
logout.write('Haplotype\tNumber of reads\n')
Out[22]:
26
In [23]:
for h in hList:
    numReads = str(len(hapLists[h]))
    logout.write(h+'\t'+numReads+'\n')
logout.close()
In [ ]: