import pysam
from allele_functions.cigar_funcs import *
hapFileName = '/Users/hughcross/Analysis/repos/meta_tools/example_haplotypes.txt'
posFileName = '/Users/hughcross/Analysis/repos/meta_tools/example_position_list.txt'
bamFileName = '/Users/hughcross/Analysis/Paua/sort_allBucks.bam'
refName = 'CCB11_Haplotype1'
bamNameNoPath = bamFileName.split('/')[-1]
sampleName = bamNameNoPath.split('.')[0]
print(sampleName)
logFileName = sampleName+'.log'
print(logFileName)
hapFile = open(hapFileName, 'r')
hapDict = {}
hList = []
for line in hapFile:
line = line.strip('\n')
parts = line.split('\t')
hapDict[parts[1]]=parts[0]
hList.append(parts[0])
hapFile.close()
print(len(hapDict))
print(hapDict)
print(hList)
posFile = open(posFileName, 'r')
posList = []
for line in posFile:
line = line.strip('\n')
posList.append(int(line))
posFile.close()
positions = tuple(posList)
print(posList)
print(positions)
bamfileE = pysam.AlignmentFile(bamFileName, "rb")
hap1 = bamfileE.fetch(refName)
# more complicated
cigD = {}
seqD = {}
for c in hap1:
qy = c.query_name
cig = c.cigarstring
cigRef = cigar_ref(cig)
# make seqdict with quality
seq = c.query_sequence
qual = c.query_qualities
seqD.setdefault(qy, {})['seq']=seq
seqD.setdefault(qy, {})['qual']=qual
# get haplotype for each
hapStr = ''
for p in positions:
posi = posi_finder(cigRef, p)
if len(seq) > posi:
base = seq[posi]
hapStr = hapStr + base
cigD[qy]=hapStr
print(len(cigD))
print(len(seqD))
hapLists = {}
notFound = []
for k,v in cigD.items():
if v in hapDict:
htype = hapDict[v]
hapLists.setdefault(htype, []).append(k)
else:
notFound.append(k)
print(len(notFound))
# for fasta file
for hap in hList:
newFileName = sampleName+'_haplotype'+hap+'.fasta'
output = open(newFileName, 'w')
readlist = hapLists[hap]
for read in readlist:
output.write('>'+read+'\n'+seqD[read]['seq']+'\n')
output.close()
# for fastq file
for hap in hList:
newFileName = sampleName+'_haplotype'+hap+'.fasta'
output = open(newFileName, 'w')
readlist = hapLists[hap]
for read in readlist:
output.write('@'+read+'\n'+seqD[read]['seq']+'\n+\n'+seqD[read]['qual']+'\n')
output.close()
logout = open(logFileName,'w')
logout.write('Haplotype\tNumber of reads\n')
for h in hList:
numReads = str(len(hapLists[h]))
logout.write(h+'\t'+numReads+'\n')
logout.close()