This notebook works out deriving haplotypes from an alignment of NGS reads. First by finding variable sites in alignment, and then 'phasing' each read in the alignment, to see how SNPs are combined on each read. Then, the most numerous combinations are recorded.
from Bio import AlignIO
from Bio import SeqIO
#from StringIO import StringIO
ambigs = ['Y','K','M','R','W','S','D','V','H','B','N']
ambigDict = {'Y': ['C', 'T'], 'K':['G','T'],'M':['A','C'],'R':['A','G'],'W':['A','T'],'S':['C','G'],'H':['A','C','T'],'D':['A','G','T'],'V':['A','C','G'],'B':['C','G','T'],'N':['A','C','G','T']}
alnFile = '/Users/hughcross/OneDrive/AnatAnalysis/Paua/find_haplotypes/aln_dr10_Sample-10_7_1wRef.fasta'
aln1 = AlignIO.read(alnFile,'fasta')
len(aln1)
sampleNames = []
for record in aln1:
samp = record.id
sampleNames.append(samp)
print(len(sampleNames))
print(sampleNames[0])
sampleNames[1]
len(aln1[2,:])
aln1.get_alignment_length()
aln1[0,-2]
position1 = aln1[:,1]
position1
# make a dictionary of bases at each position don't worry about end gaps for now
# bases = {} # {rep1:[a-ttcgt],rep2:[a-ttctt]} or string
rep1 = str(aln1[1,:].seq)
rep1
rep1name = aln1[1,:].id
print(rep1name)
sampleDict = {}
sampleCounts = {}
for samp in range(1,len(aln1)):
sequence = str(aln1[samp,:].seq).upper()
seqID = aln1[samp,:].id
counts = int(seqID.split('size=')[1])
sampleDict[seqID]=sequence
sampleCounts[seqID]=counts
print(len(sampleDict))
print(len(sampleCounts))
sampleDict['rep.2;size=4406']#.upper()
sampleCounts['rep.2;size=4406']
print(len(sampleNames))
sampleNames = sampleNames[1:]
print(len(sampleNames))
# now get raw counts for each
posCounts = {}
for pos in range(0,138):
baseDict = {}
for sample in sampleNames:
base = sampleDict[sample][pos]
ct = sampleCounts[sample]
baseDict.setdefault(base,[]).append(ct)
totalCounts = {}
for k,v in baseDict.items():
total = sum(v)
totalCounts[k]=total
posCounts[pos]=totalCounts
print(len(posCounts))
posCounts[10]
mmaf = 0.1
infAlleles = []
for k,v in posCounts.items():
ctlist = list(v.values())
topCount = max(ctlist)
sumCounts = sum(ctlist)
for key,value in v.items():
if value != topCount:
if (value/sumCounts) > mmaf:
infAlleles.append(k)
print(len(infAlleles))
infAlleles
for inf in infAlleles:
print(inf, posCounts[inf])
infBases = {}
for i in infAlleles:
baseList = list(posCounts[i].keys())
bases = set(baseList)
infBases[i]=bases
print(infBases)
haplotypes = {} # hap:count
for k,v in sampleDict.items():
hapStr = ''
for i in infAlleles:
base = v[i]
if base in infBases[i]:
hapStr += base
else:
hapStr += '-'
haplotypes.setdefault(hapStr, []).append(sampleCounts[k])
print(len(haplotypes))
hapTotals = {}
for k,v in haplotypes.items():
total = sum(v)
hapTotals[k]=total
print(hapTotals)
haps = set(haplotypes) # find how many of each
print(haps)
import matplotlib.pyplot as plt
fig=plt.figure(figsize=(20,10), dpi= 100, facecolor='w', edgecolor='k')
# define x and y axes
x = list(posCounts.keys())
print(x[0])
# create y axis of A counts
yA = []
for k,v in posCounts.items():
if 'A' in v:
yA.append(v['A'])
else:
yA.append(0)
print(len(yA))
# create y axis of C counts
yC = []
#yG = []
#yT = []
#ygap = []
for k,v in posCounts.items():
if 'C' in v:
yC.append(v['C'])
else:
yC.append(0)
print(len(yC))
# create y axis of G counts
yG = []
#yT = []
#ygap = []
for k,v in posCounts.items():
if 'G' in v:
yG.append(v['G'])
else:
yG.append(0)
print(len(yG))
# create y axis of T counts
yT = []
#ygap = []
for k,v in posCounts.items():
if 'T' in v:
yT.append(v['T'])
else:
yT.append(0)
print(len(yT))
# create y axis of gap counts
ygap = []
for k,v in posCounts.items():
if '-' in v:
ygap.append(v['-'])
else:
ygap.append(0)
print(len(ygap))
plt.plot(x[:20],yA[:20])
plt.plot(x[:20],yC[:20])
plt.plot(x[:20],yG[:20])
plt.plot(x[:20],yT[:20])
plt.xlabel('position')
plt.ylabel('counts')
plt.rcParams['figure.figsize'] = [24, 12]
plt.rcParams['figure.dpi'] = 200 # 200 e.g. is really fine, but slower
plt.legend()
plt.show()
pltA = plt.bar(x[:40],yA[:40])
plt.bar(x[:40],yC[:40])
plt.bar(x[:40],yG[:40])
plt.bar(x[:40],yT[:40])
plt.xlabel('position')
plt.ylabel('counts')
plt.rcParams['figure.figsize'] = [24, 12]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower
plt.legend()
plt.show()
import numpy as np
from matplotlib import rc
yA30 = yA[:30]
yG30 = yG[:30]
yC30 = yC[:30]
yT30 = yT[:30]
x30 = x[:30]
# y-axis in bold
rc('font', weight='bold')
# Heights of bars1 + bars2
ac = np.add(yA30, yC30).tolist()
acg = np.add(ac, yG30).tolist()
# set barwidth
barWidth=1
# Create brown bars
plt.bar(x30, yA30, color='blue', edgecolor='white', width=barWidth)
# Create green bars (middle), on top of the firs ones
plt.bar(x30, yC30, bottom=yA30, color='red', edgecolor='white', width=barWidth)
# Create green bars (second to top)
plt.bar(x30, yG30, bottom=ac, color='green', edgecolor='white', width=barWidth)
# Create red bars
plt.bar(x30, yT30, bottom=acg, color='orange', edgecolor='white', width=barWidth)
##
# Show graphic
plt.show()