find haplotypes in alignment

This notebook works out deriving haplotypes from an alignment of NGS reads. First by finding variable sites in alignment, and then 'phasing' each read in the alignment, to see how SNPs are combined on each read. Then, the most numerous combinations are recorded.

In [1]:
from Bio import AlignIO
from Bio import SeqIO
#from StringIO import StringIO
In [2]:
ambigs = ['Y','K','M','R','W','S','D','V','H','B','N']
ambigDict = {'Y': ['C', 'T'], 'K':['G','T'],'M':['A','C'],'R':['A','G'],'W':['A','T'],'S':['C','G'],'H':['A','C','T'],'D':['A','G','T'],'V':['A','C','G'],'B':['C','G','T'],'N':['A','C','G','T']}

Trial with simple alignment

In [3]:
alnFile = '/Users/hughcross/OneDrive/AnatAnalysis/Paua/find_haplotypes/aln_dr10_Sample-10_7_1wRef.fasta'
In [4]:
aln1 = AlignIO.read(alnFile,'fasta')
In [5]:
len(aln1)
Out[5]:
164
In [6]:
sampleNames = []
for record in aln1:
    samp = record.id
    sampleNames.append(samp)
print(len(sampleNames))
print(sampleNames[0])
164
Haplotype1
In [7]:
sampleNames[1]
Out[7]:
'rep.1;size=7560'
In [8]:
len(aln1[2,:])
Out[8]:
138
In [9]:
aln1.get_alignment_length()
Out[9]:
138
In [10]:
aln1[0,-2]
Out[10]:
't'
In [11]:
position1 = aln1[:,1]
In [12]:
position1
Out[12]:
'tttttt-t-tttttttttttttttttt-ttttttttttttttttttttttttttttttttttt-ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttctttt-tttttt'
In [13]:
# make a dictionary of bases at each position don't worry about end gaps for now
# bases = {} # {rep1:[a-ttcgt],rep2:[a-ttctt]} or string
rep1 = str(aln1[1,:].seq)
In [14]:
rep1
Out[14]:
'-taactgactcgtcccactaatattaggcgcaccagatatgagggacagggtgaacagttgattacataccctcctttgtctagtaacctctacacttagccggacctagaacggataccttcgccattctacttct-'
In [15]:
rep1name = aln1[1,:].id
print(rep1name)
rep.1;size=7560
In [16]:
sampleDict = {}
sampleCounts = {}
for samp in range(1,len(aln1)):
    sequence = str(aln1[samp,:].seq).upper()
    seqID = aln1[samp,:].id
    counts = int(seqID.split('size=')[1])
    sampleDict[seqID]=sequence
    sampleCounts[seqID]=counts

print(len(sampleDict))
print(len(sampleCounts))
163
163
In [17]:
sampleDict['rep.2;size=4406']#.upper()
Out[17]:
'-TAACTGACTCGTCCCACTAATATTAGGCGCACCAGATATGAGGGACAGGATGAACAGTTGATTACATACCCTCCTTTATCTAGTAACCTCTACACTTAGCCGGACCTAGAACGGATACCTTCGCCATTCTACTTCT-'
In [18]:
sampleCounts['rep.2;size=4406']
Out[18]:
4406
In [19]:
print(len(sampleNames))
sampleNames = sampleNames[1:]
print(len(sampleNames))
164
163
In [20]:
# now get raw counts for each 
posCounts = {}
for pos in range(0,138):
    baseDict = {}
    for sample in sampleNames:
        base = sampleDict[sample][pos]
        ct = sampleCounts[sample]
        baseDict.setdefault(base,[]).append(ct)
    totalCounts = {}
    for k,v in baseDict.items():
        total = sum(v)
        totalCounts[k]=total
    posCounts[pos]=totalCounts

print(len(posCounts))
138
In [21]:
posCounts[10]
Out[21]:
{'C': 14445, 'T': 4212}

Find informative alleles

In [22]:
mmaf = 0.1
In [23]:
infAlleles = []
for k,v in posCounts.items():
    ctlist = list(v.values())
    topCount = max(ctlist)
    sumCounts = sum(ctlist)
    for key,value in v.items():
        if value != topCount:
            if (value/sumCounts) > mmaf:
                infAlleles.append(k)
print(len(infAlleles))
6
In [24]:
infAlleles
Out[24]:
[10, 25, 50, 78, 98, 128]
In [25]:
for inf in infAlleles:
    print(inf, posCounts[inf])
10 {'C': 14445, 'T': 4212}
25 {'A': 16363, 'G': 2294}
50 {'G': 13708, 'A': 4949}
78 {'G': 13694, 'A': 4963}
98 {'A': 16736, 'G': 1921}
128 {'T': 16709, 'C': 1948}

Derive haplotypes from alignments

In [26]:
infBases = {}
for i in infAlleles:
    baseList = list(posCounts[i].keys())
    bases = set(baseList)
    infBases[i]=bases
print(infBases)
{10: {'T', 'C'}, 25: {'A', 'G'}, 50: {'A', 'G'}, 78: {'A', 'G'}, 98: {'A', 'G'}, 128: {'T', 'C'}}
In [27]:
haplotypes = {} # hap:count
for k,v in sampleDict.items():
    hapStr = ''
    for i in infAlleles:
        base = v[i]
        if base in infBases[i]:
            hapStr += base
        else:
            hapStr += '-'
    haplotypes.setdefault(hapStr, []).append(sampleCounts[k])
print(len(haplotypes))
11
In [28]:
hapTotals = {}
for k,v in haplotypes.items():
    total = sum(v)
    hapTotals[k]=total
print(hapTotals)
{'CAGGAT': 9410, 'CAAAAT': 4914, 'TGGGAT': 2280, 'TAGGGC': 1911, 'CAGAAT': 37, 'CAGGAC': 25, 'CAAGAT': 23, 'TAGGAT': 21, 'CGGGAT': 14, 'CAAAAC': 12, 'CAGGGT': 10}
In [29]:
haps = set(haplotypes) # find how many of each
print(haps)
{'TAGGAT', 'CAGAAT', 'CAAAAC', 'CAGGGT', 'TGGGAT', 'CAGGAC', 'CAGGAT', 'TAGGGC', 'CGGGAT', 'CAAAAT', 'CAAGAT'}

Plot variable positions

In [30]:
import matplotlib.pyplot as plt
In [31]:
fig=plt.figure(figsize=(20,10), dpi= 100, facecolor='w', edgecolor='k')
<Figure size 2000x1000 with 0 Axes>
In [32]:
# define x and y axes
x = list(posCounts.keys())
print(x[0])
0
In [33]:
# create y axis of A counts
yA = []
for k,v in posCounts.items():
    if 'A' in v:
        yA.append(v['A'])
    else:
        yA.append(0)
print(len(yA))
138
In [34]:
# create y axis of C counts
yC = []
#yG = []
#yT = []
#ygap = []
for k,v in posCounts.items():
    if 'C' in v:
        yC.append(v['C'])
    else:
        yC.append(0)
print(len(yC))
138
In [35]:
# create y axis of G counts
yG = []
#yT = []
#ygap = []
for k,v in posCounts.items():
    if 'G' in v:
        yG.append(v['G'])
    else:
        yG.append(0)
print(len(yG))
138
In [36]:
# create y axis of T counts
yT = []
#ygap = []
for k,v in posCounts.items():
    if 'T' in v:
        yT.append(v['T'])
    else:
        yT.append(0)
print(len(yT))
138
In [37]:
# create y axis of gap counts
ygap = []
for k,v in posCounts.items():
    if '-' in v:
        ygap.append(v['-'])
    else:
        ygap.append(0)
print(len(ygap))
138

'Chromatogram plot: to show the variable sites as shorter bands

In [38]:
plt.plot(x[:20],yA[:20])
plt.plot(x[:20],yC[:20])
plt.plot(x[:20],yG[:20])
plt.plot(x[:20],yT[:20])
plt.xlabel('position')
plt.ylabel('counts')
plt.rcParams['figure.figsize'] = [24, 12]
plt.rcParams['figure.dpi'] = 200 # 200 e.g. is really fine, but slower
plt.legend()
plt.show()
No handles with labels found to put in legend.

scatter plots

In [39]:
pltA = plt.bar(x[:40],yA[:40])
plt.bar(x[:40],yC[:40])
plt.bar(x[:40],yG[:40])
plt.bar(x[:40],yT[:40])
plt.xlabel('position')
plt.ylabel('counts')
plt.rcParams['figure.figsize'] = [24, 12]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower
plt.legend()
plt.show()
No handles with labels found to put in legend.
In [40]:
import numpy as np
In [41]:
from matplotlib import rc
In [42]:
yA30 = yA[:30]
yG30 = yG[:30]
yC30 = yC[:30]
yT30 = yT[:30]
In [43]:
x30 = x[:30]
In [44]:
# y-axis in bold
rc('font', weight='bold')
# Heights of bars1 + bars2
ac = np.add(yA30, yC30).tolist()
acg = np.add(ac, yG30).tolist()
# set barwidth
barWidth=1
# Create brown bars
plt.bar(x30, yA30, color='blue', edgecolor='white', width=barWidth)
# Create green bars (middle), on top of the firs ones
plt.bar(x30, yC30, bottom=yA30, color='red', edgecolor='white', width=barWidth)
# Create green bars (second to top)
plt.bar(x30, yG30, bottom=ac, color='green', edgecolor='white', width=barWidth)
# Create red bars
plt.bar(x30, yT30, bottom=acg, color='orange', edgecolor='white', width=barWidth)
##

# Show graphic
plt.show()