find haplotypes in alignment¶

This notebook works out deriving haplotypes from an alignment of NGS reads. First by finding variable sites in alignment, and then 'phasing' each read in the alignment, to see how SNPs are combined on each read. Then, the most numerous combinations are recorded.

from Bio import AlignIO
from Bio import SeqIO
#from StringIO import StringIO

ambigs = ['Y','K','M','R','W','S','D','V','H','B','N']
ambigDict = {'Y': ['C', 'T'], 'K':['G','T'],'M':['A','C'],'R':['A','G'],'W':['A','T'],'S':['C','G'],'H':['A','C','T'],'D':['A','G','T'],'V':['A','C','G'],'B':['C','G','T'],'N':['A','C','G','T']}

Trial with simple alignment¶

alnFile = '/Users/hughcross/OneDrive/AnatAnalysis/Paua/find_haplotypes/aln_dr10_Sample-10_7_1wRef.fasta'

aln1 = AlignIO.read(alnFile,'fasta')

len(aln1)

164

sampleNames = []
for record in aln1:
    samp = record.id
    sampleNames.append(samp)
print(len(sampleNames))
print(sampleNames[0])

164
Haplotype1

sampleNames[1]

'rep.1;size=7560'

len(aln1[2,:])

138

aln1.get_alignment_length()

138

aln1[0,-2]

't'

position1 = aln1[:,1]

position1

'tttttt-t-tttttttttttttttttt-ttttttttttttttttttttttttttttttttttt-ttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttttctttt-tttttt'

# make a dictionary of bases at each position don't worry about end gaps for now
# bases = {} # {rep1:[a-ttcgt],rep2:[a-ttctt]} or string
rep1 = str(aln1[1,:].seq)

rep1

'-taactgactcgtcccactaatattaggcgcaccagatatgagggacagggtgaacagttgattacataccctcctttgtctagtaacctctacacttagccggacctagaacggataccttcgccattctacttct-'

rep1name = aln1[1,:].id
print(rep1name)

rep.1;size=7560

sampleDict = {}
sampleCounts = {}
for samp in range(1,len(aln1)):
    sequence = str(aln1[samp,:].seq).upper()
    seqID = aln1[samp,:].id
    counts = int(seqID.split('size=')[1])
    sampleDict[seqID]=sequence
    sampleCounts[seqID]=counts

print(len(sampleDict))
print(len(sampleCounts))

163
163

sampleDict['rep.2;size=4406']#.upper()

'-TAACTGACTCGTCCCACTAATATTAGGCGCACCAGATATGAGGGACAGGATGAACAGTTGATTACATACCCTCCTTTATCTAGTAACCTCTACACTTAGCCGGACCTAGAACGGATACCTTCGCCATTCTACTTCT-'

sampleCounts['rep.2;size=4406']

4406

print(len(sampleNames))
sampleNames = sampleNames[1:]
print(len(sampleNames))

164
163

# now get raw counts for each 
posCounts = {}
for pos in range(0,138):
    baseDict = {}
    for sample in sampleNames:
        base = sampleDict[sample][pos]
        ct = sampleCounts[sample]
        baseDict.setdefault(base,[]).append(ct)
    totalCounts = {}
    for k,v in baseDict.items():
        total = sum(v)
        totalCounts[k]=total
    posCounts[pos]=totalCounts

print(len(posCounts))

138

posCounts[10]

{'C': 14445, 'T': 4212}

Find informative alleles¶

mmaf = 0.1

infAlleles = []
for k,v in posCounts.items():
    ctlist = list(v.values())
    topCount = max(ctlist)
    sumCounts = sum(ctlist)
    for key,value in v.items():
        if value != topCount:
            if (value/sumCounts) > mmaf:
                infAlleles.append(k)
print(len(infAlleles))

6

infAlleles

[10, 25, 50, 78, 98, 128]

for inf in infAlleles:
    print(inf, posCounts[inf])

10 {'C': 14445, 'T': 4212}
25 {'A': 16363, 'G': 2294}
50 {'G': 13708, 'A': 4949}
78 {'G': 13694, 'A': 4963}
98 {'A': 16736, 'G': 1921}
128 {'T': 16709, 'C': 1948}

Derive haplotypes from alignments¶

infBases = {}
for i in infAlleles:
    baseList = list(posCounts[i].keys())
    bases = set(baseList)
    infBases[i]=bases
print(infBases)

{10: {'T', 'C'}, 25: {'A', 'G'}, 50: {'A', 'G'}, 78: {'A', 'G'}, 98: {'A', 'G'}, 128: {'T', 'C'}}

haplotypes = {} # hap:count
for k,v in sampleDict.items():
    hapStr = ''
    for i in infAlleles:
        base = v[i]
        if base in infBases[i]:
            hapStr += base
        else:
            hapStr += '-'
    haplotypes.setdefault(hapStr, []).append(sampleCounts[k])
print(len(haplotypes))

11

hapTotals = {}
for k,v in haplotypes.items():
    total = sum(v)
    hapTotals[k]=total
print(hapTotals)

{'CAGGAT': 9410, 'CAAAAT': 4914, 'TGGGAT': 2280, 'TAGGGC': 1911, 'CAGAAT': 37, 'CAGGAC': 25, 'CAAGAT': 23, 'TAGGAT': 21, 'CGGGAT': 14, 'CAAAAC': 12, 'CAGGGT': 10}

haps = set(haplotypes) # find how many of each
print(haps)

{'TAGGAT', 'CAGAAT', 'CAAAAC', 'CAGGGT', 'TGGGAT', 'CAGGAC', 'CAGGAT', 'TAGGGC', 'CGGGAT', 'CAAAAT', 'CAAGAT'}

Plot variable positions¶

import matplotlib.pyplot as plt

fig=plt.figure(figsize=(20,10), dpi= 100, facecolor='w', edgecolor='k')

<Figure size 2000x1000 with 0 Axes>

# define x and y axes
x = list(posCounts.keys())
print(x[0])

0

# create y axis of A counts
yA = []
for k,v in posCounts.items():
    if 'A' in v:
        yA.append(v['A'])
    else:
        yA.append(0)
print(len(yA))

138

# create y axis of C counts
yC = []
#yG = []
#yT = []
#ygap = []
for k,v in posCounts.items():
    if 'C' in v:
        yC.append(v['C'])
    else:
        yC.append(0)
print(len(yC))

138

# create y axis of G counts
yG = []
#yT = []
#ygap = []
for k,v in posCounts.items():
    if 'G' in v:
        yG.append(v['G'])
    else:
        yG.append(0)
print(len(yG))

138

# create y axis of T counts
yT = []
#ygap = []
for k,v in posCounts.items():
    if 'T' in v:
        yT.append(v['T'])
    else:
        yT.append(0)
print(len(yT))

138

# create y axis of gap counts
ygap = []
for k,v in posCounts.items():
    if '-' in v:
        ygap.append(v['-'])
    else:
        ygap.append(0)
print(len(ygap))

138

'Chromatogram plot: to show the variable sites as shorter bands¶

plt.plot(x[:20],yA[:20])
plt.plot(x[:20],yC[:20])
plt.plot(x[:20],yG[:20])
plt.plot(x[:20],yT[:20])
plt.xlabel('position')
plt.ylabel('counts')
plt.rcParams['figure.figsize'] = [24, 12]
plt.rcParams['figure.dpi'] = 200 # 200 e.g. is really fine, but slower
plt.legend()
plt.show()

No handles with labels found to put in legend.

scatter plots¶

pltA = plt.bar(x[:40],yA[:40])
plt.bar(x[:40],yC[:40])
plt.bar(x[:40],yG[:40])
plt.bar(x[:40],yT[:40])
plt.xlabel('position')
plt.ylabel('counts')
plt.rcParams['figure.figsize'] = [24, 12]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower
plt.legend()
plt.show()

No handles with labels found to put in legend.

import numpy as np

from matplotlib import rc

yA30 = yA[:30]
yG30 = yG[:30]
yC30 = yC[:30]
yT30 = yT[:30]

x30 = x[:30]

# y-axis in bold
rc('font', weight='bold')
# Heights of bars1 + bars2
ac = np.add(yA30, yC30).tolist()
acg = np.add(ac, yG30).tolist()
# set barwidth
barWidth=1
# Create brown bars
plt.bar(x30, yA30, color='blue', edgecolor='white', width=barWidth)
# Create green bars (middle), on top of the firs ones
plt.bar(x30, yC30, bottom=yA30, color='red', edgecolor='white', width=barWidth)
# Create green bars (second to top)
plt.bar(x30, yG30, bottom=ac, color='green', edgecolor='white', width=barWidth)
# Create red bars
plt.bar(x30, yT30, bottom=acg, color='orange', edgecolor='white', width=barWidth)
##

# Show graphic
plt.show()