Finding haplotypes from alignment

In [1]:
from Bio import AlignIO
from Bio import SeqIO
#from StringIO import StringIO

make functions

In [2]:
def sample_info(alignment):
    sampleDict = {}
    sampleCounts = {}
    sampleNames = []
    for samp in range(1,len(alignment)):
        sequence = str(alignment[samp,:].seq).upper()
        seqID = alignment[samp,:].id
        sampleNames.append(seqID)
        counts = int(seqID.split('size=')[1])
        sampleDict[seqID]=sequence
        sampleCounts[seqID]=counts
    return (sampleDict, sampleCounts, sampleNames)
In [3]:
def pos_counts(sampleDict, sampleCounts, sampleNames, alnLength):
    posCounts = {}
    for pos in range(0,alnLength):
        baseDict = {}
        for sample in sampleNames:
            base = sampleDict[sample][pos]
            ct = sampleCounts[sample]
            baseDict.setdefault(base,[]).append(ct)
        totalCounts = {}
        for k,v in baseDict.items():
            total = sum(v)
            totalCounts[k]=total
        posCounts[pos]=totalCounts
    return posCounts
In [4]:
def informative_alleles(posCounts, mmaf=0.1):
    infAlleles = []
    for k,v in posCounts.items():
        ctlist = list(v.values())
        topCount = max(ctlist)
        sumCounts = sum(ctlist)
        for key,value in v.items():
            if value != topCount:
                if (value/sumCounts) > mmaf:
                    infAlleles.append(k)
    return infAlleles
In [5]:
def informative_bases(infAlleles, posCounts):
    infBases = {}
    for i in infAlleles:
        baseList = list(posCounts[i].keys())
        bases = set(baseList)
        infBases[i]=bases
    return infBases
In [6]:
def raw_haplotypes(sampleDict, sampleCounts, infAlleles, infBases):
    haplotypes = {} # hap:count
    for k,v in sampleDict.items():
        hapStr = ''
        for i in infAlleles:
            base = v[i]
            if base in infBases[i]:
                hapStr += base
            else:
                hapStr += '-'
        haplotypes.setdefault(hapStr, []).append(sampleCounts[k])
    return haplotypes
In [7]:
def major_haplotypes(haplotypes, mhf=0.1):
    """mhf == major haplotype frequency"""
    hapTotals = {}
    for k,v in haplotypes.items():
        total = sum(v)
        hapTotals[k]=total
    totalFreq = sum(hapTotals.values())
    major_haps = []
    for key,value in hapTotals.items():
        if value/totalFreq >= mhf:
            major_haps.append(key)
    return major_haps

Test functions

In [8]:
alnFile = '/Users/hughcross/OneDrive/AnatAnalysis/Paua/find_haplotypes/aln_dr10_Sample-10_7_1wRef.fasta'
In [9]:
aln1 = AlignIO.read(alnFile,'fasta')
In [10]:
len(aln1)
Out[10]:
164
In [11]:
aln1.get_alignment_length()
Out[11]:
138
In [12]:
aln1_info = sample_info(aln1)
In [13]:
sampleNames = aln1_info[2]
sampleNames[0]
Out[13]:
'rep.1;size=7560'
In [14]:
aln1_pos_counts = pos_counts(aln1_info[0], aln1_info[1], aln1_info[2], 138)
In [15]:
aln1_pos_counts[1]
Out[15]:
{'T': 18485, '-': 162, 'C': 10}
In [16]:
aln1_infA = informative_alleles(aln1_pos_counts)
In [17]:
aln1_infA
Out[17]:
[10, 25, 50, 78, 98, 128]
In [18]:
aln1_infB = informative_bases(aln1_infA, aln1_pos_counts)
In [19]:
aln1_infB
Out[19]:
{10: {'C', 'T'},
 25: {'A', 'G'},
 50: {'A', 'G'},
 78: {'A', 'G'},
 98: {'A', 'G'},
 128: {'C', 'T'}}
In [20]:
aln1_rh = raw_haplotypes(aln1_info[0], aln1_info[1], aln1_infA, aln1_infB)
In [21]:
len(aln1_rh['CAGGAT'])
Out[21]:
108
In [22]:
aln1_mh = major_haplotypes(aln1_rh)
In [23]:
aln1_mh
Out[23]:
['CAGGAT', 'CAAAAT', 'TGGGAT', 'TAGGGC']

Output files for extracting haplotypes

In [24]:
def output_files(prefix, major_haplotypes, informative_alleles):
    positionFileName = prefix+'_positions.txt'
    haplotypeFileName = prefix+'_haplotypes.txt'
    posout = open(positionFileName, 'w')
    hapout = open(haplotypeFileName, 'w')
    counter = 1
    last_hap = major_haplotypes[-1]
    last_pos = informative_alleles[-1]
    for hap in major_haplotypes:
        if hap == last_hap:
            hapout.write(str(counter)+'\t'+hap)
        else:
            hapout.write(str(counter)+'\t'+hap+'\n')
            counter += 1
    for p in informative_alleles:
        if p == last_pos:
            posout.write(str(p+1))
        else:
            posout.write(str(p+1)+'\n')
    posout.close()
    hapout.close()
In [25]:
output_files('test1', aln1_mh, aln1_infA)

add global alignment adjustment

In [26]:
for p in range(0,138):
    if aln1[0,p] == '-':
        continue
    else:
        startpoint = p
        break
print(startpoint)
1
In [27]:
for e in range(1,138):
    if aln1[0,-e] == '-':
        continue
    else:
        endpoint = e-1
        break
print(endpoint)
1
In [28]:
alng = aln1[:,1:-1]
In [29]:
alng.get_alignment_length()
Out[29]:
136
In [30]:
alng[1,0]
Out[30]:
't'
In [31]:
def global_adjust(alignment):
    """this assumes that the reference is the first sequence"""
    alnLength = alignment.get_alignment_length()
    for p in range(0,alnLength):
        if alignment[0,p] == '-':
            continue
        else:
            startpoint = p
            break
    for e in range(1,alnLength):
        if alignment[0,-e] == '-':
            continue
        else:
            endpoint = e-1
            break
    new_alignment = alignment[:,startpoint:-endpoint]
    return new_alignment
In [32]:
aln2 = global_adjust(aln1)
In [33]:
aln2.get_alignment_length()
Out[33]:
136
In [34]:
print(aln1[0,10])
print(aln2[0,10])
c
g

Now redo with adjusted alignment

In [35]:
aln2.get_alignment_length()
Out[35]:
136
In [36]:
aln2_info = sample_info(aln2)
In [37]:
sampleNames = aln2_info[2]
sampleNames[0]
Out[37]:
'rep.1;size=7560'
In [38]:
aln2_pos_counts = pos_counts(aln2_info[0], aln2_info[1], aln2_info[2], 136)
In [39]:
aln2_pos_counts[10]
Out[39]:
{'G': 18638, 'A': 19}
In [40]:
aln2_infA = informative_alleles(aln2_pos_counts)
In [41]:
aln2_infA
Out[41]:
[9, 24, 49, 77, 97, 127]
In [42]:
aln1_infB = informative_bases(aln1_infA, aln1_pos_counts)
In [43]:
aln1_infB
Out[43]:
{10: {'C', 'T'},
 25: {'A', 'G'},
 50: {'A', 'G'},
 78: {'A', 'G'},
 98: {'A', 'G'},
 128: {'C', 'T'}}
In [44]:
aln1_rh = raw_haplotypes(aln1_info[0], aln1_info[1], aln1_infA, aln1_infB)
In [45]:
len(aln1_rh['CAGGAT'])
Out[45]:
108
In [46]:
aln1_mh = major_haplotypes(aln1_rh)
In [47]:
aln1_mh
Out[47]:
['CAGGAT', 'CAAAAT', 'TGGGAT', 'TAGGGC']