Trim trnL reference sequences based on blast results to qii2 primer extract

In [3]:
import sys
sys.path.append('/Users/hughcross/Analysis/repos/consensus_builder/consensus_functions/')
from file_funcs import make_list
from sequence_funcs import fasta_to_dict
In [1]:
blastfile = open('/Users/hughcross/Analysis/CAMEL/cam2018/taxonomyDB/trnLdev/pass1_blastout.txt')
In [2]:
blastdict = {} # {query1:{subj1:results1,subj2:results},query2:{subj3:results,subj4:results}}
for line in blastfile:
    parts = line.split('\t')
    query = parts[0]
    subj = parts[1]
    results = '\t'.join(parts[2:10])
    blastdict.setdefault(query, {})[subj]=results
print(len(blastdict))
blastfile.close()
115114
In [5]:
blastdict['X74572.1']
Out[5]:
{'DQ180487.1': '99.180\t122\t1\t0\t1\t122\t19\t140',
 'DQ180490.1': '99.180\t122\t1\t0\t1\t122\t19\t140',
 'JN661825.1': '99.180\t122\t1\t0\t1\t122\t19\t140',
 'JN661833.1': '99.180\t122\t1\t0\t1\t122\t19\t140',
 'KP756716.1': '100.000\t124\t0\t0\t1\t124\t19\t142'}

Get the reference sequences to check for length and remove from dict

In [6]:
refseqs = fasta_to_dict('/Users/hughcross/Analysis/CAMEL/cam2018/taxonomyDB/trnLdev/embryophyta_primer_pass1.fasta')
In [7]:
refseqs['AF023735.1']
Out[7]:
'GACTTAAATAATTTGAGCTTTAGTAGAAAAACTTACTAAATGCTAGCTTTCAGATTCAGGGAAACTTAGGTTGAAAAAGGTATAAGTAATCCTGAGCCAAATCTTATTTCATTTGAAAATAA'
In [8]:
refLengths = {}
for k,v in refseqs.items():
    length = len(v)
    refLengths[k]=length
print(len(refLengths))
12753
In [9]:
refLengths['DQ180487.1']
Out[9]:
141
In [10]:
reflist = list(refseqs.keys())
print(len(reflist))
print(reflist[0])
12753
AY626005.1
In [11]:
refset = set(reflist)

Filter blast dictionary by best hit

In [24]:
filtDict1 = {}
for k,v in blastdict.items():
    
    if k not in refset:
        lenlist = []
        sublist = []
        for key,value in v.items():
            results = value.split('\t')
            sublist.append(key)
            length = int(results[1])
            lenlist.append(length)
        maxlen = max(lenlist)
        if maxlen >= 70:
            for i in range(0,len(lenlist)):
                currentlen = lenlist[i]
                if currentlen == maxlen:
                    filtDict1[k]=sublist[i]
print(len(filtDict1))
93340
In [27]:
filtDict1['JQ954249.1']
Out[27]:
'KT945237.1'

Get ranges from dictionary, filtering by end of ref

In [28]:
rangeDict = {}
revcomps = []
for k,v in blastdict.items():
    if k in filtDict1:
        besthit = filtDict1[k]
        hitlength = refLengths[besthit]
        bestresults = v[besthit]
        results = bestresults.split('\t')
        length = int(results[1])
        query_start = int(results[4])
        query_end = int(results[5])
        subj_start = int(results[6])
        subj_end = int(results[7])
        if subj_start > subj_end:
            subjEnd = subj_start
            revcomps.append(k)
        else:
            subjEnd = subj_end
        if hitlength - subjEnd < 30: # to make sure there is sufficient variability at the end of the sequence
            rangeDict.setdefault(k, {})['start']=query_start
            rangeDict.setdefault(k, {})['end']=query_end
print(len(rangeDict))
print(len(revcomps))
90746
3978
In [29]:
rangeDict['LC378336.1']
Out[29]:
{'end': 1739, 'start': 1635}
In [30]:
from Bio.Seq import reverse_complement
In [31]:
'LC378336.1' in revcomps
Out[31]:
True
In [37]:
revset = set(revcomps)
print(len(revset))
3978

Slice sequences from set

In [32]:
trnl = fasta_to_dict('/Users/hughcross/Analysis/CAMEL/cam2018/taxonomyDB/trnLdev/embyrophyta_trnL_search.fasta')
In [33]:
trimmed = {}
for k,v in trnl.items():
    if k in rangeDict:
        trimseq = v[rangeDict[k]['start']:rangeDict[k]['end']]
        trimmed[k]=trimseq
In [34]:
print(len(trimmed))
90746
In [36]:
trimmed['MF683824.1']
Out[36]:
'AACCCCGGAAAAAAAAATGGGTTGAGCCAAATCCTTCTTTCGCAAACAAACAAAGATTCCGAAAGCTAAAAAAAAG'
In [38]:
finalTrim = {}
for k,v in trimmed.items():
    if k in revset:
        newseq = reverse_complement(v)
        finalTrim[k]=newseq
    else:
        finalTrim[k]=v
print(len(finalTrim))
90746
In [39]:
trimmed['LC378336.1']
Out[39]:
'CTTTTTTGATTATAACTTTCTGAACTTTTCTTCATTTACGGAAAAAAGGATTTGGCTCAGGATTGCCCATTGTGAATTCCAGGGTTTCTCTGAATTTGAAAGTT'
In [40]:
finalTrim['LC378336.1']
Out[40]:
'AACTTTCAAATTCAGAGAAACCCTGGAATTCACAATGGGCAATCCTGAGCCAAATCCTTTTTTCCGTAAATGAAGAAAAGTTCAGAAAGTTATAATCAAAAAAG'

Combine first pass and final trim

In [41]:
finalSeqs = {}
for key,value in refseqs.items():
    finalSeqs[key]=value
for k,v in finalTrim.items():
    finalSeqs[k]=v
print(len(finalSeqs))
103499
In [42]:
seqout = open('trnL_reference_seqs.fasta','w')
In [43]:
for k,v in finalSeqs.items():
    seqout.write('>'+k+'\n'+v+'\n')
seqout.close()
In [44]:
finalList = list(finalSeqs.keys())
print(len(finalList))
103499
In [45]:
listout = open('trnl_ref_seq_list.txt','w')
In [46]:
for i in finalList:
    listout.write(i+'\n')
listout.close()