import sys
sys.path.append('/Users/hughcross/Analysis/repos/consensus_builder/consensus_functions/')
from file_funcs import make_list
from sequence_funcs import fasta_to_dict
blastfile = open('/Users/hughcross/Analysis/CAMEL/cam2018/taxonomyDB/trnLdev/pass1_blastout.txt')
blastdict = {} # {query1:{subj1:results1,subj2:results},query2:{subj3:results,subj4:results}}
for line in blastfile:
parts = line.split('\t')
query = parts[0]
subj = parts[1]
results = '\t'.join(parts[2:10])
blastdict.setdefault(query, {})[subj]=results
print(len(blastdict))
blastfile.close()
blastdict['X74572.1']
refseqs = fasta_to_dict('/Users/hughcross/Analysis/CAMEL/cam2018/taxonomyDB/trnLdev/embryophyta_primer_pass1.fasta')
refseqs['AF023735.1']
refLengths = {}
for k,v in refseqs.items():
length = len(v)
refLengths[k]=length
print(len(refLengths))
refLengths['DQ180487.1']
reflist = list(refseqs.keys())
print(len(reflist))
print(reflist[0])
refset = set(reflist)
filtDict1 = {}
for k,v in blastdict.items():
if k not in refset:
lenlist = []
sublist = []
for key,value in v.items():
results = value.split('\t')
sublist.append(key)
length = int(results[1])
lenlist.append(length)
maxlen = max(lenlist)
if maxlen >= 70:
for i in range(0,len(lenlist)):
currentlen = lenlist[i]
if currentlen == maxlen:
filtDict1[k]=sublist[i]
print(len(filtDict1))
filtDict1['JQ954249.1']
rangeDict = {}
revcomps = []
for k,v in blastdict.items():
if k in filtDict1:
besthit = filtDict1[k]
hitlength = refLengths[besthit]
bestresults = v[besthit]
results = bestresults.split('\t')
length = int(results[1])
query_start = int(results[4])
query_end = int(results[5])
subj_start = int(results[6])
subj_end = int(results[7])
if subj_start > subj_end:
subjEnd = subj_start
revcomps.append(k)
else:
subjEnd = subj_end
if hitlength - subjEnd < 30: # to make sure there is sufficient variability at the end of the sequence
rangeDict.setdefault(k, {})['start']=query_start
rangeDict.setdefault(k, {})['end']=query_end
print(len(rangeDict))
print(len(revcomps))
rangeDict['LC378336.1']
from Bio.Seq import reverse_complement
'LC378336.1' in revcomps
revset = set(revcomps)
print(len(revset))
trnl = fasta_to_dict('/Users/hughcross/Analysis/CAMEL/cam2018/taxonomyDB/trnLdev/embyrophyta_trnL_search.fasta')
trimmed = {}
for k,v in trnl.items():
if k in rangeDict:
trimseq = v[rangeDict[k]['start']:rangeDict[k]['end']]
trimmed[k]=trimseq
print(len(trimmed))
trimmed['MF683824.1']
finalTrim = {}
for k,v in trimmed.items():
if k in revset:
newseq = reverse_complement(v)
finalTrim[k]=newseq
else:
finalTrim[k]=v
print(len(finalTrim))
trimmed['LC378336.1']
finalTrim['LC378336.1']
finalSeqs = {}
for key,value in refseqs.items():
finalSeqs[key]=value
for k,v in finalTrim.items():
finalSeqs[k]=v
print(len(finalSeqs))
seqout = open('trnL_reference_seqs.fasta','w')
for k,v in finalSeqs.items():
seqout.write('>'+k+'\n'+v+'\n')
seqout.close()
finalList = list(finalSeqs.keys())
print(len(finalList))
listout = open('trnl_ref_seq_list.txt','w')
for i in finalList:
listout.write(i+'\n')
listout.close()