# Directory holding the annotation inputs read and the output written below.
path='/nesi/nobackup/uoo02328/tunicate/annotation/braker/'

# Build the accession -> info lookup from a ';'-separated text file.
# Only the first two fields of each line are used.
tunic = {} # {acc:info}
with open(path+'tunicate_gene_info_brk1.txt') as tuninfoFile:
    for line in tuninfoFile:
        # Drop the line terminator first: when the info text is the last
        # field on the line it would otherwise keep its '\n', which then
        # ends up inside every string built from it.
        line = line.rstrip('\n')
        if not line:
            # A blank line has no second field; skip it rather than fail.
            continue
        parts = line.split(';')
        acc = parts[0]
        geneInfo = parts[1]
        tunic[acc]=geneInfo
print(len(tunic))

# Interactive-style sanity probe: raises KeyError if the accession is missing.
tunic['AAA16288.1']

# Keep both the list and the set form; the set is used for fast membership
# tests, and the two printed lengths are equal because dict keys are unique.
tunAccList = list(tunic.keys())
tunAcc = set(tunAccList)
print(len(tunAccList))
print(len(tunAcc))
# Group the hits in the tab-separated hit table by its first column, keeping
# only hits whose accession is present in tunAcc.
# (The file name suggests DIAMOND output; confirm the column layout.)
dmd = {} # {gene: [acc1,acc2]}
dmdAcc = []  # every retained accession in file order, duplicates included
with open(path+'augHintsAA.diamond.txt') as dmdFile:
    for line in dmdFile:
        parts = line.split('\t')
        geneVers = parts[0]
        fullAcc = parts[1]
        # The second column is '|'-delimited; the accession is its 4th field.
        acc = fullAcc.split('|')[3]
        # NOTE(review): the original indentation was lost. Both updates are
        # placed under this test because every accession stored in dmd is
        # later looked up directly in tunic.
        if acc in tunAcc:
            dmdAcc.append(acc)
            dmd.setdefault(geneVers, []).append(acc)
print(len(dmd))
print(len(dmdAcc))

# Interactive-style sanity probes: each raises KeyError if the key is missing.
dmd['g40762.t1']
tunic['XP_002120192.1']
tunic['XP_002124206.1']
# Choose one accession per key of dmd: {key: accession}.
# Preference order: the first listed hit whose info text in tunic mentions
# 'Ciona intestinalis', otherwise the first listed hit.
#
# The species name has to be searched for in the info text, not in the list
# of accessions (where it can never occur), and the search has to stop at the
# first match so a later non-matching hit cannot replace the choice.
dmdPicks = {}
for k,v in dmd.items():
    dmdPicks[k] = next(
        (g for g in v if 'Ciona intestinalis' in tunic.get(g, '')),
        v[0],  # every list in dmd has at least one entry
    )

# Interactive-style sanity probe: raises KeyError if the key is missing.
dmdPicks['g40762.t1']
# Reduce each key of dmd to the text before its first '.', first as a list
# (one entry per key) and then as a set of distinct values.
geneDict = {}
genes = [transcript_id.split('.')[0] for transcript_id in dmd]
geneDict
geneSet = set(genes)
print(len(genes))
print(len(geneSet))

# Interactive-style probes: a lookup that raises KeyError if the key is
# missing, and a membership test whose result is discarded.
dmdPicks['g28349.t1']
'g19529' in geneSet
# Index the tab-separated GTF by the feature type in column 3, keeping only
# records that belong to a selected gene or transcript. For every kept record
# the stored text is the first eight columns; column 9 supplies the key.
geneDict = {}    # gene id -> columns 1-8 of its 'gene' line
transDict = {}   # transcript id -> columns 1-8 of its 'transcript' line
components = {}  # transcript id -> columns 1-8 of each of its other lines
gene_list = []   # kept gene ids, in file order
trans_list = {} # gene:list of trans
with open(path+'augustus.hints.gtf') as gtfFile:
    for line in gtfFile:
        line = line.strip('\n')
        parts = line.split('\t')
        if len(parts) < 9:
            # Blank, comment or truncated line: there is no feature type or
            # attribute column to inspect, so skip it instead of failing.
            continue
        newline = '\t'.join(parts[:8])
        if parts[2] == 'gene':
            # On 'gene' lines column 9 is compared directly with geneSet.
            if parts[8] in geneSet:
                geneDict[parts[8]]=newline
                gene_list.append(parts[8])
        elif parts[2] == 'transcript':
            # On 'transcript' lines column 9 is compared with dmdPicks keys.
            if parts[8] in dmdPicks:
                transDict[parts[8]]=newline
                gn = parts[8].split('.')[0]
                trans_list.setdefault(gn, []).append(parts[8])
        else:
            # Every other feature: take the first ';'-separated attribute and
            # strip the 'transcript_id "' prefix and the quotes from it.
            geneParts = parts[8].split(';')
            trans_id = geneParts[0].replace('transcript_id "','').replace('"','')
            if trans_id in dmd:
                components.setdefault(trans_id, []).append(newline)

# Interactive-style sanity probes: each lookup raises KeyError (or IndexError
# for gene_list[0]) if the expected entry is missing.
geneDict['g1']
trans_list['g1']
transDict['g1.t1']
components['g1.t1']
gene_list[0]
trans_list['g205']
dmdPicks['g205.t1']
dmdPicks['g205.t2']
print(len(dmdPicks))
tunic['XP_002122393.1']
# now merge accession and description information for trans
# transTitles: key -> '<key>: <info text before the first " [">'
# transDesc:   key -> '<accession>: <full info text>'
transTitles = {}
transDesc = {}
for transcript_id, accession in dmdPicks.items():
    description = tunic[accession]
    short_name = description.split(' [')[0]
    transTitles[transcript_id] = ': '.join((transcript_id, short_name))
    transDesc[transcript_id] = ': '.join((accession, description))
print(len(transTitles))

# Interactive-style sanity probes: each raises KeyError if the key is missing.
transTitles['g205.t2']
transDesc['g205.t2']
# Write the selected records as tab-separated text with two extra trailing
# columns. Order: each gene line, then each of its transcript lines, each
# followed by that transcript's component lines.
#   gene line:                  <columns 1-8> <gene id> <gene id>
#   transcript/component line:  <columns 1-8> <title>   <description>
# The context manager guarantees the output is flushed and closed even if a
# lookup below raises part-way through.
with open(path+'converted_augustus_3.txt','w') as newGTF:
    for gene in gene_list:
        newGTF.write(geneDict[gene]+'\t'+gene+'\t'+gene+'\n')
        for tran in trans_list[gene]:
            # The two annotation columns are identical for a transcript and
            # all of its components, so build them once.
            annotation = '\t'+transTitles[tran]+'\t'+transDesc[tran]+'\n'
            newGTF.write(transDict[tran]+annotation)
            for comp in components[tran]:
                newGTF.write(comp+annotation)

# Interactive-style probe: membership test whose result is discarded.
'g339' in gene_list