Convert gtf file to seqmonk generic annotation and add gene names

In [1]:
path='/nesi/nobackup/uoo02328/tunicate/annotation/braker/'
In [2]:
tuninfoFile = open(path+'tunicate_gene_info_brk1.txt')
In [3]:
tunic = {} # {acc:info}
for line in tuninfoFile:
    parts = line.split(';')
    acc = parts[0]
    geneInfo = parts[1]
    tunic[acc]=geneInfo
print(len(tunic))
tuninfoFile.close()
6968
In [4]:
tunic['AAA16288.1']
Out[4]:
'homeobox protein, partial [Styela plicata]'
In [5]:
tunAccList = list(tunic.keys())
tunAcc = set(tunAccList)
print(len(tunAccList))
print(len(tunAcc))
6968
6968
In [6]:
dmdFile = open(path+'augHintsAA.diamond.txt')
In [7]:
dmd = {} # {gene: [acc1,acc2]}
dmdAcc = []
for line in dmdFile:
    parts = line.split('\t')
    geneVers = parts[0]
    fullAcc = parts[1]
    acc = fullAcc.split('|')[3]
    if acc in tunAcc:
        dmdAcc.append(acc)
        dmd.setdefault(geneVers, []).append(acc)
print(len(dmd))
print(len(dmdAcc))
dmdFile.close()
12123
16594
In [8]:
dmd['g40762.t1']
Out[8]:
['XP_002120192.1', 'XP_002124206.1']
In [15]:
tunic['XP_002120192.1']
Out[15]:
'probable sodium-coupled neutral amino acid transporter 6 [Ciona intestinalis]'
In [16]:
tunic['XP_002124206.1']
Out[16]:
'sodium-coupled neutral amino acid transporter 2 [Ciona intestinalis]'
In [17]:
dmdPicks = {}
for k,v in dmd.items():
    if len(v) > 1:
        for g in v:
            if 'Ciona intestinalis' in v:
                dmdPicks[k]=g
            else:
                dmdPicks[k]=v[0]
    else:
        dmdPicks[k]=v[0]
In [18]:
dmdPicks['g40762.t1']
Out[18]:
'XP_002120192.1'
In [9]:
geneDict = {}
genes = []
for k,v in dmd.items():
    
    gene = k.split('.')[0]
    genes.append(gene)
    geneDict
geneSet = set(genes)
print(len(genes))
print(len(geneSet))
12123
11383
In [19]:
dmdPicks['g28349.t1']
Out[19]:
'XP_002125676.2'
In [13]:
'g19529' in geneSet
Out[13]:
True
In [20]:
gtfFile = open(path+'augustus.hints.gtf')
In [21]:
geneDict = {}
transDict = {}
components = {}
gene_list = []
trans_list = {} # gene:list of trans
for line in gtfFile:
    line = line.strip('\n')
    parts = line.split('\t')
    newline = '\t'.join(parts[:8])
    if parts[2] == 'gene':
        if parts[8] in geneSet:
            #newline = '\t'.join(parts[:8])
            geneDict[parts[8]]=newline
            gene_list.append(parts[8])
            #newGTF.write(line+'\n')
            
    elif parts[2] == 'transcript':
        if parts[8] in dmdPicks:
            transDict[parts[8]]=newline
            gn = parts[8].split('.')[0]
            trans_list.setdefault(gn, []).append(parts[8])
            
    else:
        geneParts = parts[8].split(';')
        trans_id = geneParts[0].replace('transcript_id "','').replace('"','')
        #gene_list.append(trans_id)
        if trans_id in dmd:
            components.setdefault(trans_id, []).append(newline)
            
            
        
In [23]:
geneDict['g1']
Out[23]:
'scaffold33412_pilon\tAUGUSTUS\tgene\t533\t3272\t0.18\t+\t.'
In [25]:
trans_list['g1']
Out[25]:
['g1.t1']
In [26]:
transDict['g1.t1']
Out[26]:
'scaffold33412_pilon\tAUGUSTUS\ttranscript\t533\t3272\t0.18\t+\t.'
In [27]:
components['g1.t1']
Out[27]:
['scaffold33412_pilon\tAUGUSTUS\tstart_codon\t533\t535\t.\t+\t0',
 'scaffold33412_pilon\tAUGUSTUS\tCDS\t533\t622\t0.44\t+\t0',
 'scaffold33412_pilon\tAUGUSTUS\texon\t533\t622\t.\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tintron\t623\t1312\t1\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tCDS\t1313\t1452\t1\t+\t0',
 'scaffold33412_pilon\tAUGUSTUS\texon\t1313\t1452\t.\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tintron\t1453\t2115\t1\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tCDS\t2116\t2272\t1\t+\t1',
 'scaffold33412_pilon\tAUGUSTUS\texon\t2116\t2272\t.\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tintron\t2273\t3134\t1\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tCDS\t3135\t3272\t0.4\t+\t0',
 'scaffold33412_pilon\tAUGUSTUS\texon\t3135\t3272\t.\t+\t.',
 'scaffold33412_pilon\tAUGUSTUS\tstop_codon\t3270\t3272\t.\t+\t0']
In [29]:
gene_list[0]
Out[29]:
'g1'
In [31]:
trans_list['g205']
Out[31]:
['g205.t1', 'g205.t2']
In [32]:
dmdPicks['g205.t1']
Out[32]:
'XP_002122393.1'
In [33]:
dmdPicks['g205.t2']
Out[33]:
'XP_002122393.1'
In [35]:
print(len(dmdPicks))
12123
In [34]:
tunic['XP_002122393.1']
Out[34]:
'organic cation transporter protein-like isoform X1 [Ciona intestinalis]'
In [50]:
# now merge accession and description information for trans
transTitles = {}
transDesc = {}
for k,v in dmdPicks.items():
    info = tunic[v]
    short = info.split(' [')[0]
    title = k+': '+short
    transTitles[k]=title
    desc = v+': '+info
    transDesc[k]=desc
print(len(transTitles))
12123
In [51]:
transTitles['g205.t2']
Out[51]:
'g205.t2: organic cation transporter protein-like isoform X1'
In [52]:
transDesc['g205.t2']
Out[52]:
'XP_002122393.1: organic cation transporter protein-like isoform X1 [Ciona intestinalis]'
In [53]:
newGTF = open(path+'converted_augustus_3.txt','w')
In [54]:
for gene in gene_list:
    newGTF.write(geneDict[gene]+'\t'+gene+'\t'+gene+'\n')
    for tran in trans_list[gene]:
        newGTF.write(transDict[tran]+'\t'+transTitles[tran]+'\t'+transDesc[tran]+'\n')
        trans_comp = components[tran]
        for comp in trans_comp:
            newGTF.write(comp+'\t'+transTitles[tran]+'\t'+transDesc[tran]+'\n')
In [55]:
newGTF.close()
In [57]:
'g339' in gene_list
Out[57]:
False