convert_dN-gmap_to_dex-compat_map_gff_DEL

In [1]:
import pandas as pd
import numpy as np
In [2]:
import sys
sys.path.append('/Users/hughcross/analysis/git_projects/git_projects/consensus_builder/consensus_functions')
from file_funcs import make_list
In [3]:
gff_file = '/Users/hughcross/analysis/git_projects/rangefinder/gmap.spliced_alignments.gff3'
In [4]:
gold_dtypes = {'seqid':'str','source':'str','type':'str','start':'float','end':'float','score':'str','strand':'str','phase':'str','attributes':'str'}
In [5]:
col_names = ['seqid','source','type','start','end','score','strand','phase','attributes']
In [6]:
gmap = pd.read_table(gff_file, sep='\t', comment='#', dtype=gold_dtypes, names=col_names)
In [7]:
gmap.head()
Out[7]:
seqid source type start end score strand phase attributes
0 MA_339960 Pabies1.0_gene_only.fa.gmap cDNA_match 3398.0 3520.0 100 - . ID=TR547|c0_g1_i2.path1;Name=TR547|c0_g1_i2;Ta...
1 MA_339960 Pabies1.0_gene_only.fa.gmap cDNA_match 2959.0 3347.0 96 - . ID=TR547|c0_g1_i2.path1;Name=TR547|c0_g1_i2;Ta...
2 MA_82491 Pabies1.0_gene_only.fa.gmap cDNA_match 5764.0 6396.0 99 - . ID=TR392|c0_g1_i1.path1;Name=TR392|c0_g1_i1;Ta...
3 MA_474985 Pabies1.0_gene_only.fa.gmap cDNA_match 5839.0 6224.0 93 - . ID=TR466|c0_g1_i1.path1;Name=TR466|c0_g1_i1;Ta...
4 MA_77330 Pabies1.0_gene_only.fa.gmap cDNA_match 3304.0 4106.0 96 + . ID=TR393|c0_g1_i1.path1;Name=TR393|c0_g1_i1;Ta...
In [8]:
gmap_scafs = gmap['seqid'].tolist()
print(len(gmap_scafs))
12310
In [9]:
de_epi = make_list('/Users/hughcross/analysis/epiclim/dex_epi_phase1/DE_epigenes_scaffold_list.txt')
In [10]:
de_epi_gmapped = []
for i in de_epi:
    if i in gmap_scafs:
        de_epi_gmapped.append(i)
print(len(de_epi_gmapped))
30
In [11]:
#de_epi_gmapped
# more than one gene per scaffold, so maybe split the df by geneID 
#gff['gene_ID'], gff['attr_info'] = zip(*gff['attributes'].apply(lambda x: x.split(';', 1)))
In [12]:
gmap['attr_ID'], gmap['attr_Name'], gmap['attr_Target'], gmap['attr_Gap'] = zip(*gmap['attributes'].apply(lambda x: x.split(';', 3)))
In [15]:
gmap.head()
Out[15]:
seqid source type start end score strand phase attributes attr_ID attr_Name attr_Target attr_Gap
0 MA_339960 Pabies1.0_gene_only.fa.gmap cDNA_match 3398.0 3520.0 100 - . ID=TR547|c0_g1_i2.path1;Name=TR547|c0_g1_i2;Ta... TR547|c0_g1_i2.path1 TR547|c0_g1_i2 TR547|c0_g1_i2 1 123 M123
1 MA_339960 Pabies1.0_gene_only.fa.gmap cDNA_match 2959.0 3347.0 96 - . ID=TR547|c0_g1_i2.path1;Name=TR547|c0_g1_i2;Ta... TR547|c0_g1_i2.path1 TR547|c0_g1_i2 TR547|c0_g1_i2 124 501 M214 D11 M164
2 MA_82491 Pabies1.0_gene_only.fa.gmap cDNA_match 5764.0 6396.0 99 - . ID=TR392|c0_g1_i1.path1;Name=TR392|c0_g1_i1;Ta... TR392|c0_g1_i1.path1 TR392|c0_g1_i1 TR392|c0_g1_i1 1 633 M633
3 MA_474985 Pabies1.0_gene_only.fa.gmap cDNA_match 5839.0 6224.0 93 - . ID=TR466|c0_g1_i1.path1;Name=TR466|c0_g1_i1;Ta... TR466|c0_g1_i1.path1 TR466|c0_g1_i1 TR466|c0_g1_i1 8 403 M36 I5 M228 I8 M104 D3 M15
4 MA_77330 Pabies1.0_gene_only.fa.gmap cDNA_match 3304.0 4106.0 96 + . ID=TR393|c0_g1_i1.path1;Name=TR393|c0_g1_i1;Ta... TR393|c0_g1_i1.path1 TR393|c0_g1_i1 TR393|c0_g1_i1 1 781 M351 D22 M430
In [14]:
# now do the others
gmap['attr_ID'] = gmap.attr_ID.str.replace('ID=','') ## replacing it later, so retains ID= for making new attributes 
gmap['attr_Name'] = gmap.attr_Name.str.replace('Name=','')
gmap['attr_Target'] = gmap.attr_Target.str.replace('Target=','')
gmap['attr_Gap'] = gmap.attr_Gap.str.replace('Gap=','')
In [16]:
# try to get range info from attr_Target above
gmap['gene_name'], gmap['attr_start'], gmap['attr_end'] = zip(*gmap['attr_Target'].apply(lambda x: x.split(' ', 3)))
In [17]:
gmap_names = gmap['gene_name'].tolist()
print(len(gmap_names))
12310
In [21]:
# get a subset of those of correct scaffolds
## eug_sub = eug[eug['seqid'].isin(scaflist)]
gmap_sub = gmap[gmap['seqid'].isin(de_epi_gmapped)]
In [22]:
gmap_sub
Out[22]:
seqid source type start end score strand phase attributes attr_ID attr_Name attr_Target attr_Gap gene_name attr_start attr_end
60 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 1174.0 1448.0 99 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 67 341 M275 TR574|c2_g1_i2 67 341
61 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 1577.0 2225.0 98 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 342 990 M649 TR574|c2_g1_i2 342 990
62 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 2327.0 2526.0 100 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 991 1190 M200 TR574|c2_g1_i2 991 1190
63 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 2670.0 2914.0 98 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 1191 1435 M245 TR574|c2_g1_i2 1191 1435
64 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 3018.0 4085.0 97 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 1436 2500 M951 D3 M114 TR574|c2_g1_i2 1436 2500
65 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 4478.0 4537.0 79 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 2501 2567 M47 I7 M13 TR574|c2_g1_i2 2501 2567
66 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 4638.0 4964.0 99 + . ID=TR574|c2_g1_i2.path2;Name=TR574|c2_g1_i2;Ta... TR574|c2_g1_i2.path2 TR574|c2_g1_i2 TR574|c2_g1_i2 2568 2894 M327 TR574|c2_g1_i2 2568 2894
1034 MA_713159 Pabies1.0_gene_only.fa.gmap cDNA_match 316.0 577.0 96 + . ID=TR18492|c2_g1_i11.path1;Name=TR18492|c2_g1_... TR18492|c2_g1_i11.path1 TR18492|c2_g1_i11 TR18492|c2_g1_i11 1 262 M262 TR18492|c2_g1_i11 1 262
1035 MA_713159 Pabies1.0_gene_only.fa.gmap cDNA_match 736.0 758.0 86 + . ID=TR18492|c2_g1_i11.path1;Name=TR18492|c2_g1_... TR18492|c2_g1_i11.path1 TR18492|c2_g1_i11 TR18492|c2_g1_i11 263 285 M23 TR18492|c2_g1_i11 263 285
1036 MA_713159 Pabies1.0_gene_only.fa.gmap cDNA_match 358.0 548.0 96 + . ID=TR18492|c2_g1_i7.path1;Name=TR18492|c2_g1_i... TR18492|c2_g1_i7.path1 TR18492|c2_g1_i7 TR18492|c2_g1_i7 1 191 M191 TR18492|c2_g1_i7 1 191
1037 MA_713159 Pabies1.0_gene_only.fa.gmap cDNA_match 707.0 758.0 94 + . ID=TR18492|c2_g1_i7.path1;Name=TR18492|c2_g1_i... TR18492|c2_g1_i7.path1 TR18492|c2_g1_i7 TR18492|c2_g1_i7 192 243 M52 TR18492|c2_g1_i7 192 243
1041 MA_74102 Pabies1.0_gene_only.fa.gmap cDNA_match 2244.0 2430.0 94 + . ID=TR18492|c2_g1_i3.path1;Name=TR18492|c2_g1_i... TR18492|c2_g1_i3.path1 TR18492|c2_g1_i3 TR18492|c2_g1_i3 1 187 M187 TR18492|c2_g1_i3 1 187
1095 MA_97951 Pabies1.0_gene_only.fa.gmap cDNA_match 10453.0 10925.0 93 + . ID=TR18492|c2_g7_i2.path1;Name=TR18492|c2_g7_i... TR18492|c2_g7_i2.path1 TR18492|c2_g7_i2 TR18492|c2_g7_i2 1 473 M400 I1 M20 D1 M52 TR18492|c2_g7_i2 1 473
1164 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 3018.0 4012.0 90 - . ID=TR23182|c0_g1_i2.path1;Name=TR23182|c0_g1_i... TR23182|c0_g1_i2.path1 TR23182|c0_g1_i2 TR23182|c0_g1_i2 1 1085 M241 I90 M754 TR23182|c0_g1_i2 1 1085
1165 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 2670.0 2914.0 97 - . ID=TR23182|c0_g1_i2.path1;Name=TR23182|c0_g1_i... TR23182|c0_g1_i2.path1 TR23182|c0_g1_i2 TR23182|c0_g1_i2 1086 1330 M245 TR23182|c0_g1_i2 1086 1330
1166 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 2327.0 2526.0 98 - . ID=TR23182|c0_g1_i2.path1;Name=TR23182|c0_g1_i... TR23182|c0_g1_i2.path1 TR23182|c0_g1_i2 TR23182|c0_g1_i2 1331 1530 M200 TR23182|c0_g1_i2 1331 1530
1167 MA_10426199 Pabies1.0_gene_only.fa.gmap cDNA_match 1822.0 2225.0 99 - . ID=TR23182|c0_g1_i2.path1;Name=TR23182|c0_g1_i... TR23182|c0_g1_i2.path1 TR23182|c0_g1_i2 TR23182|c0_g1_i2 1531 1934 M404 TR23182|c0_g1_i2 1531 1934
1640 MA_15346 Pabies1.0_gene_only.fa.gmap cDNA_match 8899.0 8920.0 95 - . ID=TR34065|c5_g1_i1.path1;Name=TR34065|c5_g1_i... TR34065|c5_g1_i1.path1 TR34065|c5_g1_i1 TR34065|c5_g1_i1 1 22 M22 TR34065|c5_g1_i1 1 22
1641 MA_15346 Pabies1.0_gene_only.fa.gmap cDNA_match 4683.0 6303.0 98 - . ID=TR34065|c5_g1_i1.path1;Name=TR34065|c5_g1_i... TR34065|c5_g1_i1.path1 TR34065|c5_g1_i1 TR34065|c5_g1_i1 23 1643 M1621 TR34065|c5_g1_i1 23 1643
1696 MA_74102 Pabies1.0_gene_only.fa.gmap cDNA_match 2146.0 2426.0 100 - . ID=TR34944|c3_g1_i3.path1;Name=TR34944|c3_g1_i... TR34944|c3_g1_i3.path1 TR34944|c3_g1_i3 TR34944|c3_g1_i3 1 281 M281 TR34944|c3_g1_i3 1 281
2089 MA_920994 Pabies1.0_gene_only.fa.gmap cDNA_match 2912.0 2929.0 100 - . ID=TR45686|c0_g1_i1.path1;Name=TR45686|c0_g1_i... TR45686|c0_g1_i1.path1 TR45686|c0_g1_i1 TR45686|c0_g1_i1 1 18 M18 TR45686|c0_g1_i1 1 18
2090 MA_920994 Pabies1.0_gene_only.fa.gmap cDNA_match 2280.0 2391.0 100 - . ID=TR45686|c0_g1_i1.path1;Name=TR45686|c0_g1_i... TR45686|c0_g1_i1.path1 TR45686|c0_g1_i1 TR45686|c0_g1_i1 19 130 M112 TR45686|c0_g1_i1 19 130
2091 MA_920994 Pabies1.0_gene_only.fa.gmap cDNA_match 2107.0 2168.0 100 - . ID=TR45686|c0_g1_i1.path1;Name=TR45686|c0_g1_i... TR45686|c0_g1_i1.path1 TR45686|c0_g1_i1 TR45686|c0_g1_i1 131 192 M62 TR45686|c0_g1_i1 131 192
2092 MA_920994 Pabies1.0_gene_only.fa.gmap cDNA_match 1740.0 1827.0 100 - . ID=TR45686|c0_g1_i1.path1;Name=TR45686|c0_g1_i... TR45686|c0_g1_i1.path1 TR45686|c0_g1_i1 TR45686|c0_g1_i1 193 280 M88 TR45686|c0_g1_i1 193 280
2096 MA_106416 Pabies1.0_gene_only.fa.gmap cDNA_match 9216.0 9221.0 100 - . ID=TR45686|c2_g2_i1.path1;Name=TR45686|c2_g2_i... TR45686|c2_g2_i1.path1 TR45686|c2_g2_i1 TR45686|c2_g2_i1 1 6 M6 TR45686|c2_g2_i1 1 6
2097 MA_106416 Pabies1.0_gene_only.fa.gmap cDNA_match 4353.0 4422.0 100 - . ID=TR45686|c2_g2_i1.path1;Name=TR45686|c2_g2_i... TR45686|c2_g2_i1.path1 TR45686|c2_g2_i1 TR45686|c2_g2_i1 7 76 M70 TR45686|c2_g2_i1 7 76
2098 MA_106416 Pabies1.0_gene_only.fa.gmap cDNA_match 4163.0 4256.0 100 - . ID=TR45686|c2_g2_i1.path1;Name=TR45686|c2_g2_i... TR45686|c2_g2_i1.path1 TR45686|c2_g2_i1 TR45686|c2_g2_i1 77 170 M94 TR45686|c2_g2_i1 77 170
2099 MA_106416 Pabies1.0_gene_only.fa.gmap cDNA_match 3005.0 3116.0 98 - . ID=TR45686|c2_g2_i1.path1;Name=TR45686|c2_g2_i... TR45686|c2_g2_i1.path1 TR45686|c2_g2_i1 TR45686|c2_g2_i1 171 282 M112 TR45686|c2_g2_i1 171 282
2100 MA_106416 Pabies1.0_gene_only.fa.gmap cDNA_match 2829.0 2890.0 98 - . ID=TR45686|c2_g2_i1.path1;Name=TR45686|c2_g2_i... TR45686|c2_g2_i1.path1 TR45686|c2_g2_i1 TR45686|c2_g2_i1 283 344 M62 TR45686|c2_g2_i1 283 344
2101 MA_106416 Pabies1.0_gene_only.fa.gmap cDNA_match 2356.0 2497.0 100 - . ID=TR45686|c2_g2_i1.path1;Name=TR45686|c2_g2_i... TR45686|c2_g2_i1.path1 TR45686|c2_g2_i1 TR45686|c2_g2_i1 345 486 M142 TR45686|c2_g2_i1 345 486
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9115 MA_173651 Pabies1.0_gene_only.fa.gmap cDNA_match 21350.0 21418.0 100 + . ID=TR114591|c0_g1_i1.path1;Name=TR114591|c0_g1... TR114591|c0_g1_i1.path1 TR114591|c0_g1_i1 TR114591|c0_g1_i1 1113 1181 M69 TR114591|c0_g1_i1 1113 1181
9116 MA_173651 Pabies1.0_gene_only.fa.gmap cDNA_match 21533.0 21862.0 100 + . ID=TR114591|c0_g1_i1.path1;Name=TR114591|c0_g1... TR114591|c0_g1_i1.path1 TR114591|c0_g1_i1 TR114591|c0_g1_i1 1182 1511 M330 TR114591|c0_g1_i1 1182 1511
9129 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 5353.0 6891.0 99 - . ID=TR114594|c0_g1_i1.path1;Name=TR114594|c0_g1... TR114594|c0_g1_i1.path1 TR114594|c0_g1_i1 TR114594|c0_g1_i1 1 1539 M1539 TR114594|c0_g1_i1 1 1539
9130 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 3160.0 3457.0 100 - . ID=TR114594|c0_g1_i1.path1;Name=TR114594|c0_g1... TR114594|c0_g1_i1.path1 TR114594|c0_g1_i1 TR114594|c0_g1_i1 1540 1837 M298 TR114594|c0_g1_i1 1540 1837
9131 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 2604.0 3066.0 99 - . ID=TR114594|c0_g1_i1.path1;Name=TR114594|c0_g1... TR114594|c0_g1_i1.path1 TR114594|c0_g1_i1 TR114594|c0_g1_i1 1838 2300 M463 TR114594|c0_g1_i1 1838 2300
9132 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 2437.0 2553.0 100 - . ID=TR114594|c0_g1_i1.path1;Name=TR114594|c0_g1... TR114594|c0_g1_i1.path1 TR114594|c0_g1_i1 TR114594|c0_g1_i1 2301 2417 M117 TR114594|c0_g1_i1 2301 2417
10030 MA_9476 Pabies1.0_gene_only.fa.gmap cDNA_match 50394.0 51362.0 90 + . ID=TR121046|c0_g1_i1.path1;Name=TR121046|c0_g1... TR121046|c0_g1_i1.path1 TR121046|c0_g1_i1 TR121046|c0_g1_i1 1 965 M198 D49 M352 I45 M370 TR121046|c0_g1_i1 1 965
11214 MA_6194 Pabies1.0_gene_only.fa.gmap cDNA_match 46870.0 47715.0 100 - . ID=TR129570|c0_g1_i1.path1;Name=TR129570|c0_g1... TR129570|c0_g1_i1.path1 TR129570|c0_g1_i1 TR129570|c0_g1_i1 1 846 M846 TR129570|c0_g1_i1 1 846
11499 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 2449.0 2553.0 100 + . ID=TR131307|c0_g1_i1.path1;Name=TR131307|c0_g1... TR131307|c0_g1_i1.path1 TR131307|c0_g1_i1 TR131307|c0_g1_i1 1 105 M105 TR131307|c0_g1_i1 1 105
11500 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 2604.0 3114.0 99 + . ID=TR131307|c0_g1_i1.path1;Name=TR131307|c0_g1... TR131307|c0_g1_i1.path1 TR131307|c0_g1_i1 TR131307|c0_g1_i1 106 616 M511 TR131307|c0_g1_i1 106 616
11501 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 3208.0 3457.0 100 + . ID=TR131307|c0_g1_i1.path1;Name=TR131307|c0_g1... TR131307|c0_g1_i1.path1 TR131307|c0_g1_i1 TR131307|c0_g1_i1 617 866 M250 TR131307|c0_g1_i1 617 866
11502 MA_99694 Pabies1.0_gene_only.fa.gmap cDNA_match 5353.0 6111.0 99 + . ID=TR131307|c0_g1_i1.path1;Name=TR131307|c0_g1... TR131307|c0_g1_i1.path1 TR131307|c0_g1_i1 TR131307|c0_g1_i1 867 1625 M759 TR131307|c0_g1_i1 867 1625
11515 MA_2328 Pabies1.0_gene_only.fa.gmap cDNA_match 16898.0 17712.0 99 + . ID=TR131470|c0_g1_i1.path1;Name=TR131470|c0_g1... TR131470|c0_g1_i1.path1 TR131470|c0_g1_i1 TR131470|c0_g1_i1 1 815 M815 TR131470|c0_g1_i1 1 815
11516 MA_2328 Pabies1.0_gene_only.fa.gmap cDNA_match 17920.0 19635.0 99 + . ID=TR131470|c0_g1_i1.path1;Name=TR131470|c0_g1... TR131470|c0_g1_i1.path1 TR131470|c0_g1_i1 TR131470|c0_g1_i1 816 2529 M1614 D2 M100 TR131470|c0_g1_i1 816 2529
11699 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 1298.0 1995.0 98 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 1 690 M296 D8 M394 TR132765|c0_g1_i1 1 690
11700 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 2115.0 2203.0 100 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 691 779 M89 TR132765|c0_g1_i1 691 779
11701 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 2237.0 3054.0 100 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 780 1597 M818 TR132765|c0_g1_i1 780 1597
11702 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 3520.0 3662.0 100 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 1598 1740 M143 TR132765|c0_g1_i1 1598 1740
11703 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 4176.0 4304.0 100 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 1741 1869 M129 TR132765|c0_g1_i1 1741 1869
11704 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 4420.0 5381.0 99 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 1870 2831 M962 TR132765|c0_g1_i1 1870 2831
11705 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 5467.0 5563.0 100 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 2832 2928 M97 TR132765|c0_g1_i1 2832 2928
11706 MA_10428736 Pabies1.0_gene_only.fa.gmap cDNA_match 5814.0 6147.0 99 + . ID=TR132765|c0_g1_i1.path1;Name=TR132765|c0_g1... TR132765|c0_g1_i1.path1 TR132765|c0_g1_i1 TR132765|c0_g1_i1 2929 3262 M334 TR132765|c0_g1_i1 2929 3262
11962 MA_7658 Pabies1.0_gene_only.fa.gmap cDNA_match 21300.0 23353.0 99 - . ID=TR135599|c0_g2_i1.path1;Name=TR135599|c0_g2... TR135599|c0_g2_i1.path1 TR135599|c0_g2_i1 TR135599|c0_g2_i1 1 2054 M2054 TR135599|c0_g2_i1 1 2054
11969 MA_14500 Pabies1.0_gene_only.fa.gmap cDNA_match 38918.0 38958.0 87 - . ID=TR135287|c0_g1_i2.path1;Name=TR135287|c0_g1... TR135287|c0_g1_i2.path1 TR135287|c0_g1_i2 TR135287|c0_g1_i2 13 53 M41 TR135287|c0_g1_i2 13 53
12160 MA_111652 Pabies1.0_gene_only.fa.gmap cDNA_match 10123.0 10277.0 100 + . ID=TR137080|c0_g1_i1.path1;Name=TR137080|c0_g1... TR137080|c0_g1_i1.path1 TR137080|c0_g1_i1 TR137080|c0_g1_i1 1 155 M155 TR137080|c0_g1_i1 1 155
12161 MA_111652 Pabies1.0_gene_only.fa.gmap cDNA_match 10491.0 10691.0 100 + . ID=TR137080|c0_g1_i1.path1;Name=TR137080|c0_g1... TR137080|c0_g1_i1.path1 TR137080|c0_g1_i1 TR137080|c0_g1_i1 156 356 M201 TR137080|c0_g1_i1 156 356
12162 MA_111652 Pabies1.0_gene_only.fa.gmap cDNA_match 11729.0 11822.0 100 + . ID=TR137080|c0_g1_i1.path1;Name=TR137080|c0_g1... TR137080|c0_g1_i1.path1 TR137080|c0_g1_i1 TR137080|c0_g1_i1 357 450 M94 TR137080|c0_g1_i1 357 450
12163 MA_111652 Pabies1.0_gene_only.fa.gmap cDNA_match 11923.0 11994.0 100 + . ID=TR137080|c0_g1_i1.path1;Name=TR137080|c0_g1... TR137080|c0_g1_i1.path1 TR137080|c0_g1_i1 TR137080|c0_g1_i1 451 522 M72 TR137080|c0_g1_i1 451 522
12164 MA_111652 Pabies1.0_gene_only.fa.gmap cDNA_match 13383.0 13481.0 100 + . ID=TR137080|c0_g1_i1.path1;Name=TR137080|c0_g1... TR137080|c0_g1_i1.path1 TR137080|c0_g1_i1 TR137080|c0_g1_i1 523 621 M99 TR137080|c0_g1_i1 523 621
12165 MA_111652 Pabies1.0_gene_only.fa.gmap cDNA_match 13953.0 14402.0 100 + . ID=TR137080|c0_g1_i1.path1;Name=TR137080|c0_g1... TR137080|c0_g1_i1.path1 TR137080|c0_g1_i1 TR137080|c0_g1_i1 622 1071 M450 TR137080|c0_g1_i1 622 1071

330 rows × 16 columns

In [23]:
gmap_sub_ls = gmap_sub['gene_name'].tolist()
trans_genes = trans['seqid'].tolist()
In [24]:
subset = set(gmap_sub_ls)
transet = set(trans_genes)
print(len(subset))
print(len(transet))
68
552
In [25]:
epi_dn = []
for nam in subset:
    if nam in transet:
        epi_dn.append(nam)
print(len(epi_dn))
2
In [29]:
# iterate through the gmap gff df, and create dictionary of all genes
gene_starts = {}
gene_ends = {}
gene_dict = {}
gene_scaffs = {}
path_scaffs = {}
for ind,rw in gmap_sub.iterrows():
    gene_id = rw.attr_ID#.replace('ID=','')
    gene = rw.attr_Name
    gene_dict.setdefault(gene_id, {})['gene']=gene
    gene_dict.setdefault(gene_id, {})['scaffold']=rw.seqid
    gene_dict.setdefault(gene_id, {})['strand']=rw.strand
    gene_scaffs.setdefault(gene, []).append(rw.seqid)
    path_scaffs[gene_id]=rw.seqid
    gene_starts.setdefault(gene_id, []).append(int(rw.start))
    gene_ends.setdefault(gene_id, []).append(int(rw.end))
print(len(gene_dict))
print(len(gene_scaffs))
print(len(path_scaffs))
print(len(gene_starts))
print(len(gene_ends))
73
68
73
73
73
In [34]:
# now have to insert whole gene ranges in right place in lists
## have to create dictionary of whole gene ranges from previous for loop results
### get list of gene_ids
gene_list = gene_dict.keys()
print(gene_list[0])
print(len(gene_list))
TR55663|c3_g2_i1.path1
73
In [35]:
gene_range = {}
for gene in gene_list:
    gene_start = min(gene_starts[gene])
    gene_end = max(gene_ends[gene])
    gene_range.setdefault(gene, {})['g_start']=gene_start
    gene_range.setdefault(gene, {})['g_end']=gene_end
print(len(gene_range))
73

Repeat above loop, but now insert the gene type at the beginning of new genes

In [66]:
# rerun for loop iterating through df above, but create lists of all categories, 
## then make dict of lists, and apply to df
### repeat above loop, but add exons
scaff_ls = []
source_ls = []
type_ls = []
start_ls = []
end_ls = []
score_ls = []
orient_ls = []
phase_ls = []
id_ls = []
glist = []
for ind,rw in gmap_sub.iterrows():
    gene_id = rw.attr_ID#.replace('ID=','')
    # here, condition on if new gene
    if gene_id in glist:
        # first put in exon
        scaff_ls.append(rw.seqid)
        source_ls.append('dN_gmap')
        type_ls.append('exon') # this will have to be adjusted
        start_ls.append(int(rw.start))
        end_ls.append(int(rw.end))
        score_ls.append(rw.score)
        orient_ls.append(rw.strand)
        phase_ls.append('.')
        id_ls.append(gene_id)
        # repeat same, but for CDS
        scaff_ls.append(rw.seqid)
        source_ls.append('dN_gmap')
        type_ls.append('CDS') # this will have to be adjusted
        start_ls.append(int(rw.start))
        end_ls.append(int(rw.end))
        score_ls.append(rw.score)
        orient_ls.append(rw.strand)
        phase_ls.append('.')
        id_ls.append(gene_id)
    else:
        glist.append(gene_id)
        scaff_ls.append(gene_dict[gene_id]['scaffold'])
        source_ls.append('dN_gmap')
        type_ls.append('gene') # this will have to be adjusted
        start_ls.append(int(gene_range[gene_id]['g_start']))
        end_ls.append(int(gene_range[gene_id]['g_end']))
        score_ls.append(rw.score)
        orient_ls.append(gene_dict[gene_id]['strand'])
        phase_ls.append('.')
        id_ls.append(gene_id)
        # then do the normal line once overall gene is put in
        ## first do the exon
        scaff_ls.append(rw.seqid)
        source_ls.append('dN_gmap')
        type_ls.append('exon') # this will have to be adjusted
        start_ls.append(int(rw.start))
        end_ls.append(int(rw.end))
        score_ls.append(rw.score)
        orient_ls.append(rw.strand)
        phase_ls.append('.')
        id_ls.append(gene_id)
        # now do CDS
        scaff_ls.append(rw.seqid)
        source_ls.append('dN_gmap')
        type_ls.append('CDS') # this will have to be adjusted
        start_ls.append(int(rw.start))
        end_ls.append(int(rw.end))
        score_ls.append(rw.score)
        orient_ls.append(rw.strand)
        phase_ls.append('.')
        id_ls.append(gene_id)
print(len(scaff_ls))
print(len(id_ls))
733
733
In [68]:
# now try to make a df from these lists
## first make a dict of lists
### adding exons
data6 = {'source':source_ls,'Type':type_ls,'phase':phase_ls,'start':start_ls,'end':end_ls,'seqid':scaff_ls,'score':score_ls,'strand':orient_ls,'attributes':id_ls}
df6 = pd.DataFrame(data6, columns=['seqid','source','Type','start','end','score','strand','phase','attributes'])
df6.head()
Out[68]:
seqid source Type start end score strand phase attributes
0 MA_10426199 dN_gmap gene 1174 4964 99 + . TR574|c2_g1_i2.path2
1 MA_10426199 dN_gmap exon 1174 1448 99 + . TR574|c2_g1_i2.path2
2 MA_10426199 dN_gmap CDS 1174 1448 99 + . TR574|c2_g1_i2.path2
3 MA_10426199 dN_gmap exon 1577 2225 98 + . TR574|c2_g1_i2.path2
4 MA_10426199 dN_gmap CDS 1577 2225 98 + . TR574|c2_g1_i2.path2
In [71]:
# now have to modify attributes column for different types and number
## iterate through rows, grab type and attributes, then modify attributes and make new
### modifying a single cell:
#### df1.set_value(6, 'starts', 4600)
##### adding exons here
count = 1 # for exon counts
for ind,rw in df6.iterrows():
    tipo = rw.Type
    attr = rw.attributes
    if tipo == 'gene':
        new_attr = 'ID='+attr+'.g1;'+'Name=ORF'
        count = 1
    elif tipo == 'exon': # need to add exon numbers
        new_attr = 'ID='+attr+'.m1.exon'+str(count)+';'+'Parent='+attr+'.g1;'+'Name=ORF'
        count += 1
    elif tipo == 'CDS':
        new_attr = 'ID='+attr+'.m1;'+'Parent='+attr+'.g1;'+'Name=ORF'
    df6.set_value(ind, 'attributes', new_attr)
In [73]:
# now write to file and test with dexseq scripts and cufflinks
## filt_blast_results.to_csv(blastout, sep='\t', index=False, header=False)
df6.to_csv('epireg_converted_from_gmap2.gff3', sep='\t', index=False, header=False)