Identify overlap in ranges in genes from two GFF3 files

In [1]:
import pandas as pd
import numpy as np
In [2]:
# gff to check
gold_file = 'golden_candidates.gff3'
# reference to check against
te_file = '/Users/hughcross/analysis/git_projects/rangefinder/candidateTE.gff3'
In [3]:
gold_dtypes = {'seqid':'str','source':'str','type':'str','start':'float','end':'float','score':'str','strand':'str','phase':'str','attributes':'str'}
In [4]:
col_names = ['seqid','source','type','start','end','score','strand','phase','attributes']
In [5]:
gold = pd.read_table(gold_file, sep='\t', comment='#', dtype=gold_dtypes, names=col_names)
In [6]:
gold.head()
Out[6]:
seqid source type start end score strand phase attributes
0 MA_10 golden_set candidate_seq 22712.0 25712.0 . + . ID=MA_10g0010_prom
1 MA_10 golden_set Promoter 22712.0 24711.0 . + . ID=MA_10g0010_prom.Promoter;Parent=MA_10g0010_...
2 MA_10 golden_set CDS_start 24712.0 25712.0 . + . ID=MA_10g0010_prom.CDS;Parent=MA_10g0010_prom
3 MA_100058 golden_set candidate_seq 3890.0 6890.0 . + . ID=MA_100058g0010_prom
4 MA_100058 golden_set Promoter 3890.0 5889.0 . + . ID=MA_100058g0010_prom.Promoter;Parent=MA_1000...
In [7]:
# slightly different datatypes for the TE: score and phase are filled in in the file
te_dtypes = {'seqid':'str','source':'str','type':'str','start':'int','end':'int','score':'float','strand':'str','phase':'int','attributes':'str'}
In [8]:
te = pd.read_table(te_file, sep='\t', comment='#', dtype=te_dtypes, names=col_names)
In [9]:
te.head()
Out[9]:
seqid source type start end score strand phase attributes
0 MA_10 Master mRNA 30529 31128 0.69 + 0 ID="MA_10.g710";te_coverage="99.67%";te="TE";e...
1 MA_10 Master CDS 30529 31128 0.69 + 0 ID="MA_10.g710t3";Parent="MA_10.g710";te_cover...
2 MA_10 Master exon 30529 31128 0.69 + 0 ID="MA_10.g710t4";Parent="MA_10.g710";te_cover...
3 MA_10 Master mRNA 39676 40428 0.32 + 0 ID="MA_10.g711";te_coverage="100.00%";te="TE";...
4 MA_10 Master CDS 39676 40428 0.32 + 0 ID="MA_10.g711t6";Parent="MA_10.g711";te_cover...

split the columns and reformat

In [14]:
# more than one gene per scaffold, so maybe split the df by geneID 
te['gene_ID'], te['attr_info'] = zip(*te['attributes'].apply(lambda x: x.split(';', 1)))
In [26]:
te['gene_ID'] = te.gene_ID.str.replace('ID=','')
In [27]:
te['gene_ID'] = te.gene_ID.str.replace('"','')

create new dataframe of just the mrna rows for overall comparison

In [28]:
# get subset of df using formula:
## blast2 = blast_table.loc[blast_table['pident'] > 35]
te_mrna = te[te['type'] == 'mRNA']
In [29]:
te_mrna.head()
Out[29]:
seqid source type start end score strand phase attributes gene_ID attr_info
0 MA_10 Master mRNA 30529 31128 0.69 + 0 ID="MA_10.g710";te_coverage="99.67%";te="TE";e... MA_10.g710 te_coverage="99.67%";te="TE";est_id="Kmer10_co...
3 MA_10 Master mRNA 39676 40428 0.32 + 0 ID="MA_10.g711";te_coverage="100.00%";te="TE";... MA_10.g711 te_coverage="100.00%";te="TE";est_id="NA";pfam...
6 MA_10004132 Master mRNA 1 686 0.85 + 1 ID="MA_10004132.g7603";te_coverage="80.98%";te... MA_10004132.g7603 te_coverage="80.98%";te="TE";est_id="PgdbPsitc...
11 MA_10005 Master mRNA 45305 46008 0.58 - 0 ID="MA_10005.g984";te_coverage="99.39%";te="TE... MA_10005.g984 te_coverage="99.39%";te="TE";est_id="Total_iso...
16 MA_10005 Master mRNA 48007 50244 0.13 - 0 ID="MA_10005.g985";te_coverage="100.00%";te="T... MA_10005.g985 te_coverage="100.00%";te="TE";est_id="Total_is...

split and reformat the target columns

In [20]:
gold['idname'], gold['gene_ID'] = zip(*gold['attributes'].apply(lambda x: x.split('=', 1)))
In [22]:
del gold['idname']
In [24]:
gold_gene = gold[gold['type'] == 'candidate_seq']
In [25]:
gold_gene.head()
Out[25]:
seqid source type start end score strand phase attributes gene_ID
0 MA_10 golden_set candidate_seq 22712.0 25712.0 . + . ID=MA_10g0010_prom MA_10g0010_prom
3 MA_100058 golden_set candidate_seq 3890.0 6890.0 . + . ID=MA_100058g0010_prom MA_100058g0010_prom
6 MA_10013703 golden_set candidate_seq 0.0 2872.0 . - . ID=MA_10013703g0010_prom MA_10013703g0010_prom
9 MA_10019758 golden_set candidate_seq 0.0 2657.0 . + . ID=MA_10019758g0010_prom MA_10019758g0010_prom
12 MA_10020434 golden_set candidate_seq 2369.0 5369.0 . - . ID=MA_10020434g0010_prom MA_10020434g0010_prom
In [15]:
# first, get list of targets (in this case: golden)
## df_taxa = df['Taxon'].tolist()
gold_ls = gold_gene['seqid'].tolist()
len(gold_ls)
Out[15]:
1928
In [16]:
# get the ranges for target
targ1 = gold_gene.ix[gold_gene['seqid'] == 'MA_10']
targ1
Out[16]:
seqid source type start end score strand phase attributes
0 MA_10 golden_set candidate_seq 22712.0 25712.0 . + . ID=MA_10g0010_prom
In [17]:
targ1.start
Out[17]:
0    22712.0
Name: start, dtype: float64

Find overlap between genes (w/out reference to part of gene)

In [30]:
# get the gene_ID from ref
gold_ids = gold_gene['gene_ID'].tolist()
gold_ids[0]
Out[30]:
'MA_10g0010_prom'
In [19]:
targ1.end > 20000
Out[19]:
0    True
Name: end, dtype: bool
In [65]:
# get the ranges for ref
ref1 = te_mrna.ix[te_mrna['seqid'] == 'MA_10']
ref1
Out[65]:
seqid source type start end score strand phase attributes gene_ID attr_info
0 MA_10 Master mRNA 30529 31128 0.69 + 0 ID="MA_10.g710";te_coverage="99.67%";te="TE";e... "MA_10.g710" te_coverage="99.67%";te="TE";est_id="Kmer10_co...
3 MA_10 Master mRNA 39676 40428 0.32 + 0 ID="MA_10.g711";te_coverage="100.00%";te="TE";... "MA_10.g711" te_coverage="100.00%";te="TE";est_id="NA";pfam...
In [38]:
len(ref1)
Out[38]:
2
In [39]:
for index, row in ref1.iterrows():
    print row.start
30529
39676
In [40]:
for index, row in ref1.iterrows():
    print index
0
3
In [44]:
for index, row in ref1.iterrows():
    target_start = int(targ1.start)
    total = target_start + row.start
    print(target_start)
    print(row.start)
    print(total)
22712
30529
53241
22712
39676
62388
In [45]:
for index, row in ref1.iterrows():
    targ_start = int(targ1.start)
    targ_end = int(targ1.end)
    if targ_start <= row.start <= targ_end:
        print('go')
    else:
        print('no')
no
no
In [32]:
ref_scafs = te_mrna['seqid'].tolist()
print(ref_scafs[0])
print(len(ref_scafs))
MA_10
9569
In [80]:
# try to put them all together, OLD
overlap = [] # to test at first
within = {}
overlap_end = []
overlap_up = {}
overlap_dwn = {}
for g in gold_full_ids:#gold_ls:
    scaffold = g.split('g')[0]
    target_rows = gold_gene.ix[gold_gene['attributes'] == g]
    for index, row in target_rows.iterrows():
        #gold_id = row.attributes >> now g
        gold_id = g
        targ_start = int(row.start)
        targ_end = int(row.end)
        if g in ref_scafs:
            ref_rows = te_mrna.ix[te_mrna['seqid'] == scaffold]
            for ind, rw in ref_rows.iterrows():
                ref_gene = rw.gene_ID.replace('"','') # te_mrna.ix[te_mrna['gene_ID'] == 'MA_10']
                #gold_id = row.attributes

                if targ_start <= rw.start <= targ_end:
                    #overlap.append(g)
                    start_over = 'yes'
                else:
                    start_over = 'no'
                if targ_start <= rw.end <= targ_end:
                    end_over = 'yes'
                    #overlap_end.append(g)
                else:
                    end_over = 'no'
                if start_over == 'yes' and end_over == 'yes':
                    within.setdefault(gold_id, []).append(ref_gene)
                elif start_over == 'yes' and end_over == 'no':
                    overlap_dwn.setdefault(gold_id, []).append(ref_gene)
                elif start_over == 'no' and end_over == 'yes':
                    overlap_up.setdefault(gold_id, []).append(ref_gene)
                    
print(len(overlap_up))
print(len(overlap_dwn))
print(len(within))
0
0
0

build overlap function

In [33]:
# try to put them all together, REVISED
overlap = [] # to test at first
within = {}
overlap_end = []
overlap_up = {}
overlap_dwn = {}
for g in gold_ids:#gold_ls:
    scaffold = g.split('g')[0]
    target_row = gold_gene.ix[gold_gene['gene_ID'] == g]
    #for index, row in target_rows.iterrows():
        #gold_id = row.attributes >> now g
    gold_id = g
    targ_start = int(target_row.start)
    targ_end = int(target_row.end)
    if scaffold in ref_scafs:
        ref_rows = te_mrna.ix[te_mrna['seqid'] == scaffold]
        for ind, rw in ref_rows.iterrows():
            ref_gene = rw.gene_ID.replace('"','') # te_mrna.ix[te_mrna['gene_ID'] == 'MA_10']
            #gold_id = row.attributes

            if targ_start <= rw.start <= targ_end:
                #overlap.append(g)
                start_over = 'yes'
            else:
                start_over = 'no'
            if targ_start <= rw.end <= targ_end:
                end_over = 'yes'
                #overlap_end.append(g)
            else:
                end_over = 'no'
            if start_over == 'yes' and end_over == 'yes':
                within.setdefault(gold_id, []).append(ref_gene)
            elif start_over == 'yes' and end_over == 'no':
                overlap_dwn.setdefault(gold_id, []).append(ref_gene)
            elif start_over == 'no' and end_over == 'yes':
                overlap_up.setdefault(gold_id, []).append(ref_gene)
                    
print(len(overlap_up))
print(len(overlap_dwn))
print(len(within))
15
23
16
In [34]:
within['MA_10219750g0010_prom']
Out[34]:
['MA_10219750.g7637']
In [55]:
gold_gene.head()
Out[55]:
seqid source type start end score strand phase attributes
0 MA_10 golden_set candidate_seq 22712.0 25712.0 . + . MA_10g0010_prom
3 MA_100058 golden_set candidate_seq 3890.0 6890.0 . + . MA_100058g0010_prom
6 MA_10013703 golden_set candidate_seq 0.0 2872.0 . - . MA_10013703g0010_prom
9 MA_10019758 golden_set candidate_seq 0.0 2657.0 . + . MA_10019758g0010_prom
12 MA_10020434 golden_set candidate_seq 2369.0 5369.0 . - . MA_10020434g0010_prom
In [35]:
within
Out[35]:
{'MA_10219750g0010_prom': ['MA_10219750.g7637'],
 'MA_102811g0010_prom': ['MA_102811.g2491'],
 'MA_10429049g0010_prom': ['MA_10429049.g3862'],
 'MA_107182g0010_prom': ['MA_107182.g4508'],
 'MA_118377g0010_prom': ['MA_118377.g4478'],
 'MA_125231g0010_prom': ['MA_125231.g6369'],
 'MA_160636g0010_prom': ['MA_160636.g4010'],
 'MA_162433g0010_prom': ['MA_162433.g2983'],
 'MA_48578g0010_prom': ['MA_48578.g4317'],
 'MA_71280g0010_prom': ['MA_71280.g5347'],
 'MA_787567g0010_prom': ['MA_787567.g7667'],
 'MA_9195168g0010_prom': ['MA_9195168.g7992'],
 'MA_93384g0010_prom': ['MA_93384.g1396'],
 'MA_941227g0010_prom': ['MA_941227.g7505'],
 'MA_95664g0010_prom': ['MA_95664.g4925'],
 'MA_99665g0010_prom': ['MA_99665.g3483']}
In [85]:
overlap_up
Out[85]:
{'MA_10359153g0010_prom': ['MA_10359153.g7090'],
 'MA_10430676g0010_prom': ['MA_10430676.g1493'],
 'MA_10432911g0010_prom': ['MA_10432911.g1163'],
 'MA_10433638g0010_prom': ['MA_10433638.g6385'],
 'MA_10435940g0010_prom': ['MA_10435940.g5154'],
 'MA_10436082g0010_prom': ['MA_10436082.g3132'],
 'MA_10436523g0020_prom': ['MA_10436523.g272'],
 'MA_171506g0010_prom': ['MA_171506.g4319'],
 'MA_21185g0010_prom': ['MA_21185.g911'],
 'MA_25080g0010_prom': ['MA_25080.g2291'],
 'MA_8873395g0010_prom': ['MA_8873395.g7601'],
 'MA_9030577g0010_prom': ['MA_9030577.g7621'],
 'MA_9057g0030_prom': ['MA_9057.g2059'],
 'MA_9468439g0010_prom': ['MA_9468439.g7277'],
 'MA_96900g0010_prom': ['MA_96900.g2151']}
In [73]:
ref2 = te_mrna.ix[te_mrna['seqid'] == 'MA_10436523']
In [74]:
ref2
Out[74]:
seqid source type start end score strand phase attributes gene_ID attr_info
15105 MA_10436523 Master mRNA 33261 33720 0.85 - 2 ID="MA_10436523.g271";te_coverage="90.00%";te=... "MA_10436523.g271" te_coverage="90.00%";te="TE";est_id="PgdbPglau...
15110 MA_10436523 Master mRNA 36844 41095 0.45 - 2 ID="MA_10436523.g272";te_coverage="61.15%";te=... "MA_10436523.g272" te_coverage="61.15%";te="TE";est_id="NA";pfam_...
15115 MA_10436523 Master mRNA 50064 55338 0.77 - 2 ID="MA_10436523.g274";te_coverage="68.61%";te=... "MA_10436523.g274" te_coverage="68.61%";te="TE";est_id="White_WS0...
15120 MA_10436523 Master mRNA 56992 57568 0.74 - 1 ID="MA_10436523.g275";te_coverage="95.71%";te=... "MA_10436523.g275" te_coverage="95.71%";te="TE";est_id="NA";pfam_...
15127 MA_10436523 Master mRNA 62654 68367 0.58 + 0 ID="MA_10436523.g276";te_coverage="77.88%";te=... "MA_10436523.g276" te_coverage="77.88%";te="TE";est_id="Total_iso...
In [75]:
targ2 = gold_gene[gold_gene['seqid'] == 'MA_10436523']
In [76]:
targ2
Out[76]:
seqid source type start end score strand phase attributes
1512 MA_10436523 golden_set candidate_seq 7841.0 10841.0 . + . MA_10436523g0010_prom
1515 MA_10436523 golden_set candidate_seq 40224.0 43224.0 . + . MA_10436523g0020_prom
In [36]:
ls = ['a','a','a','b','b','c']
ls.count('a')
Out[36]:
3