import pandas as pd
import numpy as np
# gff to check
gold_file = 'golden_candidates.gff3'
# reference to check against
te_file = '/Users/hughcross/analysis/git_projects/rangefinder/candidateTE.gff3'
gold_dtypes = {'seqid':'str','source':'str','type':'str','start':'float','end':'float','score':'str','strand':'str','phase':'str','attributes':'str'}
col_names = ['seqid','source','type','start','end','score','strand','phase','attributes']
gold = pd.read_table(gold_file, sep='\t', comment='#', dtype=gold_dtypes, names=col_names)
gold.head()
# slightly different datatypes for the TE: score and phase are filled in in the file
te_dtypes = {'seqid':'str','source':'str','type':'str','start':'int','end':'int','score':'float','strand':'str','phase':'int','attributes':'str'}
te = pd.read_table(te_file, sep='\t', comment='#', dtype=te_dtypes, names=col_names)
te.head()
split the columns and reformat
# more than one gene per scaffold, so maybe split the df by geneID
te['gene_ID'], te['attr_info'] = zip(*te['attributes'].apply(lambda x: x.split(';', 1)))
te['gene_ID'] = te.gene_ID.str.replace('ID=','')
te['gene_ID'] = te.gene_ID.str.replace('"','')
create new dataframe of just the mrna rows for overall comparison
# get subset of df using formula:
## blast2 = blast_table.loc[blast_table['pident'] > 35]
te_mrna = te[te['type'] == 'mRNA']
te_mrna.head()
split and reformat the target columns
gold['idname'], gold['gene_ID'] = zip(*gold['attributes'].apply(lambda x: x.split('=', 1)))
del gold['idname']
gold_gene = gold[gold['type'] == 'candidate_seq']
gold_gene.head()
# first, get list of targets (in this case: golden)
## df_taxa = df['Taxon'].tolist()
gold_ls = gold_gene['seqid'].tolist()
len(gold_ls)
# get the ranges for target
targ1 = gold_gene.ix[gold_gene['seqid'] == 'MA_10']
targ1
targ1.start
# get the gene_ID from ref
gold_ids = gold_gene['gene_ID'].tolist()
gold_ids[0]
targ1.end > 20000
# get the ranges for ref
ref1 = te_mrna.ix[te_mrna['seqid'] == 'MA_10']
ref1
len(ref1)
for index, row in ref1.iterrows():
print row.start
for index, row in ref1.iterrows():
print index
for index, row in ref1.iterrows():
target_start = int(targ1.start)
total = target_start + row.start
print(target_start)
print(row.start)
print(total)
for index, row in ref1.iterrows():
targ_start = int(targ1.start)
targ_end = int(targ1.end)
if targ_start <= row.start <= targ_end:
print('go')
else:
print('no')
ref_scafs = te_mrna['seqid'].tolist()
print(ref_scafs[0])
print(len(ref_scafs))
# try to put them all together, OLD
overlap = [] # to test at first
within = {}
overlap_end = []
overlap_up = {}
overlap_dwn = {}
for g in gold_full_ids:#gold_ls:
scaffold = g.split('g')[0]
target_rows = gold_gene.ix[gold_gene['attributes'] == g]
for index, row in target_rows.iterrows():
#gold_id = row.attributes >> now g
gold_id = g
targ_start = int(row.start)
targ_end = int(row.end)
if g in ref_scafs:
ref_rows = te_mrna.ix[te_mrna['seqid'] == scaffold]
for ind, rw in ref_rows.iterrows():
ref_gene = rw.gene_ID.replace('"','') # te_mrna.ix[te_mrna['gene_ID'] == 'MA_10']
#gold_id = row.attributes
if targ_start <= rw.start <= targ_end:
#overlap.append(g)
start_over = 'yes'
else:
start_over = 'no'
if targ_start <= rw.end <= targ_end:
end_over = 'yes'
#overlap_end.append(g)
else:
end_over = 'no'
if start_over == 'yes' and end_over == 'yes':
within.setdefault(gold_id, []).append(ref_gene)
elif start_over == 'yes' and end_over == 'no':
overlap_dwn.setdefault(gold_id, []).append(ref_gene)
elif start_over == 'no' and end_over == 'yes':
overlap_up.setdefault(gold_id, []).append(ref_gene)
print(len(overlap_up))
print(len(overlap_dwn))
print(len(within))
# try to put them all together, REVISED
overlap = [] # to test at first
within = {}
overlap_end = []
overlap_up = {}
overlap_dwn = {}
for g in gold_ids:#gold_ls:
scaffold = g.split('g')[0]
target_row = gold_gene.ix[gold_gene['gene_ID'] == g]
#for index, row in target_rows.iterrows():
#gold_id = row.attributes >> now g
gold_id = g
targ_start = int(target_row.start)
targ_end = int(target_row.end)
if scaffold in ref_scafs:
ref_rows = te_mrna.ix[te_mrna['seqid'] == scaffold]
for ind, rw in ref_rows.iterrows():
ref_gene = rw.gene_ID.replace('"','') # te_mrna.ix[te_mrna['gene_ID'] == 'MA_10']
#gold_id = row.attributes
if targ_start <= rw.start <= targ_end:
#overlap.append(g)
start_over = 'yes'
else:
start_over = 'no'
if targ_start <= rw.end <= targ_end:
end_over = 'yes'
#overlap_end.append(g)
else:
end_over = 'no'
if start_over == 'yes' and end_over == 'yes':
within.setdefault(gold_id, []).append(ref_gene)
elif start_over == 'yes' and end_over == 'no':
overlap_dwn.setdefault(gold_id, []).append(ref_gene)
elif start_over == 'no' and end_over == 'yes':
overlap_up.setdefault(gold_id, []).append(ref_gene)
print(len(overlap_up))
print(len(overlap_dwn))
print(len(within))
within['MA_10219750g0010_prom']
gold_gene.head()
within
overlap_up
ref2 = te_mrna.ix[te_mrna['seqid'] == 'MA_10436523']
ref2
targ2 = gold_gene[gold_gene['seqid'] == 'MA_10436523']
targ2
ls = ['a','a','a','b','b','c']
ls.count('a')