Convert range data from different sources to GFF3 format

In [1]:
import sys
sys.path.append('/Users/hughcross/analysis/git_projects/consensus_builder')
from file_funcs import make_list
In [2]:
# lists of all genes for assigning to category
pr1 = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_pr1')
pathrel = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_pathrel')
lp18up = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_lp18_upreg')
lp18dwn = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_lp18_downreg')
mc18up = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_mc18_upreg')
mc18dwn = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_mc18_downreg')
both18up = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_both18_upreg')
both18dwn = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_both18_downreg')
i300 = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_igor300_remaining')
nodge_spec = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_no-dge_special_genes.txt')
dge_spec = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_dge_special_genes.txt')
golden = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_golden_genes.txt')
In [3]:
range_fasta = open('/Users/hughcross/analysis/epiclim/final_evaluation/final_fasta/epicapture_final_sequences.fasta')
In [4]:
# parse to get the info
range_dict = {}
scaf_list = []
scaf_dict = {}
for line in range_fasta:
    line = line.strip('\n')
    if line.startswith('>'):
        parts = line.split(' ')
        prom_name = parts[0][1:]
        scaffold = prom_name.split('g')[0]
        promoter = parts[2][9:]
        promo = promoter.replace(',','')
        promoter_ranges = promo.split('-')
        promoter_start = int(promoter_ranges[0])
        promoter_end = int(promoter_ranges[1])
        cds = parts[3][4:]
        cds_ranges = cds.split('-')
        cds_start = int(cds_ranges[0])
        cds_end = int(cds_ranges[1])
        if promoter_start > promoter_end:
            orient = '-'
            promo_start = promoter_end
            promo_end = promoter_start
            cd_start = cds_end
            cd_end = cds_start
        elif promoter_start < promoter_end:
            orient = '+'
            promo_start = promoter_start
            promo_end = promoter_end
            cd_start = cds_start
            cd_end = cds_end
        range_dict.setdefault(prom_name, {})['orient']=orient
        range_dict.setdefault(prom_name, {})['promo_start']=promo_start
        range_dict.setdefault(prom_name, {})['promo_end']=promo_end
        range_dict.setdefault(prom_name, {})['cds_start']=cd_start
        range_dict.setdefault(prom_name, {})['cds_end']=cd_end
        range_dict.setdefault(prom_name, {})['scaffold']=scaffold
        scaf_dict.setdefault(scaffold, []).append(prom_name)
        scaf_list.append(scaffold)
print(len(range_dict))
print(len(scaf_list))
print(len(scaf_dict))
range_fasta.close()
2872
2872
2843
In [5]:
range_dict['MA_103523g0020_prom']
Out[5]:
{'cds_end': 29349,
 'cds_start': 28349,
 'orient': '-',
 'promo_end': 31349,
 'promo_start': 29350,
 'scaffold': 'MA_103523'}
In [6]:
range_dict['MA_10433830g0010_prom']
Out[6]:
{'cds_end': 5495,
 'cds_start': 4495,
 'orient': '+',
 'promo_end': 4494,
 'promo_start': 2495,
 'scaffold': 'MA_10433830'}
In [7]:
# set and sort the list of scaffolds
scafset = set(scaf_list)
len(scafset)
Out[7]:
2843
In [8]:
sorted_scafs = sorted(scafset)
len(sorted_scafs)
Out[8]:
2843
In [9]:
## add in category dictionary so can assign different category to gff file
master = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/masterlist_24april.txt')
len(master)
Out[9]:
2872
In [10]:
cat_dict = {}
for gene in master:
    if gene in golden:
        cat_dict[gene]='golden_set'
    elif gene in pr1:
        cat_dict[gene]='pathogen-related'
    elif gene in pathrel:
        cat_dict[gene]='pathogen-related'
    elif gene in both18up:
        cat_dict[gene]='both_MC-LP_18c_upreg'
    elif gene in both18dwn:
        cat_dict[gene]='both_MC-LP_18c_dwnreg'
    elif gene in lp18up:
        cat_dict[gene]='leaf_primordia18C_upreg'
    elif gene in lp18dwn:
        cat_dict[gene]='leaf_primordia18C_dwnreg'
    elif gene in mc18up:
        cat_dict[gene]='mother_cell18C_upreg'
    elif gene in mc18dwn:
        cat_dict[gene]='mother_cell18C_dwnreg'
    elif gene in i300:
        cat_dict[gene]='top315_candidate'
    elif gene in dge_spec:
        cat_dict[gene]='gene_of_interest_DE'
    elif gene in nodge_spec:
        cat_dict[gene]='gene_of_interest_noDE'
print(len(cat_dict))
2872
In [11]:
# now output to file in gff format
gffout = open('epicapture_gene_candidates.gff3', 'w')
for scaf in sorted_scafs:
    prom_list = sorted(scaf_dict[scaf])
    for promo in prom_list:
        gene = promo.replace('_prom','')
        category = cat_dict[gene]
        prom_dict = range_dict[promo]
        if prom_dict['orient'] == '+':
            seq_start = prom_dict['promo_start']
            seq_end = prom_dict['cds_end']
        else:
            seq_start = prom_dict['cds_start']
            seq_end = prom_dict['promo_end']
        gffout.write(scaf+'\t'+category+'\tcandidate_seq\t'+str(seq_start)+'\t'+str(seq_end)+'\t.\t'+prom_dict['orient']+'\t.\t'+'ID='+promo+'\n')
        gffout.write(scaf+'\t'+category+'\tPromoter\t'+str(prom_dict['promo_start'])+'\t'+str(prom_dict['promo_end'])+'\t.\t'+prom_dict['orient']+'\t.\t'+'ID='+promo+'.Promoter;Parent='+promo+'\n')
        gffout.write(scaf+'\t'+category+'\tCDS_start\t'+str(prom_dict['cds_start'])+'\t'+str(prom_dict['cds_end'])+'\t.\t'+prom_dict['orient']+'\t.\t'+'ID='+promo+'.CDS;Parent='+promo+'\n')
gffout.close()