import sys
sys.path.append('/Users/hughcross/analysis/git_projects/consensus_builder')
from file_funcs import make_list
# lists of all genes for assigning to category
pr1 = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_pr1')
pathrel = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_pathrel')
lp18up = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_lp18_upreg')
lp18dwn = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_lp18_downreg')
mc18up = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_mc18_upreg')
mc18dwn = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_mc18_downreg')
both18up = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_both18_upreg')
both18dwn = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_both18_downreg')
i300 = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_genes_for_igor300_remaining')
nodge_spec = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_no-dge_special_genes.txt')
dge_spec = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_dge_special_genes.txt')
golden = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/final_list_of_golden_genes.txt')
range_fasta = open('/Users/hughcross/analysis/epiclim/final_evaluation/final_fasta/epicapture_final_sequences.fasta')
# parse to get the info
range_dict = {}
scaf_list = []
scaf_dict = {}
for line in range_fasta:
line = line.strip('\n')
if line.startswith('>'):
parts = line.split(' ')
prom_name = parts[0][1:]
scaffold = prom_name.split('g')[0]
promoter = parts[2][9:]
promo = promoter.replace(',','')
promoter_ranges = promo.split('-')
promoter_start = int(promoter_ranges[0])
promoter_end = int(promoter_ranges[1])
cds = parts[3][4:]
cds_ranges = cds.split('-')
cds_start = int(cds_ranges[0])
cds_end = int(cds_ranges[1])
if promoter_start > promoter_end:
orient = '-'
promo_start = promoter_end
promo_end = promoter_start
cd_start = cds_end
cd_end = cds_start
elif promoter_start < promoter_end:
orient = '+'
promo_start = promoter_start
promo_end = promoter_end
cd_start = cds_start
cd_end = cds_end
range_dict.setdefault(prom_name, {})['orient']=orient
range_dict.setdefault(prom_name, {})['promo_start']=promo_start
range_dict.setdefault(prom_name, {})['promo_end']=promo_end
range_dict.setdefault(prom_name, {})['cds_start']=cd_start
range_dict.setdefault(prom_name, {})['cds_end']=cd_end
range_dict.setdefault(prom_name, {})['scaffold']=scaffold
scaf_dict.setdefault(scaffold, []).append(prom_name)
scaf_list.append(scaffold)
print(len(range_dict))
print(len(scaf_list))
print(len(scaf_dict))
range_fasta.close()
range_dict['MA_103523g0020_prom']
range_dict['MA_10433830g0010_prom']
# set and sort the list of scaffolds
scafset = set(scaf_list)
len(scafset)
sorted_scafs = sorted(scafset)
len(sorted_scafs)
## add in category dictionary so can assign different category to gff file
master = make_list('/Users/hughcross/analysis/epiclim/final_evaluation/final_lists/masterlist_24april.txt')
len(master)
cat_dict = {}
for gene in master:
if gene in golden:
cat_dict[gene]='golden_set'
elif gene in pr1:
cat_dict[gene]='pathogen-related'
elif gene in pathrel:
cat_dict[gene]='pathogen-related'
elif gene in both18up:
cat_dict[gene]='both_MC-LP_18c_upreg'
elif gene in both18dwn:
cat_dict[gene]='both_MC-LP_18c_dwnreg'
elif gene in lp18up:
cat_dict[gene]='leaf_primordia18C_upreg'
elif gene in lp18dwn:
cat_dict[gene]='leaf_primordia18C_dwnreg'
elif gene in mc18up:
cat_dict[gene]='mother_cell18C_upreg'
elif gene in mc18dwn:
cat_dict[gene]='mother_cell18C_dwnreg'
elif gene in i300:
cat_dict[gene]='top315_candidate'
elif gene in dge_spec:
cat_dict[gene]='gene_of_interest_DE'
elif gene in nodge_spec:
cat_dict[gene]='gene_of_interest_noDE'
print(len(cat_dict))
# now output to file in gff format
gffout = open('epicapture_gene_candidates.gff3', 'w')
for scaf in sorted_scafs:
prom_list = sorted(scaf_dict[scaf])
for promo in prom_list:
gene = promo.replace('_prom','')
category = cat_dict[gene]
prom_dict = range_dict[promo]
if prom_dict['orient'] == '+':
seq_start = prom_dict['promo_start']
seq_end = prom_dict['cds_end']
else:
seq_start = prom_dict['cds_start']
seq_end = prom_dict['promo_end']
gffout.write(scaf+'\t'+category+'\tcandidate_seq\t'+str(seq_start)+'\t'+str(seq_end)+'\t.\t'+prom_dict['orient']+'\t.\t'+'ID='+promo+'\n')
gffout.write(scaf+'\t'+category+'\tPromoter\t'+str(prom_dict['promo_start'])+'\t'+str(prom_dict['promo_end'])+'\t.\t'+prom_dict['orient']+'\t.\t'+'ID='+promo+'.Promoter;Parent='+promo+'\n')
gffout.write(scaf+'\t'+category+'\tCDS_start\t'+str(prom_dict['cds_start'])+'\t'+str(prom_dict['cds_end'])+'\t.\t'+prom_dict['orient']+'\t.\t'+'ID='+promo+'.CDS;Parent='+promo+'\n')
gffout.close()