In [1]:
import pandas as pd
import numpy as np
In [5]:
# Path of the input annotation file that is loaded further down.
gff_file = "epicapture_gene_candidates.gff3"
In [7]:
# Column name -> dtype mapping handed to the pandas reader.
# Only the two coordinate columns are parsed as integers; every other
# column (including score and phase) is kept as a string.
col_dtypes = {
    'seqid': 'str',
    'source': 'str',
    'type': 'str',
    'start': 'int',
    'end': 'int',
    'score': 'str',
    'strand': 'str',
    'phase': 'str',
    'attributes': 'str',
}
In [8]:
# Labels for the nine columns, in file order; used as the DataFrame header
# when the file is read.
col_names = [
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
]
In [9]:
# Parse the tab-delimited annotation file into a DataFrame.
# Column labels come from col_names and per-column types from col_dtypes.
# Everything from a '#' to the end of a line is treated as a comment and
# skipped by the parser.
gff = pd.read_csv(
    gff_file,
    sep='\t',
    names=col_names,
    dtype=col_dtypes,
    comment='#',
)
In [10]:
# Preview the first five parsed records.
gff.head(n=5)
Out[10]:
seqid source type start end score strand phase attributes
0 MA_10 golden_set candidate_seq 22712 25712 . + . ID=MA_10g0010_prom
1 MA_10 golden_set Promoter 22712 24711 . + . ID=MA_10g0010_prom.Promoter;Parent=MA_10g0010_...
2 MA_10 golden_set CDS_start 24712 25712 . + . ID=MA_10g0010_prom.CDS;Parent=MA_10g0010_prom
3 MA_100058 golden_set candidate_seq 3890 6890 . + . ID=MA_100058g0010_prom
4 MA_100058 golden_set Promoter 3890 5889 . + . ID=MA_100058g0010_prom.Promoter;Parent=MA_1000...

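Pattern for replacing specific values in a single column with `DataFrame.replace`, using a nested dict of the form `{column: {old_value: new_value}}`. It is used in the next cell to change `start` values of 0 to 1: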

df.replace({'a' : { 'Medium' : 2, 'Small' : 1, 'High' : 3 }})

http://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column

In [13]:
# Build a new frame in which every `start` equal to 0 becomes 1.
# All other columns, and the original `gff` frame, are left untouched.
start_fixed = gff['start'].replace(0, 1)
newgff = gff.assign(start=start_fixed)
In [15]:
# Write the corrected records as tab-separated text, without the DataFrame
# index or a column-header row.
# The '##gff-version 3' directive is written first: the GFF3 specification
# requires it as the first line of the file, and the load step skips every
# '#' line (comment='#'), so the frame itself carries no directive lines.
# newline='' lets pandas control the line endings of the rows it writes.
with open('epicapture_candidates.gff3', 'w', newline='') as gff_out:
    gff_out.write('##gff-version 3\n')
    newgff.to_csv(gff_out, sep='\t', index=False, header=False)
In [14]:
# Preview the first five records of the corrected frame.
newgff.head(n=5)
Out[14]:
seqid source type start end score strand phase attributes
0 MA_10 golden_set candidate_seq 22712 25712 . + . ID=MA_10g0010_prom
1 MA_10 golden_set Promoter 22712 24711 . + . ID=MA_10g0010_prom.Promoter;Parent=MA_10g0010_...
2 MA_10 golden_set CDS_start 24712 25712 . + . ID=MA_10g0010_prom.CDS;Parent=MA_10g0010_prom
3 MA_100058 golden_set candidate_seq 3890 6890 . + . ID=MA_100058g0010_prom
4 MA_100058 golden_set Promoter 3890 5889 . + . ID=MA_100058g0010_prom.Promoter;Parent=MA_1000...