Let's say we're interested in getting information about a SNP located on chromosome 2R at position 28,492,890.
# import modules
import allel
import pandas
I downloaded the An. gmabiae gff from VectorBase here
gff_location = '/home/sean/Downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.gff3.gz'
def get_gff_info(position, chrom, gff_path):
"""This return returns all information for a given
position on a given chromosome where the supplied gff
has information for it"""
# Thanks to Alistair for this conversion function
# (http://alimanfoo.github.io/2017/01/25/vgsc-gene-models.html)
# slightly altered to account for deprecated methods
def geneset_to_pandas(geneset):
"""Life is a bit easier when a geneset is a pandas DataFrame."""
items = []
for n in geneset.dtype.names:
v = geneset[n]
# convert bytes columns to unicode (which pandas then converts to object)
if v.dtype.kind == 'S':
v = v.astype('U')
items.append((n, v))
return pandas.DataFrame.from_dict(dict(items))
gff = allel.FeatureTable.from_gff3(gff_path,
attributes=['ID', 'Parent'])
gff = geneset_to_pandas(gff)
_bool = gff.apply(lambda row: position in range(
row['start'], row['end']) and row['seqid'] == chrom, axis=1)
result = gff[_bool]
return result
chr2R_28492890 = get_gff_info(
position=28492890, chrom='2R', gff_path=gff_location)
chr2R_28492890
Hope that's of some use to someone. If there's an actual implemented function for this, please do let me know.
Happy coding,
Sean