Blue Collar Bioinformatics

import pprint
from BCBio.GFF.GFFParser import GFFExaminer
gff_examiner = GFFExaminer()
possible_limits = gff_examiner.available_limits(gff_file)
pprint.pprint(possible_limits)

{'gff_id': {('I',): 159,
            ('X',): 6},
 'gff_source_type': {('Coding_transcript', 'CDS'): 27,
                     ('Coding_transcript', 'exon'): 33,
                     ('Coding_transcript', 'five_prime_UTR'): 4,
                     ('Coding_transcript', 'gene'): 2,
                     ('Coding_transcript', 'intron'): 29,
                     ('Coding_transcript', 'mRNA'): 4,
                     ('Coding_transcript', 'three_prime_UTR'): 3,
                     ('mass_spec_genome', 'translated_nucleotide_match'): 7},
 'gff_type': {('CDS',): 57,
              ('exon',): 33,
              ('five_prime_UTR',): 4,
              ('gene',): 2,
              ('intron',): 29,
              ('mRNA',): 4,
              ('three_prime_UTR',): 3,
              ('translated_nucleotide_match',): 7}}

import pprint
from BCBio.GFF.GFFParser import GFFExaminer
gff_examiner = GFFExaminer()
pc_map = gff_examiner.parent_child_map(gff_file)
pprint.pprint(pc_map)

{('Coding_transcript', 'gene'): [('Coding_transcript', 'mRNA')],
 ('Coding_transcript', 'mRNA'): [('Coding_transcript', 'CDS'),
                                 ('Coding_transcript', 'exon'),
                                 ('Coding_transcript', 'five_prime_UTR'),
                                 ('Coding_transcript', 'intron'),
                                 ('Coding_transcript', 'three_prime_UTR')]}

3_336_815_F3    solid   read    55409   55428   10.4    +       .       i=1

{'id': '',
 'is_gff2': False,
 'location': [55408, 55428],
 'quals': {
           'i': ['1'],
           'score': ['10.4'],
           'source': ['solid']},
 'rec_id': '3_336_815_F3',
 'strand': 1,
 'type': 'read'}

from Bio import SeqIO
from BCBio.GFF.GFFParser import GFFAddingIterator

class SolidFastaRemap:
    def __init__(self, initial_fasta):
        self._in_map = self._get_index_map(initial_fasta)

    def _get_index_map(self, in_file):
        in_map = dict()
        in_handle = open(in_file)
        for index, rec in enumerate(SeqIO.parse(in_handle, "fasta")):
            in_map[index] = rec.id
        in_handle.close()
        return in_map

    def adjust_fn(self, results):
        # 1-based indexes; convert to 0-based
        rec_index = int(results['quals']['i'][0]) - 1
        read_name = results['rec_id']
        results['quals']['read_name'] = [read_name]
        results['rec_id'] = self._in_map[rec_index]
        return results

remapper = SolidFastaRemap(fasta_file)
gff_iterator = GFFAddingIterator(line_adjust_fn=remapper.adjust_fn)
rec_dict = gff_iterator.get_all_features(gff_file)

from BCBio.GFF.GFFParser import GFFAddingIterator

gff_iterator = GFFAddingIterator()
for rec_dict in gff_iterator.get_features(gff_file, target_lines=3000000):
    for rec in rec_dict.values():
        # deal with rec.features    

from BCBio.GFF.GFFParser import GFFAddingIterator
from Bio import SeqIO

with open(seq_file) as seq_handle:
    seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
gff_iterator = GFFAddingIterator(seq_dict)
cds_limit_info = dict(
        gff_types = [('Coding_transcript', 'gene'),
                     ('Coding_transcript', 'mRNA'),
                     ('Coding_transcript', 'CDS')],
        gff_id = ['I']
        )
for rec_dict in gff_iterator.get_features(gff_file,
        limit_info=cds_limit_info, target_lines=1000000):
    for rec in rec_dict.values():
        # deal with rec.features    

  ID=CDS:B0019.1;Parent=Transcript:B0019.1;locus=amx-2

  Transcript "B0019.1" ; WormPep "WP:CE40797" ; Note "amx-2"

def _gff_line_map(line, params):
    strand_map = {'+' : 1, '-' : -1, '?' : None, None: None}
    line = line.strip()
    if line[0] != "#":
        parts = line.split('\t')
        should_do = True
        if params.limit_info:
            for limit_name, limit_values in params.limit_info.items():
                cur_id = tuple([parts[i] for i in 
                    params.filter_info[limit_name]])
                if cur_id not in limit_values:
                    should_do = False
                    break

        if should_do:
            assert len(parts) == 9, line
            gff_parts = [(None if p == '.' else p) for p in parts]
            gff_info = dict()
            # collect all of the base qualifiers for this item
            quals = collections.defaultdict(list)
            if gff_parts[1]:
                quals["source"].append(gff_parts[1])
            if gff_parts[5]:
                quals["score"].append(gff_parts[5])
            if gff_parts[7]:
                quals["phase"].append(gff_parts[7])
            for key, val in [a.split('=') for a in gff_parts[8].split(';')]:
                quals[key].extend(val.split(','))
            gff_info['quals'] = dict(quals)
            gff_info['rec_id'] = gff_parts[0]
            # if we are describing a location, then we are a feature
            if gff_parts[3] and gff_parts[4]:
                gff_info['location'] = [int(gff_parts[3]) - 1,
                        int(gff_parts[4])]
                gff_info['type'] = gff_parts[2]
                gff_info['id'] = quals.get('ID', [''])[0]
                gff_info['strand'] = strand_map[gff_parts[6]]
                # Handle flat features
                if not gff_info['id']:
                    final_key = 'feature'
                # features that have parents need to link so we can pick up
                # the relationship
                elif gff_info['quals'].has_key('Parent'):
                    final_key = 'child'
                # top level features
                else:
                    final_key = 'parent'
            # otherwise, associate these annotations with the full record
            else:
                final_key = 'annotation'
            return [(final_key, (simplejson.dumps(gff_info) if params.jsonify
                else gff_info))]

def _gff_line_reduce(map_results, out, params):
    final_items = dict()
    for gff_type, final_val in map_results:
        send_val = (simplejson.loads(final_val) if params.jsonify else 
                final_val)
        try:
            final_items[gff_type].append(send_val)
        except KeyError:
            final_items[gff_type] = [send_val]
    for key, vals in final_items.items():
        out.add(key, (simplejson.dumps(vals) if params.jsonify else vals))

from Bio import SeqIO
from BCBio.SeqIO.GFFIO import GFFFeatureAdder

with open(seq_file) as seq_handle:
    seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
feature_adder = GFFFeatureAdder(seq_dict)
cds_limit_info = dict(
        gff_types = [('Coding_transcript', 'gene'),
                     ('Coding_transcript', 'mRNA'),
                     ('Coding_transcript', 'CDS')],
        gff_id = ['I']
        )
with open(gff_file) as gff_handle:
    feature_adder.add_features(gff_handle, cds_limit_info)
final_rec = feature_adder.base['I']

NM_001078975    GenBank gene    1       1847    .       +       .       
ID=cbx8;Dbxref=GeneID:779897;Note=chromobox homolog 8;gene=cbx8;
gene_synonym=MGC147589

     gene            1..1847
                     /gene="cbx8"
                     /gene_synonym="MGC147589"
                     /note="chromobox homolog 8"
                     /db_xref="GeneID:779897"

bioentry
biosequence
bioentry
bioentry_qualifer_value
bioentry
bioentry
bioentry
bioentry_qualifer_value
taxon
reference
comment

GenBank	BioSQL table	Current BioPerl GFF	Proposed GFF key/value
LOCUS; identifier ACCESSION	bioentry name	ID
LOCUS; Molecule type	biosequence alphabet	mol_type
LOCUS; division	bioentry division		division
LOCUS; date	bioentry_qualifer_value term date_changed	date
DEFINITION	bioentry description	Note, but combined with COMMENT	description
VERSION	bioentry accession and version		hasVersion
GI	bioentry identifier		identifier
KEYWORDS	bioentry_qualifer_value term keywords		subject
SOURCE and ORGANISM	taxon	organism and Dbxref to taxon ID	Full lineage needs representation as well
REFERENCE	reference		Dbxref for PubMed IDs; need to store full reference information as well
COMMENT	comment	comment1 and Note, combined with DEFINITION	comment1 only

Blue Collar Bioinformatics

Posts Tagged ‘gff’

Examining and adjusting your GFF file

Examining a GFF file

Adjusting GFF lines during parsing

More python GFF parsing — iterative parsing and GFF2 nested features

Iterative parsing of GFF

Nested features for GFF2

MapReduce implementation of GFF parsing for Biopython

Initial GFF parser for Biopython

Implementation

Testing

BioSQL

Summary

Exploring BioPerl GenBank to GFF mapping

Recent Posts

Blue Collar Bioinformatics

Posts Tagged ‘gff’

Examining and adjusting your GFF file

Examining a GFF file

Adjusting GFF lines during parsing

More python GFF parsing — iterative parsing and GFF2 nested features

Iterative parsing of GFF

Nested features for GFF2

Python GFF parser update — parallel parsing and GFF2

Parallel parsing

GFF2 support

MapReduce implementation of GFF parsing for Biopython

Initial GFF parser for Biopython

Implementation

Testing

BioSQL

Summary

Exploring BioPerl GenBank to GFF mapping

Recent Posts