Logo Search packages:      
Sourcecode: python-biopython version File versions

fasta_dictionary.py

# The New Way
# ===========
# This next bit of code use Bio.SeqIO to parse a FASTA file, and turn
# it into an in-memory python dictionary.
# Note Bio.SeqIO will use a generic alphabet for the SeqRecord objects.

from Bio import SeqIO

def get_accession_num(seq_record):
    accession_atoms = seq_record.id.split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning
    return gb_name[:-2]

orchid_dict = SeqIO.to_dict(SeqIO.parse(open("ls_orchid.fasta"),"fasta"), \
                            get_accession_num)

for id_num in orchid_dict.keys():
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq

# The Old Way
# ===========
# This next bit of code still works fine. It uses Bio.Fasta instead,
# and builds an index as a set of files on disc in the sub-directory
# my_orchid_dict.idx
# Note that the alphabet is explicitly defined for the sequences.

import os
from Bio import Fasta
from Bio.Alphabet import IUPAC

def get_accession_num(fasta_record):
    title_atoms = fasta_record.title.split()
    accession_atoms = title_atoms[0].split('|')
    gb_name = accession_atoms[3]
    # strip the version info before returning
    return gb_name[:-2]

if not os.path.isdir("my_orchid_dict.idx") :
    #Build a new index
    Fasta.index_file("ls_orchid.fasta", "my_orchid_dict.idx",
                     get_accession_num)
else :
    print "Reusing existing index"

dna_parser = Fasta.SequenceParser(IUPAC.ambiguous_dna)

orchid_dict = Fasta.Dictionary("my_orchid_dict.idx", dna_parser)

for id_num in orchid_dict.keys():
    print 'id number:', id_num
    print 'description:', orchid_dict[id_num].description
    print 'sequence:', orchid_dict[id_num].seq

Generated by  Doxygen 1.6.0   Back to index