# Logo Search packages:
# Sourcecode: python-biopython version File versions  Download package


# Copyright 2008 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
# This module is for reading and writing IntelliGenetics format files as
# SeqRecord objects.  This file format appears to be the same as the MASE
# multiple sequence alignment format.

"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.

You are expected to use this module via the Bio.SeqIO functions."""

from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

#This is a generator function!
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The first line after a record's commentary is taken as its title, and
    is used (with trailing whitespace removed) as both the id and the name
    of the SeqRecord.  All following lines up to the next line starting
    with a semi-colon (or the end of the file) are joined to form the
    sequence, with trailing whitespace and spaces removed.

    Raises ValueError if a record does not start with a semi-colon line.
    """
    #Skip any file header text before the first record (;; lines)
    while True:
        line = handle.readline()
        if not line:
            break  #Premature end of file, or just empty?
        if not line.startswith(";;"):
            break

    while line:
        #Now iterate over the records
        if line[0] != ";":
            raise ValueError(
                  "Records should start with ';' and not:\n%s" % repr(line))

        #Try and agree with SeqRecord convention from the GenBank parser,
        #(and followed in the SwissProt parser) which stores the comments
        #as a long string with newlines under annotations key 'comment'.

        #Note some examples use "; ..." and others ";...", so drop the
        #leading semi-colon and then strip the surrounding whitespace.
        comment_lines = []
        while line.startswith(";"):
            #TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        #First non-comment line is the record title
        title = line.rstrip()

        seq_lines = []
        while True:
            line = handle.readline()
            if not line:
                break  #End of file
            if line[0] == ";":
                break  #Start of the next record's commentary
            #Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
        #Return the record and then continue...
        record = SeqRecord(Seq("".join(seq_lines), alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record
    #We should be at the end of the file now
    assert not line

if __name__ == "__main__":
    #Quick self test: parse every ".txt" file in the IntelliGenetics test
    #directory and print each record's identifier and sequence length.
    #Single-argument print(...) calls give the same output whether print
    #is a statement (Python 2) or a function (Python 3).
    print("Running quick self test")
    import os
    for filename in os.listdir("../../Tests/Intelligenetics/"):
        if os.path.splitext(filename)[-1] == ".txt":
            print(filename)
            print("-"*len(filename))
            handle = open(os.path.join("../../Tests/Intelligenetics/", filename))
            #Make sure the file is closed even if parsing raises an error
            try:
                for record in IgIterator(handle):
                    print("%s %s" % (record.id, len(record)))
            finally:
                handle.close()
    print("Done")

# Generated by  Doxygen 1.6.0   Back to index