# Copyright 2008 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
# This module is for reading and writing IntelliGenetics format files as
# SeqRecord objects. This file format appears to be the same as the MASE
# multiple sequence alignment format.

"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.

You are expected to use this module via the Bio.SeqIO functions."""

from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# This is a generator function!
def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file (any object with a readline() method)
    alphabet - optional alphabet, passed to each Seq that is created

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    The first line after a record's commentary is its title; with trailing
    whitespace removed it is used as both the SeqRecord id and name.  All
    following lines, up to the next line starting with a semi-colon (or the
    end of the file), are the sequence; trailing whitespace and embedded
    spaces are removed before the lines are joined.

    Raises ValueError if the line where a record is expected to begin does
    not start with a semi-colon.
    """
    # Skip any file header text before the first record (;; lines).
    # readline() returns "" at end of file, which also ends this loop, so
    # an empty or header-only file simply yields no records.
    line = handle.readline()
    while line.startswith(";;"):
        line = handle.readline()

    # Now iterate over the records
    while line:
        # Only the first character matters here: the line still carries
        # its newline and usually comment text after the semi-colon.
        if not line.startswith(";"):
            raise ValueError(
                "Records should start with ';' and not:\n%s" % repr(line))

        # Try and agree with SeqRecord convention from the GenBank parser,
        # (and followed in the SwissProt parser) which stores the comments
        # as a long string with newlines under annotations key 'comment'.
        # Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        title = line.rstrip()

        # Collect sequence lines until the next record's first comment
        # line (left in `line` for the next pass) or the end of the file.
        seq_lines = []
        line = handle.readline()
        while line and not line.startswith(";"):
            # Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
            line = handle.readline()

        # Return the record and then continue...
        record = SeqRecord(Seq("".join(seq_lines), alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    # We should be at the end of the file now
    assert not line


if __name__ == "__main__":
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("Running quick self test")

    import os
    for filename in os.listdir("../../Tests/Intelligenetics/"):
        if os.path.splitext(filename)[-1] == ".txt":
            print("")
            print(filename)
            print("-" * len(filename))
            handle = open(os.path.join("../../Tests/Intelligenetics/",
                                       filename))
            # Make sure the file is closed even if parsing fails part way.
            try:
                for record in IgIterator(handle):
                    print("%s %i" % (record.id, len(record)))
            finally:
                handle.close()
    print("Done")