Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

cdd_format.py

# Copyright 2001 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Martel based parser to read CDD formatted files.

This is a huge regular expression for CDD, built using
the 'regular expressions on steroids' capabilities of Martel.

http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml
Notes:
Just so I remember -- the new end of line syntax is:
  New regexp syntax - \\R
     \\R    means "\\n|\\r\\n?"
     [\\R]  means "[\\n\\r]"

This helps us have endlines be consistent across platforms.

"""

# standard library
import string

# Martel
import Martel
from Martel import RecordReader
from Martel import Str
from Martel import AnyEol
from Martel import ToEol
from Martel import Group
from Martel import Alt
from Martel import Opt
from Martel import Rep
from Martel import Rep1
from Martel import Any
from Martel import AnyBut
from Martel import Assert
from Martel import AssertNot





# --- first set up some helper constants and functions
# Copyright 2002 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

# Character classes and small building blocks shared by the rest of
# the grammar.
upper_alpha = Any("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
white_space = Any("\t ")

# Carriage return followed by line feed (character codes 13 and 10);
# used further down as part of an AnyBut() character set.
eols = "\r\n"

# A run of blanks and/or tabs, built by repeating white_space.
white_spaces = Rep(white_space)

# The line that begins with the literal text "CD summary".
summary_line = Str("CD summary") + ToEol()

def _tag(name, text):
    """Return a group called *name* that matches the literal *text*."""
    return Group(name, Str(text))


# One named group per section label of a CDD record.
cd_tag = _tag("cd_tag", "CD:")
description_tag = _tag("description_tag", "Description:")
status_tag = _tag("status_tag", "CD status:")
source_tag = _tag("source_tag", "Source:")
date_tag = _tag("date_tag", "Created:")
reference_tag = _tag("reference_tag", "References:")
taxonomy_tag = _tag("taxonomy_tag", "Taxonomy spanned:")
aligned_tag = _tag("aligned_tag", "Aligned sequences:")
representative_tag = _tag("representative_tag", "Representative:")
range_tag = _tag("range_tag", "Aligned range:")
sequence_tag = _tag("sequence_tag", "Sequence:")

# Alternation over every section label defined above.
has_tag = Alt(
    cd_tag,
    description_tag,
    status_tag,
    source_tag,
    date_tag,
    reference_tag,
    taxonomy_tag,
    aligned_tag,
    representative_tag,
    range_tag,
    sequence_tag
)

def _key_line(tag):
    """Return *tag* followed by white_spaces and an end of line."""
    return tag + white_spaces + AnyEol()


# Header line of each section: the section label alone on its line.
cd_key_line = _key_line(cd_tag)
description_key_line = _key_line(description_tag)
status_key_line = _key_line(status_tag)
source_key_line = _key_line(source_tag)
date_key_line = _key_line(date_tag)
reference_key_line = _key_line(reference_tag)
taxonomy_key_line = _key_line(taxonomy_tag)
aligned_key_line = _key_line(aligned_tag)
representative_key_line = _key_line(representative_tag)
range_key_line = _key_line(range_tag)
sequence_key_line = _key_line(sequence_tag)

def _untagged_line():
    """Return a new expression: a has_tag negative assertion, then ToEol().

    A separate expression object is built on every call, so each
    *_contents_line name below owns its own expression.
    """
    return AssertNot(has_tag) + ToEol()


# Contents lines of each section.  Of the untagged lines, only the one
# for the "CD:" section is wrapped in a named group.
cd_contents_line = Group("cd_contents_line", _untagged_line())
description_contents_line = _untagged_line()
status_contents_line = _untagged_line()
source_contents_line = _untagged_line()
date_contents_line = _untagged_line()
reference_contents_line = _untagged_line()
taxonomy_contents_line = _untagged_line()
aligned_contents_line = _untagged_line()
representative_contents_line = _untagged_line()
range_contents_line = _untagged_line()

# Sequence lines are stricter: a run of upper-case letters, with
# white_spaces on either side, then the end of line.
sequence_contents_line = Group(
    "sequence_contents_line",
    white_spaces + Rep1(upper_alpha) + white_spaces + AnyEol())
# The line carrying the word "Definition"; cdd_record places the table
# directly after it.
sentinel_line = white_spaces + Str("Definition") + white_spaces + AnyEol()

# Any line that is not the sentinel line.
boiler_plate = AssertNot(sentinel_line) + ToEol()

# A definition line: characters other than '[' or end-of-line characters,
# then the literal "[CD]" closing the line.
definition_line = Group(
    "definition_line",
    Rep(AnyBut(eols + '[')) + Str('[CD]') + white_spaces + AnyEol())

# Lines preceding a definition line inside a table entry.
pdb_id_line = AssertNot(definition_line) + ToEol()
pdb_id_multiline = Group("pdb_id_multiline", Rep1(pdb_id_line))

# A table entry is one or more pdb id lines closed by a definition line;
# the table is one or more such entries.
table_entry = Group("table_entry", pdb_id_multiline + definition_line)
table = Group("table", Rep1(table_entry))

def _multiline(name, line):
    """Return a group called *name* wrapping a repetition of *line*."""
    return Group(name, Rep(line))


# Body of each section: its contents line repeated, under a named group.
cd_contents_multiline = _multiline(
    "cd_contents_multiline", cd_contents_line)
description_contents_multiline = _multiline(
    "description_contents_multiline", description_contents_line)
status_contents_multiline = _multiline(
    "status_contents_multiline", status_contents_line)
source_contents_multiline = _multiline(
    "source_contents_multiline", source_contents_line)
date_contents_multiline = _multiline(
    "date_contents_multiline", date_contents_line)
reference_contents_multiline = _multiline(
    "reference_contents_multiline", reference_contents_line)
taxonomy_contents_multiline = _multiline(
    "taxonomy_contents_multiline", taxonomy_contents_line)
aligned_contents_multiline = _multiline(
    "aligned_contents_multiline", aligned_contents_line)
representative_contents_multiline = _multiline(
    "representative_contents_multiline", representative_contents_line)
range_contents_multiline = _multiline(
    "range_contents_multiline", range_contents_line)
sequence_contents_multiline = _multiline(
    "sequence_contents_multiline", sequence_contents_line)

def _section(key_line, contents):
    """Return *key_line* followed by *contents*."""
    return key_line + contents


# A section is its key line followed by its contents.
cd_block = _section(cd_key_line, cd_contents_multiline)
description_block = _section(
    description_key_line, description_contents_multiline)
status_block = _section(status_key_line, status_contents_multiline)
source_block = _section(source_key_line, source_contents_multiline)
date_block = _section(date_key_line, date_contents_multiline)

# The references section additionally starts with an assertion on its tag.
reference_block = (
    Assert(reference_tag)
    + reference_key_line
    + reference_contents_multiline
)

taxonomy_block = _section(taxonomy_key_line, taxonomy_contents_multiline)
aligned_block = _section(aligned_key_line, aligned_contents_multiline)
representative_block = _section(
    representative_key_line, representative_contents_multiline)
range_block = _section(range_key_line, range_contents_multiline)
sequence_block = _section(sequence_key_line, sequence_contents_multiline)

# Not referenced by cdd_record in this file.
trailer_line = ToEol()

# The complete record: the summary line, the sections in fixed order,
# then everything up to the "Definition" line, and finally the table.
cdd_record = (
    summary_line
    + cd_block
    + description_block
    + status_block
    + source_block
    + date_block
    + Opt(reference_block)  # the references section is optional
    + taxonomy_block
    + aligned_block
    + representative_block
    + range_block
    + sequence_block
    + Rep(boiler_plate)
    + sentinel_line
    + table
)










Generated by  Doxygen 1.6.0   Back to index