Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package


# Copyright 2000 by Jeffrey Chang.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
This module is OBSOLETE.
Most of the functionality in this module has moved to Bio.ExPASy.Prodoc;
please see

Bio.ExPASy.Prodoc.read          To read a Prodoc file containing one entry.
Bio.ExPASy.Prodoc.parse         Iterates over entries in a Prodoc file.
Bio.ExPASy.Prodoc.Record        Holds Prodoc data.
Bio.ExPASy.Prodoc.Reference     Holds data from a Prodoc reference.

The other functions and classes in Bio.Prosite.Prodoc (including
Bio.Prosite.Prodoc.index_file and Bio.Prosite.Prodoc.Dictionary) are
considered deprecated, and were not moved to Bio.ExPASy.Prodoc. If you use
this functionality, please contact the Biopython developers at
biopython-dev@biopython.org to avoid permanent removal of this module from
Biopython.

This module provides code to work with the prosite.doc file from
Prosite, available at http://www.expasy.ch/prosite/.

Tested with:
Release 15.0, July 1998
Release 16.0, July 1999
Release 20.22, 13 November 2007

Functions:
parse              Iterates over entries in a Prodoc file.
index_file         Index a Prodoc file for a Dictionary.
_extract_record    Extract Prodoc data from a web page.

Classes:
Record             Holds Prodoc data.
Reference          Holds data from a Prodoc reference.
Dictionary         Accesses a Prodoc file using a dictionary interface.
RecordParser       Parses a Prodoc record into a Record object.

_Scanner           Scans Prodoc-formatted data.
_RecordConsumer    Consumes Prodoc data to a Record object.
"""

import warnings

# Emitted once at import time so that users of this module learn that its
# replacement lives in Bio.ExPASy.Prodoc.
warnings.warn(
    "This module is OBSOLETE. "
    "Most of the functionality in this module has moved to "
    "Bio.ExPASy.Prodoc.",
    PendingDeprecationWarning)

from types import *
import os
import sgmllib
from Bio import File
from Bio import Index
from Bio.ParserSupport import *

def parse(handle):
    """Iterate over the entries in a Prodoc file, yielding one Record each.

    handle is any iterable of text lines (typically an open file).  Lines
    are buffered until one starting with '{END}' is seen; the buffered
    text is then parsed with a RecordParser and the result is yielded.
    Lines following the last '{END}' marker are never parsed.
    """
    import cStringIO
    parser = RecordParser()
    lines = []
    for line in handle:
        lines.append(line)
        if line[:5] == '{END}':
            # Keep the per-record buffer under its own name so that the
            # ``handle`` argument is not rebound while it is being iterated.
            record_handle = cStringIO.StringIO("".join(lines))
            lines = []
            yield parser.parse(record_handle)

def read(handle):
    """Read a single Prodoc record from handle and return it.

    The record is parsed with a RecordParser.  A ValueError is raised if
    any data is still left on the handle once the record has been parsed.
    """
    record = RecordParser().parse(handle)
    # Anything still unread means the input held more than one record.
    if handle.read():
        raise ValueError("More than one Prodoc record found")
    return record

# It may be a good idea to rewrite read(), parse() at some point to avoid
# using the old-style "parser = RecordParser(); parser.parse(handle)" approach.

class Record:
    """Holds information from a Prodoc record.

    Members:
    accession      Accession number of the record.
    prosite_refs   List of tuples (prosite accession, prosite name).
    text           Free format text.
    references     List of reference objects.

    """
    def __init__(self):
        """Create an empty record; every field starts out blank."""
        self.accession = ''
        self.prosite_refs = []
        self.text = ''
        self.references = []

class Reference:
    """Holds information from a Prodoc citation.

    Members:
    number     Number of the reference. (string)
    authors    Names of the authors.
    citation   Describes the citation.

    """
    def __init__(self):
        """Create an empty reference; every field starts as an empty string."""
        self.number = ''
        self.authors = ''
        self.citation = ''

class Dictionary:
    """Accesses a Prodoc file using a dictionary interface.

    Keys are looked up in an index that maps each key to the
    (start, length) span of the corresponding record in the data file.
    Attributes not defined here are delegated to the underlying index.

    """
    # Reserved index key under which the path of the data file is stored.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prodoc Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        """Return the number of entries held by the underlying index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the record stored under key.

        The raw text is returned when no parser was given, otherwise
        whatever the parser's parse() method produces from that text.
        """
        start, length = self._index[key]
        # Position the file at the record before reading; without the seek
        # the read would continue from wherever the previous lookup stopped.
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is not None:
            return self._parser.parse(File.StringHandle(data))
        return data

    def __getattr__(self, name):
        """Delegate unknown attributes to the underlying index."""
        if name == '_index':
            # _index itself is missing (instance not initialised); looking
            # it up on self again would recurse without end.
            raise AttributeError(name)
        return getattr(self._index, name)

class RecordParser(AbstractParser):
    """Parses Prodoc data into a Record object.

    """
    def __init__(self):
        """Set up the scanner and the consumer it will drive."""
        self._scanner = _Scanner()
        self._consumer = _RecordConsumer()

    def parse(self, handle):
        """Scan the data on handle and return the consumer's ``data``."""
        self._scanner.feed(handle, self._consumer)
        return self._consumer.data

class _Scanner:
    """Scans Prodoc-formatted data.

    Tested with:
    Release 15.0, July 1998

    """
    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prodoc data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        # The scanner needs peek/undo support, so wrap plain handles.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        while 1:
            line = uhandle.peekline()
            if not line:
                # Nothing left to read.
                break
            elif is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
            else:
                self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """Scan one complete record, from its accession line to '{END}'."""
        consumer.start_record()

        self._scan_accession(uhandle, consumer)
        self._scan_prosite_refs(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{BEGIN}')
        self._scan_text(uhandle, consumer)
        self._scan_refs(uhandle, consumer)
        self._scan_copyright(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{END}')

        consumer.end_record()

    def _scan_accession(self, uhandle, consumer):
        """Pass the '{PDOC...}' accession line to the consumer."""
        read_and_call(uhandle, consumer.accession, start='{PDOC')

    def _scan_prosite_refs(self, uhandle, consumer):
        """Pass every Prosite cross-reference line to the consumer."""
        # NOTE(review): the '{PS' prefix is assumed from the format of
        # Prosite accessions (e.g. '{PS00001; NAME}') -- confirm.
        while attempt_read_and_call(uhandle, consumer.prosite_reference,
                                    start='{PS'):
            pass

    def _scan_text(self, uhandle, consumer):
        """Pass free-text lines to the consumer until a reference or '{END}'."""
        while 1:
            line = safe_readline(uhandle)
            # Slices (rather than single indexes) keep short lines from
            # raising IndexError; they are simply treated as text.
            if (line[:1] == '[' and line[3:5] == '] ') or \
               line[:5] == '{END}':
                # Not text any more: put the line back for the next stage.
                uhandle.saveline(line)
                break
            consumer.text(line)

    def _scan_refs(self, uhandle, consumer):
        """Pass reference lines to the consumer until a blank line or '{END}'."""
        while 1:
            line = safe_readline(uhandle)
            if line[:5] == '{END}' or is_blank_line(line):
                # End of the reference list: put the line back.
                uhandle.saveline(line)
                break
            consumer.reference(line)

    def _scan_copyright(self, uhandle, consumer):
        """Skip an optional '+----' delimited copyright box and blank lines."""
        # Cayte Lindner found some PRODOC records with the copyrights
        # appended at the end.  We'll try and recognize these.
        read_and_call_while(uhandle, consumer.noevent, blank=1)
        if attempt_read_and_call(uhandle, consumer.noevent, start='+----'):
            read_and_call_until(uhandle, consumer.noevent, start='+----')
            read_and_call(uhandle, consumer.noevent, start='+----')
        read_and_call_while(uhandle, consumer.noevent, blank=1)

class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prodoc record to a Record object.

    Members:
    data    Record with Prodoc data.

    """
    def __init__(self):
        self.data = None
        self._ref = None

    def start_record(self):
        """Begin a new, empty Record."""
        self.data = Record()
        # Forget the reference of any previous record so that a stray
        # continuation line cannot be attached to it.
        self._ref = None

    def end_record(self):
        """Finish the current Record by tidying up its references."""
        self._clean_data()

    def accession(self, line):
        """Store the accession from a '{PDOCnnnnn}' line."""
        line = line.rstrip()
        if line[0] != '{' or line[-1] != '}':
            raise ValueError("I don't understand accession line\n%s" % line)
        acc = line[1:-1]
        if acc[:4] != 'PDOC':
            raise ValueError("Invalid accession in line\n%s" % line)
        self.data.accession = acc

    def prosite_reference(self, line):
        """Store the (accession, name) pair from a '{acc; name}' line."""
        line = line.rstrip()
        if line[0] != '{' or line[-1] != '}':
            raise ValueError(
                "I don't understand prosite reference line\n%s" % line)
        acc, name = line[1:-1].split('; ')
        self.data.prosite_refs.append((acc, name))

    def text(self, line):
        """Append a line of free text to the record."""
        self.data.text = self.data.text + line

    def reference(self, line):
        """Handle one line of the reference list.

        A line starting with '[nn]' opens a new Reference; a line starting
        with four spaces continues the citation of the current one.  Any
        other line raises ValueError.
        """
        if line[0] == '[' and line[3] == ']':  # new reference
            self._ref = Reference()
            self._ref.number = line[1:3].strip()
            if line[1] == 'E':
                # If it's an electronic reference, then the URL is on the
                # line, instead of the author.
                self._ref.citation = line[4:].strip()
            else:
                self._ref.authors = line[4:].strip()
            self.data.references.append(self._ref)
        elif line[:4] == '    ':
            if not self._ref:
                raise ValueError("Unnumbered reference lines\n%s" % line)
            self._ref.citation = self._ref.citation + line[5:]
        else:
            raise ValueError(
                "I don't understand the reference line\n%s" % line)

    def _clean_data(self):
        # get rid of trailing newlines
        for ref in self.data.references:
            ref.citation = ref.citation.rstrip()
            ref.authors = ref.authors.rstrip()

def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the accession of the record will be used.

    Raises ValueError if filename does not exist, and KeyError if a
    record produces an empty key or a key that is already in the index.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        # Feed parse() through readline() rather than iterating the file
        # object directly: file iteration uses a read-ahead buffer, which
        # makes handle.tell() useless for computing record offsets.
        records = parse(iter(handle.readline, ''))
        end = 0
        for record in records:
            # A record spans from the end of the previous one up to the
            # current file position (just past its '{END}' line).
            start = end
            end = handle.tell()
            length = end - start

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.accession

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, length
    finally:
        handle.close()


Generated by  Doxygen 1.6.0   Back to index