Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

web_parse.py

import string
import copy
import operator
import urllib
import sgmllib
import Bio.File
import Martel
from mx import TextTools

"""
The LocusLink site is:
http://www.ncbi.nlm.nih.gov/LocusLink/
Parses a Locus web page.
"""

def is_empty_container( item ):
    response = 0
    if is_container( item ):
        if len( item ) == 0:
            response = 1
    return response

def is_container( item ):
    response = 0
    if type( item ) in [ type( [] ), type( {} ) ]:
        response = 1
    return response

def is_substring( a, b ):
    if( a.find( b ) < 0 ):
        return 0
    else:
        return 1

def print_params( params ):
    print '%s!!!!!\n' % 'PARAMS'
    for item in params:

        print 'param ' + str( item )
    print '-----------------'

def process_list( params ):
    len_params = len( params )
    container = []
    while 1:
        try:
            element = params.pop()
        except:
            break
        if is_close_token( element ): break
        elif is_open_token( element ):
            break
        else:
            container.append( element )
    return container

def put( dict, key, val ):
    if dict.has_key( key ):
        element = dict[ key ]
        dict[ key ] = [ element, val ]
    else:
        dict[ key ] = val


def process_dict( params ):
    container = {}
    while len( params ) > 0:
        element = params.pop()
        if type( element ) == type( {} ):
            for key, val in element.items():
                put( container, key, val )
        elif is_close_token( element ): break
        elif is_open_token( element ):
            params.append( element )
        else:
            val = params.pop()
            if type( val ) == type( [] ):
                if len( val ) == 1:
                    val = val[ 0 ]
                try:
                    put( container, element, val )
                except:
                    print 'Element'
                    print element
                    params.append( element )

            elif(  not is_close_token( val ) ):
                try:
                    put( container, element, val )
                except:
                    print 'Element'
                    print element
                    params.append( element )
            else:
                break
    return container

00098 class Token:
    def __init__( self, token  ):
        self.token = token

    def __eq__( self, other ):
        if not isinstance( other, self.__class__ ):
            return 0
        if self.token == other.token:
            return 1
        return 0

    def __ne__( self, other ):
        if not isinstance( other, Token ):
            return 1
        if self.token != other.token:
            return 1
        return 0

    def __str__( self ):
        output = 'token_%s\n' % self.token
        return output


open_list = Token( 'open_list' )
close_list = Token( 'close_list' )
open_dict = Token( 'open_dict' )
close_dict = Token( 'close_dict' )

def is_open_token( target ):
    answer = 0
    if isinstance( target, Token ):
        if ( open_list.__eq__( target ) ) or ( open_dict.__eq__(
target ) ):
            answer = 1
    return answer

def is_close_token( target ):
    answer = 0
    if isinstance( target, Token ):
        if ( close_list.__eq__( target ) ) or ( close_dict.__eq__(
target ) ):
            answer = 1
    return answer

def is_token( target ):
    return is_open_token( target ) or is_close_token( target )

00145 class Url:

    def __init__( self, url, label = '', description = '' ):
        self.url = url
        self.label = label
        self.description = description

    def __str__( self ):
        output = '%s\n' % self.label
        output = output + 'url = %s\n' % self.url
        output = output + '%s\n' % self.description
        return output


00159 class Record(dict):

    def __init__( self ):
        dict.__init__( self )

    def __str__( self ):
        queue_keys = self.keys()
        queue_keys.sort()
        out = ''
        for key in queue_keys:
            out = out +  '%s:\n' % key.upper()
            out = out + self.print_item( self[ key ] )
        out = out + '\n'

        return out

    def print_item( self, item, level = 1 ):
        indent = '    '
        out = ''
        for j in range( 0, level ):
            indent = indent + '    '
        if( type( item ) == type( '' ) ):
            if( item != '' ):
                out = out + '%s%s\n' % ( indent, item )
        elif( type( item ) == type([])):
            for subitem in item:
                out = out + self.print_item( subitem, level + 1 )
            out = out + '----------------------------------------------\n'
        elif( type( item ) == type ( {} ) ):
            keys = item.keys()
            keys.sort()
            for subitem in keys:
                out = out + '%skey is %s\n' % ( indent, subitem )
                out = out + self.print_item( item[ subitem ], level + 1 )
        elif( isinstance( item, dict ) ):
            keys = item.keys()
            keys.sort()
            for subitem in keys:
                out = out + '%skey is %s\n' % ( indent, subitem )
                out = out + self.print_item( item[ subitem ], level + 1 )
        else:
            out = out + '%s%s\n' % ( indent, str( item ) )
        return out


00204 class LocusLinkParser( sgmllib.SGMLParser ):

    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.text = ''
        self.record = Record()
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        self.outer_state = 'undefined'
        self.section_state = 'undefined'
        self.local_title = ''
        self.structure_stack = []
        self.category = ''
        self.context_chain = []
        self.outer_state_dict = { 'nomenclature' : 'nomenclature', 'overview' : 'overview', \
            'function' : 'function', \
            'relationships' : 'relationships', \
            'locus' : 'locus', \
            'map' : 'map', \
            'refseq' : 'refseq', \
            'genbank' : 'genbank', \
            'external' : 'external_annotation', \
           'additional' : 'additional_links' \
        }


    def parse( self, handle ):
        self.reset()
        self.feed( handle )
        return self.record

#
# Assumes an empty line between records
#
    def feed( self, handle ):
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        text = ''
        while 1:
            line = uhandle.readline()
            if not line:
                break
            text = text + ' ' + line

        sgmllib.SGMLParser.feed( self, text )

    def get_text( self ):
        text = self.text
        self.text = ''
        return text

    def handle_comment( self, comment ):
        while comment.startswith( '-' ):
            comment = comment[ 1: ]
        comment = comment.strip()
        comment = comment.lower()

        keys = self.outer_state_dict.keys()
        for key in keys:
            if comment.startswith( key ):
                if key in [ 'nomenclature', 'overview', 'function',
'relationships', 'map', 'locus', 'external' ]:
                    self.structure_stack.append( open_dict )
                elif key in [ 'genbank', 'additional' ]:
                    self.structure_stack.append( open_list )
                elif key in [ 'refseq' ]:
                    self.structure_stack.append( open_list )
                self.outer_state = key
                self.section_state = 'local_title'
                self.detail_state = 'undefined'
                if( key == 'refseq' ):
                    self.detail_state = 'waiting_category'
                else:
                    self.detail_state = 'waiting_key'
                break
        if comment.startswith( 'end' ):
            if is_substring( comment.lower(), self.outer_state ):
                if self.outer_state == 'refseq':
                    self.structure_stack.append( close_list )
                elif self.outer_state == 'function':
                    self.structure_stack.append( close_list )
                    self.structure_stack.append( close_dict )
                self.process_structure_stack()
                while 1:
                    try:
                        item = self.structure_stack.pop()
                    except:
                        item = 'Not Available'
                    if not is_token( item ) : break
                key = self.outer_state
                self.record[ self.outer_state_dict[ key ] ] = item
                self.outer_state = 'undefined'


    def handle_data(self, newtext ):
        newtext = string.strip( newtext )
        self.text = self.text + newtext

    def start_a( self, attrs ):
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_a'
        attr_dict = {}
        for key, val in attrs:
            attr_dict[ key ] = val
        outer_state = self.outer_state
        if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'refseq', 'additional', 'external' ] ):
            if self.section_state == 'local_contents':
                if self.detail_state in [ 'scan_val', 'unpaired_key' ]:
                    if attr_dict.has_key( 'href' ):
                        href = attr_dict[ 'href' ]
                        self.text = ''
                        self.structure_stack.append( Url( href, '' ) )
        elif outer_state == 'function':
            if self.section_state == 'local_contents':
                if self.detail_state in [ 'scan_val', 'unpaired_key', 'may_be_val' ]:
                    if attr_dict.has_key( 'href' ):
                        href = attr_dict[ 'href' ]
                        self.text = ''
                        self.structure_stack.append( Url( href, '' ) )


    def end_a( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        outer_state = self.outer_state
        if( outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
            if self.section_state == 'local_contents':
                if self.detail_state in [ 'scan_val', 'unpaired_key' ]:
                    text = self.get_text()
                    url = self.structure_stack.pop()
                    if isinstance( url, Url ):
                        url.label = text
                    self.structure_stack.append( url )

        elif outer_state == 'function':
            if self.section_state == 'local_contents':
                if self.detail_state in [ 'scan_val', 'unpaired_key',
'may_be_val' ]:
                    text = self.get_text()
                    url = self.structure_stack.pop()
                    if isinstance( url, Url ):
                        url.label = text
                    self.structure_stack.append( url )

    def start_b( self, attrs ):

        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_b'
        outer_state = self.outer_state
        if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
            self.text = ''



    def end_b( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        outer_state = self.outer_state
        if( outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'locus', 'map', 'refseq', 'genbank', 'additional', 'external' ] ):
            if self.section_state == 'local_contents':
                text = self.get_text()
                cols = text.split( ':', 1 )
                key = cols[ 0 ]
                if( outer_state == 'refseq' ):
                    self.structure_stack.append( cols[ 1 ] )
                    self.structure_stack.append( open_dict )
                    self.detail_state = 'waiting_key'
                elif outer_state == 'relationships':
                    self.structure_stack.append( key )
                    self.structure_stack.append( open_list )
                    self.detail_state = 'skip'
                elif outer_state == 'additional':
                    self.structure_stack.append( open_dict )
                    self.structure_stack.append( key )
                    self.structure_stack.append( open_list )
                    self.detail_state = 'unpaired_key'
                elif outer_state == 'function':
                    if self.detail_state != 'waiting_key':
                        self.structure_stack.append( close_list )
                    self.structure_stack.append( key )
                    self.detail_state = 'unpaired_key'
                    self.structure_stack.append( open_list )
                    self.structure_stack.append( open_list )
                    try:
                        val = cols[ 1 ]
                        if val.strip() != '':
                            self.structure_stack.append( val )
                            self.detail_state = 'unpaired_key'

                    except IndexError:
                        pass
                else:
                    if self.detail_state != 'waiting_key':
                        self.structure_stack.append( close_list )
                    self.detail_state = 'scan_val'
                    self.structure_stack.append( key )
                    self.structure_stack.append( open_list )
                    self.structure_stack.append( open_list )
                    try:
                        val = cols[ 1 ]
                        if val.strip() != '':
                            self.structure_stack.append( val )
                    except IndexError:
                        pass


    def start_th( self, attrs ):

        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_th'
        outer_state = self.outer_state
        self.text = ''
        if outer_state in [ 'function', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
            if self.section_state == 'local_contents':
                self.detail_state = 'scan_headings'



    def end_th( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        outer_state = self.outer_state
        if outer_state == 'refseq':
            if self.section_state == 'local_contents':
                text = self.get_text()
                cols = text.strip().split( ':', 1 )
                if text.strip().lower().startswith( 'category' ):
                    self.structure_stack.append( open_dict )
                    self.structure_stack.append( cols[ 1 ] )
                    self.structure_stack.append( open_list )
                    self.structure_stack.append( open_dict )
                    self.detail_state = 'found_category'

                elif self.detail_state in [ 'found_category', 'may_be_val' ]:
                    if  text.strip() != '':
                        if self.detail_state != 'found_category':
                            self.structure_stack.append( close_list )
                        cols  = text.split( ':' )
                        self.structure_stack.append( cols[ 0 ] )
                        self.structure_stack.append( open_list )
                        try:
                            val = cols[ 1 ]
                            self.structure_stack.append(  open_list )
                            self.structure_stack.append( val )
                            self.detail_state = 'scan_val'
                        except IndexError:
                            self.detail_state = 'may_be_val'





    def start_table( self, attrs ):
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_table'
        self.text = ''
        if self.outer_state == 'genbank':
            if self.section_state == 'local_contents':
                self.detail_state = 'skip'
        elif( self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'locus', 'map', 'genbank', 'additional', 'external' ] ):

            if self.section_state == 'local_contents':
                self.detail_state = 'waiting_key'

    def end_table( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        if( self.section_state == 'local_title' ):
            if self.outer_state == 'refseq':
                self.section_state = 'local_contents'
            elif self.outer_state == 'additional':
                self.section_state = 'local_contents'
                self.detail_state = 'scan_val'
            else:
                self.section_state = 'local_contents'
                self.detail_state = 'waiting_key'
        elif self.section_state == 'local_contents':
            if( self.outer_state in [  'nomenclature', 'relationships', 'locus', 'map', 'external' ] ):
                self.structure_stack.append( close_list )
            elif ( self.outer_state in [ 'genbank', 'additional' ] ):
                if self.detail_state == 'scan_val':
                    self.structure_stack.append( close_list )

            elif self.outer_state == 'refseq':
                if self.detail_state in ['may_be_val', 'scan_val' ]:
                    self.structure_stack.append( close_list )
                    self.structure_stack.append( close_dict )
                    self.structure_stack.append( close_list )
                    self.structure_stack.append( close_dict )
                    self.detail_state = 'scan_category'


    def start_tr( self, attrs ):
        top = self.open_tag
        self.open_tag_stack.append( self.open_tag )
        if top == 'open_table_row':
            if self.outer_state == 'refseq':
                if self.section_state == 'local_contents':
                    if self.detail_state in [ 'scan_val', ]:
                        self.structure_stack.append( close_list )
                        self.detail_state = 'may_be_val'
                        self.open_tag_stack.pop()
        self.open_tag = 'open_table_row'
        self.text = ''
        outer_state = self.outer_state
        if( outer_state in [   'relationships', 'locus', 'function', 'genbank', 'external'
] ):
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    self.structure_stack.append( open_list )
        elif outer_state == 'map':
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    self.structure_stack.append( open_list )

        elif outer_state == 'additional':
            if self.section_state == 'local_contents':
                self.detail_state = 'scan_val'
                self.structure_stack.append( open_list )


    def end_tr( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        if self.section_state == 'local_contents':
            if( self.outer_state in [  'overview', 'nomenclature', 'relationships',
'locus', 'genbank', 'external' ] ):
                if self.detail_state == 'scan_val':
                    self.structure_stack.append( close_list )
                elif self.detail_state == 'unpaired_key':
                    self.structure_stack.append( close_list )
                elif self.detail_state == 'skip':
                    self.detail_state = 'scan_val'
                elif self.detail_state == 'scan_headings':
                    self.detail_state = 'scan_val'
            elif self.outer_state in [ 'additional', ]:
                if self.detail_state == 'unpaired_key':
                    self.structure_stack.append( close_list )
                    self.structure_stack.append( close_dict )
                    self.structure_stack.append( close_list )
                elif self.detail_state == 'scan_val':
                    self.structure_stack.append( close_list )
            elif self.outer_state in [ 'function', ]:
                if self.detail_state == 'scan_headings':
                    self.detail_state = 'scan_val'
                elif self.detail_state == 'unpaired_key':
                    self.detail_state = 'may_be_val'
                    self.structure_stack.append( close_list )
                elif self.detail_state == 'scan_val':
                    self.detail_state = 'may_be_val'
                    self.structure_stack.append( close_list )
            elif self.outer_state in [ 'refseq', ]:
                if self.section_state == 'local_contents':
                    if self.detail_state == 'scan_val':
                        self.structure_stack.append( close_list )
                        self.detail_state = 'may_be_val'
            elif self.outer_state == 'map':
                if self.section_state == 'local_contents':
                    if self.detail_state == 'scan_val':
                        self.structure_stack.append( close_list )
                    self.detail_state = 'may_be_val'


    def start_td( self, attrs ):
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_table_data'
        if self.outer_state in [ 'nomenclature', 'overview', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
            if( self.section_state == 'local_contents' ):
                self.text = ''
        elif self.outer_state == 'refseq':
            if self.section_state == 'local_contents':
                self.text = ''
                if self.detail_state == 'may_be_val':
                    self.structure_stack.append( open_list )
                    self.detail_state = 'scan_val'

    def end_td( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'



        if self.outer_state in [ 'nomenclature', 'overview',  'relationships', 'locus', 'genbank', 'additional', 'external' ]:
            if( self.section_state == 'local_contents' ):
                if self.detail_state == 'scan_val':
                    text = self.get_text()
                    if( text != '' ):
                        self.structure_stack.append( text )
        elif self.outer_state == 'function':
            if self.section_state == 'local_contents':
                text = self.get_text()
                if( text != '' ):
                    if self.detail_state == 'may_be_val':
                        if text.strip() != '':
                            self.structure_stack.append( open_list )
                            self.detail_state = 'scan_val'
                    if self.detail_state in [ 'unpaired_key', 'scan_val' ]:
                        self.structure_stack.append( text )
        elif self.outer_state == 'map':
            if self.section_state == 'local_contents':
                text = self.get_text()
                if( text != '' ):
                    if self.detail_state == 'may_be_val':
                        if text.strip() != '':
                            self.structure_stack.append( open_list )
                            self.detail_state = 'scan_val'
                    if self.detail_state == 'scan_val':
                        self.structure_stack.append( text )
        elif self.outer_state == 'refseq':
            if self.section_state == 'local_contents':
                if self.detail_state == 'scan_val':
                    text = self.get_text()
                    if( text != '' ):
                        self.add_text_to_object( text )

    def do_br( self, attrs ):
        if self.outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'map', 'locus', 'genbank', 'additional', 'external' ]:
            if( self.section_state == 'local_contents' ):
                if self.detail_state == 'scan_val':
                    if self.is_contained_by( 'open_table_data' ):
                        text = self.get_text()
                        if( text != '' ):
                            self.structure_stack.append( text )


    def add_text_to_object( self, text ):
        stack_item = self.structure_stack.pop()
        if isinstance( stack_item, Url ):
            if stack_item.description == '':
                stack_item.description = text
            self.structure_stack.append( stack_item )
        else:
            self.structure_stack.append( stack_item )
            self.structure_stack.append( text )



    def is_contained_by( self, tag ):
        return tag in self.open_tag_stack

    def process_structure_stack( self ):
        params = []
        outer_state = self.outer_state
        if outer_state in [ 'nomenclature', 'overview', 'function', 'relationships', 'refseq', 'locus', 'map', 'genbank', 'additional', 'external' ]:
            while len( self.structure_stack ) > 1:
                len_stack = len( self.structure_stack )
#                self.print_stack()
                for i in range ( 0, len_stack ):
                    item = self.structure_stack.pop()
                    if not is_open_token( item ):
                        params.append( item )
                    else: break
                if( open_list.__eq__( item ) ):
                    container = process_list( params )
                    params.append( container )
                else:
                    container = process_dict( params )
                    if len( container ) > 0:
                        params.append( container )
                if ( len( self.structure_stack ) == 0  ) or is_open_token(
self.structure_stack[ -1 ] ):
                    for j in range( 0, len( params ) ):
                        item = params.pop()
                        self.structure_stack.append( item )
                    params = []


    def print_stack( self ):
        print '%s!!!!!\n' % self.outer_state.upper()
        for stack_item in self.structure_stack:
            print 'stack has ' + str( stack_item )
        print '-----------------'




if( __name__ == '__main__' ):
    handle = open( 'Hs13225.htm')
    undo_handle = Bio.File.UndoHandle( handle )
    locuslink_parser = LocusLinkParser()
    record = locuslink_parser.parse( handle )
    print record

Generated by  Doxygen 1.6.0   Back to index