Logo Search packages:      
Sourcecode: python-biopython version File versions

convert_re.py

# Copyright 2000-2001, Dalke Scientific Software, LLC
# Distributed under the Biopython License Agreement (see the LICENSE file).

"""Converts a regular expression pattern string into an Expression tree.

This is not meant to be an externally usable module.

This works by using msre_parse.py to parse the pattern.  The result is
a tree data structure, where the nodes in the tree are tuples.  The
first element of the tuple is the name of the node type.  The format
of the other elements depends on the type.

The conversion routine is pretty simple - convert each msre_parse tuple
node into a Martel Expression node.  It's a recusive implementation.

'msre_parse.py' is a modified version of Secret Labs' 'sre_parse.py'

"""

import string
import msre_parse, Expression

# The msre_parse parser uses a "master pattern object" which keeps
# track of the mapping from group id to group name.  This is okay for
# msre_parse because they only track the last group with a given name.
# I need to get all groups with the same name, so I need a new object
# which stores them in a list that I can use later.

class GroupNames:
    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}
    def opengroup(self, name=None):
        gid = self.groups
        self.groups = gid + 1
        if name:
            self.groupdict[name] = self.groupdict.get(name, []) + [gid]
        self.open.append(gid)
        return gid
    def closegroup(self, gid):
        self.open.remove(gid)
    def checkgroup(self, gid):
        return gid < self.groups and gid not in self.open

    def reverse_name(self, id):
        """group number -> group name, or None if there is no name"""
        for key, val in self.groupdict.items():
            if id in val:
                return key
        # Ignore non-named groups



# Convert a 'literal' tuple into a Literal object
def convert_literal(group_names, name, val):
    return Expression.Literal(chr(val), 0)

# Convert a 'not_literal' tuple into a Literal object
def convert_not_literal(group_names, name, val):
    return Expression.Literal(chr(val), 1)

# Convert an 'at_beginning" tuple into an AtBeginning object
# Convert an 'at_end" tuple into an AtEnd object
def convert_at(group_names, name, where):
    if where == "at_beginning":
        return Expression.AtBeginning()
    elif where == "at_end":
        return Expression.AtEnd()
    raise AssertionError("Unknown at name: %s" % repr(where))

# Convert an 'any' tuple into a Dot object
def convert_any(group_names, name, ignore):
    assert ignore is None, "what does it mean when the field is '%s'?" % ignore
    return Expression.Dot()

# Convert an 'assert' tuple into a Assert object, as a positive assertion
def convert_assert(group_names, name, (direction, terms)):
    assert direction == 1, "does not support lookbehind"
    return Expression.Assert(convert_list(group_names, terms), 0)

# Convert an 'assert_not' tuple into a Assert object, as a negative assertion
def convert_assert_not(group_names, name, (direction, terms)):
    assert direction == 1, "does not support lookbehind"
    return Expression.Assert(convert_list(group_names, terms), 1)


# Convert a 'branch' tuple into an Alt object
def convert_branch(group_names, name, (ignore, branches)):
    assert ignore is None, "what is %s?" % repr(ignore)
    results = []
    for branch in branches:
        results.append(convert_list(group_names, branch))
    if len(results) == 1:
        return results[0]
    return Expression.Alt(tuple(results))

# I know, it's only good for ASCII...
def invert(s):
    """s -> a string containing all the characters not present in s"""
    letters = []
    if not(isinstance(s, type(""))):
        s = str(s)
    for c in map(chr, range(256)):
        if c not in s:
            letters.append(c)
    return string.join(letters, "") 

# Map from the msre_parse category names into actual characters.
# I can do this here since I don't worry about non-ASCII character sets.
categories = {
    "category_word": string.letters + "0123456789_",
    "category_digit": string.digits,
    "category_space": "\t\n\v\f\r ",
    "category_newline": "\n\r",
 
    "category_not_word": invert(string.letters + "0123456789_"),
    "category_not_digit": invert(string.digits),
    "category_not_space": invert("\t\n\v\f\r "),
    }

# Convert an 'in' tuple into an Any object
# Pass in the negate flag if given.
def convert_in(group_names, name, terms):
    negate = (terms[0][0] == 'negate')
    s = ""
    for c in terms[negate:]:
        if c[0] == 'literal':
            s = s + chr(c[1])
        elif c[0] == 'range':
            for i in range(c[1][0], c[1][1]+1):
                s = s + chr(i)
        elif c[0] == 'category':
            s = s + categories[c[1]]
        else:
            raise AssertionError("unknown option for 'in': %s" % c[0])
    return Expression.Any(s, negate)


# Convert a 'subpattern' tuple into a Group object
def convert_subpattern(group_names, name, (id, terms)):
    pattern_name = group_names.reverse_name(id)

    # The name in the ?P<group> may contain attr information
    # serialized in a URL-encoded form; if present, deconvolute.
    pos = -1
    attrs = {}
    if pattern_name is not None:
        pos = string.find(pattern_name, "?")
    
    if pos != -1:
        import cgi
        qs = pattern_name[pos+1:]
        if not qs:
            # cgi.parse_qs doesn't like parsing the empty string
            attrs = {}
        else:
            attrs = cgi.parse_qs(pattern_name[pos+1:],
                                 keep_blank_values = 1,
                                 strict_parsing = 1)
        pattern_name = pattern_name[:pos]

        for k, v in attrs.items():
            if len(v) != 1:
                raise AssertionError(
"The attribute name %s was found more than once (%d times) in the tag %s" %
            (repr(k), len(v), repr(pattern_name)))

            attrs[k] = v[0]
    
    return Expression.Group(pattern_name, convert_list(group_names, terms),
                            attrs)

# Convert a 'newline' tuple into an AnyEol object
def convert_newline(group_names, name, ignore):
    assert ignore is None, "what does it mean when field is %s?" % `ignore`
    return Expression.AnyEol()

# Convert a 'max_repeat' tuple into a MaxRepeat object
def convert_max_repeat(group_names, name, (min_count, max_count, terms)):
    return Expression.MaxRepeat(convert_list(group_names, terms),
                                min_count, max_count)

# Convert a 'groupref' tuple into a GroupRef object
def convert_groupref(group_names, name, id):
    assert type(id) != type(0), \
           "Martel cannot use numbered group reference: %d" % id
    # msre_parse returns the list from the GroupNames
    # Map that back to a number.
    pattern_name = group_names.reverse_name(id[0])
    return Expression.GroupRef(pattern_name)

# Map from the tuple typename into the function used to convert the
# tuple into an Expression.

converter_table = {
    "any": convert_any,
    "assert": convert_assert,
    "assert_not": convert_assert_not,
    "at": convert_at,
    "branch": convert_branch,
    "groupref": convert_groupref,
    "in": convert_in,
    "literal": convert_literal,
    "max_repeat":  convert_max_repeat,
    "newline": convert_newline,
    "not_literal": convert_not_literal,
    "subpattern":  convert_subpattern,
    }

# Convert a list of msre_parse tuples into a Seq
def convert_list(group_names, terms):
    # This is always a sequence of terms
    results = []
    for term in terms:
        name = term[0]
        try:
            func = converter_table[name]
        except KeyError:
            raise AssertionError, "Do not understand sre expression %s" % \
                  repr(name)
                  
        results.append( func(*(group_names,) + term) )
    if len(results) == 1:
        return results[0]
    return Expression.Seq(tuple(results))
        
    
# Primary entry point
def make_expression(pattern):
    """pattern -> the Expression tree for the given pattern string"""

    # In the following, the "pattern =" and "x.pattern" are the names
    # used by msre_parse.  They have nothing to do the input pattern.
    
    # Make the msre_parse tuple tree from the string ...
    x = msre_parse.parse(str = pattern, pattern = GroupNames())

    # ... and convert it into an Expression
    return convert_list(x.pattern, x)

Generated by  Doxygen 1.6.0   Back to index