Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

triefind.py

00001 """
Given a trie, find all occurrences of a word in the trie in a string.

Like searching a string for a substring, except that the substring is
any word in a trie.

Functions:
match         Find longest key in a trie matching the beginning of the string.
match_all     Find all keys in a trie matching the beginning of the string.
find          Find keys in a trie matching anywhere in a string.
find_words    Find keys in a trie matching whole words in a string.

"""
import string
import re

00017 def match(string, trie):
    """match(string, trie) -> longest key or None

    Find the longest key in the trie that matches the beginning of the
    string.

    """
    longest = None
    for i in range(len(string)):
        substr = string[:i+1]
        if not trie.has_prefix(substr):
            break
        if trie.has_key(substr):
            longest = substr
    return longest

00033 def match_all(string, trie):
    """match_all(string, trie) -> list of keys

    Find all the keys in the trie that matches the beginning of the
    string.

    """
    matches = []
    for i in range(len(string)):
        substr = string[:i+1]
        if not trie.has_prefix(substr):
            break
        if trie.has_key(substr):
            matches.append(substr)
    return matches

00049 def find(string, trie):
    """find(string, trie) -> list of tuples (key, start, end)

    Find all the keys in the trie that match anywhere in the string.

    """
    results = []
    start = 0     # index to start the search
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            results.append((key, start, start+len(key)))
        start += 1
    return results

DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace

00067 def find_words(string, trie):
    """find_words(string, trie) -> list of tuples (key, start, end)

    Find all the keys in the trie that match full words in the string.
    Word boundaries are defined as any punctuation or whitespace.

    """
    _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
        
    results = []
    start = 0     # index of word boundary
    while start < len(string):
        # Look for a match.
        keys = match_all(string[start:], trie)
        for key in keys:
            l = len(key)
            # Make sure it ends at a boundary.
            if start+l == len(string) or \
               _boundary_re.match(string[start+l]):
                results.append((key, start, start+l))
        # Move forward to the next boundary.
        m = _boundary_re.search(string, start)
        if m is None:
            break
        start = m.end()
    return results

Generated by  Doxygen 1.6.0   Back to index