# Logo Search packages:
# Sourcecode: python-biopython version File versions  Download package


# Copyright 2003 by Bartek Wilczynski.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Implementation of sequence motifs."""

from __future__ import generators
from Bio.SubsMat import FreqTable

class Motif(object):
    """A class representing sequence motifs.

    A motif is a list of equal-length sequence instances over one alphabet,
    plus an optional mask marking which columns are significant.

    Attributes:
        instances: the sequence objects added with add_instance().
        score: a float, initialised to 0.0 (not modified by this class).
        mask: list of 1/0 flags, one per column, filled by set_mask().
        alphabet: alphabet shared by all instances (None until known).
        length: number of columns (None until the first instance or mask).
    """

    def __init__(self):
        self.instances = []
        self.score = 0.0
        self.mask = []
        self._pwm_is_current = False
        self._pwm = []
        # Must exist before the _check_* helpers run: they compare against
        # None to detect "not fixed yet".
        self.alphabet = None
        self.length = None

    def _check_length(self, len):
        """Fix the motif length on first use, then insist it never changes.

        The parameter keeps its historical name ``len`` (it shadows the
        builtin inside this method only) so existing callers are unaffected.

        Raises:
            ValueError: if a length was already fixed and ``len`` differs.
        """
        if self.length is None:
            self.length = len
        elif self.length != len:
            raise ValueError("You can't change the length of the motif")

    def _check_alphabet(self, alphabet):
        """Fix the motif alphabet on first use, then insist it never changes.

        Raises:
            ValueError: if an alphabet was already fixed and this one differs.
        """
        if self.alphabet is None:
            self.alphabet = alphabet
        elif self.alphabet != alphabet:
            raise ValueError("Wrong Alphabet")

    def add_instance(self, instance):
        """Add a new instance to the motif.

        ``instance`` must expose ``.alphabet`` and ``len()``.  Both are
        validated before the instance is stored, so a rejected instance
        never ends up in ``self.instances``.

        Raises:
            ValueError: on an alphabet or length mismatch.
        """
        self._check_alphabet(instance.alphabet)
        self._check_length(len(instance))
        self.instances.append(instance)
        # The cached PWM no longer reflects the instance set.
        self._pwm_is_current = False

    def set_mask(self, mask):
        """Set the mask for the motif.

        The mask should be a string containing asterisks in the position of
        significant columns and spaces in other columns.

        Raises:
            ValueError: if the mask length disagrees with the motif length
                or the mask contains anything but '*' and ' '.
        """
        self._check_length(len(mask))
        # Build into a local list so an invalid character leaves the
        # previously stored mask untouched.
        flags = []
        for char in mask:
            if char == "*":
                flags.append(1)
            elif char == " ":
                flags.append(0)
            else:
                raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
        self.mask = flags

    def pwm(self):
        """Return the PWM computed for the set of instances.

        The result is a list with one entry per motif column and is cached
        until add_instance() invalidates it.
        """
        if self._pwm_is_current:
            return self._pwm
        # We need to compute a new pwm.  Columns are driven by the motif
        # length rather than by the mask, so a motif without a mask still
        # gets a full-width PWM.
        columns = []
        for i in range(self.length or 0):
            counts = {}
            # filling the dict with 0's
            for letter in self.alphabet.letters:
                counts[letter] = 0
            # counting the occurrences of letters in instances
            for seq in self.instances:
                counts[seq[i]] = counts[seq[i]] + 1
            # NOTE(review): wraps the raw counts in the FreqTable imported at
            # module level, presumably converting them to frequencies --
            # confirm against Bio.SubsMat.FreqTable.
            columns.append(FreqTable.FreqTable(counts, FreqTable.COUNT, self.alphabet))
        self._pwm = columns
        self._pwm_is_current = True
        return self._pwm

    def search_instances(self, sequence):
        """Generate positions where an instance occurs exactly in ``sequence``.

        ``sequence`` must support ``len()`` and slicing, and its slices must
        provide ``tostring()``.  At most one hit is reported per position.

        NOTE(review): yields ``(position, instance)`` pairs, mirroring the
        ``(position, score)`` pairs of search_pwm() -- confirm with callers.
        """
        # Instance strings do not change during the scan; compute them once.
        candidates = [(inst.tostring(), inst) for inst in self.instances]
        for pos in range(0, len(sequence) - self.length + 1):
            window = sequence[pos:pos + self.length].tostring()
            for text, instance in candidates:
                if text == window:
                    yield (pos, instance)
                    break # no other instance will fit (we don't want to return multiple hits)

    def score_hit(self, sequence, position, normalized=1, masked=0):
        """Give the pwm score for a given position.

        Sums, over the motif columns, the PWM entry for the letter found at
        ``sequence[position + column]``.  With ``masked`` true only columns
        flagged in the mask contribute.  With ``normalized`` true the sum is
        divided by the number of contributing columns.
        """
        pwm = self.pwm()
        score = 0.0
        for pos in range(self.length):
            if not masked or self.mask[pos]:
                score += pwm[pos][sequence[position + pos]]
        if normalized:
            if not masked:
                score /= self.length
            else:
                score /= len([x for x in self.mask if x])
        return score

    def search_pwm(self, sequence, threshold=0.0, normalized=1, masked=1):
        """Generate ``(position, score)`` for hits scoring above ``threshold``.

        Every window of motif length in ``sequence`` is scored with
        score_hit(); ``normalized`` and ``masked`` are passed through.
        """
        for pos in range(0, len(sequence) - self.length + 1):
            score = self.score_hit(sequence, pos, normalized, masked)
            if score > threshold:
                yield (pos, score)

    def sim(self, motif, masked=0):
        """Return the similarity score for the given motif against self.

        We use the Pearson's correlation of the respective probabilities.
        With ``masked`` true only the columns flagged in the mask are
        compared.

        Raises:
            ValueError: if the motifs differ in alphabet or length, or (when
                ``masked``) in mask.
        """
        from math import sqrt
        if self.alphabet != motif.alphabet:
            raise ValueError("Wrong alphabet")
        if self.length != motif.length:
            raise ValueError("Wrong length")
        if masked and self.mask != motif.mask:
            raise ValueError("Wrong mask")

        sxx = 0 # \sum x^2
        sxy = 0 # \sum x \cdot y
        sx = 0  # \sum x
        sy = 0  # \sum y
        syy = 0 # \sum y^2

        own = self.pwm()
        other = motif.pwm()
        letters = self.alphabet.letters
        for pos in range(self.length):
            # Test the flag of *this* column; testing the whole mask list
            # would let every column through whenever a mask exists.
            if masked and not self.mask[pos]:
                continue
            for l in letters:
                xi = own[pos][l]
                yi = other[pos][l]
                sx = sx + xi
                sy = sy + yi
                sxx = sxx + xi * xi
                syy = syy + yi * yi
                sxy = sxy + xi * yi

        # Number of (column, letter) cells that entered the sums.
        if masked:
            norm = len([x for x in self.mask if x])
        else:
            norm = self.length
        norm *= len(letters)
        s1 = (sxy - sx*sy*1.0/norm)
        s2 = (sxx - sx*sx*1.0/norm)*(syy - sy*sy*1.0/norm)
        return s1/sqrt(s2)

    def read(self, stream):
        """Read the motif from the stream.

        The self.alphabet variable must be set before.  Lines are consumed
        until the first one containing '*', which is taken as the mask.

        NOTE(review): every line before the mask is treated as one instance
        (whitespace-stripped) -- confirm against the expected file format.

        Raises:
            ValueError: if the stream ends before a mask line is found, or
                on any length/alphabet/mask validation failure.
        """
        from Bio.Seq import Seq
        while 1:
            ln = stream.readline()
            if not ln:
                # readline() returns '' forever at EOF; fail instead of
                # spinning.
                raise ValueError("Unexpected end of stream: no mask line found")
            if "*" in ln:
                # Strip only the line terminator: spaces belong to the mask.
                self.set_mask(ln.rstrip("\r\n"))
                break
            self.add_instance(Seq(ln.strip(), self.alphabet))

    def __str__(self):
        """String representation of motif.

        One line per instance followed by the mask line ('*' for flagged
        columns, ' ' otherwise); every line ends with a newline.
        """
        lines = [inst.tostring() for inst in self.instances]
        mask_chars = []
        for flag in self.mask:
            if flag:
                mask_chars.append("*")
            else:
                mask_chars.append(" ")
        lines.append("".join(mask_chars))
        return "\n".join(lines) + "\n"

    def write(self, stream):
        """Write the motif to the stream.

        NOTE(review): emits the same text as str(self) -- confirm this is
        the format read() is meant to consume.
        """
        stream.write(str(self))



# Generated by  Doxygen 1.6.0   Back to index