Logo Search packages:      
Sourcecode: python-biopython version File versions

lcc.py

# Copyright 2003, 2007 by Sebastian Bassi. sbassi@genesdigitales.com
# All rights reserved.  This code is part of the Biopython 
# distribution and governed by its license.
# Please see the LICENSE file that should have been included as part
# of this package.

import math

def lcc_mult(seq,wsize):
    """Local Composition Complexity (LCC) values over sliding window.

    Returns a list of floats, the LCC values for a sliding window over
    the sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)
    wsize - window size, integer

    The result is the same as applying lcc_simp multiple times, but this
    version is optimized for speed. The optimization works by using the
    value of previous window as a base to compute the next one."""
    l2=math.log(2)
    tamseq=len(seq)
    try :
        #Assume its a string
        upper = seq.upper()
    except AttributeError :
        #Should be a Seq object then
        upper = seq.tostring().upper()
    compone=[0]
    lccsal=[0]
    for i in range(wsize):
        compone.append(((i+1)/float(wsize))*
                       ((math.log((i+1)/float(wsize)))/l2))
    window=seq[0:wsize]
    cant_a=window.count('A')
    cant_c=window.count('C')
    cant_t=window.count('T')
    cant_g=window.count('G')
    term_a=compone[cant_a]
    term_c=compone[cant_c]
    term_t=compone[cant_t]
    term_g=compone[cant_g]
    lccsal.append(-(term_a+term_c+term_t+term_g))
    tail=seq[0]
    for x in range (tamseq-wsize):
        window=upper[x+1:wsize+x+1]
        if tail==window[-1]:
            lccsal.append(lccsal[-1])
        elif tail=='A':
            cant_a=cant_a-1
            if window.endswith('C'):
                cant_c=cant_c+1
                term_a=compone[cant_a]
                term_c=compone[cant_c]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('T'):
                cant_t=cant_t+1
                term_a=compone[cant_a]
                term_t=compone[cant_t]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('G'):
                cant_g=cant_g+1
                term_a=compone[cant_a]
                term_g=compone[cant_g]
                lccsal.append(-(term_a+term_c+term_t+term_g))
        elif tail=='C':
            cant_c=cant_c-1
            if window.endswith('A'):
                cant_a=cant_a+1
                term_a=compone[cant_a]
                term_c=compone[cant_c]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('T'):
                cant_t=cant_t+1
                term_c=compone[cant_c]
                term_t=compone[cant_t]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('G'):
                cant_g=cant_g+1
                term_c=compone[cant_c]
                term_g=compone[cant_g]
                lccsal.append(-(term_a+term_c+term_t+term_g))
        elif tail=='T':
            cant_t=cant_t-1
            if window.endswith('A'):
                cant_a=cant_a+1
                term_a=compone[cant_a]
                term_t=compone[cant_t]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('C'):
                cant_c=cant_c+1
                term_c=compone[cant_c]
                term_t=compone[cant_t]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('G'):
                cant_g=cant_g+1
                term_t=compone[cant_t]
                term_g=compone[cant_g]
                lccsal.append(-(term_a+term_c+term_t+term_g))
        elif tail=='G':
            cant_g=cant_g-1
            if window.endswith('A'):
                cant_a=cant_a+1
                term_a=compone[cant_a]
                term_g=compone[cant_g]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('C'):
                cant_c=cant_c+1
                term_c=compone[cant_c]
                term_g=compone[cant_g]
                lccsal.append(-(term_a+term_c+term_t+term_g))
            elif window.endswith('T'):
                cant_t=cant_t+1
                term_t=compone[cant_t]
                term_g=compone[cant_g]
                lccsal.append(-(term_a+term_c+term_t+term_g))
        tail=window[0]
    return lccsal

def lcc_simp(seq):
    """Local Composition Complexity (LCC) for a sequence.

    seq - an unambiguous DNA sequence (a string or Seq object)
    
    Returns the Local Composition Complexity (LCC) value for the entire
    sequence (as a float).

    Reference:
    Andrzej K Konopka (2005) Sequence Complexity and Composition
    DOI: 10.1038/npg.els.0005260
    """
    wsize=len(seq)
    try :
        #Assume its a string
        upper = seq.upper()
    except AttributeError :
        #Should be a Seq object then
        upper = seq.tostring().upper()
    l2=math.log(2)
    if 'A' not in seq:
        term_a=0
      # Check to avoid calculating the log of 0.
    else:
        term_a=((upper.count('A'))/float(wsize))*((math.log((upper.count('A'))
                                                          /float(wsize)))/l2)
    if 'C' not in seq:
        term_c=0
    else:
        term_c=((upper.count('C'))/float(wsize))*((math.log((upper.count('C'))
                                                          /float(wsize)))/l2)
    if 'T' not in seq:
        term_t=0
    else:
        term_t=((upper.count('T'))/float(wsize))*((math.log((upper.count('T'))
                                                          /float(wsize)))/l2)
    if 'G' not in seq:
        term_g=0
    else:
        term_g=((upper.count('G'))/float(wsize))*((math.log((upper.count('G'))
                                                          /float(wsize)))/l2)
    lccsal=-(term_a+term_c+term_t+term_g)
    return lccsal

Generated by  Doxygen 1.6.0   Back to index