Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

LogisticRegression.py

#!/usr/bin/env python

"""
This module provides code for doing logistic regressions.


Classes:
LogisticRegression    Holds information for a LogisticRegression classifier.


Functions:
train        Train a new classifier.
calculate    Calculate the probabilities of each class, given an observation.
classify     Classify an observation into a class.
"""
try:
    from Numeric import *
    from LinearAlgebra import *  # inverse
except ImportError, x:
    raise ImportError, "This module requires NumPy with the LinearAlgebra lib"

from Bio import listfns

00024 class LogisticRegression:
    """Holds information necessary to do logistic regression
    classification.

    Members:
    beta    List of the weights for each dimension.

    """
00032     def __init__(self):
        """LogisticRegression()"""
        beta = []

def train(xs, ys, update_fn=None, typecode=None):
    """train(xs, ys[, update_fn]) -> LogisticRegression
    
    Train a logistic regression classifier on a training set.  xs is a
    list of observations and ys is a list of the class assignments,
    which should be 0 or 1.  xs and ys should contain the same number
    of elements.  update_fn is an optional callback function that
    takes as parameters that iteration number and log likelihood.
    
    """
    if len(xs) != len(ys):
        raise ValueError, "xs and ys should be the same length."
    if not xs or not xs[0]:
        raise ValueError, "No observations or observation of 0 dimension."
    classes = listfns.items(ys)
    classes.sort()
    if classes != [0, 1]:
        raise ValueError, "Classes should be 0's and 1's"
    if typecode is None:
        typecode = Float

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1

    # Make an X array, with a constant first dimension.
    X = ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = transpose(X)
    y = asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
    iter = 0
    old_beta = old_llik = None
    while iter < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = exp(dot(beta, Xt))
        p = ebetaX / (1+ebetaX)
        
        # Find the log likelihood score and see if I've converged.
        logp = y*log(p) + (1-y)*log(1-p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(iter, llik)
        # Check to see if the likelihood decreased.  If it did, then
        # restore the old beta parameters and half the step size.
        if llik < old_llik:
            stepsize = stepsize / 2.0
            beta = old_beta
        # If I've converged, then stop.
        if old_llik is not None and fabs(llik-old_llik) <= CONVERGE_THRESHOLD:
            break
        old_llik, old_beta = llik, beta
        iter += 1

        W = identity(N) * p
        Xtyp = dot(Xt, y-p)         # Calculate the first derivative.
        XtWX = dot(dot(Xt, W), X)   # Calculate the second derivative.
        #u, s, vt = singular_value_decomposition(XtWX)
        #print "U", u
        #print "S", s
        delta = dot(inverse(XtWX), Xtyp)
        if fabs(stepsize-1.0) > 0.001:
            delta = delta * stepsize
        beta = beta + delta                 # Update beta.
    else:
        raise AssertionError, "Didn't converge."

    lr = LogisticRegression()
    lr.beta = map(float, beta)   # Convert back to regular array.
    return lr

def calculate(lr, x):
    """calculate(lr, x) -> list of probabilities

    Calculate the probability for each class.  lr is a
    LogisticRegression object.  x is the observed data.  Returns a
    list of the probability that it fits each class.

    """
    # Insert a constant term for x.
    x = asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = exp(dot(lr.beta, x))
    p = ebetaX / (1+ebetaX)
    return [1-p, p]

def classify(lr, x):
    """classify(lr, x) -> 1 or 0

    Classify an observation into a class.

    """
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1

Generated by  Doxygen 1.6.0   Back to index