Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

cpairwise2module.c

/* Copyright 2002 by Jeffrey Chang.  All rights reserved.
 * This code is part of the Biopython distribution and governed by its
 * license.  Please see the LICENSE file that should have been included
 * as part of this package.
 *
 * cpairwise2module.c
 * Created 30 Sep 2001
 *
 * Optimized C routines that complement pairwise2.py.
 */

#include "Python.h"
#include "csupport.h"


#define _PRECISION 1000
#define rint(x) (int)((x)*_PRECISION+0.5)

/* Functions in this module. */

double calc_affine_penalty(int length, double open, double extend, 
    int penalize_extend_when_opening)
{
    double penalty;

    if(length <= 0)
      return 0.0;
    penalty = open + extend * length;
    if(!penalize_extend_when_opening)
      penalty -= extend;
    return penalty;
}

00034 struct IndexList {
    int num_used;
    int num_allocated;
    int *indexes; /* Array of ints.  Even ints are rows, odd ones are cols. */
};

static void IndexList_init(struct IndexList *il) 
{
    /* il->num_used = 0;
    il->num_allocated = 0;
    il->indexes = NULL; */
    memset((void *)il, 0, sizeof(struct IndexList));
}

static void IndexList_free(struct IndexList *il) 
{
    if(il->indexes)
      free(il->indexes);
    IndexList_init(il);
}

static void IndexList_clear(struct IndexList *il)
{
    il->num_used = 0;
}

static int IndexList_contains(struct IndexList *il, 
                        const int row, const int col) 
{
    int i;
    int stop;

    stop = il->num_used*2;
    for(i=0; i<stop; i+=2) {
      if((il->indexes[i] == row) && (il->indexes[i+1] == col))
          return 1;
    }
    return 0;
}

static int IndexList__verify_free_index(struct IndexList *il, int num_needed)
{
    int *indexes;
    int num_to_allocate;
    int num_bytes;

    if(il->num_allocated >= num_needed)
      return 1;

    /* Nearly all the cases are for list of length 1 or 2.
       Empirically, the code seems to run fastest when I allocate the
       exact amount for these. */
    if(num_needed <= 2)
      num_to_allocate = num_needed;
    else 
      num_to_allocate = num_needed*2;
    num_bytes = num_to_allocate*sizeof(int)*2;
    if(!(indexes = realloc((void *)il->indexes, num_bytes))) {
      PyErr_SetString(PyExc_MemoryError, "Out of memory");
      return 0;
    }
    il->indexes = indexes;
    il->num_allocated = num_to_allocate;
    return 1;
}

static void IndexList_append(struct IndexList *il, 
                       const int row, const int col) 
{
    int i;
    if(!IndexList__verify_free_index(il, il->num_used+1))
      return;
    i=il->num_used*2;
    il->indexes[i] = row;
    il->indexes[i+1] = col;
    il->num_used += 1;
}

static void IndexList_extend(struct IndexList *il1, struct IndexList *il2) 
{
    int i1, i2;
    int stop;

    if(!IndexList__verify_free_index(il1, il1->num_used+il2->num_used))
      return;
    stop=il2->num_used * 2;
    for(i1=il1->num_used*2, i2=0; i2<stop; i1+=2, i2+=2) {
      il1->indexes[i1] = il2->indexes[i2];
      il1->indexes[i1+1] = il2->indexes[i2+1];
    }
    il1->num_used += il2->num_used;
}

/* static void IndexList_copy(struct IndexList *il1, struct IndexList *il2) 
{
    IndexList_clear(il1);
    IndexList_extend(il1, il2);
    } */

/* static void IndexList_assign(struct IndexList *il1, struct IndexList *il2)
{
    if(il1->indexes)
      free(il1->indexes);
    il1->indexes = il2->indexes;
    il1->num_used = il2->num_used;
    il1->num_allocated = il2->num_allocated;
    } */


double _get_match_score(PyObject *py_sequenceA, PyObject *py_sequenceB, 
                  PyObject *py_match_fn, int i, int j,
                  char *sequenceA, char *sequenceB, 
                  int use_sequence_cstring,
                  double match, double mismatch, 
                  int use_match_mismatch_scores)
{
    PyObject *py_A=NULL, 
      *py_B=NULL;
    PyObject *py_arglist=NULL, *py_result=NULL;
    double score = 0;

    if(use_sequence_cstring && use_match_mismatch_scores) {
      score = (sequenceA[i] == sequenceB[j]) ? match : mismatch;
      return score;
    }
    /* Calculate the match score. */
    if(!(py_A = PySequence_GetItem(py_sequenceA, i)))
      goto _get_match_score_cleanup;
    if(!(py_B = PySequence_GetItem(py_sequenceB, j)))
      goto _get_match_score_cleanup;
    if(!(py_arglist = Py_BuildValue("(OO)", py_A, py_B)))
      goto _get_match_score_cleanup;

    if(!(py_result = PyEval_CallObject(py_match_fn, py_arglist)))
      goto _get_match_score_cleanup;
    score = PyNumber_AsDouble(py_result);
 _get_match_score_cleanup:
    if(py_A) {
      Py_DECREF(py_A);
    }
    if(py_B) {
      Py_DECREF(py_B);
    }
    if(py_arglist) {
      Py_DECREF(py_arglist);
    }
    if(py_result) {
      Py_DECREF(py_result);
    }
    return score;
}


/* This function is a more-or-less straightforward port of the
 * equivalent function in pairwise2.  Please see there for algorithm
 * documentation.
 */
static PyObject *cpairwise2__make_score_matrix_fast(
    PyObject *self, PyObject *args)
{
    int i;
    int row, col;

    PyObject *py_sequenceA, *py_sequenceB, *py_match_fn;
    char *sequenceA=NULL, *sequenceB=NULL;
    int use_sequence_cstring;
    double open_A, extend_A, open_B, extend_B;
    int penalize_extend_when_opening, penalize_end_gaps;
    int align_globally, score_only;

    double first_A_gap, first_B_gap;
    double match, mismatch;
    int use_match_mismatch_scores;
    int lenA, lenB;
    double *score_matrix = (double *)NULL;
    struct IndexList *trace_matrix = (struct IndexList *)NULL;
    PyObject *py_score_matrix=NULL, *py_trace_matrix=NULL;

    double *row_cache_score = (double *)NULL, 
      *col_cache_score = (double *)NULL;
    struct IndexList *row_cache_index = (struct IndexList *)NULL, 
      *col_cache_index = (struct IndexList *)NULL;

    PyObject *py_retval = NULL;

    if(!PyArg_ParseTuple(args, "OOOddddiiii", &py_sequenceA, &py_sequenceB,
                   &py_match_fn, &open_A, &extend_A, &open_B, &extend_B,
                   &penalize_extend_when_opening, &penalize_end_gaps,
                   &align_globally, &score_only))
      return NULL;
    if(!PySequence_Check(py_sequenceA) || !PySequence_Check(py_sequenceB)) {
      PyErr_SetString(PyExc_TypeError, 
                  "py_sequenceA and py_sequenceB should be sequences.");
      return NULL;
    }

    /* Optimize for the common case.  Check to see if py_sequenceA and
       py_sequenceB are strings.  If they are, use the c string
       representation. */
    use_sequence_cstring = 0;
    if(PyString_Check(py_sequenceA) && PyString_Check(py_sequenceB)) {
      sequenceA = PyString_AS_STRING(py_sequenceA);
      sequenceB = PyString_AS_STRING(py_sequenceB);
      use_sequence_cstring = 1;
    }

    if(!PyCallable_Check(py_match_fn)) {
      PyErr_SetString(PyExc_TypeError, "py_match_fn must be callable.");
      return NULL;
    }
    /* Optimize for the common case.  Check to see if py_match_fn is
       an identity_match.  If so, pull out the match and mismatch
       member variables and calculate the scores myself. */
    match = mismatch = 0;
    use_match_mismatch_scores = 0;
    if(PyInstance_Check(py_match_fn)) {
      PyObject *py_match=NULL, *py_mismatch=NULL;
      if(!(py_match = PyObject_GetAttrString(py_match_fn, "match")))
          goto cleanup_after_py_match_fn;
      match = PyNumber_AsDouble(py_match);
      if(PyErr_Occurred())
          goto cleanup_after_py_match_fn;
      if(!(py_mismatch = PyObject_GetAttrString(py_match_fn, "mismatch")))
          goto cleanup_after_py_match_fn;
      mismatch = PyNumber_AsDouble(py_mismatch);
      if(PyErr_Occurred())
          goto cleanup_after_py_match_fn;
      use_match_mismatch_scores = 1;
    cleanup_after_py_match_fn:
      if(PyErr_Occurred())
          PyErr_Clear();
      if(py_match) {
          Py_DECREF(py_match);
      }
      if(py_mismatch) {
          Py_DECREF(py_mismatch);
      }
    }

    /* Cache some commonly used gap penalties */
    first_A_gap = calc_affine_penalty(1, open_A, extend_A, 
                              penalize_extend_when_opening);
    first_B_gap = calc_affine_penalty(1, open_B, extend_B,
                              penalize_extend_when_opening);

    /* Allocate matrices for storing the results and initalize them. */
    lenA = PySequence_Length(py_sequenceA);
    lenB = PySequence_Length(py_sequenceB);
    score_matrix = (double *)malloc(lenA*lenB*sizeof(*score_matrix));
    trace_matrix = (struct IndexList *)malloc(lenA*lenB*sizeof(*trace_matrix));
    if(!score_matrix || !trace_matrix) {
      PyErr_SetString(PyExc_MemoryError, "Out of memory");
      goto _cleanup_make_score_matrix_fast;
    }
    for(i=0; i<lenA*lenB; i++) {
      score_matrix[i] = 0;
      IndexList_init(&trace_matrix[i]);
    }

    /* Initialize the first row and col of the score matrix. */
    for(i=0; i<lenA; i++) {
      double score = _get_match_score(py_sequenceA, py_sequenceB, 
                              py_match_fn, i, 0,
                              sequenceA, sequenceB,
                              use_sequence_cstring,
                              match, mismatch,
                              use_match_mismatch_scores);
      if(PyErr_Occurred())
          goto _cleanup_make_score_matrix_fast;
      if(penalize_end_gaps)
          score += calc_affine_penalty(i, open_B, extend_B, 
                               penalize_extend_when_opening);
      score_matrix[i*lenB] = score;
    }
    for(i=0; i<lenB; i++) {
      double score = _get_match_score(py_sequenceA, py_sequenceB, 
                              py_match_fn, 0, i,
                              sequenceA, sequenceB,
                              use_sequence_cstring,
                              match, mismatch,
                              use_match_mismatch_scores);
      if(PyErr_Occurred())
          goto _cleanup_make_score_matrix_fast;
      if(penalize_end_gaps)
          score += calc_affine_penalty(i, open_A, extend_A, 
                               penalize_extend_when_opening);
      score_matrix[i] = score;
    }

    /* Now initialize the row and col cache. */
    row_cache_score = (double *)malloc((lenA-1)*sizeof(*row_cache_score));
    row_cache_index = (struct IndexList *)malloc((lenA-1)*
                                     sizeof(*row_cache_index));
    col_cache_score = (double *)malloc((lenB-1)*sizeof(*col_cache_score));
    col_cache_index = (struct IndexList *)malloc((lenB-1)*
                                     sizeof(*col_cache_index));
    if(!row_cache_score || !row_cache_index || 
       !col_cache_score || !col_cache_index) {
      PyErr_SetString(PyExc_MemoryError, "Out of memory");
      goto _cleanup_make_score_matrix_fast;
    }
    memset((void *)row_cache_score, 0, (lenA-1)*sizeof(*row_cache_score));
    memset((void *)row_cache_index, 0, (lenA-1)*sizeof(*row_cache_index));
    memset((void *)col_cache_score, 0, (lenB-1)*sizeof(*col_cache_score));
    memset((void *)col_cache_index, 0, (lenB-1)*sizeof(*col_cache_index));
    for(i=0; i<lenA-1; i++) {
      row_cache_score[i] = score_matrix[i*lenB] + first_A_gap;
      IndexList_append(&row_cache_index[i], i, 0);
    }
    for(i=0; i<lenB-1; i++) {
      col_cache_score[i] = score_matrix[i] + first_B_gap;
      IndexList_append(&col_cache_index[i], 0, i);
    }

    /* Fill in the score matrix. */
    for(row=1; row<lenA; row++) {
      for(col=1; col<lenB; col++) {
          double nogap_score, row_score, col_score, best_score;
          int best_score_rint;
          struct IndexList *il;

          double score, open_score, extend_score;
          int open_score_rint, extend_score_rint;

          /* Calculate the best score. */
          nogap_score = score_matrix[(row-1)*lenB+col-1];
          if(col > 1) {
            row_score = row_cache_score[row-1];
          } else {
            row_score = nogap_score-1; /* Make sure it's not best score */
          }
          if(row > 1) {
            col_score = col_cache_score[col-1];
          } else {
            col_score = nogap_score-1; /* Make sure it's not best score */
          }

          best_score = (row_score > col_score) ? row_score : col_score;
          if(nogap_score > best_score)
            best_score = nogap_score;
          best_score_rint = rint(best_score);

          /* Set the score and traceback matrices. */
          score = best_score + _get_match_score(py_sequenceA, py_sequenceB, 
                                      py_match_fn, row, col,
                                      sequenceA, sequenceB,
                                      use_sequence_cstring,
                                      match, mismatch,
                                      use_match_mismatch_scores);
          if(PyErr_Occurred())
            goto _cleanup_make_score_matrix_fast;
          if(!align_globally && score < 0)
            score_matrix[row*lenB+col] = 0;
          else
            score_matrix[row*lenB+col] = score;

          il = &trace_matrix[row*lenB+col];
          if(best_score_rint == rint(nogap_score)) {
            IndexList_append(il, row-1, col-1);
          }
          if(best_score_rint == rint(row_score)) {
            IndexList_extend(il, &row_cache_index[row-1]);
          }
          if(best_score_rint == rint(col_score)) {
            IndexList_extend(il, &col_cache_index[col-1]);
          }

          /* Update the cached column scores. */
          open_score = score_matrix[(row-1)*lenB+col-1] + first_B_gap;
          extend_score = col_cache_score[col-1] + extend_B;
          open_score_rint = rint(open_score);
          extend_score_rint = rint(extend_score);
          if(open_score_rint > extend_score_rint) {
            col_cache_score[col-1] = open_score;
            IndexList_clear(&col_cache_index[col-1]);
            IndexList_append(&col_cache_index[col-1], row-1, col-1);
          } else if(extend_score_rint > open_score_rint) {
            col_cache_score[col-1] = extend_score;
          } else {
            col_cache_score[col-1] = open_score;
            if(!IndexList_contains(&col_cache_index[col-1], row-1, col-1))
                IndexList_append(&col_cache_index[col-1], row-1, col-1);
          }
          
          /* Update the cached row scores. */
          open_score = score_matrix[(row-1)*lenB+col-1] + first_A_gap;
          extend_score = row_cache_score[row-1] + extend_A;
          open_score_rint = rint(open_score);
          extend_score_rint = rint(extend_score);
          if(open_score_rint > extend_score_rint) {
            row_cache_score[row-1] = open_score;
            IndexList_clear(&row_cache_index[row-1]);
            IndexList_append(&row_cache_index[row-1], row-1, col-1);
          } else if(extend_score_rint > open_score_rint) {
            row_cache_score[row-1] = extend_score;
          } else {
            row_cache_score[row-1] = open_score;
            if(!IndexList_contains(&row_cache_index[row-1], row-1, col-1))
                IndexList_append(&row_cache_index[row-1], row-1, col-1);
          }
      }
    }

    /* Save the score and traceback matrices into real python objects. */
    if(!(py_score_matrix = PyList_New(lenA)))
      goto _cleanup_make_score_matrix_fast;
    if(!(py_trace_matrix = PyList_New(lenA)))
      goto _cleanup_make_score_matrix_fast;
    for(row=0; row<lenA; row++) {
      PyObject *py_score_row, *py_trace_row;
      if(!(py_score_row = PyList_New(lenB)))
          goto _cleanup_make_score_matrix_fast;
      PyList_SET_ITEM(py_score_matrix, row, py_score_row);
      if(!(py_trace_row = PyList_New(lenB)))
          goto _cleanup_make_score_matrix_fast;
      PyList_SET_ITEM(py_trace_matrix, row, py_trace_row);

      for(col=0; col<lenB; col++) {
          int i;
          PyObject *py_score, *py_indexlist;
          int offset = row*lenB + col;
          struct IndexList *il = &trace_matrix[offset];

          /* Set py_score_matrix[row][col] to the score. */
          if(!(py_score = PyFloat_FromDouble(score_matrix[offset])))
            goto _cleanup_make_score_matrix_fast;
          PyList_SET_ITEM(py_score_row, col, py_score);

          if(score_only)
            continue;
          /* Set py_trace_matrix[row][col] to a list of indexes.  On
             the edges of the matrix (row or column is 0), the
             matrix should be [None]. */
          if(!row || !col) {
            if(!(py_indexlist = PyList_New(1)))
                goto _cleanup_make_score_matrix_fast;
            Py_INCREF(Py_None);
            PyList_SET_ITEM(py_indexlist, 0, Py_None);
          }
          else {
            if(!(py_indexlist = PyList_New(il->num_used)))
                goto _cleanup_make_score_matrix_fast;
            for(i=0; i<il->num_used; i++) {
                PyObject *py_index=NULL;
                int row = il->indexes[i*2],
                  col = il->indexes[i*2+1];
                if(!(py_index = Py_BuildValue("(ii)", row, col)))
                  goto _cleanup_make_score_matrix_fast;
                PyList_SET_ITEM(py_indexlist, i, py_index);
            }
          }
          PyList_SET_ITEM(py_trace_row, col, py_indexlist);
      }
    }

    py_retval = Py_BuildValue("(OO)", py_score_matrix, py_trace_matrix);


 _cleanup_make_score_matrix_fast:
    if(score_matrix)
      free(score_matrix);
    if(trace_matrix) {
      for(i=0; i<lenA*lenB; i++)
          IndexList_free(&trace_matrix[i]);
      free(trace_matrix);
    }
    if(row_cache_score)
      free(row_cache_score);
    if(col_cache_score)
      free(col_cache_score);
    if(row_cache_index) {
      for(i=0; i<lenA-1; i++)
          IndexList_free(&row_cache_index[i]);
      free(row_cache_index);
    }
    if(col_cache_index) {
      for(i=0; i<lenB-1; i++) {
          IndexList_free(&col_cache_index[i]);
      }
      free(col_cache_index);
    }
    if(py_score_matrix) {
      Py_DECREF(py_score_matrix);
    }
    if(py_trace_matrix) {
      Py_DECREF(py_trace_matrix);
    }

    return py_retval;
}

static PyObject *cpairwise2_rint(
    PyObject *self, PyObject *args, PyObject *keywds)
{
    double x;
    int precision = _PRECISION;
    int rint_x;

    static char *kwlist[] = {"x", "precision", NULL};

    if(!PyArg_ParseTupleAndKeywords(args, keywds, "d|l", kwlist,
                            &x, &precision))
      return NULL;
    rint_x = (int)(x * precision + 0.5);
    return PyInt_FromLong(rint_x);
}

/* Module definition stuff */

static PyMethodDef cpairwise2Methods[] = {
    {"_make_score_matrix_fast", 
     (PyCFunction)cpairwise2__make_score_matrix_fast, METH_VARARGS, ""},
    {"rint", (PyCFunction)cpairwise2_rint, METH_VARARGS|METH_KEYWORDS, ""},
    {NULL, NULL, 0, NULL}
};

static char cpairwise2__doc__[] =
"XXX document here\n\
\n\
";

void initcpairwise2(void)
{
    (void) Py_InitModule3("cpairwise2", cpairwise2Methods, cpairwise2__doc__);
}

Generated by  Doxygen 1.6.0   Back to index