Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

dnal.py

#!/usr/bin/env python

from __future__ import division

__version__ = "$Revision: 1.10 $"

import commands
import itertools
import os
import re

from Bio import Wise

_SCORE_MATCH = 4
_SCORE_MISMATCH = -1
_SCORE_GAP_START = -5
_SCORE_GAP_EXTENSION = -1

_CMDLINE_DNAL = ["dnal", "-alb", "-nopretty"]

def _build_dnal_cmdline(match, mismatch, gap, extension):
    res = _CMDLINE_DNAL[:]
    res.extend(["-match", str(match)])
    res.extend(["-mis", str(mismatch)])
    res.extend(["-gap", str(-gap)]) # negative: convert score to penalty
    res.extend(["-ext", str(-extension)])  # negative: convert score to penalty

    return res

_CMDLINE_FGREP_COUNT = "fgrep -c '%s' %s"
def _fgrep_count(pattern, file):
    return int(commands.getoutput(_CMDLINE_FGREP_COUNT % (pattern, file)))

_re_alb_line2coords = re.compile(r"^\[([^:]+):[^\[]+\[([^:]+):")
def _alb_line2coords(line):
    return tuple([int(coord)+1 # one-based -> zero-based
                  for coord
                  in _re_alb_line2coords.match(line).groups()])

def _get_coords(filename):
    alb = file(filename)

    start_line = None
    end_line = None

    for line in alb:
        if line.startswith("["):
            if not start_line:
                start_line = line # rstrip not needed
            else:
                end_line = line

    if end_line is None: # sequence is too short
        return [(0, 0), (0, 0)]
        
    return zip(*map(_alb_line2coords, [start_line, end_line])) # returns [(start0, end0), (start1, end1)]

def _any(seq, pred=bool):
    "Returns True if pred(x) is True at least one element in the iterable"
    return True in itertools.imap(pred, seq)

00062 class Statistics(object):
    """
    Calculate statistics from an ALB report
    """
    def __init__(self, filename, match, mismatch, gap, extension):
        self.matches = _fgrep_count('"SEQUENCE" %s' % match, filename)
        self.mismatches = _fgrep_count('"SEQUENCE" %s' % mismatch, filename)
        self.gaps = _fgrep_count('"INSERT" %s' % gap, filename)

        if gap == extension:
            self.extensions = 0
        else:
            self.extensions = _fgrep_count('"INSERT" %s' % extension, filename)
            
        self.score = (match*self.matches +
                      mismatch*self.mismatches +
                      gap*self.gaps +
                      extension*self.extensions)

        if _any([self.matches, self.mismatches, self.gaps, self.extensions]):
            self.coords = _get_coords(filename)
        else:
            self.coords = [(0, 0), (0,0)]

    def identity_fraction(self):
        return self.matches/(self.matches+self.mismatches)

    header = "identity_fraction\tmatches\tmismatches\tgaps\textensions"

    def __str__(self):
        return "\t".join([str(x) for x in (self.identity_fraction(), self.matches, self.mismatches, self.gaps, self.extensions)])

def align(pair, match=_SCORE_MATCH, mismatch=_SCORE_MISMATCH, gap=_SCORE_GAP_START, extension=_SCORE_GAP_EXTENSION, **keywds):
    cmdline = _build_dnal_cmdline(match, mismatch, gap, extension)
    temp_file = Wise.align(cmdline, pair, **keywds)
    try:
        return Statistics(temp_file.name, match, mismatch, gap, extension)
    except AttributeError:
        try:
            keywds['dry_run']
            return None
        except KeyError:
            raise

def main():
    import sys
    stats = align(sys.argv[1:3])
    print "\n".join(["%s: %s" % (attr, getattr(stats, attr))
                     for attr in
                     ("matches", "mismatches", "gaps", "extensions")])
    print "identity_fraction: %s" % stats.identity_fraction()
    print "coords: %s" % stats.coords

def _test(*args, **keywds):
    import doctest, sys
    doctest.testmod(sys.modules[__name__], *args, **keywds)

if __name__ == "__main__":
    if __debug__:
        _test()
    main()

Generated by  Doxygen 1.6.0   Back to index