Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

easy.py

#!/usr/bin/env python
#
# Copyright 2002 by Michael Hoffman.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
Bio.easy: some functions to ease the use of Biopython
"""

from __future__ import generators # requires Python 2.2

__version__ = "$Revision: 1.5 $"
# $Source: /home/repository/biopython/biopython/Bio/GFF/easy.py,v $

import copy
import re
import string
import sys

import Bio
from Bio import GenBank
from Bio.Data import IUPACData
from Bio.Seq import Seq
from Bio.SeqIO.FASTA import FastaReader, FastaWriter
from Bio import SeqUtils

import GenericTools

00031 class FeatureDict(dict):
    """ JH:  accessing feature.qualifiers as a list is stupid.  Here's a dict that does it"""
    def __init__(self, feature_list, default=None):
        dict.__init__(self)
        self.default = default
        key_re = re.compile(r'/(\S+)=')

        for i in feature_list:
            key = key_re.match(i.key).group(1)
            val = string.replace(i.value,'"','')
            self[key] = val
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.default

00048 class Location(GenericTools.VerboseList):
    """
    this is really best interfaced through LocationFromString
    fuzzy: < or >
    join: {0 = no join, 1 = join, 2 = order}
    
    >>> location = Location([Location([339]), Location([564])]) # zero-based
    >>> location
    Location(Location(339), Location(564))
    >>> print location # one-based
    340..565
    >>> print location.five_prime()
    340
    >>> location_rev = Location([Location([339]), Location([564])], 1)
    >>> print location_rev
    complement(340..565)
    >>> print location_rev.five_prime()
    565
    """
    def __init__(self, the_list, complement=0, seqname=None):
        self.complement = complement
        self.join = 0
        self.fuzzy = None
        self.seqname = seqname
        list.__init__(self, the_list)

    def _joinstr(self):
        if self.join == 1:
            label = 'join'
        elif self.join == 2:
            label = 'order'
        return "%s(%s)" % (label, ",".join(map(str, self)))
    
    def __str__(self):
        if self.seqname:
            format = "%s:%%s" % self.seqname
        else:
            format = "%s"

        if self.complement:
            format = format % "complement(%s)"

        if self.join:
            return format % self._joinstr()
            
        elif isinstance(self[0], list):
            return format % "%s..%s" % (str(self[0]), str(self[1]))
        else:
            if self.fuzzy:
                format = format % self.fuzzy + "%s"
            return format % str(self[0] + 1)

    def __repr__(self):
        return "Location(%s)" % ", ".join(map(repr, self))

    direction2index = {1: 0, -1: -1}
00104     def direction_and_index(self, direction):
        """
        1: 5'
        -1: 3'

        >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
        >>> loc1.direction_and_index(1)
        (1, 0)
        >>> loc1.direction_and_index(-1)
        (-1, -1)
        >>> loc1.reverse()
        >>> print loc1
        complement(join(1..3,complement(5..6)))
        >>> loc1.direction_and_index(1)
        (-1, -1)
        """
        if self.complement:
            direction = direction * -1
        index = self.direction2index[direction]
        return direction, index
    
00125     def findside(self, direction):
        """
        >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
        >>> loc.findside(1)
        Location(5)
        >>> loc.findside(-1)
        Location(0)
        """
        direction, index = self.direction_and_index(direction)
        if self.join or isinstance(self[0], list):
            return self[index].findside(direction)
        else:
            return self

00139     def findseqname_3prime(self):
        """
        >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
        >>> loc.findseqname_3prime()
        'MOOCOW'
        """
        return self.findseqname(-1)

00147     def findseqname(self, direction=1): # find 5' seqname
        """
        >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
        >>> loc.findseqname()
        'SEQ'
        >>> loc.findseqname(-1)
        'MOOCOW'
        """
        direction, index = self.direction_and_index(direction)
        if self.seqname:
            return self.seqname
        elif self.join:
            return self[index].findseqname(direction)
        else:
            raise AttributeError, 'no sequence name'

    def five_prime(self):
        return self.findside(1)
    def three_prime(self):
        return self.findside(-1)

00168     def length(self):
        """
        WARNING: doesn't deal with joins!!!!
        """
        return self.end()-self.start()

00174     def intersection(self, other):
        """
        WARNING: doesn't deal with joins!!!!

        >>> location1 = LocationFromString("1..50")
        >>> location2 = LocationFromString("25..200")
        >>> print location1.intersection(location2)
        25..50
        >>> print location1.intersection(location2)
        25..50
        """
        if self.start() >= other.start():
            start = self.start()
        else:
            start = other.start()
        if self.end() <= other.end():
            end = self.end()
        else:
            end = other.end()
        return Location([Location([start]), Location([end])])

    def start(self):
        # zero-based
        if self.complement:
            return self.three_prime()[0]
        else:
            return self.five_prime()[0]

    def end(self):
        # zero-based
        if self.complement:
            return self.five_prime()[0]
        else:
            return self.three_prime()[0]

    def three_prime_range(self, window):
        three_prime_loc = self.three_prime()
        if self.complement:
            return Location([three_prime_loc-window, three_prime_loc], complement=1)
        else:
            return Location([three_prime_loc, three_prime_loc+window])

00216     def sublocation(self, sub_location):
        """
        >>> fwd_location = LocationFromString('X:5830132..5831528')
        >>> print fwd_location.sublocation(LocationFromString('1..101'))
        X:5830132..5830232
        >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
        X:5831398..5831417
        >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
        >>> print rev_location.sublocation(LocationFromString('1..101'))
        I:complement(8416116..8416216)
        >>> print rev_location.sublocation(LocationFromString('100..200'))
        I:complement(8416017..8416117)
        """
        
        absolute_location = copy.deepcopy(self)
        for i in xrange(2):
            absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
        if absolute_location.complement:
            list.reverse(absolute_location)
        return absolute_location

    def __add__(self, addend):
        return self.add(addend)

    def add(self, addend, complement=0):
        self_copy = copy.deepcopy(self)
        if isinstance(addend, Location):
            addend = addend[0]
        if complement:
            addend *= -1
        self_copy[0] += addend
        return self_copy

    def __sub__(self, subtrahend):
        return self + -subtrahend

    def reverse(self):
        self.complement = [1, 0][self.complement]

00255     def reorient(self):
        """
        >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
        >>> loc1.reorient()
        >>> print loc1
        complement(join(I:1..9000,I:9001..10000))
        >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
        >>> loc2.reorient()
        >>> print loc2
        join(I:complement(1..9000),I:9001..10000)
        """
        if self.join:
            if len([x for x in self if x.complement]) == len(self):
                self.reverse()
                for segment in self:
                    segment.reverse()

00272     def bounding(self):
        """
        works for single level non-complex joins

        >>> LOC = LocationFromString
        >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
        >>> print l1.bounding()
        join(alpha:1..70)
        >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
        >>> print l2.bounding()
        join(alpha:1..30,alpha:complement(50..70))
        >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
        >>> print l3.bounding()
        join(alpha:1..45,alpha:complement(50..70),beta:6..20)

        """
        if not self.join:
            return

        seqdict = {}
        seqkeys = []
        for subloc in self:
            assert subloc.seqname
            assert not subloc.join
            try:
                seqdict[_hashname(subloc)].append(subloc)
            except KeyError:
                key = _hashname(subloc)
                seqdict[key] = [subloc]
                seqkeys.append(key)

        res = LocationJoin()
        for key in seqkeys:
            locations = seqdict[key]
            coords = []
            for subloc in locations:
                coords.append(subloc.start())
                coords.append(subloc.end())
            res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
        return res

def _hashname(location):
    return str(location.complement) + location.seqname

00316 class LocationJoin(Location):
    """
    >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
    >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
    >>> join.append(appendloc)
    >>> print join
    join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
    >>> join2 = LocationJoin()
    >>> join2.append(LocationFromString("complement(66..99)"))
    >>> join2.append(LocationFromString("complement(5..55)"))
    >>> print join2
    join(complement(66..99),complement(5..55))
    """
    def __init__(self, the_list = [], complement=0, seqname=None):
        self.complement = complement
        self.join = 1
        self.fuzzy = None
        self.seqname = seqname
        list.__init__(self, the_list)

00336 class LocationFromCoords(Location):
    """
    >>> print LocationFromCoords(339, 564)
    340..565
    >>> print LocationFromCoords(339, 564, seqname="I")
    I:340..565
    >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
    NC_343434:complement(1000..3235)
    """
    def __init__(self, start, end, strand=0, seqname=None):
        if strand == "+":
            strand = 0
        elif strand == "-":
            strand = 1
        Location.__init__(self, [Location([start]), Location([end])], strand, seqname)

# see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur
# for how this should actually be implemented
re_complement = re.compile(r"^complement\((.*)\)$")
re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec
re_join = re.compile(r"^(join|order)\((.*)\)$")
re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
re_fuzzy = re.compile(r"^([><])(\d+)")
00359 class LocationFromString(Location):
    """
    >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
    >>> print LocationFromString("467")
    467
    >>> print LocationFromString("340..565")
    340..565
    >>> print LocationFromString("<345..500")
    <345..500
    >>> print LocationFromString("<1..888")
    <1..888
    >>> # (102.110) and 123^124 syntax unimplemented
    >>> print LocationFromString("join(12..78,134..202)")
    join(12..78,134..202)
    >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
    complement(join(2691..4571,4918..5163))
    >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
    join(complement(4918..5163),complement(2691..4571))
    >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
    order(complement(4918..5163),complement(2691..4571))
    >>> print LocationFromString("NC_001802x.fna:73..78")
    NC_001802x.fna:73..78
    >>> print LocationFromString("J00194:100..202")
    J00194:100..202

    >>> print LocationFromString("join(117505..118584,1..609)")
    join(117505..118584,1..609)
    >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
    join(test3:complement(4..6),test3:complement(1..3))
    >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
    test3:join(test1:complement(1..3),4..6)
    """
    def __init__(self, location_str):
        match_seqname = re_seqname.match(location_str)
        if match_seqname:
            self.seqname = match_seqname.group(1)
            location_str = match_seqname.group(2)
        else:
            self.seqname = None
        match_complement = re_complement.match(location_str)
        if match_complement:
            self.complement = 1
            location_str = match_complement.group(1)
        else:
            self.complement = 0
        match_join = re_join.match(location_str)
        if match_join:
            self.join = {'join':1, 'order':2}[match_join.group(1)]
            list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(",")))
        else:
            self.join = 0
            match_dotdot = re_dotdot.match(location_str)
            if match_dotdot:
                list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2)))
            else:
                match_fuzzy = re_fuzzy.match(location_str)
                if match_fuzzy:
                    self.fuzzy = match_fuzzy.group(1)
                    location_str = match_fuzzy.group(2)
                else:
                    self.fuzzy = None
                    
                list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud

def open_file(filename):
    if filename:
        return open(filename)
    else:
        return sys.stdin

def fasta_single(filename=None, string=None):
    """
    >>> record = fasta_single(string=\"""
    ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
    ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
    ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
    ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
    ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
    ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
    ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
    ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
    ... SLFGNDPSSQ
    ... \""")
    >>> record.id
    'gi|9629360|ref|NP_057850.1|'
    >>> record.description
    'Gag [Human immunodeficiency virus type 1]'
    >>> record.seq[0:5]
    Seq('MGARA', Alphabet())
    """
    if string:
        import cStringIO
        return FastaReader(cStringIO.StringIO(string)).next()
    return FastaReader(open_file(filename)).next()

def fasta_multi(filename=None):
    reader = FastaReader(open_file(filename))
    while 1:
        record = reader.next()
        if record is None:
            raise StopIteration
        else:
            yield record

def fasta_readrecords(filename=None):
    """
    >>> records = fasta_readrecords('GFF/multi.fna')
    >>> records[0].id
    'test1'
    >>> records[2].seq
    Seq('AAACACAC', Alphabet())
    """
    records = []
    for record in fasta_multi(filename):
        records.append(record)
    return records

def fasta_write(filename, records):
    FastaWriter(file(filename, "w")).write_records(records)

def genbank_single(filename):
    """
    >>> record = genbank_single("GFF/NC_001422.gbk")
    >>> record.taxonomy
    ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
    >>> cds = record.features[-4]
    >>> cds.key
    'CDS'
    >>> location = LocationFromString(cds.location)
    >>> print location
    2931..3917
    >>> subseq = record_subseq(record, location)
    >>> subseq[0:20]
    Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
    """
    return GenBank.RecordParser().parse(open(filename))

def record_subseq(record, location, *args, **keywds):
    """
    >>> from Bio.SeqRecord import SeqRecord    
    >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
    ...                    "ref|NC_001422",
    ...                    "Coliphage phiX174, complete genome",
    ...                    "bases 1-11")
    >>> record_subseq(record, LocationFromString("1..4")) # one-based
    Seq('GAGT', Alphabet())
    >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
    Seq('ACTC', Alphabet())
    >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
    Seq('ACTCGAGT', Alphabet())
    >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
    >>> print loc
    complement(join(complement(5..7),1..4))
    >>> record_subseq(record, loc)
    Seq('ACTCTTT', Alphabet())
    >>> print loc
    complement(join(complement(5..7),1..4))
    >>> loc.reverse()
    >>> record_subseq(record, loc)
    Seq('AAAGAGT', Alphabet())
    >>> record_subseq(record, loc, upper=1)
    Seq('AAAGAGT', Alphabet())
    """
    if location.join:
        subsequence_list = []
        if location.complement:
            location_copy = copy.copy(location)
            list.reverse(location_copy)
        else:
            location_copy = location
        for sublocation in location_copy:
            if location.complement:
                sublocation_copy = copy.copy(sublocation)
                sublocation_copy.reverse()
            else:
                sublocation_copy = sublocation
            subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
        return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
    else:
        return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)

def record_sequence(record):
    """
    returns the sequence of a record

    can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record
    """
    if isinstance(record, Bio.SeqRecord.SeqRecord):
        return record.seq
    elif isinstance(record, Bio.GenBank.Record.Record):
        return Seq(record.sequence)
    else:
        raise TypeError, 'not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record'

def record_coords(record, start, end, strand=0, upper=0):
    """
    >>> from Bio.SeqRecord import SeqRecord
    >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
    ...                    "ref|NC_001422",
    ...                    "Coliphage phiX174, complete genome",
    ...                    "bases 1-11")
    >>> record_coords(record, 0, 4) # zero-based
    Seq('GAGT', Alphabet())
    >>> record_coords(record, 0, 4, "-") # zero-based
    Seq('ACTC', Alphabet())
    >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
    Seq('ACTC', Alphabet())
    """

    subseq = record_sequence(record)[start:end]
    subseq_str = subseq.tostring()
    subseq_str = subseq_str.upper()
    subseq = Seq(subseq_str, subseq.alphabet)
    if strand == '-' or strand == 1:
        return subseq.reverse_complement()
    else:
        return subseq

### The following section was replaced by the functions complement,
### forward_complement in Bio.Seq. The code here can be removed eventually
### (using deprecation warnings for now).

def forward_complement(seq):
    """
    returns the complementary sequence (NOT antiparallel)

    stolen from a broken BioPython sequtils.py

    deprecated

    # >>> forward_complement(Seq('aaatttc'))
    # Seq('TTTAAAG', Alphabet())
    # >>> forward_complement(Seq('aaauuuc'))
    # Seq('UUUAAAG', Alphabet())
    """
    import warnings
    warnings.warn(
        "forward_complement in Bio.GFF is deprecated; please use complement in Bio.Seq instead",
        category=DeprecationWarning)
    return _complement(seq, reverse=0)

def _forward_complement_list_with_table(table, seq):
    return [table[x] for x in seq.tostring().upper()]

from Bio.Data.IUPACData import ambiguous_dna_complement
ambiguous_rna_complement = ambiguous_dna_complement.copy()
ambiguous_rna_complement["A"] = "U"
ambiguous_rna_complement["U"] = "A"
del ambiguous_rna_complement["T"]

def _forward_complement_list(seq):
    """
    assumes DNA, then tries RNA
    """
    try:
        return _forward_complement_list_with_table(ambiguous_dna_complement, seq)
    except KeyError, err:
        if err[0] == "U":
            return _forward_complement_list_with_table(ambiguous_rna_complement, seq)
        raise

def _complement(seq, reverse=0):
    """
    forward or reverse complement

    >>> _complement(Seq('aaatttc'))
    Seq('TTTAAAG', Alphabet())
    >>> _complement(Seq('aaatttc'), reverse=1)
    Seq('GAAATTT', Alphabet())
    """
    complement_list = _forward_complement_list(seq)
    if reverse:
        complement_list.reverse()
    return Seq(''.join(complement_list), seq.alphabet)

def reverse_complement(seq):
    """
    deprecated
    
    # >>> reverse_complement(Seq('aaatttc'))
    # Seq('GAAATTT', Alphabet())
    """
    import warnings
    warnings.warn(
        "reverse_complement in Bio.GFF is deprecated; please use reverse_complement in Bio.Seq instead",
        category=DeprecationWarning)
    return _complement(seq, reverse=1)

###
### End section to be removed.
###

class TempFastaWriter(FastaWriter):
    def __init__(self, *args, **keywds):
        self.file = GenericTools.TempFile()
        FastaWriter.__init__(self, self.file, *args, **keywds)

class TempFastaWriterSingle(TempFastaWriter):
    def __init__(self, seqrecord, *args, **keywds):
        TempFastaWriter.__init__(self, *args, **keywds)
        self.write(seqrecord)
        self.close()

def _test(*args, **keywds):
    import doctest, sys
    doctest.testmod(sys.modules[__name__], *args, **keywds)

if __name__ == "__main__":
    if __debug__:
        _test()

Generated by  Doxygen 1.6.0   Back to index