Logo Search packages:      
Sourcecode: python-biopython version File versions  Download package

easy.py

#!/usr/bin/env python
#
# Copyright 2002 by Michael Hoffman.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""
Bio.GFF.easy: some functions to ease the use of Biopython
"""

from __future__ import generators # requires Python 2.2

__version__ = "$Revision: 1.8 $"
# $Source: /home/repository/biopython/biopython/Bio/GFF/easy.py,v $

import copy
import re
import string
import sys

import Bio
from Bio import GenBank
from Bio.Data import IUPACData
from Bio.Seq import Seq

#from Bio.SeqIO.FASTA import FastaReader, FastaWriter
#The whole of Bio.SeqIO.FASTA has been deprecated, so
#we'll use the new Bio.SeqIO functions instead:
from Bio import SeqIO

from Bio import SeqUtils

import GenericTools

00036 class FeatureDict(dict):
    """ JH:  accessing feature.qualifiers as a list is stupid.  Here's a dict that does it"""
    def __init__(self, feature_list, default=None):
        dict.__init__(self)
        self.default = default
        key_re = re.compile(r'/(\S+)=')

        for i in feature_list:
            key = key_re.match(i.key).group(1)
            val = string.replace(i.value,'"','')
            self[key] = val
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.default

00053 class Location(GenericTools.VerboseList):
    """
    this is really best interfaced through LocationFromString
    fuzzy: < or >
    join: {0 = no join, 1 = join, 2 = order}
    
    >>> location = Location([Location([339]), Location([564])]) # zero-based
    >>> location
    Location(Location(339), Location(564))
    >>> print location # one-based
    340..565
    >>> print location.five_prime()
    340
    >>> location_rev = Location([Location([339]), Location([564])], 1)
    >>> print location_rev
    complement(340..565)
    >>> print location_rev.five_prime()
    565
    """
    def __init__(self, the_list, complement=0, seqname=None):
        self.complement = complement
        self.join = 0
        self.fuzzy = None
        self.seqname = seqname
        list.__init__(self, the_list)

    def _joinstr(self):
        if self.join == 1:
            label = 'join'
        elif self.join == 2:
            label = 'order'
        return "%s(%s)" % (label, ",".join(map(str, self)))
    
    def __str__(self):
        if self.seqname:
            format = "%s:%%s" % self.seqname
        else:
            format = "%s"

        if self.complement:
            format = format % "complement(%s)"

        if self.join:
            return format % self._joinstr()
            
        elif isinstance(self[0], list):
            return format % "%s..%s" % (str(self[0]), str(self[1]))
        else:
            if self.fuzzy:
                format = format % self.fuzzy + "%s"
            return format % str(self[0] + 1)

    def __repr__(self):
        return "Location(%s)" % ", ".join(map(repr, self))

    direction2index = {1: 0, -1: -1}
00109     def direction_and_index(self, direction):
        """
        1: 5'
        -1: 3'

        >>> loc1 = LocationFromString("join(1..3,complement(5..6))")
        >>> loc1.direction_and_index(1)
        (1, 0)
        >>> loc1.direction_and_index(-1)
        (-1, -1)
        >>> loc1.reverse()
        >>> print loc1
        complement(join(1..3,complement(5..6)))
        >>> loc1.direction_and_index(1)
        (-1, -1)
        """
        if self.complement:
            direction = direction * -1
        index = self.direction2index[direction]
        return direction, index
    
00130     def findside(self, direction):
        """
        >>> loc = LocationFromString("complement(join(1..5,complement(6..10)))")
        >>> loc.findside(1)
        Location(5)
        >>> loc.findside(-1)
        Location(0)
        """
        direction, index = self.direction_and_index(direction)
        if self.join or isinstance(self[0], list):
            return self[index].findside(direction)
        else:
            return self

00144     def findseqname_3prime(self):
        """
        >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
        >>> loc.findseqname_3prime()
        'MOOCOW'
        """
        return self.findseqname(-1)

00152     def findseqname(self, direction=1): # find 5' seqname
        """
        >>> loc = LocationFromString("complement(join(MOOCOW:1..5,SEQ:complement(6..10)))")
        >>> loc.findseqname()
        'SEQ'
        >>> loc.findseqname(-1)
        'MOOCOW'
        """
        direction, index = self.direction_and_index(direction)
        if self.seqname:
            return self.seqname
        elif self.join:
            return self[index].findseqname(direction)
        else:
            raise AttributeError, 'no sequence name'

    def five_prime(self):
        return self.findside(1)
    def three_prime(self):
        return self.findside(-1)

00173     def length(self):
        """
        WARNING: doesn't deal with joins!!!!
        """
        return self.end()-self.start()

00179     def intersection(self, other):
        """
        WARNING: doesn't deal with joins!!!!

        >>> location1 = LocationFromString("1..50")
        >>> location2 = LocationFromString("25..200")
        >>> print location1.intersection(location2)
        25..50
        >>> print location1.intersection(location2)
        25..50
        """
        if self.start() >= other.start():
            start = self.start()
        else:
            start = other.start()
        if self.end() <= other.end():
            end = self.end()
        else:
            end = other.end()
        return Location([Location([start]), Location([end])])

    def start(self):
        # zero-based
        if self.complement:
            return self.three_prime()[0]
        else:
            return self.five_prime()[0]

    def end(self):
        # zero-based
        if self.complement:
            return self.five_prime()[0]
        else:
            return self.three_prime()[0]

    def three_prime_range(self, window):
        three_prime_loc = self.three_prime()
        if self.complement:
            return Location([three_prime_loc-window, three_prime_loc], complement=1)
        else:
            return Location([three_prime_loc, three_prime_loc+window])

00221     def sublocation(self, sub_location):
        """
        >>> fwd_location = LocationFromString('X:5830132..5831528')
        >>> print fwd_location.sublocation(LocationFromString('1..101'))
        X:5830132..5830232
        >>> print fwd_location.sublocation(LocationFromString('1267..1286'))
        X:5831398..5831417
        >>> rev_location = LocationFromString('I:complement(8415686..8416216)')
        >>> print rev_location.sublocation(LocationFromString('1..101'))
        I:complement(8416116..8416216)
        >>> print rev_location.sublocation(LocationFromString('100..200'))
        I:complement(8416017..8416117)
        """
        
        absolute_location = copy.deepcopy(self)
        for i in xrange(2):
            absolute_location[i] = self.five_prime().add(sub_location[i], self.complement)
        if absolute_location.complement:
            list.reverse(absolute_location)
        return absolute_location

    def __add__(self, addend):
        return self.add(addend)

    def add(self, addend, complement=0):
        self_copy = copy.deepcopy(self)
        if isinstance(addend, Location):
            addend = addend[0]
        if complement:
            addend *= -1
        self_copy[0] += addend
        return self_copy

    def __sub__(self, subtrahend):
        return self + -subtrahend

    def reverse(self):
        self.complement = [1, 0][self.complement]

00260     def reorient(self):
        """
        >>> loc1 = LocationFromString("join(I:complement(1..9000),I:complement(9001..10000))")
        >>> loc1.reorient()
        >>> print loc1
        complement(join(I:1..9000,I:9001..10000))
        >>> loc2 = LocationFromString("join(I:complement(1..9000),I:9001..10000)")
        >>> loc2.reorient()
        >>> print loc2
        join(I:complement(1..9000),I:9001..10000)
        """
        if self.join:
            if len([x for x in self if x.complement]) == len(self):
                self.reverse()
                for segment in self:
                    segment.reverse()

00277     def bounding(self):
        """
        works for single level non-complex joins

        >>> LOC = LocationFromString
        >>> l1 = LOC("join(alpha:1..30,alpha:50..70)")
        >>> print l1.bounding()
        join(alpha:1..70)
        >>> l2 = LOC("join(alpha:1..30,alpha:complement(50..70))")
        >>> print l2.bounding()
        join(alpha:1..30,alpha:complement(50..70))
        >>> l3 = LOC("join(alpha:1..30,alpha:complement(50..70),beta:6..20,alpha:25..45)")
        >>> print l3.bounding()
        join(alpha:1..45,alpha:complement(50..70),beta:6..20)

        """
        if not self.join:
            return

        seqdict = {}
        seqkeys = []
        for subloc in self:
            assert subloc.seqname
            assert not subloc.join
            try:
                seqdict[_hashname(subloc)].append(subloc)
            except KeyError:
                key = _hashname(subloc)
                seqdict[key] = [subloc]
                seqkeys.append(key)

        res = LocationJoin()
        for key in seqkeys:
            locations = seqdict[key]
            coords = []
            for subloc in locations:
                coords.append(subloc.start())
                coords.append(subloc.end())
            res.append(LocationFromCoords(min(coords), max(coords), locations[0].complement, locations[0].seqname))
        return res

def _hashname(location):
    return str(location.complement) + location.seqname

00321 class LocationJoin(Location):
    """
    >>> join = LocationJoin([LocationFromCoords(339, 564, 1), LocationFromString("complement(100..339)")])
    >>> appendloc = LocationFromString("order(complement(66..99),complement(5..55))")
    >>> join.append(appendloc)
    >>> print join
    join(complement(340..565),complement(100..339),order(complement(66..99),complement(5..55)))
    >>> join2 = LocationJoin()
    >>> join2.append(LocationFromString("complement(66..99)"))
    >>> join2.append(LocationFromString("complement(5..55)"))
    >>> print join2
    join(complement(66..99),complement(5..55))
    """
    def __init__(self, the_list = [], complement=0, seqname=None):
        self.complement = complement
        self.join = 1
        self.fuzzy = None
        self.seqname = seqname
        list.__init__(self, the_list)

00341 class LocationFromCoords(Location):
    """
    >>> print LocationFromCoords(339, 564)
    340..565
    >>> print LocationFromCoords(339, 564, seqname="I")
    I:340..565
    >>> print LocationFromCoords(999, 3234, "-", seqname="NC_343434")
    NC_343434:complement(1000..3235)
    """
    def __init__(self, start, end, strand=0, seqname=None):
        if strand == "+":
            strand = 0
        elif strand == "-":
            strand = 1
        Location.__init__(self, [Location([start]), Location([end])], strand, seqname)

# see http://www.ncbi.nlm.nih.gov/collab/FT/index.html#backus-naur
# for how this should actually be implemented
re_complement = re.compile(r"^complement\((.*)\)$")
re_seqname = re.compile(r"^(?!join|order|complement)([^\:]+?):(.*)$") # not every character is allowed by spec
re_join = re.compile(r"^(join|order)\((.*)\)$")
re_dotdot = re.compile(r"^([><]*\d+)\.\.([><]*\d+)$")
re_fuzzy = re.compile(r"^([><])(\d+)")
00364 class LocationFromString(Location):
    """
    >>> # here are some tests from http://www.ncbi.nlm.nih.gov/collab/FT/index.html#location
    >>> print LocationFromString("467")
    467
    >>> print LocationFromString("340..565")
    340..565
    >>> print LocationFromString("<345..500")
    <345..500
    >>> print LocationFromString("<1..888")
    <1..888
    >>> # (102.110) and 123^124 syntax unimplemented
    >>> print LocationFromString("join(12..78,134..202)")
    join(12..78,134..202)
    >>> print LocationFromString("complement(join(2691..4571,4918..5163))")
    complement(join(2691..4571,4918..5163))
    >>> print LocationFromString("join(complement(4918..5163),complement(2691..4571))")
    join(complement(4918..5163),complement(2691..4571))
    >>> print LocationFromString("order(complement(4918..5163),complement(2691..4571))")
    order(complement(4918..5163),complement(2691..4571))
    >>> print LocationFromString("NC_001802x.fna:73..78")
    NC_001802x.fna:73..78
    >>> print LocationFromString("J00194:100..202")
    J00194:100..202

    >>> print LocationFromString("join(117505..118584,1..609)")
    join(117505..118584,1..609)
    >>> print LocationFromString("join(test3:complement(4..6),test3:complement(1..3))")
    join(test3:complement(4..6),test3:complement(1..3))
    >>> print LocationFromString("test3:join(test1:complement(1..3),4..6)")
    test3:join(test1:complement(1..3),4..6)
    """
    def __init__(self, location_str):
        match_seqname = re_seqname.match(location_str)
        if match_seqname:
            self.seqname = match_seqname.group(1)
            location_str = match_seqname.group(2)
        else:
            self.seqname = None
        match_complement = re_complement.match(location_str)
        if match_complement:
            self.complement = 1
            location_str = match_complement.group(1)
        else:
            self.complement = 0
        match_join = re_join.match(location_str)
        if match_join:
            self.join = {'join':1, 'order':2}[match_join.group(1)]
            list.__init__(self, map(lambda x: LocationFromString(x), match_join.group(2).split(",")))
        else:
            self.join = 0
            match_dotdot = re_dotdot.match(location_str)
            if match_dotdot:
                list.__init__(self, map(lambda x: LocationFromString(match_dotdot.group(x)), (1, 2)))
            else:
                match_fuzzy = re_fuzzy.match(location_str)
                if match_fuzzy:
                    self.fuzzy = match_fuzzy.group(1)
                    location_str = match_fuzzy.group(2)
                else:
                    self.fuzzy = None
                    
                list.__init__(self, [int(location_str)-1]) # zero based, nip it in the bud

def open_file(filename):
    if filename:
        return open(filename)
    else:
        return sys.stdin

def fasta_single(filename=None, string=None):
    """
    >>> record = fasta_single(string=\"""
    ... >gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]
    ... MGARASVLSGGELDRWEKIRLRPGGKKKYKLKHIVWASRELERFAVNPGLLETSEGCRQILGQLQPSLQT
    ... GSEELRSLYNTVATLYCVHQRIEIKDTKEALDKIEEEQNKSKKKAQQAAADTGHSNQVSQNYPIVQNIQG
    ... QMVHQAISPRTLNAWVKVVEEKAFSPEVIPMFSALSEGATPQDLNTMLNTVGGHQAAMQMLKETINEEAA
    ... EWDRVHPVHAGPIAPGQMREPRGSDIAGTTSTLQEQIGWMTNNPPIPVGEIYKRWIILGLNKIVRMYSPT
    ... SILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPAATLEEMMTAC
    ... QGVGGPGHKARVLAEAMSQVTNSATIMMQRGNFRNQRKIVKCFNCGKEGHTARNCRAPRKKGCWKCGKEG
    ... HQMKDCTERQANFLGKIWPSYKGRPGNFLQSRPEPTAPPEESFRSGVETTTPPQKQEPIDKELYPLTSLR
    ... SLFGNDPSSQ
    ... \""")
    >>> record.id
    'gi|9629360|ref|NP_057850.1|'
    >>> record.description
    'gi|9629360|ref|NP_057850.1| Gag [Human immunodeficiency virus type 1]'
    >>> record.seq[0:5]
    Seq('MGARA', SingleLetterAlphabet())
    """
    #Returns the first record in a fasta file as a SeqRecord,
    #or None if there are no records in the file.
    if string:
        import cStringIO
        handle = cStringIO.StringIO(string)
    else :
        handle = open_file(filename)
    try :
        record = SeqIO.parse(handle, format="fasta").next()
    except StopIteration :
        record = None
    return record

def fasta_multi(filename=None):
    #Simple way is just:
    #return SeqIO.parse(open_file(filename), format="fasta")
    #However, for backwards compatibility make sure we raise
    #the StopIteration exception rather than returning None.
    reader = SeqIO.parse(open_file(filename), format="fasta")
    while True:
        record = reader.next()
        if record is None:
            raise StopIteration
        else:
            yield record

def fasta_readrecords(filename=None):
    """
    >>> records = fasta_readrecords('GFF/multi.fna')
    >>> records[0].id
    'test1'
    >>> records[2].seq
    Seq('AAACACAC', SingleLetterAlphabet())
    """
    return list(SeqIO.parse(open_file(filename), format="fasta"))

def fasta_write(filename, records):
    handle = open(filename, "w")
    SeqIO.write(records, handle, format="fasta")
    handle.close()

def genbank_single(filename):
    """
    >>> record = genbank_single("GFF/NC_001422.gbk")
    >>> record.taxonomy
    ['Viruses', 'ssDNA viruses', 'Microviridae', 'Microvirus']
    >>> cds = record.features[-4]
    >>> cds.key
    'CDS'
    >>> location = LocationFromString(cds.location)
    >>> print location
    2931..3917
    >>> subseq = record_subseq(record, location)
    >>> subseq[0:20]
    Seq('ATGTTTGGTGCTATTGCTGG', Alphabet())
    """
    return GenBank.RecordParser().parse(open(filename))

def record_subseq(record, location, *args, **keywds):
    """
    >>> from Bio.SeqRecord import SeqRecord    
    >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
    ...                    "ref|NC_001422",
    ...                    "Coliphage phiX174, complete genome",
    ...                    "bases 1-11")
    >>> record_subseq(record, LocationFromString("1..4")) # one-based
    Seq('GAGT', Alphabet())
    >>> record_subseq(record, LocationFromString("complement(1..4)")) # one-based
    Seq('ACTC', Alphabet())
    >>> record_subseq(record, LocationFromString("join(complement(1..4),1..4)")) # what an idea!
    Seq('ACTCGAGT', Alphabet())
    >>> loc = LocationFromString("complement(join(complement(5..7),1..4))")
    >>> print loc
    complement(join(complement(5..7),1..4))
    >>> record_subseq(record, loc)
    Seq('ACTCTTT', Alphabet())
    >>> print loc
    complement(join(complement(5..7),1..4))
    >>> loc.reverse()
    >>> record_subseq(record, loc)
    Seq('AAAGAGT', Alphabet())
    >>> record_subseq(record, loc, upper=1)
    Seq('AAAGAGT', Alphabet())
    """
    if location.join:
        subsequence_list = []
        if location.complement:
            location_copy = copy.copy(location)
            list.reverse(location_copy)
        else:
            location_copy = location
        for sublocation in location_copy:
            if location.complement:
                sublocation_copy = copy.copy(sublocation)
                sublocation_copy.reverse()
            else:
                sublocation_copy = sublocation
            subsequence_list.append(record_subseq(record, sublocation_copy, *args, **keywds).tostring())
        return Seq(''.join(subsequence_list), record_sequence(record).alphabet)
    else:
        return record_coords(record, location.start(), location.end()+1, location.complement, *args, **keywds)

def record_sequence(record):
    """
    returns the sequence of a record

    can be Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record
    """
    if isinstance(record, Bio.SeqRecord.SeqRecord):
        return record.seq
    elif isinstance(record, Bio.GenBank.Record.Record):
        return Seq(record.sequence)
    else:
        raise TypeError, 'not Bio.SeqRecord.SeqRecord or Bio.GenBank.Record.Record'

def record_coords(record, start, end, strand=0, upper=0):
    """
    >>> from Bio.SeqRecord import SeqRecord
    >>> record = SeqRecord(Seq("gagttttatcgcttccatga"),
    ...                    "ref|NC_001422",
    ...                    "Coliphage phiX174, complete genome",
    ...                    "bases 1-11")
    >>> record_coords(record, 0, 4) # zero-based
    Seq('GAGT', Alphabet())
    >>> record_coords(record, 0, 4, "-") # zero-based
    Seq('ACTC', Alphabet())
    >>> record_coords(record, 0, 4, "-", upper=1) # zero-based
    Seq('ACTC', Alphabet())
    """

    subseq = record_sequence(record)[start:end]
    subseq_str = subseq.tostring()
    subseq_str = subseq_str.upper()
    subseq = Seq(subseq_str, subseq.alphabet)
    if strand == '-' or strand == 1:
        return subseq.reverse_complement()
    else:
        return subseq

def _test(*args, **keywds):
    import doctest, sys
    doctest.testmod(sys.modules[__name__], *args, **keywds)

if __name__ == "__main__":
    if __debug__:
        _test()

Generated by  Doxygen 1.6.0   Back to index