# Logo Search packages:
# Sourcecode: python-biopython version File versions
#
# test_NNGene.py

#!/usr/bin/env python
"""Test the different representations of Genes.

This exercises the Motif, Schema and Signature methods of representing
genes, as well as generic Pattern methods.
"""
# standard library
import os
import unittest

# Biopython
from Bio import Fasta
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# stuff we are testing
from Bio.NeuralNetwork.Gene import Schema
from Bio.NeuralNetwork.Gene import Motif
from Bio.NeuralNetwork.Gene import Signature
from Bio.NeuralNetwork.Gene import Pattern

# Set to a true value to print the generated schemas and schema values and
# to run the slow schema-finding tests that are guarded by ``if VERBOSE``.
VERBOSE = 0


# --- Tests for Pattern

class PatternIOTest(unittest.TestCase):
    """Tests for reading and writing patterns to a file.

    Every test writes patterns with a Pattern.PatternIO built on the
    ambiguous DNA alphabet, reads them back from the same file and
    compares the result with what was written.
    """
    def setUp(self):
        """Create the PatternIO object and make sure no stale output exists."""
        self.alphabet = IUPAC.ambiguous_dna
        self.test_file = os.path.join("NeuralNetwork", "patternio.txt")
        # Remove any existing copy of the output file.
        if os.path.isfile(self.test_file):
            os.remove(self.test_file)
        self.pattern_io = Pattern.PatternIO(self.alphabet)

    def tearDown(self):
        """Clean up by removing our output file."""
        if os.path.isfile(self.test_file):
            os.remove(self.test_file)

    def _write(self, write_method, patterns):
        """Write patterns to the test file using the given PatternIO method.

        The handle is closed even when writing fails, so tearDown can
        always remove the file.
        """
        output_handle = open(self.test_file, "w")
        try:
            write_method(patterns, output_handle)
        finally:
            output_handle.close()

    def _read(self):
        """Return the patterns PatternIO reads back from the test file."""
        input_handle = open(self.test_file, "r")
        try:
            return self.pattern_io.read(input_handle)
        finally:
            input_handle.close()

    def test_motif(self):
        """Reading and writing motifs to a file
        """
        motifs = ["GAC", "AAA", "TTT", "GGG"]

        # write plain strings to a file and read 'em back
        self._write(self.pattern_io.write, motifs)
        read_motifs = self._read()
        assert read_motifs == motifs, \
               "Failed to get back expected motifs %s, got %s" \
               % (motifs, read_motifs)

        # write the same motifs as Seq objects; reading them back is
        # expected to give the original strings
        seq_motifs = [Seq(motif, self.alphabet) for motif in motifs]
        self._write(self.pattern_io.write_seq, seq_motifs)
        read_motifs = self._read()
        assert read_motifs == motifs, \
               "Failed to get back expected motifs %s from seqs, got %s" \
               % (motifs, read_motifs)

    def test_schema(self):
        """Reading and writing schemas to a file.
        """
        schemas = ["GTR", "GAC"]
        self._write(self.pattern_io.write, schemas)
        read_schemas = self._read()
        assert schemas == read_schemas, \
               "Read incorrect schemas %s, expected %s." \
               % (read_schemas, schemas)

        # --- make sure inappropriate alphabets are reported
        schemas = ["GTR", "G*C"]  # '*' is not in the ambiguous DNA alphabet
        self._write(self.pattern_io.write, schemas)

        # The "did not report" failure is raised from the else clause so
        # that it cannot be caught by the generic handler below and be
        # re-reported with the wrong message.
        try:
            self._read()
        except ValueError:
            pass  # expected behavior
        except Exception:
            raise AssertionError("Got unexpected error while reading.")
        else:
            raise AssertionError("Did not report error on bad alphabet.")

    def test_signature(self):
        """Reading and writing signatures to a file.
        """
        signatures = [("GAC", "GAC"), ("AAA", "TTT")]
        self._write(self.pattern_io.write, signatures)
        read_sigs = self._read()
        assert read_sigs == signatures, \
               "Got back unexpected signatures %s, wanted %s" \
               % (read_sigs, signatures)

class PatternRepositoryTest(unittest.TestCase):
    """Tests for retrieving info from a repository of patterns.

    The repository is built from a small dictionary mapping motifs to
    counts (30, 10, 0, -10 and -20).
    """
    def setUp(self):
        self.motifs = {"GATC" : 30,
                       "GGGG" : 10,
                       "GTAG" : 0,
                       "AAAA" : -10,
                       "ATAT" : -20}

        self.repository = Pattern.PatternRepository(self.motifs)

    def _check_patterns(self, patterns, num_patterns):
        """Check that we got num_patterns patterns, all from self.motifs."""
        assert len(patterns) == num_patterns, \
               "Got unexpected number of patterns %s, expected %s" \
               % (len(patterns), num_patterns)

        for pattern in patterns:
            assert pattern in self.motifs, \
                   "Got unexpected pattern %s" % pattern

    def test_get_all(self):
        """Retrieve all patterns from a repository.
        """
        all_motifs = self.repository.get_all()

        # the expected list runs from the highest count to the lowest
        assert all_motifs == ["GATC", "GGGG", "GTAG", "AAAA", "ATAT"], \
               "Unexpected motifs returned %s" % all_motifs

    def test_get_random(self):
        """Retrieve random patterns from the repository.
        """
        for num_patterns in range(5):
            patterns = self.repository.get_random(num_patterns)
            self._check_patterns(patterns, num_patterns)

    def test_get_top_percentage(self):
        """Retrieve the top percentage of patterns from the repository.
        """
        # with five motifs, 20% is one pattern, 40% is two, 100% is all five
        for num_patterns, percentage in ((1, 0.2), (2, .4), (5, 1.0)):
            patterns = self.repository.get_top_percentage(percentage)
            self._check_patterns(patterns, num_patterns)

    def test_get_top(self):
        """Retrieve a certain number of the top patterns.
        """
        for num_patterns in range(5):
            patterns = self.repository.get_top(num_patterns)
            self._check_patterns(patterns, num_patterns)

    def test_get_differing(self):
        """Retrieve patterns from both sides of the list (top and bottom).
        """
        patterns = self.repository.get_differing(2, 2)
        assert patterns == ["GATC", "GGGG", "AAAA", "ATAT"], \
               "Got unexpected patterns %s" % patterns

    def test_remove_polyA(self):
        """Test the ability to remove A rich patterns from the repository.
        """
        patterns = self.repository.get_all()
        assert len(patterns) == 5, "Unexpected starting: %s" % patterns

        self.repository.remove_polyA()

        # "AAAA" and "ATAT" are the two motifs expected to be dropped
        patterns = self.repository.get_all()
        assert len(patterns) == 3, "Unexpected ending: %s" % patterns
        assert patterns == ["GATC", "GGGG", "GTAG"], \
               "Unexpected patterns: %s" % patterns

    def test_count(self):
        """Retrieve counts for particular patterns in the repository.
        """
        num_times = self.repository.count("GGGG")
        assert num_times == 10, \
               "Did not count item in the respository: %s" % num_times

        # a pattern that was never added is expected to count as zero
        num_times = self.repository.count("NOT_IN_THERE")
        assert num_times == 0, \
               "Counted items not in repository: %s" % num_times

# --- Tests for motifs

class MotifFinderTest(unittest.TestCase):
    """Tests for finding motifs from sequences.
    """
    def setUp(self):
        """Load the two FASTA record sets and create the MotifFinder."""
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        for file_name, records in ((test_file, self.test_records),
                                   (diff_file, self.diff_records)):
            handle = open(file_name, 'r')
            # close the handle even if parsing a record fails
            try:
                seq_parser = Fasta.SequenceParser(
                    alphabet = IUPAC.unambiguous_dna)
                iterator = Fasta.Iterator(handle, seq_parser)
                while True:
                    seq_record = iterator.next()
                    # a None record is treated as the end of the file
                    if seq_record is None:
                        break
                    records.append(seq_record)
            finally:
                handle.close()

        self.motif_finder = Motif.MotifFinder()

    def test_find(self):
        """Find all motifs in a set of sequences.
        """
        # look for motifs of size 8 and check the top one
        motif_repository = self.motif_finder.find(self.test_records, 8)
        top_motif = motif_repository.get_top(1)

        assert top_motif[0] == 'TTGGAAAG', \
               "Got unexpected motif %s" % top_motif[0]

    def test_find_differences(self):
        """Find the difference in motif counts between two sets of sequences.
        """
        motif_repository = \
               self.motif_finder.find_differences(self.test_records,
                                                  self.diff_records, 8)

        # one motif from each end of the repository
        top, bottom = motif_repository.get_differing(1, 1)

        assert top == "TTGGAAAG", "Got unexpected top motif %s" % top
        assert bottom == "AATGGCAT", "Got unexpected bottom motif %s" % bottom

class MotifCoderTest(unittest.TestCase):
    """Test the ability to encode sequences as a set of motifs.
    """
    def setUp(self):
        motifs = ["GAG", "GAT", "GCC", "ATA"]

        # pairs of (sequence, expected representation); each expected list
        # has one value per motif, in the order the motifs are given above
        self.match_strings = (("GATCGCC", [0.0, 1.0, 1.0, 0.0]),
                              ("GATGATCGAGCC", [.5, 1.0, .5, 0.0]))

        self.coder = Motif.MotifCoder(motifs)

    def test_representation(self):
        """Convert a sequence into its motif representation.
        """
        for match_string, expected in self.match_strings:
            seq_to_code = Seq(match_string, IUPAC.unambiguous_dna)
            matches = self.coder.representation(seq_to_code)

            assert matches == expected, \
                   "Did not match representation, expected %s, got %s" \
                   % (expected, matches)

# --- Tests for schemas

class SchemaTest(unittest.TestCase):
    """Matching ambiguous motifs with multiple ambiguity characters.
    """
    def setUp(self):
        # each schema character maps to the unambiguous letters it can match
        ambiguity_chars = {"G" : "G",
                           "A" : "A",
                           "T" : "T",
                           "C" : "C",
                           "R" : "AG",
                           "*" : "AGTC"}

        self.motif_coder = Schema.Schema(ambiguity_chars)

        # pairs of (motif, matches expected when searching match_string)
        self.match_string = "GATAG"
        self.match_info = [("GA", ["GA"]),
                           ("GATAG", ["GATAG"]),
                           ("GA*AG", ["GATAG"]),
                           ("GATRG", ["GATAG"]),
                           ("*A", ["GA", "TA"])]

    def test_find_matches(self):
        """Find all matches in a sequence.
        """
        for motif, expected in self.match_info:
            found_matches = self.motif_coder.find_matches(motif,
                                                          self.match_string)
            assert found_matches == expected, "Expected %s, got %s" \
                   % (expected, found_matches)

    def test_num_matches(self):
        """Find how many matches are present in a sequence.
        """
        for motif, expected in self.match_info:
            num_matches = self.motif_coder.num_matches(motif,
                                                       self.match_string)
            # report the expected count first and the actual count second,
            # in the order the message names them
            assert num_matches == len(expected), \
                   "Expected %s, got %s" % (len(expected), num_matches)

    def test_find_ambiguous(self):
        """Find the positions of ambiguous items in a sequence.
        """
        # pairs of (motif, expected positions of its ambiguity characters)
        ambig_info = (("GATC", []),
                      ("G***", [1, 2, 3]),
                      ("GART", [2]),
                      ("*R*R", [0, 1, 2, 3]))

        for motif, expected in ambig_info:
            found_positions = self.motif_coder.find_ambiguous(motif)
            assert found_positions == expected, \
                   "Expected %s, got %s for %s" % (expected, found_positions,
                                                   motif)

    def test_num_ambiguous(self):
        """Find the number of ambiguous items in a sequence.
        """
        ambig_info = (("GATC", 0),
                      ("G***", 3),
                      ("GART", 1),
                      ("*R*R", 4))

        for motif, expected in ambig_info:
            found_num = self.motif_coder.num_ambiguous(motif)
            assert found_num == expected, \
                   "Expected %s, got %s for %s" % (expected, found_num, motif)

    def test_motif_cache(self):
        """Make sure motif compiled regular expressions are cached properly.
        """
        test_motif = "GATC"

        self.motif_coder.find_matches(test_motif, "GATCGATC")

        # "in" replaces has_key(), which Python 3 dictionaries no longer have
        assert test_motif in self.motif_coder._motif_cache, \
               "Did not find motif cached properly."

        # make sure we don't bomb out if we use the same motif twice
        self.motif_coder.find_matches(test_motif, "GATCGATC")

    def test_all_unambiguous(self):
        """Return all unambiguous characters that can be in a motif.
        """
        found_unambig = self.motif_coder.all_unambiguous()

        expected = ["A", "C", "G", "T"]
        assert found_unambig == expected, \
               "Got %s, expected %s" % (found_unambig, expected)

class SchemaFinderTest(unittest.TestCase):
    """Test finding schemas from a set of sequences.

    Both tests only do any work when the module-level VERBOSE flag is set,
    because the searches take too long for a routine run.
    """
    def setUp(self):
        """Load the two FASTA record sets and configure the SchemaFinder."""
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')
        diff_file = os.path.join('NeuralNetwork', 'repeat.fasta')

        self.test_records = []
        self.diff_records = []

        # load the records
        for file_name, records in ((test_file, self.test_records),
                                   (diff_file, self.diff_records)):
            handle = open(file_name, 'r')
            # close the handle even if parsing a record fails
            try:
                seq_parser = Fasta.SequenceParser(
                    alphabet = IUPAC.unambiguous_dna)
                iterator = Fasta.Iterator(handle, seq_parser)
                while True:
                    seq_record = iterator.next()
                    # a None record is treated as the end of the file
                    if seq_record is None:
                        break
                    records.append(seq_record)
            finally:
                handle.close()

        self.num_schemas = 2
        schema_ga = Schema.GeneticAlgorithmFinder()
        # NOTE(review): presumably a single generation keeps the genetic
        # algorithm run short -- confirm against GeneticAlgorithmFinder
        schema_ga.min_generations = 1
        self.finder = Schema.SchemaFinder(num_schemas = self.num_schemas,
                                          schema_finder = schema_ga)

    def test_find(self):
        """Find schemas from sequence inputs.
        """
        # this test takes too long
        if VERBOSE:
            repository = self.finder.find(self.test_records + self.diff_records)
            schemas = repository.get_all()

            assert len(schemas) >= self.num_schemas, "Got too few schemas."

    def test_find_differences(self):
        """Find schemas that differentiate between two sets of sequences.
        """
        # this test takes too long
        if VERBOSE:
            repository = self.finder.find_differences(self.test_records,
                                                      self.diff_records)
            schemas = repository.get_all()

            assert len(schemas) >= self.num_schemas, "Got too few schemas."
        
class SchemaCoderTest(unittest.TestCase):
    """Test encoding sequences as a grouping of motifs.
    """
    def setUp(self):
        # each schema character maps to the unambiguous letters it can match
        ambiguity_chars = {"G" : "G",
                           "A" : "A",
                           "T" : "T",
                           "C" : "C",
                           "R" : "AG",
                           "*" : "AGTC"}

        motif_representation = Schema.Schema(ambiguity_chars)
        motifs = ("GA", "GATAG", "GA*AG", "GATRG", "*A")
        self.motif_coder = Schema.SchemaCoder(motifs,
                                              motif_representation)

        # pairs of (sequence, expected representation); each expected list
        # has one value per motif, in the order the motifs are given above
        self.match_strings = [("GATAG", [.5, .5, .5, .5, 1.0]),
                              ("GAGAGATA", [0.75, 0, 0.25, 0, 1])]

    def test_representation(self):
        """Convert a string into a representation of motifs.
        """
        for match_string, expected in self.match_strings:
            match_seq = Seq(match_string, IUPAC.unambiguous_dna)
            found_rep = self.motif_coder.representation(match_seq)
            assert found_rep == expected, "Got %s, expected %s" % \
                   (found_rep, expected)
    
class SchemaMatchingTest(unittest.TestCase):
    """Matching schema to strings works correctly.
    """
    # NOTE: the class docstring above is returned by shortDescription(), so
    # changing its text changes what the test runner reports.

    def shortDescription(self):
        # report the class name and class docstring for this test
        return "%s:%s" % (self.__class__.__name__, self.__doc__)

    def runTest(self):
        # the two arguments have different lengths
        match = Schema.matches_schema("GATC", "AAAAA")
        assert match == 0, "Expected no match because of length differences"

        # '*' in the second argument lines up with the final C
        match = Schema.matches_schema("GATC", "GAT*")
        assert match == 1, "Expected match"

        # identical strings
        match = Schema.matches_schema("GATC", "GATC")
        assert match == 1, "Expected match"

        # the first characters (G and C) differ and neither is '*'
        match = Schema.matches_schema("GATC", "C*TC")
        assert match == 0, "Expected no match because of char mismatch."

        # every differing position has a '*' in one of the two arguments
        match = Schema.matches_schema("G*TC", "*TTC")
        assert match == 1, "Expected match because of ambiguity."

class SchemaFactoryTest(unittest.TestCase):
    """Test the SchemaFactory for generating Schemas.
    """
    # A cached schema bank, so we don't have to load it multiple times.
    # It lives on the class because unittest creates a separate TestCase
    # instance for every test method; a cache stored on the instance
    # would never be seen by another test.
    schema_bank = None

    def __init__(self, method):
        unittest.TestCase.__init__(self, method)

    def setUp(self):
        self.factory = Schema.SchemaFactory()

        self.test_file = os.path.join(os.getcwd(), "NeuralNetwork", "enolase.fasta")

        # each schema character maps to the unambiguous letters it can match
        ambiguity_chars = {"G" : "G",
                           "A" : "A",
                           "T" : "T",
                           "C" : "C",
                           "R" : "AG",
                           "*" : "AGTC"}

        self.schema = Schema.Schema(ambiguity_chars)

    def test_easy_from_motifs(self):
        """Generating schema from a simple list of motifs.
        """
        motifs = {"GATCGAA" : 20,
                  "GATCGAT" : 15,
                  "GATTGAC" : 25,
                  "TTTTTTT" : 10}

        motif_bank = Pattern.PatternRepository(motifs)

        # nothing is asserted here; the test passes if no error is raised
        schema_bank = self.factory.from_motifs(motif_bank, .5, 2)
        if VERBOSE:
            print("\nSchemas:")
            for schema in schema_bank.get_all():
                print("%s: %s" % (schema, schema_bank.count(schema)))

    def test_hard_from_motifs(self):
        """Generating schema from a real life set of motifs.
        """
        schema_bank = self._load_schema_repository()

        if VERBOSE:
            print("\nSchemas:")
            for schema in schema_bank.get_top(5):
                print("%s: %s" % (schema, schema_bank.count(schema)))

    def _load_schema_repository(self):
        """Helper function to load a schema repository from a file.

        This also caches a schema bank on the class, to prevent having to
        do this time consuming operation multiple times.

        Returns the schema bank built from the motifs found in
        self.test_file.
        """
        # if we already have a cached repository, return it
        if SchemaFactoryTest.schema_bank is not None:
            return SchemaFactoryTest.schema_bank

        # otherwise, we'll read in a new schema bank

        # read in all of the motif records
        seq_records = []
        motif_handle = open(self.test_file, 'r')
        # close the handle even if parsing a record fails
        try:
            seq_parser = Fasta.SequenceParser(alphabet = IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(motif_handle, seq_parser)
            while True:
                seq_record = iterator.next()
                # a None record is treated as the end of the file
                if seq_record is None:
                    break
                seq_records.append(seq_record)
        finally:
            motif_handle.close()

        # find motifs from the file
        motif_finder = Motif.MotifFinder()
        motif_size = 9

        motif_bank = motif_finder.find(seq_records, motif_size)

        schema_bank = self.factory.from_motifs(motif_bank, .1, 2)

        # cache the repository for the other tests in this class
        SchemaFactoryTest.schema_bank = schema_bank

        return schema_bank

    def test_schema_representation(self):
        """Convert sequences into schema representations.
        """
        # get a set of schemas we want to code the sequence in
        schema_bank = self._load_schema_repository()
        top_schemas = schema_bank.get_top(25)
        schema_coder = Schema.SchemaCoder(top_schemas, self.schema)

        # get the sequences one at a time, and encode them
        fasta_handle = open(self.test_file, 'r')
        # close the handle even if encoding a record fails
        try:
            seq_parser = Fasta.SequenceParser(alphabet = IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(fasta_handle, seq_parser)

            while True:
                seq_record = iterator.next()
                # a None record is treated as the end of the file
                if seq_record is None:
                    break

                schema_values = schema_coder.representation(seq_record.seq)
                if VERBOSE:
                    print("Schema values: %s" % (schema_values,))
        finally:
            fasta_handle.close()

# --- Tests for Signatures
class SignatureFinderTest(unittest.TestCase):
    """Test the ability to find signatures in a set of sequences.
    """
    def setUp(self):
        """Load the FASTA records and create the SignatureFinder."""
        test_file = os.path.join('NeuralNetwork', 'enolase.fasta')

        self.test_records = []

        # load the records
        handle = open(test_file, 'r')
        # close the handle even if parsing a record fails
        try:
            seq_parser = Fasta.SequenceParser(alphabet = IUPAC.unambiguous_dna)
            iterator = Fasta.Iterator(handle, seq_parser)
            while True:
                seq_record = iterator.next()
                # a None record is treated as the end of the file
                if seq_record is None:
                    break
                self.test_records.append(seq_record)
        finally:
            handle.close()

        self.sig_finder = Signature.SignatureFinder()

    def test_find(self):
        """Find signatures from sequence inputs.
        """
        # NOTE(review): 6 and 9 look like the motif size and the maximum
        # gap between the two halves -- confirm against SignatureFinder.find
        repository = self.sig_finder.find(self.test_records, 6, 9)
        top_sig = repository.get_top(1)

        assert top_sig[0] == ('TTGGAA', 'TGGAAA'), \
               "Unexpected signature %s" % top_sig[0]

class SignatureCoderTest(unittest.TestCase):
    """Test the ability to encode sequences as a set of signatures.
    """
    def setUp(self):
        signatures = [("GAC", "GAC"), ("AAA", "TTT"), ("CAA", "TTG")]

        # NOTE(review): the 9 is presumably the largest gap allowed between
        # the two halves of a signature -- the last two test sequences
        # differ only in having 9 versus 10 bases between the GACs.
        self.coder = Signature.SignatureCoder(signatures, 9)

        # pairs of (sequence, expected representation); each expected list
        # has one value per signature, in the order given above
        self.test_seqs = [("GACAAAGACTTT", [1.0, 1.0, 0.0]),
                          ("CAAAGACGACTTTAAATTT", [0.5, 1.0, 0.0]),
                          ("AAATTTAAAGACTTTGAC", [1.0 / 3.0, 1.0, 0.0]),
                          ("GACGAC", [1.0, 0.0, 0.0]),
                          ("GACAAAAAAAAAGAC", [1.0, 0.0, 0.0]),
                          ("GACAAAAAAAAAAGAC", [0.0, 0.0, 0.0])]

    def test_representation(self):
        """Convert a sequence into its signature representation.
        """
        for seq_string, expected in self.test_seqs:
            test_seq = Seq(seq_string, IUPAC.unambiguous_dna)
            predicted = self.coder.representation(test_seq)

            assert predicted == expected, \
                   "Non-expected representation %s for %s, wanted %s" \
                   % (predicted, seq_string, expected)

if __name__ == "__main__":
    # Run every test in this module, listing each test as it runs.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

# Generated by  Doxygen 1.6.0   Back to index