Logo Search packages:      
Sourcecode: python-biopython version File versions

def Bio::Align::AlignInfo::SummaryInfo::information_content (   self,
  start = 0,
  end = None,
  e_freq_table = None,
  log_base = 2,
  chars_to_ignore = [] 
)

Calculate the information content for each residue along an alignment.

Arguments:
o start, end - The starting and ending points to calculate the
information content. These points should be relative to the first
sequence in the alignment, starting at zero (ie. even if the 'real'
first position in the seq is 203 in the initial sequence, for
the info content, we need to use zero). This defaults to the entire
length of the first sequence.
o e_freq_table - A FreqTable object  specifying the expected frequencies
for each letter in the alphabet we are using (ie. {'G' : 0.4,
'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
included, since these should not have expected frequencies.
o log_base - The base of the logarithm to use in calculating the
information content. This defaults to 2 so the info is in bits.
o chars_to_ignore - A listing of characters which should be ignored
in calculating the info content.

Returns:
o A number representing the info content for the specified region.

Please see the Biopython manual for more information on how information
content is calculated.

Definition at line 406 of file AlignInfo.py.

                                                 :
        """Calculate the information content for each residue along an alignment.

        Arguments:
        o start, end - The starting an ending points to calculate the
        information content. These points should be relative to the first
        sequence in the alignment, starting at zero (ie. even if the 'real'
        first position in the seq is 203 in the initial sequence, for
        the info content, we need to use zero). This defaults to the entire
        length of the first sequence.
        o e_freq_table - A FreqTable object  specifying the expected frequencies
        for each letter in the alphabet we are using (ie. {'G' : 0.4,
        'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be
        included, since these should not have expected frequencies.
        o log_base - The base of the logathrim to use in calculating the
        information content. This defaults to 2 so the info is in bits.
        o chars_to_ignore - A listing of characterw which should be ignored
        in calculating the info content.

        Returns:
        o A number representing the info content for the specified region.

        Please see the Biopython manual for more information on how information
        content is calculated.
        """
        # if no end was specified, then we default to the end of the sequence
        if end is None:
            end = len(self.alignment._records[0].seq)
        
        if start < 0 or end > len(self.alignment._records[0].seq):
            raise ValueError \
                  ("Start (%s) and end (%s) are not in the range %s to %s"
                   % (start, end, 0, len(self.alignment._records[0].seq)))
        # determine random expected frequencies, if necessary
        if not e_freq_table:
            if isinstance(self.alignment._alphabet.alphabet,
                Alphabet.ProteinAlphabet):
                random_expected = Protein20Random
            elif isinstance(self.alignment._alphabet.alphabet,
                Alphabet.NucleotideAlphabet):
                random_expected = Nucleotide4Random
            else:
                errstr = "Error in alphabet: not Nucleotide or Protein, "
                errstr += "supply expected frequencies"
                raise ValueError, errstr
        else:
            random_expected = None
        # determine all of the letters we have to deal with
        all_letters = self.alignment._alphabet.letters
        for char in chars_to_ignore:
            all_letters = string.replace(all_letters, char, '')

        info_content = {}
        for residue_num in range(start, end):
            freq_dict = self._get_letter_freqs(residue_num,
                                               self.alignment._records,
                                               all_letters, chars_to_ignore)
            # print freq_dict,
            column_score = self._get_column_info_content(freq_dict,
                                                         e_freq_table,
                                                         log_base,
                                                         random_expected)

            info_content[residue_num] = column_score
        # sum up the score
        total_info = 0
        for column_info in info_content.values():
            total_info = total_info + column_info
        # fill in the ic_vector member: holds IC for each column
        for i in info_content.keys():
            self.ic_vector[i] = info_content[i]
        return total_info

    def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):


Generated by  Doxygen 1.6.0   Back to index