# Source code listing: python-biopython package.


# Copyright 2010 by Tiago Antao.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Large file parsing of GenePop files.

The standard parser loads the whole file into memory. This parser
provides an iterator over data.

Classes:
Record           Holds GenePop data.

Functions:
read             Parses a GenePop record (file) into a Record object.
"""


from copy import deepcopy

def get_indiv(line):
    """Parse one individual's data line from a GenePop file.

    line is a single line of the form "name, genotype genotype ...",
    where the genotypes are separated by spaces and/or tabs.

    Returns a tuple (indiv_name, allele_list, marker_len):

    indiv_name    The text before the comma, unmodified.
    allele_list   One tuple of ints per locus: two alleles per tuple
                  for diploid data, a single allele per tuple for
                  haploid data.
    marker_len    Number of digits used per allele (2 or 3).

    Raises ValueError if the line does not contain exactly one comma or
    if a genotype is not numeric, and IndexError if no genotype follows
    the comma.
    """
    indiv_name, marker_line = line.split(',')
    # split() with no argument treats any run of whitespace (spaces, tabs)
    # as one separator and never produces empty strings.
    markers = marker_line.split()
    # The width of the first genotype decides the allele width: 2 or 4
    # characters means 2 digits per allele (haploid or diploid), anything
    # else is treated as 3 digits per allele.
    if len(markers[0]) in (2, 4):  # 2 digits per allele
        marker_len = 2
    else:  # 3 digits per allele
        marker_len = 3
    try:
        # Diploid: the first marker_len digits are one allele, the
        # remainder is the other.
        allele_list = [(int(marker[:marker_len]), int(marker[marker_len:]))
                       for marker in markers]
    except ValueError:  # Haploid
        # int('') raises ValueError when there is no second allele, so the
        # whole line is re-read as one allele per locus.
        allele_list = [(int(marker[:marker_len]),)
                       for marker in markers]
    return indiv_name, allele_list, marker_len

def read(handle):
    """Parses a handle containing a GenePop file.

    handle is a file-like object that contains a GenePop record.

    Only the header (comment line and loci names) and the first
    individual of the first population are read here. The "POP" marker
    and that first individual line are stored in record.stack, which
    Record.data_generator iterates before the handle, so they are not
    lost to later iteration.

    Returns a Record with comment_line, loci_list and marker_len set.
    """
    record = Record(handle)
    record.comment_line = str(handle.readline()).rstrip()
    #We can now have one loci per line or all loci in a single line
    #separated by either space or comma+space...
    #We will remove all commas on loci... that should not be a problem
    sample_loci_line = str(handle.readline()).rstrip().replace(',', '')
    all_loci = sample_loci_line.split(' ')
    record.loci_list.extend(all_loci)
    line = handle.readline()
    while line != "":
        line = line.rstrip()
        if line.upper() == "POP":
            # End of the loci section: keep the marker for the generator.
            record.stack.append("POP")
            break
        # Still in the header: one locus name per line.
        record.loci_list.append(line)
        line = handle.readline()
    # Peek at the first individual to learn the allele width, then queue
    # the line so the generator still yields it.
    next_line = handle.readline().rstrip()
    _name, _alleles, record.marker_len = get_indiv(next_line)
    record.stack.append(next_line)
    return record

class Record:
    """Holds information from a GenePop record.

    Members:
    marker_len         The marker length (2 or 3 digit code per allele).

    comment_line       Comment line.

    loci_list          List of loci names.

    data_generator     Iterates over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    Each item produced by data_generator is either () - an empty tuple -
    marking a new population, or an individual. An individual is
    something like
    ('Ind1', [(1,1), (3,None), (200,201)])
    In the case above the individual is called Ind1,
    has three diploid loci. For the second loci, one of the alleles
    is unknown.
    """

    def __init__(self, handle):
        """Store the handle and initialise the header fields as empty."""
        self.handle          = handle
        self.marker_len      = 0
        self.comment_line    = ""
        self.loci_list       = []
        self.populations     = []
        # Lines already consumed from the handle that data_generator must
        # still process; they are iterated before the handle itself.
        # NOTE: no data_generator attribute is set here - an instance
        # attribute of that name would shadow the method below.
        self.stack           = []

    def data_generator(self):
        """Yield population markers and individuals.

        Lines come first from self.stack, then from self.handle. A "POP"
        line (any case) yields (); any other non-blank line is parsed
        with get_indiv and yields (indiv_name, clean_list), where
        clean_list holds one tuple per locus and an allele coded as 0
        is replaced by None.
        """
        for source in (self.stack, self.handle):
            for line in source:
                line = line.rstrip()
                if not line:
                    # Blank lines carry no data.
                    continue
                if line.upper() == 'POP':
                    yield ()
                else:
                    indiv_name, allele_list, _marker_len = get_indiv(line)
                    clean_list = [
                        tuple(None if al == 0 else al for al in locus)
                        for locus in allele_list
                    ]
                    yield indiv_name, clean_list
        # The generator ends by returning; raising StopIteration inside a
        # generator is converted to RuntimeError on current Python.

# Listing generated by Doxygen 1.6.0.