Package Bio :: Package PopGen :: Package GenePop :: Module LargeFileParser
[hide private]
[frames | no frames]

Source Code for Module Bio.PopGen.GenePop.LargeFileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  Large file parsing of Genepop files 
  8   
  9  The standard parser loads the whole file into memory. This parser 
 10  provides an iterator over data. 
 11   
 12  Classes: 
 13  Record                Holds GenePop data. 
 14   
 15  Functions: 
 16  read             Parses a GenePop record (file) into a Record object. 
 17   
 18  """ 
 19   
 20  from copy import deepcopy 
 21   
 22   
def get_indiv(line):
    """Parse one individual line of a GenePop file.

    line is expected to look like ``name, genotype genotype ...`` where
    the genotypes are separated by spaces and/or tabs.

    Returns a tuple ``(indiv_name, allele_list, marker_len)``:
    indiv_name is the text before the comma, allele_list holds one tuple
    of ints per locus (two ints for diploid data, one for haploid data)
    and marker_len is the number of digits used per allele (2 or 3).
    """
    indiv_name, genotype_text = line.split(',')
    # Tabs count as separators too; runs of separators leave empty
    # strings behind, which are dropped here.
    tokens = [tok
              for tok in genotype_text.replace('\t', ' ').split(' ')
              if tok]
    # The width of the first genotype decides the allele width: 2 or 4
    # characters means 2 digits per allele, anything else means 3.
    marker_len = 2 if len(tokens[0]) in (2, 4) else 3
    try:
        allele_list = []
        for tok in tokens:
            first = int(tok[:marker_len])
            second = int(tok[marker_len:])
            allele_list.append((first, second))
    except ValueError:
        # Haploid data: there is no second allele, so the conversion of
        # the (empty) remainder failed. Keep only the first allele.
        allele_list = [(int(tok[:marker_len]),) for tok in tokens]
    return indiv_name, allele_list, marker_len
39
def read(handle):
    """Parses a handle containing a GenePop file.

    handle is a file-like object that contains a GenePop record.

    Only the header (comment line, loci names, first POP marker) and the
    first individual line are consumed here. The lines already taken
    from the handle that belong to the data section are pushed onto
    record.stack so that Record.data_generator can replay them before
    continuing with the rest of the handle.

    Returns the Record, with comment_line, loci_list and marker_len set.

    Raises ValueError if no individual line follows the loci list.
    """
    record = Record(handle)
    record.comment_line = str(handle.readline()).rstrip()
    # Loci come either one per line or all on a single line separated by
    # spaces and/or commas. Commas are turned into spaces (not deleted)
    # so that "Loc1,Loc2" is read as two loci, and a whitespace split is
    # used so that repeated separators do not create empty loci names.
    sample_loci_line = str(handle.readline()).rstrip().replace(',', ' ')
    record.loci_list.extend(sample_loci_line.split())
    line = handle.readline()
    while line != "":
        line = line.rstrip()
        if line.upper() == "POP":
            record.stack.append("POP")
            break
        record.loci_list.append(line)
        line = handle.readline()
    # The first individual is only peeked at to learn the allele width;
    # it is pushed back on the stack for the data generator.
    next_line = handle.readline().rstrip()
    if not next_line:
        raise ValueError(
            "GenePop file has no individual data after the loci list")
    record.marker_len = get_indiv(next_line)[2]
    record.stack.append(next_line)
    return record
65 66
class Record(object):
    """Holds information from a GenePop record.

    Members:
    marker_len         The marker length (2 or 3 digit code per allele).

    comment_line       Comment line.

    loci_list          List of loci names.

    data_generator     Iterates over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    data_generator yields either () - an empty tuple - marking a new
    population, or an individual. An individual is something like
    ('Ind1', [(1,1), (3,None), (200,201)])
    In the case above the individual is called Ind1,
    has three diploid loci. For the second loci, one of the alleles
    is unknown.
    """

    def __init__(self, handle):
        """Create an empty record bound to handle.

        handle is kept as-is; data_generator iterates over it after the
        lines stored in stack.
        """
        self.handle = handle
        self.marker_len = 0
        self.comment_line = ""
        self.loci_list = []
        self.populations = []
        # Do not assign an instance attribute named data_generator here:
        # it would shadow the method below and make it uncallable.
        # Lines already consumed from the handle that must be replayed
        # before the rest of the handle is read.
        self.stack = []

    def data_generator(self):
        """Iterate over the population data.

        Yields () for every POP line and (indiv_name, clean_list) for
        every individual line, where clean_list holds one tuple per
        locus and an allele coded as 0 is reported as None. Blank lines
        are skipped. Lines stored in stack are produced first, then the
        remaining lines of handle.
        """
        for source in (self.stack, self.handle):
            for line in source:
                line = line.rstrip()
                if not line:
                    # Blank lines carry no data.
                    continue
                if line.upper() == 'POP':
                    yield ()
                    continue
                indiv_name, allele_list, _marker_len = get_indiv(line)
                clean_list = [
                    tuple(None if al == 0 else al for al in locus)
                    for locus in allele_list
                ]
                yield indiv_name, clean_list
        # Falling off the end finishes the generator; an explicit
        # "raise StopIteration" would become RuntimeError (PEP 479).
119