Package Bio :: Package NeuralNetwork :: Package Gene :: Module Schema
[hide private]
[frames] | no frames]

Source Code for Module Bio.NeuralNetwork.Gene.Schema

  1  """Deal with Motifs or Signatures allowing ambiguity in the sequences. 
  2   
  3  This class contains Schema which deal with Motifs and Signatures at 
  4  a higher level, by introducing `don't care` (ambiguity) symbols into 
  5  the sequences. For instance, you could combine the following Motifs: 
  6   
  7  'GATC', 'GATG', 'GATG', 'GATT' 
  8   
  9  as all falling under a schema like 'GAT*', where the star indicates a 
 10  character can be anything. This helps us condense a whole ton of 
 11  motifs or signatures. 
 12  """ 
 13  # standard modules 
 14  import random 
 15  import re 
 16   
 17  # biopython 
 18  from Bio import Alphabet 
 19  from Bio.Seq import MutableSeq 
 20   
 21  # neural network libraries 
 22  from Pattern import PatternRepository 
 23   
 24  # genetic algorithm libraries 
 25  from Bio.GA import Organism 
 26  from Bio.GA.Evolver import GenerationEvolver 
 27  from Bio.GA.Mutation.Simple import SinglePositionMutation 
 28  from Bio.GA.Crossover.Point import SinglePointCrossover 
 29  from Bio.GA.Repair.Stabilizing import AmbiguousRepair 
 30  from Bio.GA.Selection.Tournament import TournamentSelection 
 31  from Bio.GA.Selection.Diversity import DiversitySelection 
 32   
33 -class Schema(object):
34 """Deal with motifs that have ambiguity characters in it. 35 36 This motif class allows specific ambiguity characters and tries to 37 speed up finding motifs using regular expressions. 38 39 This is likely to be a replacement for the Schema representation, 40 since it allows multiple ambiguity characters to be used. 41 """
42 - def __init__(self, ambiguity_info):
43 """Initialize with ambiguity information. 44 45 Arguments: 46 47 o ambiguity_info - A dictionary which maps letters in the motifs to 48 the ambiguous characters which they might represent. For example, 49 {'R' : 'AG'} specifies that Rs in the motif can match a A or a G. 50 All letters in the motif must be represented in the ambiguity_info 51 dictionary. 52 """ 53 self._ambiguity_info = ambiguity_info 54 55 # a cache of all encoded motifs 56 self._motif_cache = {}
57
58 - def encode_motif(self, motif):
59 """Encode the passed motif as a regular expression pattern object. 60 61 Arguments: 62 63 o motif - The motif we want to encode. This should be a string. 64 65 Returns: 66 A compiled regular expression pattern object that can be used 67 for searching strings. 68 """ 69 regexp_string = "" 70 71 for motif_letter in motif: 72 try: 73 letter_matches = self._ambiguity_info[motif_letter] 74 except KeyError: 75 raise KeyError("No match information for letter %s" 76 % motif_letter) 77 78 if len(letter_matches) > 1: 79 regexp_match = "[" + letter_matches + "]" 80 elif len(letter_matches) == 1: 81 regexp_match = letter_matches 82 else: 83 raise ValueError("Unexpected match information %s" 84 % letter_matches) 85 86 regexp_string += regexp_match 87 88 return re.compile(regexp_string)
89
90 - def find_ambiguous(self, motif):
91 """Return the location of ambiguous items in the motif. 92 93 This just checks through the motif and compares each letter 94 against the ambiguity information. If a letter stands for multiple 95 items, it is ambiguous. 96 """ 97 ambig_positions = [] 98 for motif_letter_pos in range(len(motif)): 99 motif_letter = motif[motif_letter_pos] 100 try: 101 letter_matches = self._ambiguity_info[motif_letter] 102 except KeyError: 103 raise KeyError("No match information for letter %s" 104 % motif_letter) 105 106 if len(letter_matches) > 1: 107 ambig_positions.append(motif_letter_pos) 108 109 return ambig_positions
110
111 - def num_ambiguous(self, motif):
112 """Return the number of ambiguous letters in a given motif. 113 """ 114 ambig_positions = self.find_ambiguous(motif) 115 return len(ambig_positions)
116
117 - def find_matches(self, motif, query):
118 """Return all non-overlapping motif matches in the query string. 119 120 This utilizes the regular expression findall function, and will 121 return a list of all non-overlapping occurances in query that 122 match the ambiguous motif. 123 """ 124 try: 125 motif_pattern = self._motif_cache[motif] 126 except KeyError: 127 motif_pattern = self.encode_motif(motif) 128 self._motif_cache[motif] = motif_pattern 129 130 return motif_pattern.findall(query)
131
132 - def num_matches(self, motif, query):
133 """Find the number of non-overlapping times motif occurs in query. 134 """ 135 all_matches = self.find_matches(motif, query) 136 return len(all_matches)
137
138 - def all_unambiguous(self):
139 """Return a listing of all unambiguous letters allowed in motifs. 140 """ 141 all_letters = sorted(self._ambiguity_info) 142 unambig_letters = [] 143 144 for letter in all_letters: 145 possible_matches = self._ambiguity_info[letter] 146 if len(possible_matches) == 1: 147 unambig_letters.append(letter) 148 149 return unambig_letters
150 151 # --- helper classes and functions for the default SchemaFinder 152 153 # -- Alphabets 154
155 -class SchemaDNAAlphabet(Alphabet.Alphabet):
156 """Alphabet of a simple Schema for DNA sequences. 157 158 This defines a simple alphabet for DNA sequences that has a single 159 character which can match any other character. 160 161 o G,A,T,C - The standard unambiguous DNA alphabet. 162 163 o * - Any letter 164 """ 165 letters = ["G", "A", "T", "C", "*"] 166 167 alphabet_matches = {"G" : "G", 168 "A" : "A", 169 "T" : "T", 170 "C" : "C", 171 "*" : "GATC"}
172 173 # -- GA schema finder 174
175 -class GeneticAlgorithmFinder(object):
176 """Find schemas using a genetic algorithm approach. 177 178 This approach to finding schema uses Genetic Algorithms to evolve 179 a set of schema and find the best schema for a specific set of 180 records. 181 182 The 'default' finder searches for ambiguous DNA elements. This 183 can be overridden easily by creating a GeneticAlgorithmFinder 184 with a different alphabet. 185 """
186 - def __init__(self, alphabet = SchemaDNAAlphabet()):
187 """Initialize a finder to get schemas using Genetic Algorithms. 188 189 Arguments: 190 191 o alphabet -- The alphabet which specifies the contents of the 192 schemas we'll be generating. This alphabet must contain the 193 attribute 'alphabet_matches', which is a dictionary specifying 194 the potential ambiguities of each letter in the alphabet. These 195 ambiguities will be used in building up the schema. 196 """ 197 self.alphabet = alphabet 198 199 self.initial_population = 500 200 self.min_generations = 10 201 202 self._set_up_genetic_algorithm()
203
205 """Overrideable function to set up the genetic algorithm parameters. 206 207 This functions sole job is to set up the different genetic 208 algorithm functionality. Since this can be quite complicated, this 209 allows cusotmizablity of all of the parameters. If you want to 210 customize specially, you can inherit from this class and override 211 this function. 212 """ 213 self.motif_generator = RandomMotifGenerator(self.alphabet) 214 215 self.mutator = SinglePositionMutation(mutation_rate = 0.1) 216 self.crossover = SinglePointCrossover(crossover_prob = 0.25) 217 self.repair = AmbiguousRepair(Schema(self.alphabet.alphabet_matches), 218 4) 219 self.base_selector = TournamentSelection(self.mutator, self.crossover, 220 self.repair, 2) 221 self.selector = DiversitySelection(self.base_selector, 222 self.motif_generator.random_motif)
223
224 - def find_schemas(self, fitness, num_schemas):
225 """Find the given number of unique schemas using a genetic algorithm 226 227 Arguments: 228 229 o fitness - A callable object (ie. function) which will evaluate 230 the fitness of a motif. 231 232 o num_schemas - The number of unique schemas with good fitness 233 that we want to generate. 234 """ 235 start_population = \ 236 Organism.function_population(self.motif_generator.random_motif, 237 self.initial_population, 238 fitness) 239 finisher = SimpleFinisher(num_schemas, self.min_generations) 240 241 # set up the evolver and do the evolution 242 evolver = GenerationEvolver(start_population, self.selector) 243 evolved_pop = evolver.evolve(finisher.is_finished) 244 245 # convert the evolved population into a PatternRepository 246 schema_info = {} 247 for org in evolved_pop: 248 # convert the Genome from a MutableSeq to a Seq so that 249 # the schemas are just strings (and not array("c")s) 250 seq_genome = org.genome.toseq() 251 schema_info[seq_genome.tostring()] = org.fitness 252 253 return PatternRepository(schema_info)
254 255 # -- fitness classes 256
257 -class DifferentialSchemaFitness(object):
258 """Calculate fitness for schemas that differentiate between sequences. 259 """
260 - def __init__(self, positive_seqs, negative_seqs, schema_evaluator):
261 """Initialize with different sequences to evaluate 262 263 Arguments: 264 265 o positive_seq - A list of SeqRecord objects which are the 'positive' 266 sequences -- the ones we want to select for. 267 268 o negative_seq - A list of SeqRecord objects which are the 'negative' 269 sequences that we want to avoid selecting. 270 271 o schema_evaluator - An Schema class which can be used to 272 evaluate find motif matches in sequences. 273 """ 274 self._pos_seqs = positive_seqs 275 self._neg_seqs = negative_seqs 276 self._schema_eval = schema_evaluator
277
278 - def calculate_fitness(self, genome):
279 """Calculate the fitness for a given schema. 280 281 Fitness is specified by the number of occurances of the schema in 282 the positive sequences minus the number of occurances in the 283 negative examples. 284 285 This fitness is then modified by multiplying by the length of the 286 schema and then dividing by the number of ambiguous characters in 287 the schema. This helps select for schema which are longer and have 288 less redundancy. 289 """ 290 # convert the genome into a string 291 seq_motif = genome.toseq() 292 motif = seq_motif.tostring() 293 294 # get the counts in the positive examples 295 num_pos = 0 296 for seq_record in self._pos_seqs: 297 cur_counts = self._schema_eval.num_matches(motif, 298 seq_record.seq.tostring()) 299 num_pos += cur_counts 300 301 # get the counts in the negative examples 302 num_neg = 0 303 for seq_record in self._neg_seqs: 304 cur_counts = self._schema_eval.num_matches(motif, 305 seq_record.seq.tostring()) 306 307 num_neg += cur_counts 308 309 num_ambiguous = self._schema_eval.num_ambiguous(motif) 310 # weight the ambiguous stuff more highly 311 num_ambiguous = pow(2.0, num_ambiguous) 312 # increment num ambiguous to prevent division by zero errors. 313 num_ambiguous += 1 314 315 motif_size = len(motif) 316 motif_size = motif_size * 4.0 317 318 discerning_power = num_pos - num_neg 319 320 diff = (discerning_power * motif_size) / float(num_ambiguous) 321 return diff
322
323 -class MostCountSchemaFitness(object):
324 """Calculate a fitness giving weight to schemas that match many times. 325 326 This fitness function tries to maximize schemas which are found many 327 times in a group of sequences. 328 """
329 - def __init__(self, seq_records, schema_evaluator):
330 """Initialize with sequences to evaluate. 331 332 Arguments: 333 334 o seq_records -- A set of SeqRecord objects which we use to 335 calculate the fitness. 336 337 o schema_evaluator - An Schema class which can be used to 338 evaluate find motif matches in sequences. 339 """ 340 self._records = seq_records 341 self._evaluator = schema_evaluator
342
343 - def calculate_fitness(self, genome):
344 """Calculate the fitness of a genome based on schema matches. 345 346 This bases the fitness of a genome completely on the number of times 347 it matches in the set of seq_records. Matching more times gives a 348 better fitness 349 """ 350 # convert the genome into a string 351 seq_motif = genome.toseq() 352 motif = seq_motif.tostring() 353 354 # find the number of times the genome matches 355 num_times = 0 356 for seq_record in self._records: 357 cur_counts = self._evaluator.num_matches(motif, 358 seq_record.seq.tostring()) 359 num_times += cur_counts 360 361 return num_times
362 363 # -- Helper classes
364 -class RandomMotifGenerator(object):
365 """Generate a random motif within given parameters. 366 """
367 - def __init__(self, alphabet, min_size = 12, max_size = 17):
368 """Initialize with the motif parameters. 369 370 Arguments: 371 372 o alphabet - An alphabet specifying what letters can be inserted in 373 a motif. 374 375 o min_size, max_size - Specify the range of sizes for motifs. 376 """ 377 self._alphabet = alphabet 378 self._min_size = min_size 379 self._max_size = max_size
380
381 - def random_motif(self):
382 """Create a random motif within the given parameters. 383 384 This returns a single motif string with letters from the given 385 alphabet. The size of the motif will be randomly chosen between 386 max_size and min_size. 387 """ 388 motif_size = random.randrange(self._min_size, self._max_size) 389 390 motif = "" 391 for letter_num in range(motif_size): 392 cur_letter = random.choice(self._alphabet.letters) 393 motif += cur_letter 394 395 return MutableSeq(motif, self._alphabet)
396
397 -class SimpleFinisher(object):
398 """Determine when we are done evolving motifs. 399 400 This takes the very simple approach of halting evolution when the 401 GA has proceeded for a specified number of generations and has 402 a given number of unique schema with positive fitness. 403 """
404 - def __init__(self, num_schemas, min_generations = 100):
405 """Initialize the finisher with its parameters. 406 407 Arguments: 408 409 o num_schemas -- the number of useful (positive fitness) schemas 410 we want to generation 411 412 o min_generations -- The minimum number of generations to allow 413 the GA to proceed. 414 """ 415 self.num_generations = 0 416 417 self.num_schemas = num_schemas 418 self.min_generations = min_generations
419
420 - def is_finished(self, organisms):
421 """Determine when we can stop evolving the population. 422 """ 423 self.num_generations += 1 424 # print "generation %s" % self.num_generations 425 426 if self.num_generations >= self.min_generations: 427 all_seqs = [] 428 for org in organisms: 429 if org.fitness > 0: 430 if org.genome not in all_seqs: 431 all_seqs.append(org.genome) 432 433 if len(all_seqs) >= self.num_schemas: 434 return 1 435 436 return 0
437 # --- 438
439 -class SchemaFinder(object):
440 """Find schema in a set of sequences using a genetic algorithm approach. 441 442 Finding good schemas is very difficult because it takes forever to 443 enumerate all of the potential schemas. This finder using a genetic 444 algorithm approach to evolve good schema which match many times in 445 a set of sequences. 446 447 The default implementation of the finder is ready to find schemas 448 in a set of DNA sequences, but the finder can be customized to deal 449 with any type of data. 450 """
451 - def __init__(self, num_schemas = 100, 452 schema_finder = GeneticAlgorithmFinder()):
453 self.num_schemas = num_schemas 454 self._finder = schema_finder 455 456 self.evaluator = Schema(self._finder.alphabet.alphabet_matches)
457
458 - def find(self, seq_records):
459 """Find well-represented schemas in the given set of SeqRecords. 460 """ 461 fitness_evaluator = MostCountSchemaFitness(seq_records, 462 self.evaluator) 463 464 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 465 self.num_schemas)
466
467 - def find_differences(self, first_records, second_records):
468 """Find schemas which differentiate between the two sets of SeqRecords. 469 """ 470 fitness_evaluator = DifferentialSchemaFitness(first_records, 471 second_records, 472 self.evaluator) 473 474 return self._finder.find_schemas(fitness_evaluator.calculate_fitness, 475 self.num_schemas)
476
477 -class SchemaCoder(object):
478 """Convert a sequence into a representation of ambiguous motifs (schemas). 479 480 This takes a sequence, and returns the number of times specified 481 motifs are found in the sequence. This lets you represent a sequence 482 as just a count of (possibly ambiguous) motifs. 483 """
484 - def __init__(self, schemas, ambiguous_converter):
485 """Initialize the coder to convert sequences 486 487 Arguments: 488 489 o schema - A list of all of the schemas we want to search for 490 in input sequences. 491 492 o ambiguous_converter - An Schema class which can be 493 used to convert motifs into regular expressions for searching. 494 """ 495 self._schemas = schemas 496 self._converter = ambiguous_converter
497
498 - def representation(self, sequence):
499 """Represent the given input sequence as a bunch of motif counts. 500 501 Arguments: 502 503 o sequence - A Bio.Seq object we are going to represent as schemas. 504 505 This takes the sequence, searches for the motifs within it, and then 506 returns counts specifying the relative number of times each motifs 507 was found. The frequencies are in the order the original motifs were 508 passed into the initializer. 509 """ 510 schema_counts = [] 511 512 for schema in self._schemas: 513 num_counts = self._converter.num_matches(schema, sequence.tostring()) 514 schema_counts.append(num_counts) 515 516 # normalize the counts to go between zero and one 517 min_count = 0 518 max_count = max(schema_counts) 519 520 # only normalize if we've actually found something, otherwise 521 # we'll just return 0 for everything 522 if max_count > 0: 523 for count_num in range(len(schema_counts)): 524 schema_counts[count_num] = (float(schema_counts[count_num]) - 525 float(min_count)) / float(max_count) 526 527 return schema_counts
528
529 -def matches_schema(pattern, schema, ambiguity_character = '*'):
530 """Determine whether or not the given pattern matches the schema. 531 532 Arguments: 533 534 o pattern - A string representing the pattern we want to check for 535 matching. This pattern can contain ambiguity characters (which are 536 assumed to be the same as those in the schema). 537 538 o schema - A string schema with ambiguity characters. 539 540 o ambiguity_character - The character used for ambiguity in the schema. 541 """ 542 if len(pattern) != len(schema): 543 return 0 544 545 # check each position, and return a non match if the schema and pattern 546 # are non ambiguous and don't match 547 for pos in range(len(pattern)): 548 if (schema[pos] != ambiguity_character and 549 pattern[pos] != ambiguity_character and 550 pattern[pos] != schema[pos]): 551 552 return 0 553 554 return 1
555
556 -class SchemaFactory(object):
557 """Generate Schema from inputs of Motifs or Signatures. 558 """
559 - def __init__(self, ambiguity_symbol = '*'):
560 """Initialize the SchemaFactory 561 562 Arguments: 563 564 o ambiguity_symbol -- The symbol to use when specifying that 565 a position is arbitrary. 566 """ 567 self._ambiguity_symbol = ambiguity_symbol
568
569 - def from_motifs(self, motif_repository, motif_percent, num_ambiguous):
570 """Generate schema from a list of motifs. 571 572 Arguments: 573 574 o motif_repository - A MotifRepository class that has all of the 575 motifs we want to convert to Schema. 576 577 o motif_percent - The percentage of motifs in the motif bank which 578 should be matches. We'll try to create schema that match this 579 percentage of motifs. 580 581 o num_ambiguous - The number of ambiguous characters to include 582 in each schema. The positions of these ambiguous characters will 583 be randomly selected. 584 """ 585 # get all of the motifs we can deal with 586 all_motifs = motif_repository.get_top_percentage(motif_percent) 587 588 # start building up schemas 589 schema_info = {} 590 # continue until we've built schema matching the desired percentage 591 # of motifs 592 total_count = self._get_num_motifs(motif_repository, all_motifs) 593 matched_count = 0 594 assert total_count > 0, "Expected to have motifs to match" 595 while (float(matched_count) / float(total_count)) < motif_percent: 596 597 new_schema, matching_motifs = \ 598 self._get_unique_schema(schema_info.keys(), 599 all_motifs, num_ambiguous) 600 601 # get the number of counts for the new schema and clean up 602 # the motif list 603 schema_counts = 0 604 for motif in matching_motifs: 605 # get the counts for the motif 606 schema_counts += motif_repository.count(motif) 607 608 # remove the motif from the motif list since it is already 609 # represented by this schema 610 all_motifs.remove(motif) 611 612 613 # all the schema info 614 schema_info[new_schema] = schema_counts 615 616 matched_count += schema_counts 617 618 # print "percentage:", float(matched_count) / float(total_count) 619 620 return PatternRepository(schema_info)
621
622 - def _get_num_motifs(self, repository, motif_list):
623 """Return the number of motif counts for the list of motifs. 624 """ 625 motif_count = 0 626 for motif in motif_list: 627 motif_count += repository.count(motif) 628 629 return motif_count
630
631 - def _get_unique_schema(self, cur_schemas, motif_list, num_ambiguous):
632 """Retrieve a unique schema from a motif. 633 634 We don't want to end up with schema that match the same thing, 635 since this could lead to ambiguous results, and be messy. This 636 tries to create schema, and checks that they do not match any 637 currently existing schema. 638 """ 639 # create a schema starting with a random motif 640 # we'll keep doing this until we get a completely new schema that 641 # doesn't match any old schema 642 num_tries = 0 643 644 while 1: 645 # pick a motif to work from and make a schema from it 646 cur_motif = random.choice(motif_list) 647 648 num_tries += 1 649 650 new_schema, matching_motifs = \ 651 self._schema_from_motif(cur_motif, motif_list, 652 num_ambiguous) 653 654 has_match = 0 655 for old_schema in cur_schemas: 656 if matches_schema(new_schema, old_schema, 657 self._ambiguity_symbol): 658 has_match = 1 659 660 # if the schema doesn't match any other schema we've got 661 # a good one 662 if not(has_match): 663 break 664 665 # check for big loops in which we can't find a new schema 666 assert num_tries < 150, \ 667 "Could not generate schema in %s tries from %s with %s" \ 668 % (num_tries, motif_list, cur_schemas) 669 670 return new_schema, matching_motifs
671
672 - def _schema_from_motif(self, motif, motif_list, num_ambiguous):
673 """Create a schema from a given starting motif. 674 675 Arguments: 676 677 o motif - A motif with the pattern we will start from. 678 679 o motif_list - The total motifs we have.to match to. 680 681 o num_ambiguous - The number of ambiguous characters that should 682 be present in the schema. 683 684 Returns: 685 686 o A string representing the newly generated schema. 687 688 o A list of all of the motifs in motif_list that match the schema. 689 """ 690 assert motif in motif_list, \ 691 "Expected starting motif present in remaining motifs." 692 693 # convert random positions in the motif to ambiguous characters 694 # convert the motif into a list of characters so we can manipulate it 695 new_schema_list = list(motif) 696 for add_ambiguous in range(num_ambiguous): 697 # add an ambiguous position in a new place in the motif 698 while 1: 699 ambig_pos = random.choice(range(len(new_schema_list))) 700 701 # only add a position if it isn't already ambiguous 702 # otherwise, we'll try again 703 if new_schema_list[ambig_pos] != self._ambiguity_symbol: 704 new_schema_list[ambig_pos] = self._ambiguity_symbol 705 break 706 707 # convert the schema back to a string 708 new_schema = ''.join(new_schema_list) 709 710 # get the motifs that the schema matches 711 matched_motifs = [] 712 for motif in motif_list: 713 if matches_schema(motif, new_schema, self._ambiguity_symbol): 714 matched_motifs.append(motif) 715 716 return new_schema, matched_motifs
717
718 - def from_signatures(self, signature_repository, num_ambiguous):
719 raise NotImplementedError("Still need to code this.")
720