Package translate :: Package misc :: Module diff_match_patch
[hide private]
[frames] | [no frames]

Source Code for Module translate.misc.diff_match_patch

   1  #!/usr/bin/python2.4 
   2   
   3  """Diff Match and Patch 
   4   
   5  Copyright 2006 Google Inc. 
   6  http://code.google.com/p/google-diff-match-patch/ 
   7   
   8  Licensed under the Apache License, Version 2.0 (the "License"); 
   9  you may not use this file except in compliance with the License. 
  10  You may obtain a copy of the License at 
  11   
  12    http://www.apache.org/licenses/LICENSE-2.0 
  13   
  14  Unless required by applicable law or agreed to in writing, software 
  15  distributed under the License is distributed on an "AS IS" BASIS, 
  16  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
  17  See the License for the specific language governing permissions and 
  18  limitations under the License. 
  19  """ 
  20   
  21  """Functions for diff, match and patch. 
  22   
  23  Computes the difference between two texts to create a patch. 
  24  Applies the patch onto another text, allowing for errors. 
  25  """ 
  26   
  27  __author__ = 'fraser@google.com (Neil Fraser)' 
  28   
  29  import time 
  30  import re 
  31   
32 -class diff_match_patch:
33 """Class containing the diff, match and patch methods. 34 35 Also contains the behaviour settings. 36 """ 37
  def __init__(self):
    """Inits a diff_match_patch object with default settings.
    Redefine these in your program to override the defaults.
    """

    # Number of seconds to map a diff before giving up (0 for infinity).
    self.Diff_Timeout = 1.0
    # Cost of an empty edit operation in terms of edit characters.
    self.Diff_EditCost = 4
    # The size beyond which the double-ended diff activates.
    # Double-ending is twice as fast, but less accurate.
    self.Diff_DualThreshold = 32
    # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
    self.Match_Threshold = 0.5
    # How far to search for a match (0 = exact location, 1000+ = broad match).
    # A match this many characters away from the expected location will add
    # 1.0 to the score (0.0 is a perfect match).
    self.Match_Distance = 1000
    # When deleting a large block of text (over ~64 characters), how close does
    # the contents have to match the expected contents. (0.0 = perfection,
    # 1.0 = very loose).  Note that Match_Threshold controls how closely the
    # end points of a delete need to match.
    self.Patch_DeleteThreshold = 0.5
    # Chunk size for context length (characters kept around each patch hunk).
    self.Patch_Margin = 4

    # How many bits in a number?
    # Python has no maximum, thus to disable patch splitting set to 0.
    # However to avoid long patches in certain pathological cases, use 32.
    # Multiple short patches (using native ints) are much faster than long ones.
    self.Match_MaxBits = 32
  # DIFF FUNCTIONS

  # The data structure representing a diff is an array of tuples:
  # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
  # which means: delete "Hello", add "Goodbye" and keep " world."
  # Operation code: text present only in text1 (removed).
  DIFF_DELETE = -1
  # Operation code: text present only in text2 (added).
  DIFF_INSERT = 1
  # Operation code: text common to both texts.
  DIFF_EQUAL = 0
  def diff_main(self, text1, text2, checklines=True):
    """Find the differences between two texts.  Simplifies the problem by
    stripping any common prefix or suffix off the texts before diffing.

    Args:
      text1: Old string to be diffed.
      text2: New string to be diffed.
      checklines: Optional speedup flag.  If present and false, then don't run
        a line-level diff first to identify the changed areas.
        Defaults to true, which does a faster, slightly less optimal diff.

    Returns:
      Array of changes.

    Raises:
      ValueError: If either input is None.
    """

    # Check for null inputs.
    if text1 == None or text2 == None:
      raise ValueError("Null inputs. (diff_main)")

    # Check for equality (speedup).
    if text1 == text2:
      return [(self.DIFF_EQUAL, text1)]

    # Trim off common prefix (speedup): only the middle block is diffed.
    commonlength = self.diff_commonPrefix(text1, text2)
    commonprefix = text1[:commonlength]
    text1 = text1[commonlength:]
    text2 = text2[commonlength:]

    # Trim off common suffix (speedup).
    # Note: text1[:-0] would be '' so the zero case must be special-cased.
    commonlength = self.diff_commonSuffix(text1, text2)
    if commonlength == 0:
      commonsuffix = ''
    else:
      commonsuffix = text1[-commonlength:]
      text1 = text1[:-commonlength]
      text2 = text2[:-commonlength]

    # Compute the diff on the middle block.
    diffs = self.diff_compute(text1, text2, checklines)

    # Restore the prefix and suffix as equalities.
    if commonprefix:
      diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
    if commonsuffix:
      diffs.append((self.DIFF_EQUAL, commonsuffix))
    # Merge adjacent same-op entries before returning.
    self.diff_cleanupMerge(diffs)
    return diffs
127
  def diff_compute(self, text1, text2, checklines):
    """Find the differences between two texts.  Assumes that the texts do not
    have any common prefix or suffix.

    Args:
      text1: Old string to be diffed.
      text2: New string to be diffed.
      checklines: Speedup flag.  If false, then don't run a line-level diff
        first to identify the changed areas.
        If true, then run a faster, slightly less optimal diff.

    Returns:
      Array of changes.
    """
    if not text1:
      # Just add some text (speedup).
      return [(self.DIFF_INSERT, text2)]

    if not text2:
      # Just delete some text (speedup).
      return [(self.DIFF_DELETE, text1)]

    if len(text1) > len(text2):
      (longtext, shorttext) = (text1, text2)
    else:
      (shorttext, longtext) = (text1, text2)
    i = longtext.find(shorttext)
    if i != -1:
      # Shorter text is inside the longer text (speedup).
      diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
               (self.DIFF_INSERT, longtext[i + len(shorttext):])]
      # Swap insertions for deletions if diff is reversed.
      if len(text1) > len(text2):
        diffs[0] = (self.DIFF_DELETE, diffs[0][1])
        diffs[2] = (self.DIFF_DELETE, diffs[2][1])
      return diffs
    longtext = shorttext = None  # Garbage collect.

    # Check to see if the problem can be split in two.
    hm = self.diff_halfMatch(text1, text2)
    if hm:
      # A half-match was found, sort out the return data.
      (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
      # Send both pairs off for separate processing.
      diffs_a = self.diff_main(text1_a, text2_a, checklines)
      diffs_b = self.diff_main(text1_b, text2_b, checklines)
      # Merge the results.
      return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b

    # Perform a real diff.
    if checklines and (len(text1) < 100 or len(text2) < 100):
      checklines = False  # Too trivial for the overhead.
    if checklines:
      # Scan the text on a line-by-line basis first.
      # Each unique line is mapped to a single placeholder character.
      (text1, text2, linearray) = self.diff_linesToChars(text1, text2)

    diffs = self.diff_map(text1, text2)
    if not diffs:  # No acceptable result (e.g. diff_map timed out).
      diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
    if checklines:
      # Convert the diff back to original text.
      self.diff_charsToLines(diffs, linearray)
      # Eliminate freak matches (e.g. blank lines)
      self.diff_cleanupSemantic(diffs)

      # Rediff any replacement blocks, this time character-by-character.
      # Add a dummy entry at the end so the final run is flushed.
      diffs.append((self.DIFF_EQUAL, ''))
      pointer = 0
      count_delete = 0
      count_insert = 0
      text_delete = ''
      text_insert = ''
      while pointer < len(diffs):
        if diffs[pointer][0] == self.DIFF_INSERT:
          count_insert += 1
          text_insert += diffs[pointer][1]
        elif diffs[pointer][0] == self.DIFF_DELETE:
          count_delete += 1
          text_delete += diffs[pointer][1]
        elif diffs[pointer][0] == self.DIFF_EQUAL:
          # Upon reaching an equality, check for prior redundancies.
          if count_delete >= 1 and count_insert >= 1:
            # Delete the offending records and add the merged ones.
            a = self.diff_main(text_delete, text_insert, False)
            diffs[pointer - count_delete - count_insert : pointer] = a
            # Resume scanning just past the spliced-in sub-diff.
            pointer = pointer - count_delete - count_insert + len(a)
          count_insert = 0
          count_delete = 0
          text_delete = ''
          text_insert = ''

        pointer += 1

      diffs.pop()  # Remove the dummy entry at the end.
    return diffs
224
  def diff_linesToChars(self, text1, text2):
    """Split two texts into an array of strings.  Reduce the texts to a string
    of hashes where each Unicode character represents one line.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Three element tuple, containing the encoded text1, the encoded text2 and
      the array of unique strings.  The zeroth element of the array of unique
      strings is intentionally blank.
    """
    lineArray = []  # e.g. lineArray[4] == "Hello\n"
    lineHash = {}  # e.g. lineHash["Hello\n"] == 4

    # "\x00" is a valid character, but various debuggers don't like it.
    # So we'll insert a junk entry to avoid generating a null character.
    lineArray.append('')

    def diff_linesToCharsMunge(text):
      """Split a text into an array of strings.  Reduce the texts to a string
      of hashes where each Unicode character represents one line.
      Modifies lineArray and lineHash through being a closure.

      Args:
        text: String to encode.

      Returns:
        Encoded string.
      """
      chars = []
      # Walk the text, pulling out a substring for each line.
      # text.split('\n') would temporarily double our memory footprint.
      # Modifying text would create many large strings to garbage collect.
      lineStart = 0
      lineEnd = -1
      while lineEnd < len(text) - 1:
        lineEnd = text.find('\n', lineStart)
        if lineEnd == -1:
          # Final line is not newline-terminated; take it through the end.
          lineEnd = len(text) - 1
        line = text[lineStart:lineEnd + 1]
        lineStart = lineEnd + 1

        if line in lineHash:
          # Previously-seen line: reuse its placeholder character.
          chars.append(unichr(lineHash[line]))
        else:
          # New line: register it and emit a new placeholder character.
          lineArray.append(line)
          lineHash[line] = len(lineArray) - 1
          chars.append(unichr(len(lineArray) - 1))
      return "".join(chars)

    chars1 = diff_linesToCharsMunge(text1)
    chars2 = diff_linesToCharsMunge(text2)
    return (chars1, chars2, lineArray)
280
281 - def diff_charsToLines(self, diffs, lineArray):
282 """Rehydrate the text in a diff from a string of line hashes to real lines 283 of text. 284 285 Args: 286 diffs: Array of diff tuples. 287 lineArray: Array of unique strings. 288 """ 289 for x in xrange(len(diffs)): 290 text = [] 291 for char in diffs[x][1]: 292 text.append(lineArray[ord(char)]) 293 diffs[x] = (diffs[x][0], "".join(text))
294
  def diff_map(self, text1, text2):
    """Explore the intersection points between the two texts.

    Walks edit-distance contours d, tracking the furthest-reaching x on each
    diagonal k (Myers-style).  When the inputs are large enough, a second
    walk runs simultaneously from the ends of the texts and the two paths
    meet in the middle ("footsteps" records visited coordinates).

    Args:
      text1: Old string to be diffed.
      text2: New string to be diffed.

    Returns:
      Array of diff tuples or None if no diff available (timeout, or the
      texts have no commonality at all).
    """

    # Unlike in most languages, Python counts time in seconds.
    s_end = time.time() + self.Diff_Timeout  # Don't run for too long.
    # Cache the text lengths to prevent multiple calls.
    text1_length = len(text1)
    text2_length = len(text2)
    max_d = text1_length + text2_length - 1
    # Only bother with the double-ended walk on sufficiently large inputs.
    doubleEnd = self.Diff_DualThreshold * 2 < max_d
    # Python efficiency note: (x << 32) + y is the fastest way to combine
    # x and y into a single hashable value.  Tested in Python 2.5.
    # It is unclear why it is faster for v_map[d] to be indexed with an
    # integer whereas footsteps is indexed with a string.
    v_map1 = []  # Per-d snapshots of coordinates reached by the front walk.
    v_map2 = []  # Per-d snapshots of coordinates reached by the reverse walk.
    v1 = {}  # Furthest x per diagonal k, front walk.
    v2 = {}  # Furthest x per diagonal k, reverse walk.
    v1[1] = 0
    v2[1] = 0
    footsteps = {}
    done = False
    # If the total number of characters is odd, then the front path will
    # collide with the reverse path.
    front = (text1_length + text2_length) % 2
    for d in xrange(max_d):
      # Bail out if timeout reached.
      if self.Diff_Timeout > 0 and time.time() > s_end:
        return None

      # Walk the front path one step.
      v_map1.append({})
      for k in xrange(-d, d + 1, 2):
        # Extend from whichever neighbouring diagonal reaches further.
        if k == -d or k != d and v1[k - 1] < v1[k + 1]:
          x = v1[k + 1]
        else:
          x = v1[k - 1] + 1
        y = x - k
        if doubleEnd:
          footstep = str((x << 32) + y)
          if front and footstep in footsteps:
            done = True
          if not front:
            footsteps[footstep] = d

        # Follow the diagonal (matching characters) as far as possible.
        while (not done and x < text1_length and y < text2_length and
               text1[x] == text2[y]):
          x += 1
          y += 1
          if doubleEnd:
            footstep = str((x << 32) + y)
            if front and footstep in footsteps:
              done = True
            if not front:
              footsteps[footstep] = d

        v1[k] = x
        v_map1[d][(x << 32) + y] = True
        if x == text1_length and y == text2_length:
          # Reached the end in single-path mode.
          return self.diff_path1(v_map1, text1, text2)
        elif done:
          # Front path ran over reverse path.
          v_map2 = v_map2[:footsteps[footstep] + 1]
          a = self.diff_path1(v_map1, text1[:x], text2[:y])
          b = self.diff_path2(v_map2, text1[x:], text2[y:])
          return a + b

      if doubleEnd:
        # Walk the reverse path one step.
        v_map2.append({})
        for k in xrange(-d, d + 1, 2):
          if k == -d or k != d and v2[k - 1] < v2[k + 1]:
            x = v2[k + 1]
          else:
            x = v2[k - 1] + 1
          y = x - k
          # Reverse coordinates are measured from the ends of the texts.
          footstep = str((text1_length - x << 32) + text2_length - y)
          if not front and footstep in footsteps:
            done = True
          if front:
            footsteps[footstep] = d
          while (not done and x < text1_length and y < text2_length and
                 text1[-x - 1] == text2[-y - 1]):
            x += 1
            y += 1
            footstep = str((text1_length - x << 32) + text2_length - y)
            if not front and footstep in footsteps:
              done = True
            if front:
              footsteps[footstep] = d

          v2[k] = x
          v_map2[d][(x << 32) + y] = True
          if done:
            # Reverse path ran over front path.
            v_map1 = v_map1[:footsteps[footstep] + 1]
            a = self.diff_path1(v_map1, text1[:text1_length - x],
                                text2[:text2_length - y])
            b = self.diff_path2(v_map2, text1[text1_length - x:],
                                text2[text2_length - y:])
            return a + b

    # Number of diffs equals number of characters, no commonality at all.
    return None
408
  def diff_path1(self, v_map, text1, text2):
    """Work from the middle back to the start to determine the path.

    Args:
      v_map: Array of paths (per-d coordinate sets built by diff_map).
      text1: Old string fragment to be diffed.
      text2: New string fragment to be diffed.

    Returns:
      Array of diff tuples.
    """
    path = []
    x = len(text1)
    y = len(text2)
    last_op = None  # Op of path[0]; lets consecutive chars coalesce.
    for d in xrange(len(v_map) - 2, -1, -1):
      while True:
        # Did the previous contour reach (x-1, y)?  Then a char of text1
        # was deleted to get here.
        if (x - 1 << 32) + y in v_map[d]:
          x -= 1
          if last_op == self.DIFF_DELETE:
            # Prepend char to the existing leading delete.
            path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
          else:
            path[:0] = [(self.DIFF_DELETE, text1[x])]
          last_op = self.DIFF_DELETE
          break
        # Did it reach (x, y-1)?  Then a char of text2 was inserted.
        elif (x << 32) + y - 1 in v_map[d]:
          y -= 1
          if last_op == self.DIFF_INSERT:
            path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
          else:
            path[:0] = [(self.DIFF_INSERT, text2[y])]
          last_op = self.DIFF_INSERT
          break
        else:
          # Neither: this step must be a diagonal (matching characters).
          x -= 1
          y -= 1
          assert text1[x] == text2[y], ("No diagonal. " +
                                        "Can't happen. (diff_path1)")
          if last_op == self.DIFF_EQUAL:
            path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
          else:
            path[:0] = [(self.DIFF_EQUAL, text1[x])]
          last_op = self.DIFF_EQUAL
    return path
453
  def diff_path2(self, v_map, text1, text2):
    """Work from the middle back to the end to determine the path.
    Mirror of diff_path1: coordinates index the texts from their ends,
    and the path is built by appending instead of prepending.

    Args:
      v_map: Array of paths (per-d coordinate sets built by diff_map).
      text1: Old string fragment to be diffed.
      text2: New string fragment to be diffed.

    Returns:
      Array of diff tuples.
    """
    path = []
    x = len(text1)
    y = len(text2)
    last_op = None  # Op of path[-1]; lets consecutive chars coalesce.
    for d in xrange(len(v_map) - 2, -1, -1):
      while True:
        # Deletion step: previous contour reached (x-1, y).
        if (x - 1 << 32) + y in v_map[d]:
          x -= 1
          if last_op == self.DIFF_DELETE:
            path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
          else:
            path.append((self.DIFF_DELETE, text1[-x - 1]))
          last_op = self.DIFF_DELETE
          break
        # Insertion step: previous contour reached (x, y-1).
        elif (x << 32) + y - 1 in v_map[d]:
          y -= 1
          if last_op == self.DIFF_INSERT:
            path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
          else:
            path.append((self.DIFF_INSERT, text2[-y - 1]))
          last_op = self.DIFF_INSERT
          break
        else:
          # Diagonal step: characters match (counted from the ends).
          x -= 1
          y -= 1
          assert text1[-x - 1] == text2[-y - 1], ("No diagonal. " +
                                                  "Can't happen. (diff_path2)")
          if last_op == self.DIFF_EQUAL:
            path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
          else:
            path.append((self.DIFF_EQUAL, text1[-x - 1]))
          last_op = self.DIFF_EQUAL
    return path
498
499 - def diff_commonPrefix(self, text1, text2):
500 """Determine the common prefix of two strings. 501 502 Args: 503 text1: First string. 504 text2: Second string. 505 506 Returns: 507 The number of characters common to the start of each string. 508 """ 509 # Quick check for common null cases. 510 if not text1 or not text2 or text1[0] != text2[0]: 511 return 0 512 # Binary search. 513 # Performance analysis: http://neil.fraser.name/news/2007/10/09/ 514 pointermin = 0 515 pointermax = min(len(text1), len(text2)) 516 pointermid = pointermax 517 pointerstart = 0 518 while pointermin < pointermid: 519 if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]: 520 pointermin = pointermid 521 pointerstart = pointermin 522 else: 523 pointermax = pointermid 524 pointermid = int((pointermax - pointermin) / 2 + pointermin) 525 return pointermid
526
527 - def diff_commonSuffix(self, text1, text2):
528 """Determine the common suffix of two strings. 529 530 Args: 531 text1: First string. 532 text2: Second string. 533 534 Returns: 535 The number of characters common to the end of each string. 536 """ 537 # Quick check for common null cases. 538 if not text1 or not text2 or text1[-1] != text2[-1]: 539 return 0 540 # Binary search. 541 # Performance analysis: http://neil.fraser.name/news/2007/10/09/ 542 pointermin = 0 543 pointermax = min(len(text1), len(text2)) 544 pointermid = pointermax 545 pointerend = 0 546 while pointermin < pointermid: 547 if (text1[-pointermid:len(text1) - pointerend] == 548 text2[-pointermid:len(text2) - pointerend]): 549 pointermin = pointermid 550 pointerend = pointermin 551 else: 552 pointermax = pointermid 553 pointermid = int((pointermax - pointermin) / 2 + pointermin) 554 return pointermid
555
  def diff_halfMatch(self, text1, text2):
    """Do the two texts share a substring which is at least half the length of
    the longer text?

    NOTE: relies on Python 2 integer division (`/` on ints) throughout.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Five element Array, containing the prefix of text1, the suffix of text1,
      the prefix of text2, the suffix of text2 and the common middle.  Or None
      if there was no match.
    """
    if len(text1) > len(text2):
      (longtext, shorttext) = (text1, text2)
    else:
      (shorttext, longtext) = (text1, text2)
    if len(longtext) < 10 or len(shorttext) < 1:
      return None  # Pointless.

    def diff_halfMatchI(longtext, shorttext, i):
      """Does a substring of shorttext exist within longtext such that the
      substring is at least half the length of longtext?
      Closure, but does not reference any external variables.

      Args:
        longtext: Longer string.
        shorttext: Shorter string.
        i: Start index of quarter length substring within longtext.

      Returns:
        Five element Array, containing the prefix of longtext, the suffix of
        longtext, the prefix of shorttext, the suffix of shorttext and the
        common middle.  Or None if there was no match.
      """
      # A quarter-length seed guarantees any half-length match contains it.
      seed = longtext[i:i + len(longtext) / 4]
      best_common = ''
      j = shorttext.find(seed)
      while j != -1:
        # Grow the match around each occurrence of the seed.
        prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
        suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
        if len(best_common) < suffixLength + prefixLength:
          best_common = (shorttext[j - suffixLength:j] +
                         shorttext[j:j + prefixLength])
          best_longtext_a = longtext[:i - suffixLength]
          best_longtext_b = longtext[i + prefixLength:]
          best_shorttext_a = shorttext[:j - suffixLength]
          best_shorttext_b = shorttext[j + prefixLength:]
        j = shorttext.find(seed, j + 1)

      if len(best_common) >= len(longtext) / 2:
        return (best_longtext_a, best_longtext_b,
                best_shorttext_a, best_shorttext_b, best_common)
      else:
        return None

    # First check if the second quarter is the seed for a half-match.
    hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4)
    # Check again based on the third quarter.
    hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2)
    if not hm1 and not hm2:
      return None
    elif not hm2:
      hm = hm1
    elif not hm1:
      hm = hm2
    else:
      # Both matched.  Select the longest.
      if len(hm1[4]) > len(hm2[4]):
        hm = hm1
      else:
        hm = hm2

    # A half-match was found, sort out the return data.
    if len(text1) > len(text2):
      (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
    else:
      (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
    return (text1_a, text1_b, text2_a, text2_b, mid_common)
  def diff_cleanupSemantic(self, diffs):
    """Reduce the number of edits by eliminating semantically trivial
    equalities.  An equality shorter than the changes on both sides of it
    is converted into a delete+insert pair.  Mutates diffs in place.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = None  # Always equal to equalities[-1][1]
    pointer = 0  # Index of current position.
    length_changes1 = 0  # Number of chars that changed prior to the equality.
    length_changes2 = 0  # Number of chars that changed after the equality.
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
        equalities.append(pointer)
        length_changes1 = length_changes2
        length_changes2 = 0
        lastequality = diffs[pointer][1]
      else:  # an insertion or deletion
        length_changes2 += len(diffs[pointer][1])
        # Eliminate the equality if it is no longer than the edits
        # surrounding it on both sides.
        if (lastequality != None and (len(lastequality) <= length_changes1) and
            (len(lastequality) <= length_changes2)):
          # Duplicate record
          diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
          # Change second copy to insert.
          diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
                                       diffs[equalities[-1] + 1][1])
          # Throw away the equality we just deleted.
          equalities.pop()
          # Throw away the previous equality (it needs to be reevaluated).
          if len(equalities) != 0:
            equalities.pop()
          if len(equalities):
            # Rewind to rescan from the previous equality.
            pointer = equalities[-1]
          else:
            # No equalities left; restart the scan from the beginning
            # (-1 because pointer is incremented below).
            pointer = -1
          length_changes1 = 0  # Reset the counters.
          length_changes2 = 0
          lastequality = None
          changes = True
      pointer += 1

    if changes:
      self.diff_cleanupMerge(diffs)

    self.diff_cleanupSemanticLossless(diffs)
683
  def diff_cleanupSemanticLossless(self, diffs):
    """Look for single edits surrounded on both sides by equalities
    which can be shifted sideways to align the edit to a word boundary.
    e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
    Mutates diffs in place without changing the texts they produce.

    Args:
      diffs: Array of diff tuples.
    """

    def diff_cleanupSemanticScore(one, two):
      """Given two strings, compute a score representing whether the
      internal boundary falls on logical boundaries.
      Scores range from 5 (best) to 0 (worst).
      Closure, but does not reference any external variables.

      Args:
        one: First string.
        two: Second string.

      Returns:
        The score.
      """
      if not one or not two:
        # Edges are the best.
        return 5

      # Each port of this function behaves slightly differently due to
      # subtle differences in each language's definition of things like
      # 'whitespace'.  Since this function's purpose is largely cosmetic,
      # the choice has been made to use each language's native features
      # rather than force total conformity.
      score = 0
      # One point for non-alphanumeric.
      if not one[-1].isalnum() or not two[0].isalnum():
        score += 1
        # Two points for whitespace.
        if one[-1].isspace() or two[0].isspace():
          score += 1
          # Three points for line breaks.
          if (one[-1] == "\r" or one[-1] == "\n" or
              two[0] == "\r" or two[0] == "\n"):
            score += 1
            # Four points for blank lines.
            if (re.search("\\n\\r?\\n$", one) or
                re.match("^\\r?\\n\\r?\\n", two)):
              score += 1
      return score

    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
      if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
          diffs[pointer + 1][0] == self.DIFF_EQUAL):
        # This is a single edit surrounded by equalities.
        equality1 = diffs[pointer - 1][1]
        edit = diffs[pointer][1]
        equality2 = diffs[pointer + 1][1]

        # First, shift the edit as far left as possible.
        commonOffset = self.diff_commonSuffix(equality1, edit)
        if commonOffset:
          commonString = edit[-commonOffset:]
          equality1 = equality1[:-commonOffset]
          edit = commonString + edit[:-commonOffset]
          equality2 = commonString + equality2

        # Second, step character by character right, looking for the best fit.
        bestEquality1 = equality1
        bestEdit = edit
        bestEquality2 = equality2
        bestScore = (diff_cleanupSemanticScore(equality1, edit) +
                     diff_cleanupSemanticScore(edit, equality2))
        while edit and equality2 and edit[0] == equality2[0]:
          # Slide the edit one character to the right.
          equality1 += edit[0]
          edit = edit[1:] + equality2[0]
          equality2 = equality2[1:]
          score = (diff_cleanupSemanticScore(equality1, edit) +
                   diff_cleanupSemanticScore(edit, equality2))
          # The >= encourages trailing rather than leading whitespace on edits.
          if score >= bestScore:
            bestScore = score
            bestEquality1 = equality1
            bestEdit = edit
            bestEquality2 = equality2

        if diffs[pointer - 1][1] != bestEquality1:
          # We have an improvement, save it back to the diff.
          if bestEquality1:
            diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
          else:
            del diffs[pointer - 1]
            pointer -= 1
          diffs[pointer] = (diffs[pointer][0], bestEdit)
          if bestEquality2:
            diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
          else:
            del diffs[pointer + 1]
            pointer -= 1
      pointer += 1
  def diff_cleanupEfficiency(self, diffs):
    """Reduce the number of edits by eliminating operationally trivial
    equalities.  An equality cheaper to retransmit than to keep (relative
    to Diff_EditCost) is folded into the surrounding edits.  Mutates diffs
    in place.

    Args:
      diffs: Array of diff tuples.
    """
    changes = False
    equalities = []  # Stack of indices where equalities are found.
    lastequality = ''  # Always equal to equalities[-1][1]
    pointer = 0  # Index of current position.
    pre_ins = False  # Is there an insertion operation before the last equality.
    pre_del = False  # Is there a deletion operation before the last equality.
    post_ins = False  # Is there an insertion operation after the last equality.
    post_del = False  # Is there a deletion operation after the last equality.
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
        if (len(diffs[pointer][1]) < self.Diff_EditCost and
            (post_ins or post_del)):
          # Candidate found.
          equalities.append(pointer)
          pre_ins = post_ins
          pre_del = post_del
          lastequality = diffs[pointer][1]
        else:
          # Not a candidate, and can never become one.
          equalities = []
          lastequality = ''

        post_ins = post_del = False
      else:  # an insertion or deletion
        if diffs[pointer][0] == self.DIFF_DELETE:
          post_del = True
        else:
          post_ins = True

        # Five types to be split:
        # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
        # <ins>A</ins>X<ins>C</ins><del>D</del>
        # <ins>A</ins><del>B</del>X<ins>C</ins>
        # <ins>A</ins>X<ins>C</ins><del>D</del>
        # <ins>A</ins><del>B</del>X<del>C</del>

        if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
                             ((len(lastequality) < self.Diff_EditCost / 2) and
                              (pre_ins + pre_del + post_ins + post_del) == 3)):
          # Duplicate record
          diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
          # Change second copy to insert.
          diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
                                       diffs[equalities[-1] + 1][1])
          equalities.pop()  # Throw away the equality we just deleted
          lastequality = ''
          if pre_ins and pre_del:
            # No changes made which could affect previous entry, keep going.
            post_ins = post_del = True
            equalities = []
          else:
            if len(equalities):
              equalities.pop()  # Throw away the previous equality
            if len(equalities):
              # Rewind to rescan from the previous equality.
              pointer = equalities[-1]
            else:
              # Restart from the beginning (-1: pointer is incremented below).
              pointer = -1
            post_ins = post_del = False
          changes = True
      pointer += 1

    if changes:
      self.diff_cleanupMerge(diffs)
854
  def diff_cleanupMerge(self, diffs):
    """Reorder and merge like edit sections.  Merge equalities.
    Any edit section can move as long as it doesn't cross an equality.
    Mutates diffs in place.

    Args:
      diffs: Array of diff tuples.
    """
    diffs.append((self.DIFF_EQUAL, ''))  # Add a dummy entry at the end.
    pointer = 0
    count_delete = 0  # Number of consecutive delete tuples accumulated.
    count_insert = 0  # Number of consecutive insert tuples accumulated.
    text_delete = ''  # Concatenated text of the accumulated deletes.
    text_insert = ''  # Concatenated text of the accumulated inserts.
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_INSERT:
        count_insert += 1
        text_insert += diffs[pointer][1]
        pointer += 1
      elif diffs[pointer][0] == self.DIFF_DELETE:
        count_delete += 1
        text_delete += diffs[pointer][1]
        pointer += 1
      elif diffs[pointer][0] == self.DIFF_EQUAL:
        # Upon reaching an equality, check for prior redundancies.
        if count_delete != 0 or count_insert != 0:
          if count_delete != 0 and count_insert != 0:
            # Factor out any common prefixies.
            commonlength = self.diff_commonPrefix(text_insert, text_delete)
            if commonlength != 0:
              x = pointer - count_delete - count_insert - 1
              if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
                # Append the common prefix to the preceding equality.
                diffs[x] = (diffs[x][0], diffs[x][1] +
                            text_insert[:commonlength])
              else:
                # No preceding equality: create one at the front.
                diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
                pointer += 1
              text_insert = text_insert[commonlength:]
              text_delete = text_delete[commonlength:]
            # Factor out any common suffixies.
            commonlength = self.diff_commonSuffix(text_insert, text_delete)
            if commonlength != 0:
              # Prepend the common suffix to the following equality.
              diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
                                diffs[pointer][1])
              text_insert = text_insert[:-commonlength]
              text_delete = text_delete[:-commonlength]
          # Delete the offending records and add the merged ones.
          if count_delete == 0:
            diffs[pointer - count_insert : pointer] = [
                (self.DIFF_INSERT, text_insert)]
          elif count_insert == 0:
            diffs[pointer - count_delete : pointer] = [
                (self.DIFF_DELETE, text_delete)]
          else:
            diffs[pointer - count_delete - count_insert : pointer] = [
                (self.DIFF_DELETE, text_delete),
                (self.DIFF_INSERT, text_insert)]
          # Reposition pointer just past the spliced-in records.
          pointer = pointer - count_delete - count_insert + 1
          if count_delete != 0:
            pointer += 1
          if count_insert != 0:
            pointer += 1
        elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
          # Merge this equality with the previous one.
          diffs[pointer - 1] = (diffs[pointer - 1][0],
                                diffs[pointer - 1][1] + diffs[pointer][1])
          del diffs[pointer]
        else:
          pointer += 1

        count_insert = 0
        count_delete = 0
        text_delete = ''
        text_insert = ''

    if diffs[-1][1] == '':
      diffs.pop()  # Remove the dummy entry at the end.

    # Second pass: look for single edits surrounded on both sides by equalities
    # which can be shifted sideways to eliminate an equality.
    # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
    changes = False
    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
      if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
          diffs[pointer + 1][0] == self.DIFF_EQUAL):
        # This is a single edit surrounded by equalities.
        if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
          # Shift the edit over the previous equality.
          diffs[pointer] = (diffs[pointer][0],
                            diffs[pointer - 1][1] +
                            diffs[pointer][1][:-len(diffs[pointer - 1][1])])
          diffs[pointer + 1] = (diffs[pointer + 1][0],
                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
          del diffs[pointer - 1]
          changes = True
        elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
          # Shift the edit over the next equality.
          diffs[pointer - 1] = (diffs[pointer - 1][0],
                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
          diffs[pointer] = (diffs[pointer][0],
                            diffs[pointer][1][len(diffs[pointer + 1][1]):] +
                            diffs[pointer + 1][1])
          del diffs[pointer + 1]
          changes = True
      pointer += 1

    # If shifts were made, the diff needs reordering and another shift sweep.
    if changes:
      self.diff_cleanupMerge(diffs)
965
966 - def diff_xIndex(self, diffs, loc):
967 """loc is a location in text1, compute and return the equivalent location 968 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 969 970 Args: 971 diffs: Array of diff tuples. 972 loc: Location within text1. 973 974 Returns: 975 Location within text2. 976 """ 977 chars1 = 0 978 chars2 = 0 979 last_chars1 = 0 980 last_chars2 = 0 981 for x in xrange(len(diffs)): 982 (op, text) = diffs[x] 983 if op != self.DIFF_INSERT: # Equality or deletion. 984 chars1 += len(text) 985 if op != self.DIFF_DELETE: # Equality or insertion. 986 chars2 += len(text) 987 if chars1 > loc: # Overshot the location. 988 break 989 last_chars1 = chars1 990 last_chars2 = chars2 991 992 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: 993 # The location was deleted. 994 return last_chars2 995 # Add the remaining len(character). 996 return last_chars2 + (loc - last_chars1)
997
998 - def diff_prettyHtml(self, diffs):
999 """Convert a diff array into a pretty HTML report. 1000 1001 Args: 1002 diffs: Array of diff tuples. 1003 1004 Returns: 1005 HTML representation. 1006 """ 1007 html = [] 1008 i = 0 1009 for (op, data) in diffs: 1010 text = (data.replace("&", "&amp;").replace("<", "&lt;") 1011 .replace(">", "&gt;").replace("\n", "&para;<BR>")) 1012 if op == self.DIFF_INSERT: 1013 html.append("<INS STYLE=\"background:#E6FFE6;\" TITLE=\"i=%i\">%s</INS>" 1014 % (i, text)) 1015 elif op == self.DIFF_DELETE: 1016 html.append("<DEL STYLE=\"background:#FFE6E6;\" TITLE=\"i=%i\">%s</DEL>" 1017 % (i, text)) 1018 elif op == self.DIFF_EQUAL: 1019 html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text)) 1020 if op != self.DIFF_DELETE: 1021 i += len(data) 1022 return "".join(html)
1023
1024 - def diff_text1(self, diffs):
1025 """Compute and return the source text (all equalities and deletions). 1026 1027 Args: 1028 diffs: Array of diff tuples. 1029 1030 Returns: 1031 Source text. 1032 """ 1033 text = [] 1034 for (op, data) in diffs: 1035 if op != self.DIFF_INSERT: 1036 text.append(data) 1037 return "".join(text)
1038
1039 - def diff_text2(self, diffs):
1040 """Compute and return the destination text (all equalities and insertions). 1041 1042 Args: 1043 diffs: Array of diff tuples. 1044 1045 Returns: 1046 Destination text. 1047 """ 1048 text = [] 1049 for (op, data) in diffs: 1050 if op != self.DIFF_DELETE: 1051 text.append(data) 1052 return "".join(text)
1053
1054 - def diff_levenshtein(self, diffs):
1055 """Compute the Levenshtein distance; the number of inserted, deleted or 1056 substituted characters. 1057 1058 Args: 1059 diffs: Array of diff tuples. 1060 1061 Returns: 1062 Number of changes. 1063 """ 1064 levenshtein = 0 1065 insertions = 0 1066 deletions = 0 1067 for (op, data) in diffs: 1068 if op == self.DIFF_INSERT: 1069 insertions += len(data) 1070 elif op == self.DIFF_DELETE: 1071 deletions += len(data) 1072 elif op == self.DIFF_EQUAL: 1073 # A deletion and an insertion is one substitution. 1074 levenshtein += max(insertions, deletions) 1075 insertions = 0 1076 deletions = 0 1077 levenshtein += max(insertions, deletions) 1078 return levenshtein
1079
1080 - def diff_toDelta(self, diffs):
1081 """Crush the diff into an encoded string which describes the operations 1082 required to transform text1 into text2. 1083 E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. 1084 Operations are tab-separated. Inserted text is escaped using %xx notation. 1085 1086 Args: 1087 diffs: Array of diff tuples. 1088 1089 Returns: 1090 Delta text. 1091 """ 1092 import urllib 1093 text = [] 1094 for (op, data) in diffs: 1095 if op == self.DIFF_INSERT: 1096 # High ascii will raise UnicodeDecodeError. Use Unicode instead. 1097 data = data.encode("utf-8") 1098 text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# ")) 1099 elif op == self.DIFF_DELETE: 1100 text.append("-%d" % len(data)) 1101 elif op == self.DIFF_EQUAL: 1102 text.append("=%d" % len(data)) 1103 return "\t".join(text)
1104
1105 - def diff_fromDelta(self, text1, delta):
1106 """Given the original text1, and an encoded string which describes the 1107 operations required to transform text1 into text2, compute the full diff. 1108 1109 Args: 1110 text1: Source string for the diff. 1111 delta: Delta text. 1112 1113 Returns: 1114 Array of diff tuples. 1115 1116 Raises: 1117 ValueError: If invalid input. 1118 """ 1119 import urllib 1120 if type(delta) == unicode: 1121 # Deltas should be composed of a subset of ascii chars, Unicode not 1122 # required. If this encode raises UnicodeEncodeError, delta is invalid. 1123 delta = delta.encode("ascii") 1124 diffs = [] 1125 pointer = 0 # Cursor in text1 1126 tokens = delta.split("\t") 1127 for token in tokens: 1128 if token == "": 1129 # Blank tokens are ok (from a trailing \t). 1130 continue 1131 # Each token begins with a one character parameter which specifies the 1132 # operation of this token (delete, insert, equality). 1133 param = token[1:] 1134 if token[0] == "+": 1135 param = urllib.unquote(param).decode("utf-8") 1136 diffs.append((self.DIFF_INSERT, param)) 1137 elif token[0] == "-" or token[0] == "=": 1138 try: 1139 n = int(param) 1140 except ValueError: 1141 raise ValueError("Invalid number in diff_fromDelta: " + param) 1142 if n < 0: 1143 raise ValueError("Negative number in diff_fromDelta: " + param) 1144 text = text1[pointer : pointer + n] 1145 pointer += n 1146 if token[0] == "=": 1147 diffs.append((self.DIFF_EQUAL, text)) 1148 else: 1149 diffs.append((self.DIFF_DELETE, text)) 1150 else: 1151 # Anything else is an error. 1152 raise ValueError("Invalid diff operation in diff_fromDelta: " + 1153 token[0]) 1154 if pointer != len(text1): 1155 raise ValueError( 1156 "Delta length (%d) does not equal source text length (%d)." % 1157 (pointer, len(text1))) 1158 return diffs
1159 1160 # MATCH FUNCTIONS 1161
1162 - def match_main(self, text, pattern, loc):
1163 """Locate the best instance of 'pattern' in 'text' near 'loc'. 1164 1165 Args: 1166 text: The text to search. 1167 pattern: The pattern to search for. 1168 loc: The location to search around. 1169 1170 Returns: 1171 Best match index or -1. 1172 """ 1173 # Check for null inputs. 1174 if text == None or pattern == None: 1175 raise ValueError("Null inputs. (match_main)") 1176 1177 loc = max(0, min(loc, len(text))) 1178 if text == pattern: 1179 # Shortcut (potentially not guaranteed by the algorithm) 1180 return 0 1181 elif not text: 1182 # Nothing to match. 1183 return -1 1184 elif text[loc:loc + len(pattern)] == pattern: 1185 # Perfect match at the perfect spot! (Includes case of null pattern) 1186 return loc 1187 else: 1188 # Do a fuzzy compare. 1189 match = self.match_bitap(text, pattern, loc) 1190 return match
1191
  def match_bitap(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc' using the
    Bitap algorithm.

    Each pattern character is mapped to a bitmask (see match_alphabet) and
    candidate matches are tracked as bit arrays, one outer pass per allowed
    error count.  Scoring trades off error count against distance from 'loc'.

    NOTE(review): this relies on Python 2 semantics — '/' is integer division
    for ints, and range() returns a list that supports append().

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.
    """
    # Python doesn't have a maxint limit, so ignore this check.
    #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
    #  raise ValueError("Pattern too long for this application.")

    # Initialise the alphabet.
    s = self.match_alphabet(pattern)

    def match_bitapScore(e, x):
      """Compute and return the score for a match with e errors and x location.
      Accesses loc and pattern through being a closure.

      Args:
        e: Number of errors in match.
        x: Location of match.

      Returns:
        Overall score for match (0.0 = good, 1.0 = bad).
      """
      accuracy = float(e) / len(pattern)
      proximity = abs(loc - x)
      if not self.Match_Distance:
        # Dodge divide by zero error.
        return proximity and 1.0 or accuracy
      return accuracy + (proximity / float(self.Match_Distance))

    # Highest score beyond which we give up.
    score_threshold = self.Match_Threshold
    # Is there a nearby exact match? (speedup)
    best_loc = text.find(pattern, loc)
    if best_loc != -1:
      score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
    # What about in the other direction? (speedup)
    best_loc = text.rfind(pattern, loc + len(pattern))
    if best_loc != -1:
      score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

    # Initialise the bit arrays.
    matchmask = 1 << (len(pattern) - 1)
    best_loc = -1

    bin_max = len(pattern) + len(text)
    # Empty initialization added to appease pychecker.
    last_rd = None
    for d in xrange(len(pattern)):
      # Scan for the best match each iteration allows for one more error.
      # Run a binary search to determine how far from 'loc' we can stray at
      # this error level.
      bin_min = 0
      bin_mid = bin_max
      while bin_min < bin_mid:
        if match_bitapScore(d, loc + bin_mid) <= score_threshold:
          bin_min = bin_mid
        else:
          bin_max = bin_mid
        # Integer division under Python 2.
        bin_mid = (bin_max - bin_min) / 2 + bin_min

      # Use the result from this iteration as the maximum for the next.
      bin_max = bin_mid
      start = max(1, loc - bin_mid + 1)
      finish = min(loc + bin_mid, len(text)) + len(pattern)

      # rd is the bit array for this error level; Python 2 range() returns a
      # list, so it can be appended to.  Index j corresponds to text position
      # j - 1; the extra final element seeds the shift from beyond the end.
      rd = range(finish + 1)
      rd.append((1 << d) - 1)
      for j in xrange(finish, start - 1, -1):
        if len(text) <= j - 1:
          # Out of range.
          charMatch = 0
        else:
          charMatch = s.get(text[j - 1], 0)
        if d == 0:  # First pass: exact match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
        else:  # Subsequent passes: fuzzy match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
              ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
        if rd[j] & matchmask:
          score = match_bitapScore(d, j - 1)
          # This match will almost certainly be better than any existing match.
          # But check anyway.
          if score <= score_threshold:
            # Told you so.
            score_threshold = score
            best_loc = j - 1
            if best_loc > loc:
              # When passing loc, don't exceed our current distance from loc.
              start = max(1, 2 * loc - best_loc)
            else:
              # Already passed loc, downhill from here on in.
              break
      # No hope for a (better) match at greater error levels.
      if match_bitapScore(d + 1, loc) > score_threshold:
        break
      last_rd = rd
    return best_loc
1298 - def match_alphabet(self, pattern):
1299 """Initialise the alphabet for the Bitap algorithm. 1300 1301 Args: 1302 pattern: The text to encode. 1303 1304 Returns: 1305 Hash of character locations. 1306 """ 1307 s = {} 1308 for char in pattern: 1309 s[char] = 0 1310 for i in xrange(len(pattern)): 1311 s[pattern[i]] |= 1 << (len(pattern) - i - 1) 1312 return s
1313 1314 # PATCH FUNCTIONS 1315
1316 - def patch_addContext(self, patch, text):
1317 """Increase the context until it is unique, 1318 but don't let the pattern expand beyond Match_MaxBits. 1319 1320 Args: 1321 patch: The patch to grow. 1322 text: Source text. 1323 """ 1324 if len(text) == 0: 1325 return 1326 pattern = text[patch.start2 : patch.start2 + patch.length1] 1327 padding = 0 1328 1329 # Look for the first and last matches of pattern in text. If two different 1330 # matches are found, increase the pattern length. 1331 while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == 1332 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - 1333 self.Patch_Margin)): 1334 padding += self.Patch_Margin 1335 pattern = text[max(0, patch.start2 - padding) : 1336 patch.start2 + patch.length1 + padding] 1337 # Add one chunk for good luck. 1338 padding += self.Patch_Margin 1339 1340 # Add the prefix. 1341 prefix = text[max(0, patch.start2 - padding) : patch.start2] 1342 if prefix: 1343 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] 1344 # Add the suffix. 1345 suffix = text[patch.start2 + patch.length1 : 1346 patch.start2 + patch.length1 + padding] 1347 if suffix: 1348 patch.diffs.append((self.DIFF_EQUAL, suffix)) 1349 1350 # Roll back the start points. 1351 patch.start1 -= len(prefix) 1352 patch.start2 -= len(prefix) 1353 # Extend lengths. 1354 patch.length1 += len(prefix) + len(suffix) 1355 patch.length2 += len(prefix) + len(suffix)
1356
  def patch_make(self, a, b=None, c=None):
    """Compute a list of patches to turn text1 into text2.
    Use diffs if provided, otherwise compute it ourselves.
    There are four ways to call this function, depending on what data is
    available to the caller:
    Method 1:
      a = text1, b = text2
    Method 2:
      a = diffs
    Method 3 (optimal):
      a = text1, b = diffs
    Method 4 (deprecated, use method 3):
      a = text1, b = text2, c = diffs

    Args:
      a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
          text2 (method 2).
      b: text2 (methods 1,4) or Array of diff tuples for text1 to
          text2 (method 3) or undefined (method 2).
      c: Array of diff tuples for text1 to text2 (method 4) or
          undefined (methods 1,2,3).

    Returns:
      Array of patch objects.

    Raises:
      ValueError: If the arguments match none of the four call forms.
    """
    text1 = None
    diffs = None
    # Note that texts may arrive as 'str' or 'unicode'.
    if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
      # Method 1: text1, text2
      # Compute diffs from text1 and text2.
      text1 = a
      diffs = self.diff_main(text1, b, True)
      if len(diffs) > 2:
        self.diff_cleanupSemantic(diffs)
        self.diff_cleanupEfficiency(diffs)
    elif isinstance(a, list) and b is None and c is None:
      # Method 2: diffs
      # Compute text1 from diffs.
      diffs = a
      text1 = self.diff_text1(diffs)
    elif isinstance(a, basestring) and isinstance(b, list) and c is None:
      # Method 3: text1, diffs
      text1 = a
      diffs = b
    elif (isinstance(a, basestring) and isinstance(b, basestring) and
          isinstance(c, list)):
      # Method 4: text1, text2, diffs
      # text2 is not used.
      text1 = a
      diffs = c
    else:
      raise ValueError("Unknown call format to patch_make.")

    if not diffs:
      return []  # Get rid of the None case.
    patches = []
    patch = patch_obj()
    char_count1 = 0  # Number of characters into the text1 string.
    char_count2 = 0  # Number of characters into the text2 string.
    # prepatch_text/postpatch_text track text1 before and after applying the
    # diffs seen so far, so each patch can grow context against the text as
    # it will look when that patch is applied (rolling context).
    prepatch_text = text1  # Recreate the patches to determine context info.
    postpatch_text = text1
    for x in xrange(len(diffs)):
      (diff_type, diff_text) = diffs[x]
      if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
        # A new patch starts here.
        patch.start1 = char_count1
        patch.start2 = char_count2
      if diff_type == self.DIFF_INSERT:
        # Insertion
        patch.diffs.append(diffs[x])
        patch.length2 += len(diff_text)
        postpatch_text = (postpatch_text[:char_count2] + diff_text +
                          postpatch_text[char_count2:])
      elif diff_type == self.DIFF_DELETE:
        # Deletion.
        patch.length1 += len(diff_text)
        patch.diffs.append(diffs[x])
        postpatch_text = (postpatch_text[:char_count2] +
                          postpatch_text[char_count2 + len(diff_text):])
      elif (diff_type == self.DIFF_EQUAL and
            len(diff_text) <= 2 * self.Patch_Margin and
            len(patch.diffs) != 0 and len(diffs) != x + 1):
        # Small equality inside a patch.
        patch.diffs.append(diffs[x])
        patch.length1 += len(diff_text)
        patch.length2 += len(diff_text)

      if (diff_type == self.DIFF_EQUAL and
          len(diff_text) >= 2 * self.Patch_Margin):
        # Time for a new patch.
        if len(patch.diffs) != 0:
          self.patch_addContext(patch, prepatch_text)
          patches.append(patch)
          patch = patch_obj()
          # Unlike Unidiff, our patch lists have a rolling context.
          # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
          # Update prepatch text & pos to reflect the application of the
          # just completed patch.
          prepatch_text = postpatch_text
          char_count1 = char_count2

      # Update the current character count.
      if diff_type != self.DIFF_INSERT:
        char_count1 += len(diff_text)
      if diff_type != self.DIFF_DELETE:
        char_count2 += len(diff_text)

    # Pick up the leftover patch if not empty.
    if len(patch.diffs) != 0:
      self.patch_addContext(patch, prepatch_text)
      patches.append(patch)
    return patches
1470
1471 - def patch_deepCopy(self, patches):
1472 """Given an array of patches, return another array that is identical. 1473 1474 Args: 1475 patches: Array of patch objects. 1476 1477 Returns: 1478 Array of patch objects. 1479 """ 1480 patchesCopy = [] 1481 for patch in patches: 1482 patchCopy = patch_obj() 1483 # No need to deep copy the tuples since they are immutable. 1484 patchCopy.diffs = patch.diffs[:] 1485 patchCopy.start1 = patch.start1 1486 patchCopy.start2 = patch.start2 1487 patchCopy.length1 = patch.length1 1488 patchCopy.length2 = patch.length2 1489 patchesCopy.append(patchCopy) 1490 return patchesCopy
1491
  def patch_apply(self, patches, text):
    """Merge a set of patches onto the text.  Return a patched text, as well
    as a list of true/false values indicating which patches were applied.

    Args:
      patches: Array of patch objects.
      text: Old text.

    Returns:
      Two element Array, containing the new text and an array of boolean values.
    """
    if not patches:
      return (text, [])

    # Deep copy the patches so that no changes are made to originals.
    patches = self.patch_deepCopy(patches)

    # Pad the text so patches at the very start/end still have context to
    # match against, then split any patch wider than the bitap matcher.
    nullPadding = self.patch_addPadding(patches)
    text = nullPadding + text + nullPadding
    self.patch_splitMax(patches)

    # delta keeps track of the offset between the expected and actual location
    # of the previous patch.  If there are patches expected at positions 10 and
    # 20, but the first patch was found at 12, delta is 2 and the second patch
    # has an effective expected position of 22.
    delta = 0
    results = []
    for patch in patches:
      expected_loc = patch.start2 + delta
      text1 = self.diff_text1(patch.diffs)
      end_loc = -1
      if len(text1) > self.Match_MaxBits:
        # patch_splitMax will only provide an oversized pattern in the case of
        # a monster delete.  Match the head and tail of the pattern separately.
        start_loc = self.match_main(text, text1[:self.Match_MaxBits],
                                    expected_loc)
        if start_loc != -1:
          end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
              expected_loc + len(text1) - self.Match_MaxBits)
          if end_loc == -1 or start_loc >= end_loc:
            # Can't find valid trailing context.  Drop this patch.
            start_loc = -1
      else:
        start_loc = self.match_main(text, text1, expected_loc)
      if start_loc == -1:
        # No match found.  :(
        results.append(False)
        # Subtract the delta for this failed patch from subsequent patches.
        delta -= patch.length2 - patch.length1
      else:
        # Found a match.  :)
        results.append(True)
        delta = start_loc - expected_loc
        if end_loc == -1:
          text2 = text[start_loc : start_loc + len(text1)]
        else:
          text2 = text[start_loc : end_loc + self.Match_MaxBits]
        if text1 == text2:
          # Perfect match, just shove the replacement text in.
          text = (text[:start_loc] + self.diff_text2(patch.diffs) +
                  text[start_loc + len(text1):])
        else:
          # Imperfect match.
          # Run a diff to get a framework of equivalent indices.
          diffs = self.diff_main(text1, text2, False)
          if (len(text1) > self.Match_MaxBits and
              self.diff_levenshtein(diffs) / float(len(text1)) >
              self.Patch_DeleteThreshold):
            # The end points match, but the content is unacceptably bad.
            results[-1] = False
          else:
            self.diff_cleanupSemanticLossless(diffs)
            # Replay the patch's edits at the positions diff_xIndex maps
            # them to within the matched region.
            index1 = 0
            for (op, data) in patch.diffs:
              if op != self.DIFF_EQUAL:
                index2 = self.diff_xIndex(diffs, index1)
              if op == self.DIFF_INSERT:  # Insertion
                text = text[:start_loc + index2] + data + text[start_loc +
                                                               index2:]
              elif op == self.DIFF_DELETE:  # Deletion
                text = text[:start_loc + index2] + text[start_loc +
                    self.diff_xIndex(diffs, index1 + len(data)):]
              if op != self.DIFF_DELETE:
                index1 += len(data)
    # Strip the padding off.
    text = text[len(nullPadding):-len(nullPadding)]
    return (text, results)
1579
1580 - def patch_addPadding(self, patches):
1581 """Add some padding on text start and end so that edges can match 1582 something. Intended to be called only from within patch_apply. 1583 1584 Args: 1585 patches: Array of patch objects. 1586 1587 Returns: 1588 The padding string added to each side. 1589 """ 1590 paddingLength = self.Patch_Margin 1591 nullPadding = "" 1592 for x in xrange(1, paddingLength + 1): 1593 nullPadding += chr(x) 1594 1595 # Bump all the patches forward. 1596 for patch in patches: 1597 patch.start1 += paddingLength 1598 patch.start2 += paddingLength 1599 1600 # Add some padding on start of first diff. 1601 patch = patches[0] 1602 diffs = patch.diffs 1603 if not diffs or diffs[0][0] != self.DIFF_EQUAL: 1604 # Add nullPadding equality. 1605 diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) 1606 patch.start1 -= paddingLength # Should be 0. 1607 patch.start2 -= paddingLength # Should be 0. 1608 patch.length1 += paddingLength 1609 patch.length2 += paddingLength 1610 elif paddingLength > len(diffs[0][1]): 1611 # Grow first equality. 1612 extraLength = paddingLength - len(diffs[0][1]) 1613 newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] 1614 diffs[0] = (diffs[0][0], newText) 1615 patch.start1 -= extraLength 1616 patch.start2 -= extraLength 1617 patch.length1 += extraLength 1618 patch.length2 += extraLength 1619 1620 # Add some padding on end of last diff. 1621 patch = patches[-1] 1622 diffs = patch.diffs 1623 if not diffs or diffs[-1][0] != self.DIFF_EQUAL: 1624 # Add nullPadding equality. 1625 diffs.append((self.DIFF_EQUAL, nullPadding)) 1626 patch.length1 += paddingLength 1627 patch.length2 += paddingLength 1628 elif paddingLength > len(diffs[-1][1]): 1629 # Grow last equality. 1630 extraLength = paddingLength - len(diffs[-1][1]) 1631 newText = diffs[-1][1] + nullPadding[:extraLength] 1632 diffs[-1] = (diffs[-1][0], newText) 1633 patch.length1 += extraLength 1634 patch.length2 += extraLength 1635 1636 return nullPadding
1637
  def patch_splitMax(self, patches):
    """Look through the patches and break up any which are longer than the
    maximum limit of the match algorithm.

    NOTE(review): the loop below mutates 'patches' and reassigns the loop
    variable 'x' while iterating xrange(len(patches)); the reassignments only
    affect the insert position within one iteration, not the iteration count.

    Args:
      patches: Array of patch objects.
    """
    if self.Match_MaxBits == 0:
      return
    for x in xrange(len(patches)):
      if patches[x].length1 > self.Match_MaxBits:
        bigpatch = patches[x]
        # Remove the big old patch.
        del patches[x]
        x -= 1
        patch_size = self.Match_MaxBits
        start1 = bigpatch.start1
        start2 = bigpatch.start2
        precontext = ''
        while len(bigpatch.diffs) != 0:
          # Create one of several smaller patches.
          patch = patch_obj()
          empty = True
          patch.start1 = start1 - len(precontext)
          patch.start2 = start2 - len(precontext)
          if precontext:
            patch.length1 = patch.length2 = len(precontext)
            patch.diffs.append((self.DIFF_EQUAL, precontext))

          # Consume diffs from the big patch until this small patch is full.
          while (len(bigpatch.diffs) != 0 and
                 patch.length1 < patch_size - self.Patch_Margin):
            (diff_type, diff_text) = bigpatch.diffs[0]
            if diff_type == self.DIFF_INSERT:
              # Insertions are harmless.
              patch.length2 += len(diff_text)
              start2 += len(diff_text)
              patch.diffs.append(bigpatch.diffs.pop(0))
              empty = False
            elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
                  patch.diffs[0][0] == self.DIFF_EQUAL and
                  len(diff_text) > 2 * patch_size):
              # This is a large deletion.  Let it pass in one chunk.
              patch.length1 += len(diff_text)
              start1 += len(diff_text)
              empty = False
              patch.diffs.append((diff_type, diff_text))
              del bigpatch.diffs[0]
            else:
              # Deletion or equality.  Only take as much as we can stomach.
              diff_text = diff_text[:patch_size - patch.length1 -
                                    self.Patch_Margin]
              patch.length1 += len(diff_text)
              start1 += len(diff_text)
              if diff_type == self.DIFF_EQUAL:
                patch.length2 += len(diff_text)
                start2 += len(diff_text)
              else:
                empty = False

              patch.diffs.append((diff_type, diff_text))
              # Either the whole diff was consumed, or trim off the part taken.
              if diff_text == bigpatch.diffs[0][1]:
                del bigpatch.diffs[0]
              else:
                bigpatch.diffs[0] = (bigpatch.diffs[0][0],
                                     bigpatch.diffs[0][1][len(diff_text):])

          # Compute the head context for the next patch.
          precontext = self.diff_text2(patch.diffs)
          precontext = precontext[-self.Patch_Margin:]
          # Append the end context for this patch.
          postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
          if postcontext:
            patch.length1 += len(postcontext)
            patch.length2 += len(postcontext)
            if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
              patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
                                 postcontext)
            else:
              patch.diffs.append((self.DIFF_EQUAL, postcontext))

          if not empty:
            x += 1
            patches.insert(x, patch)
1721
1722 - def patch_toText(self, patches):
1723 """Take a list of patches and return a textual representation. 1724 1725 Args: 1726 patches: Array of patch objects. 1727 1728 Returns: 1729 Text representation of patches. 1730 """ 1731 text = [] 1732 for patch in patches: 1733 text.append(str(patch)) 1734 return "".join(text)
1735
  def patch_fromText(self, textline):
    """Parse a textual representation of patches and return a list of patch
    objects.

    Args:
      textline: Text representation of patches.

    Returns:
      Array of patch objects.

    Raises:
      ValueError: If invalid input.
    """
    if type(textline) == unicode:
      # Patches should be composed of a subset of ascii chars, Unicode not
      # required.  If this encode raises UnicodeEncodeError, patch is invalid.
      textline = textline.encode("ascii")
    patches = []
    if not textline:
      return patches
    text = textline.split('\n')
    while len(text) != 0:
      # Each patch begins with a GNU-diff-style header line.
      m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
      if not m:
        raise ValueError("Invalid patch string: " + text[0])
      patch = patch_obj()
      patches.append(patch)
      patch.start1 = int(m.group(1))
      # A missing length means 1.  Printed coordinates are 1-based (see
      # patch_obj.__str__), so except for the zero-length case the start is
      # decremented back to 0-based.
      if m.group(2) == '':
        patch.start1 -= 1
        patch.length1 = 1
      elif m.group(2) == '0':
        patch.length1 = 0
      else:
        patch.start1 -= 1
        patch.length1 = int(m.group(2))

      patch.start2 = int(m.group(3))
      if m.group(4) == '':
        patch.start2 -= 1
        patch.length2 = 1
      elif m.group(4) == '0':
        patch.length2 = 0
      else:
        patch.start2 -= 1
        patch.length2 = int(m.group(4))

      del text[0]

      import urllib
      # Body lines: first character is the operation, the rest is
      # %xx-escaped UTF-8 text.
      while len(text) != 0:
        if text[0]:
          sign = text[0][0]
        else:
          sign = ''
        line = urllib.unquote(text[0][1:])
        line = line.decode("utf-8")
        if sign == '+':
          # Insertion.
          patch.diffs.append((self.DIFF_INSERT, line))
        elif sign == '-':
          # Deletion.
          patch.diffs.append((self.DIFF_DELETE, line))
        elif sign == ' ':
          # Minor equality.
          patch.diffs.append((self.DIFF_EQUAL, line))
        elif sign == '@':
          # Start of next patch.
          break
        elif sign == '':
          # Blank line?  Whatever.
          pass
        else:
          # WTF?
          raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
        del text[0]
    return patches
1813 1814
class patch_obj:
  """Class representing one patch operation.
  """

  def __init__(self):
    """Initializes with an empty list of diffs.
    """
    self.diffs = []     # List of (op, text) diff tuples.
    self.start1 = None  # Start offset within text1.
    self.start2 = None  # Start offset within text2.
    self.length1 = 0    # Number of text1 characters covered.
    self.length2 = 0    # Number of text2 characters covered.

  def __str__(self):
    """Emulate GNU diff's format.
    Header: @@ -382,8 +481,9 @@
    Indices are printed as 1-based, not 0-based.

    Returns:
      The GNU diff string.
    """
    import urllib

    def coords(start, length):
      # GNU diff prints 1-based coordinates and omits the length when it
      # is exactly 1; zero-length ranges keep the 0-based start.
      if length == 0:
        return str(start) + ",0"
      if length == 1:
        return str(start + 1)
      return str(start + 1) + "," + str(length)

    text = ["@@ -", coords(self.start1, self.length1),
            " +", coords(self.start2, self.length2), " @@\n"]
    # Escape the body of the patch with %xx notation.
    for (op, data) in self.diffs:
      if op == diff_match_patch.DIFF_INSERT:
        text.append("+")
      elif op == diff_match_patch.DIFF_DELETE:
        text.append("-")
      elif op == diff_match_patch.DIFF_EQUAL:
        text.append(" ")
      # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
      data = data.encode("utf-8")
      text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
    return "".join(text)
1862