1 """
2 Given a trie, find all occurrences of a word in the trie in a string.
3
4 Like searching a string for a substring, except that the substring is
5 any word in a trie.
6
7 Functions:
8 match Find longest key in a trie matching the beginning of the string.
9 match_all Find all keys in a trie matching the beginning of the string.
10 find Find keys in a trie matching anywhere in a string.
11 find_words Find keys in a trie matching whole words in a string.
12
13 """
14 import string
15 import re
16
18 """match(string, trie) -> longest key or None
19
20 Find the longest key in the trie that matches the beginning of the
21 string.
22
23 """
24 longest = None
25 for i in range(len(string)):
26 substr = string[:i+1]
27 if not trie.has_prefix(substr):
28 break
29 if trie.has_key(substr):
30 longest = substr
31 return longest
32
34 """match_all(string, trie) -> list of keys
35
36 Find all the keys in the trie that matches the beginning of the
37 string.
38
39 """
40 matches = []
41 for i in range(len(string)):
42 substr = string[:i+1]
43 if not trie.has_prefix(substr):
44 break
45 if trie.has_key(substr):
46 matches.append(substr)
47 return matches
48
49 -def find(string, trie):
64
65 DEFAULT_BOUNDARY_CHARS = string.punctuation + string.whitespace
66
68 """find_words(string, trie) -> list of tuples (key, start, end)
69
70 Find all the keys in the trie that match full words in the string.
71 Word boundaries are defined as any punctuation or whitespace.
72
73 """
74 _boundary_re = re.compile(r"[%s]+" % re.escape(DEFAULT_BOUNDARY_CHARS))
75
76 results = []
77 start = 0
78 while start < len(string):
79
80 keys = match_all(string[start:], trie)
81 for key in keys:
82 l = len(key)
83
84 if start+l == len(string) or \
85 _boundary_re.match(string[start+l]):
86 results.append((key, start, start+l))
87
88 m = _boundary_re.search(string, start)
89 if m is None:
90 break
91 start = m.end()
92 return results
93