1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 """A class to calculate a similarity based on the Levenshtein
22 distance. See http://en.wikipedia.org/wiki/Levenshtein_distance.
23
24 If available, the python-Levenshtein package will be used which will provide
25 better performance as it is implemented natively. See
26 http://trific.ath.cx/python/levenshtein/
27 """
28
29 import math
30 import sys
31
33 """Calculates the distance for use in similarity calculation. Python
34 version."""
35 l1 = len(a)
36 l2 = len(b)
37 if stopvalue == -1:
38 stopvalue = l2
39 current = range(l1+1)
40 for i in range(1, l2+1):
41 previous, current = current, [i]+[0]*l1
42 least = l2
43 for j in range(1, l1 + 1):
44 change = previous[j-1]
45 if a[j-1] != b[i-1]:
46 change = change + 1
47 insert = previous[j] + 1
48 delete = current[j-1] + 1
49 current[j] = min(insert, delete, change)
50 if least > current[j]:
51 least = current[j]
52
53
54 if least > stopvalue:
55 return least
56
57 return current[l1]
58
60 """Same as python_distance in functionality. This uses the fast C
61 version if we detected it earlier.
62
63 Note that this does not support arbitrary sequence types, but only
64 string types."""
65 return Levenshtein.distance(a, b)
66
67 try:
68 import Levenshtein as Levenshtein
69 distance = native_distance
70 except Exception:
71 import logging
72 logging.warning("Python-Levenshtein not found. Continuing with built-in (slower) fuzzy matching.")
73 distance = python_distance
74
77 self.MAX_LEN = max_len
78
98
100 """Returns the similarity between a and b based on Levenshtein distance. It
101 can stop prematurely as soon as it sees that a and b will be no simmilar than
102 the percentage specified in stoppercentage.
103
104 The Levenshtein distance is calculated, but the following should be noted:
105 - Only the first MAX_LEN characters are considered. Long strings differing
106 at the end will therefore seem to match better than they should. See the
107 use of the variable penalty to lessen the effect of this.
108 - Strings with widely different lengths give the opportunity for shortcut.
109 This is by definition of the Levenshtein distance: the distance will be
110 at least as much as the difference in string length.
111 - Calculation is stopped as soon as a similarity of stoppercentage becomes
112 unattainable. See the use of the variable stopvalue.
113 - Implementation uses memory O(min(len(a), len(b))
114 - Excecution time is O(len(a)*len(b))
115 """
116 l1, l2 = len(a), len(b)
117 if l1 == 0 or l2 == 0:
118 return 0
119
120 if l1 > l2:
121 l1, l2 = l2, l1
122 a, b = b, a
123
124
125
126 maxsimilarity = 100 - 100.0*abs(l1 - l2)/l2
127 if maxsimilarity < stoppercentage:
128 return maxsimilarity * 1.0
129
130
131 penalty = 0
132 if l2 > self.MAX_LEN:
133 b = b[:self.MAX_LEN]
134 l2 = self.MAX_LEN
135 penalty += 7
136 if l1 > self.MAX_LEN:
137 a = a[:self.MAX_LEN]
138 l1 = self.MAX_LEN
139 penalty += 7
140
141
142 stopvalue = math.ceil((100.0 - stoppercentage)/100 * l2)
143 dist = distance(a, b, stopvalue)
144 if dist > stopvalue:
145 return stoppercentage - 1.0
146
147
148
149 if dist != 0:
150 penalty = 0
151 return 100 - (dist*1.0/l2)*100 - penalty
152
153
154 if __name__ == "__main__":
155 from sys import argv
156 comparer = LevenshteinComparer()
157 print "Similarity:\n%s" % comparer.similarity(argv[1], argv[2], 50)
158