1 | # -*- coding: utf-8 -*- |
---|
2 | |
---|
3 | import unicodedata |
---|
4 | |
---|
5 | # from MythTV.utility import levenshtein |
---|
6 | ## see below copy |
---|
7 | |
---|
8 | |
---|
9 | def normalize_unicode(s): |
---|
10 | """Returns a unicode string in the normalized composition form. |
---|
11 | See |
---|
12 | https://en.wikipedia.org/wiki/Unicode_equivalence |
---|
13 | https://stackoverflow.com/questions/29243962/levenshtein-distance-in-python-wrong-result-with-national-characters |
---|
14 | https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work |
---|
15 | """ |
---|
16 | return unicodedata.normalize('NFKC', s) |
---|
17 | |
---|
18 | |
---|
19 | def levenshtein(s1, s2): |
---|
20 | """Compute the Levenshtein distance of two strings. |
---|
21 | """ |
---|
22 | # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance |
---|
23 | |
---|
24 | if len(s1) < len(s2): |
---|
25 | return levenshtein(s2, s1) |
---|
26 | if not s1: |
---|
27 | return len(s2) |
---|
28 | |
---|
29 | previous_row = range(len(s2) + 1) |
---|
30 | for i, c1 in enumerate(s1): |
---|
31 | current_row = [i + 1] |
---|
32 | for j, c2 in enumerate(s2): |
---|
33 | insertions = previous_row[j + 1] + 1 |
---|
34 | deletions = current_row[j] + 1 |
---|
35 | substitutions = previous_row[j] + (c1 != c2) |
---|
36 | current_row.append(min(insertions, deletions, substitutions)) |
---|
37 | previous_row = current_row |
---|
38 | |
---|
39 | return previous_row[-1] |
---|
40 | |
---|
41 | |
---|
42 | if __name__ == '__main__': |
---|
43 | |
---|
44 | # Unicode strings may look different, lets normalize them: |
---|
45 | # See https://en.wikipedia.org/wiki/Unicode_equivalence |
---|
46 | # decompose and recompose string containing an `Ã` : |
---|
47 | composed_str = u"Madam I'm Ãdam" |
---|
48 | decomposed_str = unicodedata.normalize('NFKD', composed_str) |
---|
49 | recomposed_str = unicodedata.normalize('NFKC', decomposed_str) |
---|
50 | |
---|
51 | print(len(composed_str)) # 14 |
---|
52 | print(len(decomposed_str)) # 15 |
---|
53 | |
---|
54 | print(levenshtein(u"Madam I'm Adam", composed_str)) # 1 |
---|
55 | print(levenshtein(u"Madam I'm Adam", decomposed_str)) # 1 |
---|
56 | print(levenshtein(composed_str, decomposed_str)) # 2 |
---|
57 | print(levenshtein(composed_str, recomposed_str)) # 0 |
---|
58 | |
---|
59 | |
---|
60 | # check utf-8 encoded strings: |
---|
61 | utf_str1 = u"Madam I'm Ãdam".encode('utf-8') |
---|
62 | utf_str2 = u"Madam I'm Adam".encode('utf-8') |
---|
63 | |
---|
64 | print(len(utf_str1)) # 14 |
---|
65 | print(len(utf_str2)) # 15 |
---|
66 | print(levenshtein(utf_str1, utf_str2)) # 2 |
---|
67 | print(levenshtein(utf_str1, utf_str1)) # 0 |
---|
68 | |
---|