Ticket #13514: levenshtein_normalize_unicode.py

File levenshtein_normalize_unicode.py, 2.4 KB (added by rcrdnalor, 2 years ago)

Usage of levenshtein within python bindings

Line 
1# -*- coding: utf-8 -*-
2
3import unicodedata
4
5# from MythTV.utility import levenshtein
6   ## see below copy
7
8
def normalize_unicode(s):
    """Return `s` converted to Unicode normalization form NFKC.

    NFKC applies the compatibility decomposition and then recomposes
    canonically equivalent sequences, so e.g. an `A` followed by a
    combining diaeresis becomes the single code point `Ä`.  Normalizing
    both operands this way before calling levenshtein() keeps visually
    identical strings from being reported as different.

    References:
       https://en.wikipedia.org/wiki/Unicode_equivalence
       https://stackoverflow.com/questions/29243962/levenshtein-distance-in-python-wrong-result-with-national-characters
       https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work
    """
    target_form = 'NFKC'
    return unicodedata.normalize(target_form, s)
17
18
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the
    other.  Elements are compared with `!=`, so for unicode strings the
    unit is a code point and for byte strings it is a byte.

    Algorithm adapted from
    http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance
    """
    # The distance is symmetric; iterate over the longer sequence in the
    # outer loop so that each stored row is as short as possible.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    if not longer:
        return len(shorter)

    # Only two rows of the dynamic-programming table are kept at a time.
    row_above = range(len(shorter) + 1)
    for row_index, item_long in enumerate(longer, 1):
        row = [row_index]
        for col, item_short in enumerate(shorter):
            row.append(min(
                row_above[col + 1] + 1,                     # insertion
                row[col] + 1,                               # deletion
                row_above[col] + (item_long != item_short)  # substitution
            ))
        row_above = row

    return row_above[-1]
40
41
if __name__ == '__main__':

    # Unicode strings that look the same may consist of different code
    # points, so let's normalize them before comparing:
    # See https://en.wikipedia.org/wiki/Unicode_equivalence
    # Decompose and recompose a string containing an `Ä`:
    composed_str   = u"Madam I'm Ädam"
    decomposed_str = unicodedata.normalize('NFKD', composed_str)
    recomposed_str = normalize_unicode(decomposed_str)

    print(len(composed_str))                               # 14
    print(len(decomposed_str))                             # 15 (`A` + combining diaeresis)

    print(levenshtein(u"Madam I'm Adam", composed_str))     # 1
    print(levenshtein(u"Madam I'm Adam", decomposed_str))   # 1
    print(levenshtein(composed_str, decomposed_str))        # 2
    print(levenshtein(composed_str, recomposed_str))        # 0


    # Check utf-8 encoded strings: the distance is then counted in bytes,
    # not in characters.
    utf_str1 = u"Madam I'm Ädam".encode('utf-8')
    utf_str2 = u"Madam I'm Adam".encode('utf-8')

    print(len(utf_str1))                                   # 15 (`Ä` is two bytes in utf-8)
    print(len(utf_str2))                                   # 14
    print(levenshtein(utf_str1, utf_str2))                  # 2
    print(levenshtein(utf_str1, utf_str1))                  # 0
68