repl.it
@heather/

jaro winkler distance

Python

No description

fork
loading
Files
  • main.py
  • foobar.txt
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import math

def match(s1, s2):
    """Return the set of distinct characters that occur in both strings.

    Positions and repeat counts are ignored: each shared character appears
    exactly once in the result. The result is empty when the strings have no
    character in common.
    """
    return set(s1) & set(s2)

def technical_match(s1, s2):
    """Return the characters of ``s1`` that have a Jaro match in ``s2``.

    A character at index ``i`` of ``s1`` matches an equal character of ``s2``
    that has not been matched yet and whose index is at most ``window``
    positions away from ``i``, where
    ``window = max(len(s1), len(s2)) // 2 - 1`` (never below 0).

    Each position of ``s2`` is consumed at most once, so repeated characters
    are matched once per occurrence rather than once per distinct character.

    Returns:
        A list of the matched characters in the order they appear in ``s1``;
        an empty list when nothing matches (including empty input).
    """
    # Halve the length of the *longer* string; clamp at 0 so that very short
    # strings (length 1-3) can still match characters at the same index.
    window = max(0, max(len(s1), len(s2)) // 2 - 1)

    consumed = [False] * len(s2)  # s2 positions already paired with s1
    matched = []
    for i, ch in enumerate(s1):
        lo = max(0, i - window)
        hi = min(len(s2), i + window + 1)
        for j in range(lo, hi):
            if not consumed[j] and s2[j] == ch:
                consumed[j] = True
                matched.append(ch)
                break  # pair each s1 character with at most one s2 position
    return matched

def diff_letters(seq1, seq2):
    """Count the positions at which two sequences hold different items.

    The sequences are compared pairwise from the start; comparison stops at
    the end of the shorter one, so surplus items in the longer sequence are
    not counted.
    """
    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left != right:
            mismatches += 1
    return mismatches

def transpositions(s1, s2):
    """Count the positions at which the matched characters of both strings disagree.

    Each string is reduced to the characters reported by ``technical_match``
    (keeping that string's own character order), and the two reduced strings
    are then compared position by position with ``diff_letters``.

    Returns:
        The number of differing positions (not halved).
    """
    matched = list(technical_match(s1, s2))
    kept_from_s1 = ''.join(ch for ch in s1 if ch in matched)
    kept_from_s2 = ''.join(ch for ch in s2 if ch in matched)
    return diff_letters(kept_from_s1, kept_from_s2)

def jaro_similarity(s1, s2):
    """Return the Jaro similarity of two strings.

    With ``m`` matched characters (from ``technical_match``) and ``t`` half
    the number of out-of-order matched positions, the score is::

        (m / len(s1) + m / len(s2) + (m - t) / m) / 3

    Returns:
        ``0`` when there are no matching characters (which also covers empty
        input, so no division by zero can occur); otherwise a float.
    """
    matches = len(technical_match(s1, s2))
    if matches == 0:
        return 0

    # transpositions() counts every position where the matched sequences
    # differ; the Jaro formula uses half of that count and *subtracts* it,
    # so that out-of-order matches lower the score instead of raising it.
    half_transpositions = transpositions(s1, s2) / 2
    return (
        matches / len(s1)
        + matches / len(s2)
        + (matches - half_transpositions) / matches
    ) / 3

# Load the candidate strings, one per line. The context manager closes the
# file as soon as its contents have been read.
with open('foobar.txt', 'r') as candidates_file:
    match_text = candidates_file.read().splitlines()
pattern = 'hat'
constant = .5  # a candidate is kept only if its similarity exceeds this

# Keep every candidate line that is similar enough to the pattern.
results = [
    line for line in match_text
    if jaro_similarity(line, pattern) > constant
]

print(results)