# test.py (forked from suamin/PyNemex)
# -*- coding: utf-8 -*-
import nemex
import collections
def test_nemex():
    """Run a small end-to-end nemex example and print the verified matches.

    A short list of entity strings and one sample document are tokenized
    with the same ``nemex.Tokenizer``.  ``nemex.Faerie`` is then run over the
    document tokens, the distinct candidate substrings it yields are grouped
    per entity, and each one is re-checked with ``nemex.Verify.check``.
    Everything is written to stdout; the function returns nothing.
    """
    entities = [
        "kaushik ch",
        "chakrabarti",
        "chaudhuri",
        "venkatesh",
        "surajit ch",
    ]
    document = "an efficient filter for approximate membership checking. venkaee shga kamunshik kabarati, dong xin, surauijt chadhurisigmod."

    # Similarity name and bound; both are passed to Faerie and to Verify.
    similarity = "edit_dist"
    threshold = 2

    # Tokenizer settings (passed positionally, in this order).
    q = 2
    special_char = "_"
    char_mode = True
    unique = False
    tokenize = nemex.Tokenizer(char_mode, q, special_char, unique).tokenize

    # Entities dictionary and the Faerie instance built on top of it.
    entity_index = nemex.EntitiesDictionary.from_list(entities, tokenize)
    extractor = nemex.Faerie(entity_index, similarity, threshold, q, "batch_count")

    doc_tokens = tokenize(document)
    report_verified_only = True

    # Collect the distinct candidate strings reported for each entity id.
    matches = collections.defaultdict(set)
    for ent_id, (start, end) in extractor(doc_tokens):
        # The slice includes the token at position ``end``.
        window = doc_tokens[start:end + 1]
        text = nemex.utils.qgrams_to_char(window) if char_mode else " ".join(window)
        matches[ent_id].add(text)

    # The comparison symbol in the report depends only on the similarity name.
    relation = "<=" if similarity == "edit_dist" else ">="

    for ent_id, found in matches.items():
        if not found:
            continue
        print("Entity:", entity_index[ent_id].entity)
        print("----------------------------")

        # In char mode both sides of the check are plain strings; otherwise
        # both sides are token sequences.
        reference = entity_index[ent_id].tokens
        if char_mode:
            reference = nemex.utils.qgrams_to_char(reference)

        for text in found:
            probe = text if char_mode else tokenize(text)
            valid, score = nemex.Verify.check(probe, reference, similarity, threshold)
            if report_verified_only and not valid:
                continue
            print("[{}] {} -- t_true={} {} {}=t_bounded".format(
                valid, text, score, relation, threshold))
        print()
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    test_nemex()