-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGrammarChecker.py
125 lines (106 loc) · 4.69 KB
/
GrammarChecker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from collections import defaultdict
from Lexeme import Lexeme
import os
def load_lexicon(filename):
    """Read a lexicon file into a POS-indexed table and a flat vocabulary list.

    Every line that is not blank and is not a ``#`` comment is split on
    whitespace into ``word``, ``POS`` and zero or more ``type:value``
    feature tokens.  One ``Lexeme`` is built per line.

    Args:
        filename: Path of the lexicon file to read.

    Returns:
        A ``(lexicon, vocab)`` tuple.  ``lexicon`` is a ``defaultdict``
        mapping each POS string to the list of ``Lexeme`` objects carrying
        that POS; ``vocab`` is the list of all ``Lexeme`` objects in file
        order.

    Raises:
        IndexError: If an entry line has fewer than two fields, or a
            feature token contains no ``:`` separator.
    """
    lexicon = defaultdict(list)  # maps POS to the Lexemes
    vocab = []  # holds all lexemes
    with open(filename) as f:
        for raw_line in f:
            # Strip before testing so whitespace-only lines and indented
            # comments are skipped instead of crashing on fields[0] or
            # being parsed as an entry.
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            fields = stripped.split()
            word = fields[0]
            POS = fields[1]

            # A feature type may appear several times on one line, so each
            # type maps to the list of all values seen for it.
            feature_dictionary = defaultdict(list)
            for feature in fields[2:]:
                parts = feature.split(':')
                feature_dictionary[parts[0]].append(parts[1])

            lexeme = Lexeme(word=word, POS=POS, features=feature_dictionary)
            vocab.append(lexeme)
            lexicon[POS].append(lexeme)
    return lexicon, vocab
def load_grammar(filename):
    """Read a grammar file into a table of rewrite rules keyed by nonterminal.

    Every line that is not blank and is not a ``#`` comment is split on
    whitespace.  The first token is the left-hand side: a nonterminal name
    optionally followed by ``-``-separated features.  The second token is
    skipped without being inspected (NOTE(review): presumably an arrow such
    as ``->`` -- confirm against the grammar file).  Each remaining token is
    one right-hand-side element, itself split on ``-`` into a symbol name
    followed by its features.

    Args:
        filename: Path of the grammar file to read.

    Returns:
        A ``defaultdict`` mapping each nonterminal name to a list of
        ``(elements, lhs_features)`` tuples, one per rule line in file
        order.  ``elements`` is a list of ``[symbol, feature, ...]`` lists
        and ``lhs_features`` is the list of features attached to the
        left-hand side.
    """
    grammar = defaultdict(list)
    with open(filename) as f:
        for raw_line in f:
            # Strip before testing so whitespace-only lines and indented
            # comments are skipped instead of crashing on tokens[0] or
            # being parsed as a rule.
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            tokens = stripped.split()
            nonterminal, *lhs_features = tokens[0].split("-")
            # tokens[1] is the separator between the two sides of the rule.
            elements = [token.split("-") for token in tokens[2:]]
            grammar[nonterminal].append((elements, lhs_features))
    return grammar
class GrammarChecker():
    """Top-down checker that tests sentences against a lexicon and a grammar.

    Attributes are filled in by ``__init__``:
      * ``lexicon`` -- maps a POS to the list of Lexemes with that POS.
      * ``vocab``   -- list of all Lexemes.
      * ``grammar`` -- maps a nonterminal to its list of
        ``(elements, lhs_features)`` rewrite rules.

    Parsing is greedy: for any symbol other than ``TOP`` the first rule
    whose elements all match is accepted and alternatives are never
    revisited.  Features attached to grammar symbols are not consulted.
    """

    def __init__(self, lexicon_path=None, grammar_path=None):
        """Load the lexicon and grammar rule files.

        Args:
            lexicon_path: Lexicon file to read; defaults to
                ``language_rules/lexicon.txt`` relative to the working
                directory.
            grammar_path: Grammar file to read; defaults to
                ``language_rules/grammar.txt`` relative to the working
                directory.
        """
        if lexicon_path is None:
            lexicon_path = os.path.join('language_rules', 'lexicon.txt')
        if grammar_path is None:
            grammar_path = os.path.join('language_rules', 'grammar.txt')
        # lexicon maps POS to list of Lexemes with that POS, vocab is list of all lexemes
        self.lexicon, self.vocab = load_lexicon(filename=lexicon_path)
        # dict of nonterminal to rewrite rules
        self.grammar = load_grammar(filename=grammar_path)

    def get_parse(self, sentence):
        """Return the parse tree of ``sentence``, or ``None`` if it cannot be parsed.

        Args:
            sentence: Whitespace-separated string of words.

        Returns:
            A nested list ``["TOP", subtree, ...]`` on success, ``None``
            when no ``TOP`` rule covers the whole sentence.
        """
        result = self.recursive_parse(sentence.split(), "TOP", 0, debug=False)
        # A failed parse comes back as an error string; indexing it would
        # hand the caller its first character instead of a tree.
        if isinstance(result, str):
            return None
        return result[0]

    def is_grammatical(self, sentence, debug=False):
        """Return ``True`` when the whole of ``sentence`` parses from ``TOP``.

        Args:
            sentence: Whitespace-separated string of words.
            debug: When true, print each ``(symbol, index)`` visited.
        """
        words = sentence.split()
        result = self.recursive_parse(words, "TOP", 0, debug=debug)
        if isinstance(result, str):
            return False
        # Only a parse that consumed every word counts.
        return result[1] == len(words)

    def recursive_parse(self, sentence, symbol, index, debug=False, tabs=""):
        """Try to derive ``symbol`` from ``sentence`` starting at ``index``.

        Args:
            sentence: List of word strings.
            symbol: POS or nonterminal name to match.
            index: Position in ``sentence`` at which matching starts.
            debug: When true, print each ``(symbol, index)`` visited.
            tabs: Prefix printed before debug output; grows by one tab per
                recursion level.

        Returns:
            ``(tree, next_index)`` on success, where ``tree`` is
            ``[symbol, child_tree, ...]`` and ``next_index`` is the position
            after the last consumed word.  On failure, a string starting
            with ``"YIKES"`` describing why the match failed.
        """
        if debug:
            print(tabs, symbol, index)

        # base case: we have a terminal node, e.g. NNP > Elaine
        if symbol in self.lexicon:
            if index >= len(sentence):
                return "YIKES: parse index is longer than sentence"
            word = sentence[index]
            if any(lexeme.word == word for lexeme in self.lexicon[symbol]):
                return ([symbol], index + 1)
            return "YIKES: Word is not in lexicon: %s" % word

        # recursive case: we have a rewrite rule.  Use .get() so probing an
        # unknown symbol does not insert an empty entry into the grammar
        # table when it is a defaultdict.
        for entry in self.grammar.get(symbol, ()):
            rule = entry[0]
            position = index
            children = []
            matched = True
            for element in rule:
                # element[0] is the symbol name; later items are features.
                outcome = self.recursive_parse(
                    sentence, element[0], position, debug=debug, tabs=tabs + '\t'
                )
                if isinstance(outcome, str):  # "YIKES" occurred
                    matched = False
                    break
                subtree, position = outcome
                children.append(subtree)

            if not matched:
                continue
            if symbol == 'TOP' and position != len(sentence):
                # we ended parsing too soon: try another TOP rule
                continue
            return ([symbol] + children, position)

        # we tried all the rules, and none of them worked. No good
        return "YIKES: no production rule satisfied"
if __name__ == '__main__':
    # Manual smoke run: parse one sample sentence from TOP with debug
    # tracing on and print the raw result (a (tree, index) tuple on
    # success, a "YIKES" string on failure).
    checker = GrammarChecker()
    sample_words = "dime que hora es".split()
    print(checker.recursive_parse(sample_words, "TOP", 0, debug=True))