-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrules.rb
162 lines (131 loc) · 4.44 KB
/
rules.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Global switch for the verbose trace output printed while parsing.
DEBUG = false
# Holds one individual rule parsed from a line of the rule file.
#
# The line is split on whitespace into five fields, in this order:
# type, class (the rule's flag), del, add, when. Only `del` and `add` are
# interpreted here: `add` is the suffix a word must end with for the rule to
# apply, and `del` is the text put back in its place when computing the root
# ("0" means "put nothing back"). Anything after a "/" in the add field is
# discarded.
#
# NOTE: `attr_accessor :class` deliberately shadows Object#class, so
# `rule.class` returns the rule's flag string, not the Ruby class.
class Rule
  attr_accessor :type, :class, :del, :add, :when, :rules

  # @param rule [String] one whitespace-separated rule line
  def initialize(rule)
    # String#split with no argument already drops surrounding whitespace.
    @type, @class, @del, @add, @when = rule.split
    @add = @add.split("/")[0] if @add
  end

  # @param word [String, nil]
  # @return [Boolean] true when the (stripped) word ends with this rule's
  #   suffix; false for a nil word or a rule line that had no add field.
  def match?(word)
    return false unless word && @add

    word.strip.end_with?(@add)
  end

  # Undo this rule on +word+.
  #
  # @param word [String, nil]
  # @return [String, nil] the stripped word with its trailing suffix replaced
  #   by `del` (or removed when `del` is "0"); nil when the rule does not match.
  def root(word)
    return nil unless match?(word)

    replacement = @del == "0" ? "" : @del
    # Anchor at the end and escape the suffix: only the trailing occurrence
    # may be replaced, and the suffix is literal text, not a pattern. The
    # block form keeps backslashes in the replacement literal as well.
    word.strip.sub(/#{Regexp.escape(@add)}\z/) { replacement }
  end

  def to_s
    "#{@type} #{@class} #{@del} #{@add} #{@when}"
  end
end
require 'set'

# Manages all the rules: loads the SFX rules and the dictionary, and reduces
# a word to a root word plus the flags of the rules that produced it.
#
# Side effects of construction: opens (and truncates) 'doubt_file.txt' in the
# current directory for writing and, when present, reads the first line of
# 'doubt_suffixes.txt' as a comma-separated list of suffixes.
# NOTE: the doubt file handle stays open for the lifetime of the object.
class RuleMachine
  # A rule-group header line: type, flag, Y/N and a count (e.g. "SFX A Y 12").
  # Such lines carry no rule and are skipped by #rules_loader.
  HEADER_LINE = /\A[A-Z]+\s+[A-Za-z]\s+[YN]\s+[0-9]+\z/.freeze

  # @param rule_file [String] path of the rule file
  # @param dictionary [String] path of the dictionary, one word per line;
  #   lines made only of digits are ignored
  def initialize(rule_file, dictionary)
    rules_loader rule_file
    words = File.readlines(dictionary).map(&:strip).reject { |w| w.match?(/\A\d+\z/) }
    # A Set gives constant-time lookups; #valid? is called once per matching
    # rule for every parsed word.
    @dict = Set.new(words)
    @doubt_file = File.new('doubt_file.txt', 'w+')
    @doubt_list = load_doubt_suffixes('doubt_suffixes.txt')
    @doubt_suffixes = @doubt_list.join('|')
    # nil when there is nothing to check, so #parse never builds an empty pattern.
    @doubt_pattern = @doubt_list.empty? ? nil : Regexp.union(@doubt_list)
    puts "doubt_suffixes = #{@doubt_suffixes}"
    puts "loaded #{words.size} words into @dict from file #{dictionary}"
  end

  # (Re)loads @rules from +rule_file+, keeping only SFX rule lines.
  #
  # @param rule_file [String]
  def rules_loader(rule_file)
    @rules = []
    File.foreach(rule_file) do |line|
      next unless line.start_with?('SFX') # load only SFX rules
      # Compare the stripped line so CRLF line endings do not hide a header.
      next if line.strip.match?(HEADER_LINE)

      @rules << Rule.new(line)
    end
  end

  # @param word [String]
  # @return [Boolean] whether +word+ is in the loaded dictionary
  def valid?(word)
    puts "validating #{word}" if DEBUG
    @dict.include? word
  end

  # Get the word and return its root word and rules.
  #
  # +word+ may carry flags after a "/" ("word/AB"); those flags are kept and
  # the flag of every rule whose computed root is in the dictionary is added.
  # When several rules yield a valid root, the last one wins. When none does,
  # the word itself (without flags) is returned as the root.
  #
  # @param word [String]
  # @return [Array(String, Array<String>)] root word and unique flags
  def parse(word)
    word = word.strip
    word_sans_rules, flags = word.split("/")
    word_sans_rules = word_sans_rules.to_s # "" or "/" splits to an empty array
    rules = flags.to_s.chars               # no flags (or a trailing "/") gives []
    root = ""
    @rules.each do |rule|
      root_word = rule.root(word_sans_rules)
      next unless root_word

      puts "root_word = #{root_word}" if DEBUG
      next unless valid? root_word

      puts "root of #{word} => #{root_word}" if DEBUG
      # If the root was obtained by removing just one letter (one of the doubt
      # suffixes), the word could as well be a root word itself, so record it
      # in the doubt file for a manual check.
      @doubt_file.puts word if doubtful?(word, root_word)
      root = root_word
      rules << rule.class # Rule#class is the rule's flag, see Rule
    end
    puts "Is root empty? #{root.strip.empty?}" if DEBUG
    puts "word_sans_rules : #{word_sans_rules}" if DEBUG
    root = word_sans_rules if root.strip.empty?
    puts "root : #{root}" if DEBUG
    print "."
    [root, rules.uniq]
  end

  private

  # Reads the comma-separated suffix list from the first line of +path+.
  # Returns [] when the file is missing or empty.
  def load_doubt_suffixes(path)
    return [] unless File.exist?(path)

    File.readlines(path).first.to_s.split(',').map(&:strip).reject(&:empty?)
  end

  # True when +word+ contains +root_word+ immediately followed by one of the
  # doubt suffixes. Both are matched as literal text.
  def doubtful?(word, root_word)
    return false unless @doubt_pattern

    word.match?(/#{Regexp.escape(root_word)}#{@doubt_pattern}/)
  end
end
# Processes a word list: reduces every word to its root through a RuleMachine
# and writes the resulting root words with their flags to an output file.
class WordParser
  # @param rule_file [String] path of the rule file handed to RuleMachine
  # @param dictionary [String] path of the dictionary handed to RuleMachine
  def initialize(rule_file, dictionary)
    @rm = RuleMachine.new rule_file, dictionary
    puts "loaded rules from file #{rule_file}"
  end

  # Parses a single word, prints the result with `p` and returns it.
  #
  # @param w [String]
  # @return [Array] whatever RuleMachine#parse returns for +w+
  def match_word(w)
    result = @rm.parse w
    p result
  end

  # Reduces every word of +word_file+ and writes the roots to +out_file+.
  #
  # Input lines made only of digits and blank lines are skipped. The output
  # starts with the number of distinct roots, followed by one "root" or
  # "root/FLAGS" line per root, in first-seen order.
  #
  # @param word_file [String] path of the input word list, one word per line
  # @param out_file [String] path of the file to (over)write
  def match(word_file, out_file)
    puts "debug : #{DEBUG}"
    # Delete lines that only hold numbers.
    words = File.readlines(word_file).map(&:strip).reject { |line| line.match?(/\A\d+\z/) }
    puts "loaded #{words.size} words into @words from file #{word_file}"
    root_words = {}
    words.each do |w|
      next if w.empty?

      root, flags = @rm.parse w
      # Several words can reduce to the same root; merge their flags instead
      # of letting the last word seen overwrite the earlier ones.
      root_words[root] = (root_words[root] || []) | flags
    end
    File.open(out_file, 'w+') do |out|
      out.puts root_words.size
      root_words.each do |root, flags|
        out.puts(flags.empty? ? root : "#{root}/#{flags.join}")
      end
    end
    puts "took #{word_file} and gave #{out_file}"
    puts "word count in original file : #{words.size}"
    puts "word count in processed file : #{root_words.size}"
  end
end
# Script entry point: build a parser from the rule file and the complete
# dictionary, reduce the old word list into the new one, and report the
# elapsed wall-clock time.
started_at = Time.now
parser = WordParser.new "ta_TA.aff", "ta_complete.dic"
parser.match "ta_TA.dic.old", "ta_TA.dic.new"

# Alternative invocations, kept disabled for reference:
#   parser.match "ta_complete.dic", "ta_complete.dic"
#   parser.match "test.txt", "test.dic"
#   word = File.open('test.txt','r').readlines[0]
#   File.open('result.txt','w+') {|out| out.puts parser.match_word word}
#
# Disabled batch run over the split dictionaries. It no longer matches the
# current signatures (WordParser.new takes two arguments, match takes two):
#   suffixes = ["","_2","_3","_4"]
#   suffixes.each do |suffix|
#     file="ta_TA#{suffix}.dic.old"
#     w = WordParser.new "ta_TA.aff",file,file
#     print "start process on #{file}...."
#     w.match
#   end

finished_at = Time.now
puts "done.\ntime taken : #{finished_at - started_at} seconds"