Skip to content

Commit 266a7de

Browse files
authored
Add files via upload
0 parents  commit 266a7de

11 files changed

+40656
-0
lines changed

DutchtoEnglish.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from nltk.tokenize import word_tokenize
2+
import nltk.data
3+
import string
4+
import ModelTrainer
5+
import ModelTester
6+
import Utils
7+
8+
def sentence_tokenizer(sentence_list):
    """Lower-case and tokenize each raw line, returning space-joined tokens.

    Args:
        sentence_list: Iterable of raw text lines (for example the result
            of ``file.readlines()``).

    Returns:
        A list with one string per input line: the line's
        ``word_tokenize`` tokens, lower-cased and joined by single spaces.
        An empty input yields an empty list.
    """
    final_list = []
    for position, sen in enumerate(sentence_list):
        if position == 0:
            # The '\ufeff' byte-order mark is only expected at the very
            # start of the document, so only the first line is cleaned.
            sen = sen.replace('\ufeff', '')
        final_list.append(" ".join(word_tokenize(sen.lower())))
    return final_list
28+
29+
30+
def translate():
    """Translate the Tagalog document ``input.txt`` into ``output.txt``.

    Every line of ``input.txt`` is normalised with ``sentence_tokenizer``,
    split into sentences, and each sentence is passed to
    ``ModelTester.sentence_tester1`` with option ``1``. Each translated
    sentence is written to ``output.txt`` followed by ``". "``.

    Both files are opened as UTF-8: the tokenizer strips a ``'\\ufeff'``
    byte-order mark, which only shows up when the input is decoded as
    UTF-8, so the encoding is made explicit instead of relying on the
    platform default.
    """
    # NOTE(review): the English Punkt model is applied to Tagalog text
    # here -- confirm that this is the intended sentence splitter.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    with open("input.txt", encoding="utf-8") as f:
        tagalog_lines = sentence_tokenizer(f.readlines())

    tagalog_sentences = [
        sen
        for line in tagalog_lines
        for sen in tokenizer.tokenize(line)
    ]

    # The context manager guarantees the output is flushed and closed even
    # if translating one of the sentences raises.
    with open("output.txt", "w", encoding="utf-8") as out_file:
        for current_sen in tagalog_sentences:
            out_file.write(ModelTester.sentence_tester1(current_sen, 1))
            out_file.write(". ")

    print("Successfully translated! Translated document is 'output.txt' ")

English.txt

+20,106
Large diffs are not rendered by default.

EnglishtoDutch.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from nltk.tokenize import word_tokenize
2+
import nltk.data
3+
import string
4+
import ModelTrainer
5+
import ModelTester
6+
import Utils
7+
8+
def sentence_tokenizer(sentence_list):
    """Lower-case and tokenize each raw line, returning space-joined tokens.

    Args:
        sentence_list: Iterable of raw text lines (for example the result
            of ``file.readlines()``).

    Returns:
        A list with one string per input line: the line's
        ``word_tokenize`` tokens, lower-cased and joined by single spaces.
        An empty input yields an empty list.
    """
    final_list = []
    for position, sen in enumerate(sentence_list):
        if position == 0:
            # The '\ufeff' byte-order mark is only expected at the very
            # start of the document, so only the first line is cleaned.
            sen = sen.replace('\ufeff', '')
        final_list.append(" ".join(word_tokenize(sen.lower())))
    return final_list
28+
29+
30+
def translate():
    """Translate the English document ``input.txt`` into ``output.txt``.

    Every line of ``input.txt`` is normalised with ``sentence_tokenizer``,
    split into sentences, and each sentence is passed to
    ``ModelTester.sentence_tester1`` with option ``2``. Each translated
    sentence is written to ``output.txt`` followed by ``". "``.

    Both files are opened as UTF-8: the tokenizer strips a ``'\\ufeff'``
    byte-order mark, which only shows up when the input is decoded as
    UTF-8, so the encoding is made explicit instead of relying on the
    platform default.
    """
    # The text being split here is English, so the English Punkt model is
    # loaded. The 'tagalog' model referenced previously is not one of the
    # languages in NLTK's 'punkt' data package, so loading it fails.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    with open("input.txt", encoding="utf-8") as f:
        english_lines = sentence_tokenizer(f.readlines())

    english_sentences = [
        sen
        for line in english_lines
        for sen in tokenizer.tokenize(line)
    ]

    # The context manager guarantees the output is flushed and closed even
    # if translating one of the sentences raises.
    with open("output.txt", "w", encoding="utf-8") as out_file:
        for current_sen in english_sentences:
            out_file.write(ModelTester.sentence_tester1(current_sen, 2))
            out_file.write(". ")

    print("Successfully translated! Translated document is 'output.txt' ")

IBM1_EM.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import numpy as np
2+
from datetime import datetime
3+
import math
4+
import Utils
5+
6+
7+
def expectation_maximization(tagalog_word_dict,english_word_dict,tagalog_sentences,english_sentences):
    """Estimate IBM Model 1 word-translation probabilities with EM.

    Args:
        tagalog_word_dict: Maps each Tagalog word to its row index in the
            probability matrix.
        english_word_dict: Maps each English word to its column index in
            the probability matrix.
        tagalog_sentences: Sentences as strings whose words are separated
            by single spaces.
        english_sentences: Sentences in the same format;
            ``english_sentences[i]`` is paired with ``tagalog_sentences[i]``.

    Returns:
        A float array of shape
        ``(len(tagalog_word_dict), len(english_word_dict))``. Entries start
        uniform at ``1 / len(english_word_dict)`` and, for every word pair
        that co-occurs in some sentence pair, are re-estimated on each
        iteration until ``Utils.is_converged`` returns true.

    Raises:
        ZeroDivisionError: If ``english_word_dict`` is empty.
        KeyError: If a sentence contains a word missing from its dictionary.
        IndexError: If ``english_sentences`` is shorter than
            ``tagalog_sentences``.
    """
    n_tagalog = len(tagalog_word_dict)
    n_english = len(english_word_dict)

    # IBM1 Expectation Maximization algorithm:
    translate_matrix = np.full((n_tagalog, n_english), 1 / n_english, dtype=float)
    previous_matrix = np.full((n_tagalog, n_english), 1, dtype=float)

    # The corpus does not change between iterations, so split the sentences
    # and resolve the dictionary indices once instead of on every EM pass.
    indexed_pairs = []
    for pair_index, tagalog_sen in enumerate(tagalog_sentences):
        english_ids = [
            english_word_dict[word]
            for word in english_sentences[pair_index].split(" ")
            if word != ''
        ]
        if not english_ids:
            # Without any English word the pair contributes no counts.
            continue
        tagalog_ids = [tagalog_word_dict[word] for word in tagalog_sen.split(" ")]
        indexed_pairs.append((tagalog_ids, english_ids))

    cnt_iter = 0
    while not Utils.is_converged(translate_matrix, previous_matrix, cnt_iter):
        cnt_iter += 1
        previous_matrix = translate_matrix.copy()
        pair_counts = np.zeros((n_tagalog, n_english), dtype=float)
        english_totals = np.zeros(n_english, dtype=float)

        # E-step: for all sentence pairs, spread each Tagalog word's mass
        # over the English words of the paired sentence.
        for tagalog_ids, english_ids in indexed_pairs:
            for tagalog_id in tagalog_ids:
                row = translate_matrix[tagalog_id]

                # compute normalization
                normalizer = 0.0
                for english_id in english_ids:
                    normalizer += row[english_id]

                # collect counts
                for english_id in english_ids:
                    share = row[english_id] / normalizer
                    pair_counts[tagalog_id, english_id] += share
                    english_totals[english_id] += share

        # M-step: estimate probabilities. Only pairs that collected counts
        # are overwritten; every other entry keeps its previous value. A
        # non-zero count implies a non-zero column total, so the masked
        # division never divides by zero.
        np.divide(
            pair_counts,
            english_totals,
            out=translate_matrix,
            where=pair_counts != 0,
        )

    print("EM Algorithm Converged in ",(cnt_iter-1)," iterations")
    return translate_matrix
75+
76+
77+
def get_translation_prob(e, f, t, e_dict, f_dict):
    """Score target sentence ``e`` against source sentence ``f``.

    Args:
        e: Sequence of target-language words.
        f: Sequence of source-language words.
        t: Translation table indexed as ``t[target_index][source_index]``.
        e_dict: Maps target words to their first index in ``t``.
        f_dict: Maps source words to their second index in ``t``.

    Returns:
        ``Utils.const / (len(f) + 1) ** len(e)`` multiplied, for every
        target word present in ``e_dict``, by the sum of its table entries
        over the source words present in ``f_dict``. Words missing from a
        dictionary are reported on stdout; a missing target word is
        skipped, a missing source word adds nothing to the sum.
    """
    target_len = len(e)
    source_len = len(f)
    probability = Utils.const / math.pow(source_len + 1, target_len)

    for target_word in e:
        if target_word not in e_dict:
            print("word '"+ target_word +"' is not found in target language dictionary")
            continue
        target_index = e_dict[target_word]

        alignment_mass = 0
        for source_word in f:
            if source_word not in f_dict:
                print("word '" + source_word +"' is not found in source language dictionary")
                continue
            alignment_mass += t[target_index][f_dict[source_word]]

        probability *= alignment_mass

    return probability

Main.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import ModelTrainer
2+
import ModelTester
3+
import DutchtoEnglish
4+
import EnglishtoDutch
5+
import nltk
6+
7+
# Fetch the Punkt tokenizer data used by the translation modules.
nltk.download('punkt')

# Interactive menu: keep prompting until the user picks option 5.
while True:
    try:
        mode = int(input('\n\nPlease choose what you want to do: \n\t'
                         '1: Train the Model\n\t'
                         '2: Test sentence to translate\n\t'
                         '3: Translate a tagalog document to English \n\t'
                         '4: Translate an English document to tagalog \n\t'
                         '5: For exit\n'))
    except ValueError:
        print("Not a number")
        # Re-prompt: falling through would use an undefined `mode` on the
        # first pass, or silently repeat the previous choice later on.
        continue

    if mode == 1:
        ModelTrainer.model_trainer()

    elif mode == 2:
        try:
            translate_option = int(input('Select translation option: \n\t1: tagalog to English \n\t2: English to tagalog\n'))
        except ValueError:
            print ("Not a number")
            # Same reasoning as above: never continue with an undefined or
            # stale `translate_option`.
            continue
        if translate_option > 2 or translate_option < 1 :
            print("Invalid Option")
            # An out-of-range option ends the whole program.
            exit()
        sentence_to_translate = input("Please provide sentence to translate: ")

        translated_sentence = ModelTester.test(sentence_to_translate,translate_option)
        print(translated_sentence)

    elif mode == 3: #translate tagalog document to English
        DutchtoEnglish.translate()

    elif mode == 4: #translate English document to tagalog
        EnglishtoDutch.translate()

    elif mode == 5:
        break

    else:
        print("invalid mode")

print("goodbye!")

ModelTester.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import numpy as np
2+
from nltk.tokenize import word_tokenize
3+
import string
4+
import IBM1_EM
5+
import Utils
6+
7+
8+
def get_tokens_of_sentence(sentence):
    """Return the lower-cased word tokens of ``sentence`` without punctuation.

    Every character listed in ``string.punctuation`` is deleted first; the
    remaining text is lower-cased and split with ``word_tokenize``.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(punctuation_remover)
    return word_tokenize(cleaned.lower())
13+
14+
def sentence_tester1(sentence_to_translate,translate_option):
    """Translate a sentence word by word with a trained lookup dictionary.

    Args:
        sentence_to_translate: Raw sentence text; it is tokenized with
            ``get_tokens_of_sentence``.
        translate_option: ``1`` looks words up in
            ``trained_data/tagalog_to_english_maximised.npy``; ``2`` looks
            them up in ``trained_data/english_to_tagalog_maximised.npy``.

    Returns:
        The translations of all known words, each followed by one space
        (so a non-empty result ends with a trailing space). Words missing
        from the dictionary are reported on stdout and skipped.

    Raises:
        ValueError: If ``translate_option`` is neither 1 nor 2.
    """
    if translate_option == 1:
        dictionary_path = "trained_data/tagalog_to_english_maximised.npy"
    elif translate_option == 2:
        dictionary_path = "trained_data/english_to_tagalog_maximised.npy"
    else:
        # Fail loudly before touching the disk instead of returning None.
        raise ValueError(
            "translate_option must be 1 or 2, got %r" % (translate_option,)
        )

    # Only the dictionary for the requested direction is read. It is read
    # from disk on every call (no caching).
    # NOTE(review): allow_pickle=True unpickles arbitrary objects -- only
    # load files produced by this project's own training step.
    word_map = np.load(dictionary_path, allow_pickle=True).item()

    translated_words = []
    for word in get_tokens_of_sentence(sentence_to_translate):
        if word in word_map:
            translated_words.append(word_map[word] + " ")
        else:
            print("word '"+ word +"' does not exist in trained language translation dictionary")
    return "".join(translated_words)
38+
39+
def test(sentence_to_translate, translate_option):
    """Translate one sentence by delegating to ``sentence_tester1``.

    Both arguments are forwarded unchanged and the result is returned
    as is.
    """
    translated = sentence_tester1(sentence_to_translate, translate_option)
    return translated

0 commit comments

Comments
 (0)