JakeCob
diff --git a/‎DutchtoEnglish.py
+51 b/‎DutchtoEnglish.py
+51
diff --git a/‎English.txt
+20,106 b/‎English.txt
+20,106
diff --git a/‎EnglishtoDutch.py
+51 b/‎EnglishtoDutch.py
+51
diff --git a/‎IBM1_EM.py
+103 b/‎IBM1_EM.py
+103
diff --git a/‎Main.py
+47 b/‎Main.py
+47
diff --git a/‎ModelTester.py
+40 b/‎ModelTester.py
+40
@@ -0,0 +1,51 @@
+from nltk.tokenize import word_tokenize
+import nltk.data
+import string
+import ModelTrainer
+import ModelTester
+import Utils
+
+def sentence_tokenizer(sentence_list) :
+    final_list = list()
+    index = 0
+    for sen in sentence_list:
+        if index == 0 :
+            sen = sen.replace(u'\ufeff', '')
+            index += 1
+
+        tokens = word_tokenize(sen.lower())
+
+        output_sentence = ""
+
+        for token in tokens :
+            output_sentence += token + " "
+        
+        output_sentence = output_sentence[:(len(output_sentence)-1)]  #remove last space
+        final_list.append(output_sentence)    
+
+    final_list[0] = final_list[0].replace(u'\ufeff', '')  # ufeff character from document start
+    return final_list    
+
+
+def translate() :
+    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
+    final_output = ""
+    with open("input.txt") as f:
+        tagalog_data = f.readlines()
+
+    tagalog_lines = sentence_tokenizer(tagalog_data)
+
+    tagalog_sentences = list()
+    for line in tagalog_lines :
+        l = tokenizer.tokenize(line)
+        for sen in l :
+            tagalog_sentences.append(sen)
+
+    out_file = open("output.txt", "w+")
+    for index in range(len(tagalog_sentences)) :
+        current_sen = tagalog_sentences[index]
+        curr_translated_sen = ModelTester.sentence_tester1(current_sen, 1)
+        out_file.write(curr_translated_sen)
+        out_file.write(". ")
+        
+    print("Successfully translated! Translated document is 'output.txt' ")
@@ -0,0 +1,51 @@
+from nltk.tokenize import word_tokenize
+import nltk.data
+import string
+import ModelTrainer
+import ModelTester
+import Utils
+
+def sentence_tokenizer(sentence_list) :
+    final_list = list()
+    index = 0
+    for sen in sentence_list:
+        if index == 0 :
+            sen = sen.replace(u'\ufeff', '')
+            index += 1
+
+        tokens = word_tokenize(sen.lower())
+
+        output_sentence = ""
+
+        for token in tokens :
+            output_sentence += token + " "
+        
+        output_sentence = output_sentence[:(len(output_sentence)-1)]  #remove last space
+        final_list.append(output_sentence)    
+
+    final_list[0] = final_list[0].replace(u'\ufeff', '')  # ufeff character from document start
+    return final_list    
+
+
+def translate() :
+    tokenizer = nltk.data.load('tokenizers/punkt/tagalog.pickle')
+    final_output = ""
+    with open("input.txt") as f:
+        english_data = f.readlines()
+
+    english_lines = sentence_tokenizer(english_data)
+
+    english_sentences = list()
+    for line in english_lines :
+        l = tokenizer.tokenize(line)
+        for sen in l :
+            english_sentences.append(sen)
+
+    out_file = open("output.txt", "w+")
+    for index in range(len(english_sentences)) :
+        current_sen = english_sentences[index]
+        curr_translated_sen = ModelTester.sentence_tester1(current_sen, 2)
+        out_file.write(curr_translated_sen)
+        out_file.write(". ")
+        
+    print("Successfully translated! Translated document is 'output.txt' ")
@@ -0,0 +1,103 @@
+import numpy as np
+from datetime import datetime
+import math
+import Utils
+
+
+def expectation_maximization(tagalog_word_dict,english_word_dict,tagalog_sentences,english_sentences):
+    total_tagalog_ocurrences = len(tagalog_word_dict)
+    total_eng_occurrences = len(english_word_dict)
+
+    # IBM1 Expectaion Maximization algorithm :
+    translate_eng_tagalog_matrix = np.full((len(tagalog_word_dict), len(english_word_dict)), 1 / len(english_word_dict),dtype=float)
+    translate_eng_tagalog_matrix_prev = np.full((len(tagalog_word_dict), len(english_word_dict)), 1,dtype=float)
+
+    cnt_iter = 0
+    while not Utils.is_converged(translate_eng_tagalog_matrix,translate_eng_tagalog_matrix_prev,cnt_iter) :
+        cnt_iter += 1
+        translate_eng_tagalog_matrix_prev = translate_eng_tagalog_matrix.copy()
+        total_eng_tagalog = np.full((len(tagalog_word_dict), len(english_word_dict)), 0, dtype=float)
+        total_f = np.full((len(english_word_dict)),0, dtype=float)
+
+        for marker_tur, tagalog_sen in enumerate(tagalog_sentences): #for all sentence pairs (e,f) do
+            #compute normalization
+            tagalog_sen_words = tagalog_sen.split(" ")
+            s_total = np.full((len(tagalog_sen_words)),0,dtype=float)
+
+            for marker_word in range(len(tagalog_sen_words)): #for all words e in e do
+                tagalog_word = tagalog_sen_words[marker_word]
+                s_total[marker_word] = 0
+                eng_sen_words = english_sentences[marker_tur].split(" ")
+
+                for eng_word in eng_sen_words: #for all words f in f do
+                    if eng_word == '' :
+                        continue 
+                    marker_tagalog_in_dict =tagalog_word_dict[tagalog_word]
+                    marker_eng_in_dict = english_word_dict[eng_word]
+                    s_total[marker_word] += translate_eng_tagalog_matrix[marker_tagalog_in_dict][marker_eng_in_dict]
+                #end for
+            #end for
+
+            #collect counts
+            tagalog_sen_words = tagalog_sen.split(" ")
+
+            for marker_word in range(len(tagalog_sen_words)): #for all words e in e do
+                tagalog_word = tagalog_sen_words[marker_word]
+                eng_sen_words = english_sentences[marker_tur].split(" ")
+
+                for eng_word in eng_sen_words: #for all words f in f do
+                    if eng_word == '' :
+                        continue
+                    marker_tagalog_in_dict =tagalog_word_dict[tagalog_word]
+                    marker_eng_in_dict = english_word_dict[eng_word]
+                    total_eng_tagalog[marker_tagalog_in_dict][marker_eng_in_dict] += translate_eng_tagalog_matrix[marker_tagalog_in_dict][marker_eng_in_dict] / s_total[marker_word]
+                    total_f[marker_eng_in_dict] += translate_eng_tagalog_matrix[marker_tagalog_in_dict][marker_eng_in_dict] / s_total[marker_word]
+                #end for
+            #end for
+        #end for
+
+        #estimate probabilities
+        for eng_marker in  range(total_eng_occurrences): #for all foreign words f do
+
+            for tagalog_marker in range(total_tagalog_ocurrences): #for all English words e do
+
+                if total_eng_tagalog[tagalog_marker][eng_marker] != 0 :
+                    translate_eng_tagalog_matrix[tagalog_marker][eng_marker] = total_eng_tagalog[tagalog_marker][eng_marker] / total_f[eng_marker]
+
+            #end for
+            
+        #end for
+
+    #end while
+
+    print("EM Algorithm Converged in ",(cnt_iter-1)," iterations")
+    return translate_eng_tagalog_matrix
+
+
+def get_translation_prob(e,f,t,e_dict,f_dict):
+    const = Utils.const
+    l_e = len(e)
+    l_f = len(f)
+    res = const / math.pow((l_f+1),l_e)
+    for j in range(l_e):
+        e_word = e[j]
+        if e_word in e_dict:
+            e_j = e_dict[e_word]
+        else:
+            print("word '"+ e_word +"' is not found in target language dictionary")
+            continue
+            #return 0
+
+        sum = 0
+        for i in range(l_f):
+            f_word = f[i]
+
+            if f_word in f_dict:
+                f_i = f_dict[f_word]
+                sum += t[e_j][f_i]
+            else:
+                print("word '" + f_word  +"' is not found in source language dictionary")
+
+        res *= sum
+
+    return res
@@ -0,0 +1,47 @@
+import ModelTrainer
+import ModelTester
+import DutchtoEnglish
+import EnglishtoDutch
+import nltk
+
+nltk.download('punkt')
+while True:
+    try:
+        mode = int(input('\n\nPlease choose what you want to do: \n\t'
+                         '1: Train the Model\n\t'
+                         '2: Test sentence to translate\n\t'
+                         '3: Translate a tagalog document to English \n\t'
+                         '4: Translate an English document to tagalog \n\t'
+                         '5: For exit\n'))
+    except ValueError:
+        print("Not a number")
+
+    if mode == 1:
+        ModelTrainer.model_trainer()
+
+    elif mode == 2:
+        try:
+            translate_option = int(input('Select translation option: \n\t1: tagalog to English \n\t2: English to tagalog\n'))
+        except ValueError:
+            print ("Not a number")
+        if translate_option > 2 or translate_option < 1 :
+            print("Invalid Option")
+            exit()
+        sentence_to_translate = input("Please provide sentence to translate: ")
+
+        translated_sentence = ModelTester.test(sentence_to_translate,translate_option)
+        print(translated_sentence)
+
+    elif mode == 3:             #translate tagalog document to English
+        DutchtoEnglish.translate()
+
+    elif mode == 4:             #translate English document to tagalog
+        EnglishtoDutch.translate()
+
+    elif mode == 5:
+        break
+
+    else:
+        print("invalid mode")
+
+print("goodbye!")
@@ -0,0 +1,40 @@
+import numpy as np
+from nltk.tokenize import word_tokenize
+import string
+import IBM1_EM
+import Utils
+
+
+def get_tokens_of_sentence(sentence):
+    translate_table = dict((ord(char), None) for char in string.punctuation)
+    sentence = sentence.translate(translate_table)
+    tokens = word_tokenize(sentence.lower())
+    return tokens
+
+def sentence_tester1(sentence_to_translate,translate_option):
+    tagalog_to_english_maximised = np.load("trained_data/tagalog_to_english_maximised.npy",allow_pickle = True).item()
+    english_to_tagalog_maximised = np.load("trained_data/english_to_tagalog_maximised.npy",allow_pickle = True).item()
+
+    if translate_option == 1:
+        f_sentence = get_tokens_of_sentence(sentence_to_translate)
+        e_sentence = ""
+        for word in f_sentence :
+            if word in tagalog_to_english_maximised:
+                e_sentence = e_sentence + tagalog_to_english_maximised[word] + " "
+            else:
+                print("word '"+ word +"' does not exist in trained language translation dictionary")
+                continue
+        return e_sentence
+    elif translate_option == 2:
+        e_sentence = get_tokens_of_sentence(sentence_to_translate)
+        f_sentence = ""
+        for word in e_sentence :
+            if word in english_to_tagalog_maximised:
+                f_sentence = f_sentence + english_to_tagalog_maximised[word] + " "
+            else:
+                print("word '"+ word +"' does not exist in trained language translation dictionary")
+                continue
+        return f_sentence
+
+def test(sentence_to_translate,translate_option):
+    return sentence_tester1(sentence_to_translate,translate_option)