Skip to content

Commit f065ae6

Browse files
authored Aug 2, 2022
Add files via upload
1 parent bc62239 commit f065ae6

File tree

1 file changed

+59
-49
lines changed

1 file changed

+59
-49
lines changed
 

‎IBM1_EM.py

+59-49
Original file line number | Diff line number | Diff line change
@@ -1,80 +1,90 @@
11
import numpy as np
2-
from datetime import datetime
32
import math
43
import Utils
54

65

7-
def expectation_maximization(tagalog_word_dict,english_word_dict,tagalog_sentences,english_sentences):
8-
total_tagalog_ocurrences = len(tagalog_word_dict)
9-
total_eng_occurrences = len(english_word_dict)
10-
11-
# IBM1 Expectation Maximization algorithm :
12-
translate_eng_tagalog_matrix = np.full((len(tagalog_word_dict), len(english_word_dict)), 1 / len(english_word_dict),dtype=float)
13-
translate_eng_tagalog_matrix_prev = np.full((len(tagalog_word_dict), len(english_word_dict)), 1,dtype=float)
14-
15-
cnt_iter = 0
16-
while not Utils.is_converged(translate_eng_tagalog_matrix,translate_eng_tagalog_matrix_prev,cnt_iter) :
17-
cnt_iter += 1
18-
translate_eng_tagalog_matrix_prev = translate_eng_tagalog_matrix.copy()
19-
total_eng_tagalog = np.full((len(tagalog_word_dict), len(english_word_dict)), 0, dtype=float)
20-
total_f = np.full((len(english_word_dict)),0, dtype=float)
21-
22-
for marker_tur, tagalog_sen in enumerate(tagalog_sentences): #for all sentence pairs (e,f) do
23-
#compute normalization
24-
tagalog_sen_words = tagalog_sen.split(" ")
25-
s_total = np.full((len(tagalog_sen_words)),0,dtype=float)
26-
27-
for marker_word in range(len(tagalog_sen_words)): #for all words e in e do
28-
tagalog_word = tagalog_sen_words[marker_word]
29-
s_total[marker_word] = 0
30-
eng_sen_words = english_sentences[marker_tur].split(" ")
31-
32-
for eng_word in eng_sen_words: #for all words f in f do
33-
if eng_word == '' :
34-
continue
35-
marker_tagalog_in_dict =tagalog_word_dict[tagalog_word]
36-
marker_eng_in_dict = english_word_dict[eng_word]
37-
s_total[marker_word] += translate_eng_tagalog_matrix[marker_tagalog_in_dict][marker_eng_in_dict]
6+
# expectation maximization
7+
def expect_max(pfil_word_dict, pen_word_dict, plst_fil_sen, plst_en_sen):
8+
fil_occur = len(pfil_word_dict)
9+
en_occur = len(pen_word_dict)
10+
11+
# IBM1 Expectation Maximization algorithm
12+
trans_en_fil_matrix = np.full((len(pfil_word_dict), len(pen_word_dict)), 1 / len(pen_word_dict), dtype=float)
13+
trans_en_fil_matrix_prev = np.full((len(pfil_word_dict), len(pen_word_dict)), 1, dtype=float)
14+
15+
int_count = 0
16+
while not Utils.is_converged(trans_en_fil_matrix, trans_en_fil_matrix_prev, int_count):
17+
int_count += 1
18+
19+
# keep a copy of the current matrix as the previous iteration's matrix (used by the convergence check)
20+
trans_en_fil_matrix_prev = trans_en_fil_matrix.copy()
21+
22+
# initializing the count matrix (one row per pfil_word_dict entry, one column per pen_word_dict entry) to 0
23+
total_enfil = np.full((len(pfil_word_dict), len(pen_word_dict)), 0, dtype=float)
24+
25+
# initializing the per-English-word totals (one entry per pen_word_dict entry) used to normalize the counts
26+
total_fin = np.full((len(pen_word_dict)),0, dtype=float)
27+
28+
for int_index, lst_fil_sen in enumerate(plst_fil_sen): # for all sentence pairs (e,f) do
29+
# computing for the normalization
30+
lst_fil_words = lst_fil_sen.split(" ")
31+
total_sen = np.full((len(lst_fil_words)), 0, dtype=float)
32+
33+
# for all words in the filipino list of words
34+
for int_index2 in range(len(lst_fil_words)):
35+
str_fil_word = lst_fil_words[int_index2]
36+
total_sen[int_index2] = 0
37+
lst_en_words = plst_en_sen[int_index].split(" ")
38+
39+
# for all words in the paired English sentence
40+
for str_en_word in lst_en_words:
41+
# skip empty tokens (split(" ") yields '' for leading, trailing, or repeated spaces)
42+
if str_en_word == '':
43+
continue
44+
45+
int_index_fildict = pfil_word_dict[str_fil_word]
46+
int_index_endict = pen_word_dict[str_en_word]
47+
total_sen[int_index2] += trans_en_fil_matrix[int_index_fildict][int_index_endict]
3848
#end for
3949
#end for
4050

4151
#collect counts
42-
tagalog_sen_words = tagalog_sen.split(" ")
52+
lst_fil_words = lst_fil_sen.split(" ")
4353

44-
for marker_word in range(len(tagalog_sen_words)): #for all words e in e do
45-
tagalog_word = tagalog_sen_words[marker_word]
46-
eng_sen_words = english_sentences[marker_tur].split(" ")
54+
for int_index2 in range(len(lst_fil_words)): #for all words e in e do
55+
str_fil_word = lst_fil_words[int_index2]
56+
lst_en_words = plst_en_sen[int_index].split(" ")
4757

48-
for eng_word in eng_sen_words: #for all words f in f do
49-
if eng_word == '' :
58+
for str_en_word in lst_en_words: #for all words f in f do
59+
if str_en_word == '' :
5060
continue
51-
marker_tagalog_in_dict =tagalog_word_dict[tagalog_word]
52-
marker_eng_in_dict = english_word_dict[eng_word]
53-
total_eng_tagalog[marker_tagalog_in_dict][marker_eng_in_dict] += translate_eng_tagalog_matrix[marker_tagalog_in_dict][marker_eng_in_dict] / s_total[marker_word]
54-
total_f[marker_eng_in_dict] += translate_eng_tagalog_matrix[marker_tagalog_in_dict][marker_eng_in_dict] / s_total[marker_word]
61+
int_index_fildict = pfil_word_dict[str_fil_word]
62+
int_index_endict = pen_word_dict[str_en_word]
63+
total_enfil[int_index_fildict][int_index_endict] += trans_en_fil_matrix[int_index_fildict][int_index_endict] / total_sen[int_index2]
64+
total_fin[int_index_endict] += trans_en_fil_matrix[int_index_fildict][int_index_endict] / total_sen[int_index2]
5565
#end for
5666
#end for
5767
#end for
5868

5969
#estimate probabilities
60-
for eng_marker in range(total_eng_occurrences): #for all foreign words f do
70+
for int_en_index in range(en_occur): # for all foreign words f do
6171

62-
for tagalog_marker in range(total_tagalog_ocurrences): #for all English words e do
72+
for int_fil_index in range(fil_occur): # for all English words e do
6373

64-
if total_eng_tagalog[tagalog_marker][eng_marker] != 0 :
65-
translate_eng_tagalog_matrix[tagalog_marker][eng_marker] = total_eng_tagalog[tagalog_marker][eng_marker] / total_f[eng_marker]
74+
if total_enfil[int_fil_index][int_en_index] != 0 :
75+
trans_en_fil_matrix[int_fil_index][int_en_index] = total_enfil[int_fil_index][int_en_index] / total_fin[int_en_index]
6676

6777
#end for
6878

6979
#end for
7080

7181
#end while
7282

73-
print("EM Algorithm Converged in ",(cnt_iter-1)," iterations")
74-
return translate_eng_tagalog_matrix
83+
print("EM Algorithm Converged in ",(int_count-1)," iterations")
84+
return trans_en_fil_matrix
7585

7686

77-
def get_translation_prob(e,f,t,e_dict,f_dict):
87+
def get_translation_prob(e, f, t, e_dict, f_dict):
7888
const = Utils.const
7989
l_e = len(e)
8090
l_f = len(f)

0 commit comments

Comments
 (0)
Please sign in to comment.