|
1 | 1 | import numpy as np
|
2 |
| -from datetime import datetime |
3 | 2 | import math
|
4 | 3 | import Utils
|
5 | 4 |
|
6 | 5 |
|
7 |
# expectation maximization
def expect_max(pfil_word_dict, pen_word_dict, plst_fil_sen, plst_en_sen):
    """Train IBM Model 1 translation probabilities with Expectation Maximization.

    Args:
        pfil_word_dict: dict mapping each Filipino word to its row index.
        pen_word_dict: dict mapping each English word to its column index.
        plst_fil_sen: list of Filipino sentences (space-separated words),
            parallel to plst_en_sen.
        plst_en_sen: list of English sentences (space-separated words).

    Returns:
        A numpy array of shape (len(pfil_word_dict), len(pen_word_dict))
        where entry [fil][en] holds the estimated translation probability
        of the Filipino word given the English word.
    """
    fil_occur = len(pfil_word_dict)
    en_occur = len(pen_word_dict)

    # IBM1 Expectation Maximization algorithm.
    # Uniform initialization: every probability starts at 1 / |English vocab|.
    trans_en_fil_matrix = np.full((fil_occur, en_occur), 1 / en_occur, dtype=float)
    # "Previous" matrix starts as all ones so the first convergence test fails.
    trans_en_fil_matrix_prev = np.full((fil_occur, en_occur), 1, dtype=float)

    int_count = 0
    while not Utils.is_converged(trans_en_fil_matrix, trans_en_fil_matrix_prev, int_count):
        int_count += 1

        # Keep the current matrix as the old one for the next convergence test.
        trans_en_fil_matrix_prev = trans_en_fil_matrix.copy()

        # Expected counts accumulated over all sentence pairs this iteration.
        total_enfil = np.zeros((fil_occur, en_occur), dtype=float)
        total_fin = np.zeros(en_occur, dtype=float)

        for int_index, lst_fil_sen in enumerate(plst_fil_sen):  # for all sentence pairs (e,f) do
            lst_fil_words = lst_fil_sen.split(" ")
            # Hoisted out of the per-Filipino-word loop: the English sentence
            # does not depend on the Filipino word. Empty tokens are skipped,
            # exactly as the original's `if str_en_word == '': continue` did.
            lst_en_words = [w for w in plst_en_sen[int_index].split(" ") if w != '']

            # Compute the normalization s_total for each Filipino word position.
            total_sen = np.zeros(len(lst_fil_words), dtype=float)
            for int_index2, str_fil_word in enumerate(lst_fil_words):
                int_index_fildict = pfil_word_dict[str_fil_word]
                for str_en_word in lst_en_words:
                    total_sen[int_index2] += trans_en_fil_matrix[int_index_fildict][pen_word_dict[str_en_word]]
            # end for

            # Collect fractional counts.
            # NOTE(review): if total_sen[i] is 0 (e.g. the English sentence has
            # no non-empty words) this divides by zero, same as the original —
            # confirm callers never pass empty sentences.
            for int_index2, str_fil_word in enumerate(lst_fil_words):  # for all words e in e do
                int_index_fildict = pfil_word_dict[str_fil_word]
                for str_en_word in lst_en_words:  # for all words f in f do
                    int_index_endict = pen_word_dict[str_en_word]
                    # Fractional count: computed once, added to both tallies.
                    delta = trans_en_fil_matrix[int_index_fildict][int_index_endict] / total_sen[int_index2]
                    total_enfil[int_index_fildict][int_index_endict] += delta
                    total_fin[int_index_endict] += delta
            # end for
        # end for

        # Estimate probabilities: t = count(fil, en) / count(en).
        for int_en_index in range(en_occur):  # for all foreign words f do
            for int_fil_index in range(fil_occur):  # for all English words e do
                if total_enfil[int_fil_index][int_en_index] != 0:
                    trans_en_fil_matrix[int_fil_index][int_en_index] = total_enfil[int_fil_index][int_en_index] / total_fin[int_en_index]
            # end for
        # end for
    # end while

    print("EM Algorithm Converged in ", (int_count - 1), " iterations")
    return trans_en_fil_matrix
75 | 85 |
|
76 | 86 |
|
77 |
| -def get_translation_prob(e,f,t,e_dict,f_dict): |
| 87 | +def get_translation_prob(e, f, t, e_dict, f_dict): |
78 | 88 | const = Utils.const
|
79 | 89 | l_e = len(e)
|
80 | 90 | l_f = len(f)
|
|
0 commit comments