1
+ import numpy as np
2
+ from datetime import datetime
3
+ import math
4
+ import Utils
5
+
6
+
7
def expectation_maximization(tagalog_word_dict, english_word_dict, tagalog_sentences, english_sentences):
    """Estimate a word-translation table with the IBM Model 1 EM algorithm.

    Args:
        tagalog_word_dict: Mapping from Tagalog word to its row index in the
            returned matrix.
        english_word_dict: Mapping from English word to its column index in
            the returned matrix.
        tagalog_sentences: Sentences as single-space-separated strings.
        english_sentences: Sentences as single-space-separated strings,
            indexed in parallel with ``tagalog_sentences``. Empty tokens
            (produced by repeated or trailing spaces) are ignored on the
            English side only.

    Returns:
        A float array of shape ``(len(tagalog_word_dict),
        len(english_word_dict))``. Cell ``[t, e]`` is the expected count of
        the pair divided by the total expected count of English word ``e``.
        Cells whose expected count was zero in an iteration keep the value
        they had before that iteration (initially ``1 / len(english_word_dict)``).

    Raises:
        ZeroDivisionError: If ``english_word_dict`` is empty.
        KeyError: If a sentence contains a word missing from its dictionary.
        IndexError: If ``english_sentences`` is shorter than
            ``tagalog_sentences``.
    """
    n_tagalog = len(tagalog_word_dict)
    n_english = len(english_word_dict)

    # Uniform start; the "previous" table is filled with 1 so that it differs
    # from the initial table on the first convergence check.
    translation = np.full((n_tagalog, n_english), 1 / n_english, dtype=float)
    previous = np.full((n_tagalog, n_english), 1, dtype=float)

    # Tokenising and dictionary lookups do not depend on the EM iteration, so
    # resolve every sentence pair to index lists once, up front.
    sentence_pairs = []
    for pair_idx, tagalog_sen in enumerate(tagalog_sentences):
        eng_ids = [
            english_word_dict[word]
            for word in english_sentences[pair_idx].split(" ")
            if word != ''
        ]
        if not eng_ids:
            # No English tokens: this pair contributes no counts at all.
            continue
        tag_ids = [tagalog_word_dict[word] for word in tagalog_sen.split(" ")]
        sentence_pairs.append((tag_ids, eng_ids))

    cnt_iter = 0
    while not Utils.is_converged(translation, previous, cnt_iter):
        cnt_iter += 1
        previous = translation.copy()
        counts = np.zeros((n_tagalog, n_english), dtype=float)
        totals = np.zeros(n_english, dtype=float)

        # E-step: distribute each Tagalog token's unit of mass over the
        # English tokens of the paired sentence.
        for tag_ids, eng_ids in sentence_pairs:
            for tag_id in tag_ids:
                prob_row = translation[tag_id]
                count_row = counts[tag_id]  # view: updates land in `counts`

                # Normalisation term for this Tagalog token.
                norm = 0.0
                for eng_id in eng_ids:
                    norm += prob_row[eng_id]

                # Collect fractional counts.
                for eng_id in eng_ids:
                    delta = prob_row[eng_id] / norm
                    count_row[eng_id] += delta
                    totals[eng_id] += delta

        # M-step: re-estimate only the cells that received counts; the mask
        # also keeps 0/0 from being evaluated for unseen English words.
        np.divide(counts, totals, out=translation, where=counts != 0)

    print("EM Algorithm Converged in ", (cnt_iter - 1), " iterations")
    return translation
75
+
76
+
77
def get_translation_prob(e, f, t, e_dict, f_dict):
    """Score target sentence ``e`` against source sentence ``f``.

    Computes ``Utils.const / (len(f) + 1) ** len(e)`` multiplied, for every
    target word found in ``e_dict``, by the sum of ``t[e_j][f_i]`` over the
    source words found in ``f_dict``.

    Args:
        e: Sequence of target-language words.
        f: Sequence of source-language words.
        t: Translation table indexed as ``t[target_index][source_index]``.
        e_dict: Mapping from target word to its index in ``t``.
        f_dict: Mapping from source word to its index in ``t``.

    Returns:
        The resulting score. Target words missing from ``e_dict`` are reported
        on stdout and contribute no factor (they still count towards
        ``len(e)`` in the normaliser). Source words missing from ``f_dict``
        are reported and add nothing to the inner sum. If the normaliser is
        too large for a float, the score is 0.0.
    """
    const = Utils.const
    l_e = len(e)
    l_f = len(f)

    # math.pow raises OverflowError for long sentence pairs; the score is
    # vanishingly small there, so treat the normaliser as infinite rather
    # than crashing.
    try:
        normalizer = math.pow(l_f + 1, l_e)
    except OverflowError:
        normalizer = float("inf")
    res = const / normalizer

    for e_word in e:
        if e_word not in e_dict:
            print("word '" + e_word + "' is not found in target language dictionary")
            continue
        e_j = e_dict[e_word]

        # Sum over the known source words for this target word.
        word_total = 0
        for f_word in f:
            if f_word in f_dict:
                f_i = f_dict[f_word]
                word_total += t[e_j][f_i]
            else:
                print("word '" + f_word + "' is not found in source language dictionary")

        res *= word_total

    return res
0 commit comments