Merge branch 'RavivBarzilay-committee-speaker'

hasadna · Nov 25, 2017 · 15f5c05 · 15f5c05
2 parents 649e109 + 14946d8
commit 15f5c05
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 6 deletions.
diff --git a/knesset_data/protocols/committee.py b/knesset_data/protocols/committee.py
@@ -4,7 +4,7 @@
 from cached_property import cached_property
 import re
 import contextlib
-from .utils import fix_hyphens, get_people_list
+from .utils import fix_hyphens, get_people_list, get_speaker_list
 import six
 
 # solve issues with unicode for python3/2
@@ -141,6 +141,16 @@ def attendees(self):
 
         return None
 
+    @cached_property
+    def speakers(self):
+        """
+        finds the people who spoke in this committee meeting
+
+        :return: list of the speaker labels that get_speaker_list extracts
+                 from self.text; an empty list when self.text is empty or
+                 is not a string
+        """
+        if isinstance(self.text, (str, unicode)) and self.text:
+            # cached_property stores whatever is returned here, so make sure
+            # it is a real list: a lazy filter/map object (what the builtins
+            # produce on python 3) would be exhausted by the first iteration
+            # and the cached value would look empty on every later access
+            return list(get_speaker_list(self.text))
+
+        return []
+
     @staticmethod
     def _get_committee_members(text):
         """
@@ -184,10 +194,6 @@ def _get_committee_manager(text):
         results.extend(get_people_list(text,u"מנהל הוועדה:"))
         return results
 
-
-
-
-
     @classmethod
     @contextlib.contextmanager
     def get_from_text(cls, text):

diff --git a/knesset_data/protocols/utils.py b/knesset_data/protocols/utils.py
@@ -43,12 +43,14 @@ def antiword(filename):
     os.remove(filename+'.awdb.xml')
     return xmldata
 
+
 def fix_hyphens(text):
+    """
+    Return text with every dash that sits alone between two blank lines
+    (newline, newline, dash, newline, newline) replaced by the same dash
+    surrounded by single spaces, which joins the two paragraphs around it
+    into one line.
+    """
     return text.replace(u"\n\n–\n\n",u" – ")
 
+
 def get_people_list(text,token):
     lines = text.split("\n")
-    #find the start of the list
+    # find the start of the list
     start_index = 0
     end_index = 0
     found = False
@@ -64,3 +66,24 @@ def get_people_list(text,token):
                 break
 
     return list(filter(lambda x: x and (len(x) > 0), lines[start_index +1 : end_index-1]) if found else [])
+
+
+def get_speaker_list(text, token=u'היו"ר'):
+    """
+    Collect the speaker labels that appear in a protocol text.
+
+    The text is passed through fix_hyphens and split on newlines. Scanning
+    starts at the first line that contains both `token` and a colon; from
+    that line onwards every non-empty line that ends with a colon is taken
+    as a speaker label (the trailing colon is dropped).
+
+    :param text: the protocol text
+    :param token: text that identifies the first speaker line
+    :return: list of unique speaker labels in order of first appearance,
+             or an empty list when no line matches `token`
+    """
+    # labels that are dropped from the result
+    # NOTE(review): presumably these mark anonymous interjections rather
+    # than named speakers. The last entry is the literal the previous code
+    # compared against; it looks like a typo of the plural form before it
+    # and is kept only so nothing that used to be excluded starts to appear.
+    ignored = (u"קריאה", u"קריאות", u"קריאותנ")
+    lines = fix_hyphens(text).split("\n")
+    start_index = None
+    for i, line in enumerate(lines):
+        if token in line and ":" in line:
+            start_index = i
+            break
+    if start_index is None:
+        return []
+
+    # build a real list instead of chaining filter/map: on python 3 those
+    # return one-shot iterators, which do not match the [] returned above
+    # and cannot be iterated twice. Keeping first-appearance order (instead
+    # of going through a set) makes the output deterministic.
+    speakers = []
+    seen = set()
+    for line in lines[start_index:]:
+        if not line or line[-1] != u':':
+            continue
+        name = line[:-1]
+        if name in ignored or name in seen:
+            continue
+        seen.add(name)
+        speakers.append(name)
+    return speakers