diff --git a/knesset_data/protocols/committee.py b/knesset_data/protocols/committee.py
index fc21f39..9d36774 100644
--- a/knesset_data/protocols/committee.py
+++ b/knesset_data/protocols/committee.py
@@ -4,7 +4,7 @@ from cached_property import cached_property
 
 import re
 import contextlib
-from .utils import fix_hyphens, get_people_list
+from .utils import fix_hyphens, get_people_list, get_speaker_list
 import six
 
 # solve issues with unicode for python3/2
@@ -141,6 +141,21 @@ def attendees(self):
 
         return None
 
+    @cached_property
+    def speakers(self):
+        """
+        Return the names of the people who spoke in this committee meeting.
+
+        The names are extracted from the protocol text by get_speaker_list;
+        an empty list is returned when the text is empty or is not a string.
+        """
+        # six.string_types covers str and unicode on Python 2 and str on
+        # Python 3, so the check does not need the Python-2-only ``unicode``.
+        if isinstance(self.text, six.string_types) and self.text:
+            return get_speaker_list(self.text)
+
+        return []
+
     @staticmethod
     def _get_committee_members(text):
         """
@@ -184,10 +199,6 @@ def _get_committee_manager(text):
         results.extend(get_people_list(text,u"מנהל הוועדה:"))
         return results
 
-
-
-
-
     @classmethod
     @contextlib.contextmanager
     def get_from_text(cls, text):
diff --git a/knesset_data/protocols/utils.py b/knesset_data/protocols/utils.py
index d4778cd..62ddae3 100644
--- a/knesset_data/protocols/utils.py
+++ b/knesset_data/protocols/utils.py
@@ -43,12 +43,14 @@ def antiword(filename):
     os.remove(filename+'.awdb.xml')
     return xmldata
 
+
 def fix_hyphens(text):
     return text.replace(u"\n\n–\n\n",u" – ")
 
+
 def get_people_list(text,token):
     lines = text.split("\n")
-    #find the start of the list
+    # find the start of the list
     start_index = 0
     end_index = 0
     found = False
@@ -64,3 +66,44 @@ def get_people_list(text,token):
             break
 
     return list(filter(lambda x: x and (len(x) > 0), lines[start_index +1 : end_index-1]) if found else [])
+
+
+def get_speaker_list(text, token=u'היו"ר'):
+    """
+    Return the names of the speakers found in a protocol text.
+
+    The text is passed through fix_hyphens and split into lines. Scanning
+    starts at the first line that contains both ``token`` and a colon; from
+    that line on, every line ending with a colon is taken as a speaker
+    heading, and the heading without its trailing colon is the name.
+
+    :param text: the protocol text
+    :param token: text identifying the line where the scan starts
+    :return: list of unique speaker names in order of first appearance, or
+        an empty list when no line contains both the token and a colon
+    """
+    lines = fix_hyphens(text).split("\n")
+    for start_index, line in enumerate(lines):
+        if token in line and ":" in line:
+            break
+    else:
+        return []
+
+    # Headings that are not speaker names. u"קריאותנ" is presumably a typo
+    # of u"קריאות" (TODO confirm against real protocols); both spellings are
+    # excluded. The empty name comes from a line holding only a colon.
+    excluded = (u"", u"קריאה", u"קריאות", u"קריאותנ")
+    seen = set()
+    speakers = []
+    for line in lines[start_index:]:
+        if not line.endswith(u":"):
+            continue
+        name = line[:-1]
+        if name in excluded or name in seen:
+            continue
+        seen.add(name)
+        speakers.append(name)
+
+    # A real list is returned on every path: on Python 3 map() and filter()
+    # produce one-shot iterators, which callers cannot iterate twice.
+    return speakers