Skip to content

Commit

Permalink
Merge branch 'RavivBarzilay-committee-speaker'
Browse files Browse the repository at this point in the history
  • Loading branch information
OriHoch committed Nov 25, 2017
2 parents 649e109 + 14946d8 commit 15f5c05
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 6 deletions.
16 changes: 11 additions & 5 deletions knesset_data/protocols/committee.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from cached_property import cached_property
import re
import contextlib
from .utils import fix_hyphens, get_people_list
from .utils import fix_hyphens, get_people_list, get_speaker_list
import six

# solve issues with unicode for python3/2
Expand Down Expand Up @@ -141,6 +141,16 @@ def attendees(self):

return None

@cached_property
def speakers(self):
    """
    names of the people who spoke in this committee meeting

    delegates to get_speaker_list when the protocol text is a non-empty
    string, otherwise returns an empty list
    """
    text = self.text
    # anything that is not a non-empty string cannot be parsed for speakers
    if not isinstance(text, (str, unicode)) or not text:
        return []

    return get_speaker_list(text)

@staticmethod
def _get_committee_members(text):
"""
Expand Down Expand Up @@ -184,10 +194,6 @@ def _get_committee_manager(text):
results.extend(get_people_list(text,u"מנהל הוועדה:"))
return results





@classmethod
@contextlib.contextmanager
def get_from_text(cls, text):
Expand Down
25 changes: 24 additions & 1 deletion knesset_data/protocols/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,14 @@ def antiword(filename):
os.remove(filename+'.awdb.xml')
return xmldata


def fix_hyphens(text):
    """
    replace every run of four consecutive newlines in the text with a
    space-padded dash and return the resulting text
    """
    newline_run = u"\n\n\n\n"
    padded_dash = u" – "
    fixed_text = text.replace(newline_run, padded_dash)
    return fixed_text


def get_people_list(text,token):
lines = text.split("\n")
#find the start of the list
# find the start of the list
start_index = 0
end_index = 0
found = False
Expand All @@ -64,3 +66,24 @@ def get_people_list(text,token):
break

return list(filter(lambda x: x and (len(x) > 0), lines[start_index +1 : end_index-1]) if found else [])


def get_speaker_list(text, token=u'היו"ר'):
    """
    find the names of the people who spoke in a protocol text

    the text is passed through fix_hyphens and split into lines; scanning
    starts at the first line that contains both the token and a colon.
    from that line onwards every non-empty line that ends with a colon is
    treated as a speaker heading, and the name is the line without that
    trailing colon.

    :param text: the protocol text to scan
    :param token: marker that identifies the first speaker line
    :return: list of unique speaker names in order of first appearance,
        or an empty list when no line contains both the token and a colon
    """
    lines = fix_hyphens(text).split("\n")

    # index of the first line that opens the discussion, None if there is none
    start_index = next(
        (i for i, line in enumerate(lines) if token in line and ":" in line),
        None,
    )
    if start_index is None:
        return []

    # headings that are excluded from the result
    # NOTE(review): the second entry looks like a typo (trailing letter);
    # kept byte-identical here - confirm the intended heading before changing
    ignored_headings = (u"קריאה", u"קריאותנ")

    # always build a real list: on python 3 map/filter return one-shot
    # iterators, which would be exhausted after the first iteration and
    # would not match the list returned when nothing is found
    speakers = []
    seen = set()
    for line in lines[start_index:]:
        if not line or line[-1] != u':':
            continue
        name = line[:-1]
        if name in ignored_headings or name in seen:
            continue
        seen.add(name)
        speakers.append(name)

    return speakers

0 comments on commit 15f5c05

Please sign in to comment.