From 115c110209ba4f134f7da0535dacc23c29d78b84 Mon Sep 17 00:00:00 2001 From: Chenhao Tan Date: Tue, 18 Oct 2016 11:41:13 -0700 Subject: [PATCH] update parsing scripts to work for stanford-corenlp-3.6.0 --- client.py | 4 +-- corenlp.py | 88 +++++++++++++++++++++++++++++------------------------- 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/client.py b/client.py index 11097ce..5339967 100644 --- a/client.py +++ b/client.py @@ -6,7 +6,7 @@ class StanfordNLP: def __init__(self): self.server = ServerProxy(JsonRpc20(), TransportTcpIp(addr=("127.0.0.1", 8080))) - + def parse(self, text): return json.loads(self.server.parse(text)) @@ -15,5 +15,5 @@ def parse(self, text): pprint(result) from nltk.tree import Tree -tree = Tree.parse(result['sentences'][0]['parsetree']) +tree = Tree.fromstring(result['sentences'][0]['parsetree']) pprint(tree) diff --git a/corenlp.py b/corenlp.py index 753e51c..7350ad4 100644 --- a/corenlp.py +++ b/corenlp.py @@ -3,17 +3,17 @@ # corenlp - Python interface to Stanford Core NLP tools # Copyright (c) 2014 Dustin Smith # https://github.com/dasmith/stanford-corenlp-python -# +# # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. @@ -74,30 +74,33 @@ def parse_parser_results(text): state = STATE_START for line in text.encode('utf-8').split("\n"): line = line.strip() - + if line.startswith("Sentence #"): sentence = {'words':[], 'parsetree':[], 'dependencies':[]} results["sentences"].append(sentence) state = STATE_TEXT - + elif state == STATE_TEXT: sentence['text'] = line state = STATE_WORDS - + elif state == STATE_WORDS: - if not line.startswith("[Text="): - raise Exception('Parse error. Could not find "[Text=" in: %s' % line) - for s in WORD_PATTERN.findall(line): - sentence['words'].append(parse_bracketed(s)) - state = STATE_TREE - + if line.startswith("[Text="): + # now each line is a word + for s in WORD_PATTERN.findall(line): + sentence['words'].append(parse_bracketed(s)) + else: + state = STATE_TREE + if len(line) != 0: + sentence['parsetree'].append(line) + elif state == STATE_TREE: if len(line) == 0: state = STATE_DEPENDENCY sentence['parsetree'] = " ".join(sentence['parsetree']) else: sentence['parsetree'].append(line) - + elif state == STATE_DEPENDENCY: if len(line) == 0: state = STATE_COREFERENCE @@ -106,7 +109,7 @@ def parse_parser_results(text): if len(split_entry) == 3: rel, left, right = map(lambda x: remove_id(x), split_entry) sentence['dependencies'].append(tuple([rel,left,right])) - + elif state == STATE_COREFERENCE: if "Coreference set" in line: if 'coref' not in results: @@ -118,7 +121,7 @@ def parse_parser_results(text): src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1 sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1 coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r))) - + return results @@ -132,36 +135,39 @@ def __init__(self, corenlp_path=None): Checks the location of the jar files. Spawns the server as a process. """ - jars = ["stanford-corenlp-3.4.1.jar", - "stanford-corenlp-3.4.1-models.jar", + jars = ["stanford-corenlp-3.6.0.jar", + "stanford-corenlp-3.6.0-models.jar", "joda-time.jar", "xom.jar", - "jollyday.jar"] - + "jollyday.jar", + "ejml-0.23.jar", + "slf4j-api.jar", + "slf4j-simple.jar"] + # if CoreNLP libraries are in a different directory, # change the corenlp_path variable to point to them if not corenlp_path: - corenlp_path = "./stanford-corenlp-full-2014-08-27/" - + corenlp_path = "./stanford-corenlp-full-2015-12-09/" + java_path = "java" classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP" # include the properties file, so you can change defaults # but any changes in output format will break parse_parser_results() - props = "-props default.properties" - + props = "-props default.properties" + # add and check classpaths jars = [corenlp_path + jar for jar in jars] for jar in jars: if not os.path.exists(jar): logger.error("Error! Cannot locate %s" % jar) sys.exit(1) - + # spawn the server - start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props) - if VERBOSE: + start_corenlp = "%s -Xmx3600m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props) + if VERBOSE: logger.debug(start_corenlp) self.corenlp = pexpect.spawn(start_corenlp) - + # show progress bar while loading the models widgets = ['Loading Models: ', Fraction()] pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start() @@ -177,11 +183,11 @@ def __init__(self, corenlp_path=None): pbar.update(5) self.corenlp.expect("Entering interactive shell.") pbar.finish() - + def _parse(self, text): """ This is the core interaction with the parser. - + It returns a Python data-structure, while the parse() function returns a JSON object """ @@ -191,11 +197,11 @@ def _parse(self, text): self.corenlp.read_nonblocking (4000, 0.3) except pexpect.TIMEOUT: break - + self.corenlp.sendline(text) - + # How much time should we give the parser to parse it? - # the idea here is that you increase the timeout as a + # the idea here is that you increase the timeout as a # function of the text's length. # anything longer than 5 seconds requires that you also # increase timeout=5 in jsonrpc.py @@ -207,7 +213,7 @@ def _parse(self, text): # Time left, read more data try: incoming += self.corenlp.read_nonblocking(2000, 1) - if "\nNLP>" in incoming: + if "\nNLP>" in incoming: break time.sleep(0.0001) except pexpect.TIMEOUT: @@ -218,20 +224,20 @@ def _parse(self, text): continue except pexpect.EOF: break - - if VERBOSE: + + if VERBOSE: logger.debug("%s\n%s" % ('='*40, incoming)) try: results = parse_parser_results(incoming) except Exception, e: - if VERBOSE: + if VERBOSE: logger.debug(traceback.format_exc()) raise e - + return results - + def parse(self, text): - """ + """ This function takes a text string, sends it to the Stanford parser, reads in the result, parses the results and returns a list with one dictionary entry for each parsed sentence, in JSON format. @@ -253,9 +259,9 @@ def parse(self, text): options, args = parser.parse_args() server = jsonrpc.Server(jsonrpc.JsonRpc20(), jsonrpc.TransportTcpIp(addr=(options.host, int(options.port)))) - + nlp = StanfordCoreNLP() server.register_function(nlp.parse) - + logger.info('Serving on http://%s:%s' % (options.host, options.port)) server.serve()