
Commit 87d1f2b

[core] add support for Stanford NLP 3.5.2
1 parent 2c41e75

File tree: 2 files changed (+49 -47 lines): README.md, corenlp.py

README.md (+6 -6)

@@ -1,4 +1,4 @@
-# Python interface to Stanford Core NLP tools v3.4.1
+# Python interface to Stanford Core NLP tools v3.5.2
 
 This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server.
 
@@ -10,7 +10,7 @@ This is a Python wrapper for Stanford University's NLP group's Java-based [CoreN
 
 It depends on [pexpect](http://www.noah.org/wiki/pexpect) and includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/).
 
-It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.4.1** released 2014-08-27.
+It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.5.2** released 2015-04-20.
 
 ## Download and Usage
 
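Not part of the diff: a minimal sketch of the round trip described in the hunk above, using the module API shown later in this README (key names match the sentence dict built in corenlp.py):

    import json
    from corenlp import StanfordCoreNLP

    corenlp = StanfordCoreNLP()  # spawns the Java process; models take a few minutes to load
    result = json.loads(corenlp.parse("Parse this sentence."))
    for s in result["sentences"]:
        print s["text"]          # the sentence as the parser segmented it
        print s["words"]         # token attributes parsed from the [Text=...] blocks
        print s["parsetree"]     # flattened phrase-structure tree
        print s["dependencies"]  # (relation, governor, dependent) triples
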
@@ -19,8 +19,8 @@ To use this program you must [download](http://nlp.stanford.edu/software/corenlp
     sudo pip install pexpect unidecode
     git clone git://github.com/dasmith/stanford-corenlp-python.git
     cd stanford-corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2014-08-27.zip
-    unzip stanford-corenlp-full-2014-08-27.zip
+    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-04-20.zip
+    unzip stanford-corenlp-full-2015-04-20.zip
 
 Then launch the server:
 
@@ -110,7 +110,7 @@ To use it in a regular script (useful for debugging), load the module instead:
     corenlp = StanfordCoreNLP() # wait a few minutes...
     corenlp.parse("Parse this sentence.")
 
-The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2014-08-27/")`.
+The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2015-04-20/")`.
 
 ## Coreference Resolution
 
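A sketch (not in the commit) of the corenlp_path override described in the hunk above; the /opt path is hypothetical:

    from corenlp import StanfordCoreNLP

    # jars unpacked somewhere other than the default ./stanford-corenlp-full-2015-04-20/
    corenlp = StanfordCoreNLP(corenlp_path="/opt/stanford-corenlp-full-2015-04-20/")
    corenlp.parse("Parse this sentence.")
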
@@ -139,7 +139,7 @@ tar xvfz WNprolog-3.0.tar.gz
 **Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less.
 If pexpect times out while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process:
 
-    java -cp stanford-corenlp-2014-08-27.jar:stanford-corenlp-3.4.1-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
+    java -cp stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
 
 You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)).
 
corenlp.py (+43 -41)

@@ -28,7 +28,7 @@
 
 VERBOSE = True
 
-STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
+STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE, STATE_STOP = 0, 1, 2, 3, 4, 5, 99
 WORD_PATTERN = re.compile('\[([^\]]+)\]')
 CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")
 
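An illustration (not in the commit) of the line shapes those two regexes expect; the sample strings are hand-made to match the patterns, not verbatim CoreNLP output:

    import re

    WORD_PATTERN = re.compile('\[([^\]]+)\]')
    CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")

    # one bracketed group per token on a word line
    print WORD_PATTERN.findall('[Text=Hello PartOfSpeech=UH] [Text=world PartOfSpeech=NN]')
    # ['Text=Hello PartOfSpeech=UH', 'Text=world PartOfSpeech=NN']

    # coreference lines pair two (sentence, head, [start, end]) mention spans
    print CR_PATTERN.match('(2,3,[1,4]) -> (1,1,[1,2]), that is: "he" -> "John"').groups()
    # ('2', '3', '1', '4', '1', '1', '1', '2', 'he', 'John')
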
@@ -74,39 +74,41 @@ def parse_parser_results(text):
     state = STATE_START
     for line in text.split("\n"):
         line = line.strip()
-
+        if line.startswith("(ROOT"):
+            state = STATE_TREE
+        if line.startswith("NLP>"):
+            state = STATE_STOP
         if line.startswith("Sentence #"):
-            sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
+            sentence = {'words': [], 'parsetree': [], 'dependencies': []}
             results["sentences"].append(sentence)
             state = STATE_TEXT
-
+
         elif state == STATE_TEXT:
             sentence['text'] = line
             state = STATE_WORDS
-
+
         elif state == STATE_WORDS:
             if not line.startswith("[Text="):
                 raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
             for s in WORD_PATTERN.findall(line):
                 sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-
+
         elif state == STATE_TREE:
             if len(line) == 0:
                 state = STATE_DEPENDENCY
                 sentence['parsetree'] = " ".join(sentence['parsetree'])
             else:
                 sentence['parsetree'].append(line)
-
+
         elif state == STATE_DEPENDENCY:
             if len(line) == 0:
                 state = STATE_COREFERENCE
             else:
                 split_entry = re.split("\(|, ", line[:-1])
                 if len(split_entry) == 3:
                     rel, left, right = map(lambda x: remove_id(x), split_entry)
-                    sentence['dependencies'].append(tuple([rel,left,right]))
-
+                    sentence['dependencies'].append(tuple([rel, left, right]))
+
         elif state == STATE_COREFERENCE:
             if "Coreference set" in line:
                 if 'coref' not in results:
@@ -118,7 +120,7 @@ def parse_parser_results(text):
                     src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                     sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                     coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
-
+
     return results
 
 
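What the new (ROOT and NLP> triggers buy: the tree is now recognized by its opening bracket rather than by leaving the words state, and the interactive prompt halts the scan. A hand-made transcript (not in the commit; tokens and tags illustrative, not verbatim CoreNLP output) that walks the machine through every state:

    sample = "\n".join([
        "Sentence #1 (2 tokens):",                                    # -> STATE_TEXT
        "Hello world",                                                # -> STATE_WORDS
        "[Text=Hello PartOfSpeech=UH] [Text=world PartOfSpeech=NN]",  # words collected
        "(ROOT (S (INTJ (UH Hello)) (NP (NN world))))",               # -> STATE_TREE
        "",                                                           # -> STATE_DEPENDENCY
        "root(ROOT-0, Hello-1)",
        "",                                                           # -> STATE_COREFERENCE
        "NLP>",                                                       # -> STATE_STOP
    ])
    print parse_parser_results(sample)["sentences"][0]["dependencies"]
    # [('root', 'ROOT', 'Hello')]
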
@@ -132,70 +134,70 @@ def __init__(self, corenlp_path=None):
         Checks the location of the jar files.
         Spawns the server as a process.
         """
-        jars = ["stanford-corenlp-3.5.1.jar",
-                "stanford-corenlp-3.5.1-models.jar",
+        jars = ["stanford-corenlp-3.5.2.jar",
+                "stanford-corenlp-3.5.2-models.jar",
                 "joda-time.jar",
                 "xom.jar",
                 "jollyday.jar"]
-
+
         # if CoreNLP libraries are in a different directory,
         # change the corenlp_path variable to point to them
         if not corenlp_path:
-            corenlp_path = "./stanford-corenlp-full-2015-01-30/"
-
+            corenlp_path = "./stanford-corenlp-full-2015-04-20/"
+
         java_path = "java"
         classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
         # include the properties file, so you can change defaults
         # but any changes in output format will break parse_parser_results()
-        props = "-props default.properties"
-
+        props = "-props default.properties"
+
         # add and check classpaths
         jars = [corenlp_path + jar for jar in jars]
         for jar in jars:
             if not os.path.exists(jar):
                 logger.error("Error! Cannot locate %s" % jar)
                 sys.exit(1)
-
+
         # spawn the server
         start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
-        if VERBOSE:
+        if VERBOSE:
             logger.debug(start_corenlp)
         self.corenlp = pexpect.spawn(start_corenlp)
-
+
         # show progress bar while loading the models
         widgets = ['Loading Models: ', Fraction()]
         pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
-        self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
+        self.corenlp.expect("done.", timeout=20)  # Load pos tagger model (~5sec)
         pbar.update(1)
-        self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
+        self.corenlp.expect("done.", timeout=200)  # Load NER-all classifier (~33sec)
         pbar.update(2)
-        self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec)
+        self.corenlp.expect("done.", timeout=600)  # Load NER-muc classifier (~60sec)
         pbar.update(3)
-        self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec)
+        self.corenlp.expect("done.", timeout=600)  # Load CoNLL classifier (~50sec)
         pbar.update(4)
-        self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec)
+        self.corenlp.expect("done.", timeout=200)  # Loading PCFG (~3sec)
         pbar.update(5)
         self.corenlp.expect("Entering interactive shell.")
         pbar.finish()
-
+
     def _parse(self, text):
         """
         This is the core interaction with the parser.
-
+
         It returns a Python data-structure, while the parse()
         function returns a JSON object
         """
         # clean up anything leftover
         while True:
             try:
-                self.corenlp.read_nonblocking (4000, 0.3)
+                self.corenlp.read_nonblocking(4000, 0.3)
             except pexpect.TIMEOUT:
                 break
-
+
         self.corenlp.sendline(text)
-
+
         # How much time should we give the parser to parse it?
-        # the idea here is that you increase the timeout as a
+        # the idea here is that you increase the timeout as a
         # function of the text's length.
         # anything longer than 5 seconds requires that you also
         # increase timeout=5 in jsonrpc.py
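The length-based timeout those comments describe is computed further down in _parse(), outside this hunk; a hypothetical sketch of the idea (constants illustrative, not the file's actual values):

    def expected_parse_time(text):
        # grow the pexpect read deadline with the input length
        return min(40.0, 3.0 + len(text) / 20.0)  # seconds
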
@@ -207,7 +209,7 @@ def _parse(self, text):
             # Time left, read more data
             try:
                 incoming += self.corenlp.read_nonblocking(2000, 1)
-                if "\nNLP>" in incoming:
+                if "\nNLP>" in incoming:
                     break
                 time.sleep(0.0001)
             except pexpect.TIMEOUT:
@@ -218,20 +220,20 @@ def _parse(self, text):
                 continue
             except pexpect.EOF:
                 break
-
-        if VERBOSE:
+
+        if VERBOSE:
             logger.debug("%s\n%s" % ('='*40, incoming))
         try:
             results = parse_parser_results(incoming)
         except Exception, e:
-            if VERBOSE:
+            if VERBOSE:
                 logger.debug(traceback.format_exc())
             raise e
-
+
         return results
-
+
     def parse(self, text):
-        """
+        """
         This function takes a text string, sends it to the Stanford parser,
         reads in the result, parses the results and returns a list
         with one dictionary entry for each parsed sentence, in JSON format.
@@ -253,9 +255,9 @@ def parse(self, text):
     options, args = parser.parse_args()
     server = jsonrpc.Server(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
-
+
     nlp = StanfordCoreNLP()
     server.register_function(nlp.parse)
-
+
     logger.info('Serving on http://%s:%s' % (options.host, options.port))
     server.serve()
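With nlp.parse registered above, a client talks to the server over JSON-RPC. A sketch using the client classes from the bundled jsonrpc module, assuming the default host and port:

    import jsonrpc
    from simplejson import loads

    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                                 jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))

    result = loads(server.parse("Hello world.  It is so beautiful."))
    print "Result", result
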
