
Commit 87d1f2b

[core] add support for Stanford NLP 3.5.2
1 parent 2c41e75

File tree: 2 files changed (+49 -47 lines): README.md, corenlp.py

README.md (+6 -6)

@@ -1,4 +1,4 @@
-# Python interface to Stanford Core NLP tools v3.4.1
+# Python interface to Stanford Core NLP tools v3.5.2
 
 This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server.
 
@@ -10,7 +10,7 @@ This is a Python wrapper for Stanford University's NLP group's Java-based [CoreN
 
 It depends on [pexpect](http://www.noah.org/wiki/pexpect) and includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/).
 
-It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.4.1** released 2014-08-27.
+It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.5.2** released 2015-04-20.
 
 ## Download and Usage
 
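Not part of the diff: a minimal sketch of the round trip described in the hunk above, using the module API shown later in this README (key names match the sentence dict built in corenlp.py):

    import json
    from corenlp import StanfordCoreNLP

    corenlp = StanfordCoreNLP()  # spawns the Java process; models take a few minutes to load
    result = json.loads(corenlp.parse("Parse this sentence."))
    for s in result["sentences"]:
        print s["text"]          # the sentence as the parser segmented it
        print s["words"]         # token attributes parsed from the [Text=...] blocks
        print s["parsetree"]     # flattened phrase-structure tree
        print s["dependencies"]  # (relation, governor, dependent) triples
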
@@ -19,8 +19,8 @@ To use this program you must [download](http://nlp.stanford.edu/software/corenlp
     sudo pip install pexpect unidecode
     git clone git://github.com/dasmith/stanford-corenlp-python.git
     cd stanford-corenlp-python
-    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2014-08-27.zip
-    unzip stanford-corenlp-full-2014-08-27.zip
+    wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-04-20.zip
+    unzip stanford-corenlp-full-2015-04-20.zip
 
 Then launch the server:
 
@@ -110,7 +110,7 @@ To use it in a regular script (useful for debugging), load the module instead:
     corenlp = StanfordCoreNLP() # wait a few minutes...
     corenlp.parse("Parse this sentence.")
 
-The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2014-08-27/")`.
+The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2015-04-20/")`.
 
 ## Coreference Resolution
 
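A sketch (not in the commit) of the corenlp_path override described in the hunk above; the /opt path is hypothetical:

    from corenlp import StanfordCoreNLP

    # jars unpacked somewhere other than the default ./stanford-corenlp-full-2015-04-20/
    corenlp = StanfordCoreNLP(corenlp_path="/opt/stanford-corenlp-full-2015-04-20/")
    corenlp.parse("Parse this sentence.")
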
@@ -139,7 +139,7 @@ tar xvfz WNprolog-3.0.tar.gz
 **Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less.
 If pexpect times out while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process:
 
-    java -cp stanford-corenlp-2014-08-27.jar:stanford-corenlp-3.4.1-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
+    java -cp stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
 
 You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)).
 
corenlp.py (+43 -41)

@@ -28,7 +28,7 @@
 
 VERBOSE = True
 
-STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE = 0, 1, 2, 3, 4, 5
+STATE_START, STATE_TEXT, STATE_WORDS, STATE_TREE, STATE_DEPENDENCY, STATE_COREFERENCE, STATE_STOP = 0, 1, 2, 3, 4, 5, 99
 WORD_PATTERN = re.compile('\[([^\]]+)\]')
 CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")
 
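An illustration (not in the commit) of the line shapes those two regexes expect; the sample strings are hand-made to match the patterns, not verbatim CoreNLP output:

    import re

    WORD_PATTERN = re.compile('\[([^\]]+)\]')
    CR_PATTERN = re.compile(r"\((\d*),(\d)*,\[(\d*),(\d*)\]\) -> \((\d*),(\d)*,\[(\d*),(\d*)\]\), that is: \"(.*)\" -> \"(.*)\"")

    # one bracketed group per token on a word line
    print WORD_PATTERN.findall('[Text=Hello PartOfSpeech=UH] [Text=world PartOfSpeech=NN]')
    # ['Text=Hello PartOfSpeech=UH', 'Text=world PartOfSpeech=NN']

    # coreference lines pair two (sentence, head, [start, end]) mention spans
    print CR_PATTERN.match('(2,3,[1,4]) -> (1,1,[1,2]), that is: "he" -> "John"').groups()
    # ('2', '3', '1', '4', '1', '1', '1', '2', 'he', 'John')
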
@@ -74,39 +74,41 @@ def parse_parser_results(text):
     state = STATE_START
     for line in text.split("\n"):
         line = line.strip()
-
+        if line.startswith("(ROOT"):
+            state = STATE_TREE
+        if line.startswith("NLP>"):
+            state = STATE_STOP
         if line.startswith("Sentence #"):
-            sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
+            sentence = {'words': [], 'parsetree': [], 'dependencies': []}
             results["sentences"].append(sentence)
             state = STATE_TEXT
-
+
         elif state == STATE_TEXT:
             sentence['text'] = line
             state = STATE_WORDS
-
+
         elif state == STATE_WORDS:
             if not line.startswith("[Text="):
                 raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
             for s in WORD_PATTERN.findall(line):
                 sentence['words'].append(parse_bracketed(s))
-            state = STATE_TREE
-
+
         elif state == STATE_TREE:
             if len(line) == 0:
                 state = STATE_DEPENDENCY
                 sentence['parsetree'] = " ".join(sentence['parsetree'])
             else:
                 sentence['parsetree'].append(line)
-
+
         elif state == STATE_DEPENDENCY:
             if len(line) == 0:
                 state = STATE_COREFERENCE
             else:
                 split_entry = re.split("\(|, ", line[:-1])
                 if len(split_entry) == 3:
                     rel, left, right = map(lambda x: remove_id(x), split_entry)
-                    sentence['dependencies'].append(tuple([rel,left,right]))
-
+                    sentence['dependencies'].append(tuple([rel, left, right]))
+
         elif state == STATE_COREFERENCE:
             if "Coreference set" in line:
                 if 'coref' not in results:
@@ -118,7 +120,7 @@ def parse_parser_results(text):
                     src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
                     sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
                     coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
-
+
     return results
 
 
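What the new (ROOT and NLP> triggers buy: the tree is now recognized by its opening bracket rather than by leaving the words state, and the interactive prompt halts the scan. A hand-made transcript (not in the commit; tokens and tags illustrative, not verbatim CoreNLP output) that walks the machine through every state:

    sample = "\n".join([
        "Sentence #1 (2 tokens):",                                    # -> STATE_TEXT
        "Hello world",                                                # -> STATE_WORDS
        "[Text=Hello PartOfSpeech=UH] [Text=world PartOfSpeech=NN]",  # words collected
        "(ROOT (S (INTJ (UH Hello)) (NP (NN world))))",               # -> STATE_TREE
        "",                                                           # -> STATE_DEPENDENCY
        "root(ROOT-0, Hello-1)",
        "",                                                           # -> STATE_COREFERENCE
        "NLP>",                                                       # -> STATE_STOP
    ])
    print parse_parser_results(sample)["sentences"][0]["dependencies"]
    # [('root', 'ROOT', 'Hello')]
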
@@ -132,70 +134,70 @@ def __init__(self, corenlp_path=None):
         Checks the location of the jar files.
         Spawns the server as a process.
         """
-        jars = ["stanford-corenlp-3.5.1.jar",
-                "stanford-corenlp-3.5.1-models.jar",
+        jars = ["stanford-corenlp-3.5.2.jar",
+                "stanford-corenlp-3.5.2-models.jar",
                 "joda-time.jar",
                 "xom.jar",
                 "jollyday.jar"]
-
+
         # if CoreNLP libraries are in a different directory,
         # change the corenlp_path variable to point to them
         if not corenlp_path:
-            corenlp_path = "./stanford-corenlp-full-2015-01-30/"
-
+            corenlp_path = "./stanford-corenlp-full-2015-04-20/"
+
         java_path = "java"
         classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
         # include the properties file, so you can change defaults
         # but any changes in output format will break parse_parser_results()
-        props = "-props default.properties"
-
+        props = "-props default.properties"
+
         # add and check classpaths
         jars = [corenlp_path + jar for jar in jars]
         for jar in jars:
             if not os.path.exists(jar):
                 logger.error("Error! Cannot locate %s" % jar)
                 sys.exit(1)
-
+
         # spawn the server
         start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
-        if VERBOSE:
+        if VERBOSE:
             logger.debug(start_corenlp)
         self.corenlp = pexpect.spawn(start_corenlp)
-
+
         # show progress bar while loading the models
         widgets = ['Loading Models: ', Fraction()]
         pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
-        self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
+        self.corenlp.expect("done.", timeout=20)  # Load pos tagger model (~5sec)
         pbar.update(1)
-        self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
+        self.corenlp.expect("done.", timeout=200)  # Load NER-all classifier (~33sec)
         pbar.update(2)
-        self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec)
+        self.corenlp.expect("done.", timeout=600)  # Load NER-muc classifier (~60sec)
         pbar.update(3)
-        self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec)
+        self.corenlp.expect("done.", timeout=600)  # Load CoNLL classifier (~50sec)
         pbar.update(4)
-        self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec)
+        self.corenlp.expect("done.", timeout=200)  # Loading PCFG (~3sec)
         pbar.update(5)
         self.corenlp.expect("Entering interactive shell.")
         pbar.finish()
-
+
     def _parse(self, text):
         """
         This is the core interaction with the parser.
-
+
         It returns a Python data-structure, while the parse()
         function returns a JSON object
         """
         # clean up anything leftover
         while True:
             try:
-                self.corenlp.read_nonblocking (4000, 0.3)
+                self.corenlp.read_nonblocking(4000, 0.3)
             except pexpect.TIMEOUT:
                 break
-
+
         self.corenlp.sendline(text)
-
+
         # How much time should we give the parser to parse it?
-        # the idea here is that you increase the timeout as a
+        # the idea here is that you increase the timeout as a
         # function of the text's length.
         # anything longer than 5 seconds requires that you also
         # increase timeout=5 in jsonrpc.py
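The length-based timeout those comments describe is computed further down in _parse(), outside this hunk; a hypothetical sketch of the idea (constants illustrative, not the file's actual values):

    def expected_parse_time(text):
        # grow the pexpect read deadline with the input length
        return min(40.0, 3.0 + len(text) / 20.0)  # seconds
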
@@ -207,7 +209,7 @@ def _parse(self, text):
             # Time left, read more data
             try:
                 incoming += self.corenlp.read_nonblocking(2000, 1)
-                if "\nNLP>" in incoming:
+                if "\nNLP>" in incoming:
                     break
                 time.sleep(0.0001)
             except pexpect.TIMEOUT:
@@ -218,20 +220,20 @@ def _parse(self, text):
                 continue
             except pexpect.EOF:
                 break
-
-        if VERBOSE:
+
+        if VERBOSE:
             logger.debug("%s\n%s" % ('='*40, incoming))
         try:
             results = parse_parser_results(incoming)
         except Exception, e:
-            if VERBOSE:
+            if VERBOSE:
                 logger.debug(traceback.format_exc())
             raise e
-
+
         return results
-
+
     def parse(self, text):
-        """
+        """
         This function takes a text string, sends it to the Stanford parser,
         reads in the result, parses the results and returns a list
         with one dictionary entry for each parsed sentence, in JSON format.
@@ -253,9 +255,9 @@ def parse(self, text):
     options, args = parser.parse_args()
     server = jsonrpc.Server(jsonrpc.JsonRpc20(),
                             jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
-
+
     nlp = StanfordCoreNLP()
     server.register_function(nlp.parse)
-
+
     logger.info('Serving on http://%s:%s' % (options.host, options.port))
     server.serve()
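With nlp.parse registered above, a client talks to the server over JSON-RPC. A sketch using the client classes from the bundled jsonrpc module, assuming the default host and port:

    import jsonrpc
    from simplejson import loads

    server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                                 jsonrpc.TransportTcpIp(addr=("127.0.0.1", 8080)))

    result = loads(server.parse("Hello world.  It is so beautiful."))
    print "Result", result
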
