Skip to content

Commit 001c974

Browse files
committed
add support for Stanford NLP 3.5.2
1 parent 2c41e75 commit 001c974

File tree

2 files changed

+46
-46
lines changed

2 files changed

+46
-46
lines changed

README.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Python interface to Stanford Core NLP tools v3.4.1
1+
# Python interface to Stanford Core NLP tools v3.5.2
22

33
This is a Python wrapper for Stanford University's NLP group's Java-based [CoreNLP tools](http://nlp.stanford.edu/software/corenlp.shtml). It can either be imported as a module or run as a JSON-RPC server. Because it uses many large trained models (requiring 3GB RAM on 64-bit machines and usually a few minutes loading time), most applications will probably want to run it as a server.
44

@@ -10,7 +10,7 @@ This is a Python wrapper for Stanford University's NLP group's Java-based [CoreN
1010

1111
It depends on [pexpect](http://www.noah.org/wiki/pexpect) and includes and uses code from [jsonrpc](http://www.simple-is-better.org/rpc/) and [python-progressbar](http://code.google.com/p/python-progressbar/).
1212

13-
It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.4.1** released 2014-08-27.
13+
It runs the Stanford CoreNLP jar in a separate process, communicates with the java process using its command-line interface, and makes assumptions about the output of the parser in order to parse it into a Python dict object and transfer it using JSON. The parser will break if the output changes significantly, but it has been tested on **Core NLP tools version 3.5.2** released 2015-04-20.
1414

1515
## Download and Usage
1616

@@ -19,8 +19,8 @@ To use this program you must [download](http://nlp.stanford.edu/software/corenlp
1919
sudo pip install pexpect unidecode
2020
git clone git://github.com/dasmith/stanford-corenlp-python.git
2121
cd stanford-corenlp-python
22-
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2014-08-27.zip
23-
unzip stanford-corenlp-full-2014-08-27.zip
22+
wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-04-20.zip
23+
unzip stanford-corenlp-full-2015-04-20.zip
2424

2525
Then launch the server:
2626

@@ -110,7 +110,7 @@ To use it in a regular script (useful for debugging), load the module instead:
110110
corenlp = StanfordCoreNLP() # wait a few minutes...
111111
corenlp.parse("Parse this sentence.")
112112

113-
The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2014-08-27/")`.
113+
The server, `StanfordCoreNLP()`, takes an optional argument `corenlp_path` which specifies the path to the jar files. The default value is `StanfordCoreNLP(corenlp_path="./stanford-corenlp-full-2015-04-20/")`.
114114

115115
## Coreference Resolution
116116

@@ -139,7 +139,7 @@ tar xvfz WNprolog-3.0.tar.gz
139139
**Stanford CoreNLP tools require a large amount of free memory**. Java 5+ uses about 50% more RAM on 64-bit machines than 32-bit machines. 32-bit machine users can lower the memory requirements by changing `-Xmx3g` to `-Xmx2g` or even less.
140140
If pexpect times out while loading models, check to make sure you have enough memory and can run the server alone without your kernel killing the java process:
141141

142-
java -cp stanford-corenlp-2014-08-27.jar:stanford-corenlp-3.4.1-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
142+
java -cp stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -props default.properties
143143

144144
You can reach me, Dustin Smith, by sending a message on GitHub or through email (contact information is available [on my webpage](http://web.media.mit.edu/~dustin)).
145145

corenlp.py

+40-40
Original file line numberDiff line numberDiff line change
@@ -74,39 +74,39 @@ def parse_parser_results(text):
7474
state = STATE_START
7575
for line in text.split("\n"):
7676
line = line.strip()
77-
77+
if line.startswith("(ROOT"):
78+
state = STATE_TREE
7879
if line.startswith("Sentence #"):
79-
sentence = {'words':[], 'parsetree':[], 'dependencies':[]}
80+
sentence = {'words': [], 'parsetree': [], 'dependencies': []}
8081
results["sentences"].append(sentence)
8182
state = STATE_TEXT
82-
83+
8384
elif state == STATE_TEXT:
8485
sentence['text'] = line
8586
state = STATE_WORDS
86-
87+
8788
elif state == STATE_WORDS:
8889
if not line.startswith("[Text="):
8990
raise Exception('Parse error. Could not find "[Text=" in: %s' % line)
9091
for s in WORD_PATTERN.findall(line):
9192
sentence['words'].append(parse_bracketed(s))
92-
state = STATE_TREE
93-
93+
9494
elif state == STATE_TREE:
9595
if len(line) == 0:
9696
state = STATE_DEPENDENCY
9797
sentence['parsetree'] = " ".join(sentence['parsetree'])
9898
else:
9999
sentence['parsetree'].append(line)
100-
100+
101101
elif state == STATE_DEPENDENCY:
102102
if len(line) == 0:
103103
state = STATE_COREFERENCE
104104
else:
105105
split_entry = re.split("\(|, ", line[:-1])
106106
if len(split_entry) == 3:
107107
rel, left, right = map(lambda x: remove_id(x), split_entry)
108-
sentence['dependencies'].append(tuple([rel,left,right]))
109-
108+
sentence['dependencies'].append(tuple([rel, left, right]))
109+
110110
elif state == STATE_COREFERENCE:
111111
if "Coreference set" in line:
112112
if 'coref' not in results:
@@ -118,7 +118,7 @@ def parse_parser_results(text):
118118
src_i, src_pos, src_l, src_r = int(src_i)-1, int(src_pos)-1, int(src_l)-1, int(src_r)-1
119119
sink_i, sink_pos, sink_l, sink_r = int(sink_i)-1, int(sink_pos)-1, int(sink_l)-1, int(sink_r)-1
120120
coref_set.append(((src_word, src_i, src_pos, src_l, src_r), (sink_word, sink_i, sink_pos, sink_l, sink_r)))
121-
121+
122122
return results
123123

124124

@@ -132,70 +132,70 @@ def __init__(self, corenlp_path=None):
132132
Checks the location of the jar files.
133133
Spawns the server as a process.
134134
"""
135-
jars = ["stanford-corenlp-3.5.1.jar",
136-
"stanford-corenlp-3.5.1-models.jar",
135+
jars = ["stanford-corenlp-3.5.2.jar",
136+
"stanford-corenlp-3.5.2-models.jar",
137137
"joda-time.jar",
138138
"xom.jar",
139139
"jollyday.jar"]
140-
140+
141141
# if CoreNLP libraries are in a different directory,
142142
# change the corenlp_path variable to point to them
143143
if not corenlp_path:
144-
corenlp_path = "./stanford-corenlp-full-2015-01-30/"
145-
144+
corenlp_path = "./stanford-corenlp-full-2015-04-20/"
145+
146146
java_path = "java"
147147
classname = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
148148
# include the properties file, so you can change defaults
149149
# but any changes in output format will break parse_parser_results()
150-
props = "-props default.properties"
151-
150+
props = "-props default.properties"
151+
152152
# add and check classpaths
153153
jars = [corenlp_path + jar for jar in jars]
154154
for jar in jars:
155155
if not os.path.exists(jar):
156156
logger.error("Error! Cannot locate %s" % jar)
157157
sys.exit(1)
158-
158+
159159
# spawn the server
160160
start_corenlp = "%s -Xmx1800m -cp %s %s %s" % (java_path, ':'.join(jars), classname, props)
161-
if VERBOSE:
161+
if VERBOSE:
162162
logger.debug(start_corenlp)
163163
self.corenlp = pexpect.spawn(start_corenlp)
164-
164+
165165
# show progress bar while loading the models
166166
widgets = ['Loading Models: ', Fraction()]
167167
pbar = ProgressBar(widgets=widgets, maxval=5, force_update=True).start()
168-
self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
168+
self.corenlp.expect("done.", timeout=20) # Load pos tagger model (~5sec)
169169
pbar.update(1)
170-
self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
170+
self.corenlp.expect("done.", timeout=200) # Load NER-all classifier (~33sec)
171171
pbar.update(2)
172-
self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec)
172+
self.corenlp.expect("done.", timeout=600) # Load NER-muc classifier (~60sec)
173173
pbar.update(3)
174-
self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec)
174+
self.corenlp.expect("done.", timeout=600) # Load CoNLL classifier (~50sec)
175175
pbar.update(4)
176-
self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec)
176+
self.corenlp.expect("done.", timeout=200) # Loading PCFG (~3sec)
177177
pbar.update(5)
178178
self.corenlp.expect("Entering interactive shell.")
179179
pbar.finish()
180-
180+
181181
def _parse(self, text):
182182
"""
183183
This is the core interaction with the parser.
184-
184+
185185
It returns a Python data-structure, while the parse()
186186
function returns a JSON object
187187
"""
188188
# clean up anything leftover
189189
while True:
190190
try:
191-
self.corenlp.read_nonblocking (4000, 0.3)
191+
self.corenlp.read_nonblocking(4000, 0.3)
192192
except pexpect.TIMEOUT:
193193
break
194-
194+
195195
self.corenlp.sendline(text)
196-
196+
197197
# How much time should we give the parser to parse it?
198-
# the idea here is that you increase the timeout as a
198+
# the idea here is that you increase the timeout as a
199199
# function of the text's length.
200200
# anything longer than 5 seconds requires that you also
201201
# increase timeout=5 in jsonrpc.py
@@ -207,7 +207,7 @@ def _parse(self, text):
207207
# Time left, read more data
208208
try:
209209
incoming += self.corenlp.read_nonblocking(2000, 1)
210-
if "\nNLP>" in incoming:
210+
if "\nNLP>" in incoming:
211211
break
212212
time.sleep(0.0001)
213213
except pexpect.TIMEOUT:
@@ -218,20 +218,20 @@ def _parse(self, text):
218218
continue
219219
except pexpect.EOF:
220220
break
221-
222-
if VERBOSE:
221+
222+
if VERBOSE:
223223
logger.debug("%s\n%s" % ('='*40, incoming))
224224
try:
225225
results = parse_parser_results(incoming)
226226
except Exception, e:
227-
if VERBOSE:
227+
if VERBOSE:
228228
logger.debug(traceback.format_exc())
229229
raise e
230-
230+
231231
return results
232-
232+
233233
def parse(self, text):
234-
"""
234+
"""
235235
This function takes a text string, sends it to the Stanford parser,
236236
reads in the result, parses the results and returns a list
237237
with one dictionary entry for each parsed sentence, in JSON format.
@@ -253,9 +253,9 @@ def parse(self, text):
253253
options, args = parser.parse_args()
254254
server = jsonrpc.Server(jsonrpc.JsonRpc20(),
255255
jsonrpc.TransportTcpIp(addr=(options.host, int(options.port))))
256-
256+
257257
nlp = StanfordCoreNLP()
258258
server.register_function(nlp.parse)
259-
259+
260260
logger.info('Serving on http://%s:%s' % (options.host, options.port))
261261
server.serve()

0 commit comments

Comments
 (0)