Skip to content

Commit 18c0f4c

Browse files
authored
add SCONJ to REMOVE_POS to exclude subordinating conjunction from mention span detection (#276)
* add SCONJ to REMOVE_POS in neuralcoref/neuralcoref.pyx
* add SCONJ to remove_pos in neuralcoref/train/document.py
1 parent 654d906 commit 18c0f4c

File tree

2 files changed

+5
-5
lines changed

2 files changed

+5
-5
lines changed

neuralcoref/neuralcoref.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# cython: infer_types=True, boundscheck=False
22
# distutils: language=c++
3-
""" NeuralCoref resolution spaCy v2.0 pipeline component
3+
""" NeuralCoref resolution spaCy v2.0 pipeline component
44
Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
55
Compatible with: spaCy v2.0.0+
66
"""
@@ -126,7 +126,7 @@ NSUBJ_OR_DEP = ["nsubj", "dep"]
126126
CONJ_OR_PREP = ["conj", "prep"]
127127
LEAVE_DEP = ["det", "compound", "appos"]
128128
KEEP_DEP = ["nsubj", "dobj", "iobj", "pobj"]
129-
REMOVE_POS = ["CCONJ", "INTJ", "ADP"]
129+
REMOVE_POS = ["CCONJ", "SCONJ", "INTJ", "ADP"]
130130
LOWER_NOT_END = ["'s", ',', '.', '!', '?', ':', ';']
131131
PUNCTS = [".", "!", "?"]
132132
ACCEPTED_ENTS = ["PERSON", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"]
@@ -327,7 +327,7 @@ cdef (int, int) enlarge_span(TokenC* doc_c, int i, int sent_start, int sent_end,
327327
maxchild_idx -= 1 # We don't want mentions finishing with 's or conjunctions/punctuation
328328
# if debug: print("maxchild_idx", maxchild_idx)
329329
while minchild_idx <= maxchild_idx and minchild_idx < sent_end - 1 \
330-
and (inside(doc_c[minchild_idx].pos, hashes.remove_pos)
330+
and (inside(doc_c[minchild_idx].pos, hashes.remove_pos)
331331
or inside(doc_c[minchild_idx].lex.lower, hashes.lower_not_end)):
332332
minchild_idx += 1 # We don't want mentions starting with 's or conjunctions/punctuation
333333
# if debug: print("minchild_idx", minchild_idx)
@@ -882,7 +882,7 @@ cdef class NeuralCoref(object):
882882
if tuned and hash_w in self.tuned_vectors:
883883
return self.tuned_vectors[hash_w]
884884
return self.get_static(hash_w)
885-
885+
886886
def get_word_in_sentence(self, int i, Span sent):
887887
if i < sent.start or i >= sent.end:
888888
return self.tuned_vectors[self.hashes.missing_word]

neuralcoref/train/document.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def _extract_from_sent(doc, span, blacklist=True, debug=False):
9999
keep_dep = ["nsubj", "dobj", "iobj", "pobj"]
100100
nsubj_or_dep = ["nsubj", "dep"]
101101
conj_or_prep = ["conj", "prep"]
102-
remove_pos = ["CCONJ", "INTJ", "ADP"]
102+
remove_pos = ["CCONJ", "SCONJ", "INTJ", "ADP"]
103103
lower_not_end = ["'s", ",", ".", "!", "?", ":", ";"]
104104

105105
# Utility to remove bad endings

0 commit comments

Comments (0)