Skip to content

Commit 18c0f4c

Browse files
authored
add SCONJ to REMOVE_POS to exclude subordinating conjunction from mention span detection (#276)
* add SCONJ to REMOVE_POS in neuralcoref/neuralcoref.pyx
* add SCONJ to remove_pos in neuralcoref/train/document.py
1 parent 654d906 commit 18c0f4c

File tree

2 files changed

+5
-5
lines changed

2 files changed

+5
-5
lines changed

neuralcoref/neuralcoref.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# cython: infer_types=True, boundscheck=False
22
# distutils: language=c++
3-
""" NeuralCoref resolution spaCy v2.0 pipeline component
3+
""" NeuralCoref resolution spaCy v2.0 pipeline component
44
Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
55
Compatible with: spaCy v2.0.0+
66
"""
@@ -126,7 +126,7 @@ NSUBJ_OR_DEP = ["nsubj", "dep"]
126126
CONJ_OR_PREP = ["conj", "prep"]
127127
LEAVE_DEP = ["det", "compound", "appos"]
128128
KEEP_DEP = ["nsubj", "dobj", "iobj", "pobj"]
129-
REMOVE_POS = ["CCONJ", "INTJ", "ADP"]
129+
REMOVE_POS = ["CCONJ", "SCONJ", "INTJ", "ADP"]
130130
LOWER_NOT_END = ["'s", ',', '.', '!', '?', ':', ';']
131131
PUNCTS = [".", "!", "?"]
132132
ACCEPTED_ENTS = ["PERSON", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"]
@@ -327,7 +327,7 @@ cdef (int, int) enlarge_span(TokenC* doc_c, int i, int sent_start, int sent_end,
327327
maxchild_idx -= 1 # We don't want mentions finishing with 's or conjunctions/punctuation
328328
# if debug: print("maxchild_idx", maxchild_idx)
329329
while minchild_idx <= maxchild_idx and minchild_idx < sent_end - 1 \
330-
and (inside(doc_c[minchild_idx].pos, hashes.remove_pos)
330+
and (inside(doc_c[minchild_idx].pos, hashes.remove_pos)
331331
or inside(doc_c[minchild_idx].lex.lower, hashes.lower_not_end)):
332332
minchild_idx += 1 # We don't want mentions starting with 's or conjunctions/punctuation
333333
# if debug: print("minchild_idx", minchild_idx)
@@ -882,7 +882,7 @@ cdef class NeuralCoref(object):
882882
if tuned and hash_w in self.tuned_vectors:
883883
return self.tuned_vectors[hash_w]
884884
return self.get_static(hash_w)
885-
885+
886886
def get_word_in_sentence(self, int i, Span sent):
887887
if i < sent.start or i >= sent.end:
888888
return self.tuned_vectors[self.hashes.missing_word]

neuralcoref/train/document.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def _extract_from_sent(doc, span, blacklist=True, debug=False):
9999
keep_dep = ["nsubj", "dobj", "iobj", "pobj"]
100100
nsubj_or_dep = ["nsubj", "dep"]
101101
conj_or_prep = ["conj", "prep"]
102-
remove_pos = ["CCONJ", "INTJ", "ADP"]
102+
remove_pos = ["CCONJ", "SCONJ", "INTJ", "ADP"]
103103
lower_not_end = ["'s", ",", ".", "!", "?", ":", ";"]
104104

105105
# Utility to remove bad endings

0 commit comments

Comments (0)