From 465440434f51659da50180e9e12a467f3da9e4e4 Mon Sep 17 00:00:00 2001
From: Noel Dawe
Date: Wed, 15 Jul 2020 13:35:59 -0400
Subject: [PATCH 1/2] add SCONJ to REMOVE_POS

---
 neuralcoref/neuralcoref.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/neuralcoref/neuralcoref.pyx b/neuralcoref/neuralcoref.pyx
index 119a852..2ec327e 100644
--- a/neuralcoref/neuralcoref.pyx
+++ b/neuralcoref/neuralcoref.pyx
@@ -1,6 +1,6 @@
 # cython: infer_types=True, boundscheck=False
 # distutils: language=c++
-""" NeuralCoref resolution spaCy v2.0 pipeline component 
+""" NeuralCoref resolution spaCy v2.0 pipeline component
 Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
 Compatible with: spaCy v2.0.0+
 """
@@ -126,7 +126,7 @@ NSUBJ_OR_DEP = ["nsubj", "dep"]
 CONJ_OR_PREP = ["conj", "prep"]
 LEAVE_DEP = ["det", "compound", "appos"]
 KEEP_DEP = ["nsubj", "dobj", "iobj", "pobj"]
-REMOVE_POS = ["CCONJ", "INTJ", "ADP"]
+REMOVE_POS = ["CCONJ", "SCONJ", "INTJ", "ADP"]
 LOWER_NOT_END = ["'s", ',', '.', '!', '?', ':', ';']
 PUNCTS = [".", "!", "?"]
 ACCEPTED_ENTS = ["PERSON", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"]
@@ -327,7 +327,7 @@ cdef (int, int) enlarge_span(TokenC* doc_c, int i, int sent_start, int sent_end,
         maxchild_idx -= 1 # We don't want mentions finishing with 's or conjunctions/punctuation
     # if debug: print("maxchild_idx", maxchild_idx)
     while minchild_idx <= maxchild_idx and minchild_idx < sent_end - 1 \
-            and (inside(doc_c[minchild_idx].pos, hashes.remove_pos) 
+            and (inside(doc_c[minchild_idx].pos, hashes.remove_pos)
                  or inside(doc_c[minchild_idx].lex.lower, hashes.lower_not_end)):
         minchild_idx += 1 # We don't want mentions starting with 's or conjunctions/punctuation
     # if debug: print("minchild_idx", minchild_idx)
@@ -882,7 +882,7 @@ cdef class NeuralCoref(object):
         if tuned and hash_w in self.tuned_vectors:
             return self.tuned_vectors[hash_w]
         return self.get_static(hash_w)
-    
+
     def get_word_in_sentence(self, int i, Span sent):
         if i < sent.start or i >= sent.end:
             return self.tuned_vectors[self.hashes.missing_word]

From 8d6f627721b8f9841363ca0a4fa90c921dcb5495 Mon Sep 17 00:00:00 2001
From: Noel Dawe
Date: Wed, 15 Jul 2020 14:04:54 -0400
Subject: [PATCH 2/2] add SCONJ to train/document.py remove_pos

---
 neuralcoref/train/document.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neuralcoref/train/document.py b/neuralcoref/train/document.py
index eaa22dd..044e8df 100644
--- a/neuralcoref/train/document.py
+++ b/neuralcoref/train/document.py
@@ -99,7 +99,7 @@ def _extract_from_sent(doc, span, blacklist=True, debug=False):
     keep_dep = ["nsubj", "dobj", "iobj", "pobj"]
     nsubj_or_dep = ["nsubj", "dep"]
     conj_or_prep = ["conj", "prep"]
-    remove_pos = ["CCONJ", "INTJ", "ADP"]
+    remove_pos = ["CCONJ", "SCONJ", "INTJ", "ADP"]
     lower_not_end = ["'s", ",", ".", "!", "?", ":", ";"]

     # Utility to remove bad endings