From 9893c3033cf75848775aa014b71e69fbc7fba7b4 Mon Sep 17 00:00:00 2001 From: Ayrton Massey Date: Thu, 20 Aug 2015 15:59:14 +0100 Subject: [PATCH] Enable CoNLL output. This patch adds the CoNLL output of Stanford CoreNLP to the JSON annotation. The data is returned in two forms: - In its raw form as `conll_raw`, in the same format as given when CoreNLP is run from the command line using the flag `-outputFormat conll` - Per-sentence as `deps_conll`, which adds CoNLL dependencies to each sentence. To enable the CoNLL output, pass `"outputFormat": "conll"` in the `configdict` when creating a new `CoreNLP` instance. --- .../javasrc/corenlp/JsonPipeline.java | 68 ++++++++++++++++++- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java b/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java index d35885b..29cace1 100644 --- a/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java +++ b/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java @@ -1,6 +1,7 @@ package corenlp; import java.io.FileInputStream; +import java.io.ByteArrayOutputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; @@ -36,6 +37,7 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.CoNLLOutputter; import edu.stanford.nlp.pipeline.StanfordCoreNLP; // paths for stanford 3.2.0. 
before that, it's e.s.nlp.trees.semgraph.SemanticGraph
 import edu.stanford.nlp.semgraph.SemanticGraph;
@@ -281,6 +283,85 @@ void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, Str
 		}
 	}
 
+	/**
+	 * Renders the annotated document in CoreNLP's CoNLL output format.
+	 *
+	 * @param document the annotation to print
+	 * @return everything CoNLLOutputter.conllPrint() wrote, as one string
+	 * @throws IOException if the outputter fails while writing
+	 */
+	String getCoNLLRaw(Annotation document) throws IOException {
+		// ByteArrayOutputStream grows on demand, so no initial capacity is needed.
+		ByteArrayOutputStream baos = new ByteArrayOutputStream();
+		CoNLLOutputter.conllPrint(document, baos);
+		// NOTE(review): decoded with the platform default charset - confirm this
+		// matches the encoding CoNLLOutputter writes with.
+		return baos.toString();
+	}
+
+	/**
+	 * Adds a "deps_conll" list to each sentence map, built from the rows of
+	 * the raw CoNLL text. Blank lines separate sentences; the n-th block of
+	 * rows is attached to the n-th entry of outSentences.
+	 *
+	 * @param outSentences per-sentence output maps to annotate
+	 * @param conll_raw    raw CoNLL text as returned by getCoNLLRaw()
+	 * @throws NumberFormatException if a row has fewer than 7 fields or a
+	 *         non-numeric INDEX / DEPHEAD
+	 */
+	void addCoNLLAnnoToSentences(List<Map<String,Object>> outSentences, String conll_raw) throws NumberFormatException {
+		int sentence_index = 0;
+		List<Map<String,Object>> conll_deps = Lists.newArrayList();
+
+		// Accept CRLF as well as LF so a stray '\r' never ends up in the last field.
+		for (String line : conll_raw.split("\r?\n")) {
+			if (line.trim().isEmpty()) { // Sentences separated by blank lines
+				attachCoNLLDeps(outSentences, sentence_index, conll_deps);
+				sentence_index++;
+				conll_deps = Lists.newArrayList();
+			} else {
+				conll_deps.add(parseCoNLLLine(line));
+			}
+		}
+
+		// split() drops trailing empty strings, so the final sentence is normally
+		// still pending here.
+		attachCoNLLDeps(outSentences, sentence_index, conll_deps);
+	}
+
+	/** Stores the rows on the sentence at the given index; indexes past the end of the list are ignored. */
+	private static void attachCoNLLDeps(List<Map<String,Object>> outSentences, int sentence_index, List<Map<String,Object>> conll_deps) {
+		if (sentence_index < outSentences.size()) {
+			outSentences.get(sentence_index).put("deps_conll", conll_deps);
+		}
+	}
+
+	/** Converts one tab-separated CoNLL row into a field map. */
+	private static Map<String,Object> parseCoNLLLine(String line) {
+		String[] fields = line.split("\t");
+		if (fields.length < 7) {
+			// Reported as NumberFormatException because that is what the caller handles.
+			throw new NumberFormatException("Expected 7 CoNLL fields but found " + fields.length + ": " + line);
+		}
+
+		Map<String,Object> conll_fields = Maps.newHashMap();
+
+		/* Add the 7 fields: INDEX, WORD, LEMMA, POS, NER, DEPHEAD, DEPREL */
+		conll_fields.put("index", Integer.parseInt(fields[0]));
+		conll_fields.put("word", fields[1]);
+		conll_fields.put("lemma", fields[2]);
+		conll_fields.put("pos", fields[3]);
+		conll_fields.put("ner", fields[4]);
+		if (!fields[5].equals("_")) {
+			conll_fields.put("dephead", Integer.parseInt(fields[5]));
+		} else {
+			// "_" is not a number, so it is passed through unchanged.
+			conll_fields.put("dephead", fields[5]);
+		}
+		conll_fields.put("deprel", fields[6]);
+		return conll_fields;
+	}
+
 	String[] annotators() {
 		String annotatorsAllstr = (String) props.get("annotators");
 		if (annotatorsAllstr==null || annotatorsAllstr.trim().isEmpty()) {
@@ -299,6 +380,8 @@ JsonNode processTextDocument(String doctext) {
 
 		Annotation document = new Annotation(doctext);
 		pipeline.annotate(document);
+
ImmutableMap.Builder b = new ImmutableMap.Builder(); + List sentences = document.get(SentencesAnnotation.class); List outSentences = Lists.newArrayList(); @@ -312,11 +363,22 @@ JsonNode processTextDocument(String doctext) { outSentences.add(sent_info); } + if(props.getProperty("outputFormat","default").equals("conll")) { + try { + String conll_raw = getCoNLLRaw(document); + b.put("conll_raw",conll_raw); - ImmutableMap.Builder b = new ImmutableMap.Builder(); -// b.put("text", doctext); + addCoNLLAnnoToSentences(outSentences,conll_raw); + } catch (IOException ex) { // annotation will not occur if getCoNLLRaw() fails - abandon? + ex.printStackTrace(); + } catch(NumberFormatException ex) { // some annotation may have occurred - abandon? + ex.printStackTrace(); + } + } + + //b.put("text", doctext); b.put("sentences", outSentences); - + if (Lists.newArrayList(annotators()).contains("dcoref")) { List outCoref = getCorefInfo(document); b.put("entities", outCoref);