From 9893c3033cf75848775aa014b71e69fbc7fba7b4 Mon Sep 17 00:00:00 2001
From: Ayrton Massey <ayrtonmassey@googlemail.com>
Date: Thu, 20 Aug 2015 15:59:14 +0100
Subject: [PATCH] Enable CoNLL output.

This patch adds the CoNLL output of Stanford CoreNLP to the JSON annotation.

The data is returned in two forms:

 - In its raw form as `conll_raw`, in the same format as given when CoreNLP is run
   from the command line using the flag `-outputFormat conll`

 - Per-sentence as `deps_conll`, a list of per-token entries (`index`, `word`,
   `lemma`, `pos`, `ner`, `dephead`, `deprel`) added to each sentence.

To enable the CoNLL output, pass `"outputFormat": "conll"` in the
`configdict` when creating a new `CoreNLP` instance.
---
 .../javasrc/corenlp/JsonPipeline.java         | 68 ++++++++++++++++++-
 1 file changed, 65 insertions(+), 3 deletions(-)
diff --git a/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java b/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java
index d35885b..29cace1 100644
--- a/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java
+++ b/stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java
@@ -1,6 +1,7 @@
 package corenlp;
 
 import java.io.FileInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.ArrayList;
@@ -36,6 +37,7 @@
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.IndexedWord;
 import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.CoNLLOutputter;
 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 // paths for stanford 3.2.0.  before that, it's e.s.nlp.trees.semgraph.SemanticGraph
 import edu.stanford.nlp.semgraph.SemanticGraph;
@@ -281,6 +283,53 @@ void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, Str
 		}
 	}
 
+	String getCoNLLRaw(Annotation document) throws IOException {
+		/* Print the document through CoNLLOutputter into an in-memory buffer; ByteArrayOutputStream grows as needed, so no initial size is required */
+		ByteArrayOutputStream baos = new ByteArrayOutputStream();
+		CoNLLOutputter.conllPrint(document,baos);
+		return baos.toString(); // NOTE(review): decodes with the platform default charset - confirm this matches the encoding conllPrint writes with
+	}
+
+	void addCoNLLAnnoToSentences(List<Map> outSentences, String conll_raw) throws NumberFormatException { // Parses conll_raw and puts a "deps_conll" token list on the maps in outSentences, in order
+		int sentence_index = 0;
+
+		Map<String,Object> sent_info = outSentences.get(sentence_index); // NOTE(review): no guard here - an empty outSentences makes this get(0) throw IndexOutOfBoundsException
+		
+		List<Map> conll_deps = Lists.newArrayList();
+
+		/* Walk conll_raw line by line, collecting one map per non-blank line into conll_deps */
+		String[] lines = conll_raw.split("\n"); // String.split drops trailing empty strings, so blank lines at the very end are not iterated
+		for(String line: lines) {
+			if(line.trim().length() == 0) {	// Blank line ends the current sentence: store its list, then move on to the next sentence map
+				sent_info.put("deps_conll",conll_deps);
+				sentence_index++;
+				sent_info = outSentences.get(sentence_index); // NOTE(review): throws IndexOutOfBoundsException if conll_raw has more blank-line-separated groups than outSentences has entries
+				conll_deps = Lists.newArrayList();
+			} else {
+				String[] fields = line.split("\t"); // Columns are tab-separated
+
+				Map<String,Object> conll_fields = Maps.newHashMap();
+
+				/* Add the 7 fields: INDEX, WORD, LEMMA, POS, NER, DEPHEAD, DEPREL (indexes fields[0..6] without a length check, so a shorter line fails) */
+				conll_fields.put("index", Integer.parseInt(fields[0]));
+				conll_fields.put("word", fields[1]);
+				conll_fields.put("lemma", fields[2]);
+				conll_fields.put("pos", fields[3]);
+				conll_fields.put("ner", fields[4]);
+				if(!fields[5].equals("_")) { // "_" is stored as-is (a String); any other value is parsed as an integer
+					conll_fields.put("dephead", Integer.parseInt(fields[5]));
+				} else {
+					conll_fields.put("dephead", fields[5]);
+				}
+				conll_fields.put("deprel", fields[6]);
+				
+				conll_deps.add(conll_fields);
+			}
+		}
+		
+		sent_info.put("deps_conll",conll_deps); // Store whatever was collected after the last blank line seen (the final sentence)
+	}
+
 	String[] annotators() {
 		String annotatorsAllstr = (String) props.get("annotators");
 		if (annotatorsAllstr==null || annotatorsAllstr.trim().isEmpty()) {
@@ -299,6 +348,8 @@ JsonNode processTextDocument(String doctext) {
 		Annotation document = new Annotation(doctext);
 		pipeline.annotate(document);
 
+		ImmutableMap.Builder b = new ImmutableMap.Builder();
+
 		List<CoreMap> sentences = document.get(SentencesAnnotation.class);
 		List<Map> outSentences = Lists.newArrayList();
 
@@ -312,11 +363,22 @@ JsonNode processTextDocument(String doctext) {
 			outSentences.add(sent_info);
 		}
 
+		if(props.getProperty("outputFormat","default").equals("conll")) {
+			try {
+				String conll_raw = getCoNLLRaw(document);
+				b.put("conll_raw",conll_raw);
 
-		ImmutableMap.Builder b = new ImmutableMap.Builder();
-//		b.put("text", doctext);
+				addCoNLLAnnoToSentences(outSentences,conll_raw);
+			} catch (IOException ex) { // annotation will not occur if getCoNLLRaw() fails - abandon?
+			    ex.printStackTrace();
+			} catch(NumberFormatException ex) { // some annotation may have occurred - abandon?
+			    ex.printStackTrace();
+			}
+		}
+
+		//b.put("text", doctext);
 		b.put("sentences", outSentences);
-		
+
 		if (Lists.newArrayList(annotators()).contains("dcoref")) {
 			List outCoref = getCorefInfo(document);
 			b.put("entities", outCoref);