diff --git a/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java b/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java
index 32cb0397ea4b..b3cb95631d91 100644
--- a/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java
+++ b/languagetool-core/src/main/java/org/languagetool/rules/CleanOverlappingFilter.java
@@ -56,8 +56,19 @@ public final List<RuleMatch> filter(List<RuleMatch> ruleMatches) {
         throw new IllegalArgumentException(
             "The list of rule matches is not ordered. Make sure it is sorted by start position.");
       }
+      // juxtaposed errors adding a comma in the same place
+      boolean isJuxtaposedComma = false;
+      if (ruleMatch.getFromPos() == prevRuleMatch.getToPos()
+          && !ruleMatch.getSuggestedReplacements().isEmpty()
+          && !prevRuleMatch.getSuggestedReplacements().isEmpty()) {
+        String suggestion = ruleMatch.getSuggestedReplacements().get(0);
+        String prevSuggestion = prevRuleMatch.getSuggestedReplacements().get(0);
+        if (prevSuggestion.endsWith(",") && suggestion.startsWith(", ")) {
+          isJuxtaposedComma = true;
+        }
+      }
       // no overlapping (juxtaposed errors are not removed)
-      if (ruleMatch.getFromPos() >= prevRuleMatch.getToPos()) {
+      if (ruleMatch.getFromPos() >= prevRuleMatch.getToPos() && !isJuxtaposedComma) {
         cleanList.add(prevRuleMatch);
         prevRuleMatch = ruleMatch;
         continue;
diff --git a/languagetool-core/src/test/java/org/languagetool/rules/CleanOverlappingFilterTest.java b/languagetool-core/src/test/java/org/languagetool/rules/CleanOverlappingFilterTest.java
index 1d5361c71338..750fec82f5b2 100644
--- a/languagetool-core/src/test/java/org/languagetool/rules/CleanOverlappingFilterTest.java
+++ b/languagetool-core/src/test/java/org/languagetool/rules/CleanOverlappingFilterTest.java
@@ -135,6 +135,34 @@ public void testFilter() {
     assertThat(matches13.get(0).getRule().getId(), is("P1_RULE")); // hidden match should be kept
     */
 
+    // juxtaposed matches, comma in the same place
+    RuleMatch ruleMatch1 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY"), sentence, 5, 10, "msg1");
+    ruleMatch1.addSuggestedReplacement("right,");
+    RuleMatch ruleMatch2 = new RuleMatch(new FakeRule("COMMA_HIGH_PRIORITY"), sentence, 10, 15, "msg2");
+    ruleMatch2.addSuggestedReplacement(", left");
+    List<RuleMatch> matches14 = Arrays.asList(ruleMatch1, ruleMatch2);
+    matches14 = filter.filter(matches14);
+    assertThat(matches14.size(), is(1)); // filtering
+    assertThat(matches14.get(0).getRule().getId(), is("COMMA_HIGH_PRIORITY"));
+
+    RuleMatch ruleMatch3 = new RuleMatch(new FakeRule("COMMA_HIGH_PRIORITY"), sentence, 5, 10, "msg1");
+    ruleMatch3.addSuggestedReplacement("right,");
+    RuleMatch ruleMatch4 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY"), sentence, 10, 15, "msg2");
+    ruleMatch4.addSuggestedReplacement(", left");
+    List<RuleMatch> matches15 = Arrays.asList(ruleMatch3, ruleMatch4);
+    matches15 = filter.filter(matches15);
+    assertThat(matches15.size(), is(1)); // filtering
+    assertThat(matches15.get(0).getRule().getId(), is("COMMA_HIGH_PRIORITY"));
+
+    RuleMatch ruleMatch5 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY2"), sentence, 5, 10, "msg1");
+    ruleMatch5.addSuggestedReplacement("right,");
+    RuleMatch ruleMatch6 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY"), sentence, 10, 15, "msg2");
+    ruleMatch6.addSuggestedReplacement(", left");
+    List<RuleMatch> matches16 = Arrays.asList(ruleMatch5, ruleMatch6);
+    matches16 = filter.filter(matches16);
+    assertThat(matches16.size(), is(1)); // filtering
+    assertThat(matches16.get(0).getRule().getId(), is("COMMA_LOW_PRIORITY"));
+
     try {
       List<RuleMatch> unordered = Arrays.asList(
         new RuleMatch(new FakeRule("P1_RULE"), sentence, 11, 12, "msg2"),
@@ -162,10 +190,13 @@ protected int getPriorityForId(String id) {
           case "P3_RULE": return 3;
           case "P2_RULE":
           case "P2_PREMIUM_RULE":
+          case "COMMA_HIGH_PRIORITY":
             return 2;
           case "P1_RULE":
           case "P1_RULE_B":
           case "P1_PREMIUM_RULE":
+          case "COMMA_LOW_PRIORITY":
+          case "COMMA_LOW_PRIORITY2":
             return 1;
           case "MISC": return 0;
           default: throw new RuntimeException("No priority defined for " + id);
diff --git a/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java b/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java
index f2060cf1ce3b..e1a58081b834 100644
--- a/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java
+++ b/languagetool-language-modules/ca/src/main/java/org/languagetool/language/Catalan.java
@@ -100,7 +100,8 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
             new CatalanRepeatedWordsRule(messages),
             new SimpleReplaceDNVRule(messages, this),
             new SimpleReplaceDNVColloquialRule(messages, this),
-            new SimpleReplaceDNVSecondaryRule(messages, this)
+            new SimpleReplaceDNVSecondaryRule(messages, this),
+            new RemotePunctuationRule(messages, userConfig)
     );
   }
 
@@ -226,6 +227,7 @@ protected int getPriorityForId(String id) {
       case "MUNDAR": return -50;
       case "NOMBRES_ROMANS": return -90;
       case "MORFOLOGIK_RULE_CA_ES": return -100;
+      case "CA_REMOTE_PUNCTUATION_RULE": return -100;
       case "EXIGEIX_ACCENTUACIO_VALENCIANA": return -120;
       case "PHRASE_REPETITION": return -150;
       case "SUBSTANTIUS_JUNTS": return -150;
diff --git a/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/RemotePunctuationRule.java b/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/RemotePunctuationRule.java
new file mode 100644
index 000000000000..e07806c89027
--- /dev/null
+++ b/languagetool-language-modules/ca/src/main/java/org/languagetool/rules/ca/RemotePunctuationRule.java
@@ -0,0 +1,260 @@
+package org.languagetool.rules.ca;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.lang3.StringUtils;
+import org.languagetool.AnalyzedSentence;
+import org.languagetool.AnalyzedTokenReadings;
+import org.languagetool.JLanguageTool;
+import org.languagetool.UserConfig;
+import org.languagetool.language.Catalan;
+import org.languagetool.rules.Categories;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.rules.TextLevelRule;
+import org.languagetool.tools.StringTools;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.ResourceBundle;
+
+/**
+ * Text-level rule that sends the whole text to a remote punctuation service and
+ * reports a match for every comma the service added.
+ * <p>
+ * The service URL is read from the {@code CA_PUNCT_SERVER} environment variable.
+ * When it is not set the text is treated as unchanged and no match is produced.
+ */
+public class RemotePunctuationRule extends TextLevelRule {
+
+  private static final Logger logger = LoggerFactory.getLogger(RemotePunctuationRule.class);
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+  private static final String MISSING_COMMA_MESSAGE = "Probablement hi falta una coma";
+
+  static final int TIMEOUT_MS = 2000;
+
+  private final UserConfig userConfig;
+
+  String server_url;
+  boolean ab_test = false; // AB test disabled, opened 100%
+
+  public RemotePunctuationRule(ResourceBundle messages, UserConfig userConfig) {
+    super.setCategory(Categories.PUNCTUATION.getCategory(messages));
+    this.userConfig = userConfig;
+    server_url = System.getenv("CA_PUNCT_SERVER");
+  }
+
+  public void setABTest(boolean _ab_test) {
+    ab_test = _ab_test;
+  }
+
+  /**
+   * Opens a POST connection for a form-encoded body of {@code contentLength} bytes.
+   *
+   * @return the configured connection, or {@code null} if it could not be created
+   */
+  private HttpURLConnection createConnection(URL url, int contentLength) {
+    try {
+      HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+      connection.setRequestMethod("POST");
+      connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
+      connection.setRequestProperty("Content-Length", Integer.toString(contentLength));
+      connection.setUseCaches(false);
+      connection.setDoOutput(true);
+      connection.setConnectTimeout(TIMEOUT_MS);
+      connection.setReadTimeout(TIMEOUT_MS);
+      return connection;
+    } catch (Exception e) {
+      logger.error("Could not connect remote service at " + url + " for punctuation service", e);
+      return null;
+    }
+  }
+
+  /**
+   * Sends {@code inputText} to the punctuation service and returns its answer.
+   *
+   * @return {@code inputText} unchanged when {@code url} is empty; otherwise the
+   *         {@code text} field of the JSON response, or {@code null} when the
+   *         request fails
+   */
+  public String connectRemoteServer(String url, String inputText) {
+    if (StringUtils.isEmpty(url)) {
+      return inputText;
+    }
+
+    HttpURLConnection connection = null;
+    try {
+      String urlParameters = "text=" + URLEncoder.encode(inputText, StandardCharsets.UTF_8.name());
+      byte[] payload = urlParameters.getBytes(StandardCharsets.UTF_8);
+
+      connection = createConnection(new URL(url), payload.length);
+      if (connection == null) {
+        return null;
+      }
+
+      // Send request; try-with-resources closes the stream even if the write fails
+      try (DataOutputStream out = new DataOutputStream(connection.getOutputStream())) {
+        out.write(payload);
+      }
+
+      // Get response
+      String response = StringTools.streamToString(connection.getInputStream(), "UTF-8");
+      Map<?, ?> map = MAPPER.readValue(response, Map.class);
+      return (String) map.get("text");
+    } catch (Exception e) {
+      logger.error("Error while talking to remote service at " + url + " for punctuation service", e);
+      return null;
+    } finally {
+      if (connection != null) {
+        connection.disconnect();
+      }
+    }
+  }
+
+  private String getTextFromAnalyzedSentences(List<AnalyzedSentence> sentences) {
+    StringBuilder text = new StringBuilder();
+    for (AnalyzedSentence sentence : sentences) {
+      text.append(getTextFromAnalyzedSentence(sentence));
+    }
+    return text.toString();
+  }
+
+  private String getTextFromAnalyzedSentence(AnalyzedSentence sentence) {
+    StringBuilder text = new StringBuilder();
+    for (AnalyzedTokenReadings analyzedToken : sentence.getTokens()) {
+      text.append(analyzedToken.getToken());
+    }
+    return text.toString();
+  }
+
+  /**
+   * Tells whether the current session is excluded from the A/B test.
+   * Sessions without configuration or session id are always excluded.
+   */
+  private boolean isSessionInControlGroup() {
+    if (userConfig == null) {
+      return true;
+    }
+    Long textSessionId = userConfig.getTextSessionId();
+    if (textSessionId == null) {
+      return true;
+    }
+    return textSessionId % 20 != 0; // the rule runs for 5% of the sessions
+  }
+
+  @Override
+  public RuleMatch[] match(List<AnalyzedSentence> sentences) throws IOException {
+    try {
+      if (ab_test && isSessionInControlGroup()) {
+        return toRuleMatchArray(new ArrayList<>());
+      }
+      return doRule(sentences);
+    } catch (Exception e) {
+      logger.error("Error while processing rule", e);
+      return toRuleMatchArray(new ArrayList<>());
+    }
+  }
+
+  /** Selects text until the end of the next word. It can be more than one token (e.g. 'del'). */
+  private String getUntilEndOfNextWord(AnalyzedTokenReadings[] tokens, int idx) {
+    StringBuilder word = new StringBuilder();
+    // leading whitespace
+    for (; idx < tokens.length && tokens[idx].isWhitespace(); idx++) {
+      word.append(tokens[idx].getToken());
+    }
+    // the word itself, up to the next whitespace
+    for (; idx < tokens.length && !tokens[idx].isWhitespace(); idx++) {
+      word.append(tokens[idx].getToken());
+    }
+    return word.toString();
+  }
+
+  private RuleMatch[] doRule(List<AnalyzedSentence> sentences) throws IOException {
+    List<RuleMatch> ruleMatches = new ArrayList<>();
+
+    String allText = getTextFromAnalyzedSentences(sentences);
+    String allCorrected = connectRemoteServer(server_url, allText);
+    if (allCorrected == null || allCorrected.equals(allText)) {
+      // no answer or nothing changed: skip re-analysing the text
+      return toRuleMatchArray(ruleMatches);
+    }
+
+    JLanguageTool lt = new JLanguageTool(new Catalan());
+    List<AnalyzedSentence> correctedSentences = lt.analyzeText(allCorrected);
+    if (correctedSentences.size() != sentences.size()) {
+      logger.debug("Sentences lists with different length: {} - {}", correctedSentences.size(), sentences.size());
+      return toRuleMatchArray(ruleMatches);
+    }
+
+    int sentenceOffset = 0;
+    for (int idx = 0; idx < sentences.size(); idx++) {
+      AnalyzedSentence originalSentence = sentences.get(idx);
+      AnalyzedSentence correctedSentence = correctedSentences.get(idx);
+      String originalSentenceText = getTextFromAnalyzedSentence(originalSentence);
+      if (!originalSentenceText.equals(getTextFromAnalyzedSentence(correctedSentence))) {
+        addCommaMatches(originalSentence, correctedSentence, sentenceOffset, ruleMatches);
+      }
+      sentenceOffset += originalSentenceText.length();
+    }
+    return toRuleMatchArray(ruleMatches);
+  }
+
+  /** Walks both token lists in parallel and adds a match wherever the corrected one has an extra comma. */
+  private void addCommaMatches(AnalyzedSentence originalSentence, AnalyzedSentence correctedSentence,
+                               int sentenceOffset, List<RuleMatch> ruleMatches) {
+    AnalyzedTokenReadings[] originalTokens = originalSentence.getTokens();
+    AnalyzedTokenReadings[] correctedTokens = correctedSentence.getTokens();
+
+    for (int idxO = 0, idxC = 0; idxO < originalTokens.length && idxC < correctedTokens.length; idxO++, idxC++) {
+      AnalyzedTokenReadings originalToken = originalTokens[idxO];
+      AnalyzedTokenReadings correctedToken = correctedTokens[idxC];
+      String originalTokenText = originalToken.getToken();
+      String correctedTokenText = correctedToken.getToken();
+
+      if (originalTokenText.equals(correctedTokenText)) {
+        continue;
+      }
+
+      if (correctedTokenText.equals(",")) {
+        String nextToken = getUntilEndOfNextWord(originalTokens, idxO + 1);
+        int start = sentenceOffset + originalToken.getStartPos();
+        // the match covers exactly the text the suggestion replaces
+        int end = start + originalTokenText.length() + nextToken.length();
+
+        RuleMatch ruleMatch = new RuleMatch(this, originalSentence, start, end,
+            MISSING_COMMA_MESSAGE, MISSING_COMMA_MESSAGE);
+        ruleMatch.addSuggestedReplacement(correctedTokenText + originalTokenText + nextToken);
+        ruleMatches.add(ruleMatch);
+        idxC++; // the added comma has no counterpart in the original tokens
+        continue;
+      }
+
+      // the corrected text may contain fewer spaces than the original
+      if (originalToken.isWhitespace() && !correctedToken.isWhitespace()) {
+        idxC--;
+      }
+    }
+  }
+
+  @Override
+  public final String getId() {
+    return "CA_REMOTE_PUNCTUATION_RULE";
+  }
+
+  @Override
+  public String getDescription() {
+    return "Detecta errors de puntuació usant un servei remot";
+  }
+
+  @Override
+  public int minToCheckParagraph() {
+    return -1;
+  }
+}
diff --git a/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/RemotePunctuationRuleTest.java b/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/RemotePunctuationRuleTest.java
new file mode 100644
index 000000000000..cebd4aa0c13e
--- /dev/null
+++ b/languagetool-language-modules/ca/src/test/java/org/languagetool/rules/ca/RemotePunctuationRuleTest.java
@@ -0,0 +1,92 @@
+package org.languagetool.rules.ca;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.languagetool.JLanguageTool;
+import org.languagetool.TestTools;
+import org.languagetool.language.Catalan;
+import org.languagetool.rules.RuleMatch;
+import org.languagetool.*;
+import java.util.ResourceBundle;
+import org.languagetool.AnalyzedSentence;
+import java.util.List;
+import java.util.ArrayList;
+
+public class RemotePunctuationRuleTest {
+
+  class RemotePunctuationRuleForTest extends RemotePunctuationRule {
+
+    public RemotePunctuationRuleForTest(ResourceBundle messages, UserConfig userConfig) throws IOException {
+      super(messages, userConfig);
+      setABTest(false);
+    }
+
+    public String connectRemoteServer(String url, String text) {
+      if (text.equals("Això però ningú ho sap")) {
+        return "Això, però ningú ho sap";
+      }
+
+      if (text.equals("Això però ningú, ho sap")) {
+        return"Això però ningú ho sap";
+      }
+
+      if (text.equals("Això vol dir una cosa allò una altra")) {
+        return "Això vol dir una cosa, allò, una altra";
+      }
+
+      if (text.equals("Apropament a Europa del qual el Kremlin sempre ha recelat.")) {
+        return "Apropament a Europa, del qual el Kremlin sempre ha recelat.";
+      }
+
+      return text;
+    }
+  }
+
+  private RemotePunctuationRuleForTest rule;
+  private JLanguageTool lt;
+
+  @Before
+  public void setUp() throws IOException {
+    rule = new RemotePunctuationRuleForTest(TestTools.getEnglishMessages(), null);
+    lt = new JLanguageTool(new Catalan());
+  }
+
+  private List<AnalyzedSentence> getAnalyzedSentence(String text) throws IOException {
+    List<AnalyzedSentence> sentences = new ArrayList<>();
+    AnalyzedSentence sentence = lt.getAnalyzedSentence(text);
+    sentences.add(sentence);
+    return sentences;
+  }
+
+  @Test
+  public void testRuleNoCommas() throws IOException {
+
+    assertEquals(0, rule.match(getAnalyzedSentence("Text sense errors")).length);
+  }
+
+
+  @Test
+  public void testRuleAddCommas() throws IOException {
+
+    RuleMatch[] matches = rule.match(getAnalyzedSentence("Això però ningú ho sap"));
+    assertEquals(1, matches.length);
+    assertEquals(4, matches[0].getFromPos());
+    assertEquals(9, matches[0].getToPos());
+
+    assertEquals(2, rule.match(getAnalyzedSentence("Això vol dir una cosa allò una altra")).length);
+  }
+
+  @Test
+  public void testRuleAddCommasTokenSelection() throws IOException {
+
+    RuleMatch[] matches = rule.match(getAnalyzedSentence("Apropament a Europa del qual el Kremlin sempre ha recelat."));
+    assertEquals(1, matches.length);
+    assertEquals(19, matches[0].getFromPos());
+    assertEquals(23, matches[0].getToPos());
+  }
+
+}