Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catalan remote punctuation #1

Open
wants to merge 43 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
47abac9
Initial version
jordimas Apr 15, 2022
d9a7998
Extract open method
jordimas Apr 15, 2022
2631eb3
Logging
jordimas Apr 15, 2022
d3fd512
Unit tests
jordimas Apr 15, 2022
a9c4699
Fixes
jordimas Apr 15, 2022
e999f38
Timeout read
jordimas Apr 15, 2022
af2d698
Read json
jordimas Apr 15, 2022
36c38ac
Textlevel rule
jordimas Apr 16, 2022
7dfc158
Restore trailing plus position in multiline
jordimas Apr 16, 2022
0ea9e06
Move to tokens
jordimas Apr 16, 2022
db246d3
One server call
jordimas Apr 16, 2022
a09b639
Fixes
jordimas Apr 17, 2022
06ef0ce
Fix UT
jordimas Apr 17, 2022
21f4634
Use var
jordimas Apr 17, 2022
79ce826
UT
jordimas Apr 18, 2022
9efb665
Fix
jordimas Apr 18, 2022
663eaf9
Unnecessary code
jordimas Apr 20, 2022
ab3d2a8
Unused code
jordimas Apr 20, 2022
0ec689f
Fix
jordimas Apr 20, 2022
12f591a
Return on error
jordimas Apr 20, 2022
b3f1554
Timeout 250
jordimas Apr 20, 2022
89e21d3
Exception
jordimas Apr 20, 2022
6af8fad
2s
jordimas Apr 20, 2022
7cb33bd
Fix encoding
jordimas Apr 25, 2022
95356a4
Fix
jordimas Apr 25, 2022
f2469ca
Fix
jordimas Apr 25, 2022
ea178b2
Token selection when adding
jordimas Apr 27, 2022
12c5328
Remove also
jordimas Apr 27, 2022
c5ad2ec
var lower case
jordimas Apr 29, 2022
194c685
Only new && out of sync
jordimas May 8, 2022
58f13e1
SessionID
jordimas Jul 11, 2022
4adbb7c
Merge tag 'v4.8' into catalan-remote-punctuation
jordimas Jul 11, 2022
f3d25b3
Fixes
jordimas Jul 11, 2022
3701a98
Fix
jordimas Jul 11, 2022
ea74762
A/b test
jordimas Jul 17, 2022
f64367f
Fix group
jordimas Jul 20, 2022
19fb517
Control group
jordimas Jul 23, 2022
f73375f
Prio
jordimas Jul 23, 2022
f3576b5
5%
jordimas Jul 23, 2022
7b7928c
Fix UT + remove unused code
jordimas Jul 25, 2022
13a34e3
Open to 100%
jordimas Aug 7, 2022
dcff3dc
CleanOverlappingFilter: overlapping comma suggestions
jaumeortola Jul 21, 2022
d37f4a0
Prio -100
jordimas Aug 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,19 @@ public final List<RuleMatch> filter(List<RuleMatch> ruleMatches) {
throw new IllegalArgumentException(
"The list of rule matches is not ordered. Make sure it is sorted by start position.");
}
// juxtaposed errors adding a comma in the same place
boolean isJuxtaposedComma = false;
if (ruleMatch.getFromPos() == prevRuleMatch.getToPos()
&& ruleMatch.getSuggestedReplacements().size() > 0
&& prevRuleMatch.getSuggestedReplacements().size() > 0) {
String suggestion = ruleMatch.getSuggestedReplacements().get(0);
String prevSuggestion = prevRuleMatch.getSuggestedReplacements().get(0);
if (prevSuggestion.endsWith(",") && suggestion.startsWith(", ")) {
isJuxtaposedComma = true;
}
}
// no overlapping (juxtaposed errors are not removed)
if (ruleMatch.getFromPos() >= prevRuleMatch.getToPos()) {
if (ruleMatch.getFromPos() >= prevRuleMatch.getToPos() && !isJuxtaposedComma) {
cleanList.add(prevRuleMatch);
prevRuleMatch = ruleMatch;
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,34 @@ public void testFilter() {
assertThat(matches13.get(0).getRule().getId(), is("P1_RULE")); // hidden match should be kept
*/

// juxtaposed matches, comma in the same place
RuleMatch ruleMatch1 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY"), sentence, 5, 10, "msg1");
ruleMatch1.addSuggestedReplacement("right,");
RuleMatch ruleMatch2 = new RuleMatch(new FakeRule("COMMA_HIGH_PRIORITY"), sentence, 10, 15, "msg2");
ruleMatch2.addSuggestedReplacement(", left");
List<RuleMatch> matches14 = Arrays.asList(ruleMatch1, ruleMatch2);
matches14 = filter.filter(matches14);
assertThat(matches14.size(), is(1)); // filtering
assertThat(matches14.get(0).getRule().getId(), is("COMMA_HIGH_PRIORITY"));

RuleMatch ruleMatch3 = new RuleMatch(new FakeRule("COMMA_HIGH_PRIORITY"), sentence, 5, 10, "msg1");
ruleMatch3.addSuggestedReplacement("right,");
RuleMatch ruleMatch4 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY"), sentence, 10, 15, "msg2");
ruleMatch4.addSuggestedReplacement(", left");
List<RuleMatch> matches15 = Arrays.asList(ruleMatch3, ruleMatch4);
matches15 = filter.filter(matches15);
assertThat(matches15.size(), is(1)); // filtering
assertThat(matches15.get(0).getRule().getId(), is("COMMA_HIGH_PRIORITY"));

RuleMatch ruleMatch5 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY2"), sentence, 5, 10, "msg1");
ruleMatch5.addSuggestedReplacement("right,");
RuleMatch ruleMatch6 = new RuleMatch(new FakeRule("COMMA_LOW_PRIORITY"), sentence, 10, 15, "msg2");
ruleMatch6.addSuggestedReplacement(", left");
List<RuleMatch> matches16 = Arrays.asList(ruleMatch5, ruleMatch6);
matches16 = filter.filter(matches16);
assertThat(matches16.size(), is(1)); // filtering
assertThat(matches16.get(0).getRule().getId(), is("COMMA_LOW_PRIORITY"));

try {
List<RuleMatch> unordered = Arrays.asList(
new RuleMatch(new FakeRule("P1_RULE"), sentence, 11, 12, "msg2"),
Expand Down Expand Up @@ -162,10 +190,13 @@ protected int getPriorityForId(String id) {
case "P3_RULE": return 3;
case "P2_RULE":
case "P2_PREMIUM_RULE":
case "COMMA_HIGH_PRIORITY":
return 2;
case "P1_RULE":
case "P1_RULE_B":
case "P1_PREMIUM_RULE":
case "COMMA_LOW_PRIORITY":
case "COMMA_LOW_PRIORITY2":
return 1;
case "MISC": return 0;
default: throw new RuntimeException("No priority defined for " + id);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ public List<Rule> getRelevantRules(ResourceBundle messages, UserConfig userConfi
new CatalanRepeatedWordsRule(messages),
new SimpleReplaceDNVRule(messages, this),
new SimpleReplaceDNVColloquialRule(messages, this),
new SimpleReplaceDNVSecondaryRule(messages, this)
new SimpleReplaceDNVSecondaryRule(messages, this),
new RemotePunctuationRule(messages, userConfig)
);
}

Expand Down Expand Up @@ -226,6 +227,7 @@ protected int getPriorityForId(String id) {
case "MUNDAR": return -50;
case "NOMBRES_ROMANS": return -90;
case "MORFOLOGIK_RULE_CA_ES": return -100;
case "CA_REMOTE_PUNCTUATION_RULE": return -100;
case "EXIGEIX_ACCENTUACIO_VALENCIANA": return -120;
case "PHRASE_REPETITION": return -150;
case "SUBSTANTIUS_JUNTS": return -150;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
package org.languagetool.rules.ca;

import org.languagetool.AnalyzedSentence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import org.languagetool.rules.*;
import java.net.URLEncoder;
import org.languagetool.AnalyzedTokenReadings;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.languagetool.JLanguageTool;
import org.languagetool.language.Catalan;
import org.apache.commons.lang3.StringUtils;
import java.nio.charset.Charset;
import org.languagetool.tools.StringTools;
import org.languagetool.UserConfig;


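/**
 * Text-level rule for Catalan that sends the whole text to a remote punctuation
 * service (its URL is read from the CA_PUNCT_SERVER environment variable) and
 * turns the commas added by the service into rule matches.
 */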
public class RemotePunctuationRule extends TextLevelRule {

// SLF4J logger used for all diagnostics of this rule.
private static final Logger logger = LoggerFactory.getLogger(RemotePunctuationRule.class);
// User configuration handed to the constructor; its text session id is read when the A/B test is enabled.
private UserConfig userConfig;

// URL of the remote punctuation service; assigned in the constructor from the CA_PUNCT_SERVER environment variable.
String server_url;
// Timeout in milliseconds, applied both to connecting and to reading the response.
final int TIMEOUT_MS = 2000;
// When true, match() skips the rule for sessions that fall into the control group.
boolean ab_test = false; //AB test disabled, opened 100%

/**
 * Builds the rule in the punctuation category.
 *
 * @param messages resource bundle used to resolve the punctuation category
 * @param userConfig user configuration kept for the A/B-test session check; may be null
 */
public RemotePunctuationRule(ResourceBundle messages, UserConfig userConfig) {
  this.userConfig = userConfig;
  // The service address comes from the environment; it stays null when the variable is unset.
  this.server_url = System.getenv("CA_PUNCT_SERVER");
  super.setCategory(Categories.PUNCTUATION.getCategory(messages));
}

/**
 * Enables or disables the A/B-test gate that is consulted by {@code match}.
 *
 * @param _ab_test {@code true} to run the rule only for sessions outside the control group
 */
public void setABTest(boolean _ab_test) {
  this.ab_test = _ab_test;
}

/**
 * Prepares a form-encoded POST connection to the punctuation service.
 * Nothing is written to the connection here.
 *
 * @param url endpoint of the remote service
 * @param urlParameters request body that will be sent; only its byte length is used here,
 *                      to fill the Content-Length header
 * @return the configured connection, or {@code null} when it could not be set up
 *         (the failure is logged)
 */
private HttpURLConnection createConnection(URL url, String urlParameters) {
  try {
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    connection.setRequestMethod("POST");
    connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
    connection.setUseCaches(false);
    connection.setDoOutput(true);
    // The same limit is used for establishing the connection and for waiting on the answer.
    connection.setConnectTimeout(TIMEOUT_MS);
    connection.setReadTimeout(TIMEOUT_MS);
    // NOTE(review): getBytes() uses the platform default charset; this is harmless while the
    // body is pure ASCII (the visible caller URL-encodes it) — confirm if other callers appear.
    connection.setRequestProperty("Content-Length", Integer.toString(urlParameters.getBytes().length));
    return connection;
  } catch (Exception e) {
    // The logger receives the exception itself, so no separate printStackTrace() to stderr.
    logger.error("Could not connect remote service at {} for punctuation service", url, e);
    return null;
  }
}

/**
 * Sends {@code inputText} to the remote punctuation service and returns the text it answers with.
 *
 * <p>The text is posted as the form field {@code text}. The response body is parsed as a JSON
 * object and the value of its {@code text} entry is returned.
 *
 * @param url address of the service; when null or empty no request is made
 * @param inputText text to be sent to the service
 * @return {@code inputText} unchanged when {@code url} is null or empty; an empty string when
 *         the connection could not be created; {@code null} when the request or the parsing of
 *         the response fails (the error is logged); otherwise the {@code text} entry of the response
 */
public String connectRemoteServer(String url, String inputText) {
  if (StringUtils.isEmpty(url)) {
    return inputText;
  }

  HttpURLConnection connection = null;
  try {
    String urlParameters = "text=" + URLEncoder.encode(inputText, "utf-8");

    connection = createConnection(new URL(url), urlParameters);
    if (connection == null) {
      return "";
    }

    // Send request; try-with-resources closes the stream even when the write fails.
    try (DataOutputStream wr = new DataOutputStream(connection.getOutputStream())) {
      wr.writeBytes(urlParameters);
    }

    // Get response. Only the "text" entry is read: the previously unused cast of the "time"
    // entry to String could make the whole call fail for a non-string value.
    String response = StringTools.streamToString(connection.getInputStream(), "UTF-8");
    Map<?, ?> map = new ObjectMapper().readValue(response, Map.class);
    return (String) map.get("text");
  } catch (Exception e) {
    logger.error("Error while talking to remote service at {} for punctuation service", url, e);
    return null;
  } finally {
    if (connection != null) {
      connection.disconnect();
    }
  }
}

/**
 * Rebuilds the plain text of a list of sentences by concatenating the token strings
 * of every sentence, in order.
 *
 * @param sentences analyzed sentences to flatten
 * @return the concatenated token text of all sentences
 */
private String getTextFromAnalyzedSentences(List<AnalyzedSentence> sentences) {
  StringBuilder joined = new StringBuilder();
  for (AnalyzedSentence current : sentences) {
    // Same per-sentence concatenation as getTextFromAnalyzedSentence, written inline.
    for (AnalyzedTokenReadings reading : current.getTokens()) {
      joined.append(reading.getToken());
    }
  }
  return joined.toString();
}

/**
 * Rebuilds the plain text of one sentence by concatenating the strings of its tokens.
 *
 * @param sentence analyzed sentence to flatten
 * @return the concatenated token text
 */
private String getTextFromAnalyzedSentence(AnalyzedSentence sentence) {
  AnalyzedTokenReadings[] readings = sentence.getTokens();
  StringBuilder buffer = new StringBuilder();
  for (int i = 0; i < readings.length; i++) {
    buffer.append(readings[i].getToken());
  }
  return buffer.toString();
}

/**
 * Writes a rule match to the debug log.
 *
 * @param ruleMatch match to report
 */
private void ShowRuleMatch(RuleMatch ruleMatch) {
  // Goes through the logger instead of System.out so the output can be filtered or disabled.
  logger.debug("Rule: {}", ruleMatch);
}

/**
 * Decides whether the current text session belongs to the control group of the A/B test.
 *
 * <p>A session is outside the control group only when its id is a multiple of 20,
 * i.e. for 1 in 20 ids (5%).
 *
 * @return {@code true} when there is no user configuration, no session id, the id is not a
 *         multiple of 20, or the id could not be read; {@code false} otherwise
 */
private boolean IsSessionInControlGroup() {
  if (userConfig == null) {
    return true;
  }

  try {
    Long textSessionID = userConfig.getTextSessionId();
    if (textSessionID == null) {
      // Same result as before for a missing id, but without relying on an unboxing
      // NullPointerException that was caught and logged as an error.
      return true;
    }

    boolean inControlGroup = textSessionID % 20 != 0; // 5%
    logger.debug("SessionID: {} in control group: {}", textSessionID, inControlGroup);
    return inControlGroup;
  } catch (Exception e) {
    // Best effort: on any failure the session is treated as control group.
    logger.error("IsSessionInControlGroup error", e);
    return true;
  }
}

/**
 * Runs the rule on the given sentences.
 *
 * <p>When the A/B test is enabled and the session is in the control group, no matches are
 * produced. Any exception raised while processing is logged and also results in no matches.
 *
 * @param sentences analyzed sentences of the text
 * @return the matches found, or an empty array
 */
@Override
public RuleMatch[] match(List<AnalyzedSentence> sentences) throws IOException {
  try {
    boolean skippedByAbTest = ab_test && IsSessionInControlGroup();
    if (!skippedByAbTest) {
      return doRule(sentences);
    }
  } catch (Exception e) {
    logger.error("Error while processing rule", e);
  }
  // Reached for control-group sessions and after a logged failure.
  return toRuleMatchArray(new ArrayList<>());
}

/**
 * Collects the text from position {@code idx} up to the end of the next word: first any run of
 * whitespace tokens, then every following token until the next whitespace token or the end of
 * the array. The word can span more than one token (e.g. 'del').
 *
 * @param tokens tokens of the sentence
 * @param idx index of the first token to consider
 * @return the concatenated token text; empty when {@code idx} is at or past the end
 */
private String getUntilEndOfNextWord(AnalyzedTokenReadings[] tokens, int idx) {
  StringBuilder collected = new StringBuilder();
  int cursor = idx;

  // Leading whitespace, if any.
  while (cursor < tokens.length && tokens[cursor].isWhitespace()) {
    collected.append(tokens[cursor].getToken());
    cursor++;
  }

  // The word itself: everything up to the next whitespace token.
  while (cursor < tokens.length && !tokens[cursor].isWhitespace()) {
    collected.append(tokens[cursor].getToken());
    cursor++;
  }

  return collected.toString();
}

/**
 * Sends the text of all sentences to the remote service in a single call, re-analyzes the
 * answer and compares it with the original, sentence by sentence and token by token. Every
 * comma that appears only in the corrected text becomes a rule match.
 *
 * <p>No matches are returned when the service call fails, when the service returns the text
 * unchanged, or when the corrected text splits into a different number of sentences.
 *
 * @param sentences analyzed sentences of the original text
 * @return the matches for the added commas, possibly empty
 * @throws IOException if analyzing the corrected text fails
 */
private RuleMatch[] doRule(List<AnalyzedSentence> sentences) throws IOException {
  final List<RuleMatch> ruleMatches = new ArrayList<>();

  String allText = getTextFromAnalyzedSentences(sentences);
  String allCorrected = connectRemoteServer(server_url, allText);

  // Nothing to report on failure, or when the text came back unchanged (this is also what
  // connectRemoteServer returns when no server URL is configured). Returning here avoids
  // building a JLanguageTool and re-analyzing the text for no result.
  if (allCorrected == null || allCorrected.equals(allText)) {
    return toRuleMatchArray(ruleMatches);
  }

  logger.debug("Charset: {}", Charset.defaultCharset());
  logger.debug("Original :'{}'", allText);
  logger.debug("Corrected:'{}'", allCorrected);

  // NOTE(review): a new JLanguageTool is built on every call that reaches this point;
  // consider caching it if this shows up in profiles.
  JLanguageTool lt = new JLanguageTool(new Catalan());
  List<AnalyzedSentence> correctedSentences = lt.analyzeText(allCorrected);

  // Sentences are paired by index, so both lists must have the same size.
  if (correctedSentences.size() != sentences.size()) {
    logger.debug("Sentences lists with diferent length:{} - {}", correctedSentences.size(), sentences.size());
    return toRuleMatchArray(ruleMatches);
  }

  logger.debug("Sentences size: {}", sentences.size());

  // Offset of the current sentence inside the whole text, in characters.
  int sentenceOffset = 0;
  for (int idx = 0; idx < sentences.size(); idx++) {
    AnalyzedSentence originalSentence = sentences.get(idx);
    AnalyzedSentence correctedSentence = correctedSentences.get(idx);
    String originalSentenceText = getTextFromAnalyzedSentence(originalSentence);
    String correctedSentenceText = getTextFromAnalyzedSentence(correctedSentence);

    logger.debug("Original sentence:'{}'", originalSentenceText);
    logger.debug("Corrected sentence:'{}'", correctedSentenceText);

    if (!originalSentenceText.equals(correctedSentenceText)) {
      logger.debug("Not equal");

      AnalyzedTokenReadings[] originalTokens = originalSentence.getTokens();
      AnalyzedTokenReadings[] correctedTokens = correctedSentence.getTokens();

      // Walk both token arrays in parallel; idxC is adjusted below to keep them aligned.
      for (int idxO = 0, idxC = 0; idxO < originalTokens.length && idxC < correctedTokens.length; idxO++, idxC++) {
        AnalyzedTokenReadings originalToken = originalTokens[idxO];
        AnalyzedTokenReadings correctedToken = correctedTokens[idxC];

        String originalTokenText = originalToken.getToken();
        String correctedTokenText = correctedToken.getToken();

        if (originalTokenText.equals(correctedTokenText)) {
          continue;
        }

        if (correctedTokenText.equals(",")) {
          logger.debug("Added comma");
          String nextToken = getUntilEndOfNextWord(originalTokens, idxO + 1);
          int start = sentenceOffset + originalToken.getStartPos();
          // The suggestion replaces the original token plus the following word, so the
          // match must span exactly that text (previously the token was assumed to be
          // one character long).
          int length = originalTokenText.length() + nextToken.length();

          RuleMatch ruleMatch = new RuleMatch(this, originalSentence, start,
              start + length, "Probablement hi falta una coma", "Probablement hi falta una coma");

          String suggestion = correctedTokenText + originalTokenText + nextToken;
          logger.debug("Suggestion:'{}'", suggestion);
          ruleMatch.addSuggestedReplacement(suggestion);
          ShowRuleMatch(ruleMatch);
          ruleMatches.add(ruleMatch);
          // The comma exists only in the corrected tokens: skip it so that the current
          // original token is followed by its counterpart on the next iteration.
          idxC++;
          continue;
        }

        /* Target may contain less spaces than source */
        if (originalToken.isWhitespace() && !correctedToken.isWhitespace()) {
          logger.debug("Space out sync");
          // Stay on the same corrected token while the original advances past the space.
          idxC--;
          continue;
        }
      }
    }
    sentenceOffset += originalSentenceText.length();
  }
  return toRuleMatchArray(ruleMatches);
}

/**
 * Returns the fixed identifier of this rule.
 *
 * @return the string {@code CA_REMOTE_PUNCTUATION_RULE}
 */
@Override
public final String getId() {
  final String ruleId = "CA_REMOTE_PUNCTUATION_RULE";
  return ruleId;
}

/**
 * Returns the human-readable description of this rule (in Catalan).
 *
 * @return the description text
 */
@Override
public String getDescription() {
  final String description = "Detecta errors de puntuació usant un servei remot";
  return description;
}

/**
 * Always returns -1.
 * NOTE(review): presumably -1 requests the whole text rather than a window of
 * paragraphs — confirm against TextLevelRule.
 *
 * @return -1
 */
@Override
public int minToCheckParagraph() {
  final int wholeTextMarker = -1;
  return wholeTextMarker;
}
}
Loading