Skip to content

Commit a6c2ba2

Browse files
committed
more features
1 parent 59a8fba commit a6c2ba2

File tree

10 files changed

+151
-18
lines changed

10 files changed

+151
-18
lines changed

assembly/dependency-reduced-pom.xml

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@
9090
<artifactId>breeze_2.12</artifactId>
9191
<groupId>org.scalanlp</groupId>
9292
</exclusion>
93+
<exclusion>
94+
<artifactId>commons-math3</artifactId>
95+
<groupId>org.apache.commons</groupId>
96+
</exclusion>
9397
<exclusion>
9498
<artifactId>jaxb-runtime</artifactId>
9599
<groupId>org.glassfish.jaxb</groupId>
@@ -110,6 +114,10 @@
110114
<version>3.1.2</version>
111115
<scope>provided</scope>
112116
<exclusions>
117+
<exclusion>
118+
<artifactId>paranamer</artifactId>
119+
<groupId>com.thoughtworks.paranamer</groupId>
120+
</exclusion>
113121
<exclusion>
114122
<artifactId>avro</artifactId>
115123
<groupId>org.apache.avro</groupId>
@@ -166,10 +174,18 @@
166174
<artifactId>jakarta.servlet-api</artifactId>
167175
<groupId>jakarta.servlet</groupId>
168176
</exclusion>
177+
<exclusion>
178+
<artifactId>commons-lang3</artifactId>
179+
<groupId>org.apache.commons</groupId>
180+
</exclusion>
169181
<exclusion>
170182
<artifactId>commons-text</artifactId>
171183
<groupId>org.apache.commons</groupId>
172184
</exclusion>
185+
<exclusion>
186+
<artifactId>jsr305</artifactId>
187+
<groupId>com.google.code.findbugs</groupId>
188+
</exclusion>
173189
<exclusion>
174190
<artifactId>jul-to-slf4j</artifactId>
175191
<groupId>org.slf4j</groupId>
@@ -178,6 +194,10 @@
178194
<artifactId>jcl-over-slf4j</artifactId>
179195
<groupId>org.slf4j</groupId>
180196
</exclusion>
197+
<exclusion>
198+
<artifactId>log4j</artifactId>
199+
<groupId>log4j</groupId>
200+
</exclusion>
181201
<exclusion>
182202
<artifactId>slf4j-log4j12</artifactId>
183203
<groupId>org.slf4j</groupId>
@@ -210,6 +230,10 @@
210230
<artifactId>scala-xml_2.12</artifactId>
211231
<groupId>org.scala-lang.modules</groupId>
212232
</exclusion>
233+
<exclusion>
234+
<artifactId>scala-reflect</artifactId>
235+
<groupId>org.scala-lang</groupId>
236+
</exclusion>
213237
<exclusion>
214238
<artifactId>json4s-jackson_2.12</artifactId>
215239
<groupId>org.json4s</groupId>
@@ -246,6 +270,10 @@
246270
<artifactId>stream</artifactId>
247271
<groupId>com.clearspring.analytics</groupId>
248272
</exclusion>
273+
<exclusion>
274+
<artifactId>metrics-core</artifactId>
275+
<groupId>io.dropwizard.metrics</groupId>
276+
</exclusion>
249277
<exclusion>
250278
<artifactId>metrics-jvm</artifactId>
251279
<groupId>io.dropwizard.metrics</groupId>
@@ -290,6 +318,10 @@
290318
<artifactId>commons-crypto</artifactId>
291319
<groupId>org.apache.commons</groupId>
292320
</exclusion>
321+
<exclusion>
322+
<artifactId>commons-math3</artifactId>
323+
<groupId>org.apache.commons</groupId>
324+
</exclusion>
293325
<exclusion>
294326
<artifactId>spark-tags_2.12</artifactId>
295327
<groupId>org.apache.spark</groupId>
@@ -386,12 +418,6 @@
386418
<version>1.3</version>
387419
<scope>test</scope>
388420
</dependency>
389-
<dependency>
390-
<groupId>log4j</groupId>
391-
<artifactId>log4j</artifactId>
392-
<version>1.2.17</version>
393-
<scope>provided</scope>
394-
</dependency>
395421
</dependencies>
396422
</project>
397423

client/src/main/java/zingg/client/MatchType.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ public enum MatchType implements Serializable {
3737
*/
3838
NUMERIC("NUMERIC"),
3939
/*eg P301d, P00231*/
40-
NUMERIC_WITH_UNITS("NUMERIC_WITH_UNITS"),
40+
NUMERIC_WITH_UNITS("NUMBER_WITH_UNITS"),
4141
NULL_OR_BLANK("NULL_OR_BLANK"),
42-
ONLY_ALPHABETS("ONLY_ALPHABETS"),
42+
ONLY_ALPHABETS_EXACT("ONLY_ALPHABETS_EXACT"),
4343
ONLY_ALPHABETS_FUZZY("ONLY_ALPHABETS_FUZZY"),
4444
DONT_USE("DONT USE");
4545

core/src/main/java/zingg/feature/DateFeature.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,10 @@ public void init(FieldDefinition f) {
2424
addSimFunction(new AffineGapSimilarityFunction());
2525
addSimFunction(new JaroWinklerFunction());
2626
}
27-
else*/ if (f.getMatchType().contains(MatchType.NUMERIC)) {
27+
else*/
28+
if (f.getMatchType().contains(MatchType.FUZZY)) {
2829
addSimFunction(new DateSimilarityFunction());
2930
}
30-
/*else if (f == MatchType.EXACT) {
31-
addSimFunction(new StringSimilarityFunction());
32-
}*/
3331
}
3432

3533
}

core/src/main/java/zingg/feature/DoubleFeature.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public DoubleFeature() {
1313

1414
public void init(FieldDefinition newParam) {
1515
setFieldDefinition(newParam);
16-
if (newParam.getMatchType().contains(MatchType.NUMERIC)) {
16+
if (newParam.getMatchType().contains(MatchType.FUZZY)) {
1717
addSimFunction(new DoubleSimilarityFunction());
1818
}
1919
}

core/src/main/java/zingg/feature/IntFeature.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ public IntFeature() {
1111

1212
public void init(FieldDefinition newParam) {
1313
setFieldDefinition(newParam);
14-
if (newParam.getMatchType().contains(MatchType.NUMERIC)) {
14+
if (newParam.getMatchType().contains(MatchType.FUZZY)) {
1515
addSimFunction(new IntegerSimilarityFunction());
1616
}
1717
}

core/src/main/java/zingg/feature/StringFeature.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import zingg.similarity.function.JaroWinklerFunction;
1010
import zingg.similarity.function.NumbersJaccardFunction;
1111
import zingg.similarity.function.OnlyAlphabetsAffineGapSimilarity;
12+
import zingg.similarity.function.OnlyAlphabetsExactSimilarity;
1213
import zingg.similarity.function.ProductCodeFunction;
1314
import zingg.similarity.function.SameFirstWordFunction;
1415
import zingg.similarity.function.StringSimilarityFunction;
@@ -49,6 +50,9 @@ public void init(FieldDefinition f) {
4950
if (f.getMatchType().contains(MatchType.ONLY_ALPHABETS_FUZZY)) {
5051
addSimFunction(new OnlyAlphabetsAffineGapSimilarity());
5152
}
53+
if (f.getMatchType().contains(MatchType.ONLY_ALPHABETS_EXACT)) {
54+
addSimFunction(new OnlyAlphabetsExactSimilarity());
55+
}
5256
}
5357

5458
}

core/src/main/java/zingg/similarity/function/OnlyAlphabetsAffineGapSimilarity.java

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,5 @@ public Double call(String first, String second) {
6262
return score;
6363
}
6464
}
65-
66-
67-
6865

6966
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
package zingg.similarity.function;
2+
3+
import org.apache.commons.logging.Log;
4+
import org.apache.commons.logging.LogFactory;
5+
import org.apache.spark.ml.util.Identifiable$;
6+
7+
8+
public class OnlyAlphabetsExactSimilarity extends StringSimilarityDistanceFunction {
9+
10+
public static final Log LOG = LogFactory
11+
.getLog(OnlyAlphabetsExactSimilarity.class);
12+
13+
14+
public OnlyAlphabetsExactSimilarity() {
15+
this("OnlyAlphabetsExactSimilarity");
16+
}
17+
18+
public OnlyAlphabetsExactSimilarity(String s) {
19+
super(s);
20+
}
21+
22+
@Override
23+
public String getUid() {
24+
if (uid == null) {
25+
uid = Identifiable$.MODULE$.randomUID("OnlyAlphabetsExactSimilarity");
26+
}
27+
return uid;
28+
}
29+
30+
@Override
31+
public Double call(String first, String second) {
32+
double score1 = 0.0;
33+
double score2 = 0.0;
34+
double score = 0.0;
35+
36+
try {
37+
if (first == null || first.equals("")) {
38+
score1 = 1.0d;
39+
}
40+
if (second == null || second.equals("")) {
41+
score2 = 1.0d;
42+
}
43+
if (score1 != 1.0d && score2 != 1.0d) {
44+
first = first.replaceAll("[0-9.]", "");
45+
second = second.replaceAll("[0-9.]", "");
46+
score = first.equalsIgnoreCase(second)? 1.0d : 0.0d;
47+
}
48+
else {
49+
score = 1.0d;
50+
}
51+
52+
53+
} catch (Exception e) {
54+
e.printStackTrace();
55+
LOG.warn("Error processing differences for " + first + "," + second);
56+
} finally {
57+
if (Double.isNaN(score)) {
58+
score = 0.0;
59+
}
60+
return score;
61+
}
62+
}
63+
64+
}

core/src/main/java/zingg/similarity/function/StringSimilarityFunction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ public StringSimilarityFunction(String name) {
2222
public Double call(String first, String second) {
2323
if (first == null || first.trim().length() ==0) return 0d;
2424
if (second == null || second.trim().length() ==0) return 0d;
25-
double score = first.equalsIgnoreCase(second) ? 1d : -1.0d;
25+
double score = first.equalsIgnoreCase(second) ? 1d : 0d;
2626
return score;
2727
}
2828

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package zingg.similarity.function;
2+
3+
import java.util.Arrays;
4+
5+
import org.junit.jupiter.api.Test;
6+
7+
import static org.junit.jupiter.api.Assertions.assertEquals;
8+
import static org.junit.jupiter.api.Assertions.assertTrue;
9+
10+
public class TestOnlyAlphabetsExactSimilarity {
11+
12+
13+
@Test
14+
public void testNotSameAlhpabets() {
15+
OnlyAlphabetsExactSimilarity sim = new OnlyAlphabetsExactSimilarity();
16+
double score = sim.call("I have 1 number", "I have no number");
17+
assertEquals(0d, score);
18+
}
19+
20+
@Test
21+
public void testSameAlphabetsDiffNumbers() {
22+
OnlyAlphabetsExactSimilarity sim = new OnlyAlphabetsExactSimilarity();
23+
double score = sim.call("I have 1 number", "I have 3 number");
24+
assertEquals(1d, score);
25+
}
26+
27+
@Test
28+
public void testSameNoNum() {
29+
OnlyAlphabetsExactSimilarity sim = new OnlyAlphabetsExactSimilarity();
30+
assertEquals(1d, sim.call("I have no number", "I have no number"));
31+
}
32+
33+
@Test
34+
public void testDiffNoNumber() {
35+
OnlyAlphabetsExactSimilarity sim = new OnlyAlphabetsExactSimilarity();
36+
assertEquals(0d, sim.call("I have a no number", "I have r number"));
37+
}
38+
39+
@Test
40+
public void testSameIgnoreCase() {
41+
OnlyAlphabetsExactSimilarity sim = new OnlyAlphabetsExactSimilarity();
42+
assertEquals(1d, sim.call("I have 1 number", "I HAVE 2 number"));
43+
}
44+
}

0 commit comments

Comments
 (0)