Skip to content

Commit 9ea14e4

Browse files
authored
Merge branch 'main' into zProperErrors
2 parents 086a29f + da06692 commit 9ea14e4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+1678
-390
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ With Zingg, the analytics engineer and the data scientist can quickly integrate
1010

1111
![# Zingg - Data Mastering At Scale with ML](/assets/dataMastering.png)
1212

13+
Besides probabilistic matching, also known as fuzzy matching, Zingg also does deterministic matching, which is useful in identity resolution and householding applications.
14+
15+
![#Zingg Deterministic Matching](/assets/deterministicMatching.png)
16+
1317
## Why Zingg
1418

1519
Zingg is an ML based tool for entity resolution. The following features set Zingg apart from other tools and libraries

api/scala/FebrlExample.scala

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import zingg.client._;
2+
import java.util.ArrayList;
3+
4+
//setting silent mode for Argument creation only
5+
:silent
6+
7+
//build the arguments for zingg
8+
val args = new Arguments();
9+
//set field definitions
10+
val fname = new FieldDefinition();
11+
fname.setFieldName("fname");
12+
fname.setDataType("\"string\"");
13+
fname.setMatchType(MatchType.FUZZY);
14+
fname.setFields("fname");
15+
16+
val lname = new FieldDefinition();
17+
lname.setFieldName("lname");
18+
lname.setDataType("\"string\"");
19+
lname.setMatchType(MatchType.FUZZY);
20+
lname.setFields("lname");
21+
22+
val stNo = new FieldDefinition();
23+
stNo.setFieldName("stNo");
24+
stNo.setDataType("\"string\"");
25+
stNo.setMatchType(MatchType.FUZZY);
26+
stNo.setFields("stNo");
27+
28+
val add1 = new FieldDefinition();
29+
add1.setFieldName("add1");
30+
add1.setDataType("\"string\"");
31+
add1.setMatchType(MatchType.FUZZY);
32+
add1.setFields("add1");
33+
34+
val add2 = new FieldDefinition();
35+
add2.setFieldName("add2");
36+
add2.setDataType("\"string\"");
37+
add2.setMatchType(MatchType.FUZZY);
38+
add2.setFields("add2");
39+
40+
val city = new FieldDefinition();
41+
city.setFieldName("city");
42+
city.setDataType("\"string\"");
43+
city.setMatchType(MatchType.FUZZY);
44+
city.setFields("city");
45+
46+
val areacode = new FieldDefinition();
47+
areacode.setFieldName("areacode");
48+
areacode.setDataType("\"string\"");
49+
areacode.setMatchType(MatchType.FUZZY);
50+
areacode.setFields("areacode");
51+
52+
val state = new FieldDefinition();
53+
state.setFieldName("state");
54+
state.setDataType("\"string\"");
55+
state.setMatchType(MatchType.FUZZY);
56+
state.setFields("state");
57+
58+
val dob = new FieldDefinition();
59+
dob.setFieldName("dob");
60+
dob.setDataType("\"string\"");
61+
dob.setMatchType(MatchType.FUZZY);
62+
dob.setFields("dob");
63+
64+
val ssn = new FieldDefinition();
65+
ssn.setFieldName("ssn");
66+
ssn.setDataType("\"string\"");
67+
ssn.setMatchType(MatchType.FUZZY);
68+
ssn.setFields("ssn");
69+
:silent
70+
71+
val fieldDef = new ArrayList[FieldDefinition]();
72+
fieldDef.add(fname);
73+
fieldDef.add(lname);
74+
fieldDef.add(stNo);
75+
fieldDef.add(add1);
76+
fieldDef.add(add2);
77+
fieldDef.add(city);
78+
fieldDef.add(areacode);
79+
fieldDef.add(state);
80+
fieldDef.add(dob);
81+
fieldDef.add(ssn);
82+
83+
args.setFieldDefinition(fieldDef);
84+
//set the modelid and the zingg dir
85+
args.setModelId("100");
86+
args.setZinggDir("models");
87+
args.setNumPartitions(4);
88+
args.setLabelDataSampleSize(0.5f);
89+
90+
//reading dataset into inputPipe and setting it up in 'args'
91+
//the line below should not be required if you are reading from an in-memory dataset
92+
//in that case, replace df with input df
93+
val df = spark.read.option("header", "true").csv("examples/febrl/test.csv")
94+
import zingg.client.pipe.InMemoryPipe;
95+
import java.util.HashMap
96+
97+
val inputPipe = new InMemoryPipe(df);
98+
inputPipe.setProps(new HashMap[String, String]());
99+
val pipes = Array[zingg.client.pipe.Pipe](inputPipe);
100+
args.setData(pipes);
101+
102+
//setting outputpipe in 'args'
103+
val outputPipe = new InMemoryPipe();
104+
//outputPipe.setProps(new HashMap[String, String]());
105+
//distinct name for the output array: redefining `pipes` is only tolerated by the line-by-line REPL and fails under :paste or compilation
val outputPipes = Array[zingg.client.pipe.Pipe](outputPipe);
106+
args.setOutput(outputPipes);
107+
108+
val options = new ClientOptions("--phase", "match", "--conf", "dummy", "--license", "dummy", "--email", "[email protected]");
109+
110+
//Zingg execution for the given phase
111+
val client = new Client(args, options);
112+
client.init();
113+
client.execute();
114+
//the output is in outputPipe.getRecords
115+
outputPipe.getRecords().show()

assembly/dependency-reduced-pom.xml

Lines changed: 53 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@
114114
<version>3.1.2</version>
115115
<scope>provided</scope>
116116
<exclusions>
117+
<exclusion>
118+
<artifactId>paranamer</artifactId>
119+
<groupId>com.thoughtworks.paranamer</groupId>
120+
</exclusion>
117121
<exclusion>
118122
<artifactId>avro</artifactId>
119123
<groupId>org.apache.avro</groupId>
@@ -170,10 +174,18 @@
170174
<artifactId>jakarta.servlet-api</artifactId>
171175
<groupId>jakarta.servlet</groupId>
172176
</exclusion>
177+
<exclusion>
178+
<artifactId>commons-lang3</artifactId>
179+
<groupId>org.apache.commons</groupId>
180+
</exclusion>
173181
<exclusion>
174182
<artifactId>commons-text</artifactId>
175183
<groupId>org.apache.commons</groupId>
176184
</exclusion>
185+
<exclusion>
186+
<artifactId>jsr305</artifactId>
187+
<groupId>com.google.code.findbugs</groupId>
188+
</exclusion>
177189
<exclusion>
178190
<artifactId>jul-to-slf4j</artifactId>
179191
<groupId>org.slf4j</groupId>
@@ -182,6 +194,10 @@
182194
<artifactId>jcl-over-slf4j</artifactId>
183195
<groupId>org.slf4j</groupId>
184196
</exclusion>
197+
<exclusion>
198+
<artifactId>log4j</artifactId>
199+
<groupId>log4j</groupId>
200+
</exclusion>
185201
<exclusion>
186202
<artifactId>slf4j-log4j12</artifactId>
187203
<groupId>org.slf4j</groupId>
@@ -214,6 +230,10 @@
214230
<artifactId>scala-xml_2.12</artifactId>
215231
<groupId>org.scala-lang.modules</groupId>
216232
</exclusion>
233+
<exclusion>
234+
<artifactId>scala-reflect</artifactId>
235+
<groupId>org.scala-lang</groupId>
236+
</exclusion>
217237
<exclusion>
218238
<artifactId>json4s-jackson_2.12</artifactId>
219239
<groupId>org.json4s</groupId>
@@ -250,6 +270,10 @@
250270
<artifactId>stream</artifactId>
251271
<groupId>com.clearspring.analytics</groupId>
252272
</exclusion>
273+
<exclusion>
274+
<artifactId>metrics-core</artifactId>
275+
<groupId>io.dropwizard.metrics</groupId>
276+
</exclusion>
253277
<exclusion>
254278
<artifactId>metrics-jvm</artifactId>
255279
<groupId>io.dropwizard.metrics</groupId>
@@ -347,14 +371,38 @@
347371
<scope>provided</scope>
348372
</dependency>
349373
<dependency>
350-
<groupId>junit</groupId>
351-
<artifactId>junit</artifactId>
352-
<version>4.13.1</version>
374+
<groupId>org.junit.jupiter</groupId>
375+
<artifactId>junit-jupiter-engine</artifactId>
376+
<version>5.8.1</version>
377+
<scope>test</scope>
378+
<exclusions>
379+
<exclusion>
380+
<artifactId>junit-platform-engine</artifactId>
381+
<groupId>org.junit.platform</groupId>
382+
</exclusion>
383+
<exclusion>
384+
<artifactId>apiguardian-api</artifactId>
385+
<groupId>org.apiguardian</groupId>
386+
</exclusion>
387+
</exclusions>
388+
</dependency>
389+
<dependency>
390+
<groupId>org.junit.jupiter</groupId>
391+
<artifactId>junit-jupiter-api</artifactId>
392+
<version>5.8.1</version>
353393
<scope>test</scope>
354394
<exclusions>
355395
<exclusion>
356-
<artifactId>hamcrest-core</artifactId>
357-
<groupId>org.hamcrest</groupId>
396+
<artifactId>opentest4j</artifactId>
397+
<groupId>org.opentest4j</groupId>
398+
</exclusion>
399+
<exclusion>
400+
<artifactId>junit-platform-commons</artifactId>
401+
<groupId>org.junit.platform</groupId>
402+
</exclusion>
403+
<exclusion>
404+
<artifactId>apiguardian-api</artifactId>
405+
<groupId>org.apiguardian</groupId>
358406
</exclusion>
359407
</exclusions>
360408
</dependency>
@@ -370,12 +418,6 @@
370418
<version>1.3</version>
371419
<scope>test</scope>
372420
</dependency>
373-
<dependency>
374-
<groupId>log4j</groupId>
375-
<artifactId>log4j</artifactId>
376-
<version>1.2.17</version>
377-
<scope>provided</scope>
378-
</dependency>
379421
</dependencies>
380422
</project>
381423

assets/deterministicMatching.png

970 KB
Loading

client/.classpath

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@
3636
<attribute name="m2e-apt" value="true"/>
3737
</attributes>
3838
</classpathentry>
39+
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
40+
<attributes>
41+
<attribute name="maven.pomderived" value="true"/>
42+
<attribute name="test" value="true"/>
43+
</attributes>
44+
</classpathentry>
3945
<classpathentry kind="src" output="target/test-classes" path="target/generated-test-sources/test-annotations">
4046
<attributes>
4147
<attribute name="optional" value="true"/>

client/.settings/org.eclipse.core.resources.prefs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ eclipse.preferences.version=1
22
encoding//src/main/java=UTF-8
33
encoding//src/main/resources=UTF-8
44
encoding//src/test/java=UTF-8
5+
encoding//src/test/resources=UTF-8
56
encoding/<project>=UTF-8

client/pom.xml

Lines changed: 23 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2-
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
32
<modelVersion>4.0.0</modelVersion>
43
<parent>
54
<groupId>zingg</groupId>
@@ -9,71 +8,39 @@
98
<artifactId>client</artifactId>
109
<packaging>jar</packaging>
1110
<properties>
12-
<codehaus.jackson.version>1.9.13</codehaus.jackson.version>
13-
<fasterxml.jackson.version>2.10.0</fasterxml.jackson.version>
14-
<fasterxml.jackson.databind.version>2.10.0</fasterxml.jackson.databind.version>
11+
<fasterxml.jackson.version>2.12.6</fasterxml.jackson.version>
12+
<fasterxml.jackson.databind.version>2.12.6.1</fasterxml.jackson.databind.version>
1513
</properties>
1614
<dependencies>
17-
18-
<dependency>
19-
<groupId>org.apache.poi</groupId>
20-
<artifactId>poi</artifactId>
21-
<version>3.17</version>
22-
</dependency>
2315
<dependency>
24-
<groupId>org.apache.poi</groupId>
25-
<artifactId>poi-ooxml</artifactId>
26-
<version>3.16</version>
27-
</dependency>
28-
<dependency>
29-
<groupId>org.apache.poi</groupId>
30-
<artifactId>poi-scratchpad</artifactId>
31-
<version>3.16</version>
16+
<groupId>com.fasterxml.jackson.core</groupId>
17+
<artifactId>jackson-core</artifactId>
18+
<version>${fasterxml.jackson.version}</version>
19+
<scope>provided</scope>
3220
</dependency>
3321
<dependency>
34-
<groupId>org.codehaus.jackson</groupId>
35-
<artifactId>jackson-core-asl</artifactId>
36-
<version>1.8.8</version>
37-
<scope>provided</scope>
22+
<groupId>com.fasterxml.jackson.core</groupId>
23+
<artifactId>jackson-databind</artifactId>
24+
<version>${fasterxml.jackson.databind.version}</version>
25+
<scope>provided</scope>
3826
</dependency>
3927
<dependency>
40-
<groupId>org.codehaus.jackson</groupId>
41-
<artifactId>jackson-mapper-asl</artifactId>
42-
<version>1.8.8</version>
43-
<scope>provided</scope>
28+
<groupId>com.fasterxml.jackson.core</groupId>
29+
<artifactId>jackson-annotations</artifactId>
30+
<version>${fasterxml.jackson.version}</version>
4431
</dependency>
45-
<dependency>
46-
<groupId>com.fasterxml.jackson.core</groupId>
47-
<artifactId>jackson-core</artifactId>
48-
<version>${fasterxml.jackson.version}</version>
49-
<scope>provided</scope>
50-
</dependency>
51-
<dependency>
52-
<groupId>com.fasterxml.jackson.core</groupId>
53-
<artifactId>jackson-databind</artifactId>
54-
<version>${fasterxml.jackson.databind.version}</version>
55-
<scope>provided</scope>
56-
</dependency>
57-
<dependency>
58-
<groupId>com.fasterxml.jackson.core</groupId>
59-
<artifactId>jackson-annotations</artifactId>
60-
<version>${fasterxml.jackson.version}</version>
61-
</dependency>
62-
63-
64-
</dependencies>
32+
</dependencies>
6533

6634
<build>
6735
<plugins>
68-
<plugin>
69-
<groupId>org.apache.maven.plugins</groupId>
70-
<artifactId>maven-javadoc-plugin</artifactId>
71-
<version>2.9.1</version>
72-
<configuration>
73-
<sourcepath>${basedir}/src/main/java/zingg/client</sourcepath>
74-
</configuration>
75-
</plugin>
36+
<plugin>
37+
<groupId>org.apache.maven.plugins</groupId>
38+
<artifactId>maven-javadoc-plugin</artifactId>
39+
<version>2.9.1</version>
40+
<configuration>
41+
<sourcepath>${basedir}/src/main/java/zingg/client</sourcepath>
42+
</configuration>
43+
</plugin>
7644
</plugins>
77-
7845
</build>
7946
</project>

client/src/main/java/zingg/client/Arguments.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import com.fasterxml.jackson.annotation.JsonSetter;
1919
import com.fasterxml.jackson.core.JsonParser;
2020
import com.fasterxml.jackson.databind.ObjectMapper;
21+
import com.fasterxml.jackson.databind.module.SimpleModule;
2122
import com.fasterxml.jackson.module.scala.DefaultScalaModule;
2223

2324
import org.apache.commons.logging.Log;
@@ -159,6 +160,10 @@ public static final Arguments createArgumentsFromJSON(String filePath, String ph
159160
mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS,
160161
true);
161162
LOG.warn("Config Argument is " + filePath);
163+
/*SimpleModule module = new SimpleModule();
164+
module.addDeserializer(List<MatchType>.class, new FieldDefinition.MatchTypeDeserializer());
165+
mapper.registerModule(module);
166+
*/
162167
Arguments args = mapper.readValue(new File(filePath), Arguments.class);
163168
LOG.warn("phase is " + phase);
164169
checkValid(args, phase);
@@ -508,7 +513,7 @@ public String getZinggBaseModelDir() {
508513

509514
@JsonIgnore
510515
public String getZinggDocDir() {
511-
return zinggDir + "/" + modelId;
516+
return zinggDir + "/" + modelId + "/docs/";
512517
}
513518

514519
@JsonIgnore

0 commit comments

Comments
 (0)