Skip to content

Commit da06692

Browse files
committed
match type tests
1 parent a6c2ba2 commit da06692

File tree

7 files changed

+281
-16
lines changed

7 files changed

+281
-16
lines changed

client/src/main/java/zingg/client/Arguments.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import com.fasterxml.jackson.annotation.JsonSetter;
1919
import com.fasterxml.jackson.core.JsonParser;
2020
import com.fasterxml.jackson.databind.ObjectMapper;
21+
import com.fasterxml.jackson.databind.module.SimpleModule;
2122
import com.fasterxml.jackson.module.scala.DefaultScalaModule;
2223

2324
import org.apache.commons.logging.Log;
@@ -159,6 +160,10 @@ public static final Arguments createArgumentsFromJSON(String filePath, String ph
159160
mapper.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS,
160161
true);
161162
LOG.warn("Config Argument is " + filePath);
163+
/*SimpleModule module = new SimpleModule();
164+
module.addDeserializer(List<MatchType>.class, new FieldDefinition.MatchTypeDeserializer());
165+
mapper.registerModule(module);
166+
*/
162167
Arguments args = mapper.readValue(new File(filePath), Arguments.class);
163168
LOG.warn("phase is " + phase);
164169
checkValid(args, phase);

client/src/main/java/zingg/client/FieldDefinition.java

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,13 @@
1010
import org.apache.commons.logging.LogFactory;
1111
import org.apache.spark.sql.types.DataType;
1212

13-
13+
import com.fasterxml.jackson.annotation.JsonIgnore;
1414
import com.fasterxml.jackson.annotation.JsonProperty;
1515
import com.fasterxml.jackson.core.JsonGenerator;
1616
import com.fasterxml.jackson.core.JsonParser;
1717
import com.fasterxml.jackson.core.JsonProcessingException;
1818
import com.fasterxml.jackson.databind.DeserializationContext;
19+
import com.fasterxml.jackson.databind.DeserializationFeature;
1920
import com.fasterxml.jackson.databind.JsonSerializer;
2021
import com.fasterxml.jackson.databind.SerializerProvider;
2122
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
@@ -35,8 +36,8 @@ public class FieldDefinition implements
3536
Serializable {
3637

3738
public static final Log LOG = LogFactory.getLog(FieldDefinition.class);
38-
@JsonDeserialize(using = MatchTypeDeserializer.class)
39-
public List<MatchType> matchType;
39+
40+
@JsonDeserialize(using = MatchTypeDeserializer.class) public List<MatchType> matchType;
4041
@JsonSerialize(using = DataTypeSerializer.class)
4142
public DataType dataType;
4243
public String fieldName;
@@ -66,10 +67,17 @@ public List<MatchType> getMatchType() {
6667
* @see MatchType
6768
* @param type
6869
* the type to set
69-
*/
70-
public void setMatchType(MatchType... type) {
70+
*/
71+
@JsonDeserialize(using = MatchTypeDeserializer.class)
72+
public void setMatchType(List<MatchType> type) {
73+
this.matchType = type; //MatchTypeDeserializer.getMatchTypeFromString(type);
74+
}
75+
76+
77+
public void setMatchTypeInternal(MatchType... type) {
7178
this.matchType = Arrays.asList(type);
7279
}
80+
7381

7482

7583
public DataType getDataType() {
@@ -168,14 +176,23 @@ public MatchTypeDeserializer(Class<String> t) {
168176
@Override
169177
public List<MatchType> deserialize(JsonParser parser, DeserializationContext context)
170178
throws IOException, JsonProcessingException {
171-
List<MatchType> matchTypes = new ArrayList<MatchType>();
172-
String m = parser.getText();
179+
ObjectMapper mapper = new ObjectMapper();
180+
mapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
181+
LOG.debug("Deserializing custom type");
182+
return getMatchTypeFromString(mapper.readValue(parser, String.class));
183+
}
184+
185+
public static List<MatchType> getMatchTypeFromString(String m) throws IOException{
186+
List<MatchType> matchTypes = new ArrayList<MatchType>();
173187
String[] matchTypeFromConfig = m.split(",");
174188
for (String s: matchTypeFromConfig) {
175-
matchTypes.add(MatchType.getMatchType(s));
189+
MatchType mt = MatchType.getMatchType(s);
190+
LOG.debug(mt);
191+
if (m == null) throw new IOException("Wrong value of matchType set");
192+
matchTypes.add(mt);
176193
}
177194
return matchTypes;
178-
}
195+
}
179196
}
180197

181198

client/src/main/java/zingg/client/MatchType.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public static MatchType getMatchType(String t) {
6262
if (types == null) {
6363
init();
6464
}
65-
return types.get(t.toUpperCase());
65+
return types.get(t.trim().toUpperCase());
6666
}
6767

6868
@JsonValue

client/src/test/java/zingg/client/TestArguments.java

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import java.nio.file.Files;
99
import java.nio.file.Paths;
1010
import java.util.HashMap;
11+
import java.util.List;
1112
import java.util.Map;
1213

1314
import org.apache.commons.logging.Log;
@@ -204,4 +205,40 @@ public void testInvalidFilePath() {
204205
LOG.warn("Expected exception received: NoSuchFileException");
205206
}
206207
}
208+
209+
@Test
210+
public void testMatchTypeMultiple() {
211+
Arguments args;
212+
try {
213+
args = Arguments.createArgumentsFromJSON(getClass().getResource("../../configWithMultipleMatchTypes.json").getFile(), "test");
214+
List<MatchType> fNameMatchType = args.getFieldDefinition().get(0).getMatchType();
215+
assertEquals(2, fNameMatchType.size());
216+
assertEquals(MatchType.FUZZY, fNameMatchType.get(0));
217+
assertEquals(MatchType.NULL_OR_BLANK, fNameMatchType.get(1));
218+
219+
220+
} catch (Exception | ZinggClientException e) {
221+
// TODO Auto-generated catch block
222+
e.printStackTrace();
223+
fail("Could not read config");
224+
}
225+
226+
}
227+
228+
@Test
229+
public void testMatchTypeWrong() {
230+
Arguments args;
231+
try {
232+
args = Arguments.createArgumentsFromJSON(getClass().getResource("../../configWithMultipleMatchTypesUnsupported.json").getFile(), "test");
233+
//List<MatchType> fNameMatchType = args.getFieldDefinition().get(0).getMatchType();
234+
fail("config had error, should have flagged");
235+
236+
} catch (Exception | ZinggClientException e) {
237+
// TODO Auto-generated catch block
238+
e.printStackTrace();
239+
}
240+
241+
242+
243+
}
207244
}
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
{
2+
"fieldDefinition":[
3+
{
4+
"fieldName" : "fname",
5+
"matchType" : "fuzzy,null_or_blank",
6+
"fields" : "fname",
7+
"dataType": "\"string\""
8+
},
9+
{
10+
"fieldName" : "lname",
11+
"matchType" : "fuzzy",
12+
"fields" : "lname",
13+
"dataType": "\"string\""
14+
},
15+
{
16+
"fieldName" : "stNo",
17+
"matchType": "exact",
18+
"fields" : "stNo",
19+
"dataType": "\"string\""
20+
},
21+
{
22+
"fieldName" : "add1",
23+
"matchType": "fuzzy",
24+
"fields" : "add1",
25+
"dataType": "\"string\""
26+
},
27+
{
28+
"fieldName" : "add2",
29+
"matchType": "fuzzy",
30+
"fields" : "add2",
31+
"dataType": "\"string\""
32+
},
33+
{
34+
"fieldName" : "city",
35+
"matchType": "fuzzy",
36+
"fields" : "city",
37+
"dataType": "\"string\""
38+
},
39+
{
40+
"fieldName" : "areacode",
41+
"matchType": "exact",
42+
"fields" : "areacode",
43+
"dataType": "\"string\""
44+
},
45+
{
46+
"fieldName" : "state",
47+
"matchType": "fuzzy",
48+
"fields" : "state",
49+
"dataType": "\"string\""
50+
},
51+
{
52+
"fieldName" : "dob",
53+
"matchType": "fuzzy",
54+
"fields" : "dob",
55+
"dataType": "\"string\""
56+
},
57+
{
58+
"fieldName" : "ssn",
59+
"matchType": "fuzzy",
60+
"fields" : "ssn",
61+
"dataType": "\"string\""
62+
}
63+
],
64+
"output" : [{
65+
"name":"output",
66+
"format":"csv",
67+
"props": {
68+
"location": "/tmp/zinggOutput",
69+
"delimiter": ",",
70+
"header":true
71+
}
72+
}],
73+
"data" : [{
74+
"name":"test",
75+
"format":"csv",
76+
"props": {
77+
"location": "examples/febrl/test.csv",
78+
"delimiter": ",",
79+
"header":false
80+
},
81+
"schema":
82+
"{\"type\" : \"struct\",
83+
\"fields\" : [
84+
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
85+
{\"name\":\"fname\", \"type\":\"string\", \"nullable\":true},
86+
{\"name\":\"lname\",\"type\":\"string\",\"nullable\":true} ,
87+
{\"name\":\"stNo\", \"type\":\"string\", \"nullable\":true},
88+
{\"name\":\"add1\", \"type\":\"string\", \"nullable\":true},
89+
{\"name\":\"add2\",\"type\":\"string\",\"nullable\":true} ,
90+
{\"name\":\"city\", \"type\":\"string\", \"nullable\":true},
91+
{\"name\":\"areacode\", \"type\":\"string\", \"nullable\":true},
92+
{\"name\":\"state\", \"type\":\"string\", \"nullable\":true},
93+
{\"name\":\"dob\",\"type\":\"string\",\"nullable\":true} ,
94+
{\"name\":\"ssn\",\"type\":\"string\",\"nullable\":true}
95+
]
96+
}"
97+
}],
98+
"labelDataSampleSize" : 0.5,
99+
"numPartitions":4,
100+
"modelId": 100,
101+
"zinggDir": "models"
102+
103+
}
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
{
2+
"fieldDefinition":[
3+
{
4+
"fieldName" : "fname",
5+
"matchType" : "fuzzy,null_wrong_blank",
6+
"fields" : "fname",
7+
"dataType": "\"string\""
8+
},
9+
{
10+
"fieldName" : "lname",
11+
"matchType" : "fuzzy",
12+
"fields" : "lname",
13+
"dataType": "\"string\""
14+
},
15+
{
16+
"fieldName" : "stNo",
17+
"matchType": "exact",
18+
"fields" : "stNo",
19+
"dataType": "\"string\""
20+
},
21+
{
22+
"fieldName" : "add1",
23+
"matchType": "fuzzy",
24+
"fields" : "add1",
25+
"dataType": "\"string\""
26+
},
27+
{
28+
"fieldName" : "add2",
29+
"matchType": "fuzzy",
30+
"fields" : "add2",
31+
"dataType": "\"string\""
32+
},
33+
{
34+
"fieldName" : "city",
35+
"matchType": "fuzzy",
36+
"fields" : "city",
37+
"dataType": "\"string\""
38+
},
39+
{
40+
"fieldName" : "areacode",
41+
"matchType": "exact",
42+
"fields" : "areacode",
43+
"dataType": "\"string\""
44+
},
45+
{
46+
"fieldName" : "state",
47+
"matchType": "fuzzy",
48+
"fields" : "state",
49+
"dataType": "\"string\""
50+
},
51+
{
52+
"fieldName" : "dob",
53+
"matchType": "fuzzy",
54+
"fields" : "dob",
55+
"dataType": "\"string\""
56+
},
57+
{
58+
"fieldName" : "ssn",
59+
"matchType": "fuzzy",
60+
"fields" : "ssn",
61+
"dataType": "\"string\""
62+
}
63+
],
64+
"output" : [{
65+
"name":"output",
66+
"format":"csv",
67+
"props": {
68+
"location": "/tmp/zinggOutput",
69+
"delimiter": ",",
70+
"header":true
71+
}
72+
}],
73+
"data" : [{
74+
"name":"test",
75+
"format":"csv",
76+
"props": {
77+
"location": "examples/febrl/test.csv",
78+
"delimiter": ",",
79+
"header":false
80+
},
81+
"schema":
82+
"{\"type\" : \"struct\",
83+
\"fields\" : [
84+
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
85+
{\"name\":\"fname\", \"type\":\"string\", \"nullable\":true},
86+
{\"name\":\"lname\",\"type\":\"string\",\"nullable\":true} ,
87+
{\"name\":\"stNo\", \"type\":\"string\", \"nullable\":true},
88+
{\"name\":\"add1\", \"type\":\"string\", \"nullable\":true},
89+
{\"name\":\"add2\",\"type\":\"string\",\"nullable\":true} ,
90+
{\"name\":\"city\", \"type\":\"string\", \"nullable\":true},
91+
{\"name\":\"areacode\", \"type\":\"string\", \"nullable\":true},
92+
{\"name\":\"state\", \"type\":\"string\", \"nullable\":true},
93+
{\"name\":\"dob\",\"type\":\"string\",\"nullable\":true} ,
94+
{\"name\":\"ssn\",\"type\":\"string\",\"nullable\":true}
95+
]
96+
}"
97+
}],
98+
"labelDataSampleSize" : 0.5,
99+
"numPartitions":4,
100+
"modelId": 100,
101+
"zinggDir": "models"
102+
103+
}

0 commit comments

Comments
 (0)