Skip to content

Commit

Permalink
Merge pull request #12 from NSel1727/main
Browse files Browse the repository at this point in the history
HPCC-27713 - Create confidence test and key file for the Learning Trees bundle

Reviewed-By: Attila Vamos [email protected]
Reviewed-by: Gavin Halliday <[email protected]>
Merged-by: Gavin Halliday <[email protected]>
  • Loading branch information
ghalliday authored Jun 28, 2022
2 parents 386e8f8 + a592661 commit 0ea12a3
Show file tree
Hide file tree
Showing 6 changed files with 268 additions and 0 deletions.
97 changes: 97 additions & 0 deletions ecl/ClassificationTestModified.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2022 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// Tell the compiler to ignore warnings 2007, 4531 and 4550 when building this test.
#ONWARNING(2007, ignore);
#ONWARNING(4531, ignore);
#ONWARNING(4550, ignore);

// Modified version of the testCovTypeClass test file that works with the
// OBT test system

IMPORT $.^.test.datasets.CovTypeDS;
IMPORT $.^ AS LT;
IMPORT LT.LT_Types;
IMPORT ML_Core;
IMPORT ML_Core.Types;

// ---- Layout aliases and source data ----
NumericField := Types.NumericField;
DiscreteField := Types.DiscreteField;
trainDat := CovTypeDS.trainRecs;
testDat := CovTypeDS.testRecs;
nominalFields := CovTypeDS.nominalCols;

// ---- Forest parameters (passed to LT.ClassificationForest) ----
numTrees := 100;
maxDepth := 255;
// Zero lets the forest choose the number of features automatically.
numFeatures := 0;
balanceClasses := FALSE;

// ---- Test-data shaping ----
// TRUE renumbers ids, field numbers and work-items so that the test also
// exercises support for non-sequential numbering.
nonSequentialIds := TRUE;
// Number of independent work-items to create.
numWIs := 2;
// Must be less than or equal to the number of records in CovTypeDS
// (currently 5000).
maxRecs := 5000;
// Field number of the dependent variable used by this test.
DependentVar := 52;

// Builds a classification forest from the CovType training records and scores
// it against the CovType test records.
// Returns the result of Forest.Accuracy for the trained model, the actual test
// classes and the test features.
ClassTest() := FUNCTION
  // The ToField macro defines trainNF / testNF as the field-oriented forms of
  // the two input datasets.
  ML_Core.ToField(trainDat, trainNF);
  ML_Core.ToField(testDat, testNF);

  // When nonSequentialIds is set, ids, field numbers and work-item numbers are
  // all multiplied by 5 so that none of them is consecutive; otherwise they
  // are left untouched (multiplied by 1).
  idFactor := IF(nonSequentialIds, 5, 1);

  // ---- Training data: only ids up to maxRecs ----
  trainSubset := trainNF(id <= maxRecs);

  // Independent (feature) cells: every field except the dependent one.
  trainIndBase := PROJECT(trainSubset(number != DependentVar),
                          TRANSFORM(NumericField,
                                    SELF.number := idFactor * LEFT.number,
                                    SELF.id := idFactor * LEFT.id,
                                    SELF := LEFT));

  // Dependent (class) cells, renumbered to field 1.
  trainDepBase := PROJECT(trainSubset(number = DependentVar),
                          TRANSFORM(DiscreteField,
                                    SELF.number := 1,
                                    SELF.id := idFactor * LEFT.id,
                                    SELF := LEFT));

  // Replicate the training data into numWIs work-items.
  trainInd := NORMALIZE(trainIndBase, numWIs,
                        TRANSFORM(RECORDOF(LEFT),
                                  SELF.wi := idFactor * COUNTER,
                                  SELF := LEFT));
  trainDep := NORMALIZE(trainDepBase, numWIs,
                        TRANSFORM(RECORDOF(LEFT),
                                  SELF.wi := idFactor * COUNTER,
                                  SELF := LEFT));

  // NOTE(review): nominalFields is passed as-is while the feature numbers above
  // are multiplied by idFactor -- confirm whether the nominal column numbers
  // are expected to be renumbered as well.
  Forest := LT.ClassificationForest(numTrees, numFeatures, maxDepth, nominalFields, balanceClasses);
  model := Forest.GetModel(trainInd, trainDep);

  // ---- Test data: all test records ----
  testIndBase := PROJECT(testNF(number != DependentVar),
                         TRANSFORM(NumericField,
                                   SELF.number := idFactor * LEFT.number,
                                   SELF.id := idFactor * LEFT.id,
                                   SELF := LEFT));

  // Actual classes to compare the predictions with, renumbered to field 1.
  testDepBase := PROJECT(testNF(number = DependentVar),
                         TRANSFORM(DiscreteField,
                                   SELF.number := 1,
                                   SELF.id := idFactor * LEFT.id,
                                   SELF := LEFT));

  // Replicate the test data into the same numWIs work-items.
  testInd := NORMALIZE(testIndBase, numWIs,
                       TRANSFORM(RECORDOF(LEFT),
                                 SELF.wi := idFactor * COUNTER,
                                 SELF := LEFT));
  testDep := NORMALIZE(testDepBase, numWIs,
                       TRANSFORM(RECORDOF(LEFT),
                                 SELF.wi := idFactor * COUNTER,
                                 SELF := LEFT));

  RETURN Forest.Accuracy(model, testDep, testInd);
END;

accuracy := ClassTest();

// Every one of the numWIs work-items must reach a raw accuracy of at least 78%.
// The output record aggregates over all rows of accuracy (COUNT(GROUP, ...)),
// so it produces a single 'passing' row. The failure text therefore reports
// the lowest raw_accuracy via MIN(GROUP, ...): a non-aggregated field here
// would come from an arbitrary row and could show a value that is not below
// the threshold.
OUTPUT(accuracy,
       {passing := IF(COUNT(GROUP, raw_accuracy >= 0.78) = numWIs,
                      'Pass',
                      'Fail: ' + MIN(GROUP, raw_accuracy) + ' < 0.78')},
       NAMED('Result'));
59 changes: 59 additions & 0 deletions ecl/CommonPrefixTest2.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2022 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// Updated version of CommonPrefixLenTest that outputs whether the correct
// output is reached or what any differences were, plus additional inputs

IMPORT $.^ AS LT;
IMPORT LT.Internal AS int;

// ---- Input sets ----
inp1 := [1, 2, 3, 4, 5];
inp2 := [1, 2, 4, 5, 6, 7];
inp3 := [1, 2, 3, 4, 5, 6, 7];
inp4 := [2, 3, 4, 5];
// inp5 and inp6 are deliberately identical.
inp5 := [7, 9, 13, 20];
inp6 := [7, 9, 13, 20];

// ---- Each case: actual result followed by the expected prefix length ----

// inp1 and inp2 share only the leading 1, 2.
Res1x2 := int.CommonPrefixLen(inp1, inp2);
Expected1x2 := 2;

// inp1 is entirely a prefix of inp3.
Res1x3 := int.CommonPrefixLen(inp1, inp3);
Expected1x3 := 5;

// The same pair in both argument orders: the function should be symmetric,
// so f(2, 3) should equal f(3, 2).
Res2x3 := int.CommonPrefixLen(inp2, inp3);
Expected2x3 := 2;
Res3x2 := int.CommonPrefixLen(inp3, inp2);
Expected3x2 := 2;

// The first elements already differ.
Res3x4 := int.CommonPrefixLen(inp3, inp4);
Expected3x4 := 0;

// Equal sets: the result should be the full length.
Res5x6 := int.CommonPrefixLen(inp5, inp6);
Expected5x6 := 4;

// Layout of one test case: a label, the expected value and the value actually
// returned.
Test_Result := {STRING Test, INTEGER Expected, INTEGER Result};

// One row per test case: label, expected value, actual value.
tests := DATASET([
    {'1x2', Expected1x2, Res1x2},
    {'1x3', Expected1x3, Res1x3},
    {'2x3', Expected2x3, Res2x3},
    {'3x2', Expected3x2, Res3x2},
    {'3x4', Expected3x4, Res3x4},
    {'5x6', Expected5x6, Res5x6}
  ], Test_Result);

// Cases whose actual value differs from the expected one.
mismatches := tests(Expected != Result);

// 'Result' is a one-line verdict; 'Errors' lists the failing cases (empty when
// everything passed).
OUTPUT(IF(NOT EXISTS(mismatches), 'All Tests Passed', 'Test Cases Failed'), NAMED('Result'));
OUTPUT(mismatches, NAMED('Errors'));
101 changes: 101 additions & 0 deletions ecl/RegressionTestModified.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*##############################################################################
HPCC SYSTEMS software Copyright (C) 2022 HPCC Systems®.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// Tell the compiler to ignore warning 4550 when building this test.
#ONWARNING(4550, ignore);

// Modified version of the testCovTypeReg test file that works with the
// OBT test system

IMPORT $.^.test.datasets.CovTypeDS;
IMPORT $.^ AS LT;
IMPORT LT.LT_Types;
IMPORT ML_Core;
IMPORT ML_Core.Types;

// ---- Layout alias and source data ----
NumericField := Types.NumericField;
trainDat := CovTypeDS.trainRecs;
testDat := CovTypeDS.testRecs;
nominalFields := CovTypeDS.nominalCols;

// ---- Forest parameters (passed to LT.RegressionForest) ----
numTrees := 400;
maxDepth := 255;
// Zero lets the forest choose the number of features automatically.
numFeatures := 0;

// ---- Test-data shaping ----
// TRUE renumbers ids, field numbers and work-items so that the test also
// exercises support for non-sequential numbering.
nonSequentialIds := TRUE;
// Number of independent work-items to create.
numWIs := 1;
// Must be less than or equal to the number of records in CovTypeDS
// (currently 500).
// NOTE(review): ClassificationTestModified.ecl describes the same dataset as
// currently holding 5000 records -- confirm which count is correct.
maxRecs := 500;
// Number of test-record ids, counted from the lowest id, that are scored.
maxTestRecs := 100;
// Field number of the dependent variable used by this test.
DependentVar := 1;

// Builds a regression forest from the CovType training records and scores it
// against a slice of the CovType test records.
// Returns the result of Forest.Accuracy for the trained model, the actual test
// values and the test features.
RegressTest() := FUNCTION
  // The ToField macro defines trainNF / testNF as the field-oriented forms of
  // the two input datasets.
  ML_Core.ToField(trainDat, trainNF);
  ML_Core.ToField(testDat, testNF);

  // When nonSequentialIds is set, ids, field numbers and work-item numbers are
  // multiplied by 5 so that none of them is consecutive; otherwise they are
  // left untouched (multiplied by 1).
  idFactor := IF(nonSequentialIds, 5, 1);

  // ---- Training data: only ids up to maxRecs ----
  // The first field of the training set (Elevation) is taken out to serve as
  // the target value; the remaining fields are shifted down by one to fill the
  // gap.
  trainSubset := trainNF(id <= maxRecs);

  // Independent (feature) cells.
  trainIndBase := PROJECT(trainSubset(number != DependentVar),
                          TRANSFORM(NumericField,
                                    SELF.number := idFactor * LEFT.number - 1,
                                    SELF.id := idFactor * LEFT.id,
                                    SELF := LEFT));

  // Dependent (target) cells.
  trainDepBase := PROJECT(trainSubset(number = DependentVar),
                          TRANSFORM(NumericField,
                                    SELF.number := DependentVar,
                                    SELF.id := idFactor * LEFT.id,
                                    SELF := LEFT));

  // Replicate the training data into numWIs work-items.
  trainInd := NORMALIZE(trainIndBase, numWIs,
                        TRANSFORM(RECORDOF(LEFT),
                                  SELF.wi := idFactor * COUNTER,
                                  SELF := LEFT));
  trainDep := NORMALIZE(trainDepBase, numWIs,
                        TRANSFORM(RECORDOF(LEFT),
                                  SELF.wi := idFactor * COUNTER,
                                  SELF := LEFT));

  // NOTE(review): nominalFields is passed as-is while the feature numbers above
  // are renumbered -- confirm whether the nominal column numbers are expected
  // to be renumbered as well.
  Forest := LT.RegressionForest(numTrees := numTrees,
                                featuresPerNode := numFeatures,
                                maxDepth := maxDepth,
                                nominalFields := nominalFields);
  model := Forest.GetModel(trainInd, trainDep);

  // ---- Test data: the maxTestRecs ids starting at the lowest test id ----
  firstExcludedId := MIN(testNF, id) + maxTestRecs;
  testSubset := testNF(id < firstExcludedId);

  testIndBase := PROJECT(testSubset(number != DependentVar),
                         TRANSFORM(NumericField,
                                   SELF.number := idFactor * LEFT.number - 1,
                                   SELF.id := idFactor * LEFT.id,
                                   SELF := LEFT));

  // Actual target values to compare the predictions with.
  testDepBase := PROJECT(testSubset(number = DependentVar),
                         TRANSFORM(NumericField,
                                   SELF.number := DependentVar,
                                   SELF.id := idFactor * LEFT.id,
                                   SELF := LEFT));

  // Replicate the test data into the same numWIs work-items.
  testInd := NORMALIZE(testIndBase, numWIs,
                       TRANSFORM(RECORDOF(LEFT),
                                 SELF.wi := idFactor * COUNTER,
                                 SELF := LEFT));
  testDep := NORMALIZE(testDepBase, numWIs,
                       TRANSFORM(RECORDOF(LEFT),
                                 SELF.wi := idFactor * COUNTER,
                                 SELF := LEFT));

  // Determine accuracy
  RETURN Forest.Accuracy(model, testDep, testInd);
END;

accuracy := RegressTest();

// The test passes when the r2 value is at least 0.70; the comparison is
// inclusive so that a value of exactly 0.70 counts as a pass, matching the
// "at least" wording of the threshold. On failure the r2 value is appended to
// the message. One 'passing' row is produced per row of accuracy.
OUTPUT(accuracy, {passing := IF(r2 >= 0.70, 'Pass', 'Fail, ' + r2)}, NAMED('Result'));
3 changes: 3 additions & 0 deletions ecl/key/ClassificationTestModified.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='Result'>
<Row><passing>Pass</passing></Row>
</Dataset>
5 changes: 5 additions & 0 deletions ecl/key/CommonPrefixTest2.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<Dataset name='Result'>
<Row><Result>All Tests Passed</Result></Row>
</Dataset>
<Dataset name='Errors'>
</Dataset>
3 changes: 3 additions & 0 deletions ecl/key/RegressionTestModified.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<Dataset name='Result'>
<Row><passing>Pass</passing></Row>
</Dataset>

0 comments on commit 0ea12a3

Please sign in to comment.