Merge pull request #12 from NSel1727/main

ghalliday · web-flow · commit 0ea12a36d4e4 · 2022-06-28T14:34:25.000+01:00
HPCC-27713 - Create confidence test and key file for the Learning Trees bundle

Reviewed-By: Attila Vamos <attila.vamos@gmail.com>
Reviewed-by: Gavin Halliday <ghalliday@hpccsystems.com>
Merged-by: Gavin Halliday <ghalliday@hpccsystems.com>
diff --git a/ecl/ClassificationTestModified.ecl b/ecl/ClassificationTestModified.ecl
@@ -0,0 +1,97 @@
+﻿/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2022 HPCC Systems®.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+#ONWARNING(2007, ignore);
+#ONWARNING(4531, ignore);
+#ONWARNING(4550, ignore);
+
+// Modified version of the testCovTypeClass test file that works with the
+// OBT test system
+
+IMPORT $.^.test.datasets.CovTypeDS;
+IMPORT $.^ AS LT;
+IMPORT LT.LT_Types;
+IMPORT ML_Core;
+IMPORT ML_Core.Types;
+
+// Forest hyper-parameters handed to LT.ClassificationForest
+numTrees := 100;
+maxDepth := 255;
+numFeatures := 0;         // Zero is automatic choice
+balanceClasses := FALSE;
+// Shape of the test data
+nonSequentialIds := TRUE; // TRUE renumbers ids, field numbers and work-items to test non-sequentiality
+numWIs := 2;              // Number of independent work-items to create
+maxRecs := 5000;          // Must not exceed the number of records in CovTypeDS (currently 5000)
+DependentVar := 52;       // Field number of the dependent variable
+
+// Local aliases for the ML_Core record layouts and the CovTypeDS attributes
+NumericField := Types.NumericField;
+DiscreteField := Types.DiscreteField;
+trainDat := CovTypeDS.trainRecs;
+testDat := CovTypeDS.testRecs;
+nominalFields := CovTypeDS.nominalCols;
+
+ClassTest() := FUNCTION
+	// Multiplier for ids, field numbers and work-item ids: 5 exercises the
+	// non-sequential numbering path, 1 leaves the original numbering untouched
+	idMult := IF(nonSequentialIds, 5, 1);
+
+	// Convert the training and test records to field form
+	ML_Core.ToField(trainDat, trainCells);
+	ML_Core.ToField(testDat, testCells);
+
+	// Features are all fields except DependentVar; labels are DependentVar renumbered to field 1.
+	// Training data is limited to ids up to maxRecs, test data is used in full.
+	trainX := PROJECT(trainCells(id <= maxRecs AND number != DependentVar),
+				TRANSFORM(NumericField,
+					SELF.number := idMult * LEFT.number,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+	trainY := PROJECT(trainCells(id <= maxRecs AND number = DependentVar),
+				TRANSFORM(DiscreteField,
+					SELF.number := 1,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+	testX := PROJECT(testCells(number != DependentVar),
+				TRANSFORM(NumericField,
+					SELF.number := idMult * LEFT.number,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+	testY := PROJECT(testCells(number = DependentVar),
+				TRANSFORM(DiscreteField,
+					SELF.number := 1,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+
+	// Replicate every dataset into numWIs work items
+	trainXWi := NORMALIZE(trainX, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+	trainYWi := NORMALIZE(trainY, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+	testXWi := NORMALIZE(testX, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+	testYWi := NORMALIZE(testY, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+
+	// NOTE(review): nominalFields is passed unchanged although feature numbers were
+	// multiplied by idMult above -- confirm that is what the forest expects
+	classifier := LT.ClassificationForest(numTrees, numFeatures, maxDepth, nominalFields, balanceClasses);
+	forestModel := classifier.GetModel(trainXWi, trainYWi);
+	RETURN classifier.Accuracy(forestModel, testYWi, testXWi);
+END;
+       
+accuracy := ClassTest();
+
+// Every work item must be at least 78% accurate; on failure the lowest accuracy of any work item is reported
+OUTPUT(accuracy, {passing := IF((COUNT(GROUP, raw_accuracy >= 0.78) = numWIs), 'Pass', 'Fail: ' + MIN(GROUP, raw_accuracy) + ' < 0.78')}, NAMED('Result'));
diff --git a/ecl/CommonPrefixTest2.ecl b/ecl/CommonPrefixTest2.ecl
@@ -0,0 +1,59 @@
+﻿/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2022 HPCC Systems®.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+// Updated version of CommonPrefixLenTest that outputs whether the correct
+// output is reached or what any differences were, plus additional inputs
+
+IMPORT $.^ AS LT;
+IMPORT LT.Internal AS int;
+
+// Input sets.  inp5 and inp6 are deliberately identical.
+inp1 := [1, 2, 3, 4, 5];
+inp2 := [1, 2, 4, 5, 6, 7];
+inp3 := [1, 2, 3, 4, 5, 6, 7];
+inp4 := [2, 3, 4, 5];
+inp5 := [7, 9, 13, 20];
+inp6 := [7, 9, 13, 20];
+
+// One row per test case: label, expected value, actual value
+Test_Result := RECORD
+    STRING Test;
+    INTEGER Expected;
+    INTEGER Result;
+END;
+
+// Each row pairs the expected value with what int.CommonPrefixLen returns
+// for the two inputs named in the label.
+tests := DATASET([
+    {'1x2', 2, int.CommonPrefixLen(inp1, inp2)},
+    {'1x3', 5, int.CommonPrefixLen(inp1, inp3)},
+    // 2x3 and 3x2 together check symmetry: f(2, 3) should equal f(3, 2)
+    {'2x3', 2, int.CommonPrefixLen(inp2, inp3)},
+    {'3x2', 2, int.CommonPrefixLen(inp3, inp2)},
+    // inp3 and inp4 start with different values; 0 is expected
+    {'3x4', 0, int.CommonPrefixLen(inp3, inp4)},
+    // Equal sets, should return the length
+    {'5x6', 4, int.CommonPrefixLen(inp5, inp6)}
+    ], Test_Result);
+
+// Rows whose actual value differs from the expected one
+failures := tests(Expected != Result);
+
+// 'Result' is the overall verdict; 'Errors' lists the failing rows (empty when
+// every case passes)
+OUTPUT(IF(NOT EXISTS(failures), 'All Tests Passed', 'Test Cases Failed'), NAMED('Result'));
+OUTPUT(failures, NAMED('Errors'));
diff --git a/ecl/RegressionTestModified.ecl b/ecl/RegressionTestModified.ecl
@@ -0,0 +1,101 @@
+﻿/*##############################################################################
+
+    HPCC SYSTEMS software Copyright (C) 2022 HPCC Systems®.
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+############################################################################## */
+
+#ONWARNING(4550, ignore);
+
+// Modified version of the testCovTypeReg test file that works with the
+// OBT test system
+
+IMPORT $.^.test.datasets.CovTypeDS;
+IMPORT $.^ AS LT;
+IMPORT LT.LT_Types;
+IMPORT ML_Core;
+IMPORT ML_Core.Types;
+
+// Forest hyper-parameters handed to LT.RegressionForest
+numTrees := 400;
+maxDepth := 255;
+numFeatures := 0;         // Zero is automatic choice
+
+nonSequentialIds := TRUE; // TRUE renumbers ids, field numbers and work-items to test non-sequentiality
+numWIs := 1;              // Number of independent work-items to create
+maxRecs := 500;           // Must not exceed the number of records in CovTypeDS (stated here as 500)
+// NOTE(review): ClassificationTestModified.ecl states CovTypeDS holds 5000 records -- confirm which figure is right
+maxTestRecs := 100;       // Width of the test-id window that is evaluated
+DependentVar := 1;        // Field number of the dependent variable
+NumericField := Types.NumericField;
+trainDat := CovTypeDS.trainRecs;
+testDat := CovTypeDS.testRecs;
+nominalFields := CovTypeDS.nominalCols;
+
+RegressTest() := FUNCTION
+	// Multiplier for ids, field numbers and work-item ids: 5 exercises the
+	// non-sequential numbering path, 1 leaves the original numbering untouched
+	idMult := IF(nonSequentialIds, 5, 1);
+
+	// Convert the training and test records to field form
+	ML_Core.ToField(trainDat, trainCells);
+	ML_Core.ToField(testDat, testCells);
+
+	// Only test ids below (smallest test id + maxTestRecs) are evaluated
+	firstExcludedId := MIN(testCells, id) + maxTestRecs;
+	testSample := testCells(id < firstExcludedId);
+
+	// The DependentVar field is the regression target; every other field is a
+	// feature, renumbered as idMult * number - 1 to fill the gap it leaves
+	trainX := PROJECT(trainCells(id <= maxRecs AND number != DependentVar),
+				TRANSFORM(NumericField,
+					SELF.number := idMult * LEFT.number - 1,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+	trainY := PROJECT(trainCells(id <= maxRecs AND number = DependentVar),
+				TRANSFORM(NumericField,
+					SELF.number := DependentVar,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+	testX := PROJECT(testSample(number != DependentVar),
+				TRANSFORM(NumericField,
+					SELF.number := idMult * LEFT.number - 1,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+	testY := PROJECT(testSample(number = DependentVar),
+				TRANSFORM(NumericField,
+					SELF.number := DependentVar,
+					SELF.id := idMult * LEFT.id,
+					SELF := LEFT));
+
+	// Replicate every dataset into numWIs work items
+	trainXWi := NORMALIZE(trainX, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+	trainYWi := NORMALIZE(trainY, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+	testXWi := NORMALIZE(testX, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+	testYWi := NORMALIZE(testY, numWIs, TRANSFORM(RECORDOF(LEFT), SELF.wi := idMult * COUNTER, SELF := LEFT));
+
+	// NOTE(review): nominalFields is passed unchanged although feature numbers were
+	// renumbered above -- confirm that is what the forest expects
+	regressor := LT.RegressionForest(numTrees := numTrees,
+					featuresPerNode := numFeatures,
+					maxDepth := maxDepth,
+					nominalFields := nominalFields);
+	forestModel := regressor.GetModel(trainXWi, trainYWi);
+
+	RETURN regressor.Accuracy(forestModel, testYWi, testXWi);
+END;
+
+accuracy := RegressTest();
+
+// r2 must be at least 0.70 for the test to pass; otherwise the observed r2 is reported
+OUTPUT(accuracy, {passing := IF(r2 >= 0.70, 'Pass', 'Fail, ' + r2)}, NAMED('Result'));
diff --git a/ecl/key/ClassificationTestModified.xml b/ecl/key/ClassificationTestModified.xml
@@ -0,0 +1,3 @@
+<Dataset name='Result'>
+ <Row><passing>Pass</passing></Row>
+</Dataset>
diff --git a/ecl/key/CommonPrefixTest2.xml b/ecl/key/CommonPrefixTest2.xml
@@ -0,0 +1,5 @@
+<Dataset name='Result'>
+ <Row><Result>All Tests Passed</Result></Row>
+</Dataset>
+<Dataset name='Errors'>
+</Dataset>
diff --git a/ecl/key/RegressionTestModified.xml b/ecl/key/RegressionTestModified.xml
@@ -0,0 +1,3 @@
+<Dataset name='Result'>
+ <Row><passing>Pass</passing></Row>
+</Dataset>

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+<Dataset name='Result'>`
	`2`	`+ <Row><passing>Pass</passing></Row>`
	`3`	`+</Dataset>`