
Commit a163676

jkbradley authored and mengxr committed

Cleanups from spark 1.5 mllib perf tests

More tests should be runnable for GLMs. Some config fixes. CC mengxr

Author: Joseph K. Bradley <[email protected]>

Closes #88 from jkbradley/ml-perf-cleanup-1.6 and squashes the following commits:

ca82fa9 [Joseph K. Bradley] updated 1.4 ALS test to use same numRatings as 1.5
b71373d [Joseph K. Bradley] Cleanups from spark 1.5 mllib perf tests. More tests should be runnable for GLMs. some config fixes
1 parent 93e4560 commit a163676

6 files changed: +143 -145 lines changed

config/config.py.template

Lines changed: 22 additions & 35 deletions
@@ -370,7 +370,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
-MLLIB_SPARK_VERSION = 1.2
+MLLIB_SPARK_VERSION = 1.5

 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:
@@ -413,7 +413,12 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
 if MLLIB_SPARK_VERSION >= 1.1:
     MLLIB_GLM_TEST_OPTS += [
         # Optimization algorithm: sgd, lbfgs
-        OptionSet("optimizer", ["sgd"])
+        OptionSet("optimizer", ["sgd", "lbfgs"])
+    ]
+if MLLIB_SPARK_VERSION >= 1.5:
+    MLLIB_GLM_TEST_OPTS += [
+        # Ignored, but required for config
+        OptionSet("elastic-net-param", [0.0])
     ]

 # GLM Regression Tests #
@@ -441,8 +446,8 @@ if MLLIB_SPARK_VERSION >= 1.5:
         OptionSet("reg-type", ["elastic-net"]),
         # Runs with L2 (param = 0.0), L1 (param = 1.0).
         OptionSet("elastic-net-param", [0.0, 1.0]),
-        # Runs with lambda = [0.0, 0.5]
-        OptionSet("reg-param", [0.0, 0.5]),
+        # Regularization param (lambda)
+        OptionSet("reg-param", [0.01]),
         # The scale factor for the noise
         OptionSet("epsilon", [0.1]),
         # The intercept for the data
@@ -466,7 +471,7 @@ MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
 # GLM Classification Tests #
 MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [
     # Loss to minimize: logistic, hinge (SVM)
-    OptionSet("loss", ["logistic", "hinge"])
+    OptionSet("loss", ["logistic"])
 ]

 MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
@@ -483,8 +488,8 @@ if MLLIB_SPARK_VERSION >= 1.5:
         OptionSet("reg-type", ["elastic-net"]),
         # Runs with L2 (param = 0.0), L1 (param = 1.0).
         OptionSet("elastic-net-param", [0.0, 1.0]),
-        # Runs with lambda = [0.0, 0.5]
-        OptionSet("reg-param", [0.0, 0.5]),
+        # Regularization param (lambda)
+        OptionSet("reg-param", [0.01]),
         # The scale factor for the noise
         OptionSet("epsilon", [0.1]),
         # The intercept for the data
@@ -504,30 +509,14 @@ NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
     OptionSet("scale-factor", [1.0]),
     # Naive Bayes smoothing lambda.
     OptionSet("nb-lambda", [1.0]),
-    # Model type: either Multinomial or Bernoulli
+    # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+)
     OptionSet("model-type", ["multinomial"]),
 ]

 MLLIB_TESTS += [("naive-bayes", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
     MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] +
     NAIVE_BAYES_TEST_OPTS)]

-if MLLIB_SPARK_VERSION >= 1.4:
-    NAIVE_BAYES_TEST_OPTS_BERNOULLI = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-        # Expected fraction of examples which are negative
-        OptionSet("per-negative", [0.3]),
-        # The scale factor for the noise in feature values
-        OptionSet("scale-factor", [1.0]),
-        # Naive Bayes smoothing lambda.
-        OptionSet("nb-lambda", [1.0]),
-        # MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-        OptionSet("model-type", ["bernoulli"]),
-    ]
-
-    MLLIB_TESTS += [("naive-bayes-bernoulli", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] +
-        NAIVE_BAYES_TEST_OPTS_BERNOULLI)]
-
 # Decision Trees #
 MLLIB_DECISION_TREE_TEST_OPTS = MLLIB_COMMON_OPTS + [
     # The number of rows or examples
@@ -564,7 +553,8 @@ if MLLIB_SPARK_VERSION >= 1.2:
     # Path to test dataset (only used if training dataset given).
     # If not given, hold out part of training data for validation.
     OptionSet("test-data", [""]),
-    # Fraction of data to hold out for testing (ignored if given training and test dataset).
+    # Fraction of data to hold out for testing
+    # (Ignored if given training and test dataset, or if using synthetic data.)
     OptionSet("test-data-fraction", [0.2], can_scale=False),
     # Number of trees. If 1, then run DecisionTree. If >1, then run RandomForest.
     OptionSet("num-trees", [1, 10], can_scale=False),
@@ -622,22 +612,19 @@ MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [

 if MLLIB_SPARK_VERSION >= 1.3:
     MLLIB_TESTS += [("gmm", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_CLUSTERING_TEST_OPTS)]
+        MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_GMM_TEST_OPTS)]

 MLLIB_LDA_TEST_OPTS = MLLIB_COMMON_OPTS + [
-    OptionSet("num-documents", [10000], can_scale=True),
-    OptionSet("num-vocab", [1000], can_scale=False),
+    OptionSet("num-documents", [50000], can_scale=True),
+    OptionSet("num-vocab", [10000], can_scale=False),
     OptionSet("num-topics", [20], can_scale=False),
     OptionSet("num-iterations", [20]),
-    OptionSet("document-length", [100])]
-
-if MLLIB_SPARK_VERSION >= 1.4:
-    MLLIB_TESTS += [("emlda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("emlda")] + MLLIB_LDA_TEST_OPTS)]
+    OptionSet("document-length", [100]),
+    OptionSet("optimizer", ["em", "online"])]

 if MLLIB_SPARK_VERSION >= 1.4:
-    MLLIB_TESTS += [("onlinelda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("onlinelda")] + MLLIB_LDA_TEST_OPTS)]
+    MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
+        MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)]

 # TODO: tune PIC test size to run in 20-30 seconds
 MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [
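Each OptionSet above contributes one axis to a cross-product of test runs, so trimming reg-param from two values to one halves the elastic-net sweep while elastic-net-param still covers L2 (0.0) and L1 (1.0). Below is a minimal sketch of that expansion idea, assuming a simple (name, values) representation rather than spark-perf's actual OptionSet/ConstantOption machinery:

// Hypothetical sketch of cross-product expansion; spark-perf's real
// option classes differ in details (scaling, constant options, etc.).
object OptionCrossProduct {
  def expand(opts: Seq[(String, Seq[String])]): Seq[Seq[String]] =
    opts.foldLeft(Seq(Seq.empty[String])) { case (acc, (name, values)) =>
      for (args <- acc; v <- values) yield args :+ s"--$name=$v"
    }

  def main(args: Array[String]): Unit = {
    val opts = Seq(
      "elastic-net-param" -> Seq("0.0", "1.0"),
      "reg-param"         -> Seq("0.01"))
    // Two runs instead of the four that reg-param [0.0, 0.5] produced.
    expand(opts).foreach(println)
  }
}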

mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest {
     val implicitRatings: Boolean = booleanOptionValue(IMPLICIT)

     val data = DataGenerator.generateRatings(sc, numUsers, numProducts,
-      math.ceil(numRatings * 1.25).toLong, implicitRatings,numPartitions,seed)
+      numRatings, implicitRatings, numPartitions, seed)

     rdd = data._1.cache()
     testRdd = data._2
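The v1.4 test previously asked the generator for 25% more ratings than requested; passing numRatings through unchanged makes the 1.4 and 1.5 ALS benchmarks operate on the same data volume, per the commit note "updated 1.4 ALS test to use same numRatings as 1.5". A hypothetical stand-in for the repo's DataGenerator.generateRatings, only to illustrate the contract (exactly numRatings ratings, no hidden inflation):

import scala.util.Random
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD

// Sketch only: not the repo's actual generator.
object RatingsSketch {
  def generateRatings(sc: SparkContext, numUsers: Int, numProducts: Int,
      numRatings: Long, numPartitions: Int, seed: Long): RDD[Rating] =
    sc.parallelize(0L until numRatings, numPartitions)
      .mapPartitionsWithIndex { (idx, iter) =>
        val rng = new Random(seed + idx)  // per-partition RNG for determinism
        iter.map(_ => Rating(rng.nextInt(numUsers), rng.nextInt(numProducts),
          rng.nextDouble() * 5.0))
      }
}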

mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 72 additions & 62 deletions
@@ -10,6 +10,7 @@ import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, RandomF
 import org.apache.spark.mllib.classification._
 import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater, SimpleUpdater}
 import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest}
@@ -140,46 +141,34 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
     val regParam = doubleOptionValue(REG_PARAM)
     val elasticNetParam = doubleOptionValue(ELASTIC_NET_PARAM)
     val numIterations = intOptionValue(NUM_ITERATIONS)
-    val optimizer = stringOptionValue(OPTIMIZER)
+    // val optimizer = stringOptionValue(OPTIMIZER) // ignore for now since it makes config hard to do

     // Linear Regression only supports squared loss for now.
     if (!Array("l2").contains(loss)) {
       throw new IllegalArgumentException(
         s"GLMRegressionTest run with unknown loss ($loss). Supported values: l2.")
     }

-    if (Array("sgd").contains(optimizer)) {
-      if (!Array("none", "l1", "l2").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with sgd. Supported values: none, l1, l2.")
-      }
-    } else if (Array("lbfgs").contains(optimizer)) {
-      if (!Array("elastic-net").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with lbfgs. Supported values: elastic-net.")
-      }
-    } else {
-      throw new IllegalArgumentException(
-        s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, lbfgs.")
-    }
-
     (loss, regType) match {
       case ("l2", "none") =>
         val lr = new LinearRegressionWithSGD().setIntercept(addIntercept = true)
-        lr.optimizer.setNumIterations(numIterations).setStepSize(stepSize)
+        lr.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setConvergenceTol(0.0)
         lr.run(rdd)
       case ("l2", "l1") =>
         val lasso = new LassoWithSGD().setIntercept(addIntercept = true)
         lasso.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setRegParam(regParam)
+          .setConvergenceTol(0.0)
         lasso.run(rdd)
       case ("l2", "l2") =>
         val rr = new RidgeRegressionWithSGD().setIntercept(addIntercept = true)
         rr.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setRegParam(regParam)
+          .setConvergenceTol(0.0)
         rr.run(rdd)
       case ("l2", "elastic-net") =>
-        println("WARNING: Linear Regression with elastic-net in ML package uses LBFGS/OWLQN for optimization" +
-          " which ignores stepSize and uses numIterations for maxIter in Spark 1.5.")
-        val rr = new LinearRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam).setMaxIter(numIterations)
+        println("WARNING: Linear Regression with elastic-net in ML package uses LBFGS/OWLQN for" +
+          " optimization which ignores stepSize and uses numIterations for maxIter in Spark 1.5.")
+        val rr = new LinearRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam)
+          .setMaxIter(numIterations)
         val sqlContext = new SQLContext(rdd.context)
         import sqlContext.implicits._
         val mlModel = rr.fit(rdd.toDF())
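The new .setConvergenceTol(0.0) calls matter for benchmarking: Spark 1.5's GradientDescent gained convergence-based early stopping, and a nonzero tolerance could let a run finish in fewer than numIterations steps, skewing timings. A minimal sketch of the pattern, assuming a trainingRdd: RDD[LabeledPoint] is in scope:

import org.apache.spark.mllib.regression.LinearRegressionWithSGD

// Sketch only: pin the amount of work per run so timings stay comparable.
val lr = new LinearRegressionWithSGD().setIntercept(true)
lr.optimizer
  .setNumIterations(100)   // every run does exactly 100 SGD steps
  .setStepSize(0.1)
  .setConvergenceTol(0.0)  // disable early stopping on small weight updates
// val model = lr.run(trainingRdd)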
@@ -247,46 +236,68 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
         s"GLMClassificationTest run with unknown loss ($loss). Supported values: logistic, hinge.")
     }

-    if (Array("sgd").contains(optimizer)) {
-      if (!Array("none", "l1", "l2").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with sgd. Supported values: none, l1, l2.")
-      }
-    } else if (Array("lbfgs").contains(optimizer)) {
-      if (!Array("logistic").contains(loss)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest with lbfgs only supports logistic loss.")
-      }
-      if (!Array("none", "elastic-net").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with lbfgs. Supported values: none, elastic-net.")
+    if (regType == "elastic-net") { // use spark.ml
+      loss match {
+        case "logistic" =>
+          println("WARNING: Logistic Regression with elastic-net in ML package uses LBFGS/OWLQN for optimization" +
+            " which ignores stepSize in Spark 1.5.")
+          val lor = new LogisticRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam)
+            .setMaxIter(numIterations)
+          val sqlContext = new SQLContext(rdd.context)
+          import sqlContext.implicits._
+          val mlModel = lor.fit(rdd.toDF())
+          new LogisticRegressionModel(mlModel.weights, mlModel.intercept)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMClassificationTest given unsupported loss = $loss." +
+            s" Note the set of supported combinations increases in later Spark versions.")
       }
     } else {
-      throw new IllegalArgumentException(
-        s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, lbfgs.")
-    }
-
-    (loss, regType, optimizer) match {
-      case ("logistic", "none", "sgd") =>
-        LogisticRegressionWithSGD.train(rdd, numIterations, stepSize)
-      case ("logistic", "none", "lbfgs") =>
-        println("WARNING: LogisticRegressionWithLBFGS ignores numIterations, stepSize" +
-          " in this Spark version.")
-        new LogisticRegressionWithLBFGS().run(rdd)
-      case ("logistic", "elastic-net", _) =>
-        println("WARNING: Logistic Regression with elastic-net in ML package uses LBFGS/OWLQN for optimization" +
-          " which ignores stepSize and uses numIterations for maxIter in Spark 1.5.")
-        val lor = new LogisticRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam).setMaxIter(numIterations)
-        val sqlContext = new SQLContext(rdd.context)
-        import sqlContext.implicits._
-        val mlModel = lor.fit(rdd.toDF())
-        new LogisticRegressionModel(mlModel.weights, mlModel.intercept)
-      case ("hinge", "l2", "sgd") =>
-        SVMWithSGD.train(rdd, numIterations, stepSize, regParam)
-      case _ =>
-        throw new IllegalArgumentException(
-          s"GLMClassificationTest given incompatible (loss, regType) = ($loss, $regType)." +
-          s" Note the set of supported combinations increases in later Spark versions.")
+      (loss, optimizer) match {
+        case ("logistic", "sgd") =>
+          val lr = new LogisticRegressionWithSGD()
+          lr.optimizer.setStepSize(stepSize).setNumIterations(numIterations).setConvergenceTol(0.0)
+          regType match {
+            case "none" =>
+              lr.optimizer.setUpdater(new SimpleUpdater)
+            case "l1" =>
+              lr.optimizer.setUpdater(new L1Updater)
+            case "l2" =>
+              lr.optimizer.setUpdater(new SquaredL2Updater)
+          }
+          lr.run(rdd)
+        case ("logistic", "lbfgs") =>
+          println("WARNING: LogisticRegressionWithLBFGS ignores stepSize in this Spark version.")
+          val lr = new LogisticRegressionWithLBFGS()
+          lr.optimizer.setNumIterations(numIterations).setConvergenceTol(0.0)
+          regType match {
+            case "none" =>
+              lr.optimizer.setUpdater(new SimpleUpdater)
+            case "l1" =>
+              lr.optimizer.setUpdater(new L1Updater)
+            case "l2" =>
+              lr.optimizer.setUpdater(new SquaredL2Updater)
+          }
+          lr.run(rdd)
+        case ("hinge", "sgd") =>
+          val svm = new SVMWithSGD()
+          svm.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setRegParam(regParam)
+            .setConvergenceTol(0.0)
+          regType match {
+            case "none" =>
+              svm.optimizer.setUpdater(new SimpleUpdater)
+            case "l1" =>
+              svm.optimizer.setUpdater(new L1Updater)
+            case "l2" =>
+              svm.optimizer.setUpdater(new SquaredL2Updater)
+          }
+          svm.run(rdd)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMClassificationTest given incompatible (loss, regType) = ($loss, $regType)." +
+            s" Supported combinations include: (elastic-net, _), (logistic, sgd), (logistic, lbfgs), (hinge, sgd)." +
+            s" Note the set of supported combinations increases in later Spark versions.")
+      }
     }
   }
 }
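The rewritten branches share one pattern: in the RDD-based mllib API, the regularizer is selected by plugging an Updater into the algorithm's optimizer rather than by choosing a different trainer class, which is what lets the config expose reg-type as a plain option. A minimal sketch of that pattern, with trainingRdd: RDD[LabeledPoint] assumed in scope:

import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}

// Sketch only: pick the regularizer by swapping the optimizer's Updater.
val regType = "l1"  // one of "none", "l1", "l2", as in the config's reg-type option
val lr = new LogisticRegressionWithSGD()
lr.optimizer.setNumIterations(100).setStepSize(1.0).setConvergenceTol(0.0)
lr.optimizer.setUpdater(regType match {
  case "none" => new SimpleUpdater      // no regularization
  case "l1"   => new L1Updater          // L1 penalty
  case "l2"   => new SquaredL2Updater   // L2 penalty
})
// val model = lr.run(trainingRdd)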
@@ -322,7 +333,7 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest {
     val implicitRatings: Boolean = booleanOptionValue(IMPLICIT)

     val data = DataGenerator.generateRatings(sc, numUsers, numProducts,
-      math.ceil(numRatings * 1.25).toLong, implicitRatings,numPartitions,seed)
+      numRatings, implicitRatings, numPartitions, seed)

     rdd = data._1.cache()
     testRdd = data._2
@@ -490,7 +501,7 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) {
     val seed = intOptionValue(RANDOM_SEED) + 12

     new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam)
-      .setBlocks(rdd.partitions.size).run(rdd)
+      .setBlocks(rdd.partitions.length).run(rdd)
   }
 }

@@ -627,7 +638,6 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) {
       seed: Long): (Array[RDD[LabeledPoint]], Map[Int, Int], Int) = {
     // Generic test options
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
-    val testDataFraction: Double = getTestDataFraction
     // Data dimensions and type
     val numExamples: Long = longOptionValue(NUM_EXAMPLES)
     val numFeatures: Int = intOptionValue(NUM_FEATURES)
@@ -642,7 +652,7 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) {
       numFeatures, numPartitions, labelType,
       fracCategoricalFeatures, fracBinaryFeatures, treeDepth, seed)

-    val splits = rdd_.randomSplit(Array(1.0 - testDataFraction, testDataFraction), seed)
+    val splits = rdd_.randomSplit(Array(0.8, 0.2), seed)
     (splits, categoricalFeaturesInfo_, labelType)
   }

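Dropping testDataFraction in favor of a hard-coded Array(0.8, 0.2) works because RDD.randomSplit normalizes its weights, so the synthetic-data path now always holds out roughly 20%, matching the config comment that test-data-fraction is ignored for synthetic data. A sketch of the split with hypothetical names:

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Sketch only: an 80/20 split with a fixed seed for reproducible benchmarks.
def holdOut(data: RDD[LabeledPoint], seed: Long): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
  val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed)
  (train, test)
}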

mllib-tests/v1p5/src/main/scala/mllib/perf/TestRunner.scala

Lines changed: 4 additions & 5 deletions
@@ -8,16 +8,16 @@ import org.json4s.jackson.JsonMethods._

 import org.apache.spark.{SparkConf, SparkContext}

-import mllib.perf.clustering.{EMLDATest, GaussianMixtureTest, OnlineLDATest, PICTest}
+import mllib.perf.clustering.{GaussianMixtureTest, LDATest, PICTest}
 import mllib.perf.feature.Word2VecTest
 import mllib.perf.fpm.{FPGrowthTest, PrefixSpanTest}
 import mllib.perf.linalg.BlockMatrixMultTest

 object TestRunner {
   def main(args: Array[String]) {
-    if (args.size < 1) {
+    if (args.length < 1) {
       println(
-        "mllib.perf.TestRunner requires 1 or more args, you gave %s, exiting".format(args.size))
+        "mllib.perf.TestRunner requires 1 or more args, you gave %s, exiting".format(args.length))
       System.exit(1)
     }
     val testName = args(0)
@@ -34,8 +34,7 @@ object TestRunner {
       // clustering
       case "gmm" => new GaussianMixtureTest(sc)
       case "kmeans" => new KMeansTest(sc)
-      case "emlda" => new EMLDATest(sc)
-      case "onlinelda" => new OnlineLDATest(sc)
+      case "lda" => new LDATest(sc)
       case "pic" => new PICTest(sc)
       // trees
       case "decision-tree" => new DecisionTreeTest(sc)
