databricks
diff --git a/config/config.py.template
Lines changed: 53 additions & 49 deletions b/config/config.py.template
Lines changed: 53 additions & 49 deletions
diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
Lines changed: 18 additions & 16 deletions b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
Lines changed: 18 additions & 16 deletions
diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
Lines changed: 11 additions & 11 deletions b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
Lines changed: 11 additions & 11 deletions
@@ -394,13 +394,13 @@ MLLIB_COMMON_OPTS = COMMON_OPTS + [
 # Regression and Classification Tests #
 MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [
     # The number of rows or examples
-    OptionSet("num-examples", [1000000], can_scale=True),
-    # The number of features per example
-    OptionSet("num-features", [10000], can_scale=False)
+    OptionSet("num-examples", [1000000], can_scale=True)
 ]
 
 # Generalized Linear Model (GLM) Tests #
 MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
+    # The number of features per example
+    OptionSet("num-features", [10000], can_scale=False),
     # The number of iterations for SGD
     OptionSet("num-iterations", [20]),
     # The step size for SGD
@@ -412,8 +412,8 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
 ]
 if MLLIB_SPARK_VERSION >= 1.1:
     MLLIB_GLM_TEST_OPTS += [
-        # Optimization algorithm: sgd, lbfgs
-        OptionSet("optimizer", ["sgd", "lbfgs"])
+        # Optimization algorithm: sgd, l-bfgs
+        OptionSet("optimizer", ["sgd", "l-bfgs"])
     ]
 if MLLIB_SPARK_VERSION >= 1.5:
     MLLIB_GLM_TEST_OPTS += [
@@ -425,47 +425,19 @@ if MLLIB_SPARK_VERSION >= 1.5:
 MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
     # The intercept for the data
     OptionSet("intercept", [0.0]),
-    # The scale factor for the noise
-    OptionSet("epsilon", [0.1]),
+    # The scale factor for label noise
+    OptionSet("label-noise", [0.1]),
     # Loss to minimize: l2 (squared error)
     OptionSet("loss", ["l2"])
 ]
 
 MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
     MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_REGRESSION_TEST_OPTS)]
 
-if MLLIB_SPARK_VERSION >= 1.5:
-    MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-        # Loss to minimize: l2 (squared error)
-        OptionSet("loss", ["l2"]),
-        # The max number of iterations for LBFGS/OWLQN
-        OptionSet("num-iterations", [20]),
-        # LBFGS/OWLQN is used with elastic-net regularization.
-        OptionSet("optimizer", ["lbfgs"]),
-        # Using elastic-net regularization.
-        OptionSet("reg-type", ["elastic-net"]),
-        # Runs with L2 (param = 0.0), L1 (param = 1.0).
-        OptionSet("elastic-net-param", [0.0, 1.0]),
-        # Regularization param (lambda)
-        OptionSet("reg-param", [0.01]),
-        # The scale factor for the noise
-        OptionSet("epsilon", [0.1]),
-        # The intercept for the data
-        OptionSet("intercept", [0.0]),
-        # The step size is not used in LBFGS, but this is required in parameter checking.
-        OptionSet("step-size", [0.0])
-    ]
-
-    MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
-        MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS)]
-
 # Classification Tests #
 MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
-     # Expected fraction of examples which are negative
-     OptionSet("per-negative", [0.3]),
-     # The scale factor for the noise in feature values
-     OptionSet("scale-factor", [1.0])
+    # Expected fraction of examples which are negative
+    OptionSet("per-negative", [0.3]),
 ]
 
 # GLM Classification Tests #
@@ -475,38 +447,70 @@ MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [
 ]
 
 MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-    MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
-    MLLIB_GLM_CLASSIFICATION_TEST_OPTS)]
+                 MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
+                 MLLIB_GLM_CLASSIFICATION_TEST_OPTS)]
 
 if MLLIB_SPARK_VERSION >= 1.5:
-    MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_CLASSIFICATION_TEST_OPTS + [
-        # In GLM classification with elastic-net regularization, only logistic loss is supported.
-        OptionSet("loss", ["logistic"]),
+    MLLIB_GLM_ELASTIC_NET_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
+        # The max number of iterations for LBFGS/OWLQN
+        OptionSet("num-iterations", [20]),
-        # LBFGS/OWLQN is used with elastic-net regularization.
+        # "auto": solver is picked automatically (L-BFGS/OWLQN or normal equation).
-        OptionSet("optimizer", ["lbfgs"]),
+        OptionSet("optimizer", ["auto"]),
         # Using elastic-net regularization.
         OptionSet("reg-type", ["elastic-net"]),
         # Runs with L2 (param = 0.0), L1 (param = 1.0).
         OptionSet("elastic-net-param", [0.0, 1.0]),
         # Regularization param (lambda)
         OptionSet("reg-param", [0.01]),
-        # The scale factor for the noise
-        OptionSet("epsilon", [0.1]),
+        # The scale factor for the noise in feature values
+        OptionSet("feature-noise", [1.0]),
+        # The scale factor for the noise in label values
+        OptionSet("label-noise", [0.1]),
         # The intercept for the data
-        OptionSet("intercept", [0.0]),
+        OptionSet("intercept", [0.2]),
         # The step size is not used in LBFGS, but this is required in parameter checking.
         OptionSet("step-size", [0.0])
     ]
 
+    MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
+        # Loss to minimize: l2 (squared error)
+        OptionSet("loss", ["l2"])
+    ]
+
+    # Test L-BFGS
+    MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
+        MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
+        MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS +
+        [OptionSet("num-features", [10000], can_scale=False)])]
+    # Test normal equation solver
+    MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
+                     MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
+                     MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS +
+                     [OptionSet("num-features", [100], can_scale=False)])]
+
+    MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
+        # In GLM classification with elastic-net regularization, only logistic loss is supported.
+        OptionSet("loss", ["logistic"])
+    ]
+
+    # Test L-BFGS
     MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
-        MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS)]
+                     MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
+                     MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS +
+                     [OptionSet("num-features", [10000], can_scale=False)])]
+    # Test normal equation solver
+    MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
+                     MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
+                     MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS +
+                     [OptionSet("num-features", [100], can_scale=False)])]
 
 NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
+    # The number of features per example
+    OptionSet("num-features", [10000], can_scale=False),
     # Expected fraction of examples which are negative
     OptionSet("per-negative", [0.3]),
     # The scale factor for the noise in feature values
-    OptionSet("scale-factor", [1.0]),
+    OptionSet("feature-noise", [1.0]),
     # Naive Bayes smoothing lambda.
     OptionSet("nb-lambda", [1.0]),
     # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+)
 
@@ -79,7 +79,7 @@ abstract class GLMTests(sc: SparkContext)
   val NUM_ITERATIONS = ("num-iterations",   "number of iterations for the algorithm")
   val REG_TYPE =       ("reg-type",   "type of regularization: none, l1, l2")
   val REG_PARAM =      ("reg-param",   "the regularization parameter against overfitting")
-  val OPTIMIZER =      ("optimizer", "optimization algorithm: sgd, lbfgs")
+  val OPTIMIZER =      ("optimizer", "optimization algorithm: sgd, l-bfgs")
 
   intOptions = intOptions ++ Seq(NUM_ITERATIONS)
   doubleOptions = doubleOptions ++ Seq(STEP_SIZE, REG_PARAM)
@@ -89,10 +89,10 @@ abstract class GLMTests(sc: SparkContext)
 class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
 
   val INTERCEPT =  ("intercept",   "intercept for random data generation")
-  val EPS =  ("epsilon",   "scale factor for the noise during data generation")
+  val LABEL_NOISE =  ("label-noise",   "scale factor for the noise during label generation")
   val LOSS =  ("loss",   "loss to minimize. Supported: l2 (squared error).")
 
-  doubleOptions = doubleOptions ++ Seq(INTERCEPT, EPS)
+  doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE)
   stringOptions = stringOptions ++ Seq(LOSS)
 
   val options = intOptions ++ stringOptions  ++ booleanOptions ++ doubleOptions ++ longOptions
@@ -104,10 +104,10 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val intercept: Double = doubleOptionValue(INTERCEPT)
-    val eps: Double = doubleOptionValue(EPS)
+    val labelNoise: Double = doubleOptionValue(LABEL_NOISE)
 
     val data = DataGenerator.generateLabeledPoints(sc, math.ceil(numExamples * 1.25).toLong,
-      numFeatures, intercept, eps, numPartitions, seed)
+      numFeatures, intercept, labelNoise, numPartitions, seed)
 
     val split = data.randomSplit(Array(0.8, 0.2), seed)
 
@@ -167,10 +167,10 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
 class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
 
   val THRESHOLD =  ("per-negative",   "probability for a negative label during data generation")
-  val SCALE =  ("scale-factor",   "scale factor for the noise during data generation")
+  val FEATURE_NOISE =  ("feature-noise",   "scale factor for the noise during feature generation")
   val LOSS =  ("loss",   "loss to minimize. Supported: logistic, hinge (SVM).")
 
-  doubleOptions = doubleOptions ++ Seq(THRESHOLD, SCALE)
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE)
   stringOptions = stringOptions ++ Seq(LOSS)
 
   val options = intOptions ++ stringOptions  ++ booleanOptions ++ doubleOptions ++ longOptions
@@ -190,10 +190,11 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val threshold: Double = doubleOptionValue(THRESHOLD)
-    val sf: Double = doubleOptionValue(SCALE)
+    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
 
     val data = DataGenerator.generateClassificationLabeledPoints(sc,
-      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, sf, numPartitions, seed)
+      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+      seed)
 
     val split = data.randomSplit(Array(0.8, 0.2), seed)
 
@@ -220,15 +221,15 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
       throw new IllegalArgumentException(s"GLMClassificationTest run with unknown regType" +
         s" ($regType).  Supported values: none, l1, l2.")
     }
-    if (!Array("sgd", "lbfgs").contains(optimizer)) {
+    if (!Array("sgd", "l-bfgs").contains(optimizer)) {
       throw new IllegalArgumentException(
-        s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, lbfgs.")
+        s"GLMClassificationTest run with unknown optimizer ($optimizer). Supported values: sgd, l-bfgs.")
     }
 
     (loss, regType, optimizer) match {
       case ("logistic", "none", "sgd") =>
         LogisticRegressionWithSGD.train(rdd, numIterations, stepSize)
-      case ("logistic", "none", "lbfgs") =>
+      case ("logistic", "none", "l-bfgs") =>
         println("WARNING: LogisticRegressionWithLBFGS ignores numIterations, stepSize" +
           " in this Spark version.")
         new LogisticRegressionWithLBFGS().run(rdd)
@@ -375,11 +376,11 @@ class NaiveBayesTest(sc: SparkContext)
   extends RegressionAndClassificationTests[NaiveBayesModel](sc) {
 
   val THRESHOLD =  ("per-negative",   "probability for a negative label during data generation")
-  val SCALE =  ("scale-factor",   "scale factor for the noise during data generation")
+  val FEATURE_NOISE =  ("feature-noise",   "scale factor for the noise during feature generation")
   val SMOOTHING =     ("nb-lambda",   "the smoothing parameter lambda for Naive Bayes")
   val MODEL_TYPE = ("model-type", "either multinomial (default) or bernoulli")
 
-  doubleOptions = doubleOptions ++ Seq(THRESHOLD, SCALE, SMOOTHING)
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE, SMOOTHING)
   stringOptions = stringOptions ++ Seq(MODEL_TYPE)
   val options = intOptions ++ stringOptions  ++ booleanOptions ++ doubleOptions ++ longOptions
   addOptionsToParser()
@@ -391,15 +392,16 @@ class NaiveBayesTest(sc: SparkContext)
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val threshold: Double = doubleOptionValue(THRESHOLD)
-    val sf: Double = doubleOptionValue(SCALE)
+    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
     val modelType = stringOptionValue(MODEL_TYPE)
 
     val data = if (modelType == "Bernoulli") {
       DataGenerator.generateBinaryLabeledPoints(sc,
         math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions, seed)
     } else {
       val negdata = DataGenerator.generateClassificationLabeledPoints(sc,
-      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, sf, numPartitions, seed)
+        math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+        seed)
       val dataNonneg = negdata.map { lp =>
         LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(math.abs)))
       }
 
@@ -19,13 +19,13 @@ object DataGenerator {
       numRows: Long,
       numCols: Int,
       intercept: Double,
-      eps: Double,
+      labelNoise: Double,
       numPartitions: Int,
       seed: Long = System.currentTimeMillis(),
       problem: String = ""): RDD[LabeledPoint] = {
 
-    RandomRDDs.randomRDD(sc,
-      new LinearDataGenerator(numCols,intercept, seed, eps, problem), numRows, numPartitions, seed)
+    RandomRDDs.randomRDD(sc, new LinearDataGenerator(numCols,intercept, seed, labelNoise, problem),
+      numRows, numPartitions, seed)
 
   }
 
@@ -46,12 +46,12 @@ object DataGenerator {
       numRows: Long,
       numCols: Int,
       threshold: Double,
-      scaleFactor: Double,
+      featureNoise: Double,
       numPartitions: Int,
       seed: Long = System.currentTimeMillis(),
       chiSq: Boolean = false): RDD[LabeledPoint] = {
 
-    RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, scaleFactor, chiSq),
+    RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, featureNoise, chiSq),
       numRows, numPartitions, seed)
   }
 
@@ -325,15 +325,15 @@ class RatingGenerator(
 class ClassLabelGenerator(
     private val numFeatures: Int,
     private val threshold: Double,
-    private val scaleFactor: Double,
+    private val featureNoise: Double,
     private val chiSq: Boolean) extends RandomDataGenerator[LabeledPoint] {
 
   private val rng = new java.util.Random()
 
   override def nextValue(): LabeledPoint = {
     val y = if (rng.nextDouble() < threshold) 0.0 else 1.0
     val x = Array.fill[Double](numFeatures) {
-      if (!chiSq) rng.nextGaussian() + (y * scaleFactor) else rng.nextInt(6) * 1.0
+      if (!chiSq) rng.nextGaussian() + (y * featureNoise) else rng.nextInt(6) * 1.0
     }
 
     LabeledPoint(y, Vectors.dense(x))
@@ -344,7 +344,7 @@ class ClassLabelGenerator(
   }
 
   override def copy(): ClassLabelGenerator =
-    new ClassLabelGenerator(numFeatures, threshold, scaleFactor, chiSq)
+    new ClassLabelGenerator(numFeatures, threshold, featureNoise, chiSq)
 }
 
 class BinaryLabeledDataGenerator(
@@ -374,7 +374,7 @@ class LinearDataGenerator(
     val numFeatures: Int,
     val intercept: Double,
     val seed: Long,
-    val eps: Double,
+    val labelNoise: Double,
     val problem: String = "",
     val sparsity: Double = 1.0) extends RandomDataGenerator[LabeledPoint] {
 
@@ -386,7 +386,7 @@ class LinearDataGenerator(
   override def nextValue(): LabeledPoint = {
     val x = Array.fill[Double](nnz)(2*rng.nextDouble()-1)
 
-    val y = weights.zip(x).map(p => p._1 * p._2).sum + intercept + eps*rng.nextGaussian()
+    val y = weights.zip(x).map(p => p._1 * p._2).sum + intercept + labelNoise*rng.nextGaussian()
     val yD =
       if (problem == "SVM"){
         if (y < 0.0) 0.0 else 1.0
@@ -402,7 +402,7 @@ class LinearDataGenerator(
   }
 
   override def copy(): LinearDataGenerator =
-    new LinearDataGenerator(numFeatures, intercept, seed, eps, problem, sparsity)
+    new LinearDataGenerator(numFeatures, intercept, seed, labelNoise, problem, sparsity)
 }