Skip to content

Commit 6e4f26d

Browse files
jkbradley authored and mengxr committed
QA 1.6 fixes and cleanups
Various cleanups and fixes made during QA for 1.6 release * Fixed PIC test, and made it larger * Fixed NaiveBayes test * Fixed ALSTest in Scala to use implicitPrefs * Removed recommend all in ALSTest * Updated config: * elastic net params * removed numPoints, numColumns from clustering and used numExamples, numFeatures instead * set tol to 0 in elastic net tests CC: mengxr Author: Joseph K. Bradley <[email protected]> Closes #94 from jkbradley/qa1.6-fixes and squashes the following commits: f7509b4 [Joseph K. Bradley] set tol to 0 in elastic net tests. Note this does not affect validity of the 1.6 QA tests since 1.5,1.6 should use tol the same way and have the same defaults 6d0120d [Joseph K. Bradley] fixed config.py.template for elastic-net params, and for eliminating num-points, num-columns cd7d77e [Joseph K. Bradley] Fixes after initial 1.6 perf tests. * Changed clustering tests to use numExamples, numFeatures instead of numPoints, numColumns * Fixed Scala ALS to use implicitPrefs option in training * Fixed Python NaiveBayes to use numExamples instead of numPoints * Changes to config to increase very short test times 39abc13 [Joseph K. Bradley] made PIC test larger. removed recommend all in ALSTest 4844b34 [Joseph K. Bradley] update to PICTest ad7b453 [Joseph K. Bradley] fixed PIC test 5a43830 [Joseph K. Bradley] fixes to get ml tests to run for 1.6 qa
1 parent e8ea1d6 commit 6e4f26d

File tree

11 files changed

+101
-88
lines changed

11 files changed

+101
-88
lines changed

config/config.py.template

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,9 @@ MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [
399399

400400
# Generalized Linear Model (GLM) Tests #
401401
MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
402+
# The scale factor for the noise in feature values.
403+
# Currently ignored for regression.
404+
OptionSet("feature-noise", [1.0]),
402405
# The number of features per example
403406
OptionSet("num-features", [10000], can_scale=False),
404407
# The number of iterations for SGD
@@ -410,11 +413,6 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
410413
# Regularization parameter
411414
OptionSet("reg-param", [0.1])
412415
]
413-
if MLLIB_SPARK_VERSION >= 1.1:
414-
MLLIB_GLM_TEST_OPTS += [
415-
# Optimization algorithm: sgd, l-bfgs
416-
OptionSet("optimizer", ["sgd", "l-bfgs"])
417-
]
418416
if MLLIB_SPARK_VERSION >= 1.5:
419417
MLLIB_GLM_TEST_OPTS += [
420418
# Ignored, but required for config
@@ -423,6 +421,8 @@ if MLLIB_SPARK_VERSION >= 1.5:
423421

424422
# GLM Regression Tests #
425423
MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
424+
# Optimization algorithm: sgd
425+
OptionSet("optimizer", ["sgd"]),
426426
# The intercept for the data
427427
OptionSet("intercept", [0.0]),
428428
# The scale factor for label noise
@@ -438,6 +438,8 @@ MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
438438
MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
439439
# Expected fraction of examples which are negative
440440
OptionSet("per-negative", [0.3]),
441+
# Optimization algorithm: sgd, l-bfgs
442+
OptionSet("optimizer", ["sgd", "l-bfgs"])
441443
]
442444

443445
# GLM Classification Tests #
@@ -464,15 +466,15 @@ if MLLIB_SPARK_VERSION >= 1.5:
464466
OptionSet("reg-param", [0.01]),
465467
# The scale factor for the noise in feature values
466468
OptionSet("feature-noise", [1.0]),
467-
# The scale factor for the noise in label values
468-
OptionSet("label-noise", [0.1]),
469-
# The intercept for the data
470-
OptionSet("intercept", [0.2]),
471469
# The step size is not used in LBFGS, but this is required in parameter checking.
472470
OptionSet("step-size", [0.0])
473471
]
474472

475473
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
474+
# The scale factor for the noise in label values
475+
OptionSet("label-noise", [0.1]),
476+
# The intercept for the data
477+
OptionSet("intercept", [0.2]),
476478
# Loss to minimize: l2 (squared error)
477479
OptionSet("loss", ["l2"])
478480
]
@@ -486,9 +488,11 @@ if MLLIB_SPARK_VERSION >= 1.5:
486488
MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
487489
MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
488490
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS +
489-
[OptionSet("num-features", [100], can_scale=False)])]
491+
[OptionSet("num-features", [200], can_scale=False)])]
490492

491493
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
494+
# Expected fraction of examples which are negative
495+
OptionSet("per-negative", [0.3]),
492496
# In GLM classification with elastic-net regularization, only logistic loss is supported.
493497
OptionSet("loss", ["logistic"])
494498
]
@@ -502,7 +506,7 @@ if MLLIB_SPARK_VERSION >= 1.5:
502506
MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
503507
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
504508
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS +
505-
[OptionSet("num-features", [100], can_scale=False)])]
509+
[OptionSet("num-features", [200], can_scale=False)])]
506510

507511
NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
508512
# The number of features per example
@@ -595,10 +599,10 @@ MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
595599

596600
# Clustering Tests #
597601
MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [
598-
# The number of points
599-
OptionSet("num-points", [1000000], can_scale=True),
602+
# The number of examples
603+
OptionSet("num-examples", [1000000], can_scale=True),
600604
# The number of features per point
601-
OptionSet("num-columns", [10000], can_scale=False),
605+
OptionSet("num-features", [10000], can_scale=False),
602606
# The number of centers
603607
OptionSet("num-centers", [20]),
604608
# The number of iterations for KMeans
@@ -609,8 +613,8 @@ MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
609613
MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)]
610614

611615
MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [
612-
OptionSet("num-points", [1000000], can_scale=True),
613-
OptionSet("num-columns", [100], can_scale=False),
616+
OptionSet("num-examples", [1000000], can_scale=True),
617+
OptionSet("num-features", [100], can_scale=False),
614618
OptionSet("num-centers", [20], can_scale=False),
615619
OptionSet("num-iterations", [20])]
616620

@@ -630,16 +634,15 @@ if MLLIB_SPARK_VERSION >= 1.4:
630634
MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
631635
MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)]
632636

633-
# TODO: tune PIC test size to run in 20-30 seconds
634637
MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [
635-
OptionSet("num-points", [10000], can_scale=True),
636-
OptionSet("node-degree", [10], can_scale=False),
637-
OptionSet("num-centers", [20], can_scale=False),
638+
OptionSet("num-examples", [10000000], can_scale=True),
639+
OptionSet("node-degree", [20], can_scale=False),
640+
OptionSet("num-centers", [40], can_scale=False),
638641
OptionSet("num-iterations", [20])]
639642

640643
if MLLIB_SPARK_VERSION >= 1.3:
641644
MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
642-
MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_CLUSTERING_TEST_OPTS)]
645+
MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_PIC_TEST_OPTS)]
643646

644647
# Linear Algebra Tests #
645648
MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [
@@ -668,7 +671,7 @@ MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
668671

669672
MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
670673
MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] +
671-
MLLIB_LINALG_TEST_OPTS)]
674+
MLLIB_BIG_LINALG_TEST_OPTS)]
672675

673676
MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [
674677
OptionSet("m", [20000], can_scale=True),
@@ -752,8 +755,8 @@ if MLLIB_SPARK_VERSION >= 1.3:
752755
MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \
753756
[OptionSet("num-sequences", [5000000], can_scale=True),
754757
OptionSet("avg-sequence-size", [5], can_scale=False),
755-
OptionSet("avg-itemset-size", [1], can_scale=False),
756-
OptionSet("num-items", [100], can_scale=False),
758+
OptionSet("avg-itemset-size", [2], can_scale=False),
759+
OptionSet("num-items", [500], can_scale=False),
757760
OptionSet("min-support", [0.5], can_scale=False),
758761
OptionSet("max-pattern-len", [10], can_scale=False),
759762
OptionSet("max-local-proj-db-size", [32000000], can_scale=False)]

mllib-tests/project/MLlibTestsBuild.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ object MLlibTestsBuild extends Build {
3535
val targetFolder = sparkVersion.value match {
3636
case v if v.startsWith("1.4.") => "v1p4"
3737
case v if v.startsWith("1.5.") => "v1p5"
38+
case v if v.startsWith("1.6.") =>
39+
"v1p5" // acceptable for now, but change later when new algs are added
3840
case _ => throw new IllegalArgumentException(s"Do not support Spark ${sparkVersion.value}.")
3941
}
4042
baseDirectory.value / targetFolder / "src" / "main" / "scala"

mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -315,35 +315,35 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest {
315315

316316
def runTest(rdd: RDD[Vector]): KMeansModel
317317

318-
val NUM_POINTS = ("num-points", "number of points for clustering tests")
319-
val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests")
318+
val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests")
319+
val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests")
320320
val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
321321
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
322322

323-
intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS)
324-
longOptions = longOptions ++ Seq(NUM_POINTS)
323+
intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS)
324+
longOptions = longOptions ++ Seq(NUM_EXAMPLES)
325325
val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
326326
addOptionsToParser()
327327

328328
var rdd: RDD[Vector] = _
329329
var testRdd: RDD[Vector] = _
330330

331331
def validate(model: KMeansModel, rdd: RDD[Vector]): Double = {
332-
val numPoints = rdd.cache().count()
332+
val numExamples = rdd.cache().count()
333333

334334
val error = model.computeCost(rdd)
335335

336-
math.sqrt(error/numPoints)
336+
math.sqrt(error/numExamples)
337337
}
338338

339339
override def createInputData(seed: Long) = {
340340
val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
341341

342-
val numPoints: Long = longOptionValue(NUM_POINTS)
343-
val numColumns: Int = intOptionValue(NUM_COLUMNS)
342+
val numExamples: Long = longOptionValue(NUM_EXAMPLES)
343+
val numFeatures: Int = intOptionValue(NUM_FEATURES)
344344
val numCenters: Int = intOptionValue(NUM_CENTERS)
345345

346-
val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns,
346+
val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures,
347347
numCenters, numPartitions, seed)
348348

349349
val split = data.randomSplit(Array(0.8, 0.2), seed)
@@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) {
441441
val rank: Int = intOptionValue(RANK)
442442
val regParam = doubleOptionValue(REG_PARAM)
443443
val seed = intOptionValue(RANDOM_SEED) + 12
444+
val implicitRatings: Boolean = booleanOptionValue(IMPLICIT)
444445

445446
new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam)
446-
.setBlocks(rdd.partitions.size).run(rdd)
447+
.setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd)
447448
}
448449
}
449450

mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,21 @@ import mllib.perf.PerfTest
1616
class GaussianMixtureTest(sc: SparkContext) extends PerfTest {
1717

1818
// TODO: refactor k-means and GMM code
19-
val NUM_POINTS = ("num-points", "number of points for clustering tests")
20-
val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests")
19+
val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests")
20+
val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests")
2121
val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
2222
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
2323

24-
intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS)
25-
longOptions ++= Seq(NUM_POINTS)
24+
intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS)
25+
longOptions ++= Seq(NUM_EXAMPLES)
2626
val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
2727
addOptionsToParser()
2828

2929
var data: RDD[Vector] = _
3030

3131
override def createInputData(seed: Long): Unit = {
32-
val m = longOptionValue(NUM_POINTS)
33-
val n = intOptionValue(NUM_COLUMNS)
32+
val m = longOptionValue(NUM_EXAMPLES)
33+
val n = intOptionValue(NUM_FEATURES)
3434
val k = intOptionValue(NUM_CENTERS)
3535
val p = intOptionValue(NUM_PARTITIONS)
3636

@@ -47,7 +47,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest {
4747
Vectors.dense(y.data)
4848
}
4949
}.cache()
50-
logInfo(s"Generated ${data.count()} points.")
50+
logInfo(s"Generated ${data.count()} examples.")
5151
}
5252

5353
override def run(): JValue = {

mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,28 @@ import mllib.perf.PerfTest
1111

1212
class PICTest(sc: SparkContext) extends PerfTest {
1313

14-
val NUM_POINTS = ("num-points", "number of points")
14+
val NUM_EXAMPLES = ("num-examples", "number of examples")
1515
val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to")
1616
val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
1717
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
1818

1919
intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS)
20-
longOptions ++= Seq(NUM_POINTS)
20+
longOptions ++= Seq(NUM_EXAMPLES)
2121
val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
2222
addOptionsToParser()
2323

2424
var data: RDD[(Long, Long, Double)] = _
2525

2626
override def createInputData(seed: Long): Unit = {
27-
val numPoints = longOptionValue(NUM_POINTS)
27+
val numExamples = longOptionValue(NUM_EXAMPLES)
2828
val nodeDegree = intOptionValue(NODE_DEGREE)
2929
val numPartitions = intOptionValue(NUM_PARTITIONS)
3030

3131
// Generates a periodic banded matrix with bandwidth = nodeDegree
32-
val data = sc.parallelize(0L to numPoints, numPartitions)
32+
data = sc.parallelize(0L to numExamples, numPartitions)
3333
.flatMap { id =>
34-
(((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
35-
(id, (nbr + numPoints) % numPoints, 1D)
34+
(((id - nodeDegree / 2) % numExamples) until id).map { nbr =>
35+
(id, (nbr + numExamples) % numExamples, 1D)
3636
}
3737
}
3838
logInfo(s"Generated ${data.count()} pairwise similarities.")
@@ -46,6 +46,7 @@ class PICTest(sc: SparkContext) extends PerfTest {
4646
.setK(k)
4747
.setMaxIterations(numIterations)
4848
val model = pic.run(data)
49+
model.assignments.count()
4950
val duration = (System.currentTimeMillis() - start) / 1e3
5051
"time" -> duration
5152
}

mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I
509509

510510
class KMeansDataGenerator(
511511
val numCenters: Int,
512-
val numColumns: Int,
512+
val numFeatures: Int,
513513
val seed: Long) extends RandomDataGenerator[Vector] {
514514

515515
private val rng = new java.util.Random(seed)
@@ -528,20 +528,20 @@ class KMeansDataGenerator(
528528
}
529529

530530
private val centers = (0 until numCenters).map{i =>
531-
Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i))
531+
Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i))
532532
}
533533

534534
override def nextValue(): Vector = {
535535
val pick_center_rand = rng2.nextDouble()
536536

537537
val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p))
538538

539-
Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian()))
539+
Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian()))
540540
}
541541

542542
override def setSeed(seed: Long) {
543543
rng.setSeed(seed)
544544
}
545545

546-
override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed)
546+
override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed)
547547
}

0 commit comments

Comments (0)