Skip to content

Commit 6e4f26d

Browse files
jkbradley authored and mengxr committed
QA 1.6 fixes and cleanups
Various cleanups and fixes made during QA for 1.6 release * Fixed PIC test, and made it larger * Fixed NaiveBayes test * Fixed ALSTest in Scala to use implicitPrefs * Removed recommend all in ALSTest * Updated config: * elastic net params * removed numPoints, numColumns from clustering and used numExamples, numFeatures instead * set tol to 0 in elastic net tests CC: mengxr Author: Joseph K. Bradley <[email protected]> Closes #94 from jkbradley/qa1.6-fixes and squashes the following commits: f7509b4 [Joseph K. Bradley] set tol to 0 in elastic net tests. Note this does not affect validity of the 1.6 QA tests since 1.5,1.6 should use tol the same way and have the same defaults 6d0120d [Joseph K. Bradley] fixed config.py.template for elastic-net params, and for eliminating num-points, num-columns cd7d77e [Joseph K. Bradley] Fixes after initial 1.6 perf tests. * Changed clustering tests to use numExamples, numFeatures instead of numPoints, numColumns * Fixed Scala ALS to use implicitPrefs option in training * Fixed Python NaiveBayes to use numExamples instead of numPoints * Changes to config to increase very short test times 39abc13 [Joseph K. Bradley] made PIC test larger. removed recommend all in ALSTest 4844b34 [Joseph K. Bradley] update to PICTest ad7b453 [Joseph K. Bradley] fixed PIC test 5a43830 [Joseph K. Bradley] fixes to get ml tests to run for 1.6 qa
1 parent e8ea1d6 commit 6e4f26d

File tree

11 files changed

+101
-88
lines changed

11 files changed

+101
-88
lines changed

config/config.py.template

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,9 @@ MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [
399399

400400
# Generalized Linear Model (GLM) Tests #
401401
MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
402+
# The scale factor for the noise in feature values.
403+
# Currently ignored for regression.
404+
OptionSet("feature-noise", [1.0]),
402405
# The number of features per example
403406
OptionSet("num-features", [10000], can_scale=False),
404407
# The number of iterations for SGD
@@ -410,11 +413,6 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
410413
# Regularization parameter
411414
OptionSet("reg-param", [0.1])
412415
]
413-
if MLLIB_SPARK_VERSION >= 1.1:
414-
MLLIB_GLM_TEST_OPTS += [
415-
# Optimization algorithm: sgd, l-bfgs
416-
OptionSet("optimizer", ["sgd", "l-bfgs"])
417-
]
418416
if MLLIB_SPARK_VERSION >= 1.5:
419417
MLLIB_GLM_TEST_OPTS += [
420418
# Ignored, but required for config
@@ -423,6 +421,8 @@ if MLLIB_SPARK_VERSION >= 1.5:
423421

424422
# GLM Regression Tests #
425423
MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
424+
# Optimization algorithm: sgd
425+
OptionSet("optimizer", ["sgd"]),
426426
# The intercept for the data
427427
OptionSet("intercept", [0.0]),
428428
# The scale factor for label noise
@@ -438,6 +438,8 @@ MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
438438
MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
439439
# Expected fraction of examples which are negative
440440
OptionSet("per-negative", [0.3]),
441+
# Optimization algorithm: sgd, l-bfgs
442+
OptionSet("optimizer", ["sgd", "l-bfgs"])
441443
]
442444

443445
# GLM Classification Tests #
@@ -464,15 +466,15 @@ if MLLIB_SPARK_VERSION >= 1.5:
464466
OptionSet("reg-param", [0.01]),
465467
# The scale factor for the noise in feature values
466468
OptionSet("feature-noise", [1.0]),
467-
# The scale factor for the noise in label values
468-
OptionSet("label-noise", [0.1]),
469-
# The intercept for the data
470-
OptionSet("intercept", [0.2]),
471469
# The step size is not used in LBFGS, but this is required in parameter checking.
472470
OptionSet("step-size", [0.0])
473471
]
474472

475473
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
474+
# The scale factor for the noise in label values
475+
OptionSet("label-noise", [0.1]),
476+
# The intercept for the data
477+
OptionSet("intercept", [0.2]),
476478
# Loss to minimize: l2 (squared error)
477479
OptionSet("loss", ["l2"])
478480
]
@@ -486,9 +488,11 @@ if MLLIB_SPARK_VERSION >= 1.5:
486488
MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
487489
MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
488490
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS +
489-
[OptionSet("num-features", [100], can_scale=False)])]
491+
[OptionSet("num-features", [200], can_scale=False)])]
490492

491493
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
494+
# Expected fraction of examples which are negative
495+
OptionSet("per-negative", [0.3]),
492496
# In GLM classification with elastic-net regularization, only logistic loss is supported.
493497
OptionSet("loss", ["logistic"])
494498
]
@@ -502,7 +506,7 @@ if MLLIB_SPARK_VERSION >= 1.5:
502506
MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
503507
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
504508
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS +
505-
[OptionSet("num-features", [100], can_scale=False)])]
509+
[OptionSet("num-features", [200], can_scale=False)])]
506510

507511
NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
508512
# The number of features per example
@@ -595,10 +599,10 @@ MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
595599

596600
# Clustering Tests #
597601
MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [
598-
# The number of points
599-
OptionSet("num-points", [1000000], can_scale=True),
602+
# The number of examples
603+
OptionSet("num-examples", [1000000], can_scale=True),
600604
# The number of features per point
601-
OptionSet("num-columns", [10000], can_scale=False),
605+
OptionSet("num-features", [10000], can_scale=False),
602606
# The number of centers
603607
OptionSet("num-centers", [20]),
604608
# The number of iterations for KMeans
@@ -609,8 +613,8 @@ MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
609613
MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)]
610614

611615
MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [
612-
OptionSet("num-points", [1000000], can_scale=True),
613-
OptionSet("num-columns", [100], can_scale=False),
616+
OptionSet("num-examples", [1000000], can_scale=True),
617+
OptionSet("num-features", [100], can_scale=False),
614618
OptionSet("num-centers", [20], can_scale=False),
615619
OptionSet("num-iterations", [20])]
616620

@@ -630,16 +634,15 @@ if MLLIB_SPARK_VERSION >= 1.4:
630634
MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
631635
MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)]
632636

633-
# TODO: tune PIC test size to run in 20-30 seconds
634637
MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [
635-
OptionSet("num-points", [10000], can_scale=True),
636-
OptionSet("node-degree", [10], can_scale=False),
637-
OptionSet("num-centers", [20], can_scale=False),
638+
OptionSet("num-examples", [10000000], can_scale=True),
639+
OptionSet("node-degree", [20], can_scale=False),
640+
OptionSet("num-centers", [40], can_scale=False),
638641
OptionSet("num-iterations", [20])]
639642

640643
if MLLIB_SPARK_VERSION >= 1.3:
641644
MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
642-
MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_CLUSTERING_TEST_OPTS)]
645+
MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_PIC_TEST_OPTS)]
643646

644647
# Linear Algebra Tests #
645648
MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [
@@ -668,7 +671,7 @@ MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
668671

669672
MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
670673
MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] +
671-
MLLIB_LINALG_TEST_OPTS)]
674+
MLLIB_BIG_LINALG_TEST_OPTS)]
672675

673676
MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [
674677
OptionSet("m", [20000], can_scale=True),
@@ -752,8 +755,8 @@ if MLLIB_SPARK_VERSION >= 1.3:
752755
MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \
753756
[OptionSet("num-sequences", [5000000], can_scale=True),
754757
OptionSet("avg-sequence-size", [5], can_scale=False),
755-
OptionSet("avg-itemset-size", [1], can_scale=False),
756-
OptionSet("num-items", [100], can_scale=False),
758+
OptionSet("avg-itemset-size", [2], can_scale=False),
759+
OptionSet("num-items", [500], can_scale=False),
757760
OptionSet("min-support", [0.5], can_scale=False),
758761
OptionSet("max-pattern-len", [10], can_scale=False),
759762
OptionSet("max-local-proj-db-size", [32000000], can_scale=False)]

mllib-tests/project/MLlibTestsBuild.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ object MLlibTestsBuild extends Build {
3535
val targetFolder = sparkVersion.value match {
3636
case v if v.startsWith("1.4.") => "v1p4"
3737
case v if v.startsWith("1.5.") => "v1p5"
38+
case v if v.startsWith("1.6.") =>
39+
"v1p5" // acceptable for now, but change later when new algs are added
3840
case _ => throw new IllegalArgumentException(s"Do not support Spark ${sparkVersion.value}.")
3941
}
4042
baseDirectory.value / targetFolder / "src" / "main" / "scala"

mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -315,35 +315,35 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest {
315315

316316
def runTest(rdd: RDD[Vector]): KMeansModel
317317

318-
val NUM_POINTS = ("num-points", "number of points for clustering tests")
319-
val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests")
318+
val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests")
319+
val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests")
320320
val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
321321
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
322322

323-
intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS)
324-
longOptions = longOptions ++ Seq(NUM_POINTS)
323+
intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS)
324+
longOptions = longOptions ++ Seq(NUM_EXAMPLES)
325325
val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
326326
addOptionsToParser()
327327

328328
var rdd: RDD[Vector] = _
329329
var testRdd: RDD[Vector] = _
330330

331331
def validate(model: KMeansModel, rdd: RDD[Vector]): Double = {
332-
val numPoints = rdd.cache().count()
332+
val numExamples = rdd.cache().count()
333333

334334
val error = model.computeCost(rdd)
335335

336-
math.sqrt(error/numPoints)
336+
math.sqrt(error/numExamples)
337337
}
338338

339339
override def createInputData(seed: Long) = {
340340
val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
341341

342-
val numPoints: Long = longOptionValue(NUM_POINTS)
343-
val numColumns: Int = intOptionValue(NUM_COLUMNS)
342+
val numExamples: Long = longOptionValue(NUM_EXAMPLES)
343+
val numFeatures: Int = intOptionValue(NUM_FEATURES)
344344
val numCenters: Int = intOptionValue(NUM_CENTERS)
345345

346-
val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns,
346+
val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures,
347347
numCenters, numPartitions, seed)
348348

349349
val split = data.randomSplit(Array(0.8, 0.2), seed)
@@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) {
441441
val rank: Int = intOptionValue(RANK)
442442
val regParam = doubleOptionValue(REG_PARAM)
443443
val seed = intOptionValue(RANDOM_SEED) + 12
444+
val implicitRatings: Boolean = booleanOptionValue(IMPLICIT)
444445

445446
new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam)
446-
.setBlocks(rdd.partitions.size).run(rdd)
447+
.setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd)
447448
}
448449
}
449450

mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,21 @@ import mllib.perf.PerfTest
1616
class GaussianMixtureTest(sc: SparkContext) extends PerfTest {
1717

1818
// TODO: refactor k-means and GMM code
19-
val NUM_POINTS = ("num-points", "number of points for clustering tests")
20-
val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests")
19+
val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests")
20+
val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests")
2121
val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
2222
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
2323

24-
intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS)
25-
longOptions ++= Seq(NUM_POINTS)
24+
intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS)
25+
longOptions ++= Seq(NUM_EXAMPLES)
2626
val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
2727
addOptionsToParser()
2828

2929
var data: RDD[Vector] = _
3030

3131
override def createInputData(seed: Long): Unit = {
32-
val m = longOptionValue(NUM_POINTS)
33-
val n = intOptionValue(NUM_COLUMNS)
32+
val m = longOptionValue(NUM_EXAMPLES)
33+
val n = intOptionValue(NUM_FEATURES)
3434
val k = intOptionValue(NUM_CENTERS)
3535
val p = intOptionValue(NUM_PARTITIONS)
3636

@@ -47,7 +47,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest {
4747
Vectors.dense(y.data)
4848
}
4949
}.cache()
50-
logInfo(s"Generated ${data.count()} points.")
50+
logInfo(s"Generated ${data.count()} examples.")
5151
}
5252

5353
override def run(): JValue = {

mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,28 @@ import mllib.perf.PerfTest
1111

1212
class PICTest(sc: SparkContext) extends PerfTest {
1313

14-
val NUM_POINTS = ("num-points", "number of points")
14+
val NUM_EXAMPLES = ("num-examples", "number of examples")
1515
val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to")
1616
val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
1717
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
1818

1919
intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS)
20-
longOptions ++= Seq(NUM_POINTS)
20+
longOptions ++= Seq(NUM_EXAMPLES)
2121
val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
2222
addOptionsToParser()
2323

2424
var data: RDD[(Long, Long, Double)] = _
2525

2626
override def createInputData(seed: Long): Unit = {
27-
val numPoints = longOptionValue(NUM_POINTS)
27+
val numExamples = longOptionValue(NUM_EXAMPLES)
2828
val nodeDegree = intOptionValue(NODE_DEGREE)
2929
val numPartitions = intOptionValue(NUM_PARTITIONS)
3030

3131
// Generates a periodic banded matrix with bandwidth = nodeDegree
32-
val data = sc.parallelize(0L to numPoints, numPartitions)
32+
data = sc.parallelize(0L to numExamples, numPartitions)
3333
.flatMap { id =>
34-
(((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
35-
(id, (nbr + numPoints) % numPoints, 1D)
34+
(((id - nodeDegree / 2) % numExamples) until id).map { nbr =>
35+
(id, (nbr + numExamples) % numExamples, 1D)
3636
}
3737
}
3838
logInfo(s"Generated ${data.count()} pairwise similarities.")
@@ -46,6 +46,7 @@ class PICTest(sc: SparkContext) extends PerfTest {
4646
.setK(k)
4747
.setMaxIterations(numIterations)
4848
val model = pic.run(data)
49+
model.assignments.count()
4950
val duration = (System.currentTimeMillis() - start) / 1e3
5051
"time" -> duration
5152
}

mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I
509509

510510
class KMeansDataGenerator(
511511
val numCenters: Int,
512-
val numColumns: Int,
512+
val numFeatures: Int,
513513
val seed: Long) extends RandomDataGenerator[Vector] {
514514

515515
private val rng = new java.util.Random(seed)
@@ -528,20 +528,20 @@ class KMeansDataGenerator(
528528
}
529529

530530
private val centers = (0 until numCenters).map{i =>
531-
Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i))
531+
Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i))
532532
}
533533

534534
override def nextValue(): Vector = {
535535
val pick_center_rand = rng2.nextDouble()
536536

537537
val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p))
538538

539-
Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian()))
539+
Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian()))
540540
}
541541

542542
override def setSeed(seed: Long) {
543543
rng.setSeed(seed)
544544
}
545545

546-
override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed)
546+
override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed)
547547
}

0 commit comments

Comments (0)