
Commit a163676

jkbradley authored and mengxr committed

Cleanups from spark 1.5 mllib perf tests

More tests should be runnable for GLMs. Some config fixes. CC mengxr

Author: Joseph K. Bradley <[email protected]>

Closes #88 from jkbradley/ml-perf-cleanup-1.6 and squashes the following commits:

ca82fa9 [Joseph K. Bradley] updated 1.4 ALS test to use same numRatings as 1.5
b71373d [Joseph K. Bradley] Cleanups from spark 1.5 mllib perf tests. More tests should be runnable for GLMs. some config fixes
1 parent 93e4560 commit a163676

6 files changed: +143 -145 lines changed

config/config.py.template

Lines changed: 22 additions & 35 deletions
@@ -370,7 +370,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
-MLLIB_SPARK_VERSION = 1.2
+MLLIB_SPARK_VERSION = 1.5

 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:
@@ -413,7 +413,12 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
 if MLLIB_SPARK_VERSION >= 1.1:
     MLLIB_GLM_TEST_OPTS += [
         # Optimization algorithm: sgd, lbfgs
-        OptionSet("optimizer", ["sgd"])
+        OptionSet("optimizer", ["sgd", "lbfgs"])
+    ]
+if MLLIB_SPARK_VERSION >= 1.5:
+    MLLIB_GLM_TEST_OPTS += [
+        # Ignored, but required for config
+        OptionSet("elastic-net-param", [0.0])
     ]

 # GLM Regression Tests #
@@ -441,8 +446,8 @@ if MLLIB_SPARK_VERSION >= 1.5:
         OptionSet("reg-type", ["elastic-net"]),
         # Runs with L2 (param = 0.0), L1 (param = 1.0).
         OptionSet("elastic-net-param", [0.0, 1.0]),
-        # Runs with lambda = [0.0, 0.5]
-        OptionSet("reg-param", [0.0, 0.5]),
+        # Regularization param (lambda)
+        OptionSet("reg-param", [0.01]),
         # The scale factor for the noise
         OptionSet("epsilon", [0.1]),
         # The intercept for the data
@@ -466,7 +471,7 @@ MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
 # GLM Classification Tests #
 MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [
     # Loss to minimize: logistic, hinge (SVM)
-    OptionSet("loss", ["logistic", "hinge"])
+    OptionSet("loss", ["logistic"])
 ]

 MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
@@ -483,8 +488,8 @@ if MLLIB_SPARK_VERSION >= 1.5:
         OptionSet("reg-type", ["elastic-net"]),
         # Runs with L2 (param = 0.0), L1 (param = 1.0).
         OptionSet("elastic-net-param", [0.0, 1.0]),
-        # Runs with lambda = [0.0, 0.5]
-        OptionSet("reg-param", [0.0, 0.5]),
+        # Regularization param (lambda)
+        OptionSet("reg-param", [0.01]),
         # The scale factor for the noise
         OptionSet("epsilon", [0.1]),
         # The intercept for the data
@@ -504,30 +509,14 @@ NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
     OptionSet("scale-factor", [1.0]),
     # Naive Bayes smoothing lambda.
     OptionSet("nb-lambda", [1.0]),
-    # Model type: either Multinomial or Bernoulli
+    # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+)
     OptionSet("model-type", ["multinomial"]),
 ]

 MLLIB_TESTS += [("naive-bayes", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
     MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] +
     NAIVE_BAYES_TEST_OPTS)]

-if MLLIB_SPARK_VERSION >= 1.4:
-    NAIVE_BAYES_TEST_OPTS_BERNOULLI = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-        # Expected fraction of examples which are negative
-        OptionSet("per-negative", [0.3]),
-        # The scale factor for the noise in feature values
-        OptionSet("scale-factor", [1.0]),
-        # Naive Bayes smoothing lambda.
-        OptionSet("nb-lambda", [1.0]),
-        # MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-        OptionSet("model-type", ["bernoulli"]),
-    ]
-
-    MLLIB_TESTS += [("naive-bayes-bernoulli", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] +
-        NAIVE_BAYES_TEST_OPTS_BERNOULLI)]
-
 # Decision Trees #
 MLLIB_DECISION_TREE_TEST_OPTS = MLLIB_COMMON_OPTS + [
     # The number of rows or examples
@@ -564,7 +553,8 @@ if MLLIB_SPARK_VERSION >= 1.2:
     # Path to test dataset (only used if training dataset given).
     # If not given, hold out part of training data for validation.
     OptionSet("test-data", [""]),
-    # Fraction of data to hold out for testing (ignored if given training and test dataset).
+    # Fraction of data to hold out for testing
+    # (Ignored if given training and test dataset, or if using synthetic data.)
     OptionSet("test-data-fraction", [0.2], can_scale=False),
     # Number of trees. If 1, then run DecisionTree. If >1, then run RandomForest.
     OptionSet("num-trees", [1, 10], can_scale=False),
@@ -622,22 +612,19 @@ MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [

 if MLLIB_SPARK_VERSION >= 1.3:
     MLLIB_TESTS += [("gmm", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_CLUSTERING_TEST_OPTS)]
+        MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_GMM_TEST_OPTS)]

 MLLIB_LDA_TEST_OPTS = MLLIB_COMMON_OPTS + [
-    OptionSet("num-documents", [10000], can_scale=True),
-    OptionSet("num-vocab", [1000], can_scale=False),
+    OptionSet("num-documents", [50000], can_scale=True),
+    OptionSet("num-vocab", [10000], can_scale=False),
     OptionSet("num-topics", [20], can_scale=False),
     OptionSet("num-iterations", [20]),
-    OptionSet("document-length", [100])]
-
-if MLLIB_SPARK_VERSION >= 1.4:
-    MLLIB_TESTS += [("emlda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("emlda")] + MLLIB_LDA_TEST_OPTS)]
+    OptionSet("document-length", [100]),
+    OptionSet("optimizer", ["em", "online"])]

 if MLLIB_SPARK_VERSION >= 1.4:
-    MLLIB_TESTS += [("onlinelda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
-        MLLIB_JAVA_OPTS, [ConstantOption("onlinelda")] + MLLIB_LDA_TEST_OPTS)]
+    MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
+        MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)]

 # TODO: tune PIC test size to run in 20-30 seconds
 MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [
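Each OptionSet above contributes one axis to a cross-product of test runs, so trimming reg-param from two values to one halves the elastic-net sweep while elastic-net-param still covers L2 (0.0) and L1 (1.0). Below is a minimal sketch of that expansion idea, assuming a simple (name, values) representation rather than spark-perf's actual OptionSet/ConstantOption machinery:

// Hypothetical sketch of cross-product expansion; spark-perf's real
// option classes differ in details (scaling, constant options, etc.).
object OptionCrossProduct {
  def expand(opts: Seq[(String, Seq[String])]): Seq[Seq[String]] =
    opts.foldLeft(Seq(Seq.empty[String])) { case (acc, (name, values)) =>
      for (args <- acc; v <- values) yield args :+ s"--$name=$v"
    }

  def main(args: Array[String]): Unit = {
    val opts = Seq(
      "elastic-net-param" -> Seq("0.0", "1.0"),
      "reg-param"         -> Seq("0.01"))
    // Two runs instead of the four that reg-param [0.0, 0.5] produced.
    expand(opts).foreach(println)
  }
}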

mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest {
     val implicitRatings: Boolean = booleanOptionValue(IMPLICIT)

     val data = DataGenerator.generateRatings(sc, numUsers, numProducts,
-      math.ceil(numRatings * 1.25).toLong, implicitRatings,numPartitions,seed)
+      numRatings, implicitRatings, numPartitions, seed)

     rdd = data._1.cache()
     testRdd = data._2
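The v1.4 test previously asked the generator for 25% more ratings than requested; passing numRatings through unchanged makes the 1.4 and 1.5 ALS benchmarks operate on the same data volume, per the commit note "updated 1.4 ALS test to use same numRatings as 1.5". A hypothetical stand-in for the repo's DataGenerator.generateRatings, only to illustrate the contract (exactly numRatings ratings, no hidden inflation):

import scala.util.Random
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD

// Sketch only: not the repo's actual generator.
object RatingsSketch {
  def generateRatings(sc: SparkContext, numUsers: Int, numProducts: Int,
      numRatings: Long, numPartitions: Int, seed: Long): RDD[Rating] =
    sc.parallelize(0L until numRatings, numPartitions)
      .mapPartitionsWithIndex { (idx, iter) =>
        val rng = new Random(seed + idx)  // per-partition RNG for determinism
        iter.map(_ => Rating(rng.nextInt(numUsers), rng.nextInt(numProducts),
          rng.nextDouble() * 5.0))
      }
}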

mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 72 additions & 62 deletions
@@ -10,6 +10,7 @@ import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, RandomF
 import org.apache.spark.mllib.classification._
 import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
+import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater, SimpleUpdater}
 import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest}
@@ -140,46 +141,34 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
     val regParam = doubleOptionValue(REG_PARAM)
     val elasticNetParam = doubleOptionValue(ELASTIC_NET_PARAM)
     val numIterations = intOptionValue(NUM_ITERATIONS)
-    val optimizer = stringOptionValue(OPTIMIZER)
+    // val optimizer = stringOptionValue(OPTIMIZER) // ignore for now since it makes config hard to do

     // Linear Regression only supports squared loss for now.
     if (!Array("l2").contains(loss)) {
       throw new IllegalArgumentException(
         s"GLMRegressionTest run with unknown loss ($loss). Supported values: l2.")
     }

-    if (Array("sgd").contains(optimizer)) {
-      if (!Array("none", "l1", "l2").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with sgd. Supported values: none, l1, l2.")
-      }
-    } else if (Array("lbfgs").contains(optimizer)) {
-      if (!Array("elastic-net").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with lbfgs. Supported values: elastic-net.")
-      }
-    } else {
-      throw new IllegalArgumentException(
-        s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, lbfgs.")
-    }
-
     (loss, regType) match {
       case ("l2", "none") =>
         val lr = new LinearRegressionWithSGD().setIntercept(addIntercept = true)
-        lr.optimizer.setNumIterations(numIterations).setStepSize(stepSize)
+        lr.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setConvergenceTol(0.0)
         lr.run(rdd)
       case ("l2", "l1") =>
         val lasso = new LassoWithSGD().setIntercept(addIntercept = true)
         lasso.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setRegParam(regParam)
+          .setConvergenceTol(0.0)
         lasso.run(rdd)
       case ("l2", "l2") =>
         val rr = new RidgeRegressionWithSGD().setIntercept(addIntercept = true)
         rr.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setRegParam(regParam)
+          .setConvergenceTol(0.0)
         rr.run(rdd)
       case ("l2", "elastic-net") =>
-        println("WARNING: Linear Regression with elastic-net in ML package uses LBFGS/OWLQN for optimization" +
-          " which ignores stepSize and uses numIterations for maxIter in Spark 1.5.")
-        val rr = new LinearRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam).setMaxIter(numIterations)
+        println("WARNING: Linear Regression with elastic-net in ML package uses LBFGS/OWLQN for" +
+          " optimization which ignores stepSize and uses numIterations for maxIter in Spark 1.5.")
+        val rr = new LinearRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam)
+          .setMaxIter(numIterations)
         val sqlContext = new SQLContext(rdd.context)
         import sqlContext.implicits._
         val mlModel = rr.fit(rdd.toDF())
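The new .setConvergenceTol(0.0) calls matter for benchmarking: Spark 1.5's GradientDescent gained convergence-based early stopping, and a nonzero tolerance could let a run finish in fewer than numIterations steps, skewing timings. A minimal sketch of the pattern, assuming a trainingRdd: RDD[LabeledPoint] is in scope:

import org.apache.spark.mllib.regression.LinearRegressionWithSGD

// Sketch only: pin the amount of work per run so timings stay comparable.
val lr = new LinearRegressionWithSGD().setIntercept(true)
lr.optimizer
  .setNumIterations(100)   // every run does exactly 100 SGD steps
  .setStepSize(0.1)
  .setConvergenceTol(0.0)  // disable early stopping on small weight updates
// val model = lr.run(trainingRdd)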
@@ -247,46 +236,68 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
         s"GLMClassificationTest run with unknown loss ($loss). Supported values: logistic, hinge.")
     }

-    if (Array("sgd").contains(optimizer)) {
-      if (!Array("none", "l1", "l2").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with sgd. Supported values: none, l1, l2.")
-      }
-    } else if (Array("lbfgs").contains(optimizer)) {
-      if (!Array("logistic").contains(loss)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest with lbfgs only supports logistic loss.")
-      }
-      if (!Array("none", "elastic-net").contains(regType)) {
-        throw new IllegalArgumentException(
-          s"GLMRegressionTest run with unknown regType ($regType) with lbfgs. Supported values: none, elastic-net.")
+    if (regType == "elastic-net") { // use spark.ml
+      loss match {
+        case "logistic" =>
+          println("WARNING: Logistic Regression with elastic-net in ML package uses LBFGS/OWLQN for optimization" +
+            " which ignores stepSize in Spark 1.5.")
+          val lor = new LogisticRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam)
+            .setMaxIter(numIterations)
+          val sqlContext = new SQLContext(rdd.context)
+          import sqlContext.implicits._
+          val mlModel = lor.fit(rdd.toDF())
+          new LogisticRegressionModel(mlModel.weights, mlModel.intercept)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMClassificationTest given unsupported loss = $loss." +
+            s" Note the set of supported combinations increases in later Spark versions.")
       }
     } else {
-      throw new IllegalArgumentException(
-        s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, lbfgs.")
-    }
-
-    (loss, regType, optimizer) match {
-      case ("logistic", "none", "sgd") =>
-        LogisticRegressionWithSGD.train(rdd, numIterations, stepSize)
-      case ("logistic", "none", "lbfgs") =>
-        println("WARNING: LogisticRegressionWithLBFGS ignores numIterations, stepSize" +
-          " in this Spark version.")
-        new LogisticRegressionWithLBFGS().run(rdd)
-      case ("logistic", "elastic-net", _) =>
-        println("WARNING: Logistic Regression with elastic-net in ML package uses LBFGS/OWLQN for optimization" +
-          " which ignores stepSize and uses numIterations for maxIter in Spark 1.5.")
-        val lor = new LogisticRegression().setElasticNetParam(elasticNetParam).setRegParam(regParam).setMaxIter(numIterations)
-        val sqlContext = new SQLContext(rdd.context)
-        import sqlContext.implicits._
-        val mlModel = lor.fit(rdd.toDF())
-        new LogisticRegressionModel(mlModel.weights, mlModel.intercept)
-      case ("hinge", "l2", "sgd") =>
-        SVMWithSGD.train(rdd, numIterations, stepSize, regParam)
-      case _ =>
-        throw new IllegalArgumentException(
-          s"GLMClassificationTest given incompatible (loss, regType) = ($loss, $regType)." +
-          s" Note the set of supported combinations increases in later Spark versions.")
+      (loss, optimizer) match {
+        case ("logistic", "sgd") =>
+          val lr = new LogisticRegressionWithSGD()
+          lr.optimizer.setStepSize(stepSize).setNumIterations(numIterations).setConvergenceTol(0.0)
+          regType match {
+            case "none" =>
+              lr.optimizer.setUpdater(new SimpleUpdater)
+            case "l1" =>
+              lr.optimizer.setUpdater(new L1Updater)
+            case "l2" =>
+              lr.optimizer.setUpdater(new SquaredL2Updater)
+          }
+          lr.run(rdd)
+        case ("logistic", "lbfgs") =>
+          println("WARNING: LogisticRegressionWithLBFGS ignores stepSize in this Spark version.")
+          val lr = new LogisticRegressionWithLBFGS()
+          lr.optimizer.setNumIterations(numIterations).setConvergenceTol(0.0)
+          regType match {
+            case "none" =>
+              lr.optimizer.setUpdater(new SimpleUpdater)
+            case "l1" =>
+              lr.optimizer.setUpdater(new L1Updater)
+            case "l2" =>
+              lr.optimizer.setUpdater(new SquaredL2Updater)
+          }
+          lr.run(rdd)
+        case ("hinge", "sgd") =>
+          val svm = new SVMWithSGD()
+          svm.optimizer.setNumIterations(numIterations).setStepSize(stepSize).setRegParam(regParam)
+            .setConvergenceTol(0.0)
+          regType match {
+            case "none" =>
+              svm.optimizer.setUpdater(new SimpleUpdater)
+            case "l1" =>
+              svm.optimizer.setUpdater(new L1Updater)
+            case "l2" =>
+              svm.optimizer.setUpdater(new SquaredL2Updater)
+          }
+          svm.run(rdd)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMClassificationTest given incompatible (loss, regType) = ($loss, $regType)." +
+            s" Supported combinations include: (elastic-net, _), (logistic, sgd), (logistic, lbfgs), (hinge, sgd)." +
+            s" Note the set of supported combinations increases in later Spark versions.")
+      }
     }
   }
 }
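The rewritten branches share one pattern: in the RDD-based mllib API, the regularizer is selected by plugging an Updater into the algorithm's optimizer rather than by choosing a different trainer class, which is what lets the config expose reg-type as a plain option. A minimal sketch of that pattern, with trainingRdd: RDD[LabeledPoint] assumed in scope:

import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}

// Sketch only: pick the regularizer by swapping the optimizer's Updater.
val regType = "l1"  // one of "none", "l1", "l2", as in the config's reg-type option
val lr = new LogisticRegressionWithSGD()
lr.optimizer.setNumIterations(100).setStepSize(1.0).setConvergenceTol(0.0)
lr.optimizer.setUpdater(regType match {
  case "none" => new SimpleUpdater      // no regularization
  case "l1"   => new L1Updater          // L1 penalty
  case "l2"   => new SquaredL2Updater   // L2 penalty
})
// val model = lr.run(trainingRdd)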
@@ -322,7 +333,7 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest {
     val implicitRatings: Boolean = booleanOptionValue(IMPLICIT)

     val data = DataGenerator.generateRatings(sc, numUsers, numProducts,
-      math.ceil(numRatings * 1.25).toLong, implicitRatings,numPartitions,seed)
+      numRatings, implicitRatings, numPartitions, seed)

     rdd = data._1.cache()
     testRdd = data._2
@@ -490,7 +501,7 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) {
     val seed = intOptionValue(RANDOM_SEED) + 12

     new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam)
-      .setBlocks(rdd.partitions.size).run(rdd)
+      .setBlocks(rdd.partitions.length).run(rdd)
   }
 }

@@ -627,7 +638,6 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) {
       seed: Long): (Array[RDD[LabeledPoint]], Map[Int, Int], Int) = {
     // Generic test options
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
-    val testDataFraction: Double = getTestDataFraction
     // Data dimensions and type
     val numExamples: Long = longOptionValue(NUM_EXAMPLES)
     val numFeatures: Int = intOptionValue(NUM_FEATURES)
@@ -642,7 +652,7 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) {
       numFeatures, numPartitions, labelType,
       fracCategoricalFeatures, fracBinaryFeatures, treeDepth, seed)

-    val splits = rdd_.randomSplit(Array(1.0 - testDataFraction, testDataFraction), seed)
+    val splits = rdd_.randomSplit(Array(0.8, 0.2), seed)
     (splits, categoricalFeaturesInfo_, labelType)
   }

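Dropping testDataFraction in favor of a hard-coded Array(0.8, 0.2) works because RDD.randomSplit normalizes its weights, so the synthetic-data path now always holds out roughly 20%, matching the config comment that test-data-fraction is ignored for synthetic data. A sketch of the split with hypothetical names:

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Sketch only: an 80/20 split with a fixed seed for reproducible benchmarks.
def holdOut(data: RDD[LabeledPoint], seed: Long): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
  val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed)
  (train, test)
}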

mllib-tests/v1p5/src/main/scala/mllib/perf/TestRunner.scala

Lines changed: 4 additions & 5 deletions
@@ -8,16 +8,16 @@ import org.json4s.jackson.JsonMethods._

 import org.apache.spark.{SparkConf, SparkContext}

-import mllib.perf.clustering.{EMLDATest, GaussianMixtureTest, OnlineLDATest, PICTest}
+import mllib.perf.clustering.{GaussianMixtureTest, LDATest, PICTest}
 import mllib.perf.feature.Word2VecTest
 import mllib.perf.fpm.{FPGrowthTest, PrefixSpanTest}
 import mllib.perf.linalg.BlockMatrixMultTest

 object TestRunner {
   def main(args: Array[String]) {
-    if (args.size < 1) {
+    if (args.length < 1) {
       println(
-        "mllib.perf.TestRunner requires 1 or more args, you gave %s, exiting".format(args.size))
+        "mllib.perf.TestRunner requires 1 or more args, you gave %s, exiting".format(args.length))
       System.exit(1)
     }
     val testName = args(0)
@@ -34,8 +34,7 @@ object TestRunner {
       // clustering
       case "gmm" => new GaussianMixtureTest(sc)
       case "kmeans" => new KMeansTest(sc)
-      case "emlda" => new EMLDATest(sc)
-      case "onlinelda" => new OnlineLDATest(sc)
+      case "lda" => new LDATest(sc)
       case "pic" => new PICTest(sc)
       // trees
       case "decision-tree" => new DecisionTreeTest(sc)
