Skip to content

Commit e8ea1d6

Browse files
jkbradley authored and mengxr committed
ALS recommend all, GLM normal equation, pyspark elastic-net
Updated ML spark-perf config to have separate test for normal equation solver for GLMs. Added ALS test for recommendUsersForProducts, recommendProductsForUsers. Renamed config "lbfgs" to "l-bfgs" to match spark.ml Param value. Added elastic net to PySpark tests.
CC: mengxr
Author: Joseph K. Bradley <[email protected]>
Closes #90 from jkbradley/missing-algs and squashes the following commits:
09acb37 [Joseph K. Bradley] Renamed configs:
* scaleFactor, scale -> featureNoise
* epsilon, eps -> labelNoise
91e6eba [Joseph K. Bradley] updates pyspark tests to run elastic-net
b04d304 [Joseph K. Bradley] Updated ML spark-perf config to have separate test for normal equation solver for GLMs. Added ALS test for recommendUsersForProducts, recommendProductsForUsers.
1 parent a163676 commit e8ea1d6

File tree

6 files changed

+218
-173
lines changed

6 files changed

+218
-173
lines changed

config/config.py.template

Lines changed: 53 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -394,13 +394,13 @@ MLLIB_COMMON_OPTS = COMMON_OPTS + [
394394
# Regression and Classification Tests #
395395
MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [
396396
# The number of rows or examples
397-
OptionSet("num-examples", [1000000], can_scale=True),
398-
# The number of features per example
399-
OptionSet("num-features", [10000], can_scale=False)
397+
OptionSet("num-examples", [1000000], can_scale=True)
400398
]
401399

402400
# Generalized Linear Model (GLM) Tests #
403401
MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
402+
# The number of features per example
403+
OptionSet("num-features", [10000], can_scale=False),
404404
# The number of iterations for SGD
405405
OptionSet("num-iterations", [20]),
406406
# The step size for SGD
@@ -412,8 +412,8 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
412412
]
413413
if MLLIB_SPARK_VERSION >= 1.1:
414414
MLLIB_GLM_TEST_OPTS += [
415-
# Optimization algorithm: sgd, lbfgs
416-
OptionSet("optimizer", ["sgd", "lbfgs"])
415+
# Optimization algorithm: sgd, l-bfgs
416+
OptionSet("optimizer", ["sgd", "l-bfgs"])
417417
]
418418
if MLLIB_SPARK_VERSION >= 1.5:
419419
MLLIB_GLM_TEST_OPTS += [
@@ -425,47 +425,19 @@ if MLLIB_SPARK_VERSION >= 1.5:
425425
MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
426426
# The intercept for the data
427427
OptionSet("intercept", [0.0]),
428-
# The scale factor for the noise
429-
OptionSet("epsilon", [0.1]),
428+
# The scale factor for label noise
429+
OptionSet("label-noise", [0.1]),
430430
# Loss to minimize: l2 (squared error)
431431
OptionSet("loss", ["l2"])
432432
]
433433

434434
MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
435435
MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_REGRESSION_TEST_OPTS)]
436436

437-
if MLLIB_SPARK_VERSION >= 1.5:
438-
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
439-
# Loss to minimize: l2 (squared error)
440-
OptionSet("loss", ["l2"]),
441-
# The max number of iterations for LBFGS/OWLQN
442-
OptionSet("num-iterations", [20]),
443-
# LBFGS/OWLQN is used with elastic-net regularization.
444-
OptionSet("optimizer", ["lbfgs"]),
445-
# Using elastic-net regularization.
446-
OptionSet("reg-type", ["elastic-net"]),
447-
# Runs with L2 (param = 0.0), L1 (param = 1.0).
448-
OptionSet("elastic-net-param", [0.0, 1.0]),
449-
# Regularization param (lambda)
450-
OptionSet("reg-param", [0.01]),
451-
# The scale factor for the noise
452-
OptionSet("epsilon", [0.1]),
453-
# The intercept for the data
454-
OptionSet("intercept", [0.0]),
455-
# The step size is not used in LBFGS, but this is required in parameter checking.
456-
OptionSet("step-size", [0.0])
457-
]
458-
459-
MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
460-
MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
461-
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS)]
462-
463437
# Classification Tests #
464438
MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
465-
# Expected fraction of examples which are negative
466-
OptionSet("per-negative", [0.3]),
467-
# The scale factor for the noise in feature values
468-
OptionSet("scale-factor", [1.0])
439+
# Expected fraction of examples which are negative
440+
OptionSet("per-negative", [0.3]),
469441
]
470442

471443
# GLM Classification Tests #
@@ -475,38 +447,70 @@ MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [
475447
]
476448

477449
MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
478-
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
479-
MLLIB_GLM_CLASSIFICATION_TEST_OPTS)]
450+
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
451+
MLLIB_GLM_CLASSIFICATION_TEST_OPTS)]
480452

481453
if MLLIB_SPARK_VERSION >= 1.5:
482-
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_CLASSIFICATION_TEST_OPTS + [
483-
# In GLM classification with elastic-net regularization, only logistic loss is supported.
484-
OptionSet("loss", ["logistic"]),
454+
MLLIB_GLM_ELASTIC_NET_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
455+
# The max number of iterations for LBFGS/OWLQN
456+
OptionSet("num-iterations", [20]),
485457
# LBFGS/OWLQN is used with elastic-net regularization.
486-
OptionSet("optimizer", ["lbfgs"]),
458+
OptionSet("optimizer", ["auto"]),
487459
# Using elastic-net regularization.
488460
OptionSet("reg-type", ["elastic-net"]),
489461
# Runs with L2 (param = 0.0), L1 (param = 1.0).
490462
OptionSet("elastic-net-param", [0.0, 1.0]),
491463
# Regularization param (lambda)
492464
OptionSet("reg-param", [0.01]),
493-
# The scale factor for the noise
494-
OptionSet("epsilon", [0.1]),
465+
# The scale factor for the noise in feature values
466+
OptionSet("feature-noise", [1.0]),
467+
# The scale factor for the noise in label values
468+
OptionSet("label-noise", [0.1]),
495469
# The intercept for the data
496-
OptionSet("intercept", [0.0]),
470+
OptionSet("intercept", [0.2]),
497471
# The step size is not used in LBFGS, but this is required in parameter checking.
498472
OptionSet("step-size", [0.0])
499473
]
500474

475+
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
476+
# Loss to minimize: l2 (squared error)
477+
OptionSet("loss", ["l2"])
478+
]
479+
480+
# Test L-BFGS
481+
MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
482+
MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
483+
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS +
484+
[OptionSet("num-features", [10000], can_scale=False)])]
485+
# Test normal equation solver
486+
MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
487+
MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] +
488+
MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS +
489+
[OptionSet("num-features", [100], can_scale=False)])]
490+
491+
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [
492+
# In GLM classification with elastic-net regularization, only logistic loss is supported.
493+
OptionSet("loss", ["logistic"])
494+
]
495+
496+
# Test L-BFGS
501497
MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
502-
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
503-
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS)]
498+
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
499+
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS +
500+
[OptionSet("num-features", [10000], can_scale=False)])]
501+
# Test normal equation solver
502+
MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR,
503+
MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] +
504+
MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS +
505+
[OptionSet("num-features", [100], can_scale=False)])]
504506

505507
NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
508+
# The number of features per example
509+
OptionSet("num-features", [10000], can_scale=False),
506510
# Expected fraction of examples which are negative
507511
OptionSet("per-negative", [0.3]),
508512
# The scale factor for the noise in feature values
509-
OptionSet("scale-factor", [1.0]),
513+
OptionSet("feature-noise", [1.0]),
510514
# Naive Bayes smoothing lambda.
511515
OptionSet("nb-lambda", [1.0]),
512516
# Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+)

mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,7 @@ abstract class GLMTests(sc: SparkContext)
7979
val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
8080
val REG_TYPE = ("reg-type", "type of regularization: none, l1, l2")
8181
val REG_PARAM = ("reg-param", "the regularization parameter against overfitting")
82-
val OPTIMIZER = ("optimizer", "optimization algorithm: sgd, lbfgs")
82+
val OPTIMIZER = ("optimizer", "optimization algorithm: sgd, l-bfgs")
8383

8484
intOptions = intOptions ++ Seq(NUM_ITERATIONS)
8585
doubleOptions = doubleOptions ++ Seq(STEP_SIZE, REG_PARAM)
@@ -89,10 +89,10 @@ abstract class GLMTests(sc: SparkContext)
8989
class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
9090

9191
val INTERCEPT = ("intercept", "intercept for random data generation")
92-
val EPS = ("epsilon", "scale factor for the noise during data generation")
92+
val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation")
9393
val LOSS = ("loss", "loss to minimize. Supported: l2 (squared error).")
9494

95-
doubleOptions = doubleOptions ++ Seq(INTERCEPT, EPS)
95+
doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE)
9696
stringOptions = stringOptions ++ Seq(LOSS)
9797

9898
val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
@@ -104,10 +104,10 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
104104
val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
105105

106106
val intercept: Double = doubleOptionValue(INTERCEPT)
107-
val eps: Double = doubleOptionValue(EPS)
107+
val labelNoise: Double = doubleOptionValue(LABEL_NOISE)
108108

109109
val data = DataGenerator.generateLabeledPoints(sc, math.ceil(numExamples * 1.25).toLong,
110-
numFeatures, intercept, eps, numPartitions, seed)
110+
numFeatures, intercept, labelNoise, numPartitions, seed)
111111

112112
val split = data.randomSplit(Array(0.8, 0.2), seed)
113113

@@ -167,10 +167,10 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
167167
class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
168168

169169
val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
170-
val SCALE = ("scale-factor", "scale factor for the noise during data generation")
170+
val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
171171
val LOSS = ("loss", "loss to minimize. Supported: logistic, hinge (SVM).")
172172

173-
doubleOptions = doubleOptions ++ Seq(THRESHOLD, SCALE)
173+
doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE)
174174
stringOptions = stringOptions ++ Seq(LOSS)
175175

176176
val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
@@ -190,10 +190,11 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
190190
val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
191191

192192
val threshold: Double = doubleOptionValue(THRESHOLD)
193-
val sf: Double = doubleOptionValue(SCALE)
193+
val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
194194

195195
val data = DataGenerator.generateClassificationLabeledPoints(sc,
196-
math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, sf, numPartitions, seed)
196+
math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
197+
seed)
197198

198199
val split = data.randomSplit(Array(0.8, 0.2), seed)
199200

@@ -220,15 +221,15 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
220221
throw new IllegalArgumentException(s"GLMClassificationTest run with unknown regType" +
221222
s" ($regType). Supported values: none, l1, l2.")
222223
}
223-
if (!Array("sgd", "lbfgs").contains(optimizer)) {
224+
if (!Array("sgd", "l-bfgs").contains(optimizer)) {
224225
throw new IllegalArgumentException(
225-
s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, lbfgs.")
226+
s"GLMRegressionTest run with unknown optimizer ($optimizer). Supported values: sgd, l-bfgs.")
226227
}
227228

228229
(loss, regType, optimizer) match {
229230
case ("logistic", "none", "sgd") =>
230231
LogisticRegressionWithSGD.train(rdd, numIterations, stepSize)
231-
case ("logistic", "none", "lbfgs") =>
232+
case ("logistic", "none", "l-bfgs") =>
232233
println("WARNING: LogisticRegressionWithLBFGS ignores numIterations, stepSize" +
233234
" in this Spark version.")
234235
new LogisticRegressionWithLBFGS().run(rdd)
@@ -375,11 +376,11 @@ class NaiveBayesTest(sc: SparkContext)
375376
extends RegressionAndClassificationTests[NaiveBayesModel](sc) {
376377

377378
val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
378-
val SCALE = ("scale-factor", "scale factor for the noise during data generation")
379+
val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
379380
val SMOOTHING = ("nb-lambda", "the smoothing parameter lambda for Naive Bayes")
380381
val MODEL_TYPE = ("model-type", "either multinomial (default) or bernoulli")
381382

382-
doubleOptions = doubleOptions ++ Seq(THRESHOLD, SCALE, SMOOTHING)
383+
doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE, SMOOTHING)
383384
stringOptions = stringOptions ++ Seq(MODEL_TYPE)
384385
val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
385386
addOptionsToParser()
@@ -391,15 +392,16 @@ class NaiveBayesTest(sc: SparkContext)
391392
val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
392393

393394
val threshold: Double = doubleOptionValue(THRESHOLD)
394-
val sf: Double = doubleOptionValue(SCALE)
395+
val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
395396
val modelType = stringOptionValue(MODEL_TYPE)
396397

397398
val data = if (modelType == "Bernoulli") {
398399
DataGenerator.generateBinaryLabeledPoints(sc,
399400
math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions, seed)
400401
} else {
401402
val negdata = DataGenerator.generateClassificationLabeledPoints(sc,
402-
math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, sf, numPartitions, seed)
403+
math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
404+
seed)
403405
val dataNonneg = negdata.map { lp =>
404406
LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(math.abs)))
405407
}

mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -19,13 +19,13 @@ object DataGenerator {
1919
numRows: Long,
2020
numCols: Int,
2121
intercept: Double,
22-
eps: Double,
22+
labelNoise: Double,
2323
numPartitions: Int,
2424
seed: Long = System.currentTimeMillis(),
2525
problem: String = ""): RDD[LabeledPoint] = {
2626

27-
RandomRDDs.randomRDD(sc,
28-
new LinearDataGenerator(numCols,intercept, seed, eps, problem), numRows, numPartitions, seed)
27+
RandomRDDs.randomRDD(sc, new LinearDataGenerator(numCols,intercept, seed, labelNoise, problem),
28+
numRows, numPartitions, seed)
2929

3030
}
3131

@@ -46,12 +46,12 @@ object DataGenerator {
4646
numRows: Long,
4747
numCols: Int,
4848
threshold: Double,
49-
scaleFactor: Double,
49+
featureNoise: Double,
5050
numPartitions: Int,
5151
seed: Long = System.currentTimeMillis(),
5252
chiSq: Boolean = false): RDD[LabeledPoint] = {
5353

54-
RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, scaleFactor, chiSq),
54+
RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, featureNoise, chiSq),
5555
numRows, numPartitions, seed)
5656
}
5757

@@ -325,15 +325,15 @@ class RatingGenerator(
325325
class ClassLabelGenerator(
326326
private val numFeatures: Int,
327327
private val threshold: Double,
328-
private val scaleFactor: Double,
328+
private val featureNoise: Double,
329329
private val chiSq: Boolean) extends RandomDataGenerator[LabeledPoint] {
330330

331331
private val rng = new java.util.Random()
332332

333333
override def nextValue(): LabeledPoint = {
334334
val y = if (rng.nextDouble() < threshold) 0.0 else 1.0
335335
val x = Array.fill[Double](numFeatures) {
336-
if (!chiSq) rng.nextGaussian() + (y * scaleFactor) else rng.nextInt(6) * 1.0
336+
if (!chiSq) rng.nextGaussian() + (y * featureNoise) else rng.nextInt(6) * 1.0
337337
}
338338

339339
LabeledPoint(y, Vectors.dense(x))
@@ -344,7 +344,7 @@ class ClassLabelGenerator(
344344
}
345345

346346
override def copy(): ClassLabelGenerator =
347-
new ClassLabelGenerator(numFeatures, threshold, scaleFactor, chiSq)
347+
new ClassLabelGenerator(numFeatures, threshold, featureNoise, chiSq)
348348
}
349349

350350
class BinaryLabeledDataGenerator(
@@ -374,7 +374,7 @@ class LinearDataGenerator(
374374
val numFeatures: Int,
375375
val intercept: Double,
376376
val seed: Long,
377-
val eps: Double,
377+
val labelNoise: Double,
378378
val problem: String = "",
379379
val sparsity: Double = 1.0) extends RandomDataGenerator[LabeledPoint] {
380380

@@ -386,7 +386,7 @@ class LinearDataGenerator(
386386
override def nextValue(): LabeledPoint = {
387387
val x = Array.fill[Double](nnz)(2*rng.nextDouble()-1)
388388

389-
val y = weights.zip(x).map(p => p._1 * p._2).sum + intercept + eps*rng.nextGaussian()
389+
val y = weights.zip(x).map(p => p._1 * p._2).sum + intercept + labelNoise*rng.nextGaussian()
390390
val yD =
391391
if (problem == "SVM"){
392392
if (y < 0.0) 0.0 else 1.0
@@ -402,7 +402,7 @@ class LinearDataGenerator(
402402
}
403403

404404
override def copy(): LinearDataGenerator =
405-
new LinearDataGenerator(numFeatures, intercept, seed, eps, problem, sparsity)
405+
new LinearDataGenerator(numFeatures, intercept, seed, labelNoise, problem, sparsity)
406406
}
407407

408408

0 commit comments

Comments
 (0)