Skip to content

Commit 383f5e9

Browse files
committed
[SPARK-32310][ML][PYSPARK] ML params default value parity in classification, regression, clustering and fpm
### What changes were proposed in this pull request? set params default values in trait ...Params in both Scala and Python. I will do this in two PRs. I will change classification, regression, clustering and fpm in this PR. Will change the rest in another PR. ### Why are the changes needed? Make ML have the same default param values between an estimator and its corresponding transformer, and also between Scala and Python. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests Closes apache#29112 from huaxingao/set_default. Authored-by: Huaxin Gao <[email protected]> Signed-off-by: Huaxin Gao <[email protected]>
1 parent d5c672a commit 383f5e9

21 files changed

+141
-157
lines changed

mllib/src/main/scala/org/apache/spark/ml/classification/FMClassifier.scala

-10
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ class FMClassifier @Since("3.0.0") (
8585
*/
8686
@Since("3.0.0")
8787
def setFactorSize(value: Int): this.type = set(factorSize, value)
88-
setDefault(factorSize -> 8)
8988

9089
/**
9190
* Set whether to fit intercept term.
@@ -95,7 +94,6 @@ class FMClassifier @Since("3.0.0") (
9594
*/
9695
@Since("3.0.0")
9796
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
98-
setDefault(fitIntercept -> true)
9997

10098
/**
10199
* Set whether to fit linear term.
@@ -105,7 +103,6 @@ class FMClassifier @Since("3.0.0") (
105103
*/
106104
@Since("3.0.0")
107105
def setFitLinear(value: Boolean): this.type = set(fitLinear, value)
108-
setDefault(fitLinear -> true)
109106

110107
/**
111108
* Set the L2 regularization parameter.
@@ -115,7 +112,6 @@ class FMClassifier @Since("3.0.0") (
115112
*/
116113
@Since("3.0.0")
117114
def setRegParam(value: Double): this.type = set(regParam, value)
118-
setDefault(regParam -> 0.0)
119115

120116
/**
121117
* Set the mini-batch fraction parameter.
@@ -125,7 +121,6 @@ class FMClassifier @Since("3.0.0") (
125121
*/
126122
@Since("3.0.0")
127123
def setMiniBatchFraction(value: Double): this.type = set(miniBatchFraction, value)
128-
setDefault(miniBatchFraction -> 1.0)
129124

130125
/**
131126
* Set the standard deviation of initial coefficients.
@@ -135,7 +130,6 @@ class FMClassifier @Since("3.0.0") (
135130
*/
136131
@Since("3.0.0")
137132
def setInitStd(value: Double): this.type = set(initStd, value)
138-
setDefault(initStd -> 0.01)
139133

140134
/**
141135
* Set the maximum number of iterations.
@@ -145,7 +139,6 @@ class FMClassifier @Since("3.0.0") (
145139
*/
146140
@Since("3.0.0")
147141
def setMaxIter(value: Int): this.type = set(maxIter, value)
148-
setDefault(maxIter -> 100)
149142

150143
/**
151144
* Set the initial step size for the first step (like learning rate).
@@ -155,7 +148,6 @@ class FMClassifier @Since("3.0.0") (
155148
*/
156149
@Since("3.0.0")
157150
def setStepSize(value: Double): this.type = set(stepSize, value)
158-
setDefault(stepSize -> 1.0)
159151

160152
/**
161153
* Set the convergence tolerance of iterations.
@@ -165,7 +157,6 @@ class FMClassifier @Since("3.0.0") (
165157
*/
166158
@Since("3.0.0")
167159
def setTol(value: Double): this.type = set(tol, value)
168-
setDefault(tol -> 1E-6)
169160

170161
/**
171162
* Set the solver algorithm used for optimization.
@@ -176,7 +167,6 @@ class FMClassifier @Since("3.0.0") (
176167
*/
177168
@Since("3.0.0")
178169
def setSolver(value: String): this.type = set(solver, value)
179-
setDefault(solver -> AdamW)
180170

181171
/**
182172
* Set the random seed for weight initialization.

mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala

+3-9
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR
5555
*/
5656
final override val threshold: DoubleParam = new DoubleParam(this, "threshold",
5757
"threshold in binary classification prediction applied to rawPrediction")
58+
59+
setDefault(regParam -> 0.0, maxIter -> 100, fitIntercept -> true, tol -> 1E-6,
60+
standardization -> true, threshold -> 0.0, aggregationDepth -> 2, blockSize -> 1)
5861
}
5962

6063
/**
@@ -82,7 +85,6 @@ class LinearSVC @Since("2.2.0") (
8285
*/
8386
@Since("2.2.0")
8487
def setRegParam(value: Double): this.type = set(regParam, value)
85-
setDefault(regParam -> 0.0)
8688

8789
/**
8890
* Set the maximum number of iterations.
@@ -92,7 +94,6 @@ class LinearSVC @Since("2.2.0") (
9294
*/
9395
@Since("2.2.0")
9496
def setMaxIter(value: Int): this.type = set(maxIter, value)
95-
setDefault(maxIter -> 100)
9697

9798
/**
9899
* Whether to fit an intercept term.
@@ -102,7 +103,6 @@ class LinearSVC @Since("2.2.0") (
102103
*/
103104
@Since("2.2.0")
104105
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
105-
setDefault(fitIntercept -> true)
106106

107107
/**
108108
* Set the convergence tolerance of iterations.
@@ -113,7 +113,6 @@ class LinearSVC @Since("2.2.0") (
113113
*/
114114
@Since("2.2.0")
115115
def setTol(value: Double): this.type = set(tol, value)
116-
setDefault(tol -> 1E-6)
117116

118117
/**
119118
* Whether to standardize the training features before fitting the model.
@@ -123,7 +122,6 @@ class LinearSVC @Since("2.2.0") (
123122
*/
124123
@Since("2.2.0")
125124
def setStandardization(value: Boolean): this.type = set(standardization, value)
126-
setDefault(standardization -> true)
127125

128126
/**
129127
* Set the value of param [[weightCol]].
@@ -142,7 +140,6 @@ class LinearSVC @Since("2.2.0") (
142140
*/
143141
@Since("2.2.0")
144142
def setThreshold(value: Double): this.type = set(threshold, value)
145-
setDefault(threshold -> 0.0)
146143

147144
/**
148145
* Suggested depth for treeAggregate (greater than or equal to 2).
@@ -154,7 +151,6 @@ class LinearSVC @Since("2.2.0") (
154151
*/
155152
@Since("2.2.0")
156153
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
157-
setDefault(aggregationDepth -> 2)
158154

159155
/**
160156
* Set block size for stacking input data in matrices.
@@ -173,7 +169,6 @@ class LinearSVC @Since("2.2.0") (
173169
*/
174170
@Since("3.1.0")
175171
def setBlockSize(value: Int): this.type = set(blockSize, value)
176-
setDefault(blockSize -> 1)
177172

178173
@Since("2.2.0")
179174
override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
@@ -381,7 +376,6 @@ class LinearSVCModel private[classification] (
381376

382377
@Since("2.2.0")
383378
def setThreshold(value: Double): this.type = set(threshold, value)
384-
setDefault(threshold, 0.0)
385379

386380
private val margin: Vector => Double = (features) => {
387381
BLAS.dot(features, coefficients) + intercept

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

+4-10
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
243243
@Since("2.2.0")
244244
def getUpperBoundsOnIntercepts: Vector = $(upperBoundsOnIntercepts)
245245

246+
setDefault(regParam -> 0.0, elasticNetParam -> 0.0, maxIter -> 100, tol -> 1E-6,
247+
fitIntercept -> true, family -> "auto", standardization -> true, threshold -> 0.5,
248+
aggregationDepth -> 2, blockSize -> 1)
249+
246250
protected def usingBoundConstrainedOptimization: Boolean = {
247251
isSet(lowerBoundsOnCoefficients) || isSet(upperBoundsOnCoefficients) ||
248252
isSet(lowerBoundsOnIntercepts) || isSet(upperBoundsOnIntercepts)
@@ -290,7 +294,6 @@ class LogisticRegression @Since("1.2.0") (
290294
*/
291295
@Since("1.2.0")
292296
def setRegParam(value: Double): this.type = set(regParam, value)
293-
setDefault(regParam -> 0.0)
294297

295298
/**
296299
* Set the ElasticNet mixing parameter.
@@ -306,7 +309,6 @@ class LogisticRegression @Since("1.2.0") (
306309
*/
307310
@Since("1.4.0")
308311
def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
309-
setDefault(elasticNetParam -> 0.0)
310312

311313
/**
312314
* Set the maximum number of iterations.
@@ -316,7 +318,6 @@ class LogisticRegression @Since("1.2.0") (
316318
*/
317319
@Since("1.2.0")
318320
def setMaxIter(value: Int): this.type = set(maxIter, value)
319-
setDefault(maxIter -> 100)
320321

321322
/**
322323
* Set the convergence tolerance of iterations.
@@ -327,7 +328,6 @@ class LogisticRegression @Since("1.2.0") (
327328
*/
328329
@Since("1.4.0")
329330
def setTol(value: Double): this.type = set(tol, value)
330-
setDefault(tol -> 1E-6)
331331

332332
/**
333333
* Whether to fit an intercept term.
@@ -337,7 +337,6 @@ class LogisticRegression @Since("1.2.0") (
337337
*/
338338
@Since("1.4.0")
339339
def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
340-
setDefault(fitIntercept -> true)
341340

342341
/**
343342
* Sets the value of param [[family]].
@@ -347,7 +346,6 @@ class LogisticRegression @Since("1.2.0") (
347346
*/
348347
@Since("2.1.0")
349348
def setFamily(value: String): this.type = set(family, value)
350-
setDefault(family -> "auto")
351349

352350
/**
353351
* Whether to standardize the training features before fitting the model.
@@ -361,11 +359,9 @@ class LogisticRegression @Since("1.2.0") (
361359
*/
362360
@Since("1.5.0")
363361
def setStandardization(value: Boolean): this.type = set(standardization, value)
364-
setDefault(standardization -> true)
365362

366363
@Since("1.5.0")
367364
override def setThreshold(value: Double): this.type = super.setThreshold(value)
368-
setDefault(threshold -> 0.5)
369365

370366
@Since("1.5.0")
371367
override def getThreshold: Double = super.getThreshold
@@ -396,7 +392,6 @@ class LogisticRegression @Since("1.2.0") (
396392
*/
397393
@Since("2.1.0")
398394
def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)
399-
setDefault(aggregationDepth -> 2)
400395

401396
/**
402397
* Set the lower bounds on coefficients if fitting under bound constrained optimization.
@@ -447,7 +442,6 @@ class LogisticRegression @Since("1.2.0") (
447442
*/
448443
@Since("3.1.0")
449444
def setBlockSize(value: Int): this.type = set(blockSize, value)
450-
setDefault(blockSize -> 1)
451445

452446
private def assertBoundConstrainedOptimizationParamsValid(
453447
numCoefficientSets: Int,

mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ private[classification] trait NaiveBayesParams extends PredictorParams with HasW
6464

6565
/** @group getParam */
6666
final def getModelType: String = $(modelType)
67+
68+
setDefault(smoothing -> 1.0, modelType -> NaiveBayes.Multinomial)
6769
}
6870

6971
// scalastyle:off line.size.limit
@@ -107,7 +109,6 @@ class NaiveBayes @Since("1.5.0") (
107109
*/
108110
@Since("1.5.0")
109111
def setSmoothing(value: Double): this.type = set(smoothing, value)
110-
setDefault(smoothing -> 1.0)
111112

112113
/**
113114
* Set the model type using a string (case-sensitive).
@@ -117,7 +118,6 @@ class NaiveBayes @Since("1.5.0") (
117118
*/
118119
@Since("1.5.0")
119120
def setModelType(value: String): this.type = set(modelType, value)
120-
setDefault(modelType -> Multinomial)
121121

122122
/**
123123
* Sets the value of param [[weightCol]].

mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala

+2-5
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ private[clustering] trait BisectingKMeansParams extends Params with HasMaxIter
7272
@Since("2.0.0")
7373
def getMinDivisibleClusterSize: Double = $(minDivisibleClusterSize)
7474

75+
setDefault(k -> 4, maxIter -> 20, minDivisibleClusterSize -> 1.0)
76+
7577
/**
7678
* Validates and transforms the input schema.
7779
* @param schema input schema
@@ -226,11 +228,6 @@ class BisectingKMeans @Since("2.0.0") (
226228
@Since("2.0.0") override val uid: String)
227229
extends Estimator[BisectingKMeansModel] with BisectingKMeansParams with DefaultParamsWritable {
228230

229-
setDefault(
230-
k -> 4,
231-
maxIter -> 20,
232-
minDivisibleClusterSize -> 1.0)
233-
234231
@Since("2.0.0")
235232
override def copy(extra: ParamMap): BisectingKMeans = defaultCopy(extra)
236233

mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala

+2-6
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ private[clustering] trait GaussianMixtureParams extends Params with HasMaxIter w
5959
@Since("2.0.0")
6060
def getK: Int = $(k)
6161

62+
setDefault(k -> 2, maxIter -> 100, tol -> 0.01, blockSize -> 1)
63+
6264
/**
6365
* Validates and transforms the input schema.
6466
*
@@ -328,11 +330,6 @@ class GaussianMixture @Since("2.0.0") (
328330
@Since("2.0.0") override val uid: String)
329331
extends Estimator[GaussianMixtureModel] with GaussianMixtureParams with DefaultParamsWritable {
330332

331-
setDefault(
332-
k -> 2,
333-
maxIter -> 100,
334-
tol -> 0.01)
335-
336333
@Since("2.0.0")
337334
override def copy(extra: ParamMap): GaussianMixture = defaultCopy(extra)
338335

@@ -392,7 +389,6 @@ class GaussianMixture @Since("2.0.0") (
392389
*/
393390
@Since("3.1.0")
394391
def setBlockSize(value: Int): this.type = set(blockSize, value)
395-
setDefault(blockSize -> 1)
396392

397393
/**
398394
* Number of samples per cluster to use when initializing Gaussians.

mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala

+3-8
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
8787
@Since("1.5.0")
8888
def getInitSteps: Int = $(initSteps)
8989

90+
setDefault(k -> 2, maxIter -> 20, initMode -> MLlibKMeans.K_MEANS_PARALLEL, initSteps -> 2,
91+
tol -> 1e-4, distanceMeasure -> DistanceMeasure.EUCLIDEAN)
92+
9093
/**
9194
* Validates and transforms the input schema.
9295
* @param schema input schema
@@ -271,14 +274,6 @@ class KMeans @Since("1.5.0") (
271274
@Since("1.5.0") override val uid: String)
272275
extends Estimator[KMeansModel] with KMeansParams with DefaultParamsWritable {
273276

274-
setDefault(
275-
k -> 2,
276-
maxIter -> 20,
277-
initMode -> MLlibKMeans.K_MEANS_PARALLEL,
278-
initSteps -> 2,
279-
tol -> 1e-4,
280-
distanceMeasure -> DistanceMeasure.EUCLIDEAN)
281-
282277
@Since("1.5.0")
283278
override def copy(extra: ParamMap): KMeans = defaultCopy(extra)
284279

mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala

+5-6
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,6 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
199199
" with estimates of the topic mixture distribution for each document (often called \"theta\"" +
200200
" in the literature). Returns a vector of zeros for an empty document.")
201201

202-
setDefault(topicDistributionCol -> "topicDistribution")
203-
204202
/** @group getParam */
205203
@Since("1.6.0")
206204
def getTopicDistributionCol: String = $(topicDistributionCol)
@@ -315,6 +313,11 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
315313
@Since("2.0.0")
316314
def getKeepLastCheckpoint: Boolean = $(keepLastCheckpoint)
317315

316+
setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
317+
learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
318+
optimizeDocConcentration -> true, keepLastCheckpoint -> true,
319+
topicDistributionCol -> "topicDistribution")
320+
318321
/**
319322
* Validates and transforms the input schema.
320323
*
@@ -863,10 +866,6 @@ class LDA @Since("1.6.0") (
863866
@Since("1.6.0")
864867
def this() = this(Identifiable.randomUID("lda"))
865868

866-
setDefault(maxIter -> 20, k -> 10, optimizer -> "online", checkpointInterval -> 10,
867-
learningOffset -> 1024, learningDecay -> 0.51, subsamplingRate -> 0.05,
868-
optimizeDocConcentration -> true, keepLastCheckpoint -> true)
869-
870869
/**
871870
* The features for LDA should be a `Vector` representing the word counts in a document.
872871
* The vector should be of length vocabSize, with counts for each term (word).

0 commit comments

Comments (0)