Skip to content

Commit 79f8cfa

Browse files
committed
copy PIC test to 1.3 and 1.4
Copy #86 changes to 1.3 and 1.4. feynmanliang Author: Xiangrui Meng <[email protected]> Closes #87 from mengxr/copy-pic-test and squashes the following commits: fa4e004 [Xiangrui Meng] update test runner 099ae1d [Xiangrui Meng] copy PIC test to 1.3 and 1.4
1 parent 9b1c586 commit 79f8cfa

File tree

5 files changed

+116
-11
lines changed

5 files changed

+116
-11
lines changed

mllib-tests/v1p3/src/main/scala/mllib/perf/TestRunner.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@ package mllib.perf
22

33
import scala.collection.JavaConverters._
44

5-
import org.json4s.JsonDSL._
65
import org.json4s.JsonAST._
6+
import org.json4s.JsonDSL._
77
import org.json4s.jackson.JsonMethods._
88

9-
import org.apache.spark.SparkConf
10-
import org.apache.spark.SparkContext
9+
import org.apache.spark.{SparkConf, SparkContext}
1110

12-
import mllib.perf.clustering.GaussianMixtureTest
11+
import mllib.perf.clustering.{GaussianMixtureTest, PICTest}
1312
import mllib.perf.feature.Word2VecTest
1413
import mllib.perf.fpm.FPGrowthTest
1514
import mllib.perf.linalg.BlockMatrixMultTest
@@ -34,6 +33,7 @@ object TestRunner {
3433
case "als" => new ALSTest(sc)
3534
// clustering
3635
case "kmeans" => new KMeansTest(sc)
36+
case "pic" => new PICTest(sc)
3737
// trees
3838
case "decision-tree" => new DecisionTreeTest(sc)
3939
// linalg
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package mllib.perf.clustering
2+
3+
import org.json4s.JValue
4+
import org.json4s.JsonDSL._
5+
6+
import org.apache.spark.SparkContext
7+
import org.apache.spark.mllib.clustering.PowerIterationClustering
8+
import org.apache.spark.rdd.RDD
9+
10+
import mllib.perf.PerfTest
11+
12+
/**
 * Performance test for MLlib's `PowerIterationClustering`.
 *
 * The input is a synthetic set of pairwise similarities `(srcId, dstId, weight)` laid out as a
 * periodic banded matrix; every generated similarity has weight 1.0.
 *
 * Option registration (`intOptions`, `longOptions`, `addOptionsToParser`) and option lookup
 * (`intOptionValue`, `longOptionValue`, `NUM_PARTITIONS`) are inherited from `PerfTest`, which is
 * not shown here -- presumably they back the command-line parser; verify against `PerfTest`.
 *
 * @param sc Spark context used to build the input RDD
 */
class PICTest(sc: SparkContext) extends PerfTest {

  // (option name, help text) pairs for the options specific to this test.
  val NUM_POINTS = ("num-points", "number of points")
  val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to")
  val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
  val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")

  intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS)
  longOptions ++= Seq(NUM_POINTS)
  val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
  addOptionsToParser()

  /** Pairwise similarities consumed by `run()`; populated by `createInputData`. */
  var data: RDD[(Long, Long, Double)] = _

  /**
   * Generates the similarity triples and stores them in the `data` field.
   *
   * @param seed unused: the generated structure is deterministic
   */
  override def createInputData(seed: Long): Unit = {
    val numPoints = longOptionValue(NUM_POINTS)
    val nodeDegree = intOptionValue(NODE_DEGREE)
    val numPartitions = intOptionValue(NUM_PARTITIONS)

    // Generates a periodic banded matrix with bandwidth = nodeDegree.
    // Assign to the class field: declaring a local `val data` here would shadow the field and
    // leave it null, so run() would hand a null RDD to PowerIterationClustering.
    // NOTE(review): `0L to numPoints` is inclusive, so numPoints + 1 ids are emitted; kept as-is
    // so the workload stays comparable with other copies of this test -- confirm intent.
    data = sc.parallelize(0L to numPoints, numPartitions)
      .flatMap { id =>
        // The lower bound may be negative for small ids; the `+ numPoints) % numPoints` below
        // wraps such neighbors around to the end of the id range.
        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
          (id, (nbr + numPoints) % numPoints, 1D)
        }
      }
    logInfo(s"Generated ${data.count()} pairwise similarities.")
  }

  /**
   * Runs power iteration clustering on `data` and reports the wall-clock time.
   *
   * @return a JSON object with a single field, `time`, holding the elapsed seconds
   */
  override def run(): JValue = {
    val numIterations = intOptionValue(NUM_ITERATIONS)
    val k = intOptionValue(NUM_CENTERS)
    val start = System.currentTimeMillis()
    val pic = new PowerIterationClustering()
      .setK(k)
      .setMaxIterations(numIterations)
    // Only the elapsed time is reported; the resulting model is not used.
    pic.run(data)
    val duration = (System.currentTimeMillis() - start) / 1e3
    "time" -> duration
  }
}
53+

mllib-tests/v1p4/src/main/scala/mllib/perf/TestRunner.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@ package mllib.perf
22

33
import scala.collection.JavaConverters._
44

5-
import org.json4s.JsonDSL._
65
import org.json4s.JsonAST._
6+
import org.json4s.JsonDSL._
77
import org.json4s.jackson.JsonMethods._
88

9-
import org.apache.spark.SparkConf
10-
import org.apache.spark.SparkContext
9+
import org.apache.spark.{SparkConf, SparkContext}
1110

12-
import mllib.perf.clustering.{EMLDATest, OnlineLDATest, GaussianMixtureTest}
11+
import mllib.perf.clustering.{EMLDATest, GaussianMixtureTest, OnlineLDATest, PICTest}
1312
import mllib.perf.feature.Word2VecTest
1413
import mllib.perf.fpm.FPGrowthTest
1514
import mllib.perf.linalg.BlockMatrixMultTest
@@ -37,6 +36,7 @@ object TestRunner {
3736
case "gmm" => new GaussianMixtureTest(sc)
3837
case "emlda" => new EMLDATest(sc)
3938
case "onlinelda" => new OnlineLDATest(sc)
39+
case "pic" => new PICTest(sc)
4040
// trees
4141
case "decision-tree" => new DecisionTreeTest(sc)
4242
// linalg
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package mllib.perf.clustering
2+
3+
import org.json4s.JValue
4+
import org.json4s.JsonDSL._
5+
6+
import org.apache.spark.SparkContext
7+
import org.apache.spark.mllib.clustering.PowerIterationClustering
8+
import org.apache.spark.rdd.RDD
9+
10+
import mllib.perf.PerfTest
11+
12+
/**
 * Performance test for MLlib's `PowerIterationClustering`.
 *
 * The input is a synthetic set of pairwise similarities `(srcId, dstId, weight)` laid out as a
 * periodic banded matrix; every generated similarity has weight 1.0.
 *
 * Option registration (`intOptions`, `longOptions`, `addOptionsToParser`) and option lookup
 * (`intOptionValue`, `longOptionValue`, `NUM_PARTITIONS`) are inherited from `PerfTest`, which is
 * not shown here -- presumably they back the command-line parser; verify against `PerfTest`.
 *
 * @param sc Spark context used to build the input RDD
 */
class PICTest(sc: SparkContext) extends PerfTest {

  // (option name, help text) pairs for the options specific to this test.
  val NUM_POINTS = ("num-points", "number of points")
  val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to")
  val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
  val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")

  intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS)
  longOptions ++= Seq(NUM_POINTS)
  val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions
  addOptionsToParser()

  /** Pairwise similarities consumed by `run()`; populated by `createInputData`. */
  var data: RDD[(Long, Long, Double)] = _

  /**
   * Generates the similarity triples and stores them in the `data` field.
   *
   * @param seed unused: the generated structure is deterministic
   */
  override def createInputData(seed: Long): Unit = {
    val numPoints = longOptionValue(NUM_POINTS)
    val nodeDegree = intOptionValue(NODE_DEGREE)
    val numPartitions = intOptionValue(NUM_PARTITIONS)

    // Generates a periodic banded matrix with bandwidth = nodeDegree.
    // Assign to the class field: declaring a local `val data` here would shadow the field and
    // leave it null, so run() would hand a null RDD to PowerIterationClustering.
    // NOTE(review): `0L to numPoints` is inclusive, so numPoints + 1 ids are emitted; kept as-is
    // so the workload stays comparable with other copies of this test -- confirm intent.
    data = sc.parallelize(0L to numPoints, numPartitions)
      .flatMap { id =>
        // The lower bound may be negative for small ids; the `+ numPoints) % numPoints` below
        // wraps such neighbors around to the end of the id range.
        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
          (id, (nbr + numPoints) % numPoints, 1D)
        }
      }
    logInfo(s"Generated ${data.count()} pairwise similarities.")
  }

  /**
   * Runs power iteration clustering on `data` and reports the wall-clock time.
   *
   * @return a JSON object with a single field, `time`, holding the elapsed seconds
   */
  override def run(): JValue = {
    val numIterations = intOptionValue(NUM_ITERATIONS)
    val k = intOptionValue(NUM_CENTERS)
    val start = System.currentTimeMillis()
    val pic = new PowerIterationClustering()
      .setK(k)
      .setMaxIterations(numIterations)
    // Only the elapsed time is reported; the resulting model is not used.
    pic.run(data)
    val duration = (System.currentTimeMillis() - start) / 1e3
    "time" -> duration
  }
}
53+

mllib-tests/v1p5/src/main/scala/mllib/perf/TestRunner.scala

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,11 @@ package mllib.perf
22

33
import scala.collection.JavaConverters._
44

5-
import org.json4s.JsonDSL._
65
import org.json4s.JsonAST._
6+
import org.json4s.JsonDSL._
77
import org.json4s.jackson.JsonMethods._
88

9-
import org.apache.spark.SparkConf
10-
import org.apache.spark.SparkContext
9+
import org.apache.spark.{SparkConf, SparkContext}
1110

1211
import mllib.perf.clustering.{EMLDATest, GaussianMixtureTest, OnlineLDATest, PICTest}
1312
import mllib.perf.feature.Word2VecTest

0 commit comments

Comments
 (0)