copy PIC test to 1.3 and 1.4

mengxr · mengxr · commit 79f8cfa6494e · 2015-08-31T09:50:44.000-07:00
Copy #86 changes to 1.3 and 1.4. feynmanliang Author: Xiangrui Meng <meng@databricks.com> Closes #87 from mengxr/copy-pic-test and squashes the following commits: fa4e004 [Xiangrui Meng] update test runner 099ae1d [Xiangrui Meng] copy PIC test to 1.3 and 1.4
diff --git a/mllib-tests/v1p3/src/main/scala/mllib/perf/TestRunner.scala b/mllib-tests/v1p3/src/main/scala/mllib/perf/TestRunner.scala
@@ -2,14 +2,13 @@ package mllib.perf
 
 import scala.collection.JavaConverters._
 
-import org.json4s.JsonDSL._
 import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
-import org.apache.spark.SparkConf
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkConf, SparkContext}
 
-import mllib.perf.clustering.GaussianMixtureTest
+import mllib.perf.clustering.{GaussianMixtureTest, PICTest}
 import mllib.perf.feature.Word2VecTest
 import mllib.perf.fpm.FPGrowthTest
 import mllib.perf.linalg.BlockMatrixMultTest
@@ -34,6 +33,7 @@ object TestRunner {
         case "als" => new ALSTest(sc)
         // clustering
         case "kmeans" => new KMeansTest(sc)
+        case "pic" => new PICTest(sc)
         // trees
         case "decision-tree" => new DecisionTreeTest(sc)
         // linalg
diff --git a/mllib-tests/v1p3/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p3/src/main/scala/mllib/perf/clustering/PICTest.scala
@@ -0,0 +1,53 @@
+package mllib.perf.clustering
+
+import org.json4s.JValue
+import org.json4s.JsonDSL._
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.rdd.RDD
+
+import mllib.perf.PerfTest
+
+/** Perf test that times PowerIterationClustering on a generated banded similarity graph. */
+class PICTest(sc: SparkContext) extends PerfTest {
+  val NUM_POINTS = ("num-points", "number of points")
+  val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to")
+  val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
+  val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
+
+  intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS)
+  longOptions ++= Seq(NUM_POINTS)
+  val options = intOptions ++ stringOptions  ++ booleanOptions ++ longOptions ++ doubleOptions
+  addOptionsToParser()
+
+  /** (id, neighbor id, similarity) triples; set by createInputData and consumed by run. */
+  var data: RDD[(Long, Long, Double)] = _
+
+  /** Generates the similarities and stores them in the `data` field; `seed` is not used. */
+  override def createInputData(seed: Long): Unit = {
+    val numPoints = longOptionValue(NUM_POINTS)
+    val nodeDegree = intOptionValue(NODE_DEGREE)
+    val numPartitions = intOptionValue(NUM_PARTITIONS)
+    // Periodic banded matrix with bandwidth = nodeDegree. Assign the field rather than a
+    // shadowing local; otherwise run() would pass a null RDD to PowerIterationClustering.
+    data = sc.parallelize(0L to numPoints, numPartitions)
+      .flatMap { id =>
+        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
+          (id, (nbr + numPoints) % numPoints, 1D)
+        }
+      }
+    logInfo(s"Generated ${data.count()} pairwise similarities.")
+  }
+
+  /** Runs PIC on `data` and reports the elapsed wall-clock time in seconds under "time". */
+  override def run(): JValue = {
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val k = intOptionValue(NUM_CENTERS)
+    val start = System.currentTimeMillis()
+    new PowerIterationClustering().setK(k).setMaxIterations(numIterations).run(data)
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
+
diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/TestRunner.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/TestRunner.scala
@@ -2,14 +2,13 @@ package mllib.perf
 
 import scala.collection.JavaConverters._
 
-import org.json4s.JsonDSL._
 import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
-import org.apache.spark.SparkConf
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkConf, SparkContext}
 
-import mllib.perf.clustering.{EMLDATest, OnlineLDATest, GaussianMixtureTest}
+import mllib.perf.clustering.{EMLDATest, GaussianMixtureTest, OnlineLDATest, PICTest}
 import mllib.perf.feature.Word2VecTest
 import mllib.perf.fpm.FPGrowthTest
 import mllib.perf.linalg.BlockMatrixMultTest
@@ -37,6 +36,7 @@ object TestRunner {
         case "gmm" => new GaussianMixtureTest(sc)
         case "emlda" => new EMLDATest(sc)
         case "onlinelda" => new OnlineLDATest(sc)
+        case "pic" => new PICTest(sc)
         // trees
         case "decision-tree" => new DecisionTreeTest(sc)
         // linalg
diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala
@@ -0,0 +1,53 @@
+package mllib.perf.clustering
+
+import org.json4s.JValue
+import org.json4s.JsonDSL._
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.clustering.PowerIterationClustering
+import org.apache.spark.rdd.RDD
+
+import mllib.perf.PerfTest
+
+/** Perf test that times PowerIterationClustering on a generated banded similarity graph. */
+class PICTest(sc: SparkContext) extends PerfTest {
+  val NUM_POINTS = ("num-points", "number of points")
+  val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to")
+  val NUM_CENTERS = ("num-centers", "number of centers for clustering tests")
+  val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm")
+
+  intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS)
+  longOptions ++= Seq(NUM_POINTS)
+  val options = intOptions ++ stringOptions  ++ booleanOptions ++ longOptions ++ doubleOptions
+  addOptionsToParser()
+
+  /** (id, neighbor id, similarity) triples; set by createInputData and consumed by run. */
+  var data: RDD[(Long, Long, Double)] = _
+
+  /** Generates the similarities and stores them in the `data` field; `seed` is not used. */
+  override def createInputData(seed: Long): Unit = {
+    val numPoints = longOptionValue(NUM_POINTS)
+    val nodeDegree = intOptionValue(NODE_DEGREE)
+    val numPartitions = intOptionValue(NUM_PARTITIONS)
+    // Periodic banded matrix with bandwidth = nodeDegree. Assign the field rather than a
+    // shadowing local; otherwise run() would pass a null RDD to PowerIterationClustering.
+    data = sc.parallelize(0L to numPoints, numPartitions)
+      .flatMap { id =>
+        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
+          (id, (nbr + numPoints) % numPoints, 1D)
+        }
+      }
+    logInfo(s"Generated ${data.count()} pairwise similarities.")
+  }
+
+  /** Runs PIC on `data` and reports the elapsed wall-clock time in seconds under "time". */
+  override def run(): JValue = {
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val k = intOptionValue(NUM_CENTERS)
+    val start = System.currentTimeMillis()
+    new PowerIterationClustering().setK(k).setMaxIterations(numIterations).run(data)
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
+
diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/TestRunner.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/TestRunner.scala
@@ -2,12 +2,11 @@ package mllib.perf
 
 import scala.collection.JavaConverters._
 
-import org.json4s.JsonDSL._
 import org.json4s.JsonAST._
+import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
 
-import org.apache.spark.SparkConf
-import org.apache.spark.SparkContext
+import org.apache.spark.{SparkConf, SparkContext}
 
 import mllib.perf.clustering.{EMLDATest, GaussianMixtureTest, OnlineLDATest, PICTest}
 import mllib.perf.feature.Word2VecTest