
Commit 7e52e82

init commit
0 parents · commit 7e52e82

18 files changed · +752 -0 lines changed

Diff for: .gitignore

+3
```
.vscode/
*.zip
data/
```

Diff for: README.md

+63
# PySpark implementation of SVD++ for Top-N Recommendation

![pyspark-flow](img/pyspark-flow.png)

## Getting Started

### Prerequisites

You need to install *Apache Hadoop* and *Apache Spark* on every node of the cluster.

#### Install Hadoop

```bash
tar zxvf hadoop-3.y.z.tgz
ln -s /your/hadoop/path/hadoop-3.y.z /your/hadoop/path/hadoop
```

#### Install Spark

```bash
tar zxvf spark-2.y.z-bin-hadoop2.7.tgz
ln -s /your/spark/path/spark-2.y.z /your/spark/path/spark
```
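
The install is typically finished by pointing the shell at the symlinks created above. A minimal sketch, assuming a bash shell and the placeholder paths from the snippets (adjust on each node):

```bash
# Assumed environment setup; paths follow the symlinks created above.
export HADOOP_HOME=/your/hadoop/path/hadoop
export SPARK_HOME=/your/spark/path/spark
export PATH="$PATH:$HADOOP_HOME/bin:$SPARK_HOME/bin"
```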

### Installing

#### Clone the repository

```bash
git clone git@github.com:citomhuang/spark_svdpp.git
```

#### Create the Python environment

```bash
cd spark_svdpp
conda env create -f conda.yaml
conda activate spark-svdpp-env
```

#### Run the tests

```bash
pytest spark_svdpp/tests
```

## Run an example

```bash
./yarn-client.sh
```
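
`yarn-client.sh` is part of the repository but not shown in this diff excerpt. A minimal sketch of what such a wrapper around `spark-submit` could look like — all options, paths, and the partition count here are assumptions, not the actual script:

```bash
#!/usr/bin/env bash
# Hypothetical wrapper around spark-submit in YARN client mode;
# the real yarn-client.sh may differ.
spark-submit \
  --master yarn \
  --deploy-mode client \
  --py-files spark_svdpp.zip \
  examples/svdpp_example.py 200  # 200 = example partition count
```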

## References

1. [Factorization Meets the Neighborhood: A Multifaceted Collaborative Filtering Model. Yehuda Koren, KDD’08](https://www.cs.rochester.edu/twiki/pub/Main/HarpSeminar/Factorization_Meets_the_Neighborhood-_a_Multifaceted_Collaborative_Filtering_Model.pdf)
2. [Spark: Cluster Computing with Working Sets](https://www.usenix.org/legacy/event/hotcloud10/tech/full_papers/Zaharia.pdf)
3. [Scaling Collaborative Filtering with PySpark](https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html)
4. [Running Spark on YARN](https://spark.apache.org/docs/latest/running-on-yarn.html)
5. [NicolasHug/Surprise](https://github.com/NicolasHug/Surprise)

Diff for: clean.sh

+8
```bash
#!/usr/bin/env bash

rm -f spark_svdpp.zip
rm -rf spark-warehouse/
rm -rf .pytest_cache/
rm -rf dist/
rm -rf build/
rm -rf spark_svdpp.egg-info/
```
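
`clean.sh` removes `spark_svdpp.zip`, which suggests the package is zipped up so `spark-submit` can ship it to executors with `--py-files`. That build step is not visible in this diff; a minimal sketch of how it might look:

```bash
# Hypothetical build step: bundle the package for --py-files,
# excluding bytecode caches.
zip -r spark_svdpp.zip spark_svdpp -x '*__pycache__*' -x '*.pyc'
```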

Diff for: conda.yaml

+18
```yaml
name: spark-svdpp-env
channels:
  - defaults
  - anaconda
dependencies:
  - python=3.7.5
  - pylint=2.4.4
  - flake8=3.7.9
  - h5py=2.9.0
  - ipython=7.9.0
  - numpy=1.17.3
  - pandas=0.25.3
  - pip=19.3.1
  - scipy=1.3.1
  - pyarrow=0.13.0
  - pytest=5.3.0
  - psutil=5.6.5
  - prompt_toolkit=2.0.10
```

Diff for: examples/svdpp_example.py

+13
```python
import sys
from pyspark.sql import SparkSession
from spark_svdpp.algos.svdpp import run


# The single CLI argument sets the number of partitions for the job.
n_pars = int(sys.argv[1])
spark = SparkSession.builder.getOrCreate()
output_data_path = 'hdfs:///svdpp/output.parquet'
input_data_path = 'hdfs:///svdpp/dataset_train.parquet'

# Run SVD++ over the training set and write the results to the output path.
run(spark=spark, n_pars=n_pars,
    input_data_path=input_data_path,
    output_data_path=output_data_path)
```
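
The example reads its training set from a fixed HDFS location. A hypothetical sketch of staging the data beforehand — the local parquet path under the gitignored `data/` directory is an assumption:

```bash
# Assumed data staging; the local filename is illustrative only.
hdfs dfs -mkdir -p /svdpp
hdfs dfs -put data/dataset_train.parquet /svdpp/dataset_train.parquet
```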

Diff for: img/pyspark-flow.png

53.7 KB

Diff for: setup.py

+16
```python
from setuptools import setup, find_packages
import os


here = os.path.abspath(os.path.dirname(__file__))

with open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='spark-svdpp',
    version='0.1.0',
    long_description=long_description,
    long_description_content_type='text/markdown',
    packages=find_packages()
)
```
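
With this `setup.py`, the package can also be installed into the active conda environment in editable mode for local development; this is standard setuptools/pip usage rather than a step documented by this repository:

```bash
# Editable install: changes under spark_svdpp/ are picked up immediately.
pip install -e .
```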

Diff for: spark_svdpp/__init__.py

Whitespace-only changes.

Diff for: spark_svdpp/algos/__init__.py

Whitespace-only changes.
