From 0fb11167547eb109064f2c687554a3709bc65312 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Wed, 17 Jan 2018 16:38:24 +0800 Subject: [PATCH 01/16] Add toree installation script for Scala&PySpark kernel. --- toree_install.sh | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100755 toree_install.sh diff --git a/toree_install.sh b/toree_install.sh new file mode 100755 index 0000000..f5eb80c --- /dev/null +++ b/toree_install.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Find the path of BigDL and Spark +export BIGDL_PIP_HOME=`pip show BigDL | sed -n -e '/^Location/p' | sed 's/[^ ]* //'` +export BIGDL_HOME=${BIGDL_PIP_HOME}/bigdl/share +export PYSPARK_PIP_HOME=`pip show pyspark | sed -n -e '/^Location/p' | sed 's/[^ ]* //'` +export SPARK_HOME=`python ${PYSPARK_PIP_HOME}/pyspark/find_spark_home.py` +# export SPARK_HOME=${SPARK_PATH}/bin + +# Check installation of BigDL +if [ -z "${BIGDL_HOME}" ]; then + echo "Please install BigDL correctly!" + exit 1 +fi + +# Check installation of Spark +if [ -z "${SPARK_HOME}" ]; then + echo "Please set install Spark correctly!" + exit 1 +fi + +# Set paths +export BIGDL_JAR_NAME=`ls ${BIGDL_HOME}/lib/ | grep jar-with-dependencies.jar` +export BIGDL_JAR=${BIGDL_HOME}/lib/${BIGDL_JAR_NAME} +export BIGDL_PY_ZIP_NAME=`ls ${BIGDL_HOME}/lib/ | grep python-api.zip` +export BIGDL_PY_ZIP=${BIGDL_HOME}/lib/${BIGDL_PY_ZIP_NAME} +export BIGDL_CONF=${BIGDL_HOME}/conf/spark-bigdl.conf +echo ${BIGDL_JAR} +echo ${BIGDL_PY_ZIP} +echo ${BIGDL_CONF} +# Check files +if [ ! -f ${BIGDL_JAR} ]; then + echo "Cannot find ${BIGDL_JAR}!" + exit 1 +fi + +if [ ! -f ${BIGDL_PY_ZIP} ]; then + echo "Cannot find ${BIGDL_PY_ZIP}!" + exit 1 +fi + +if [ ! -f ${BIGDL_CONF} ]; then + echo "Cannot find ${BIGDL_CONF}!" + exit 1 +fi + +# Configure Spark +export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory'" + +# Install Toree +jupyter toree install --interpreters=Scala,PySpark --spark_home=${SPARK_HOME} --spark_opts='${SPARK_OPTS}' From e3b482e36195be9d179ebf8d77d3f51981e41bf0 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Wed, 17 Jan 2018 16:43:39 +0800 Subject: [PATCH 02/16] Refine the script. --- toree_install.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/toree_install.sh b/toree_install.sh index f5eb80c..1db0fc5 100755 --- a/toree_install.sh +++ b/toree_install.sh @@ -5,7 +5,6 @@ export BIGDL_PIP_HOME=`pip show BigDL | sed -n -e '/^Location/p' | sed 's/[^ ]* export BIGDL_HOME=${BIGDL_PIP_HOME}/bigdl/share export PYSPARK_PIP_HOME=`pip show pyspark | sed -n -e '/^Location/p' | sed 's/[^ ]* //'` export SPARK_HOME=`python ${PYSPARK_PIP_HOME}/pyspark/find_spark_home.py` -# export SPARK_HOME=${SPARK_PATH}/bin # Check installation of BigDL if [ -z "${BIGDL_HOME}" ]; then @@ -25,9 +24,7 @@ export BIGDL_JAR=${BIGDL_HOME}/lib/${BIGDL_JAR_NAME} export BIGDL_PY_ZIP_NAME=`ls ${BIGDL_HOME}/lib/ | grep python-api.zip` export BIGDL_PY_ZIP=${BIGDL_HOME}/lib/${BIGDL_PY_ZIP_NAME} export BIGDL_CONF=${BIGDL_HOME}/conf/spark-bigdl.conf -echo ${BIGDL_JAR} -echo ${BIGDL_PY_ZIP} -echo ${BIGDL_CONF} + # Check files if [ ! -f ${BIGDL_JAR} ]; then echo "Cannot find ${BIGDL_JAR}!" 
From fff42edaeb5c81699efdfa35c461a370c7d41257 Mon Sep 17 00:00:00 2001
From: Calculuser
Date: Mon, 29 Jan 2018 16:30:39 +0800
Subject: [PATCH 03/16] Check BigDL and Spark version.

---
 toree_install.sh | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/toree_install.sh b/toree_install.sh
index 1db0fc5..05228b5 100755
--- a/toree_install.sh
+++ b/toree_install.sh
@@ -1,20 +1,37 @@
 #!/bin/bash
 
+# Required BigDL and Spark version
+export BIGDL_VERSION=0.4.0
+export SPARK_VERSION=2.2.1
+
 # Find the path of BigDL and Spark
 export BIGDL_PIP_HOME=`pip show BigDL | sed -n -e '/^Location/p' | sed 's/[^ ]* //'`
 export BIGDL_HOME=${BIGDL_PIP_HOME}/bigdl/share
 export PYSPARK_PIP_HOME=`pip show pyspark | sed -n -e '/^Location/p' | sed 's/[^ ]* //'`
-export SPARK_HOME=`python ${PYSPARK_PIP_HOME}/pyspark/find_spark_home.py`
+export SPARK_HOME=${PYSPARK_PIP_HOME}/pyspark
 
 # Check installation of BigDL
 if [ -z "${BIGDL_HOME}" ]; then
-    echo "Please install BigDL correctly!"
+    echo "Cannot find BigDL installation directory. Have you run 'pip install BigDL=${BIGDL_VERSION}'?"
     exit 1
 fi
 
 # Check installation of Spark
 if [ -z "${SPARK_HOME}" ]; then
-    echo "Please set install Spark correctly!"
+    echo "Cannot find Spark installation directory. Have you run 'pip install BigDL=${BIGDL_VERSION}'?"
+    exit 1
+fi
+
+# Check the version of BigDL and Spark
+export BIGDL_TEMP_VERSION=`pip show BigDL | sed -n -e '/^Version/p' | sed 's/[^ ]* //'`
+if [ "${BIGDL_VERSION}" != "${BIGDL_TEMP_VERSION}" ]; then
+    echo "Wrong version of BigDL. Please run 'pip install BigDL=${BIGDL_VERSION}'."
+    exit 1
+fi
+
+export SPARK_TEMP_VERSION=`pip show pyspark | sed -n -e '/^Version/p' | sed 's/[^ ]* //'`
+if [ "${SPARK_VERSION}" != "${SPARK_TEMP_VERSION}" ]; then
+    echo "Wrong version of Spark. Please run 'pip install BigDL=${BIGDL_VERSION}'."
     exit 1
 fi

From 39ea406d7d33aff818ae822d396da9e2c5926834 Mon Sep 17 00:00:00 2001
From: Calculuser
Date: Thu, 1 Feb 2018 11:14:37 +0800
Subject: [PATCH 04/16] Fix errors in script and add proxy configuration.

---
 toree_install.sh | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/toree_install.sh b/toree_install.sh
index 05228b5..671d8cb 100755
--- a/toree_install.sh
+++ b/toree_install.sh
@@ -12,26 +12,26 @@ export SPARK_HOME=${PYSPARK_PIP_HOME}/pyspark
 
 # Check installation of BigDL
 if [ -z "${BIGDL_HOME}" ]; then
-    echo "Cannot find BigDL installation directory. Have you run 'pip install BigDL=${BIGDL_VERSION}'?"
+    echo "Cannot find BigDL installation directory. Have you run 'pip install BigDL==${BIGDL_VERSION}'?"
     exit 1
 fi
 
 # Check installation of Spark
 if [ -z "${SPARK_HOME}" ]; then
-    echo "Cannot find Spark installation directory. Have you run 'pip install BigDL=${BIGDL_VERSION}'?"
+    echo "Cannot find Spark installation directory. Have you run 'pip install BigDL==${BIGDL_VERSION}'?"
     exit 1
 fi
 
 # Check the version of BigDL and Spark
 export BIGDL_TEMP_VERSION=`pip show BigDL | sed -n -e '/^Version/p' | sed 's/[^ ]* //'`
 if [ "${BIGDL_VERSION}" != "${BIGDL_TEMP_VERSION}" ]; then
-    echo "Wrong version of BigDL. Please run 'pip install BigDL=${BIGDL_VERSION}'."
+    echo "Wrong version of BigDL. Please run 'pip install BigDL==${BIGDL_VERSION}'."
     exit 1
 fi
 
 export SPARK_TEMP_VERSION=`pip show pyspark | sed -n -e '/^Version/p' | sed 's/[^ ]* //'`
 if [ "${SPARK_VERSION}" != "${SPARK_TEMP_VERSION}" ]; then
-    echo "Wrong version of Spark. Please run 'pip install BigDL=${BIGDL_VERSION}'."
+    echo "Wrong version of Spark. 
Please run 'pip install BigDL==${BIGDL_VERSION}'." exit 1 fi @@ -58,8 +58,20 @@ if [ ! -f ${BIGDL_CONF} ]; then exit 1 fi -# Configure Spark -export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory'" +# Configure proxy and Spark +if [ ! -z "{HTTP_PROXY}" ]; then + HTTP_PROXY_PORT=${HTTP_PROXY##*:} + HTTP_PROXY_NAME=${HTTP_PROXY##*/} + HTTP_PROXY_HOST=${HTTP_PROXY_NAME%%:*} +fi + +if [ ! -z "{HTTPS_PROXY}" ]; then + HTTPS_PROXY_PORT=${HTTPS_PROXY##*:} + HTTPS_PROXY_NAME=${HTTPS_PROXY##*/} + HTTPS_PROXY_HOST=${HTTPS_PROXY_NAME%%:*} +fi + +export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory' --driver-java-options='-Dhttp.proxyHost=${HTTP_PROXY_HOST} -Dhttp.proxyPort=${HTTP_PROXY_PORT} -Dhttps.proxyHost=${HTTPS_PROXY_HOST} -Dhttps.proxyPort=${HTTPS_PROXY_PORT}'" # Install Toree -jupyter toree install --interpreters=Scala,PySpark --spark_home=${SPARK_HOME} --spark_opts='${SPARK_OPTS}' +jupyter toree install --interpreters=Scala,PySpark --spark_home=${SPARK_HOME} --spark_opts="${SPARK_OPTS}" From 015cc4db713129668e56618c01826897efe3fede Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 11:20:18 +0800 Subject: [PATCH 05/16] Refine proxy configuration. --- toree_install.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/toree_install.sh b/toree_install.sh index 671d8cb..e697dbc 100755 --- a/toree_install.sh +++ b/toree_install.sh @@ -59,15 +59,15 @@ if [ ! -f ${BIGDL_CONF} ]; then fi # Configure proxy and Spark -if [ ! -z "{HTTP_PROXY}" ]; then - HTTP_PROXY_PORT=${HTTP_PROXY##*:} - HTTP_PROXY_NAME=${HTTP_PROXY##*/} +if [ ! -z "{http_proxy}" ]; then + HTTP_PROXY_PORT=${http_proxy##*:} + HTTP_PROXY_NAME=${http_proxy##*/} HTTP_PROXY_HOST=${HTTP_PROXY_NAME%%:*} fi -if [ ! -z "{HTTPS_PROXY}" ]; then - HTTPS_PROXY_PORT=${HTTPS_PROXY##*:} - HTTPS_PROXY_NAME=${HTTPS_PROXY##*/} +if [ ! 
-z "{https_proxy}" ]; then + HTTPS_PROXY_PORT=${https_proxy##*:} + HTTPS_PROXY_NAME=${https_proxy##*/} HTTPS_PROXY_HOST=${HTTPS_PROXY_NAME%%:*} fi From 1016207aa0ba2bfac5d227eb2866f7adb4b14e1b Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 11:23:47 +0800 Subject: [PATCH 06/16] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 71c7116..d6c213f 100644 --- a/README.md +++ b/README.md @@ -23,16 +23,17 @@ Step-by-step Deep Leaning Tutorials on Apache Spark using [BigDL](https://github + JDK 8 + Apache Spark 2.2.0 + Jupyter Notebook 4.1 -+ BigDL 0.3.0 ++ BigDL 0.4.0 + [Setup env on Mac OS](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/SetupMac.md) / [Setup env on Linux](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/SetupLinux.md) -### Start Jupyter Server +### Start Jupyter Server and Toree Kernel * Run ```pip install BigDL==0.3.0``` -* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser``` +* Run ```pip install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz``` +* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser --NotebookApp.token='' --allow-root``` ## Run Demo * Open a browser - Suggest Chrome or Firefox or Safari * Access notebook client at address http://localhost:8888, open the example ipynb files and execute. ## Note -* This notebook is for BigDL 0.3.0. Please refer branch-0.2 if you need to use BigDL 0.2.0. +* This notebook is for BigDL 0.4.0. Please refer branch-0.3 if you need to use BigDL 0.3.0. From 84dca49dfeac730b8b4ece8b3bbd31366532dab6 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 13:25:12 +0800 Subject: [PATCH 07/16] Refine toree_install.sh and README. --- README.md | 12 ++++++------ toree_install.sh | 8 +------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index d6c213f..e0124e2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Deep Leaning Tutorials on Apache Spark using BigDL +# Deep Learning Tutorials on Apache Spark using BigDL -Step-by-step Deep Leaning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). +Step-by-step Deep Learning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). ### Topics 1. 
[RDD](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/RDD.ipynb) @@ -26,14 +26,14 @@ Step-by-step Deep Leaning Tutorials on Apache Spark using [BigDL](https://github + BigDL 0.4.0 + [Setup env on Mac OS](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/SetupMac.md) / [Setup env on Linux](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/SetupLinux.md) -### Start Jupyter Server and Toree Kernel -* Run ```pip install BigDL==0.3.0``` +### Start Jupyter Server +* Run ```pip install BigDL==0.4.0``` * Run ```pip install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz``` -* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser --NotebookApp.token='' --allow-root``` +* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser``` ## Run Demo * Open a browser - Suggest Chrome or Firefox or Safari * Access notebook client at address http://localhost:8888, open the example ipynb files and execute. ## Note -* This notebook is for BigDL 0.4.0. Please refer branch-0.3 if you need to use BigDL 0.3.0. +* This notebook is for BigDL 0.4.0. Please refer branch-0.3 if you need to use BigDL 0.3.0. ${SPARK_OPTS} diff --git a/toree_install.sh b/toree_install.sh index e697dbc..c0f5a8b 100755 --- a/toree_install.sh +++ b/toree_install.sh @@ -65,13 +65,7 @@ if [ ! -z "{http_proxy}" ]; then HTTP_PROXY_HOST=${HTTP_PROXY_NAME%%:*} fi -if [ ! -z "{https_proxy}" ]; then - HTTPS_PROXY_PORT=${https_proxy##*:} - HTTPS_PROXY_NAME=${https_proxy##*/} - HTTPS_PROXY_HOST=${HTTPS_PROXY_NAME%%:*} -fi - -export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory' --driver-java-options='-Dhttp.proxyHost=${HTTP_PROXY_HOST} -Dhttp.proxyPort=${HTTP_PROXY_PORT} -Dhttps.proxyHost=${HTTPS_PROXY_HOST} -Dhttps.proxyPort=${HTTPS_PROXY_PORT}'" +export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory' --driver-java-options='-Dhttp.proxyHost=${HTTP_PROXY_HOST} -Dhttp.proxyPort=${HTTP_PROXY_PORT}'" # Install Toree jupyter toree install --interpreters=Scala,PySpark --spark_home=${SPARK_HOME} --spark_opts="${SPARK_OPTS}" From f4b7d22ff3c3b1c429a1321fef16ec2b00da44bc Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 13:29:37 +0800 Subject: [PATCH 08/16] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e0124e2..180f1c7 100644 --- a/README.md +++ b/README.md @@ -29,11 +29,11 @@ Step-by-step Deep Learning Tutorials on Apache Spark using [BigDL](https://githu ### Start Jupyter Server * Run ```pip install BigDL==0.4.0``` * Run ```pip install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz``` -* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser``` +* Run ```jupyter notebook --notebook-dir=./ --ip=* --no-browser``` ## Run Demo * Open a browser - Suggest Chrome or Firefox or Safari * Access notebook client at address http://localhost:8888, open the example ipynb files and execute. ## Note -* This notebook is for BigDL 0.4.0. 
Please refer branch-0.3 if you need to use BigDL 0.3.0. ${SPARK_OPTS} +* This notebook is for BigDL 0.4.0. Please refer branch-0.3 if you need to use BigDL 0.3.0. From afea2066bc3b68455d4db941c23cf1c772a0a59c Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 13:53:15 +0800 Subject: [PATCH 09/16] Keep fork up to date. --- README.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..c42f7b1 --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# Deep Leaning Tutorials on Apache Spark using BigDL + +Step-by-step Deep Leaning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). + +### Topics +1. [RDD](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/RDD.ipynb) +2. [DataFrame](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/DataFrame.ipynb) +3. [SparkSQL](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/spark_sql.ipynb) +4. [StructureStreaming](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/structured_streaming.ipynb) +5. [Forward and backward](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/forward_and_backward.ipynb) +6. [Linear Regression](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/linear_regression.ipynb) +7. [Introduction to MNIST](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/introduction_to_mnist.ipynb) +8. [Logistic Regression](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/logistic_regression.ipynb) +9. [Feedforward Neural Network](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/deep_feed_forward_neural_network.ipynb) +10. [Convolutional Neural Network](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/cnn.ipynb) +11. [Recurrent Neural Network](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/rnn.ipynb) +12. [LSTM](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/lstm.ipynb) +13. [Bi-directional RNN](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/birnn.ipynb) +14. [Auto-encoder](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/autoencoder.ipynb) + +### Environment ++ Python 2.7 ++ JDK 8 ++ Apache Spark 2.2.0 ++ Jupyter Notebook 4.1 ++ BigDL 0.4.0 ++ [Setup env on Mac OS](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/SetupMac.md) / [Setup env on Linux](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/SetupLinux.md) + +### Start Jupyter Server +* Run ```pip install BigDL==0.4.0``` +* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser``` + +## Run Demo +* Open a browser - Suggest Chrome or Firefox or Safari +* Access notebook client at address http://localhost:8888, open the example ipynb files and execute. + +## Note +* This notebook is for BigDL 0.4.0. 
Please refer branch-0.3 if you need to use BigDL 0.3.0. From e016752fcea7704de53084bbacaa1466f812d339 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 13:55:38 +0800 Subject: [PATCH 10/16] Delete proxy configuration. --- toree_install.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/toree_install.sh b/toree_install.sh index c0f5a8b..d72c205 100755 --- a/toree_install.sh +++ b/toree_install.sh @@ -58,14 +58,8 @@ if [ ! -f ${BIGDL_CONF} ]; then exit 1 fi -# Configure proxy and Spark -if [ ! -z "{http_proxy}" ]; then - HTTP_PROXY_PORT=${http_proxy##*:} - HTTP_PROXY_NAME=${http_proxy##*/} - HTTP_PROXY_HOST=${HTTP_PROXY_NAME%%:*} -fi - -export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory' --driver-java-options='-Dhttp.proxyHost=${HTTP_PROXY_HOST} -Dhttp.proxyPort=${HTTP_PROXY_PORT}'" +# Configure Spark +export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory'" # Install Toree jupyter toree install --interpreters=Scala,PySpark --spark_home=${SPARK_HOME} --spark_opts="${SPARK_OPTS}" From 84cafdd5f4f46a5ec6add359edc9e54c6799c6ea Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 14:07:03 +0800 Subject: [PATCH 11/16] Update README.md --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c42f7b1..fda2068 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Deep Leaning Tutorials on Apache Spark using BigDL +# Deep Learning Tutorials on Apache Spark using BigDL -Step-by-step Deep Leaning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). +Step-by-step Deep Learning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). ### Topics 1. 
[RDD](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/RDD.ipynb) @@ -28,7 +28,13 @@ Step-by-step Deep Leaning Tutorials on Apache Spark using [BigDL](https://github ### Start Jupyter Server * Run ```pip install BigDL==0.4.0``` -* Run ``` jupyter notebook --notebook-dir=./ --ip=* --no-browser``` +* Run ```jupyter notebook --notebook-dir=./ --ip=* --no-browser``` + +### Start Toree Kernel to Run Scala Notebooks +* Run ```pip install BigDL==0.4.0``` +* Run ```pip install https://dist.apache.org/repos/dist/dev/incubator/toree/0.2.0/snapshots/dev1/toree-pip/toree-0.2.0.dev1.tar.gz``` +* Run ```./toree_install.sh``` +* Run ```jupyter notebook --notebook-dir=./ --ip=* --no-browser``` ## Run Demo * Open a browser - Suggest Chrome or Firefox or Safari From bbf39e753016213026b98f861f9f99e2362434ff Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 15:12:41 +0800 Subject: [PATCH 12/16] Delete start_toree.sh. --- start_toree.sh | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100755 start_toree.sh diff --git a/start_toree.sh b/start_toree.sh deleted file mode 100755 index 5f3c031..0000000 --- a/start_toree.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# Check environment variables -if [ -z "${BIGDL_HOME}" ]; then - echo "Please set BIGDL_HOME environment variable" - exit 1 -fi - -if [ -z "${SPARK_HOME}" ]; then - echo "Please set SPARK_HOME environment variable" - exit 1 -fi - -#setup pathes -export PYSPARK_DRIVER_PYTHON=jupyter -export PYSPARK_DRIVER_PYTHON_OPTS="notebook --notebook-dir=./ --ip=* --no-browser --NotebookApp.token=''" -export BIGDL_JAR_NAME=`ls ${BIGDL_HOME}/lib/ | grep jar-with-dependencies.jar` -export BIGDL_JAR="${BIGDL_HOME}/lib/$BIGDL_JAR_NAME" -export BIGDL_PY_ZIP_NAME=`ls ${BIGDL_HOME}/lib/ | grep python-api.zip` -export BIGDL_PY_ZIP="${BIGDL_HOME}/lib/$BIGDL_PY_ZIP_NAME" -export BIGDL_CONF=${BIGDL_HOME}/conf/spark-bigdl.conf - -# Check files -if [ ! -f ${BIGDL_CONF} ]; then - echo "Cannot find ${BIGDL_CONF}" - exit 1 -fi - -if [ ! -f ${BIGDL_PY_ZIP} ]; then - echo "Cannot find ${BIGDL_PY_ZIP}" - exit 1 -fi - -if [ ! -f $BIGDL_JAR ]; then - echo "Cannot find $BIGDL_JAR" - exit 1 -fi - -export SPARK_OPTS="--master local[4] --driver-memory 4g --properties-file ${BIGDL_CONF} --jars ${BIGDL_JAR} --conf spark.driver.extraClassPath=${BIGDL_JAR} --conf spark.executor.extraClassPath=${BIGDL_JAR} --conf spark.sql.catalogImplementation='in-memory'" - -echo 'Install toree to jupyter, this may need root privilege' -sudo jupyter toree install --spark_home=${SPARK_HOME} --spark_opts='${SPARK_OPTS}' -jupyter notebook --notebook-dir=./ --ip=* --no-browser --NotebookApp.token='' From 7398563f8e368a7ac1cd9c3b1e482135b8af8d1e Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 15:41:10 +0800 Subject: [PATCH 13/16] Add scala notebooks and change the structure of notebooks. 
--- .../neural_networks/autoencoder.ipynb | 0 .../{ => python}/neural_networks/birnn.ipynb | 0 .../{ => python}/neural_networks/cnn.ipynb | 0 .../deep_feed_forward_neural_network.ipynb | 0 .../forward_and_backward.ipynb | 0 .../introduction_to_mnist.ipynb | 0 .../neural_networks/linear_regression.ipynb | 0 .../neural_networks/logistic_regression.ipynb | 0 .../{ => python}/neural_networks/lstm.ipynb | 0 .../{ => python}/neural_networks/rnn.ipynb | 0 .../Bi-directional_RNN/Bi-directional_RNN.jpg | Bin .../autoencoder/autoencoder_schema.jpg | Bin .../feedforwardNN_structure.png | Bin .../{ => python}/neural_networks/utils.py | 0 .../{ => python}/spark_basics/DataFrame.ipynb | 0 notebooks/{ => python}/spark_basics/RDD.ipynb | 0 .../{ => python}/spark_basics/spark_sql.ipynb | 0 .../spark_basics/structured_streaming.ipynb | 0 .../introduction_to_mnist.ipynb | 201 +++++++++++ .../neural_networks/linear_regression.ipynb | 243 +++++++++++++ .../neural_networks/logistic_regression.ipynb | 321 ++++++++++++++++++ 21 files changed, 765 insertions(+) rename notebooks/{ => python}/neural_networks/autoencoder.ipynb (100%) rename notebooks/{ => python}/neural_networks/birnn.ipynb (100%) rename notebooks/{ => python}/neural_networks/cnn.ipynb (100%) rename notebooks/{ => python}/neural_networks/deep_feed_forward_neural_network.ipynb (100%) rename notebooks/{ => python}/neural_networks/forward_and_backward.ipynb (100%) rename notebooks/{ => python}/neural_networks/introduction_to_mnist.ipynb (100%) rename notebooks/{ => python}/neural_networks/linear_regression.ipynb (100%) rename notebooks/{ => python}/neural_networks/logistic_regression.ipynb (100%) rename notebooks/{ => python}/neural_networks/lstm.ipynb (100%) rename notebooks/{ => python}/neural_networks/rnn.ipynb (100%) rename notebooks/{ => python}/neural_networks/tutorial_images/Bi-directional_RNN/Bi-directional_RNN.jpg (100%) rename notebooks/{ => python}/neural_networks/tutorial_images/autoencoder/autoencoder_schema.jpg (100%) rename notebooks/{ => python}/neural_networks/tutorial_images/deep_feed_forward_NN/feedforwardNN_structure.png (100%) rename notebooks/{ => python}/neural_networks/utils.py (100%) rename notebooks/{ => python}/spark_basics/DataFrame.ipynb (100%) rename notebooks/{ => python}/spark_basics/RDD.ipynb (100%) rename notebooks/{ => python}/spark_basics/spark_sql.ipynb (100%) rename notebooks/{ => python}/spark_basics/structured_streaming.ipynb (100%) create mode 100644 notebooks/scala/neural_networks/introduction_to_mnist.ipynb create mode 100644 notebooks/scala/neural_networks/linear_regression.ipynb create mode 100644 notebooks/scala/neural_networks/logistic_regression.ipynb diff --git a/notebooks/neural_networks/autoencoder.ipynb b/notebooks/python/neural_networks/autoencoder.ipynb similarity index 100% rename from notebooks/neural_networks/autoencoder.ipynb rename to notebooks/python/neural_networks/autoencoder.ipynb diff --git a/notebooks/neural_networks/birnn.ipynb b/notebooks/python/neural_networks/birnn.ipynb similarity index 100% rename from notebooks/neural_networks/birnn.ipynb rename to notebooks/python/neural_networks/birnn.ipynb diff --git a/notebooks/neural_networks/cnn.ipynb b/notebooks/python/neural_networks/cnn.ipynb similarity index 100% rename from notebooks/neural_networks/cnn.ipynb rename to notebooks/python/neural_networks/cnn.ipynb diff --git a/notebooks/neural_networks/deep_feed_forward_neural_network.ipynb b/notebooks/python/neural_networks/deep_feed_forward_neural_network.ipynb similarity index 100% rename 
from notebooks/neural_networks/deep_feed_forward_neural_network.ipynb rename to notebooks/python/neural_networks/deep_feed_forward_neural_network.ipynb diff --git a/notebooks/neural_networks/forward_and_backward.ipynb b/notebooks/python/neural_networks/forward_and_backward.ipynb similarity index 100% rename from notebooks/neural_networks/forward_and_backward.ipynb rename to notebooks/python/neural_networks/forward_and_backward.ipynb diff --git a/notebooks/neural_networks/introduction_to_mnist.ipynb b/notebooks/python/neural_networks/introduction_to_mnist.ipynb similarity index 100% rename from notebooks/neural_networks/introduction_to_mnist.ipynb rename to notebooks/python/neural_networks/introduction_to_mnist.ipynb diff --git a/notebooks/neural_networks/linear_regression.ipynb b/notebooks/python/neural_networks/linear_regression.ipynb similarity index 100% rename from notebooks/neural_networks/linear_regression.ipynb rename to notebooks/python/neural_networks/linear_regression.ipynb diff --git a/notebooks/neural_networks/logistic_regression.ipynb b/notebooks/python/neural_networks/logistic_regression.ipynb similarity index 100% rename from notebooks/neural_networks/logistic_regression.ipynb rename to notebooks/python/neural_networks/logistic_regression.ipynb diff --git a/notebooks/neural_networks/lstm.ipynb b/notebooks/python/neural_networks/lstm.ipynb similarity index 100% rename from notebooks/neural_networks/lstm.ipynb rename to notebooks/python/neural_networks/lstm.ipynb diff --git a/notebooks/neural_networks/rnn.ipynb b/notebooks/python/neural_networks/rnn.ipynb similarity index 100% rename from notebooks/neural_networks/rnn.ipynb rename to notebooks/python/neural_networks/rnn.ipynb diff --git a/notebooks/neural_networks/tutorial_images/Bi-directional_RNN/Bi-directional_RNN.jpg b/notebooks/python/neural_networks/tutorial_images/Bi-directional_RNN/Bi-directional_RNN.jpg similarity index 100% rename from notebooks/neural_networks/tutorial_images/Bi-directional_RNN/Bi-directional_RNN.jpg rename to notebooks/python/neural_networks/tutorial_images/Bi-directional_RNN/Bi-directional_RNN.jpg diff --git a/notebooks/neural_networks/tutorial_images/autoencoder/autoencoder_schema.jpg b/notebooks/python/neural_networks/tutorial_images/autoencoder/autoencoder_schema.jpg similarity index 100% rename from notebooks/neural_networks/tutorial_images/autoencoder/autoencoder_schema.jpg rename to notebooks/python/neural_networks/tutorial_images/autoencoder/autoencoder_schema.jpg diff --git a/notebooks/neural_networks/tutorial_images/deep_feed_forward_NN/feedforwardNN_structure.png b/notebooks/python/neural_networks/tutorial_images/deep_feed_forward_NN/feedforwardNN_structure.png similarity index 100% rename from notebooks/neural_networks/tutorial_images/deep_feed_forward_NN/feedforwardNN_structure.png rename to notebooks/python/neural_networks/tutorial_images/deep_feed_forward_NN/feedforwardNN_structure.png diff --git a/notebooks/neural_networks/utils.py b/notebooks/python/neural_networks/utils.py similarity index 100% rename from notebooks/neural_networks/utils.py rename to notebooks/python/neural_networks/utils.py diff --git a/notebooks/spark_basics/DataFrame.ipynb b/notebooks/python/spark_basics/DataFrame.ipynb similarity index 100% rename from notebooks/spark_basics/DataFrame.ipynb rename to notebooks/python/spark_basics/DataFrame.ipynb diff --git a/notebooks/spark_basics/RDD.ipynb b/notebooks/python/spark_basics/RDD.ipynb similarity index 100% rename from notebooks/spark_basics/RDD.ipynb rename to 
notebooks/python/spark_basics/RDD.ipynb
diff --git a/notebooks/spark_basics/spark_sql.ipynb b/notebooks/python/spark_basics/spark_sql.ipynb
similarity index 100%
rename from notebooks/spark_basics/spark_sql.ipynb
rename to notebooks/python/spark_basics/spark_sql.ipynb
diff --git a/notebooks/spark_basics/structured_streaming.ipynb b/notebooks/python/spark_basics/structured_streaming.ipynb
similarity index 100%
rename from notebooks/spark_basics/structured_streaming.ipynb
rename to notebooks/python/spark_basics/structured_streaming.ipynb
diff --git a/notebooks/scala/neural_networks/introduction_to_mnist.ipynb b/notebooks/scala/neural_networks/introduction_to_mnist.ipynb
new file mode 100644
index 0000000..98a2a1d
--- /dev/null
+++ b/notebooks/scala/neural_networks/introduction_to_mnist.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Introduction to the MNIST database"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the following tutorials, we are going to use the MNIST database of handwritten digits. It is a simple computer vision dataset with 60,000 training examples and 10,000 test examples. \"It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting.\" For more details of this database, please check out the website [MNIST](http://yann.lecun.com/exdb/mnist/)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In BigDL, we need to write a function to read the MNIST data when using Scala."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import java.nio.ByteBuffer\n",
+    "import java.nio.file.{Files, Path, Paths}\n",
+    "\n",
+    "import com.intel.analytics.bigdl.dataset.ByteRecord\n",
+    "import com.intel.analytics.bigdl.utils.File\n",
+    "import scopt.OptionParser\n",
+    "\n",
+    "def load(featureFile: String, labelFile: String): Array[ByteRecord] = {\n",
+    "  val featureBuffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(featureFile)))\n",
+    "  val labelBuffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(labelFile)))\n",
+    "\n",
+    "  val labelMagicNumber = labelBuffer.getInt()\n",
+    "  require(labelMagicNumber == 2049)\n",
+    "  val featureMagicNumber = featureBuffer.getInt()\n",
+    "  require(featureMagicNumber == 2051)\n",
+    "\n",
+    "  val labelCount = labelBuffer.getInt()\n",
+    "  val featureCount = featureBuffer.getInt()\n",
+    "  require(labelCount == featureCount)\n",
+    "\n",
+    "  val rowNum = featureBuffer.getInt()\n",
+    "  val colNum = featureBuffer.getInt()\n",
+    "\n",
+    "  val result = new Array[ByteRecord](featureCount)\n",
+    "  var i = 0\n",
+    "  while (i < featureCount) {\n",
+    "    val img = new Array[Byte]((rowNum * colNum))\n",
+    "    var y = 0\n",
+    "    while (y < rowNum) {\n",
+    "      var x = 0\n",
+    "      while (x < colNum) {\n",
+    "        img(x + y * colNum) = featureBuffer.get()\n",
+    "        x += 1\n",
+    "      }\n",
+    "      y += 1\n",
+    "    }\n",
+    "    result(i) = ByteRecord(img, labelBuffer.get().toFloat + 1.0f)\n",
+    "    i += 1\n",
+    "  }\n",
+    "\n",
+    "  result\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we need to import the necessary packages and initialize the engine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import org.apache.log4j.{Level, Logger}\n",
+    "import org.apache.spark.SparkContext\n",
+    "\n",
+    "import com.intel.analytics.bigdl.utils._\n",
+    "import com.intel.analytics.bigdl.dataset.DataSet\n",
+    "import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch, GreyImgToSample}\n",
+    "import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}\n",
+    "import com.intel.analytics.bigdl.models.lenet.Utils._\n",
+    "import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, LogSoftMax, Sequential, Reshape}\n",
+    "import com.intel.analytics.bigdl.numeric.NumericFloat\n",
+    "import com.intel.analytics.bigdl.optim.{SGD, Top1Accuracy}\n",
+    "import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, T, Table}\n",
+    "import com.intel.analytics.bigdl.tensor.Tensor\n",
+    "\n",
+    "Engine.init"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, the paths of training data and validation data should be set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "val trainData = \"../datasets/mnist/train-images-idx3-ubyte\"\n",
+    "val trainLabel = \"../datasets/mnist/train-labels-idx1-ubyte\"\n",
+    "val validationData = \"../datasets/mnist/t10k-images-idx3-ubyte\"\n",
+    "val validationLabel = \"../datasets/mnist/t10k-labels-idx1-ubyte\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Then, we need to define some parameters for loading the MNIST data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "//Parameters\n",
+    "val batchSize = 2048\n",
+    "val learningRate = 0.2\n",
+    "val maxEpochs = 15\n",
+    "\n",
+    "//Network Parameters\n",
+    "val nInput = 784 //MNIST data input (img shape: 28*28)\n",
+    "val nClasses = 10 //MNIST total classes (0-9 digits)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we can use the predefined function to load and serialize the MNIST data. If you want to output the data, some modifications to the function should be applied."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "val trainSet = \n",
+    "  DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(batchSize)\n",
+    "val validationSet = \n",
+    "  DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(batchSize)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "sc.stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Apache Toree - Scala",
+   "language": "scala",
+   "name": "apache_toree_scala"
+  },
+  "language_info": {
+   "file_extension": ".scala",
+   "name": "scala",
+   "version": "2.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/scala/neural_networks/linear_regression.ipynb b/notebooks/scala/neural_networks/linear_regression.ipynb
new file mode 100644
index 0000000..3f4437d
--- /dev/null
+++ b/notebooks/scala/neural_networks/linear_regression.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Linear Regression"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this tutorial, we will introduce how to use BigDL to train a simple linear regression model. The first thing we need to do is to import the necessary packages and initialize the engine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import org.apache.log4j.{Level, Logger}\n",
+    "import org.apache.spark.SparkContext\n",
+    "\n",
+    "import com.intel.analytics.bigdl._\n",
+    "import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, T, Table}\n",
+    "import com.intel.analytics.bigdl.dataset.{DataSet, Sample}\n",
+    "import com.intel.analytics.bigdl.nn.{Sequential, Linear, MSECriterion}\n",
+    "import com.intel.analytics.bigdl.optim._\n",
+    "import com.intel.analytics.bigdl.models.lenet.Utils._\n",
+    "import com.intel.analytics.bigdl.optim.{SGD, Top1Accuracy}\n",
+    "import com.intel.analytics.bigdl.tensor._\n",
+    "import com.intel.analytics.bigdl.numeric.NumericFloat\n",
+    "\n",
+    "Engine.init"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then we randomly create datasets for training."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "val featuresDim = 2\n",
+    "val dataLen = 100\n",
+    "\n",
+    "def GetRandSample() = {\n",
+    "  val features = Tensor(featuresDim).rand(0, 1)\n",
+    "  val label = (0.4 + features.sum * 2).toFloat\n",
+    "  val sample = Sample[Float](features, label)\n",
+    "  sample\n",
+    "}\n",
+    "\n",
+    "val rddTrain = sc.parallelize(0 until dataLen).map(_ => GetRandSample())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "Then we specify the necessary parameters and construct a linear regression model using BigDL. Please note that batch_size should be divisible by the number of cores you use. In this example, it is set to 4 since there are 4 cores when running the example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "// Parameters\n",
+    "val learningRate = 0.2\n",
+    "val trainingEpochs = 5\n",
+    "val batchSize = 4\n",
+    "val nInput = featuresDim\n",
+    "val nOutput = 1\n",
+    "\n",
+    "def LinearRegression(nInput: Int, nOutput: Int) = {\n",
+    "  // Initialize a sequential container\n",
+    "  val model = Sequential()\n",
+    "  // Add a linear layer\n",
+    "  model.add(Linear(nInput, nOutput))\n",
+    "  model\n",
+    "}\n",
+    "\n",
+    "val model = LinearRegression(nInput, nOutput)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we construct the optimizer to optimize the linear regression problem. You can specify your own learning rate in the $SGD()$ method; you can also replace $SGD()$ with other optimizers such as $Adam()$. Click [here](https://github.com/intel-analytics/BigDL/tree/master/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim) to see more optimizers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "com.intel.analytics.bigdl.optim.DistriOptimizer@2ec4ba5b"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "val optimizer = Optimizer(model = model, sampleRDD = rddTrain, criterion = MSECriterion[Float](), batchSize = batchSize)\n",
+    "optimizer.setOptimMethod(new SGD(learningRate=learningRate))\n",
+    "optimizer.setEndWhen(Trigger.maxEpoch(trainingEpochs))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "// Start to train\n",
+    "val trainedModel = optimizer.optimize()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Predict result:\n",
+      "3.7649865,2.7541423,1.9586959,1.5578532,3.7649865\n"
+     ]
+    }
+   ],
+   "source": [
+    "val predictResult = trainedModel.predict(rddTrain)\n",
+    "val p = predictResult.take(5).map(_.toTensor.valueAt(1)).mkString(\",\")\n",
+    "println(\"Predict result:\")\n",
+    "println(p)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To test the trained model, we construct a dataset for testing and print the resulting _Mean Square Error_."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5.747768\n"
+     ]
+    }
+   ],
+   "source": [
+    "val r = new scala.util.Random(100)\n",
+    "val totalLength = 10\n",
+    "val features = Tensor(totalLength, featuresDim).rand(0, 1)\n",
+    "var label = (0.4 + features.sum).toFloat\n",
+    "val prediction = sc.parallelize(0 until totalLength).map(r => Sample[Float](features(r + 1), label))\n",
+    "val predictResult = trainedModel.predict(prediction)\n",
+    "val p = predictResult.take(6).map(_.toTensor.valueAt(1))\n",
+    "val groundLabel = Tensor(T(\n",
+    "  | T(-0.47596836f),\n",
+    "  | T(-0.37598032f),\n",
+    "  | T(-0.00492062f),\n",
+    "  | T(-0.5906958f),\n",
+    "  | T(-0.12307882f),\n",
+    "  | T(-0.77907401f)))\n",
+    "\n",
+    "var mse = 0f\n",
+    "for (i <- 1 to 6) {\n",
+    "  mse += (p(i - 1) - groundLabel(i).valueAt(1)) * (p(i - 1) - groundLabel(i).valueAt(1))\n",
+    "}\n",
+    "mse /= 6f\n",
+    "println(mse)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we stop the SparkContext."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "sc.stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Apache Toree - Scala",
+   "language": "scala",
+   "name": "apache_toree_scala"
+  },
+  "language_info": {
+   "file_extension": ".scala",
+   "name": "scala",
+   "version": "2.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/scala/neural_networks/logistic_regression.ipynb b/notebooks/scala/neural_networks/logistic_regression.ipynb
new file mode 100644
index 0000000..e8a7ac4
--- /dev/null
+++ b/notebooks/scala/neural_networks/logistic_regression.ipynb
@@ -0,0 +1,321 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Logistic Regression"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this tutorial we will introduce how to build a logistic regression model using BigDL. We use the *MNIST* data for experiments in this tutorial. For more information about MNIST, please refer to this [site](http://yann.lecun.com/exdb/mnist/). The first thing we need to do is to import the necessary packages and initialize the engine."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This part prepares for loading the MNIST data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import java.nio.ByteBuffer\n",
+    "import java.nio.file.{Files, Path, Paths}\n",
+    "\n",
+    "import com.intel.analytics.bigdl.dataset.ByteRecord\n",
+    "import com.intel.analytics.bigdl.utils.File\n",
+    "import scopt.OptionParser\n",
+    "\n",
+    "def load(featureFile: String, labelFile: String): Array[ByteRecord] = {\n",
+    "  val featureBuffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(featureFile)))\n",
+    "  val labelBuffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(labelFile)))\n",
+    "\n",
+    "  val labelMagicNumber = labelBuffer.getInt()\n",
+    "  require(labelMagicNumber == 2049)\n",
+    "  val featureMagicNumber = featureBuffer.getInt()\n",
+    "  require(featureMagicNumber == 2051)\n",
+    "\n",
+    "  val labelCount = labelBuffer.getInt()\n",
+    "  val featureCount = featureBuffer.getInt()\n",
+    "  require(labelCount == featureCount)\n",
+    "\n",
+    "  val rowNum = featureBuffer.getInt()\n",
+    "  val colNum = featureBuffer.getInt()\n",
+    "\n",
+    "  val result = new Array[ByteRecord](featureCount)\n",
+    "  var i = 0\n",
+    "  while (i < featureCount) {\n",
+    "    val img = new Array[Byte]((rowNum * colNum))\n",
+    "    var y = 0\n",
+    "    while (y < rowNum) {\n",
+    "      var x = 0\n",
+    "      while (x < colNum) {\n",
+    "        img(x + y * colNum) = featureBuffer.get()\n",
+    "        x += 1\n",
+    "      }\n",
+    "      y += 1\n",
+    "    }\n",
+    "    result(i) = ByteRecord(img, labelBuffer.get().toFloat + 1.0f)\n",
+    "    i += 1\n",
+    "  }\n",
+    "\n",
+    "  result\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import org.apache.log4j.{Level, Logger}\n",
+    "import org.apache.spark.SparkContext\n",
+    "\n",
+    "import com.intel.analytics.bigdl._\n",
+    "import com.intel.analytics.bigdl.utils._\n",
+    "import com.intel.analytics.bigdl.dataset.DataSet\n",
+    "import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch, GreyImgToSample}\n",
+    "import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}\n",
+    "import com.intel.analytics.bigdl.numeric.NumericFloat\n",
+    "import
com.intel.analytics.bigdl.optim._\n", + "import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, T, Table}\n", + "import com.intel.analytics.bigdl.models.lenet.Utils._\n", + "import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, LogSoftMax, Sequential, Reshape}\n", + "import com.intel.analytics.bigdl.optim.SGD\n", + "import com.intel.analytics.bigdl.optim.Top1Accuracy\n", + "import com.intel.analytics.bigdl.tensor._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we get and store MNIST for training and testing. You should edit the paths below according to your system settings." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "val trainData = \"../datasets/mnist/train-images-idx3-ubyte\"\n", + "val trainLabel = \"../datasets/mnist/train-labels-idx1-ubyte\"\n", + "val validationData = \"../datasets/mnist/t10k-images-idx3-ubyte\"\n", + "val validationLabel = \"../datasets/mnist/t10k-labels-idx1-ubyte\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "//Parameters\n", + "val batchSize = 2048\n", + "val learningRate = 0.2\n", + "val maxEpochs = 15\n", + "\n", + "//Network Parameters\n", + "val nInput = 784 //MNIST data input (img shape: 28*28)\n", + "val nClasses = 10 //MNIST total classes (0-9 digits)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "Engine.init" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "val trainSet = \n", + " DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(batchSize)\n", + "val validationSet = \n", + " DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(batchSize)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sequential[fca52368]{\n", + " [input -> (1) -> (2) -> (3) -> output]\n", + " (1): Reshape[75153ad2](784)\n", + " (2): Linear[aacbecbd](784 -> 10)\n", + " (3): LogSoftMax[378e7035]\n", + "}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val model = Sequential().add(Reshape(Array(28 * 28))).add(Linear(nInput, nClasses)).add(LogSoftMax())\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "com.intel.analytics.bigdl.optim.DistriOptimizer@4ea3ca6e" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val optimizer = Optimizer(model = model, dataset = trainSet, criterion = ClassNLLCriterion[Float]())\n", + "optimizer.setValidation(trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy[Float], new Top5Accuracy[Float], new Loss[Float]))\n", + "optimizer.setOptimMethod(new SGD(learningRate=learningRate))\n", + "optimizer.setEndWhen(Trigger.maxEpoch(maxEpochs))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "can't find locality partition for partition 0 
Partition locations are (ArrayBuffer(172.168.0.21)) Candidate partition locations are\n", + "(0,List()).\n" + ] + }, + { + "data": { + "text/plain": [ + "Sequential[fca52368]{\n", + " [input -> (1) -> (2) -> (3) -> output]\n", + " (1): Reshape[75153ad2](784)\n", + " (2): Linear[aacbecbd](784 -> 10)\n", + " (3): LogSoftMax[378e7035]\n", + "}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val trainedModel = optimizer.optimize()\n", + "trainedModel" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top1Accuracy is Accuracy(correct: 9209, count: 10000, accuracy: 0.9209)\n" + ] + } + ], + "source": [ + "val rddData = sc.parallelize(load(validationData, validationLabel), batchSize)\n", + "val transformer = BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample()\n", + "val evaluationSet = transformer(rddData)\n", + " \n", + "val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(batchSize))\n", + "\n", + "result.foreach(r => println(s\"${r._2} is ${r._1}\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.0,3.0,2.0,1.0,5.0,2.0,5.0,10.0,7.0,10.0,1.0,7.0,10.0,1.0,2.0,6.0,10.0,8.0,4.0,5.0\n", + "8.0,3.0,2.0,1.0,5.0,2.0,5.0,10.0,6.0,10.0,1.0,7.0,10.0,1.0,2.0,6.0,10.0,8.0,4.0,5.0\n" + ] + } + ], + "source": [ + "val predictions = model.predict(evaluationSet)\n", + "val preLabels = predictions.take(20).map(_.toTensor.max(1)._2.valueAt(1)).mkString(\",\")\n", + "val labels = evaluationSet.take(20).map(_.label.valueAt(1)).mkString(\",\")\n", + "println(preLabels)\n", + "println(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "sc.stop()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Apache Toree - Scala", + "language": "scala", + "name": "apache_toree_scala" + }, + "language_info": { + "file_extension": ".scala", + "name": "scala", + "version": "2.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 2f9c3e5c53dd42ed2a78c709f7877c0a5907fc10 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 15:52:16 +0800 Subject: [PATCH 14/16] Modify hyperlinks in README.md. --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index fda2068..85a745c 100644 --- a/README.md +++ b/README.md @@ -3,20 +3,20 @@ Step-by-step Deep Learning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). ### Topics -1. [RDD](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/RDD.ipynb) -2. [DataFrame](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/DataFrame.ipynb) -3. [SparkSQL](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/spark_sql.ipynb) -4. [StructureStreaming](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/spark_basics/structured_streaming.ipynb) -5. 
[Forward and backward](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/forward_and_backward.ipynb) -6. [Linear Regression](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/linear_regression.ipynb) -7. [Introduction to MNIST](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/introduction_to_mnist.ipynb) -8. [Logistic Regression](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/logistic_regression.ipynb) -9. [Feedforward Neural Network](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/deep_feed_forward_neural_network.ipynb) -10. [Convolutional Neural Network](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/cnn.ipynb) -11. [Recurrent Neural Network](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/rnn.ipynb) -12. [LSTM](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/lstm.ipynb) -13. [Bi-directional RNN](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/birnn.ipynb) -14. [Auto-encoder](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/autoencoder.ipynb) +1. RDD [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/RDD.ipynb) +2. DataFrame [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/DataFrame.ipynb) +3. SparkSQL [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/spark_sql.ipynb) +4. StructureStreaming [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/structured_streaming.ipynb) +5. Forward and backward [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/forward_and_backward.ipynb) +6. Linear Regression [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/linear_regression.ipynb) [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/linear_regression.ipynb) +7. Introduction to MNIST [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/introduction_to_mnist.ipynb) [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/introduction_to_mnist.ipynb) +8. Logistic Regression [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/logistic_regression.ipynb) [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/logistic_regression.ipynb) +9. Feedforward Neural Network [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/deep_feed_forward_neural_network.ipynb) +10. Convolutional Neural Network [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/cnn.ipynb) +11. Recurrent Neural Network [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/rnn.ipynb) +12. LSTM [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/lstm.ipynb) +13. 
Bi-directional RNN[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/birnn.ipynb) +14. Auto-encoder [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/autoencoder.ipynb) ### Environment + Python 2.7 From d4256e60e9aba3cfdac1a155a0ec1a7887877185 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Thu, 1 Feb 2018 16:36:35 +0800 Subject: [PATCH 15/16] Update README.md --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 85a745c..eea1673 100644 --- a/README.md +++ b/README.md @@ -3,20 +3,20 @@ Step-by-step Deep Learning Tutorials on Apache Spark using [BigDL](https://github.com/intel-analytics/BigDL/). The tutorials are inspired by [Apache Spark examples](http://spark.apache.org/examples.html), the [Theano Tutorials](https://github.com/Newmu/Theano-Tutorials) and the [Tensorflow tutorials](https://github.com/nlintz/TensorFlow-Tutorials). ### Topics -1. RDD [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/RDD.ipynb) -2. DataFrame [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/DataFrame.ipynb) -3. SparkSQL [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/spark_sql.ipynb) -4. StructureStreaming [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/structured_streaming.ipynb) -5. Forward and backward [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/forward_and_backward.ipynb) -6. Linear Regression [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/linear_regression.ipynb) [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/linear_regression.ipynb) -7. Introduction to MNIST [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/introduction_to_mnist.ipynb) [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/introduction_to_mnist.ipynb) -8. Logistic Regression [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/logistic_regression.ipynb) [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/logistic_regression.ipynb) -9. Feedforward Neural Network [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/deep_feed_forward_neural_network.ipynb) -10. Convolutional Neural Network [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/cnn.ipynb) -11. Recurrent Neural Network [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/rnn.ipynb) -12. LSTM [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/lstm.ipynb) -13. Bi-directional RNN[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/birnn.ipynb) -14. Auto-encoder [Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/autoencoder.ipynb) +1. 
RDD [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/RDD.ipynb)] +2. DataFrame [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/DataFrame.ipynb)] +3. SparkSQL [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/spark_sql.ipynb)] +4. StructureStreaming [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/spark_basics/structured_streaming.ipynb)] +5. Forward and backward [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/forward_and_backward.ipynb)] +6. Linear Regression [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/linear_regression.ipynb) | [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/linear_regression.ipynb)] +7. Introduction to MNIST [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/introduction_to_mnist.ipynb) | [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/introduction_to_mnist.ipynb)] +8. Logistic Regression [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/logistic_regression.ipynb) | [Scala](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/scala/neural_networks/logistic_regression.ipynb)] +9. Feedforward Neural Network [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/deep_feed_forward_neural_network.ipynb)] +10. Convolutional Neural Network [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/cnn.ipynb)] +11. Recurrent Neural Network [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/rnn.ipynb)] +12. LSTM [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/lstm.ipynb)] +13. Bi-directional RNN [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/birnn.ipynb)] +14. Auto-encoder [[Python](https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/python/neural_networks/autoencoder.ipynb)] ### Environment + Python 2.7 From 55ec57adfc2c6b6a516cc3ce2f604c1edab65dc1 Mon Sep 17 00:00:00 2001 From: Calculuser Date: Fri, 2 Feb 2018 13:35:45 +0800 Subject: [PATCH 16/16] Delete linear regression and logistic regression. 
---
 .../neural_networks/linear_regression.ipynb | 243 -------------
 .../neural_networks/logistic_regression.ipynb | 321 ------------------
 2 files changed, 564 deletions(-)
 delete mode 100644 notebooks/scala/neural_networks/linear_regression.ipynb
 delete mode 100644 notebooks/scala/neural_networks/logistic_regression.ipynb

diff --git a/notebooks/scala/neural_networks/linear_regression.ipynb b/notebooks/scala/neural_networks/linear_regression.ipynb
deleted file mode 100644
index 3f4437d..0000000
--- a/notebooks/scala/neural_networks/linear_regression.ipynb
+++ /dev/null
@@ -1,243 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Linear Regression"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this tutorial, we will introduce how to use BigDL to train a simple linear regression model. The first thing we need to do is to import the necessary packages and initialize the engine."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "import org.apache.log4j.{Level, Logger}\n",
-    "import org.apache.spark.SparkContext\n",
-    "\n",
-    "import com.intel.analytics.bigdl._\n",
-    "import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, T, Table}\n",
-    "import com.intel.analytics.bigdl.dataset.{DataSet, Sample}\n",
-    "import com.intel.analytics.bigdl.nn.{Sequential, Linear, MSECriterion}\n",
-    "import com.intel.analytics.bigdl.optim._\n",
-    "import com.intel.analytics.bigdl.models.lenet.Utils._\n",
-    "import com.intel.analytics.bigdl.optim.{SGD, Top1Accuracy}\n",
-    "import com.intel.analytics.bigdl.tensor._\n",
-    "import com.intel.analytics.bigdl.numeric.NumericFloat\n",
-    "\n",
-    "Engine.init"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Then we randomly create a dataset for training."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "val featuresDim = 2\n",
-    "val dataLen = 100\n",
-    "\n",
-    "def GetRandSample() = {\n",
-    "  val features = Tensor(featuresDim).rand(0, 1)\n",
-    "  val label = (0.4 + features.sum * 2).toFloat\n",
-    "  val sample = Sample[Float](features, label)\n",
-    "  sample\n",
-    "}\n",
-    "\n",
-    "val rddTrain = sc.parallelize(0 until dataLen).map(_ => GetRandSample())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": true
-   },
-   "source": [
-    "Then we specify the necessary parameters and construct a linear regression model using BigDL. Please note that batch_size should be divisible by the number of cores you use; in this example, it is set to 4 since there are 4 cores when running the example (see the divisibility check sketched after this patch series)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "// Parameters\n",
-    "val learningRate = 0.2\n",
-    "val trainingEpochs = 5\n",
-    "val batchSize = 4\n",
-    "val nInput = featuresDim\n",
-    "val nOutput = 1 \n",
-    "\n",
-    "def LinearRegression(nInput: Int, nOutput: Int) = {\n",
-    "  // Initialize a sequential container\n",
-    "  val model = Sequential()\n",
-    "  // Add a linear layer\n",
-    "  model.add(Linear(nInput, nOutput))\n",
-    "  model\n",
-    "}\n",
-    "\n",
-    "val model = LinearRegression(nInput, nOutput)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here we construct the optimizer to optimize the linear regression problem. You can specify your own learning rate in the $SGD()$ method, and you can also replace $SGD()$ with another optimizer such as $Adam()$ (a sketch of this swap follows the patch series). Click [here](https://github.com/intel-analytics/BigDL/tree/master/spark/dl/src/main/scala/com/intel/analytics/bigdl/optim) to see more optimizers."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "com.intel.analytics.bigdl.optim.DistriOptimizer@2ec4ba5b"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "val optimizer = Optimizer(model = model, sampleRDD = rddTrain, criterion = MSECriterion[Float](), batchSize = batchSize)\n",
-    "optimizer.setOptimMethod(new SGD(learningRate = learningRate))\n",
-    "optimizer.setEndWhen(Trigger.maxEpoch(trainingEpochs))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "// Start to train\n",
-    "val trainedModel = optimizer.optimize()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Predict result:\n",
-      "3.7649865,2.7541423,1.9586959,1.5578532,3.7649865\n"
-     ]
-    }
-   ],
-   "source": [
-    "val predictResult = trainedModel.predict(rddTrain)\n",
-    "val p = predictResult.take(5).map(_.toTensor.valueAt(1)).mkString(\",\")\n",
-    "println(\"Predict result:\")\n",
-    "println(p)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To test the trained model, we construct a test dataset and print the resulting _Mean Square Error_."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "5.747768\n"
-     ]
-    }
-   ],
-   "source": [
-    "val r = new scala.util.Random(100)\n",
-    "val totalLength = 10\n",
-    "val features = Tensor(totalLength, featuresDim).rand(0, 1)\n",
-    "var label = (0.4 + features.sum).toFloat\n",
-    "val prediction = sc.parallelize(0 until totalLength).map(r => Sample[Float](features(r + 1), label))\n",
-    "val predictResult = trainedModel.predict(prediction)\n",
-    "val p = predictResult.take(6).map(_.toTensor.valueAt(1))\n",
-    "val groundLabel = Tensor(T(\n",
-    "  | T(-0.47596836f),\n",
-    "  | T(-0.37598032f),\n",
-    "  | T(-0.00492062f),\n",
-    "  | T(-0.5906958f),\n",
-    "  | T(-0.12307882f),\n",
-    "  | T(-0.77907401f)))\n",
-    "\n",
-    "var mse = 0f\n",
-    "for (i <- 1 to 6) {\n",
-    "  mse += (p(i - 1) - groundLabel(i).valueAt(1)) * (p(i - 1) - groundLabel(i).valueAt(1))\n",
-    "}\n",
-    "mse /= 6f\n",
-    "println(mse)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Finally, we stop the SparkContext."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "sc.stop()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Apache Toree - Scala",
-   "language": "scala",
-   "name": "apache_toree_scala"
-  },
-  "language_info": {
-   "file_extension": ".scala",
-   "name": "scala",
-   "version": "2.11.8"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/scala/neural_networks/logistic_regression.ipynb b/notebooks/scala/neural_networks/logistic_regression.ipynb
deleted file mode 100644
index e8a7ac4..0000000
--- a/notebooks/scala/neural_networks/logistic_regression.ipynb
+++ /dev/null
@@ -1,321 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Logistic Regression"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this tutorial, we will introduce how to build a logistic regression model using BigDL. We use *MNIST* data for the experiments in this tutorial. For more information about MNIST, please refer to this [site](http://yann.lecun.com/exdb/mnist/). The first thing we need to do is to import the necessary packages and initialize the engine."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This part prepares a helper function for loading the MNIST data (note that it stores the labels 1-based; see the note after this patch series)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "import java.nio.ByteBuffer\n",
-    "import java.nio.file.{Files, Path, Paths}\n",
-    "\n",
-    "import com.intel.analytics.bigdl.dataset.ByteRecord\n",
-    "import com.intel.analytics.bigdl.utils.File\n",
-    "import scopt.OptionParser\n",
-    "\n",
-    "def load(featureFile: String, labelFile: String): Array[ByteRecord] = {\n",
-    "  val featureBuffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(featureFile)))\n",
-    "  val labelBuffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(labelFile)))\n",
-    "  \n",
-    "  val labelMagicNumber = labelBuffer.getInt()\n",
-    "  require(labelMagicNumber == 2049)\n",
-    "  val featureMagicNumber = featureBuffer.getInt()\n",
-    "  require(featureMagicNumber == 2051)\n",
-    "\n",
-    "  val labelCount = labelBuffer.getInt()\n",
-    "  val featureCount = featureBuffer.getInt()\n",
-    "  require(labelCount == featureCount)\n",
-    "\n",
-    "  val rowNum = featureBuffer.getInt()\n",
-    "  val colNum = featureBuffer.getInt()\n",
-    "\n",
-    "  val result = new Array[ByteRecord](featureCount)\n",
-    "  var i = 0\n",
-    "  while (i < featureCount) {\n",
-    "    val img = new Array[Byte]((rowNum * colNum))\n",
-    "    var y = 0\n",
-    "    while (y < rowNum) {\n",
-    "      var x = 0\n",
-    "      while (x < colNum) {\n",
-    "        img(x + y * colNum) = featureBuffer.get()\n",
-    "        x += 1\n",
-    "      }\n",
-    "      y += 1\n",
-    "    }\n",
-    "    result(i) = ByteRecord(img, labelBuffer.get().toFloat + 1.0f)\n",
-    "    i += 1\n",
-    "  }\n",
-    "\n",
-    "  result\n",
-    "}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "import org.apache.log4j.{Level, Logger}\n",
-    "import org.apache.spark.SparkContext\n",
-    "\n",
-    "import com.intel.analytics.bigdl._\n",
-    "import com.intel.analytics.bigdl.utils._\n",
-    "import com.intel.analytics.bigdl.dataset.DataSet\n",
-    "import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch, GreyImgToSample}\n",
-    "import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Module}\n",
-    "import com.intel.analytics.bigdl.numeric.NumericFloat\n",
-    "import 
com.intel.analytics.bigdl.optim._\n", - "import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter, T, Table}\n", - "import com.intel.analytics.bigdl.models.lenet.Utils._\n", - "import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, LogSoftMax, Sequential, Reshape}\n", - "import com.intel.analytics.bigdl.optim.SGD\n", - "import com.intel.analytics.bigdl.optim.Top1Accuracy\n", - "import com.intel.analytics.bigdl.tensor._" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then we get and store MNIST for training and testing. You should edit the paths below according to your system settings." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "val trainData = \"../datasets/mnist/train-images-idx3-ubyte\"\n", - "val trainLabel = \"../datasets/mnist/train-labels-idx1-ubyte\"\n", - "val validationData = \"../datasets/mnist/t10k-images-idx3-ubyte\"\n", - "val validationLabel = \"../datasets/mnist/t10k-labels-idx1-ubyte\"" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "//Parameters\n", - "val batchSize = 2048\n", - "val learningRate = 0.2\n", - "val maxEpochs = 15\n", - "\n", - "//Network Parameters\n", - "val nInput = 784 //MNIST data input (img shape: 28*28)\n", - "val nClasses = 10 //MNIST total classes (0-9 digits)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "Engine.init" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "val trainSet = \n", - " DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(batchSize)\n", - "val validationSet = \n", - " DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(batchSize)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Sequential[fca52368]{\n", - " [input -> (1) -> (2) -> (3) -> output]\n", - " (1): Reshape[75153ad2](784)\n", - " (2): Linear[aacbecbd](784 -> 10)\n", - " (3): LogSoftMax[378e7035]\n", - "}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val model = Sequential().add(Reshape(Array(28 * 28))).add(Linear(nInput, nClasses)).add(LogSoftMax())\n", - "model" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "com.intel.analytics.bigdl.optim.DistriOptimizer@4ea3ca6e" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val optimizer = Optimizer(model = model, dataset = trainSet, criterion = ClassNLLCriterion[Float]())\n", - "optimizer.setValidation(trigger = Trigger.everyEpoch, dataset = validationSet, vMethods = Array(new Top1Accuracy[Float], new Top5Accuracy[Float], new Loss[Float]))\n", - "optimizer.setOptimMethod(new SGD(learningRate=learningRate))\n", - "optimizer.setEndWhen(Trigger.maxEpoch(maxEpochs))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "can't find locality partition for partition 0 
Partition locations are (ArrayBuffer(172.168.0.21)) Candidate partition locations are\n", - "(0,List()).\n" - ] - }, - { - "data": { - "text/plain": [ - "Sequential[fca52368]{\n", - " [input -> (1) -> (2) -> (3) -> output]\n", - " (1): Reshape[75153ad2](784)\n", - " (2): Linear[aacbecbd](784 -> 10)\n", - " (3): LogSoftMax[378e7035]\n", - "}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val trainedModel = optimizer.optimize()\n", - "trainedModel" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Top1Accuracy is Accuracy(correct: 9209, count: 10000, accuracy: 0.9209)\n" - ] - } - ], - "source": [ - "val rddData = sc.parallelize(load(validationData, validationLabel), batchSize)\n", - "val transformer = BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToSample()\n", - "val evaluationSet = transformer(rddData)\n", - " \n", - "val result = model.evaluate(evaluationSet, Array(new Top1Accuracy[Float]), Some(batchSize))\n", - "\n", - "result.foreach(r => println(s\"${r._2} is ${r._1}\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8.0,3.0,2.0,1.0,5.0,2.0,5.0,10.0,7.0,10.0,1.0,7.0,10.0,1.0,2.0,6.0,10.0,8.0,4.0,5.0\n", - "8.0,3.0,2.0,1.0,5.0,2.0,5.0,10.0,6.0,10.0,1.0,7.0,10.0,1.0,2.0,6.0,10.0,8.0,4.0,5.0\n" - ] - } - ], - "source": [ - "val predictions = model.predict(evaluationSet)\n", - "val preLabels = predictions.take(20).map(_.toTensor.max(1)._2.valueAt(1)).mkString(\",\")\n", - "val labels = evaluationSet.take(20).map(_.label.valueAt(1)).mkString(\",\")\n", - "println(preLabels)\n", - "println(labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sc.stop()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Apache Toree - Scala", - "language": "scala", - "name": "apache_toree_scala" - }, - "language_info": { - "file_extension": ".scala", - "name": "scala", - "version": "2.11.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
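A note on the batch-size constraint mentioned in the deleted linear regression notebook: BigDL splits every mini-batch evenly across the nodes and cores it detects, so the batch size must be a multiple of their product. The following minimal sketch makes that check explicit; it assumes the Engine.nodeNumber() and Engine.coreNumber() accessors of com.intel.analytics.bigdl.utils.Engine are available and that Engine.init has already run, as in the notebooks.

import com.intel.analytics.bigdl.utils.Engine

// Sketch: fail fast if the batch size is incompatible with the detected parallelism.
val totalCores = Engine.nodeNumber() * Engine.coreNumber()
require(batchSize % totalCores == 0,
  s"batchSize ($batchSize) must be a multiple of nodeNumber * coreNumber ($totalCores)")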
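For the optimizer swap suggested in the linear regression notebook, here is a minimal sketch of replacing SGD with Adam on the same DistriOptimizer. The Adam constructor and its learningRate parameter are assumed from BigDL's optim package, and the learning-rate value is illustrative; Adam's adaptive per-parameter updates usually want a smaller base rate than plain SGD.

import com.intel.analytics.bigdl.optim.{Adam, Trigger}

// Sketch: same optimizer setup as the notebook, with Adam in place of SGD.
// `optimizer` is the DistriOptimizer built from model, rddTrain and MSECriterion.
optimizer.setOptimMethod(new Adam(learningRate = 0.001))
optimizer.setEndWhen(Trigger.maxEpoch(trainingEpochs))
val trainedModel = optimizer.optimize()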
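The load helper in the deleted logistic regression notebook stores each MNIST label as labelBuffer.get().toFloat + 1.0f because BigDL's ClassNLLCriterion follows the Torch convention of 1-based class labels; that is why the predicted and ground-truth labels printed near the end of the notebook range over 1.0 to 10.0 rather than 0 to 9. A minimal sketch of mapping predictions back to digits, reusing the notebook's own calls:

// Sketch: undo the 1-based label shift to recover the actual MNIST digits.
// `predictions` is the RDD returned by model.predict(evaluationSet);
// Tensor.max(1) returns (maxValues, maxIndices), and the indices are 1-based class ids.
val predictedDigits = predictions.take(20).map(_.toTensor.max(1)._2.valueAt(1) - 1)
println(predictedDigits.mkString(","))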
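Relatedly, the logistic regression notebook pairs a LogSoftMax layer with ClassNLLCriterion, which together compute the cross-entropy loss. An equivalent sketch, assuming the CrossEntropyCriterion module is available in the installed BigDL version, keeps the raw Linear outputs and folds the log-softmax into the loss:

import com.intel.analytics.bigdl.nn.{CrossEntropyCriterion, Linear, Reshape, Sequential}

// Sketch: drop the explicit LogSoftMax layer; CrossEntropyCriterion applies
// LogSoftMax plus the negative log-likelihood internally, with the same 1-based labels.
val model = Sequential().add(Reshape(Array(28 * 28))).add(Linear(nInput, nClasses))
val criterion = CrossEntropyCriterion[Float]()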