Move the python-spark images from the Debian-based python:* bases to the
Alpine-based iron/python:* bases.

The three existing Dockerfiles swap the apt/Oracle Java 8 setup for
apk/OpenJDK 8 and run their download step under /bin/sh instead of /bin/bash;
python3/spark2.0/Dockerfile is added as a new file.

Review fixes folded into the added lines (hunk line counts are unchanged):
  * mkdir -p /opt/ so the download step does not abort when /opt already exists.
  * set -o pipefail in every RUN that pipes wget into python or tar, so a failed
    download fails the build instead of being masked by the pipe.
  * apk add --no-cache replaces "apk --update add ... && rm -rf /var/cache/apk/*";
    DEBIAN_FRONTEND, which only affects apt/debconf, is dropped from the apk call.
  * Hadoop (and, in the new file, Sqoop) is fetched from archive.apache.org, the
    host the Spark download already uses, rather than a current-releases mirror.
  * The python2 images bootstrap pip from the Python 2.7 copy of get-pip.py.
  * python3/spark2.0 sets PYSPARK_PYTHON=python3, the interpreter name that
    Dockerfile itself uses to run get-pip.py.
  * CMD uses exec form.

NOTE(review): indentation of continuation lines and the position of blank
context lines were reconstructed from the hunk headers; use
`git apply --ignore-whitespace` if the context does not match byte-for-byte.
The blob ids on the `index` lines are those of the original patch and do not
reflect the fixes listed above.

diff --git a/README.md b/README.md
index 00cb9a8..924256b 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # python-spark
 
-This image is based off the [`python:2.7`](https://hub.docker.com/_/python/) image and
+This image is based off the [`iron/python:2`](https://hub.docker.com/r/iron/python/) image and
 contains Hadoop, Sqoop and Spark binaries.
 
 This is used as a base image for [`airflow-pipeline`](https://github.com/datagovsg/airflow-pipeline), a simplified setup for Airflow to launch Hadoop and Spark jobs.
diff --git a/python2/spark1.6/Dockerfile b/python2/spark1.6/Dockerfile
index d50d99f..5540e7e 100644
--- a/python2/spark1.6/Dockerfile
+++ b/python2/spark1.6/Dockerfile
@@ -1,21 +1,11 @@
-FROM python:2.7
+FROM iron/python:2
 
-# Setup Java
-RUN set -x && \
-    apt-get update && \
-    apt-get install --no-install-recommends -y software-properties-common && \
-    echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" > \
-        /etc/apt/sources.list.d/webupd8team-java.list && \
-    echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" >> \
-        /etc/apt/sources.list.d/webupd8team-java.list && \
-    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys EEA14886 && \
-    echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && \
-    apt-get update && echo yes | apt-get install -y --force-yes oracle-java8-installer && \
-    apt-get update && apt-get install oracle-java8-set-default && \
-    apt-get remove software-properties-common -y --auto-remove && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-ENV JAVA_HOME /usr/lib/jvm/java-8-oracle
+# Install pip for Python 2.7; pipefail makes a failed download fail the build
+RUN set -o pipefail && wget -qO - https://bootstrap.pypa.io/pip/2.7/get-pip.py | python
+
+# Setup Java & SSL: OpenJDK 8 JRE, OpenSSL and libc6-compat, with the Alpine v3.6 community repo enabled
+RUN echo http://dl-cdn.alpinelinux.org/alpine/v3.6/community >> /etc/apk/repositories && apk add --no-cache openjdk8-jre openssl libc6-compat
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
 
 ARG HADOOP_VERSION=2.6.1
 ARG SPARK_VERSION=1.6.1
@@ -58,9 +48,9 @@ ENV SQOOP_HOME /opt/sqoop
 ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin
 
 # Download binaries
-RUN /bin/bash -c 'set -x && \
+RUN mkdir -p /opt/ && /bin/sh -c 'set -x && set -o pipefail && \
     echo "Downloading Hadoop ${HADOOP_VERSION}" && \
-    wget -qO - http://apache.stu.edu.tw/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
+    wget -qO - http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
     tar -xz -C /opt/ && \
     mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
     echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
@@ -90,6 +80,7 @@ RUN /bin/bash -c 'set -x && \
 EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
 
 # Install kerberos client support
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y krb5-user
+RUN apk add --no-cache krb5
+
+CMD ["/bin/sh"]
 
-CMD '/bin/bash'
diff --git a/python2/spark2.0/Dockerfile b/python2/spark2.0/Dockerfile
index c3de2a8..f1a4711 100644
--- a/python2/spark2.0/Dockerfile
+++ b/python2/spark2.0/Dockerfile
@@ -1,21 +1,11 @@
-FROM python:2.7
+FROM iron/python:2
 
-# Setup Java
-RUN set -x && \
-    apt-get update && \
-    apt-get install --no-install-recommends -y software-properties-common && \
-    echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" > \
-        /etc/apt/sources.list.d/webupd8team-java.list && \
-    echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" >> \
-        /etc/apt/sources.list.d/webupd8team-java.list && \
-    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys EEA14886 && \
-    echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && \
-    apt-get update && echo yes | apt-get install -y --force-yes oracle-java8-installer && \
-    apt-get update && apt-get install oracle-java8-set-default && \
-    apt-get remove software-properties-common -y --auto-remove && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-ENV JAVA_HOME /usr/lib/jvm/java-8-oracle
+# Install pip for Python 2.7; pipefail makes a failed download fail the build
+RUN set -o pipefail && wget -qO - https://bootstrap.pypa.io/pip/2.7/get-pip.py | python
+
+# Setup Java & SSL: OpenJDK 8 JRE, OpenSSL and libc6-compat, with the Alpine v3.6 community repo enabled
+RUN echo http://dl-cdn.alpinelinux.org/alpine/v3.6/community >> /etc/apk/repositories && apk add --no-cache openjdk8-jre openssl libc6-compat
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
 
 ARG HADOOP_VERSION=2.7.3
 ARG SPARK_VERSION=2.0.1
@@ -58,9 +48,9 @@ ENV SQOOP_HOME /opt/sqoop
 ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin
 
 # Download binaries
-RUN /bin/bash -c 'set -x && \
+RUN mkdir -p /opt/ && /bin/sh -c 'set -x && set -o pipefail && \
     echo "Downloading Hadoop ${HADOOP_VERSION}" && \
-    wget -qO - http://apache.stu.edu.tw/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
+    wget -qO - http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
     tar -xz -C /opt/ && \
     mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
     echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
@@ -90,6 +80,7 @@ RUN /bin/bash -c 'set -x && \
 EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
 
 # Install kerberos client support
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y krb5-user
+RUN apk add --no-cache krb5
+
+CMD ["/bin/sh"]
 
-CMD '/bin/bash'
diff --git a/python3/spark1.6/Dockerfile b/python3/spark1.6/Dockerfile
index cbd356c..8de0f3f 100644
--- a/python3/spark1.6/Dockerfile
+++ b/python3/spark1.6/Dockerfile
@@ -1,21 +1,11 @@
-FROM python:3.5
+FROM iron/python:3
 
-# Setup Java
-RUN set -x && \
-    apt-get update && \
-    apt-get install --no-install-recommends -y software-properties-common && \
-    echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" > \
-        /etc/apt/sources.list.d/webupd8team-java.list && \
-    echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu xenial main" >> \
-        /etc/apt/sources.list.d/webupd8team-java.list && \
-    apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys EEA14886 && \
-    echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && \
-    apt-get update && echo yes | apt-get install -y --force-yes oracle-java8-installer && \
-    apt-get update && apt-get install oracle-java8-set-default && \
-    apt-get remove software-properties-common -y --auto-remove && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-ENV JAVA_HOME /usr/lib/jvm/java-8-oracle
+# Install pip for Python 3; pipefail makes a failed download fail the build
+RUN set -o pipefail && wget -qO - https://bootstrap.pypa.io/get-pip.py | python3
+
+# Setup Java & SSL: OpenJDK 8 JRE, OpenSSL and libc6-compat, with the Alpine v3.6 community repo enabled
+RUN echo http://dl-cdn.alpinelinux.org/alpine/v3.6/community >> /etc/apk/repositories && apk add --no-cache openjdk8-jre openssl libc6-compat
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
 
 ARG HADOOP_VERSION=2.6.1
 ARG SPARK_VERSION=1.6.1
@@ -58,9 +48,9 @@ ENV SQOOP_HOME /opt/sqoop
 ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin
 
 # Download binaries
-RUN /bin/bash -c 'set -x && \
+RUN mkdir -p /opt/ && /bin/sh -c 'set -x && set -o pipefail && \
     echo "Downloading Hadoop ${HADOOP_VERSION}" && \
-    wget -qO - http://apache.stu.edu.tw/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
+    wget -qO - http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
     tar -xz -C /opt/ && \
     mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
     echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
@@ -90,6 +80,7 @@ RUN /bin/bash -c 'set -x && \
 EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
 
 # Install kerberos client support
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y krb5-user
+RUN apk add --no-cache krb5
+
+CMD ["/bin/sh"]
 
-CMD '/bin/bash'
diff --git a/python3/spark2.0/Dockerfile b/python3/spark2.0/Dockerfile
new file mode 100644
index 0000000..66d5c30
--- /dev/null
+++ b/python3/spark2.0/Dockerfile
@@ -0,0 +1,86 @@
+FROM iron/python:3
+
+# Install pip for Python 3; pipefail makes a failed download fail the build
+RUN set -o pipefail && wget -qO - https://bootstrap.pypa.io/get-pip.py | python3
+
+# Setup Java & SSL: OpenJDK 8 JRE, OpenSSL and libc6-compat, with the Alpine v3.6 community repo enabled
+RUN echo http://dl-cdn.alpinelinux.org/alpine/v3.6/community >> /etc/apk/repositories && apk add --no-cache openjdk8-jre openssl libc6-compat
+ENV JAVA_HOME /usr/lib/jvm/java-1.8-openjdk
+
+ARG HADOOP_VERSION=2.7.3
+ARG SPARK_VERSION=2.0.1
+
+# Setup Hadoop variables
+ENV HADOOP_HOME /opt/hadoop
+ENV PATH ${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin
+ENV HADOOP_MAPRED_HOME ${HADOOP_HOME}
+ENV HADOOP_COMMON_HOME ${HADOOP_HOME}
+ENV HADOOP_HDFS_HOME ${HADOOP_HOME}
+ENV YARN_HOME ${HADOOP_HOME}
+ENV HADOOP_COMMON_LIB_NATIVE_DIR ${HADOOP_HOME}/lib/native
+ENV HADOOP_OPTS "-Djava.library.path=${HADOOP_HOME}/lib"
+ENV HDFS_CONF_DIR ${HADOOP_HOME}/etc/hadoop
+ENV YARN_CONF_DIR ${HADOOP_HOME}/etc/hadoop
+ENV HADOOP_CONF_DIR ${HADOOP_HOME}/etc/hadoop
+
+# Setup Hive
+ENV HIVE_CONF_DIR ${HADOOP_CONF_DIR}
+
+# Setup Spark (PYSPARK_PYTHON names the interpreter used above to run get-pip.py)
+ENV SPARK_HOME=/opt/spark-${SPARK_VERSION}
+ENV PYSPARK_PYTHON=python3
+ENV PATH=$PATH:${SPARK_HOME}/bin
+
+# Set Python Spark 2 specific settings
+ENV PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-csv_2.11:1.5.0,com.databricks:spark-avro_2.11:3.1.0,graphframes:graphframes:0.5.0-spark2.0-s_2.11 pyspark-shell"
+ENV PYTHONPATH=${SPARK_HOME}/python:${SPARK_HOME}/python/lib/py4j-0.10.3-src.zip
+
+# Expose the relevant ports and set up the port settings
+ENV SPARK_MASTER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
+ENV SPARK_WORKER_OPTS="-Dspark.driver.port=7001 -Dspark.fileserver.port=7002 -Dspark.broadcast.port=7003 -Dspark.replClassServer.port=7004 -Dspark.blockManager.port=7005 -Dspark.executor.port=7006 -Dspark.ui.port=4040 -Dspark.broadcast.factory=org.apache.spark.broadcast.HttpBroadcastFactory"
+ENV SPARK_MASTER_PORT 7077
+ENV SPARK_MASTER_WEBUI_PORT 8080
+ENV SPARK_WORKER_PORT 8888
+ENV SPARK_WORKER_WEBUI_PORT 8081
+
+# Set up Sqoop
+ENV SQOOP_HOME /opt/sqoop
+ENV PATH ${PATH}:${SQOOP_HOME}/bin:${HADOOP_HOME}/bin
+
+# Download binaries (single layer; pipefail so a failed wget is not masked by tar)
+RUN mkdir -p /opt/ && /bin/sh -c 'set -x && set -o pipefail && \
+    echo "Downloading Hadoop ${HADOOP_VERSION}" && \
+    wget -qO - http://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | \
+    tar -xz -C /opt/ && \
+    mv /opt/hadoop-${HADOOP_VERSION} /opt/hadoop && \
+    echo "Downloading Spark ${SPARK_VERSION} for Hadoop ${HADOOP_VERSION:0:3}" && \
+    wget -qO - http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3}.tgz |\
+    tar -xz -C /opt/ && \
+    mv /opt/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION:0:3} /opt/spark-${SPARK_VERSION} && \
+    echo "Downloading Spark packages" && \
+    wget -q http://repo1.maven.org/maven2/com/databricks/spark-avro_2.11/3.1.0/spark-avro_2.11-3.1.0.jar -P ${SPARK_HOME}/jars && \
+    wget -q http://repo1.maven.org/maven2/com/databricks/spark-csv_2.11/1.5.0/spark-csv_2.11-1.5.0.jar -P ${SPARK_HOME}/jars && \
+    echo "Downloading Sqoop" && \
+    wget -qO - http://archive.apache.org/dist/sqoop/1.4.6/sqoop-1.4.6.bin__hadoop-2.0.4-alpha.tar.gz | tar -xz -C /opt && \
+    cd /opt && ln -s ./sqoop-1.4.6.bin__hadoop-2.0.4-alpha sqoop && \
+    echo "Downloading the JDBC drivers for Postgresql" && \
+    wget -qP /opt/sqoop/lib/ https://jdbc.postgresql.org/download/postgresql-9.4-1201.jdbc4.jar && \
+    echo "Downloading the JDBC drivers for MySQL" && \
+    wget -qP /tmp/ http://dev.mysql.com/get/Downloads/Connector-J/mysql-connector-java-5.1.37.tar.gz && \
+    tar -C /tmp/ -xzf /tmp/mysql-connector-java-5.1.37.tar.gz && \
+    cp /tmp/mysql-connector-java-5.1.37/mysql-connector-java-5.1.37-bin.jar /opt/sqoop/lib/ && \
+    echo "Downloading the JDBC drivers for MS SQL" && \
+    wget -qO - https://download.microsoft.com/download/0/2/A/02AAE597-3865-456C-AE7F-613F99F850A8/enu/sqljdbc_6.0.8112.100_enu.tar.gz | \
+    tar xz -C /tmp && \
+    mv /tmp/sqljdbc_6.0/enu/jre7/sqljdbc41.jar ${SQOOP_HOME}/lib && \
+    rm -r /tmp/sqljdbc_6.0 && \
+    echo "Cleaning up" && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*'
+
+EXPOSE 8080 7077 8888 8081 4040 7001 7002 7003 7004 7005 7006
+
+# Install kerberos client support
+RUN apk add --no-cache krb5
+
+CMD ["/bin/sh"]
+