This repository was archived by the owner on Jan 29, 2024. It is now read-only.

Commit 5998996
Author: Kristian Østergaard Martensen

Druid setup on Docker

0 parents  commit 5998996

19 files changed: +676 −0 lines changed

.dockerignore (+6)

@@ -0,0 +1,6 @@
logs/
deep_storage/
data/
docker-compose.yml
README.md
wikiticker-*

.gitignore (+2)

@@ -0,0 +1,2 @@
logs/*
deep_storage/*

Dockerfile (+53)

@@ -0,0 +1,53 @@
FROM anapsix/alpine-java:8_server-jre_unlimited

MAINTAINER Kristian Martensen @ Linkfire <[email protected]>
# thanks to Anastas Dancha <[email protected]> for the alpine-java image
# this image is very heavily inspired by the znly/docker-druid image by jbaptiste <[email protected]>

# Don't change these at runtime
ENV DRUID_VERSION 0.12.1
ENV DRUID_DIR /opt/druid

# These must be set, but do not necessarily need to change at runtime
ENV MYSQL_HOST mysql
ENV MYSQL_PORT 3306
ENV MYSQL_DBNAME druid
ENV MYSQL_USERNAME druid
ENV MYSQL_PASSWORD druid
ENV ZOOKEEPER_HOST zookeeper

# These variables must be adjusted at runtime
ENV S3_STORAGE_BUCKET druid-deep-storage
ENV S3_INDEXING_BUCKET druid-indexing
ENV S3_ACCESS_KEY xxxxxxxxxxxx
ENV S3_SECRET_KEY xxxxxxxxxxxx

# Optional variables
ENV DRUID_XMX '-'
ENV DRUID_XMS '-'
ENV DRUID_NEWSIZE '-'
ENV DRUID_MAXNEWSIZE '-'
ENV DRUID_MAXDIRECTMEMORY '-'
ENV DRUID_HOSTNAME '-'
ENV DRUID_LOGLEVEL '-'
ENV DRUID_PROCESS_BUFFER '-'
ENV DRUID_PROCESS_THREADS '-'

RUN set -ex \
    && apk add --no-cache bash curl \
    && curl http://static.druid.io/artifacts/releases/druid-$DRUID_VERSION-bin.tar.gz | tar -xzf - -C /opt \
    && ln -s $DRUID_DIR-$DRUID_VERSION $DRUID_DIR \
    && cp $DRUID_DIR/extensions/druid-hdfs-storage/aws-java-sdk-s3-1.10.77.jar $DRUID_DIR/lib \
    && cp $DRUID_DIR/extensions/druid-hdfs-storage/hadoop-aws-2.7.3.jar $DRUID_DIR/lib \
    && mkdir $DRUID_DIR/log \
    && mkdir -p $DRUID_DIR/var/tmp \
    && mkdir -p $DRUID_DIR/var/druid/hadoop-tmp \
    && rm -rf $DRUID_DIR/quickstart $DRUID_DIR/conf-quickstart $DRUID_DIR/bin/init $DRUID_DIR/bin/generate-example-metrics $DRUID_DIR/bin/jconsole.sh \
    && curl http://static.druid.io/artifacts/releases/mysql-metadata-storage-$DRUID_VERSION.tar.gz | tar -xzf - -C $DRUID_DIR/extensions \
    && java -classpath "/opt/druid/lib/*" io.druid.cli.Main tools pull-deps --clean -c io.druid.extensions.contrib:kafka-emitter:0.12.0 --no-default-hadoop \
    && mv /extensions/kafka-emitter /opt/druid/extensions/

COPY conf $DRUID_DIR/conf
COPY start-druid.sh /start-druid.sh

ENTRYPOINT ["/start-druid.sh"]
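The S3 and tuning variables above are ordinary environment variables, so a container built from this image can be configured at start time. A minimal sketch of building and running it standalone (the tag `linkfire/druid` is hypothetical, and how start-druid.sh decides which Druid service to launch is not shown in this commit):

```
docker build -t linkfire/druid .

docker run \
  -e S3_STORAGE_BUCKET=my-deep-storage \
  -e S3_ACCESS_KEY=... \
  -e S3_SECRET_KEY=... \
  -e DRUID_XMX=4g \
  -e DRUID_LOGLEVEL=info \
  linkfire/druid
```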

README.md (+36)

@@ -0,0 +1,36 @@
Linkfire DevDataOps assessment
================

What to do?
===========

First of all, clone this repository:

```
git clone git@github.com:getlinkfire/devDataOps-assesment.git
```

Now, given [this Docker Compose file](./docker-compose.yml), stand up a [Druid](http://druid.io) cluster; a sketch of the commands follows the service list below.

The compose file is going to launch:

- 1 zookeeper node
- 1 MySQL database
- 1 Kafka message broker

and the following druid services:

- 1 broker
- 1 middlemanager
- 1 historical
- 1 coordinator/overlord
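A minimal sketch of bringing the stack up (the service name in the logs command is an assumption; use the names defined in the compose file):

```
docker-compose up -d                 # build if needed, then start everything detached
docker-compose ps                    # each service should report "Up"
docker-compose logs -f coordinator   # tail one service while it starts (name assumed)
```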
This will take a couple of minutes, but once it is up, you will have:

- The Druid cluster dashboard at http://localhost:3001/#/
- The Druid indexing console at http://localhost:3001/console.html

Now that you have a running cluster, [ingest](wikiticker-index.json) the data found in data/wikiticker-2015-09-12-sampled.json.gz.
Once your ingestion job has successfully executed, perform [a simple query](wikiticker-top-pages.json) on the freshly created *datasource*. Both steps are sketched below.
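A sketch of both steps against Druid's standard HTTP APIs. The overlord task endpoint and the broker query endpoint are stock Druid; the localhost ports are assumptions (3001 matches the console URL above, and 8082 is the broker's container port, so check the port mappings in the compose file):

```
# Submit the batch indexing task to the overlord
curl -X POST -H 'Content-Type: application/json' \
  -d @wikiticker-index.json \
  http://localhost:3001/druid/indexer/v1/task

# Once the task shows SUCCESS in the indexing console, query the broker
curl -X POST -H 'Content-Type: application/json' \
  -d @wikiticker-top-pages.json \
  'http://localhost:8082/druid/v2/?pretty'
```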
The dataset is kindly borrowed from the [Druid quickstart tutorial](http://druid.io/docs/0.12.1/tutorials/quickstart.html), so you may find some inspiration there.
Read a little about the [Druid indexing service](http://druid.io/docs/0.12.1/design/indexing-service.html) if you like ;)
conf/druid/_common/common.runtime.properties (+77)

@@ -0,0 +1,77 @@
#
# Licensed to Metamarkets Group Inc. (Metamarkets) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Metamarkets licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

#
# Extensions
#
druid.extensions.directory=/opt/druid/extensions
druid.extensions.loadList=["druid-kafka-eight", "druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "mysql-metadata-storage", "druid-kafka-indexing-service", "druid-stats", "kafka-emitter"]

#
# Logging
#
# Log all runtime properties on startup
druid.startup.logging.logProperties=true

#
# Zookeeper
#
druid.zk.service.host=${ZOOKEEPER_HOST}
druid.zk.paths.base=/druid

#
# Metadata storage
#
druid.metadata.storage.type=mysql
druid.metadata.storage.connector.connectURI=jdbc:mysql://${MYSQL_HOST}:${MYSQL_PORT}/${MYSQL_DBNAME}
druid.metadata.storage.connector.user=${MYSQL_USERNAME}
druid.metadata.storage.connector.password=${MYSQL_PASSWORD}

#
# Deep storage
#
druid.storage.type=local
druid.storage.storageDirectory=var/druid/segments

#
# Indexing service logs
#
druid.indexer.logs.type=file
druid.indexer.logs.directory=var/druid/indexing-logs

#
# Service discovery
#
druid.selectors.indexing.serviceName=druid/overlord
druid.selectors.coordinator.serviceName=druid/coordinator

#
# Monitoring
#
druid.emitter=kafka
druid.emitter.kafka.bootstrap.servers=${KAFKA_BOOTSTRAP}
druid.emitter.kafka.metric.topic=druid-metrics
druid.emitter.kafka.alert.topic=druid-alerts
druid.emitter.kafka.producer.config={"max.block.ms":10000}
druid.emitter.kafka.clusterName=${ENVIRONMENT}

#
# Javascript post-processors
#
druid.javascript.enabled=true
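With the kafka-emitter extension loaded, every Druid service publishes its metrics and alerts to the two topics configured above. A quick way to confirm the emitter is wired up, sketched with the stock Kafka console consumer (the bootstrap address is an assumption; use whatever ${KAFKA_BOOTSTRAP} resolves to in your setup):

```
kafka-console-consumer.sh \
  --bootstrap-server localhost:9092 \
  --topic druid-metrics \
  --from-beginning
```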

conf/druid/_common/log4j2.xml (+13)

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8" ?>
<Configuration status="WARN">
  <Appenders>
    <Console name="Console" target="SYSTEM_OUT">
      <PatternLayout pattern="${hostName} %d{ISO8601} %p [%t] %c - %m%n"/>
    </Console>
  </Appenders>
  <Loggers>
    <Root level="warn">
      <AppenderRef ref="Console"/>
    </Root>
  </Loggers>
</Configuration>

conf/druid/broker/jvm.config (+13)

@@ -0,0 +1,13 @@
-server
-Xmx10g
-Xms10g
-XX:NewSize=2g
-XX:MaxNewSize=2g
-XX:MaxDirectMemorySize=4096m
-XX:+ExitOnOutOfMemoryError
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp

conf/druid/broker/runtime.properties (+40)

@@ -0,0 +1,40 @@
# Advertised hostname.
druid.host=broker
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/broker
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8082

# Determines how the broker balances connections to historical nodes.
druid.broker.balancer.type=connectionCount

# Size of connection pool for the Broker to connect to historical and real-time nodes.
druid.broker.http.numConnections=32
# Number of threads for HTTP requests.
druid.server.http.numThreads=21
# Query timeout
druid.server.http.defaultQueryTimeout=60000
# The timeout for data reads.
druid.broker.http.readTimeout=PT1M
# The Jetty max idle time for a connection.
druid.server.http.maxIdleTime=PT2M

# This specifies a buffer size for the storage of intermediate results.
# The computation engine in both the Historical and Realtime nodes will use a
# scratch buffer of this size to do all of their intermediate computations
# off-heap.
# Larger values allow for more aggregations in a single pass over the data
# while smaller values can require more passes depending on the query that is
# being executed.
druid.processing.buffer.sizeBytes=1073741824
# The number of processing threads to have available for parallel processing of segments.
druid.processing.numThreads=5

druid.broker.cache.useCache=true
druid.broker.cache.populateCache=true
druid.cache.l1.type=local
druid.cache.l1.sizeInBytes=2000000000
druid.cache.l2.type=memcached
druid.cache.l2.hosts=memcached:11211

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor","io.druid.server.metrics.QueryCountStatsMonitor"]
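The off-heap buffers described above come out of direct memory, so the buffer size and thread count constrain the JVM's -XX:MaxDirectMemorySize. A worked sketch of the rule of thumb from the Druid configuration docs (druid.processing.numMergeBuffers is not set here; its 0.12 default of max(2, numThreads/4) is assumed):

```
# direct memory needed ≈ sizeBytes * (numMergeBuffers + numThreads + 1)
BUFFER=1073741824   # druid.processing.buffer.sizeBytes (1 GiB)
THREADS=5           # druid.processing.numThreads
MERGE=2             # assumed default numMergeBuffers
echo "$(( BUFFER * (MERGE + THREADS + 1) / 1024 / 1024 / 1024 )) GiB direct memory"   # -> 8 GiB
```

That estimate lands above the -XX:MaxDirectMemorySize=4096m in the broker's jvm.config, so the headroom there is worth double-checking.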

conf/druid/coordinator/jvm.config (+11)

@@ -0,0 +1,11 @@
-server
-Xmx3g
-Xms3g
-XX:NewSize=512m
-XX:MaxNewSize=512m
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp
conf/druid/coordinator/runtime.properties (+30)

@@ -0,0 +1,30 @@
# Advertised hostname.
druid.host=coordinator
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/coordinator
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8081

# The Coordinator assumes it has an up-to-date view of the state of the world when it runs; however, the current ZK interaction code is written in a way that doesn't let the Coordinator know for a fact that it's done loading the current state of the world. This delay is a hack to give it enough time to believe it has all the data.
druid.coordinator.startDelay=PT60S
# The run period for the coordinator. The coordinator operates by maintaining the current state of the world in memory and periodically looking at the set of segments available and segments being served to decide whether any changes need to be made to the data topology. This property sets the delay between each of these runs.
druid.coordinator.period=PT20S

# Boolean flag for whether or not the coordinator should try to merge small segments into a more optimal segment size.
druid.coordinator.merge.on=true

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor"]
# Distribute data evenly among historicals
druid.coordinator.balancer.strategy=diskNormalized

# Might as well...
druid.coordinator.asOverlord.enabled=true
druid.coordinator.asOverlord.overlordService=druid/overlord

# Sleep this long before starting overlord queue management. This can be useful to give a cluster time to re-orient itself after e.g. a widespread network issue.
druid.indexer.queue.startDelay=PT30S
# Sleep this long when overlord queue management throws an exception before trying again.
druid.indexer.queue.restartDelay=PT30S

druid.indexer.runner.type=remote
druid.indexer.storage.type=metadata
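Because asOverlord is enabled, the coordinator process also serves the overlord APIs, so there is no separate overlord container to probe. A sketch of sanity-checking the combined service with standard Druid endpoints (the host port is an assumption; it must match the compose port mapping for the coordinator's container port 8081):

```
# The elected coordinator leader
curl http://localhost:8081/druid/coordinator/v1/leader

# The elected overlord leader, served by the same process when asOverlord is on
curl http://localhost:8081/druid/indexer/v1/leader
```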

conf/druid/historical/jvm.config (+12)

@@ -0,0 +1,12 @@
-server
-Xms8g
-Xmx8g
-XX:MaxDirectMemorySize=8g
-XX:NewSize=1g
-XX:MaxNewSize=1g
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp
conf/druid/historical/runtime.properties (+27)

@@ -0,0 +1,27 @@
# Advertised hostname.
druid.host=historical
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/historical
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8083

# Number of threads for HTTP requests.
druid.server.http.numThreads=21
# The Jetty max idle time for a connection.
druid.server.http.maxIdleTime=PT2M
# Query timeout
druid.server.http.defaultQueryTimeout=60000

# This specifies a buffer size for the storage of intermediate results. The computation engine in both the Historical and Realtime nodes will use a scratch buffer of this size to do all of their intermediate computations off-heap. Larger values allow for more aggregations in a single pass over the data while smaller values can require more passes depending on the query that is being executed.
druid.processing.buffer.sizeBytes=1073741824
# The number of processing threads to have available for parallel processing of segments. Our rule of thumb is num_cores - 1, which means that even under heavy load there will still be one core available to do background tasks like talking with ZooKeeper and pulling down segments. If only one core is available, this property defaults to the value 1.
druid.processing.numThreads=7

# Segments assigned to a Historical node are first stored on the local file system (in a disk cache) and then served by the Historical node. These locations define where that local cache resides.
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize"\:130000000000}]
# The maximum number of bytes-worth of segments that the node wants assigned to it. This is not a limit that Historical nodes actually enforce, just a value published to the Coordinator node so it can plan accordingly.
druid.server.maxSize=500000000000
# How frequently to announce segments while segments are loading from cache. Set this value to zero to wait for all segments to be loaded before announcing.
druid.segmentCache.announceIntervalMillis=0

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor","io.druid.server.metrics.HistoricalMetricsMonitor","io.druid.server.metrics.QueryCountStatsMonitor"]
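The two size settings above are related: druid.server.maxSize is what the node advertises to the coordinator, while the segment-cache maxSize is the disk actually available locally. A quick shell-arithmetic sketch makes the gap visible:

```
echo "$(( 130000000000 / 1000**3 )) GB segment cache"    # 130 GB of local cache
echo "$(( 500000000000 / 1000**3 )) GB advertised max"   # 500 GB published to the coordinator
```

The Druid docs suggest keeping maxSize at or below the summed segment-cache capacity, so the advertised 500 GB may be worth revisiting.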

conf/druid/middleManager/jvm.config (+12)

@@ -0,0 +1,12 @@
-server
-Xms1g
-Xmx1g
-XX:NewSize=1g
-XX:MaxNewSize=1g
-XX:+ExitOnOutOfMemoryError
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp
conf/druid/middleManager/runtime.properties (+28)

@@ -0,0 +1,28 @@
# Advertised hostname.
druid.host=middlemanager
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/middleManager
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8091

# Number of tasks per middleManager
druid.worker.capacity=1

# Task launch parameters
# druid.indexer.runner.javaOpts: -server -Xmx5g -Xms5g -XX:NewSize=1g -XX:MaxDirectMemorySize=10g -XX:+UseConcMarkSweepGC -XX:+UseStringDeduplication -XX:MaxGCPauseMillis=300 -XX:ParallelGCThreads=4 -XX:ConcGCThreads=2 -XX:InitiatingHeapOccupancyPercent=85 -verbosegc -XX:+PrintGC -XX:+PrintGCDetails -XX:+PrintGCApplicationStoppedTime -XX:+PrintSafepointStatistics -XX:PrintSafepointStatisticsCount=1 -XX:+SafepointTimeout -XX:SafepointTimeoutDelay=500 -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.runner.javaOpts=-server -Xms25m -Xmx25m -XX:MaxNewSize=20m -XX:MaxDirectMemorySize=25m -XX:+ExitOnOutOfMemoryError -XX:+UseG1GC -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
druid.peon.taskActionClient.retry.maxRetryCount=10

# HTTP server threads
druid.server.http.numThreads=25

# Processing threads and buffers
druid.processing.buffer.sizeBytes=4194304
druid.processing.numThreads=2

# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp
druid.indexer.task.defaultHadoopCoordinates=["org.apache.hadoop:hadoop-client:2.7.3"]

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor"]
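Each indexing task runs as a separate peon JVM launched with the javaOpts above, so the middleManager's total memory footprint is roughly its own heap plus worker capacity times the per-peon heap and direct memory. A sketch of that arithmetic for this config:

```
CAPACITY=1        # druid.worker.capacity
PEON_HEAP=25      # -Xmx25m from druid.indexer.runner.javaOpts, in MB
PEON_DIRECT=25    # -XX:MaxDirectMemorySize=25m, in MB
MM_HEAP=1024      # -Xmx1g from jvm.config, in MB
echo "$(( MM_HEAP + CAPACITY * (PEON_HEAP + PEON_DIRECT) )) MB total"   # -> 1074 MB
```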
data/wikiticker-2015-09-12-sampled.json.gz (2.26 MB; binary file not shown)
