This repository was archived by the owner on Jan 29, 2024. It is now read-only.

Commit 5998996
Author: Kristian Østergaard Martensen

Druid setup on Docker

0 parents  commit 5998996

19 files changed: +676 −0 lines changed

.dockerignore (+6)

@@ -0,0 +1,6 @@
logs/
deep_storage/
data/
docker-compose.yml
README.md
wikiticker-*

.gitignore (+2)

@@ -0,0 +1,2 @@
logs/*
deep_storage/*

Dockerfile (+53)

@@ -0,0 +1,53 @@
FROM anapsix/alpine-java:8_server-jre_unlimited

MAINTAINER Kristian Martensen @ Linkfire <[email protected]>
# thanks to Anastas Dancha <[email protected]> for the alpine-java image
# this image is very heavily inspired by the znly/docker-druid image by jbaptiste <[email protected]>

# Don't change these at runtime
ENV DRUID_VERSION 0.12.1
ENV DRUID_DIR /opt/druid

# These must be set, but do not necessarily need to change at runtime
ENV MYSQL_HOST mysql
ENV MYSQL_PORT 3306
ENV MYSQL_DBNAME druid
ENV MYSQL_USERNAME druid
ENV MYSQL_PASSWORD druid
ENV ZOOKEEPER_HOST zookeeper

# These variables must be adjusted at runtime
ENV S3_STORAGE_BUCKET druid-deep-storage
ENV S3_INDEXING_BUCKET druid-indexing
ENV S3_ACCESS_KEY xxxxxxxxxxxx
ENV S3_SECRET_KEY xxxxxxxxxxxx

# Optional variables
ENV DRUID_XMX '-'
ENV DRUID_XMS '-'
ENV DRUID_NEWSIZE '-'
ENV DRUID_MAXNEWSIZE '-'
ENV DRUID_MAXDIRECTMEMORY '-'
ENV DRUID_HOSTNAME '-'
ENV DRUID_LOGLEVEL '-'
ENV DRUID_PROCESS_BUFFER '-'
ENV DRUID_PROCESS_THREADS '-'

RUN set -ex \
    && apk add --no-cache bash curl \
    && curl http://static.druid.io/artifacts/releases/druid-$DRUID_VERSION-bin.tar.gz | tar -xzf - -C /opt \
    && ln -s $DRUID_DIR-$DRUID_VERSION $DRUID_DIR \
    && cp $DRUID_DIR/extensions/druid-hdfs-storage/aws-java-sdk-s3-1.10.77.jar $DRUID_DIR/lib \
    && cp $DRUID_DIR/extensions/druid-hdfs-storage/hadoop-aws-2.7.3.jar $DRUID_DIR/lib \
    && mkdir $DRUID_DIR/log \
    && mkdir -p $DRUID_DIR/var/tmp \
    && mkdir -p $DRUID_DIR/var/druid/hadoop-tmp \
    && rm -rf $DRUID_DIR/quickstart $DRUID_DIR/conf-quickstart $DRUID_DIR/bin/init $DRUID_DIR/bin/generate-example-metrics $DRUID_DIR/bin/jconsole.sh \
    && curl http://static.druid.io/artifacts/releases/mysql-metadata-storage-$DRUID_VERSION.tar.gz | tar -xzf - -C $DRUID_DIR/extensions \
    && java -classpath "/opt/druid/lib/*" io.druid.cli.Main tools pull-deps --clean -c io.druid.extensions.contrib:kafka-emitter:0.12.0 --no-default-hadoop \
    && mv /extensions/kafka-emitter /opt/druid/extensions/

COPY conf $DRUID_DIR/conf
COPY start-druid.sh /start-druid.sh

ENTRYPOINT ["/start-druid.sh"]
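The S3 and tuning variables above are ordinary environment variables, so a container built from this image can be configured at start time. A minimal sketch of building and running it standalone (the tag `linkfire/druid` is hypothetical, and how start-druid.sh decides which Druid service to launch is not shown in this commit):

```
docker build -t linkfire/druid .

docker run \
  -e S3_STORAGE_BUCKET=my-deep-storage \
  -e S3_ACCESS_KEY=... \
  -e S3_SECRET_KEY=... \
  -e DRUID_XMX=4g \
  -e DRUID_LOGLEVEL=info \
  linkfire/druid
```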

README.md (+36)

@@ -0,0 +1,36 @@
Linkfire DevDataOps assessment
================

What to do?
===========

First of all, clone this repository:

```
git clone git@github.com:getlinkfire/devDataOps-assesment.git
```

Now, given [this Docker Compose file](./docker-compose.yml), stand up a [Druid](http://druid.io) cluster; a sketch of the commands follows the service list below.

The compose file is going to launch:

- 1 zookeeper node
- 1 MySQL database
- 1 Kafka message broker

and the following druid services:

- 1 broker
- 1 middlemanager
- 1 historical
- 1 coordinator/overlord
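A minimal sketch of bringing the stack up (the service name in the logs command is an assumption; use the names defined in the compose file):

```
docker-compose up -d                 # build if needed, then start everything detached
docker-compose ps                    # each service should report "Up"
docker-compose logs -f coordinator   # tail one service while it starts (name assumed)
```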
This will take a couple of minutes, but once it is up, you will have:

- The Druid cluster dashboard at http://localhost:3001/#/
- The Druid indexing console at http://localhost:3001/console.html

Now that you have a running cluster, [ingest](wikiticker-index.json) the data found in data/wikiticker-2015-09-12-sampled.json.gz.
Once your ingestion job has successfully executed, perform [a simple query](wikiticker-top-pages.json) on the freshly created *datasource*. Both steps are sketched below.
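A sketch of both steps against Druid's standard HTTP APIs. The overlord task endpoint and the broker query endpoint are stock Druid; the localhost ports are assumptions (3001 matches the console URL above, and 8082 is the broker's container port, so check the port mappings in the compose file):

```
# Submit the batch indexing task to the overlord
curl -X POST -H 'Content-Type: application/json' \
  -d @wikiticker-index.json \
  http://localhost:3001/druid/indexer/v1/task

# Once the task shows SUCCESS in the indexing console, query the broker
curl -X POST -H 'Content-Type: application/json' \
  -d @wikiticker-top-pages.json \
  'http://localhost:8082/druid/v2/?pretty'
```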
The dataset is kindly borrowed from the [Druid quickstart tutorial](http://druid.io/docs/0.12.1/tutorials/quickstart.html), so you may find some inspiration there.
Read a little about the [Druid indexing service](http://druid.io/docs/0.12.1/design/indexing-service.html) if you like ;)
conf/druid/_common/common.runtime.properties (+77)

@@ -0,0 +1,77 @@
#
# Licensed to Metamarkets Group Inc. (Metamarkets) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. Metamarkets licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

#
# Extensions
#
druid.extensions.directory=/opt/druid/extensions
druid.extensions.loadList=["druid-kafka-eight", "druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "mysql-metadata-storage", "druid-kafka-indexing-service", "druid-stats", "kafka-emitter"]

#
# Logging
#
# Log all runtime properties on startup
druid.startup.logging.logProperties=true

#
# Zookeeper
#
druid.zk.service.host=${ZOOKEEPER_HOST}
druid.zk.paths.base=/druid

#
# Metadata storage
#
druid.metadata.storage.type=mysql
druid.metadata.storage.connector.connectURI=jdbc:mysql://${MYSQL_HOST}:${MYSQL_PORT}/${MYSQL_DBNAME}
druid.metadata.storage.connector.user=${MYSQL_USERNAME}
druid.metadata.storage.connector.password=${MYSQL_PASSWORD}

#
# Deep storage
#
druid.storage.type=local
druid.storage.storageDirectory=var/druid/segments

#
# Indexing service logs
#
druid.indexer.logs.type=file
druid.indexer.logs.directory=var/druid/indexing-logs

#
# Service discovery
#
druid.selectors.indexing.serviceName=druid/overlord
druid.selectors.coordinator.serviceName=druid/coordinator

#
# Monitoring
#
druid.emitter=kafka
druid.emitter.kafka.bootstrap.servers=${KAFKA_BOOTSTRAP}
druid.emitter.kafka.metric.topic=druid-metrics
druid.emitter.kafka.alert.topic=druid-alerts
druid.emitter.kafka.producer.config={"max.block.ms":10000}
druid.emitter.kafka.clusterName=${ENVIRONMENT}

#
# Javascript post-processors
#
druid.javascript.enabled=true
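With the kafka-emitter extension loaded, every Druid service publishes its metrics and alerts to the two topics configured above. A quick way to confirm the emitter is wired up, sketched with the stock Kafka console consumer (the bootstrap address is an assumption; use whatever ${KAFKA_BOOTSTRAP} resolves to in your setup):

```
kafka-console-consumer.sh \
  --bootstrap-server localhost:9092 \
  --topic druid-metrics \
  --from-beginning
```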

conf/druid/_common/log4j2.xml (+13)

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8" ?>
<Configuration status="WARN">
  <Appenders>
    <Console name="Console" target="SYSTEM_OUT">
      <PatternLayout pattern="${hostName} %d{ISO8601} %p [%t] %c - %m%n"/>
    </Console>
  </Appenders>
  <Loggers>
    <Root level="warn">
      <AppenderRef ref="Console"/>
    </Root>
  </Loggers>
</Configuration>

conf/druid/broker/jvm.config (+13)

@@ -0,0 +1,13 @@
-server
-Xmx10g
-Xms10g
-XX:NewSize=2g
-XX:MaxNewSize=2g
-XX:MaxDirectMemorySize=4096m
-XX:+ExitOnOutOfMemoryError
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp

conf/druid/broker/runtime.properties (+40)

@@ -0,0 +1,40 @@
# Advertised hostname.
druid.host=broker
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/broker
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8082

# Determines how the broker balances connections to historical nodes.
druid.broker.balancer.type=connectionCount

# Size of connection pool for the Broker to connect to historical and real-time nodes.
druid.broker.http.numConnections=32
# Number of threads for HTTP requests.
druid.server.http.numThreads=21
# Query timeout
druid.server.http.defaultQueryTimeout=60000
# The timeout for data reads.
druid.broker.http.readTimeout=PT1M
# The Jetty max idle time for a connection.
druid.server.http.maxIdleTime=PT2M

# This specifies a buffer size for the storage of intermediate results.
# The computation engine in both the Historical and Realtime nodes will use a
# scratch buffer of this size to do all of their intermediate computations
# off-heap.
# Larger values allow for more aggregations in a single pass over the data
# while smaller values can require more passes depending on the query that is
# being executed.
druid.processing.buffer.sizeBytes=1073741824
# The number of processing threads to have available for parallel processing of segments.
druid.processing.numThreads=5

druid.broker.cache.useCache=true
druid.broker.cache.populateCache=true
druid.cache.l1.type=local
druid.cache.l1.sizeInBytes=2000000000
druid.cache.l2.type=memcached
druid.cache.l2.hosts=memcached:11211

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor","io.druid.server.metrics.QueryCountStatsMonitor"]
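The off-heap buffers described above come out of direct memory, so the buffer size and thread count constrain the JVM's -XX:MaxDirectMemorySize. A worked sketch of the rule of thumb from the Druid configuration docs (druid.processing.numMergeBuffers is not set here; its 0.12 default of max(2, numThreads/4) is assumed):

```
# direct memory needed ≈ sizeBytes * (numMergeBuffers + numThreads + 1)
BUFFER=1073741824   # druid.processing.buffer.sizeBytes (1 GiB)
THREADS=5           # druid.processing.numThreads
MERGE=2             # assumed default numMergeBuffers
echo "$(( BUFFER * (MERGE + THREADS + 1) / 1024 / 1024 / 1024 )) GiB direct memory"   # -> 8 GiB
```

That estimate lands above the -XX:MaxDirectMemorySize=4096m in the broker's jvm.config, so the headroom there is worth double-checking.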

conf/druid/coordinator/jvm.config (+11)

@@ -0,0 +1,11 @@
-server
-Xmx3g
-Xms3g
-XX:NewSize=512m
-XX:MaxNewSize=512m
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp
conf/druid/coordinator/runtime.properties (+30)

@@ -0,0 +1,30 @@
# Advertised hostname.
druid.host=coordinator
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/coordinator
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8081

# The Coordinator assumes it has an up-to-date view of the state of the world when it runs; however, the current ZK interaction code is written in a way that doesn't let the Coordinator know for a fact that it's done loading the current state of the world. This delay is a hack to give it enough time to believe it has all the data.
druid.coordinator.startDelay=PT60S
# The run period for the coordinator. The coordinator operates by maintaining the current state of the world in memory and periodically looking at the set of segments available and segments being served to decide whether any changes need to be made to the data topology. This property sets the delay between each of these runs.
druid.coordinator.period=PT20S

# Boolean flag for whether or not the coordinator should try to merge small segments into a more optimal segment size.
druid.coordinator.merge.on=true

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor"]
# Distribute data evenly among historicals
druid.coordinator.balancer.strategy=diskNormalized

# Might as well...
druid.coordinator.asOverlord.enabled=true
druid.coordinator.asOverlord.overlordService=druid/overlord

# Sleep this long before starting overlord queue management. This can be useful to give a cluster time to re-orient itself after e.g. a widespread network issue.
druid.indexer.queue.startDelay=PT30S
# Sleep this long when overlord queue management throws an exception before trying again.
druid.indexer.queue.restartDelay=PT30S

druid.indexer.runner.type=remote
druid.indexer.storage.type=metadata
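Because asOverlord is enabled, the coordinator process also serves the overlord APIs, so there is no separate overlord container to probe. A sketch of sanity-checking the combined service with standard Druid endpoints (the host port is an assumption; it must match the compose port mapping for the coordinator's container port 8081):

```
# The elected coordinator leader
curl http://localhost:8081/druid/coordinator/v1/leader

# The elected overlord leader, served by the same process when asOverlord is on
curl http://localhost:8081/druid/indexer/v1/leader
```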

conf/druid/historical/jvm.config (+12)

@@ -0,0 +1,12 @@
-server
-Xms8g
-Xmx8g
-XX:MaxDirectMemorySize=8g
-XX:NewSize=1g
-XX:MaxNewSize=1g
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp
conf/druid/historical/runtime.properties (+27)

@@ -0,0 +1,27 @@
# Advertised hostname.
druid.host=historical
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/historical
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8083

# Number of threads for HTTP requests.
druid.server.http.numThreads=21
# The Jetty max idle time for a connection.
druid.server.http.maxIdleTime=PT2M
# Query timeout
druid.server.http.defaultQueryTimeout=60000

# This specifies a buffer size for the storage of intermediate results. The computation engine in both the Historical and Realtime nodes will use a scratch buffer of this size to do all of their intermediate computations off-heap. Larger values allow for more aggregations in a single pass over the data while smaller values can require more passes depending on the query that is being executed.
druid.processing.buffer.sizeBytes=1073741824
# The number of processing threads to have available for parallel processing of segments. Our rule of thumb is num_cores - 1, which means that even under heavy load there will still be one core available to do background tasks like talking with ZooKeeper and pulling down segments. If only one core is available, this property defaults to the value 1.
druid.processing.numThreads=7

# Segments assigned to a Historical node are first stored on the local file system (in a disk cache) and then served by the Historical node. These locations define where that local cache resides.
druid.segmentCache.locations=[{"path":"var/druid/segment-cache","maxSize"\:130000000000}]
# The maximum number of bytes-worth of segments that the node wants assigned to it. This is not a limit that Historical nodes actually enforce, just a value published to the Coordinator node so it can plan accordingly.
druid.server.maxSize=500000000000
# How frequently to announce segments while segments are loading from cache. Set this value to zero to wait for all segments to be loaded before announcing.
druid.segmentCache.announceIntervalMillis=0

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor","io.druid.server.metrics.HistoricalMetricsMonitor","io.druid.server.metrics.QueryCountStatsMonitor"]
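The two size settings above are related: druid.server.maxSize is what the node advertises to the coordinator, while the segment-cache maxSize is the disk actually available locally. A quick shell-arithmetic sketch makes the gap visible:

```
echo "$(( 130000000000 / 1000**3 )) GB segment cache"    # 130 GB of local cache
echo "$(( 500000000000 / 1000**3 )) GB advertised max"   # 500 GB published to the coordinator
```

The Druid docs suggest keeping maxSize at or below the summed segment-cache capacity, so the advertised 500 GB may be worth revisiting.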

conf/druid/middleManager/jvm.config (+12)

@@ -0,0 +1,12 @@
-server
-Xms1g
-Xmx1g
-XX:NewSize=1g
-XX:MaxNewSize=1g
-XX:+ExitOnOutOfMemoryError
-XX:+UseG1GC
-XX:+PrintGCTimeStamps
-Duser.timezone=UTC
-Dfile.encoding=UTF-8
-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
-Djava.io.tmpdir=/opt/druid/var/tmp
conf/druid/middleManager/runtime.properties (+28)

@@ -0,0 +1,28 @@
# Advertised hostname.
druid.host=middlemanager
# The name of the service. This is used as a dimension when emitting metrics and alerts to differentiate between the various services.
druid.service=druid/middleManager
# This is the port to actually listen on; unless port mapping is used, this will be the same port as is on druid.host.
druid.port=8091

# Number of tasks per middleManager
druid.worker.capacity=1

# Task launch parameters
# druid.indexer.runner.javaOpts: -server -Xmx5g -Xms5g -XX:NewSize=1g -XX:MaxDirectMemorySize=10g -XX:+UseConcMarkSweepGC -XX:+UseStringDeduplication -XX:MaxGCPauseMillis=300 -XX:ParallelGCThreads=4 -XX:ConcGCThreads=2 -XX:InitiatingHeapOccupancyPercent=85 -verbosegc -XX:+PrintGC -XX:+PrintGCDetails -XX:+PrintGCApplicationStoppedTime -XX:+PrintSafepointStatistics -XX:PrintSafepointStatisticsCount=1 -XX:+SafepointTimeout -XX:SafepointTimeoutDelay=500 -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.runner.javaOpts=-server -Xms25m -Xmx25m -XX:MaxNewSize=20m -XX:MaxDirectMemorySize=25m -XX:+ExitOnOutOfMemoryError -XX:+UseG1GC -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
druid.indexer.task.baseTaskDir=var/druid/task
druid.peon.taskActionClient.retry.maxRetryCount=10

# HTTP server threads
druid.server.http.numThreads=25

# Processing threads and buffers
druid.processing.buffer.sizeBytes=4194304
druid.processing.numThreads=2

# Hadoop indexing
druid.indexer.task.hadoopWorkingPath=var/druid/hadoop-tmp
druid.indexer.task.defaultHadoopCoordinates=["org.apache.hadoop:hadoop-client:2.7.3"]

druid.monitoring.monitors=["com.metamx.metrics.JvmMonitor","io.druid.client.cache.CacheMonitor"]
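Each indexing task runs as a separate peon JVM launched with the javaOpts above, so the middleManager's total memory footprint is roughly its own heap plus worker capacity times the per-peon heap and direct memory. A sketch of that arithmetic for this config:

```
CAPACITY=1        # druid.worker.capacity
PEON_HEAP=25      # -Xmx25m from druid.indexer.runner.javaOpts, in MB
PEON_DIRECT=25    # -XX:MaxDirectMemorySize=25m, in MB
MM_HEAP=1024      # -Xmx1g from jvm.config, in MB
echo "$(( MM_HEAP + CAPACITY * (PEON_HEAP + PEON_DIRECT) )) MB total"   # -> 1074 MB
```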
data/wikiticker-2015-09-12-sampled.json.gz (2.26 MB; binary file not shown)
