From 03bf9b7de04f53c8c307d5e9d8da2a735fe65fb8 Mon Sep 17 00:00:00 2001 From: Raghu Angadi Date: Tue, 13 Dec 2016 16:59:32 -0800 Subject: [PATCH 01/77] Increase KafkaIO version to 0.2.0 Recent PR #491 changes how KafkaIO splits. This makes it incompatible with Dataflow update across these two versions. --- contrib/kafka/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/kafka/pom.xml b/contrib/kafka/pom.xml index 4beecd7ba5..29f64d5577 100644 --- a/contrib/kafka/pom.xml +++ b/contrib/kafka/pom.xml @@ -25,7 +25,7 @@ google-cloud-dataflow-java-contrib-kafka Google Cloud Dataflow Kafka IO Library Library to read Kafka topics. - 0.1.0-SNAPSHOT + 0.2.0-SNAPSHOT [1.6.0, 2.0.0) From 90fcfc07be037b96e1b98ef785569324984433c9 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Fri, 6 Jan 2017 18:07:50 -0800 Subject: [PATCH 02/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 4c8584a91c..c987e94893 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index fa781ac432..a640cd4e5e 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT ../../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index c600393712..ec3bec3948 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow 
google-cloud-dataflow-java-sdk-parent - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT ../../pom.xml diff --git a/pom.xml b/pom.xml index c3c64811f4..60d054b063 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.0.0-beta1 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index 8dd014f9f7..511e4e0a73 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT google-cloud-dataflow-java-sdk-all From d8b880def712cc990be0f0e1eb440d6a114e6d1c Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Fri, 6 Jan 2017 18:11:05 -0800 Subject: [PATCH 03/77] Manual fixes to archetypes to pin next development version --- .../examples/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/test/resources/projects/basic/reference/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 163212d61a..20e89f3064 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -125,7 +125,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index 183b29062c..01e9aa951b 100644 --- 
a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -74,7 +74,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index db4b1c9784..5530617bfe 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -74,7 +74,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta1-SNAPSHOT + 2.0.0-beta2-SNAPSHOT From 4a55dda33be95db80d9d20c2ab95b307ef1f0cfd Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Fri, 6 Jan 2017 18:34:40 -0800 Subject: [PATCH 04/77] Generate empty Javadoc --- pom.xml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pom.xml b/pom.xml index 60d054b063..d7d7e4dab7 100644 --- a/pom.xml +++ b/pom.xml @@ -215,6 +215,24 @@ + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.3 + + false + + + + javadoc + package + + jar + + + + + org.apache.maven.plugins maven-dependency-plugin @@ -306,6 +324,11 @@ maven-source-plugin + + org.apache.maven.plugins + maven-javadoc-plugin + + org.apache.maven.plugins maven-dependency-plugin From aad7cafb876c03a50e16bd753a55e2450f461d50 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Fri, 6 Jan 2017 19:16:02 -0800 Subject: [PATCH 05/77] Move properties to root pom.xml --- pom.xml | 4 ++++ .../org/apache/beam/runners/dataflow/dataflow.properties | 6 +++--- sdk/src/main/resources/org/apache/beam/sdk/sdk.properties | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index d7d7e4dab7..ef65d33de6 100644 --- a/pom.xml +++ b/pom.xml @@ -78,6 +78,10 @@ yyyy-MM-dd HH:mm 0.4.0 + + Google Cloud Dataflow SDK 
for Java + ${project.version}-20170103 + 6 pom diff --git a/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties b/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties index 1af27fdd48..d499ad6699 100644 --- a/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties +++ b/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties @@ -12,7 +12,7 @@ # License for the specific language governing permissions and limitations under # the License. -environment.major.version=6 +environment.major.version=${dataflow.environment_major_version} -worker.image.batch=dataflow.gcr.io/v1beta3/beam-java-batch:${project.version}-20170103 -worker.image.streaming=dataflow.gcr.io/v1beta3/beam-java-streaming:${project.version}-20170103 +worker.image.batch=dataflow.gcr.io/v1beta3/beam-java-batch:${dataflow.container_version} +worker.image.streaming=dataflow.gcr.io/v1beta3/beam-java-streaming:${dataflow.container_version} diff --git a/sdk/src/main/resources/org/apache/beam/sdk/sdk.properties b/sdk/src/main/resources/org/apache/beam/sdk/sdk.properties index a9df3b5b5c..5b7997a43a 100644 --- a/sdk/src/main/resources/org/apache/beam/sdk/sdk.properties +++ b/sdk/src/main/resources/org/apache/beam/sdk/sdk.properties @@ -12,6 +12,6 @@ # License for the specific language governing permissions and limitations under # the License. 
-name=Google Cloud Dataflow SDK for Java +name=${dataflow.release_name} version=${pom.version} build.date=${timestamp} From 8fd6661267d254b2089f35649f9f1102eeecda57 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Mon, 9 Jan 2017 10:30:55 -0800 Subject: [PATCH 06/77] Update version range dependency to exclude 2.0.0-betaX versions (#528) * Update version range dependency to exclude 2.0.0-betaX versions * fixup --- contrib/firebaseio/pom.xml | 2 +- contrib/hadoop/pom.xml | 2 +- contrib/join-library/pom.xml | 2 +- contrib/kafka/pom.xml | 2 +- contrib/sorter/pom.xml | 2 +- .../examples/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/test/resources/projects/basic/reference/pom.xml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/contrib/firebaseio/pom.xml b/contrib/firebaseio/pom.xml index b50ef89167..bf5414e24f 100644 --- a/contrib/firebaseio/pom.xml +++ b/contrib/firebaseio/pom.xml @@ -21,7 +21,7 @@ UTF-8 - [1.2.0, 2.0.0) + [1.2.0, 1.99) diff --git a/contrib/hadoop/pom.xml b/contrib/hadoop/pom.xml index 0659e3ebaa..60327b7fda 100644 --- a/contrib/hadoop/pom.xml +++ b/contrib/hadoop/pom.xml @@ -36,7 +36,7 @@ UTF-8 - [1.2.0,2.0.0) + [1.2.0, 1.99) diff --git a/contrib/join-library/pom.xml b/contrib/join-library/pom.xml index f15ef9794c..3f5675de4d 100644 --- a/contrib/join-library/pom.xml +++ b/contrib/join-library/pom.xml @@ -50,7 +50,7 @@ UTF-8 - [1.0.0, 2.0.0) + [1.0.0, 1.99) diff --git a/contrib/kafka/pom.xml b/contrib/kafka/pom.xml index 4beecd7ba5..c13946cc64 100644 --- a/contrib/kafka/pom.xml +++ b/contrib/kafka/pom.xml @@ -28,7 +28,7 @@ 0.1.0-SNAPSHOT - [1.6.0, 2.0.0) + [1.6.0, 1.99) 0.9.0.1 19.0 diff --git a/contrib/sorter/pom.xml b/contrib/sorter/pom.xml index d9ffb65bd9..4580ea5bab 100644 --- a/contrib/sorter/pom.xml +++ b/contrib/sorter/pom.xml @@ -36,7 +36,7 @@ UTF-8 - [1.2.0,2.0.0) + [1.2.0, 1.99) 2.7.1 diff --git 
a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 486a82b602..f58aafce9c 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -107,7 +107,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - [1.0.0, 2.0.0) + [1.0.0, 1.99) diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index ab2bcf9a70..70bf1e53d3 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -38,7 +38,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - [1.0.0, 2.0.0) + [1.0.0, 1.99) diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index b94fb39e12..db136756b3 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -38,7 +38,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - [1.0.0, 2.0.0) + [1.0.0, 1.99) From faa4c2e9af1c3c28a6ba78ebeabeced4866f82fc Mon Sep 17 00:00:00 2001 From: Raghu Angadi Date: Mon, 9 Jan 2017 11:13:31 -0800 Subject: [PATCH 07/77] README.md for contrib/kafka --- contrib/kafka/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 contrib/kafka/README.md diff --git a/contrib/kafka/README.md b/contrib/kafka/README.md new file mode 100644 index 0000000000..e69de29bb2 From 2829f00fa224e56d832d8a2e7174b5d5d05da223 Mon Sep 17 00:00:00 2001 From: Raghu Angadi Date: Mon, 9 Jan 2017 11:16:13 -0800 Subject: [PATCH 08/77] README.md for contrib/kafka --- 
contrib/kafka/README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/contrib/kafka/README.md b/contrib/kafka/README.md index e69de29bb2..386072317f 100644 --- a/contrib/kafka/README.md +++ b/contrib/kafka/README.md @@ -0,0 +1,36 @@ +# KafkaIO : Dataflow Unbounded Source and Sink for Kafka Topics + +KafkaIO provides unbounded sources and sinks for [Kafka](https://www.firebase.com/) +topics. Kafka version 0.9 and above are supported. + +## Basic Usage + * Read from a topic with 8 byte long keys and string values: + ```java + PCollection> kafkaRecords = + pipeline + .applY(KafkaIO.read() + .withBootstrapServers("broker_1:9092,broker_2:9092") + .withTopics(ImmutableList.of("topic_a")) + .withKeyCoder(BigEndianLongCoder.of()) + .withValueCoder(StringUtf8Coder.of()) + .withoutMetadata() + ); + ``` + * Write the same PCollection to a Kafka topic: + ```java + kafkaRecords.apply(KafkaIO.write() + .withBootstrapServers("broker_1:9092,broker_2:9092") + .withTopic("results") + .withKeyCoder(BigEndianLongCoder.of()) + .withValueCoder(StringUtf8Coder.of()) + ``` + +Please see JavaDoc for KafkaIO in +[KafkaIO.java](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaIO.java#L100) +for complete documentation and a more descriptive usage example. + +## Release Notes + * **0.2.0** : Assign one split for each of the Kafka topic partitions. This makes Dataflow + [Update](https://cloud.google.com/dataflow/pipelines/updating-a-pipeline) + from previous version incompatible. + * **0.1.0** : KafkaIO with support for Unbounded Source and Sink. 
From 3e6116294f53085dab04dba9094b5a462f00104e Mon Sep 17 00:00:00 2001 From: Raghu Angadi Date: Mon, 9 Jan 2017 13:55:49 -0800 Subject: [PATCH 09/77] Fix formatting --- contrib/kafka/README.md | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/contrib/kafka/README.md b/contrib/kafka/README.md index 386072317f..40b6c00ddf 100644 --- a/contrib/kafka/README.md +++ b/contrib/kafka/README.md @@ -4,26 +4,28 @@ KafkaIO provides unbounded sources and sinks for [Kafka](https://www.firebase.co topics. Kafka version 0.9 and above are supported. ## Basic Usage - * Read from a topic with 8 byte long keys and string values: - ```java - PCollection> kafkaRecords = - pipeline - .applY(KafkaIO.read() - .withBootstrapServers("broker_1:9092,broker_2:9092") - .withTopics(ImmutableList.of("topic_a")) - .withKeyCoder(BigEndianLongCoder.of()) - .withValueCoder(StringUtf8Coder.of()) - .withoutMetadata() - ); - ``` - * Write the same PCollection to a Kafka topic: - ```java - kafkaRecords.apply(KafkaIO.write() + +* Read from a topic with 8 byte long keys and string values: +```java + PCollection> kafkaRecords = + pipeline + .applY(KafkaIO.read() .withBootstrapServers("broker_1:9092,broker_2:9092") - .withTopic("results") + .withTopics(ImmutableList.of("topic_a")) .withKeyCoder(BigEndianLongCoder.of()) .withValueCoder(StringUtf8Coder.of()) - ``` + .withoutMetadata() + ); +``` + +* Write the same PCollection to a Kafka topic: +```java + kafkaRecords.apply(KafkaIO.write() + .withBootstrapServers("broker_1:9092,broker_2:9092") + .withTopic("results") + .withKeyCoder(BigEndianLongCoder.of()) + .withValueCoder(StringUtf8Coder.of()) +``` Please see JavaDoc for KafkaIO in [KafkaIO.java](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/contrib/kafka/src/main/java/com/google/cloud/dataflow/contrib/kafka/KafkaIO.java#L100) From 40d174f955e3f999b35388deea47fcd6c250c273 Mon Sep 17 00:00:00 2001 From: Raghu Angadi Date: Mon, 9 Jan 
2017 15:04:07 -0800 Subject: [PATCH 10/77] review comments --- contrib/kafka/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/contrib/kafka/README.md b/contrib/kafka/README.md index 40b6c00ddf..a84a493a03 100644 --- a/contrib/kafka/README.md +++ b/contrib/kafka/README.md @@ -1,7 +1,7 @@ # KafkaIO : Dataflow Unbounded Source and Sink for Kafka Topics -KafkaIO provides unbounded sources and sinks for [Kafka](https://www.firebase.com/) -topics. Kafka version 0.9 and above are supported. +KafkaIO provides unbounded source and sink for [Kafka](http://kafka.apache.org/) +topics. Kafka versions 0.9 and above are supported. ## Basic Usage @@ -9,7 +9,7 @@ topics. Kafka version 0.9 and above are supported. ```java PCollection> kafkaRecords = pipeline - .applY(KafkaIO.read() + .apply(KafkaIO.read() .withBootstrapServers("broker_1:9092,broker_2:9092") .withTopics(ImmutableList.of("topic_a")) .withKeyCoder(BigEndianLongCoder.of()) @@ -25,6 +25,7 @@ topics. Kafka version 0.9 and above are supported. 
.withTopic("results") .withKeyCoder(BigEndianLongCoder.of()) .withValueCoder(StringUtf8Coder.of()) + ); ``` Please see JavaDoc for KafkaIO in From efd33cc43061e54d8fe1e16415157de21b904d90 Mon Sep 17 00:00:00 2001 From: igorbernstein2 Date: Thu, 26 Jan 2017 14:57:48 -0500 Subject: [PATCH 11/77] =?UTF-8?q?Fix=20HadoopFileSource=E2=80=99s=20split?= =?UTF-8?q?=20size=20estimate=20(#534)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix HadoopFileSource’s split size estimate * Properly set interrupted state --- .../contrib/hadoop/HadoopFileSource.java | 11 ++++++++- .../contrib/hadoop/HadoopFileSourceTest.java | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/contrib/hadoop/src/main/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSource.java b/contrib/hadoop/src/main/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSource.java index e8981d2d6a..cffc475d71 100644 --- a/contrib/hadoop/src/main/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSource.java +++ b/contrib/hadoop/src/main/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSource.java @@ -239,12 +239,21 @@ private Coder getDefaultCoder(Class c) { public long getEstimatedSizeBytes(PipelineOptions options) { long size = 0; try { + // If this source represents a split from splitIntoBundles, then return the size of the split, + // rather than the entire input + if (serializableSplit != null) { + return serializableSplit.getSplit().getLength(); + } + Job job = Job.getInstance(); // new instance for (FileStatus st : listStatus(createFormat(job), job)) { size += st.getLen(); } } catch (IOException | NoSuchMethodException | InvocationTargetException - | IllegalAccessException | InstantiationException e) { + | IllegalAccessException | InstantiationException e) { + // ignore, and return 0 + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); // ignore, and return 0 } return size; diff --git
a/contrib/hadoop/src/test/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSourceTest.java b/contrib/hadoop/src/test/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSourceTest.java index cef3c08348..eac54a1e31 100644 --- a/contrib/hadoop/src/test/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSourceTest.java +++ b/contrib/hadoop/src/test/java/com/google/cloud/dataflow/contrib/hadoop/HadoopFileSourceTest.java @@ -152,6 +152,29 @@ public void testSplits() throws Exception { assertTrue(nonEmptySplits > 2); } + @Test + public void testSplitEstimatedSize() throws Exception { + PipelineOptions options = PipelineOptionsFactory.create(); + + List> expectedResults = createRandomRecords(3, 10000, 0); + File file = createFileWithData("tmp.avro", expectedResults); + + HadoopFileSource source = HadoopFileSource.from( + file.toString(), SequenceFileInputFormat.class, IntWritable.class, Text.class + ); + + long originalSize = source.getEstimatedSizeBytes(options); + long splitTotalSize = 0; + List>> splits = source.splitIntoBundles( + SequenceFile.SYNC_INTERVAL, options + ); + for (BoundedSource> splitSource : splits) { + splitTotalSize += splitSource.getEstimatedSizeBytes(options); + } + // Assert that the estimated size of the whole is the sum of its parts + assertEquals(originalSize, splitTotalSize); + } + private File createFileWithData(String filename, List> records) throws IOException { File tmpFile = tmpFolder.newFile(filename); From dbe464440011decfc6d1a11ff6af9cc7b5ee3b68 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Mon, 30 Jan 2017 12:25:27 -0800 Subject: [PATCH 12/77] BigQuery: fix an issue with option propagation and refactor to future-proof * We created a helper in BigQueryIO to create a JobConfigurationQuery capturing all options, but we had not yet propagated this cleanup into the Services abstraction or helper classes. Refactor BigQueryServices and BigQueryTableRowIterator to propagate the same configuration. 
Adds a new deprecated constructor to BigQueryTableRowIterator for backwards-compatibility. This fixes GoogleCloudPlatform/DataflowJavaSDK#539. --- .../cloud/dataflow/sdk/io/BigQueryIO.java | 10 ++- .../dataflow/sdk/util/BigQueryServices.java | 3 +- .../sdk/util/BigQueryServicesImpl.java | 17 ++--- .../sdk/util/BigQueryTableRowIterator.java | 67 ++++++++++--------- .../cloud/dataflow/sdk/io/BigQueryIOTest.java | 8 +-- 5 files changed, 52 insertions(+), 53 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java index ace18eff5e..3472a8afed 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java @@ -1075,7 +1075,7 @@ public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { public BoundedReader createReader(PipelineOptions options) throws IOException { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); return new BigQueryReader(this, bqServices.getReaderFromQuery( - bqOptions, query.get(), executingProject.get(), flattenResults, useLegacySql)); + bqOptions, createBasicQueryConfig(), executingProject.get())); } @Override @@ -1152,11 +1152,12 @@ private void executeQuery( .setProjectId(executingProject) .setJobId(jobId); + // When changing options here, consider whether to change the defaults from + // #createBasicQueryConfig instead. 
JobConfigurationQuery queryConfig = createBasicQueryConfig() .setAllowLargeResults(true) .setCreateDisposition("CREATE_IF_NEEDED") .setDestinationTable(destinationTable) - .setPriority("BATCH") .setWriteDisposition("WRITE_EMPTY"); jobService.startQueryJob(jobRef, queryConfig); @@ -1167,9 +1168,12 @@ private void executeQuery( } private JobConfigurationQuery createBasicQueryConfig() { + // Due to deprecated functionality, if this function is updated + // then the similar code in BigQueryTableRowIterator#fromQuery should be updated. return new JobConfigurationQuery() - .setQuery(query.get()) .setFlattenResults(flattenResults) + .setPriority("BATCH") + .setQuery(query.get()) .setUseLegacySql(useLegacySql); } diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java index ec96009494..df247629f5 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java @@ -58,8 +58,7 @@ public interface BigQueryServices extends Serializable { * Returns a real, mock, or fake {@link BigQueryJsonReader} to query tables. */ BigQueryJsonReader getReaderFromQuery( - BigQueryOptions bqOptions, String query, String projectId, @Nullable Boolean flatten, - @Nullable Boolean useLegacySql); + BigQueryOptions bqOptions, JobConfigurationQuery queryConfig, String projectId); /** * An interface for the Cloud BigQuery load service. 
diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java index 1a37e01375..af2ed9e73b 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java @@ -39,14 +39,11 @@ import com.google.cloud.dataflow.sdk.options.BigQueryOptions; import com.google.cloud.hadoop.util.ApiErrorExtractor; import com.google.common.annotations.VisibleForTesting; - import org.joda.time.Duration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import java.io.IOException; import java.util.NoSuchElementException; - import javax.annotation.Nullable; /** @@ -83,9 +80,8 @@ public BigQueryJsonReader getReaderFromTable(BigQueryOptions bqOptions, TableRef @Override public BigQueryJsonReader getReaderFromQuery( - BigQueryOptions bqOptions, String query, String projectId, @Nullable Boolean flatten, - @Nullable Boolean useLegacySql) { - return BigQueryJsonReaderImpl.fromQuery(bqOptions, query, projectId, flatten, useLegacySql); + BigQueryOptions bqOptions, JobConfigurationQuery queryConfig, String projectId) { + return BigQueryJsonReaderImpl.fromQuery(bqOptions, queryConfig, projectId); } @VisibleForTesting @@ -521,14 +517,11 @@ private BigQueryJsonReaderImpl(BigQueryTableRowIterator iterator) { private static BigQueryJsonReader fromQuery( BigQueryOptions bqOptions, - String query, - String projectId, - @Nullable Boolean flattenResults, - @Nullable Boolean useLegacySql) { + JobConfigurationQuery queryConfig, + String projectId) { return new BigQueryJsonReaderImpl( BigQueryTableRowIterator.fromQuery( - query, projectId, Transport.newBigQueryClient(bqOptions).build(), flattenResults, - useLegacySql)); + queryConfig, projectId, Transport.newBigQueryClient(bqOptions).build())); } private static BigQueryJsonReader fromTable( diff --git 
a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java index 8f4ff793dc..63bd025099 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java @@ -46,11 +46,9 @@ import com.google.common.base.MoreObjects; import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.Uninterruptibles; - import org.joda.time.Duration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import java.io.IOException; import java.util.Collection; import java.util.Collections; @@ -61,7 +59,6 @@ import java.util.Objects; import java.util.Random; import java.util.concurrent.TimeUnit; - import javax.annotation.Nullable; /** @@ -73,6 +70,7 @@ public class BigQueryTableRowIterator implements AutoCloseable { @Nullable private TableReference ref; @Nullable private final String projectId; @Nullable private TableSchema schema; + @Nullable private final JobConfigurationQuery queryConfig; private final Bigquery client; private String pageToken; private Iterator iteratorOverCurrentBatch; @@ -89,25 +87,18 @@ public class BigQueryTableRowIterator implements AutoCloseable { // following interval to check the status of query execution job private static final Duration QUERY_COMPLETION_POLL_TIME = Duration.standardSeconds(1); - private final String query; - // Whether to flatten query results. - private final boolean flattenResults; - // Whether to use the BigQuery legacy SQL dialect.. - private final boolean useLegacySql; // Temporary dataset used to store query results. private String temporaryDatasetId = null; // Temporary table used to store query results. 
private String temporaryTableId = null; private BigQueryTableRowIterator( - @Nullable TableReference ref, @Nullable String query, @Nullable String projectId, - Bigquery client, boolean flattenResults, boolean useLegacySql) { + @Nullable TableReference ref, @Nullable JobConfigurationQuery queryConfig, + @Nullable String projectId, Bigquery client) { this.ref = ref; - this.query = query; + this.queryConfig = queryConfig; this.projectId = projectId; this.client = checkNotNull(client, "client"); - this.flattenResults = flattenResults; - this.useLegacySql = useLegacySql; } /** @@ -116,7 +107,7 @@ private BigQueryTableRowIterator( public static BigQueryTableRowIterator fromTable(TableReference ref, Bigquery client) { checkNotNull(ref, "ref"); checkNotNull(client, "client"); - return new BigQueryTableRowIterator(ref, null, ref.getProjectId(), client, true, true); + return new BigQueryTableRowIterator(ref, null, ref.getProjectId(), client); } /** @@ -135,15 +126,31 @@ public static BigQueryTableRowIterator fromQuery( * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the * specified query in the specified project. 
*/ + @Deprecated public static BigQueryTableRowIterator fromQuery( String query, String projectId, Bigquery client, @Nullable Boolean flattenResults, @Nullable Boolean useLegacySql) { checkNotNull(query, "query"); checkNotNull(projectId, "projectId"); checkNotNull(client, "client"); - return new BigQueryTableRowIterator(null, query, projectId, client, - MoreObjects.firstNonNull(flattenResults, Boolean.TRUE), - MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE)); + JobConfigurationQuery queryConfig = new JobConfigurationQuery() + .setFlattenResults(MoreObjects.firstNonNull(flattenResults, Boolean.TRUE)) + .setPriority("BATCH") + .setQuery(query) + .setUseLegacySql(MoreObjects.firstNonNull(useLegacySql, Boolean.TRUE)); + return new BigQueryTableRowIterator(null, queryConfig, projectId, client); + } + + /** + * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the + * specified query in the specified project. + */ + public static BigQueryTableRowIterator fromQuery( + JobConfigurationQuery queryConfig, String projectId, Bigquery client) { + checkNotNull(queryConfig, "queryConfig"); + checkNotNull(projectId, "projectId"); + checkNotNull(client, "client"); + return new BigQueryTableRowIterator(null, queryConfig, projectId, client); } /** @@ -151,7 +158,7 @@ public static BigQueryTableRowIterator fromQuery( * @throws IOException on failure */ public void open() throws IOException, InterruptedException { - if (query != null) { + if (queryConfig != null) { ref = executeQueryAndWaitForCompletion(); } // Get table schema. 
@@ -401,15 +408,17 @@ private void deleteDataset(String datasetId) throws IOException, InterruptedExce */ private TableReference executeQueryAndWaitForCompletion() throws IOException, InterruptedException { + checkState(projectId != null, "Cannot dryRun a query in unknown (null) project"); + checkState(queryConfig != null, "Cannot dryRun a null query"); // Dry run query to get source table location Job dryRunJob = new Job() .setConfiguration(new JobConfiguration() - .setQuery(new JobConfigurationQuery() - .setQuery(query)) + .setQuery(queryConfig) .setDryRun(true)); JobStatistics jobStats = executeWithBackOff( client.jobs().insert(projectId, dryRunJob), - String.format("Error when trying to dry run query %s.", query)).getStatistics(); + String.format("Error when trying to dry run query %s.", + queryConfig.toPrettyString())).getStatistics(); // Let BigQuery to pick default location if the query does not read any tables. String location = null; @@ -428,14 +437,8 @@ private TableReference executeQueryAndWaitForCompletion() createDataset(temporaryDatasetId, location); Job job = new Job(); JobConfiguration config = new JobConfiguration(); - JobConfigurationQuery queryConfig = new JobConfigurationQuery(); config.setQuery(queryConfig); job.setConfiguration(config); - queryConfig.setQuery(query); - queryConfig.setAllowLargeResults(true); - queryConfig.setFlattenResults(flattenResults); - queryConfig.setUseLegacySql(useLegacySql); - TableReference destinationTable = new TableReference(); destinationTable.setProjectId(projectId); @@ -445,13 +448,15 @@ private TableReference executeQueryAndWaitForCompletion() Job queryJob = executeWithBackOff( client.jobs().insert(projectId, job), - String.format("Error when trying to execute the job for query %s.", query)); + String.format("Error when trying to execute the job for query %s.", + queryConfig.toPrettyString())); JobReference jobId = queryJob.getJobReference(); while (true) { Job pollJob = executeWithBackOff( 
client.jobs().get(projectId, jobId.getJobId()), - String.format("Error when trying to get status of the job for query %s.", query)); + String.format("Error when trying to get status of the job for query %s.", + queryConfig.toPrettyString())); JobStatus status = pollJob.getStatus(); if (status.getState().equals("DONE")) { // Job is DONE, but did not necessarily succeed. @@ -461,7 +466,9 @@ private TableReference executeQueryAndWaitForCompletion() } else { // There will be no temporary table to delete, so null out the reference. temporaryTableId = null; - throw new IOException("Executing query " + query + " failed: " + error.getMessage()); + throw new IOException( + String.format("Executing query %s failed: %s", + queryConfig.toPrettyString(), error.getMessage())); } } Uninterruptibles.sleepUninterruptibly( diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java index a5bddec315..61356e18d0 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java @@ -106,7 +106,6 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; - import org.hamcrest.CoreMatchers; import org.hamcrest.Matchers; import org.junit.Assert; @@ -122,7 +121,6 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.MockitoAnnotations; - import java.io.File; import java.io.FileFilter; import java.io.IOException; @@ -135,8 +133,6 @@ import java.util.NoSuchElementException; import java.util.Set; -import javax.annotation.Nullable; - /** * Tests for BigQueryIO. 
*/ @@ -187,8 +183,7 @@ public BigQueryJsonReader getReaderFromTable( @Override public BigQueryJsonReader getReaderFromQuery( - BigQueryOptions bqOptions, String query, String projectId, @Nullable Boolean flatten, - @Nullable Boolean useLegacySql) { + BigQueryOptions bqOptions, JobConfigurationQuery queryConfig, String projectId) { return new FakeBigQueryReader(jsonTableRowReturns); } @@ -1749,3 +1744,4 @@ public boolean accept(File pathname) { }}).length); } } + From b4e391ee3fd3fd7cbd4b7e499601ae201f6dcf6d Mon Sep 17 00:00:00 2001 From: Eugene Hlyzov Date: Mon, 30 Jan 2017 23:45:34 +0300 Subject: [PATCH 13/77] [BEAM-359] Treat erased type variables as non-deterministic in AvroCoder (#531) --- .../cloud/dataflow/sdk/coders/AvroCoder.java | 9 +++++-- .../dataflow/sdk/coders/AvroCoderTest.java | 26 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java index c5aa029531..d85eb9b937 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/coders/AvroCoder.java @@ -474,6 +474,10 @@ private void doCheck(String context, TypeDescriptor type, Schema schema) { checkMap(context, type, schema); break; case RECORD: + if (!(type.getType() instanceof Class)) { + reportError(context, "Cannot determine type from generic %s due to erasure", type); + return; + } checkRecord(type, schema); break; case UNION: @@ -694,7 +698,8 @@ private void checkArray(String context, TypeDescriptor type, Schema schema) { * Extract a field from a class. We need to look at the declared fields so that we can * see private fields. We may need to walk up to the parent to get classes from the parent. 
*/ - private static Field getField(Class clazz, String name) { + private static Field getField(Class originalClazz, String name) { + Class clazz = originalClazz; while (clazz != null) { for (Field field : clazz.getDeclaredFields()) { AvroName avroName = field.getAnnotation(AvroName.class); @@ -708,7 +713,7 @@ private static Field getField(Class clazz, String name) { } throw new IllegalArgumentException( - "Unable to get field " + name + " from class " + clazz); + "Unable to get field " + name + " from class " + originalClazz); } } } diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java index 3ed055bc4a..d6a2a172c9 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/coders/AvroCoderTest.java @@ -751,4 +751,30 @@ public int hashCode() { return Objects.hash(getClass(), onlySomeTypesAllowed); } } + + @Test + public void testAvroCoderForGenerics() throws Exception { + Schema fooSchema = AvroCoder.of(Foo.class).getSchema(); + Schema schema = new Schema.Parser().parse("{" + + "\"type\":\"record\"," + + "\"name\":\"SomeGeneric\"," + + "\"namespace\":\"ns\"," + + "\"fields\":[" + + " {\"name\":\"foo\", \"type\":" + fooSchema.toString() + "}" + + "]}"); + @SuppressWarnings("rawtypes") + AvroCoder coder = AvroCoder.of(SomeGeneric.class, schema); + + assertNonDeterministic(coder, + reasonField(SomeGeneric.class, "foo", "erasure")); + } + + private static class SomeGeneric { + @SuppressWarnings("unused") + private T foo; + } + private static class Foo { + @SuppressWarnings("unused") + String id; + } } From 502f99f6dfb233b2681ea6be55b8f63d900c4e67 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Mon, 30 Jan 2017 14:20:15 -0800 Subject: [PATCH 14/77] fixups --- .../sdk/util/BigQueryTableRowIterator.java | 1 + .../util/BigQueryTableRowIteratorTest.java | 37 ++++++++++--------- 2 files 
changed, 21 insertions(+), 17 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java index 63bd025099..a518032228 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java @@ -445,6 +445,7 @@ private TableReference executeQueryAndWaitForCompletion() destinationTable.setDatasetId(temporaryDatasetId); destinationTable.setTableId(temporaryTableId); queryConfig.setDestinationTable(destinationTable); + queryConfig.setAllowLargeResults(Boolean.TRUE); Job queryJob = executeWithBackOff( client.jobs().insert(projectId, job), diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIteratorTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIteratorTest.java index d6ac5b36ba..94d858aebb 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIteratorTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIteratorTest.java @@ -258,14 +258,18 @@ public void testReadFromQueryNoTables() throws IOException, InterruptedException // Mock job polling. 
JobStatus status = new JobStatus().setState("DONE"); - TableReference tableRef = - new TableReference().setProjectId("project").setDatasetId("dataset").setTableId("table"); - JobConfigurationQuery queryConfig = new JobConfigurationQuery().setDestinationTable(tableRef); + JobConfigurationQuery resultQueryConfig = + new JobConfigurationQuery().setDestinationTable( + new TableReference() + .setProjectId("project") + .setDatasetId("tempdataset") + .setTableId("temptable") + ); Job getJob = new Job() .setJobReference(new JobReference()) .setStatus(status) - .setConfiguration(new JobConfiguration().setQuery(queryConfig)); + .setConfiguration(new JobConfiguration().setQuery(resultQueryConfig)); when(mockJobsGet.execute()).thenReturn(getJob); // Mock table schema fetch. @@ -281,8 +285,9 @@ public void testReadFromQueryNoTables() throws IOException, InterruptedException String query = String.format( "SELECT \"Arthur\" as name, 42 as count, \"%s\" as photo", photoBytesEncoded); + JobConfigurationQuery queryConfig = new JobConfigurationQuery().setQuery(query); try (BigQueryTableRowIterator iterator = - BigQueryTableRowIterator.fromQuery(query, "project", mockClient, null, null)) { + BigQueryTableRowIterator.fromQuery(queryConfig, "project", mockClient)) { iterator.open(); assertTrue(iterator.advance()); TableRow row = iterator.getCurrent(); @@ -317,7 +322,7 @@ public void testReadFromQueryNoTables() throws IOException, InterruptedException verify(mockTablesDelete).execute(); // Table data read. 
verify(mockClient).tabledata(); - verify(mockTabledata).list("project", "dataset", "table"); + verify(mockTabledata).list("project", "tempdataset", "temptable"); verify(mockTabledataList).execute(); } @@ -334,18 +339,16 @@ public void testQueryFailed() throws IOException { when(mockJobsInsert.execute()).thenThrow(exception, exception, exception, exception); String query = "NOT A QUERY"; + JobConfigurationQuery queryConfig = new JobConfigurationQuery().setQuery(query); try (BigQueryTableRowIterator iterator = - BigQueryTableRowIterator.fromQuery(query, "project", mockClient, null, null)) { - - try { - iterator.open(); - fail(); - } catch (Exception expected) { - // Verify message explains cause and reports the query. - assertThat(expected.getMessage(), containsString("Error")); - assertThat(expected.getMessage(), containsString(query)); - assertThat(expected.getCause().getMessage(), containsString(errorReason)); - } + BigQueryTableRowIterator.fromQuery(queryConfig, "project", mockClient)) { + iterator.open(); + fail(); + } catch (Exception expected) { + // Verify message explains cause and reports the query. + assertThat(expected.getMessage(), containsString("Error")); + assertThat(expected.getMessage(), containsString(query)); + assertThat(expected.getCause().getMessage(), containsString(errorReason)); } // Job inserted to run the query, then polled once. From 254180dc87b182b63c6cbfaf8036bf26744686fd Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Thu, 2 Feb 2017 13:13:08 -0800 Subject: [PATCH 15/77] Upgrade to Apache Beam, version 0.5.0 This is based on the release candidate #2, currently being considered as the release.
--- .../resources/archetype-resources/pom.xml | 10 +++---- .../src/main/java/DebuggingWordCount.java | 4 +-- .../src/main/java/WordCount.java | 2 +- .../src/test/java/WordCountTest.java | 7 +++-- .../resources/archetype-resources/pom.xml | 6 ++-- .../projects/basic/reference/pom.xml | 6 ++-- pom.xml | 28 ++++++++++++++++++- 7 files changed, 45 insertions(+), 18 deletions(-) diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 20e89f3064..c5e16cd789 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -48,7 +48,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.3 + 3.5.1 ${targetPlatform} ${targetPlatform} @@ -90,7 +90,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.18.1 + 2.19.1 all 4 @@ -100,7 +100,7 @@ org.apache.maven.surefire surefire-junit47 - 2.18.1 + 2.19.1 @@ -201,7 +201,7 @@ org.slf4j slf4j-api - 1.7.7 + 1.7.14 @@ -223,7 +223,7 @@ junit junit - 4.11 + 4.12 diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java index 99ae79687c..dd9b91decc 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java @@ -95,9 +95,9 @@ public FilterTextFn(String pattern) { * in a dashboard, etc. 
*/ private final Aggregator matchedWords = - createAggregator("matchedWords", new Sum.SumLongFn()); + createAggregator("matchedWords", Sum.ofLongs()); private final Aggregator unmatchedWords = - createAggregator("umatchedWords", new Sum.SumLongFn()); + createAggregator("unmatchedWords", Sum.ofLongs()); @ProcessElement public void processElement(ProcessContext c) { diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java index 634dea1a0a..b3ef26c493 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -87,7 +87,7 @@ public class WordCount { */ static class ExtractWordsFn extends DoFn { private final Aggregator emptyLines = - createAggregator("emptyLines", new Sum.SumLongFn()); + createAggregator("emptyLines", Sum.ofLongs()); @ProcessElement public void processElement(ProcessContext c) { diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java index e86c2aac96..e9621032e5 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java @@ -22,7 +22,6 @@ import ${package}.WordCount.CountWords; import ${package}.WordCount.ExtractWordsFn; import ${package}.WordCount.FormatAsTextFn; -import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.RunnableOnService; @@ -34,6 +33,7 @@ import org.apache.beam.sdk.values.PCollection; import org.hamcrest.CoreMatchers; import 
org.junit.Assert; +import org.junit.Rule; import org.junit.Test; import org.junit.experimental.categories.Category; import org.junit.runner.RunWith; @@ -68,12 +68,13 @@ public void testExtractWordsFn() throws Exception { static final String[] COUNTS_ARRAY = new String[] { "hi: 5", "there: 1", "sue: 2", "bob: 2"}; + @Rule + public TestPipeline p = TestPipeline.create(); + /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ @Test @Category(RunnableOnService.class) public void testCountWords() throws Exception { - Pipeline p = TestPipeline.create(); - PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); PCollection output = input.apply(new CountWords()) diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index 01e9aa951b..a73e9db9ed 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -48,7 +48,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.3 + 3.5.1 ${targetPlatform} ${targetPlatform} @@ -81,12 +81,12 @@ org.slf4j slf4j-api - 1.7.7 + 1.7.14 org.slf4j slf4j-jdk14 - 1.7.7 + 1.7.14 diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index 5530617bfe..ea55b72c91 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -48,7 +48,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.3 + 3.5.1 1.7 1.7 @@ -81,12 +81,12 @@ org.slf4j slf4j-api - 1.7.7 + 1.7.14 org.slf4j slf4j-jdk14 - 1.7.7 + 1.7.14 diff --git a/pom.xml b/pom.xml index ef65d33de6..0ee36a4c19 100644 --- a/pom.xml +++ b/pom.xml @@ -68,6 +68,32 @@ + + + apache.staging + Apache 
Software Foundation Staging Repository + https://repository.apache.org/content/repositories/staging/ + + true + + + false + + + + + apache.snapshots + Apache Software Foundation Snapshot Repository + https://repository.apache.org/content/repositories/snapshots/ + + false + + + true + + + + 3.0.3 @@ -77,7 +103,7 @@ ${maven.build.timestamp} yyyy-MM-dd HH:mm - 0.4.0 + 0.5.0 Google Cloud Dataflow SDK for Java ${project.version}-20170103 From 2ea5a233e1e895cf4f8babf9924d2b88cffbc5e9 Mon Sep 17 00:00:00 2001 From: Sam McVeety Date: Thu, 2 Feb 2017 13:24:23 -0800 Subject: [PATCH 16/77] Fix InProcessPipelineRunner to handle a null subscription --- .../dataflow/sdk/io/PubsubUnboundedSource.java | 11 ++++++----- .../dataflow/sdk/util/PubsubTestClient.java | 2 +- .../sdk/io/PubsubUnboundedSourceTest.java | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSource.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSource.java index 575fe39771..4da8ad1bad 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSource.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSource.java @@ -30,6 +30,7 @@ import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; import com.google.cloud.dataflow.sdk.options.PipelineOptions; import com.google.cloud.dataflow.sdk.options.ValueProvider; +import com.google.cloud.dataflow.sdk.options.ValueProvider.StaticValueProvider; import com.google.cloud.dataflow.sdk.transforms.Aggregator; import com.google.cloud.dataflow.sdk.transforms.Combine; import com.google.cloud.dataflow.sdk.transforms.DoFn; @@ -1290,6 +1291,7 @@ public String getIdLabel() { @Override public PCollection apply(PBegin input) { + ValueProvider subscriptionPath = subscription; if (subscription == null) { try { try (PubsubClient pubsubClient = @@ -1299,9 +1301,8 @@ public PCollection apply(PBegin input) { 
.as(DataflowPipelineOptions.class))) { checkState(project.isAccessible(), "createRandomSubscription must be called at runtime."); checkState(topic.isAccessible(), "createRandomSubscription must be called at runtime."); - SubscriptionPath subscriptionPath = - pubsubClient.createRandomSubscription( - project.get(), topic.get(), DEAULT_ACK_TIMEOUT_SEC); + subscriptionPath = StaticValueProvider.of(pubsubClient.createRandomSubscription( + project.get(), topic.get(), DEAULT_ACK_TIMEOUT_SEC)); LOG.warn("Created subscription {} to topic {}." + " Note this subscription WILL NOT be deleted when the pipeline terminates", subscription, topic); @@ -1314,7 +1315,7 @@ public PCollection apply(PBegin input) { return input.getPipeline().begin() .apply(Read.from(new PubsubSource(this))) .apply(ParDo.named("PubsubUnboundedSource.Stats") - .of(new StatsFn(pubsubFactory, subscription, - timestampLabel, idLabel))); + .of(new StatsFn(pubsubFactory, checkNotNull(subscriptionPath), + timestampLabel, idLabel))); } } diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java index c3a5a4e959..2f8a1db18c 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java @@ -372,7 +372,7 @@ public List listTopics(ProjectPath project) throws IOException { @Override public void createSubscription( TopicPath topic, SubscriptionPath subscription, int ackDeadlineSeconds) throws IOException { - throw new UnsupportedOperationException(); + return; } @Override diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java index f7e4f863de..dc48bab878 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java +++ 
b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java @@ -36,7 +36,10 @@ import com.google.cloud.dataflow.sdk.util.CoderUtils; import com.google.cloud.dataflow.sdk.util.PubsubClient; import com.google.cloud.dataflow.sdk.util.PubsubClient.IncomingMessage; +import com.google.cloud.dataflow.sdk.util.PubsubClient.OutgoingMessage; +import com.google.cloud.dataflow.sdk.util.PubsubClient.ProjectPath; import com.google.cloud.dataflow.sdk.util.PubsubClient.SubscriptionPath; +import com.google.cloud.dataflow.sdk.util.PubsubClient.TopicPath; import com.google.cloud.dataflow.sdk.util.PubsubTestClient; import com.google.cloud.dataflow.sdk.util.PubsubTestClient.PubsubTestClientFactory; @@ -60,8 +63,12 @@ */ @RunWith(JUnit4.class) public class PubsubUnboundedSourceTest { + private static final ProjectPath PROJECT = + PubsubClient.projectPathFromId("testProject"); private static final SubscriptionPath SUBSCRIPTION = PubsubClient.subscriptionPathFromName("testProject", "testSubscription"); + private static final TopicPath TOPIC = + PubsubClient.topicPathFromName("testProject", "testTopic"); private static final String DATA = "testData"; private static final long TIMESTAMP = 1234L; private static final long REQ_TIME = 6373L; @@ -320,4 +327,14 @@ public void readManyMessages() throws IOException { assertTrue(dataToMessageNum.isEmpty()); reader.close(); } + + @Test + public void testNullTopic() throws Exception { + factory = PubsubTestClient.createFactoryForPublish( + TOPIC, ImmutableList.of(), ImmutableList.of()); + TestPipeline p = TestPipeline.create(); + p.apply(new PubsubUnboundedSource<>( + clock, factory, StaticValueProvider.of(PROJECT), StaticValueProvider.of(TOPIC), + null, StringUtf8Coder.of(), TIMESTAMP_LABEL, ID_LABEL)); + } } From 669fa4308b7b61f0ef5fdde90c24e8e03bd31ed6 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Thu, 2 Feb 2017 13:38:43 -0800 Subject: [PATCH 17/77] [maven-release-plugin] prepare branch release-2.0.0-beta2 --- pom.xml 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 0ee36a4c19..004a9d1269 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.0.0-beta2 From a5d6a666b5432f309250b722e2c92c2c5531d3fc Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Thu, 2 Feb 2017 13:38:52 -0800 Subject: [PATCH 18/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index c987e94893..5bddc98bf4 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index a640cd4e5e..80d03bde09 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT ../../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index ec3bec3948..eee032d419 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT ../../pom.xml diff --git a/pom.xml b/pom.xml index 004a9d1269..f24f902820 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT @@ -54,7 +54,7 @@ 
scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.0.0-beta2 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index 511e4e0a73..b43e014a63 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT google-cloud-dataflow-java-sdk-all From 211b76d257a1481ae17cac89b9362d049c53e288 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Thu, 2 Feb 2017 13:41:58 -0800 Subject: [PATCH 19/77] Update archetype versioning after cutting 2.0.0-beta2 release branch --- .../examples/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/test/resources/projects/basic/reference/pom.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index c5e16cd789..e1ce538e52 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -125,7 +125,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index a73e9db9ed..2ba93d7291 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -74,7 +74,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT diff --git 
a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index ea55b72c91..1f06820c65 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -74,7 +74,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta2-SNAPSHOT + 2.0.0-beta3-SNAPSHOT From c827108b61a8b2145b325ec4636e64922e2695c7 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Thu, 2 Feb 2017 13:57:29 -0800 Subject: [PATCH 20/77] Update worker container image for 2.0.0-beta3-SNAPSHOT --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index f24f902820..8af9511fc5 100644 --- a/pom.xml +++ b/pom.xml @@ -106,7 +106,7 @@ 0.5.0 Google Cloud Dataflow SDK for Java - ${project.version}-20170103 + ${project.version}-20170202 6 From 5625ffbf5ab9cfee4debbc38afc6a1fe5c69c892 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Mon, 6 Feb 2017 14:48:43 -0800 Subject: [PATCH 21/77] fixups --- .../main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java index 3472a8afed..a6d9871287 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java @@ -1158,6 +1158,7 @@ private void executeQuery( .setAllowLargeResults(true) .setCreateDisposition("CREATE_IF_NEEDED") .setDestinationTable(destinationTable) + .setPriority("BATCH") .setWriteDisposition("WRITE_EMPTY"); jobService.startQueryJob(jobRef, queryConfig); @@ -1172,7 +1173,6 @@ private JobConfigurationQuery createBasicQueryConfig() { // then the similar code in BigQueryTableRowIterator#fromQuery should be 
updated. return new JobConfigurationQuery() .setFlattenResults(flattenResults) - .setPriority("BATCH") .setQuery(query.get()) .setUseLegacySql(useLegacySql); } From 9c59d78112a51451eb9e6ae64a966b0ee2677fb8 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Mon, 6 Feb 2017 14:56:50 -0800 Subject: [PATCH 22/77] fixups --- .../java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java | 2 +- .../google/cloud/dataflow/sdk/util/BigQueryServices.java | 2 +- .../cloud/dataflow/sdk/util/BigQueryServicesImpl.java | 8 +++----- .../cloud/dataflow/sdk/util/BigQueryTableRowIterator.java | 4 ++-- .../com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java | 2 +- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java index a6d9871287..f844f49aed 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java @@ -1075,7 +1075,7 @@ public long getEstimatedSizeBytes(PipelineOptions options) throws Exception { public BoundedReader createReader(PipelineOptions options) throws IOException { BigQueryOptions bqOptions = options.as(BigQueryOptions.class); return new BigQueryReader(this, bqServices.getReaderFromQuery( - bqOptions, createBasicQueryConfig(), executingProject.get())); + bqOptions, executingProject.get(), createBasicQueryConfig())); } @Override diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java index df247629f5..43232f699a 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServices.java @@ -58,7 +58,7 @@ public interface BigQueryServices extends Serializable { * Returns a real, mock, or fake {@link BigQueryJsonReader} to query tables. 
*/ BigQueryJsonReader getReaderFromQuery( - BigQueryOptions bqOptions, JobConfigurationQuery queryConfig, String projectId); + BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig); /** * An interface for the Cloud BigQuery load service. diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java index af2ed9e73b..84e718addd 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryServicesImpl.java @@ -80,8 +80,8 @@ public BigQueryJsonReader getReaderFromTable(BigQueryOptions bqOptions, TableRef @Override public BigQueryJsonReader getReaderFromQuery( - BigQueryOptions bqOptions, JobConfigurationQuery queryConfig, String projectId) { - return BigQueryJsonReaderImpl.fromQuery(bqOptions, queryConfig, projectId); + BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig) { + return BigQueryJsonReaderImpl.fromQuery(bqOptions, projectId, queryConfig); } @VisibleForTesting @@ -516,9 +516,7 @@ private BigQueryJsonReaderImpl(BigQueryTableRowIterator iterator) { } private static BigQueryJsonReader fromQuery( - BigQueryOptions bqOptions, - JobConfigurationQuery queryConfig, - String projectId) { + BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig) { return new BigQueryJsonReaderImpl( BigQueryTableRowIterator.fromQuery( queryConfig, projectId, Transport.newBigQueryClient(bqOptions).build())); diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java index a518032228..5ab5c897ed 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/BigQueryTableRowIterator.java @@ -408,8 
+408,8 @@ private void deleteDataset(String datasetId) throws IOException, InterruptedExce */ private TableReference executeQueryAndWaitForCompletion() throws IOException, InterruptedException { - checkState(projectId != null, "Cannot dryRun a query in unknown (null) project"); - checkState(queryConfig != null, "Cannot dryRun a null query"); + checkState(projectId != null, "Unable to execute a query without a configured project id"); + checkState(queryConfig != null, "Unable to execute a query without a configured query"); // Dry run query to get source table location Job dryRunJob = new Job() .setConfiguration(new JobConfiguration() diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java index 61356e18d0..7f9d2e95ce 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java @@ -183,7 +183,7 @@ public BigQueryJsonReader getReaderFromTable( @Override public BigQueryJsonReader getReaderFromQuery( - BigQueryOptions bqOptions, JobConfigurationQuery queryConfig, String projectId) { + BigQueryOptions bqOptions, String projectId, JobConfigurationQuery queryConfig) { return new FakeBigQueryReader(jsonTableRowReturns); } From 20862aa7b9a690e8025ac5c1a6756eaffb05794c Mon Sep 17 00:00:00 2001 From: Sam McVeety Date: Fri, 10 Feb 2017 17:09:34 -0800 Subject: [PATCH 23/77] Fixups --- .../google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java index dc48bab878..06979315d8 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java +++ 
b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java @@ -329,7 +329,7 @@ public void readManyMessages() throws IOException { } @Test - public void testNullTopic() throws Exception { + public void testNullSubscription() throws Exception { factory = PubsubTestClient.createFactoryForPublish( TOPIC, ImmutableList.of(), ImmutableList.of()); TestPipeline p = TestPipeline.create(); From d7a70fef5f68074fd175dabce9870d55369a6dc0 Mon Sep 17 00:00:00 2001 From: Sam McVeety Date: Tue, 14 Feb 2017 16:57:33 -0800 Subject: [PATCH 24/77] Fixups --- .../dataflow/sdk/util/PubsubTestClient.java | 39 +++++++++++++++++++ .../sdk/io/PubsubUnboundedSourceTest.java | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java index 2f8a1db18c..01831218d4 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PubsubTestClient.java @@ -107,6 +107,11 @@ private static class State { */ @Nullable Map ackDeadline; + + /** + * Whether a subscription has been created. + */ + boolean createdSubscription; } private static final State STATE = new State(); @@ -124,12 +129,40 @@ public static PubsubTestClientFactory createFactoryForPublish( final TopicPath expectedTopic, final Iterable expectedOutgoingMessages, final Iterable failingOutgoingMessages) { + return createFactoryForPublishInternal( + expectedTopic, expectedOutgoingMessages, failingOutgoingMessages, false); + } + + /** + * Return a factory for testing publishers. Only one factory may be in-flight at a time. + * The factory must be closed when the test is complete, at which point final validation will + * occur. Additionally, verify that createSubscription was called. 
+ */ + public static PubsubTestClientFactory createFactoryForPublishVerifySubscription( + final TopicPath expectedTopic, + final Iterable expectedOutgoingMessages, + final Iterable failingOutgoingMessages) { + return createFactoryForPublishInternal( + expectedTopic, expectedOutgoingMessages, failingOutgoingMessages, true); + } + + /** + * Return a factory for testing publishers. Only one factory may be in-flight at a time. + * The factory must be closed when the test is complete, at which point final validation will + * occur. + */ + public static PubsubTestClientFactory createFactoryForPublishInternal( + final TopicPath expectedTopic, + final Iterable expectedOutgoingMessages, + final Iterable failingOutgoingMessages, + final boolean verifySubscriptionCreated) { synchronized (STATE) { checkState(!STATE.isActive, "Test still in flight"); STATE.expectedTopic = expectedTopic; STATE.remainingExpectedOutgoingMessages = Sets.newHashSet(expectedOutgoingMessages); STATE.remainingFailingOutgoingMessages = Sets.newHashSet(failingOutgoingMessages); STATE.isActive = true; + STATE.createdSubscription = false; } return new PubsubTestClientFactory() { @Override @@ -148,6 +181,9 @@ public String getKind() { @Override public void close() { synchronized (STATE) { + if (verifySubscriptionCreated) { + checkState(STATE.createdSubscription, "Did not call create subscription"); + } checkState(STATE.isActive, "No test still in flight"); checkState(STATE.remainingExpectedOutgoingMessages.isEmpty(), "Still waiting for %s messages to be published", @@ -372,6 +408,9 @@ public List listTopics(ProjectPath project) throws IOException { @Override public void createSubscription( TopicPath topic, SubscriptionPath subscription, int ackDeadlineSeconds) throws IOException { + synchronized (STATE) { + STATE.createdSubscription = true; + } return; } diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java 
b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java index 06979315d8..65fdf737af 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/PubsubUnboundedSourceTest.java @@ -330,7 +330,7 @@ public void readManyMessages() throws IOException { @Test public void testNullSubscription() throws Exception { - factory = PubsubTestClient.createFactoryForPublish( + factory = PubsubTestClient.createFactoryForPublishVerifySubscription( TOPIC, ImmutableList.of(), ImmutableList.of()); TestPipeline p = TestPipeline.create(); p.apply(new PubsubUnboundedSource<>( From 4a9f16469fec467cf54efaa98a658413da231a00 Mon Sep 17 00:00:00 2001 From: gsgalloway Date: Wed, 1 Mar 2017 15:22:22 -0800 Subject: [PATCH 25/77] Small fix for BigtableIO.WriteOperation.finalize --- .../com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java index 3751d160f5..62f3b25d57 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/bigtable/BigtableIO.java @@ -976,9 +976,9 @@ public void initialize(PipelineOptions options) {} public void finalize(Iterable writerResults, PipelineOptions options) { long count = 0; for (Long value : writerResults) { - value += count; + count += value; } - logger.debug("Wrote {} elements to BigtableIO.Sink {}", sink); + logger.debug("Wrote {} elements to BigtableIO.Sink {}", count, sink); } @Override From e70747838e0af9c6d7ff0c6cb64177ba6d7ad2ea Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Fri, 17 Mar 2017 15:09:15 -0700 Subject: [PATCH 26/77] Upgrade to Apache Beam, version 0.6.0 Signed-off-by: Jason Kuster --- 
maven-archetypes/examples-java8/pom.xml | 79 ++++ .../META-INF/maven/archetype-metadata.xml | 38 ++ .../examples-java8/src/main/resources/NOTICE | 5 + .../resources/archetype-resources/pom.xml | 239 ++++++++++ .../src/main/java/DebuggingWordCount.java | 164 +++++++ .../src/main/java/MinimalWordCount.java | 118 +++++ .../src/main/java/MinimalWordCountJava8.java | 72 +++ .../src/main/java/WindowedWordCount.java | 242 ++++++++++ .../src/main/java/WordCount.java | 186 ++++++++ .../common/ExampleBigQueryTableOptions.java | 55 +++ .../src/main/java/common/ExampleOptions.java | 37 ++ ...mplePubsubTopicAndSubscriptionOptions.java | 45 ++ .../common/ExamplePubsubTopicOptions.java | 45 ++ .../src/main/java/common/ExampleUtils.java | 352 +++++++++++++++ .../java/common/WriteWindowedFilesDoFn.java | 77 ++++ .../main/java/complete/game/GameStats.java | 343 +++++++++++++++ .../java/complete/game/HourlyTeamScore.java | 196 +++++++++ .../main/java/complete/game/LeaderBoard.java | 286 ++++++++++++ .../src/main/java/complete/game/README.md | 131 ++++++ .../main/java/complete/game/UserScore.java | 243 ++++++++++ .../java/complete/game/injector/Injector.java | 414 ++++++++++++++++++ .../complete/game/injector/InjectorUtils.java | 100 +++++ .../injector/RetryHttpInitializerWrapper.java | 129 ++++++ .../complete/game/utils/WriteToBigQuery.java | 141 ++++++ .../game/utils/WriteWindowedToBigQuery.java | 71 +++ .../src/test/java/DebuggingWordCountTest.java | 52 +++ .../test/java/MinimalWordCountJava8Test.java | 104 +++++ .../src/test/java/WordCountTest.java | 86 ++++ .../java/complete/game/GameStatsTest.java | 81 ++++ .../complete/game/HourlyTeamScoreTest.java | 117 +++++ .../java/complete/game/LeaderBoardTest.java | 366 ++++++++++++++++ .../java/complete/game/UserScoreTest.java | 154 +++++++ .../projects/basic/archetype.properties | 19 + .../test/resources/projects/basic/goal.txt | 1 + maven-archetypes/examples/pom.xml | 38 +- .../resources/archetype-resources/pom.xml | 59 +-- 
.../projects/basic/archetype.properties | 2 +- maven-archetypes/pom.xml | 38 ++ maven-archetypes/starter/pom.xml | 44 +- .../resources/archetype-resources/pom.xml | 2 - .../projects/basic/archetype.properties | 2 +- .../projects/basic/reference/pom.xml | 4 +- pom.xml | 5 +- 43 files changed, 4940 insertions(+), 42 deletions(-) create mode 100644 maven-archetypes/examples-java8/pom.xml create mode 100644 maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml create mode 100644 maven-archetypes/examples-java8/src/main/resources/NOTICE create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java create mode 100644 
maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/README.md create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java create mode 100644 
maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java create mode 100644 maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties create mode 100644 maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt create mode 100644 maven-archetypes/pom.xml diff --git a/maven-archetypes/examples-java8/pom.xml b/maven-archetypes/examples-java8/pom.xml new file mode 100644 index 0000000000..3c19164933 --- /dev/null +++ b/maven-archetypes/examples-java8/pom.xml @@ -0,0 +1,79 @@ + + + + + 4.0.0 + + + com.google.cloud.dataflow + google-cloud-dataflow-java-archetypes-parent + 2.0.0-beta3-SNAPSHOT + ../pom.xml + + + google-cloud-dataflow-java-archetypes-examples-java8 + Google Cloud Dataflow SDK for Java - Java 8 Examples Archetype + Google Cloud Dataflow SDK for Java is a distribution of Apache + Beam designed to simplify usage of Apache Beam on Google Cloud Dataflow + service. This archetype creates a project containing all the example + pipelines targeting Java 8. 
+ + maven-archetype + + + + + org.apache.maven.archetype + archetype-packaging + 2.4 + + + + + + + maven-archetype-plugin + 2.4 + + + org.apache.maven.shared + maven-invoker + 2.2 + + + + + + default-integration-test + install + + integration-test + + + + + + + + + diff --git a/maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml new file mode 100644 index 0000000000..326fdaa528 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml @@ -0,0 +1,38 @@ + + + + + + + src/main/java + + **/*.java + + + + + src/test/java + + **/*.java + + + + diff --git a/maven-archetypes/examples-java8/src/main/resources/NOTICE b/maven-archetypes/examples-java8/src/main/resources/NOTICE new file mode 100644 index 0000000000..981fde5a9e --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/NOTICE @@ -0,0 +1,5 @@ +Google Cloud Dataflow SDK for Java +Copyright 2017, Google Inc. + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). 
diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml new file mode 100644 index 0000000000..0f5c2d13aa --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -0,0 +1,239 @@ + + + + 4.0.0 + + ${groupId} + ${artifactId} + ${version} + + jar + + + UTF-8 + + + + + ossrh.snapshots + Sonatype OSS Repository Hosting + https://oss.sonatype.org/content/repositories/snapshots/ + + false + + + true + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.5.1 + + 1.8 + 1.8 + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + all + 4 + true + + + + org.apache.maven.surefire + surefire-junit47 + 2.19.1 + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.4.1 + + + package + + shade + + + ${project.artifactId}-bundled-${project.version} + + + *:* + + META-INF/LICENSE + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.4.0 + + false + + + + + + + + + + com.google.cloud.dataflow + google-cloud-dataflow-java-sdk-all + 2.0.0-beta3-SNAPSHOT + + + + + com.google.api-client + google-api-client + 1.22.0 + + + + com.google.guava + guava-jdk5 + + + + + + com.google.apis + google-api-services-bigquery + v2-rev295-1.22.0 + + + + com.google.guava + guava-jdk5 + + + + + + com.google.http-client + google-http-client + 1.22.0 + + + + com.google.guava + guava-jdk5 + + + + + + com.google.apis + google-api-services-pubsub + v1-rev10-1.22.0 + + + + com.google.guava + guava-jdk5 + + + + + + joda-time + joda-time + 2.4 + + + + com.google.guava + guava + 20.0 + + + + + org.slf4j + slf4j-api + 1.7.14 + + + + org.slf4j + slf4j-jdk14 + 1.7.14 + + runtime + + + + + org.hamcrest + hamcrest-all + 1.3 + + + + junit + junit + 4.12 + + + + org.mockito + mockito-all + 1.9.5 + test + + + diff --git 
a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java new file mode 100644 index 0000000000..dd9b91decc --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}; + +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.transforms.Aggregator; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Sum; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * An example that verifies word counts in Shakespeare and includes Beam best practices. + * + *

This class, {@link DebuggingWordCount}, is the third in a series of four successively more + * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount} + * and {@link WordCount}. After you've looked at this example, then see the + * {@link WindowedWordCount} pipeline, for introduction of additional concepts. + * + *

Basic concepts, also in the MinimalWordCount and WordCount examples: + * Reading text files; counting a PCollection; executing a Pipeline both locally + * and using a selected runner; defining DoFns. + * + *

New Concepts: + *

+ *   1. Logging using SLF4J, even in a distributed environment
+ *   2. Creating a custom aggregator (runners have varying levels of support)
+ *   3. Testing your Pipeline via PAssert
+ * 
+ * + *

To execute this pipeline locally, specify general pipeline configuration: + *

{@code
+ *   --project=YOUR_PROJECT_ID
+ * }
+ * 
+ * + *

To change the runner, specify: + *

{@code
+ *   --runner=YOUR_SELECTED_RUNNER
+ * }
+ * 
+ * + *

The input file defaults to a public data set containing the text of King Lear, + * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. + * + */ +public class DebuggingWordCount { + /** A DoFn that filters for a specific key based upon a regular expression. */ + public static class FilterTextFn extends DoFn, KV> { + /** + * Concept #1: The logger below uses the fully qualified class name of FilterTextFn as the + * logger. Depending on your SLF4J configuration, log statements will likely be qualified by + * this name. + * + *

Note that this is entirely standard SLF4J usage. Some runners may provide a default SLF4J + * configuration that is most appropriate for their logging integration. + */ + private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class); + + private final Pattern filter; + public FilterTextFn(String pattern) { + filter = Pattern.compile(pattern); + } + + /** + * Concept #2: A custom aggregator can track values in your pipeline as it runs. Each + * runner provides varying levels of support for aggregators, and may expose them + * in a dashboard, etc. + */ + private final Aggregator matchedWords = + createAggregator("matchedWords", Sum.ofLongs()); + private final Aggregator unmatchedWords = + createAggregator("unmatchedWords", Sum.ofLongs()); + + @ProcessElement + public void processElement(ProcessContext c) { + if (filter.matcher(c.element().getKey()).matches()) { + // Log at the "DEBUG" level each element that we match. When executing this pipeline + // these log lines will appear only if the log level is set to "DEBUG" or lower. + LOG.debug("Matched: " + c.element().getKey()); + matchedWords.addValue(1L); + c.output(c.element()); + } else { + // Log at the "TRACE" level each element that is not matched. Different log levels + // can be used to control the verbosity of logging providing an effective mechanism + // to filter less important information. + LOG.trace("Did not match: " + c.element().getKey()); + unmatchedWords.addValue(1L); + } + } + } + + /** + * Options supported by {@link DebuggingWordCount}. + * + *

Inherits standard configuration options and all options defined in + * {@link WordCount.WordCountOptions}. + */ + public interface WordCountOptions extends WordCount.WordCountOptions { + + @Description("Regex filter pattern to use in DebuggingWordCount. " + + "Only words matching this pattern will be counted.") + @Default.String("Flourish|stomach") + String getFilterPattern(); + void setFilterPattern(String value); + } + + public static void main(String[] args) { + WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() + .as(WordCountOptions.class); + Pipeline p = Pipeline.create(options); + + PCollection> filteredWords = + p.apply("ReadLines", TextIO.Read.from(options.getInputFile())) + .apply(new WordCount.CountWords()) + .apply(ParDo.of(new FilterTextFn(options.getFilterPattern()))); + + /** + * Concept #3: PAssert is a set of convenient PTransforms in the style of + * Hamcrest's collection matchers that can be used when writing Pipeline level tests + * to validate the contents of PCollections. PAssert is best used in unit tests + * with small data sets but is demonstrated here as a teaching tool. + * + *

Below we verify that the set of filtered words matches our expected counts. Note + * that PAssert does not provide any output and that successful completion of the + * Pipeline implies that the expectations were met. Learn more at + * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test + * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test. + */ + List> expectedResults = Arrays.asList( + KV.of("Flourish", 3L), + KV.of("stomach", 1L)); + PAssert.that(filteredWords).containsInAnyOrder(expectedResults); + + p.run().waitUntilFinish(); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java new file mode 100644 index 0000000000..97bd8243b8 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}; + +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.values.KV; + + +/** + * An example that counts words in Shakespeare. + * + *

This class, {@link MinimalWordCount}, is the first in a series of four successively more + * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or + * argument processing, and focus on construction of the pipeline, which chains together the + * application of core transforms. + * + *

Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally the + * {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional + * concepts. + * + *

Concepts: + * + *

+ *   1. Reading data from text files
+ *   2. Specifying 'inline' transforms
+ *   3. Counting items in a PCollection
+ *   4. Writing data to text files
+ * 
+ * + *

No arguments are required to run this pipeline. It will be executed with the DirectRunner. You + * can see the results in the output files in your current working directory, with names like + * "wordcounts-00001-of-00005". When running on a distributed service, you would use an appropriate + * file service. + */ +public class MinimalWordCount { + + public static void main(String[] args) { + // Create a PipelineOptions object. This object lets us set various execution + // options for our pipeline, such as the runner you wish to use. This example + // will run with the DirectRunner by default, based on the class path configured + // in its dependencies. + PipelineOptions options = PipelineOptionsFactory.create(); + + // Create the Pipeline object with the options we defined above. + Pipeline p = Pipeline.create(options); + + // Apply the pipeline's transforms. + + // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set + // of input text files. TextIO.Read returns a PCollection where each element is one line from + // the input text (a set of Shakespeare's texts). + + // This example reads a public data set consisting of the complete works of Shakespeare. + p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*")) + + // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a + // DoFn (defined in-line) on each element that tokenizes the text line into individual words. + // The ParDo returns a PCollection, where each element is an individual word in + // Shakespeare's collected texts. + .apply("ExtractWords", ParDo.of(new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) { + for (String word : c.element().split("[^a-zA-Z']+")) { + if (!word.isEmpty()) { + c.output(word); + } + } + } + })) + + // Concept #3: Apply the Count transform to our PCollection of individual words.
The Count + // transform returns a new PCollection of key/value pairs, where each key represents a unique + // word in the text. The associated value is the occurrence count for that word. + .apply(Count.perElement()) + + // Apply a MapElements transform that formats our PCollection of word counts into a printable + // string, suitable for writing to an output file. + .apply("FormatResults", MapElements.via(new SimpleFunction, String>() { + @Override + public String apply(KV input) { + return input.getKey() + ": " + input.getValue(); + } + })) + + // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline. + // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of + // formatted strings) to a series of text files. + // + // By default, it will write to a set of files with names like wordcounts-00001-of-00005 + .apply(TextIO.Write.to("wordcounts")); + + // Run the pipeline. + p.run().waitUntilFinish(); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java new file mode 100644 index 0000000000..532ca352ff --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}; + +import java.util.Arrays; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.FlatMapElements; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TypeDescriptors; + +/** + * An example that counts words in Shakespeare, using Java 8 language features. + * + *

See {@link MinimalWordCount} for a comprehensive explanation. + */ +public class MinimalWordCountJava8 { + + public static void main(String[] args) { + PipelineOptions options = PipelineOptionsFactory.create(); + // In order to run your pipeline, you need to make following runner specific changes: + // + // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner + // or FlinkRunner. + // CHANGE 2/3: Specify runner-required options. + // For BlockingDataflowRunner, set project and temp location as follows: + // DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class); + // dataflowOptions.setRunner(BlockingDataflowRunner.class); + // dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE"); + // dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY"); + // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions} + // for more details. + // options.as(FlinkPipelineOptions.class) + // .setRunner(FlinkRunner.class); + + Pipeline p = Pipeline.create(options); + + p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*")) + .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))) + .withOutputType(TypeDescriptors.strings())) + .apply(Filter.by((String word) -> !word.isEmpty())) + .apply(Count.perElement()) + .apply(MapElements + .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()) + .withOutputType(TypeDescriptors.strings())) + + // CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to. 
+ .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX")); + + p.run().waitUntilFinish(); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java new file mode 100644 index 0000000000..052d7b6a0e --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}; + +import java.io.IOException; +import java.util.concurrent.ThreadLocalRandom; +import ${package}.common.ExampleBigQueryTableOptions; +import ${package}.common.ExampleOptions; +import ${package}.common.WriteWindowedFilesDoFn; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.DefaultValueFactory; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.GroupByKey; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.joda.time.Duration; +import org.joda.time.Instant; + + +/** + * An example that counts words in text, and can run over either unbounded or bounded input + * collections. + * + *

This class, {@link WindowedWordCount}, is the last in a series of four successively more + * detailed 'word count' examples. First take a look at {@link MinimalWordCount}, + * {@link WordCount}, and {@link DebuggingWordCount}. + * + *

Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples: + * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally + * and using a selected runner; defining DoFns; creating a custom aggregator; + * user-defined PTransforms; defining PipelineOptions. + * + *

New Concepts: + *

+ *   1. Unbounded and bounded pipeline input modes
+ *   2. Adding timestamps to data
+ *   3. Windowing
+ *   4. Re-using PTransforms over windowed PCollections
+ *   5. Accessing the window of an element
+ *   6. Writing data to per-window text files
+ * 
+ * + *

By default, the examples will run with the {@code DirectRunner}. + * To change the runner, specify: + *

{@code
+ *   --runner=YOUR_SELECTED_RUNNER
+ * }
+ * 
+ * See examples/java/README.md for instructions about how to configure different runners. + * + *

To execute this pipeline locally, specify a local output file (if using the + * {@code DirectRunner}) or output prefix on a supported distributed file system. + *

{@code
+ *   --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
+ * }
+ * + *

The input file defaults to a public data set containing the text of King Lear, + * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. + * + *

By default, the pipeline will do fixed windowing, on 10-minute windows. You can + * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=15} + * for 15-minute windows. + * + *

The example will try to cancel the pipeline on the signal to terminate the process (CTRL-C). + */ +public class WindowedWordCount { + static final int WINDOW_SIZE = 10; // Default window duration in minutes + /** + * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for + * this example, for the bounded data case. + * + *

Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate + * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a + * 2-hour period. + */ + static class AddTimestampFn extends DoFn { + private static final Duration RAND_RANGE = Duration.standardHours(1); + private final Instant minTimestamp; + private final Instant maxTimestamp; + + AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) { + this.minTimestamp = minTimestamp; + this.maxTimestamp = maxTimestamp; + } + + @ProcessElement + public void processElement(ProcessContext c) { + Instant randomTimestamp = + new Instant( + ThreadLocalRandom.current() + .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis())); + + /** + * Concept #2: Set the data element with that timestamp. + */ + c.outputWithTimestamp(c.element(), new Instant(randomTimestamp)); + } + } + + /** A {@link DefaultValueFactory} that returns the current system time. */ + public static class DefaultToCurrentSystemTime implements DefaultValueFactory { + @Override + public Long create(PipelineOptions options) { + return System.currentTimeMillis(); + } + } + + /** A {@link DefaultValueFactory} that returns the minimum timestamp plus one hour. */ + public static class DefaultToMinTimestampPlusOneHour implements DefaultValueFactory { + @Override + public Long create(PipelineOptions options) { + return options.as(Options.class).getMinTimestampMillis() + + Duration.standardHours(1).getMillis(); + } + } + + /** + * Options for {@link WindowedWordCount}. + * + *

Inherits standard example configuration options, which allow specification of the + * runner, as well as the {@link WordCount.WordCountOptions} support for + * specification of the input and output files. + */ + public interface Options extends WordCount.WordCountOptions, + ExampleOptions, ExampleBigQueryTableOptions { + @Description("Fixed window duration, in minutes") + @Default.Integer(WINDOW_SIZE) + Integer getWindowSize(); + void setWindowSize(Integer value); + + @Description("Minimum randomly assigned timestamp, in milliseconds-since-epoch") + @Default.InstanceFactory(DefaultToCurrentSystemTime.class) + Long getMinTimestampMillis(); + void setMinTimestampMillis(Long value); + + @Description("Maximum randomly assigned timestamp, in milliseconds-since-epoch") + @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class) + Long getMaxTimestampMillis(); + void setMaxTimestampMillis(Long value); + } + + public static void main(String[] args) throws IOException { + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + final String output = options.getOutput(); + final Duration windowSize = Duration.standardMinutes(options.getWindowSize()); + final Instant minTimestamp = new Instant(options.getMinTimestampMillis()); + final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis()); + + Pipeline pipeline = Pipeline.create(options); + + /** + * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or + * unbounded input source. + */ + PCollection input = pipeline + /** Read from the GCS file. */ + .apply(TextIO.Read.from(options.getInputFile())) + // Concept #2: Add an element timestamp, using an artificial time just to show windowing. + // See AddTimestampFn for more detail on this. + .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp))); + + /** + * Concept #3: Window into fixed windows. 
The fixed window size for this example defaults to 10 + * minutes (you can change this with a command-line option). See the documentation for more + * information on how fixed windows work, and for information on the other types of windowing + * available (e.g., sliding windows). + */ + PCollection windowedWords = + input.apply( + Window.into( + FixedWindows.of(Duration.standardMinutes(options.getWindowSize())))); + + /** + * Concept #4: Re-use our existing CountWords transform that does not have knowledge of + * windows over a PCollection containing windowed values. + */ + PCollection> wordCounts = windowedWords.apply(new WordCount.CountWords()); + + /** + * Concept #5: Customize the output format using windowing information + * + *

At this point, the data is organized by window. We're writing text files and have no + * late data, so for simplicity we can use the window as the key and {@link GroupByKey} to get + * one output file per window. (If we had late data, this key would not be unique.) + * + *

To access the window in a {@link DoFn}, add a {@link BoundedWindow} parameter. This will + * be automatically detected and populated with the window for the current element. + */ + PCollection>> keyedByWindow = + wordCounts.apply( + ParDo.of( + new DoFn, KV>>() { + @ProcessElement + public void processElement(ProcessContext context, IntervalWindow window) { + context.output(KV.of(window, context.element())); + } + })); + + /** + * Concept #6: Format the results and write to a sharded file partitioned by window, using a + * simple ParDo operation. Because there may be failures followed by retries, the + * writes must be idempotent, but the details of writing to files is elided here. + */ + keyedByWindow + .apply(GroupByKey.>create()) + .apply(ParDo.of(new WriteWindowedFilesDoFn(output))); + + PipelineResult result = pipeline.run(); + try { + result.waitUntilFinish(); + } catch (Exception exc) { + result.cancel(); + } + } + +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java new file mode 100644 index 0000000000..b3ef26c493 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}; + +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.Validation.Required; +import org.apache.beam.sdk.transforms.Aggregator; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.SimpleFunction; +import org.apache.beam.sdk.transforms.Sum; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; + +/** + * An example that counts words in Shakespeare and includes Beam best practices. + * + *

This class, {@link WordCount}, is the second in a series of four successively more detailed + * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. + * After you've looked at this example, then see the {@link DebuggingWordCount} + * pipeline, for introduction of additional concepts. + * + *

For a detailed walkthrough of this example, see + * + * http://beam.apache.org/use/walkthroughs/ + * + * + *

Basic concepts, also in the MinimalWordCount example: + * Reading text files; counting a PCollection; writing to text files + * + *

New Concepts: + *

+ *   1. Executing a Pipeline both locally and using the selected runner
+ *   2. Using ParDo with static DoFns defined out-of-line
+ *   3. Building a composite transform
+ *   4. Defining your own pipeline options
+ * 
+ * + *

Concept #1: you can execute this pipeline either locally or by selecting another runner. + * These are now command-line options and not hard-coded as they were in the MinimalWordCount + * example. + * + *

To change the runner, specify: + *

{@code
+ *   --runner=YOUR_SELECTED_RUNNER
+ * }
+ * 
+ * + *

To execute this pipeline, specify a local output file (if using the + * {@code DirectRunner}) or output prefix on a supported distributed file system. + *

{@code
+ *   --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
+ * }
+ * + *

The input file defaults to a public data set containing the text of of King Lear, + * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. + */ +public class WordCount { + + /** + * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns + * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it + * to a ParDo in the pipeline. + */ + static class ExtractWordsFn extends DoFn { + private final Aggregator emptyLines = + createAggregator("emptyLines", Sum.ofLongs()); + + @ProcessElement + public void processElement(ProcessContext c) { + if (c.element().trim().isEmpty()) { + emptyLines.addValue(1L); + } + + // Split the line into words. + String[] words = c.element().split("[^a-zA-Z']+"); + + // Output each word encountered into the output PCollection. + for (String word : words) { + if (!word.isEmpty()) { + c.output(word); + } + } + } + } + + /** A SimpleFunction that converts a Word and Count into a printable string. */ + public static class FormatAsTextFn extends SimpleFunction, String> { + @Override + public String apply(KV input) { + return input.getKey() + ": " + input.getValue(); + } + } + + /** + * A PTransform that converts a PCollection containing lines of text into a PCollection of + * formatted word counts. + * + *

Concept #3: This is a custom composite transform that bundles two transforms (ParDo and + * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse, + * modular testing, and an improved monitoring experience. + */ + public static class CountWords extends PTransform, + PCollection>> { + @Override + public PCollection> expand(PCollection lines) { + + // Convert lines of text into individual words. + PCollection words = lines.apply( + ParDo.of(new ExtractWordsFn())); + + // Count the number of times each word occurs. + PCollection> wordCounts = + words.apply(Count.perElement()); + + return wordCounts; + } + } + + /** + * Options supported by {@link WordCount}. + * + *

Concept #4: Defining your own configuration options. Here, you can add your own arguments + * to be processed by the command-line parser, and specify default values for them. You can then + * access the options values in your pipeline code. + * + *

Inherits standard configuration options. + */ + public interface WordCountOptions extends PipelineOptions { + + /** + * By default, this example reads from a public dataset containing the text of + * King Lear. Set this option to choose a different input file or glob. + */ + @Description("Path of the file to read from") + @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") + String getInputFile(); + void setInputFile(String value); + + /** + * Set this required option to specify where to write the output. + */ + @Description("Path of the file to write to") + @Required + String getOutput(); + void setOutput(String value); + } + + public static void main(String[] args) { + WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() + .as(WordCountOptions.class); + Pipeline p = Pipeline.create(options); + + // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the + // static FormatAsTextFn() to the ParDo transform. + p.apply("ReadLines", TextIO.Read.from(options.getInputFile())) + .apply(new CountWords()) + .apply(MapElements.via(new FormatAsTextFn())) + .apply("WriteCounts", TextIO.Write.to(options.getOutput())); + + p.run().waitUntilFinish(); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java new file mode 100644 index 0000000000..6b51074f44 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.common; + +import com.google.api.services.bigquery.model.TableSchema; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.DefaultValueFactory; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.GcpOptions; +import org.apache.beam.sdk.options.PipelineOptions; + +/** + * Options that can be used to configure BigQuery tables in Beam examples. + * The project defaults to the project being used to run the example. + */ +public interface ExampleBigQueryTableOptions extends GcpOptions { + @Description("BigQuery dataset name") + @Default.String("beam_examples") + String getBigQueryDataset(); + void setBigQueryDataset(String dataset); + + @Description("BigQuery table name") + @Default.InstanceFactory(BigQueryTableFactory.class) + String getBigQueryTable(); + void setBigQueryTable(String table); + + @Description("BigQuery table schema") + TableSchema getBigQuerySchema(); + void setBigQuerySchema(TableSchema schema); + + /** + * Returns the job name as the default BigQuery table name. 
+ */ + class BigQueryTableFactory implements DefaultValueFactory { + @Override + public String create(PipelineOptions options) { + return options.getJobName().replace('-', '_'); + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java new file mode 100644 index 0000000000..90f935c3ce --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.common; + +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptions; + +/** + * Options that can be used to configure the Beam examples. 
+ */ +public interface ExampleOptions extends PipelineOptions { + @Description("Whether to keep jobs running after local process exit") + @Default.Boolean(false) + boolean getKeepJobsRunning(); + void setKeepJobsRunning(boolean keepJobsRunning); + + @Description("Number of workers to use when executing the injector pipeline") + @Default.Integer(1) + int getInjectorNumWorkers(); + void setInjectorNumWorkers(int numWorkers); +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java new file mode 100644 index 0000000000..daeb398f7f --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.common; + +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.DefaultValueFactory; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.GcpOptions; +import org.apache.beam.sdk.options.PipelineOptions; + +/** + * Options that can be used to configure Pub/Sub topic/subscription in Beam examples. + */ +public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions { + @Description("Pub/Sub subscription") + @Default.InstanceFactory(PubsubSubscriptionFactory.class) + String getPubsubSubscription(); + void setPubsubSubscription(String subscription); + + /** + * Returns a default Pub/Sub subscription based on the project and the job names. + */ + class PubsubSubscriptionFactory implements DefaultValueFactory { + @Override + public String create(PipelineOptions options) { + return "projects/" + options.as(GcpOptions.class).getProject() + + "/subscriptions/" + options.getJobName(); + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java new file mode 100644 index 0000000000..936bff5675 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.common; + +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.DefaultValueFactory; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.GcpOptions; +import org.apache.beam.sdk.options.PipelineOptions; + +/** + * Options that can be used to configure Pub/Sub topic in Beam examples. + */ +public interface ExamplePubsubTopicOptions extends GcpOptions { + @Description("Pub/Sub topic") + @Default.InstanceFactory(PubsubTopicFactory.class) + String getPubsubTopic(); + void setPubsubTopic(String topic); + + /** + * Returns a default Pub/Sub topic based on the project and the job names. + */ + class PubsubTopicFactory implements DefaultValueFactory { + @Override + public String create(PipelineOptions options) { + return "projects/" + options.as(GcpOptions.class).getProject() + + "/topics/" + options.getJobName(); + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java new file mode 100644 index 0000000000..570b3827b7 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.common; + +import com.google.api.client.googleapis.json.GoogleJsonResponseException; +import com.google.api.client.googleapis.services.AbstractGoogleClientRequest; +import com.google.api.client.util.BackOff; +import com.google.api.client.util.BackOffUtils; +import com.google.api.client.util.Sleeper; +import com.google.api.services.bigquery.Bigquery; +import com.google.api.services.bigquery.Bigquery.Datasets; +import com.google.api.services.bigquery.Bigquery.Tables; +import com.google.api.services.bigquery.model.Dataset; +import com.google.api.services.bigquery.model.DatasetReference; +import com.google.api.services.bigquery.model.Table; +import com.google.api.services.bigquery.model.TableReference; +import com.google.api.services.bigquery.model.TableSchema; +import com.google.api.services.pubsub.Pubsub; +import com.google.api.services.pubsub.model.Subscription; +import com.google.api.services.pubsub.model.Topic; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.google.common.util.concurrent.Uninterruptibles; +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.options.BigQueryOptions; +import org.apache.beam.sdk.options.PipelineOptions; +import 
org.apache.beam.sdk.options.PubsubOptions; +import org.apache.beam.sdk.util.FluentBackoff; +import org.apache.beam.sdk.util.Transport; +import org.joda.time.Duration; + +/** + * The utility class that sets up and tears down external resources, + * and cancels the streaming pipelines once the program terminates. + * + *

It is used to run Beam examples. + */ +public class ExampleUtils { + + private static final int SC_NOT_FOUND = 404; + + private final PipelineOptions options; + private Bigquery bigQueryClient = null; + private Pubsub pubsubClient = null; + private Set pipelinesToCancel = Sets.newHashSet(); + private List pendingMessages = Lists.newArrayList(); + + /** + * Do resources and runner options setup. + */ + public ExampleUtils(PipelineOptions options) { + this.options = options; + } + + /** + * Sets up external resources that are required by the example, + * such as Pub/Sub topics and BigQuery tables. + * + * @throws IOException if there is a problem setting up the resources + */ + public void setup() throws IOException { + Sleeper sleeper = Sleeper.DEFAULT; + BackOff backOff = + FluentBackoff.DEFAULT + .withMaxRetries(3).withInitialBackoff(Duration.millis(200)).backoff(); + Throwable lastException = null; + try { + do { + try { + setupPubsub(); + setupBigQueryTable(); + return; + } catch (GoogleJsonResponseException e) { + lastException = e; + } + } while (BackOffUtils.next(sleeper, backOff)); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + // Interrupt status restored; fall through and report the last failure + } + throw new RuntimeException(lastException); + } + + /** + * Sets up the Google Cloud Pub/Sub topic. + * + *

If the topic doesn't exist, a new topic with the given name will be created. + * + * @throws IOException if there is a problem setting up the Pub/Sub topic + */ + public void setupPubsub() throws IOException { + ExamplePubsubTopicAndSubscriptionOptions pubsubOptions = + options.as(ExamplePubsubTopicAndSubscriptionOptions.class); + if (!pubsubOptions.getPubsubTopic().isEmpty()) { + pendingMessages.add("**********************Set Up Pubsub************************"); + setupPubsubTopic(pubsubOptions.getPubsubTopic()); + pendingMessages.add("The Pub/Sub topic has been set up for this example: " + + pubsubOptions.getPubsubTopic()); + + if (!pubsubOptions.getPubsubSubscription().isEmpty()) { + setupPubsubSubscription( + pubsubOptions.getPubsubTopic(), pubsubOptions.getPubsubSubscription()); + pendingMessages.add("The Pub/Sub subscription has been set up for this example: " + + pubsubOptions.getPubsubSubscription()); + } + } + } + + /** + * Sets up the BigQuery table with the given schema. + * + *

If the table already exists, the schema has to match the given one. Otherwise, the example + * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema + * will be created. + * + * @throws IOException if there is a problem setting up the BigQuery table + */ + public void setupBigQueryTable() throws IOException { + ExampleBigQueryTableOptions bigQueryTableOptions = + options.as(ExampleBigQueryTableOptions.class); + if (bigQueryTableOptions.getBigQueryDataset() != null + && bigQueryTableOptions.getBigQueryTable() != null + && bigQueryTableOptions.getBigQuerySchema() != null) { + pendingMessages.add("******************Set Up Big Query Table*******************"); + setupBigQueryTable(bigQueryTableOptions.getProject(), + bigQueryTableOptions.getBigQueryDataset(), + bigQueryTableOptions.getBigQueryTable(), + bigQueryTableOptions.getBigQuerySchema()); + pendingMessages.add("The BigQuery table has been set up for this example: " + + bigQueryTableOptions.getProject() + + ":" + bigQueryTableOptions.getBigQueryDataset() + + "." + bigQueryTableOptions.getBigQueryTable()); + } + } + + /** + * Tears down external resources that can be deleted upon the example's completion.
+ */ + private void tearDown() { + pendingMessages.add("*************************Tear Down*************************"); + ExamplePubsubTopicAndSubscriptionOptions pubsubOptions = + options.as(ExamplePubsubTopicAndSubscriptionOptions.class); + if (!pubsubOptions.getPubsubTopic().isEmpty()) { + try { + deletePubsubTopic(pubsubOptions.getPubsubTopic()); + pendingMessages.add("The Pub/Sub topic has been deleted: " + + pubsubOptions.getPubsubTopic()); + } catch (IOException e) { + pendingMessages.add("Failed to delete the Pub/Sub topic : " + + pubsubOptions.getPubsubTopic()); + } + if (!pubsubOptions.getPubsubSubscription().isEmpty()) { + try { + deletePubsubSubscription(pubsubOptions.getPubsubSubscription()); + pendingMessages.add("The Pub/Sub subscription has been deleted: " + + pubsubOptions.getPubsubSubscription()); + } catch (IOException e) { + pendingMessages.add("Failed to delete the Pub/Sub subscription : " + + pubsubOptions.getPubsubSubscription()); + } + } + } + + ExampleBigQueryTableOptions bigQueryTableOptions = + options.as(ExampleBigQueryTableOptions.class); + if (bigQueryTableOptions.getBigQueryDataset() != null + && bigQueryTableOptions.getBigQueryTable() != null + && bigQueryTableOptions.getBigQuerySchema() != null) { + pendingMessages.add("The BigQuery table might contain the example's output, " + + "and it is not deleted automatically: " + + bigQueryTableOptions.getProject() + + ":" + bigQueryTableOptions.getBigQueryDataset() + + "." + bigQueryTableOptions.getBigQueryTable()); + pendingMessages.add("Please go to the Developers Console to delete it manually."
+ + " Otherwise, you may be charged for its usage."); + } + } + + private void setupBigQueryTable(String projectId, String datasetId, String tableId, + TableSchema schema) throws IOException { + if (bigQueryClient == null) { + bigQueryClient = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build(); + } + + Datasets datasetService = bigQueryClient.datasets(); + if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) { + Dataset newDataset = new Dataset().setDatasetReference( + new DatasetReference().setProjectId(projectId).setDatasetId(datasetId)); + datasetService.insert(projectId, newDataset).execute(); + } + + Tables tableService = bigQueryClient.tables(); + Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId)); + if (table == null) { + Table newTable = new Table().setSchema(schema).setTableReference( + new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId)); + tableService.insert(projectId, datasetId, newTable).execute(); + } else if (!table.getSchema().equals(schema)) { + throw new RuntimeException( + "Table exists and schemas do not match, expecting: " + schema.toPrettyString() + + ", actual: " + table.getSchema().toPrettyString()); + } + } + + private void setupPubsubTopic(String topic) throws IOException { + if (pubsubClient == null) { + pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + } + if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null) { + pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute(); + } + } + + private void setupPubsubSubscription(String topic, String subscription) throws IOException { + if (pubsubClient == null) { + pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + } + if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) == null) { + Subscription subInfo = new Subscription()
+ .setAckDeadlineSeconds(60) + .setTopic(topic); + pubsubClient.projects().subscriptions().create(subscription, subInfo).execute(); + } + } + + /** + * Deletes the Google Cloud Pub/Sub topic. + * + * @throws IOException if there is a problem deleting the Pub/Sub topic + */ + private void deletePubsubTopic(String topic) throws IOException { + if (pubsubClient == null) { + pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + } + if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null) { + pubsubClient.projects().topics().delete(topic).execute(); + } + } + + /** + * Deletes the Google Cloud Pub/Sub subscription. + * + * @throws IOException if there is a problem deleting the Pub/Sub subscription + */ + private void deletePubsubSubscription(String subscription) throws IOException { + if (pubsubClient == null) { + pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + } + if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) != null) { + pubsubClient.projects().subscriptions().delete(subscription).execute(); + } + } + + /** + * Waits for the pipeline to finish and cancels it before the program exits. + */ + public void waitToFinish(PipelineResult result) { + pipelinesToCancel.add(result); + if (!options.as(ExampleOptions.class).getKeepJobsRunning()) { + addShutdownHook(pipelinesToCancel); + } + try { + result.waitUntilFinish(); + } catch (UnsupportedOperationException e) { + // Do nothing if the given PipelineResult doesn't support waitUntilFinish(), + // such as EvaluationResults returned by DirectRunner.
+ tearDown(); + printPendingMessages(); + } catch (Exception e) { + throw new RuntimeException("Failed to wait the pipeline until finish: " + result, e); + } + } + + private void addShutdownHook(final Collection pipelineResults) { + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + tearDown(); + printPendingMessages(); + for (PipelineResult pipelineResult : pipelineResults) { + try { + pipelineResult.cancel(); + } catch (IOException e) { + System.out.println("Failed to cancel the job."); + System.out.println(e.getMessage()); + } + } + + for (PipelineResult pipelineResult : pipelineResults) { + boolean cancellationVerified = false; + for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) { + if (pipelineResult.getState().isTerminal()) { + cancellationVerified = true; + break; + } else { + System.out.println( + "The example pipeline is still running. Verifying the cancellation."); + } + Uninterruptibles.sleepUninterruptibly(10, TimeUnit.SECONDS); + } + if (!cancellationVerified) { + System.out.println("Failed to verify the cancellation for job: " + pipelineResult); + } + } + } + }); + } + + private void printPendingMessages() { + System.out.println(); + System.out.println("***********************************************************"); + System.out.println("***********************************************************"); + for (String message : pendingMessages) { + System.out.println(message); + } + System.out.println("***********************************************************"); + System.out.println("***********************************************************"); + } + + private static T executeNullIfNotFound( + AbstractGoogleClientRequest request) throws IOException { + try { + return request.execute(); + } catch (GoogleJsonResponseException e) { + if (e.getStatusCode() == SC_NOT_FOUND) { + return null; + } else { + throw e; + } + } + } +} diff --git
a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java new file mode 100644 index 0000000000..a08e6a9b0f --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.common; + +import com.google.common.annotations.VisibleForTesting; +import java.io.OutputStream; +import java.nio.channels.Channels; +import java.nio.charset.StandardCharsets; +import org.apache.beam.sdk.coders.Coder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.util.IOChannelFactory; +import org.apache.beam.sdk.util.IOChannelUtils; +import org.apache.beam.sdk.values.KV; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.ISODateTimeFormat; + +/** + * A {@link DoFn} that writes elements to files with names deterministically derived from the lower + * and upper bounds of their key (an {@link IntervalWindow}). + * + *

This is test utility code, not for end-users, so examples can be focused + * on their primary lessons. + */ +public class WriteWindowedFilesDoFn + extends DoFn>>, Void> { + + static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8); + static final Coder STRING_CODER = StringUtf8Coder.of(); + + private static DateTimeFormatter formatter = ISODateTimeFormat.hourMinute(); + + private final String output; + + public WriteWindowedFilesDoFn(String output) { + this.output = output; + } + + @VisibleForTesting + public static String fileForWindow(String output, IntervalWindow window) { + return String.format( + "%s-%s-%s", output, formatter.print(window.start()), formatter.print(window.end())); + } + + @ProcessElement + public void processElement(ProcessContext context) throws Exception { + // Build a file name from the window + IntervalWindow window = context.element().getKey(); + String outputShard = fileForWindow(output, window); + + // Open the file and write all the values; try-with-resources closes it even on failure + IOChannelFactory factory = IOChannelUtils.getFactory(outputShard); + try (OutputStream out = Channels.newOutputStream(factory.create(outputShard, "text/plain"))) { + for (KV wordCount : context.element().getValue()) { + STRING_CODER.encode( + wordCount.getKey() + ": " + wordCount.getValue(), out, Coder.Context.OUTER); + out.write(NEWLINE); + } + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java new file mode 100644 index 0000000000..9877844058 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java @@ -0,0 +1,343 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.complete.game; + +import java.util.HashMap; +import java.util.Map; +import java.util.TimeZone; +import ${package}.common.ExampleUtils; +import ${package}.complete.game.utils.WriteWindowedToBigQuery; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.PubsubIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Aggregator; +import org.apache.beam.sdk.transforms.Combine; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.Mean; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Sum; +import org.apache.beam.sdk.transforms.Values; +import org.apache.beam.sdk.transforms.View; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.OutputTimeFns; +import 
org.apache.beam.sdk.transforms.windowing.Sessions; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionView; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.joda.time.DateTimeZone; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class is the fourth in a series of four pipelines that tell a story in a 'gaming' + * domain, following {@link UserScore}, {@link HourlyTeamScore}, and {@link LeaderBoard}. + * New concepts: session windows and finding session duration; use of both + * singleton and non-singleton side inputs. + * + *

This pipeline builds on the {@link LeaderBoard} functionality, and adds some "business + * intelligence" analysis: abuse detection and usage patterns. The pipeline derives the Mean user + * score sum for a window, and uses that information to identify likely spammers/robots. (The robots + * have a higher click rate than the human users). The 'robot' users are then filtered out when + * calculating the team scores. + * + *

Additionally, user sessions are tracked: that is, we find bursts of user activity using + * session windows. Then, the mean session duration information is recorded in the context of + * subsequent fixed windowing. (This could be used to tell us what games are giving us greater + * user retention). + * + *

Run {@code org.apache.beam.examples.complete.game.injector.Injector} to generate + * pubsub data for this pipeline. The {@code Injector} documentation provides more detail. + * + *

To execute this pipeline using the Dataflow service, specify the pipeline configuration + * like this: + *

{@code
+ *   --project=YOUR_PROJECT_ID
+ *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
+ *   --runner=BlockingDataflowRunner
+ *   --dataset=YOUR-DATASET
+ *   --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
+ * }
+ * 
+ * where the BigQuery dataset you specify must already exist. The PubSub topic you specify should + * be the same topic to which the Injector is publishing. + */ +public class GameStats extends LeaderBoard { + + private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; + + private static DateTimeFormatter fmt = + DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + + /** + * Filter out all but those users with a high clickrate, which we will consider as 'spammy' users. + * We do this by finding the mean total score per user, then using that information as a side + * input to filter out all but those user scores that are larger than + * {@code (mean * SCORE_WEIGHT)}. + */ + // [START DocInclude_AbuseDetect] + public static class CalculateSpammyUsers + extends PTransform>, PCollection>> { + private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class); + private static final double SCORE_WEIGHT = 2.5; + + @Override + public PCollection> expand(PCollection> userScores) { + + // Get the sum of scores for each user. + PCollection> sumScores = userScores + .apply("UserSum", Sum.integersPerKey()); + + // Extract the score from each element, and use it to find the global mean. + final PCollectionView globalMeanScore = sumScores.apply(Values.create()) + .apply(Mean.globally().asSingletonView()); + + // Filter the user sums using the global mean.
+ PCollection> filtered = sumScores + .apply("ProcessAndFilter", ParDo + // use the derived mean total score as a side input + .withSideInputs(globalMeanScore) + .of(new DoFn, KV>() { + private final Aggregator numSpammerUsers = + createAggregator("SpammerUsers", Sum.ofLongs()); + @ProcessElement + public void processElement(ProcessContext c) { + Integer score = c.element().getValue(); + Double gmc = c.sideInput(globalMeanScore); + if (score > (gmc * SCORE_WEIGHT)) { + LOG.info("user " + c.element().getKey() + " spammer score " + score + + " with mean " + gmc); + numSpammerUsers.addValue(1L); + c.output(c.element()); + } + } + })); + return filtered; + } + } + // [END DocInclude_AbuseDetect] + + /** + * Calculate and output a session's duration in whole minutes, derived from its window bounds. + */ + private static class UserSessionInfoFn extends DoFn, Integer> { + @ProcessElement + public void processElement(ProcessContext c, BoundedWindow window) { + IntervalWindow w = (IntervalWindow) window; + int duration = new Duration( + w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes(); + c.output(duration); + } + } + + + /** + * Options supported by {@link GameStats}.
+ */ + interface Options extends LeaderBoard.Options { + @Description("Numeric value of fixed window duration for user analysis, in minutes") + @Default.Integer(60) + Integer getFixedWindowDuration(); + void setFixedWindowDuration(Integer value); + + @Description("Numeric value of gap between user sessions, in minutes") + @Default.Integer(5) + Integer getSessionGap(); + void setSessionGap(Integer value); + + @Description("Numeric value of fixed window for finding mean of user session duration, " + + "in minutes") + @Default.Integer(30) + Integer getUserActivityWindowDuration(); + void setUserActivityWindowDuration(Integer value); + + @Description("Prefix used for the BigQuery table names") + @Default.String("game_stats") + String getGameStatsTablePrefix(); + void setGameStatsTablePrefix(String value); + } + + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is used to write information about team score sums. + */ + protected static Map>> + configureWindowedWrite() { + Map>> tableConfigure = + new HashMap>>(); + tableConfigure.put( + "team", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", (c, w) -> c.element().getKey())); + tableConfigure.put( + "total_score", + new WriteWindowedToBigQuery.FieldInfo>( + "INTEGER", (c, w) -> c.element().getValue())); + tableConfigure.put( + "window_start", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", + (c, w) -> { + IntervalWindow window = (IntervalWindow) w; + return fmt.print(window.start()); + })); + tableConfigure.put( + "processing_time", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", (c, w) -> fmt.print(Instant.now()))); + return tableConfigure; + } + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is used to write information about mean user session time.
+ */ + protected static Map> + configureSessionWindowWrite() { + + Map> tableConfigure = + new HashMap>(); + tableConfigure.put( + "window_start", + new WriteWindowedToBigQuery.FieldInfo( + "STRING", + (c, w) -> { + IntervalWindow window = (IntervalWindow) w; + return fmt.print(window.start()); + })); + tableConfigure.put( + "mean_duration", + new WriteWindowedToBigQuery.FieldInfo("FLOAT", (c, w) -> c.element())); + return tableConfigure; + } + + + + public static void main(String[] args) throws Exception { + + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + // Enforce that this pipeline is always run in streaming mode. + options.setStreaming(true); + ExampleUtils exampleUtils = new ExampleUtils(options); + Pipeline pipeline = Pipeline.create(options); + + // Read Events from Pub/Sub using custom timestamps + PCollection rawEvents = pipeline + .apply(PubsubIO.read() + .timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()) + .withCoder(StringUtf8Coder.of())) + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); + + // Extract username/score pairs from the event stream + PCollection> userEvents = + rawEvents.apply("ExtractUserScore", + MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())) + .withOutputType( + TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))); + + // Calculate the total score per user over fixed windows, and + // cumulative updates for late data. + final PCollectionView> spammersView = userEvents + .apply("FixedWindowsUser", Window.>into( + FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) + + // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. + // These might be robots/spammers. + .apply("CalculateSpammyUsers", new CalculateSpammyUsers()) + // Derive a view from the collection of spammer users. It will be used as a side input + // in calculating the team score sums, below.
+ .apply("CreateSpammersView", View.asMap()); + + // [START DocInclude_FilterAndCalc] + // Calculate the total score per team over fixed windows, + // and emit cumulative updates for late data. Uses the side input derived above-- the set of + // suspected robots-- to filter out scores from those users from the sum. + // Write the results to BigQuery. + rawEvents + .apply("WindowIntoFixedWindows", Window.into( + FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) + // Filter out the detected spammer users, using the side input derived above. + .apply("FilterOutSpammers", ParDo + .withSideInputs(spammersView) + .of(new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) { + // If the user is not in the spammers Map, output the data element. + if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) { + c.output(c.element()); + } + } + })) + // Extract and sum teamname/score pairs from the event data. + .apply("ExtractTeamScore", new ExtractAndSumScore("team")) + // [END DocInclude_FilterAndCalc] + // Write the result to BigQuery + .apply("WriteTeamSums", + new WriteWindowedToBigQuery>( + options.getGameStatsTablePrefix() + "_team", configureWindowedWrite())); + + + // [START DocInclude_SessionCalc] + // Detect user sessions-- that is, a burst of activity separated by a gap from further + // activity. Find and record the mean session lengths. + // This information could help the game designers track the changing user engagement + // as their set of games changes. + userEvents + .apply("WindowIntoSessions", Window.>into( + Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))) + .withOutputTimeFn(OutputTimeFns.outputAtEndOfWindow())) + // For this use, we care only about the existence of the session, not any particular + // information aggregated over it, so the following is an efficient way to do that. + .apply(Combine.perKey(x -> 0)) + // Get the duration per session.
+ .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) + // [END DocInclude_SessionCalc] + // [START DocInclude_Rewindow] + // Re-window to process groups of session sums according to when the sessions complete. + .apply("WindowToExtractSessionMean", Window.into( + FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration())))) + // Find the mean session duration in each window. + .apply(Mean.globally().withoutDefaults()) + // Write this info to a BigQuery table. + .apply("WriteAvgSessionLength", + new WriteWindowedToBigQuery( + options.getGameStatsTablePrefix() + "_sessions", configureSessionWindowWrite())); + // [END DocInclude_Rewindow] + + + // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the + // command line. + PipelineResult result = pipeline.run(); + exampleUtils.waitToFinish(result); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java new file mode 100644 index 0000000000..3a0745c735 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.complete.game; + +import java.util.HashMap; +import java.util.Map; +import java.util.TimeZone; +import ${package}.complete.game.utils.WriteWindowedToBigQuery; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.WithTimestamps; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.joda.time.DateTimeZone; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +/** + * This class is the second in a series of four pipelines that tell a story in a 'gaming' + * domain, following {@link UserScore}. In addition to the concepts introduced in {@link UserScore}, + * new concepts include: windowing and element timestamps; use of {@code Filter.by()}. + * + *

This pipeline processes data collected from gaming events in batch, building on {@link + * UserScore} but using fixed windows. It calculates the sum of scores per team, for each window, + * optionally allowing specification of two timestamps before and after which data is filtered out. + * This allows a model where late data collected after the intended analysis window can be included, + * and any late-arriving data prior to the beginning of the analysis window can be removed as well. + * By using windowing and adding element timestamps, we can do finer-grained analysis than with the + * {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get + * results from plays at the beginning of the batch's time period until the batch is processed. + * + *

To execute this pipeline using the Dataflow service, specify the pipeline configuration + * like this: + *

{@code
+ *   --project=YOUR_PROJECT_ID
+ *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
+ *   --runner=BlockingDataflowRunner
+ *   --dataset=YOUR-DATASET
+ * }
+ * 
+ * where the BigQuery dataset you specify must already exist. + * + *

Optionally include {@code --input} to specify the batch input file path. + * To indicate a time after which the data should be filtered out, include the + * {@code --stopMin} arg. E.g., {@code --stopMin=2015-10-18-23-59} indicates that any data + * timestamped after 23:59 PST on 2015-10-18 should not be included in the analysis. + * To indicate a time before which data should be filtered out, include the {@code --startMin} arg. + * If you're using the default input specified in {@link UserScore}, + * "gs://apache-beam-samples/game/gaming_data*.csv", then + * {@code --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values. + */ +public class HourlyTeamScore extends UserScore { + + private static DateTimeFormatter fmt = + DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + private static DateTimeFormatter minFmt = + DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + + + /** + * Options supported by {@link HourlyTeamScore}. + */ + interface Options extends UserScore.Options { + + @Description("Numeric value of fixed window duration, in minutes") + @Default.Integer(60) + Integer getWindowDuration(); + void setWindowDuration(Integer value); + + @Description("String representation of the first minute after which to generate results," + + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST." + + "Any input data timestamped prior to that minute won't be included in the sums.") + @Default.String("1970-01-01-00-00") + String getStartMin(); + void setStartMin(String value); + + @Description("String representation of the first minute for which to not generate results," + + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST." 
+ + "Any input data timestamped after that minute won't be included in the sums.") + @Default.String("2100-01-01-00-00") + String getStopMin(); + void setStopMin(String value); + + @Description("The BigQuery table name. Should not already exist.") + @Default.String("hourly_team_score") + String getHourlyTeamScoreTableName(); + void setHourlyTeamScoreTableName(String value); + } + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is passed to the {@link WriteWindowedToBigQuery} constructor to write team score sums and + * includes information about window start time. + */ + protected static Map>> + configureWindowedTableWrite() { + Map>> tableConfig = + new HashMap>>(); + tableConfig.put( + "team", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", (c, w) -> c.element().getKey())); + tableConfig.put( + "total_score", + new WriteWindowedToBigQuery.FieldInfo>( + "INTEGER", (c, w) -> c.element().getValue())); + tableConfig.put( + "window_start", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", + (c, w) -> { + IntervalWindow window = (IntervalWindow) w; + return fmt.print(window.start()); + })); + return tableConfig; + } + + + /** + * Run a batch pipeline to do windowed analysis of the data. + */ + // [START DocInclude_HTSMain] + public static void main(String[] args) throws Exception { + // Begin constructing a pipeline configured by commandline flags. + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + Pipeline pipeline = Pipeline.create(options); + + final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin())); + final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin())); + + // Read 'gaming' events from a text file. + pipeline.apply(TextIO.Read.from(options.getInput())) + // Parse the incoming data. 
+ .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) + + // Filter out data before and after the given times so that it is not included + // in the calculations. As we collect data in batches (say, by day), the batch for the day + // that we want to analyze could potentially include some late-arriving data from the previous + // day. If so, we want to weed it out. Similarly, if we include data from the following day + // (to scoop up late-arriving events from the day we're analyzing), we need to weed out events + // that fall after the time period we want to analyze. + // [START DocInclude_HTSFilters] + .apply("FilterStartTime", Filter.by( + (GameActionInfo gInfo) + -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) + .apply("FilterEndTime", Filter.by( + (GameActionInfo gInfo) + -> gInfo.getTimestamp() < stopMinTimestamp.getMillis())) + // [END DocInclude_HTSFilters] + + // [START DocInclude_HTSAddTsAndWindow] + // Add an element timestamp based on the event log, and apply fixed windowing. + .apply("AddEventTimestamps", + WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp()))) + .apply("FixedWindowsTeam", Window.into( + FixedWindows.of(Duration.standardMinutes(options.getWindowDuration())))) + // [END DocInclude_HTSAddTsAndWindow] + + // Extract and sum teamname/score pairs from the event data. 
+ .apply("ExtractTeamScore", new ExtractAndSumScore("team")) + .apply("WriteTeamScoreSums", + new WriteWindowedToBigQuery>(options.getHourlyTeamScoreTableName(), + configureWindowedTableWrite())); + + + pipeline.run().waitUntilFinish(); + } + // [END DocInclude_HTSMain] + +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java new file mode 100644 index 0000000000..e339ec3212 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game; + +import com.google.common.annotations.VisibleForTesting; +import java.util.HashMap; +import java.util.Map; +import java.util.TimeZone; +import ${package}.common.ExampleOptions; +import ${package}.common.ExampleUtils; +import ${package}.complete.game.utils.WriteToBigQuery; +import ${package}.complete.game.utils.WriteWindowedToBigQuery; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.PubsubIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.StreamingOptions; +import org.apache.beam.sdk.options.Validation; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; +import org.apache.beam.sdk.transforms.windowing.AfterWatermark; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.GlobalWindows; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.Repeatedly; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.joda.time.DateTimeZone; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +/** + * This class is the third in a series of four pipelines that tell a story in a 'gaming' domain, + * following {@link UserScore} and {@link HourlyTeamScore}. 
Concepts include: processing unbounded + * data using fixed windows; use of custom timestamps and event-time processing; generation of + * early/speculative results; using .accumulatingFiredPanes() to do cumulative processing of late- + * arriving data. + * + *

This pipeline processes an unbounded stream of 'game events'. The calculation of the team + * scores uses fixed windowing based on event time (the time of the game play event), not + * processing time (the time that an event is processed by the pipeline). The pipeline calculates + * the sum of scores per team, for each window. By default, the team scores are calculated using + * one-hour windows. + * + *

In contrast-- to demo another windowing option-- the user scores are calculated using a + * global window, which periodically (every ten minutes) emits cumulative user score sums. + * + *

In contrast to the previous pipelines in the series, which used static, finite input data, + * here we're using an unbounded data source, which lets us provide speculative results, and allows + * handling of late data, at much lower latency. We can use the early/speculative results to keep a + * 'leaderboard' updated in near-realtime. Our handling of late data lets us generate correct + * results, e.g. for 'team prizes'. We're now outputting window results as they're + * calculated, giving us much lower latency than with the previous batch examples. + * + *

Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector + * documentation provides more detail on how to do this. + * + *

To execute this pipeline using the Dataflow service, specify the pipeline configuration + * like this: + *

{@code
+ *   --project=YOUR_PROJECT_ID
+ *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
+ *   --runner=BlockingDataflowRunner
+ *   --dataset=YOUR-DATASET
+ *   --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
+ * }
+ * 
+ * where the BigQuery dataset you specify must already exist. + * The PubSub topic you specify should be the same topic to which the Injector is publishing. + */ +public class LeaderBoard extends HourlyTeamScore { + + private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; + + private static DateTimeFormatter fmt = + DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + static final Duration FIVE_MINUTES = Duration.standardMinutes(5); + static final Duration TEN_MINUTES = Duration.standardMinutes(10); + + + /** + * Options supported by {@link LeaderBoard}. + */ + interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOptions { + + @Description("Pub/Sub topic to read from") + @Validation.Required + String getTopic(); + void setTopic(String value); + + @Description("Numeric value of fixed window duration for team analysis, in minutes") + @Default.Integer(60) + Integer getTeamWindowDuration(); + void setTeamWindowDuration(Integer value); + + @Description("Numeric value of allowed data lateness, in minutes") + @Default.Integer(120) + Integer getAllowedLateness(); + void setAllowedLateness(Integer value); + + @Description("Prefix used for the BigQuery table names") + @Default.String("leaderboard") + String getLeaderBoardTableName(); + void setLeaderBoardTableName(String value); + } + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is used to write team score sums and includes event timing information. 
+ */ + protected static Map>> + configureWindowedTableWrite() { + + Map>> tableConfigure = + new HashMap>>(); + tableConfigure.put( + "team", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", (c, w) -> c.element().getKey())); + tableConfigure.put( + "total_score", + new WriteWindowedToBigQuery.FieldInfo>( + "INTEGER", (c, w) -> c.element().getValue())); + tableConfigure.put( + "window_start", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", + (c, w) -> { + IntervalWindow window = (IntervalWindow) w; + return fmt.print(window.start()); + })); + tableConfigure.put( + "processing_time", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", (c, w) -> fmt.print(Instant.now()))); + tableConfigure.put( + "timing", + new WriteWindowedToBigQuery.FieldInfo>( + "STRING", (c, w) -> c.pane().getTiming().toString())); + return tableConfigure; + } + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is used to write user score sums. + */ + protected static Map>> + configureGlobalWindowBigQueryWrite() { + + Map>> tableConfigure = + configureBigQueryWrite(); + tableConfigure.put( + "processing_time", + new WriteToBigQuery.FieldInfo>( + "STRING", (c, w) -> fmt.print(Instant.now()))); + return tableConfigure; + } + + + public static void main(String[] args) throws Exception { + + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + // Enforce that this pipeline is always run in streaming mode. + options.setStreaming(true); + ExampleUtils exampleUtils = new ExampleUtils(options); + Pipeline pipeline = Pipeline.create(options); + + // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub + // data elements, and parse the data. 
+ PCollection gameEvents = pipeline + .apply(PubsubIO.read() + .timestampLabel(TIMESTAMP_ATTRIBUTE).topic(options.getTopic()) + .withCoder(StringUtf8Coder.of())) + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); + + gameEvents.apply("CalculateTeamScores", + new CalculateTeamScores( + Duration.standardMinutes(options.getTeamWindowDuration()), + Duration.standardMinutes(options.getAllowedLateness()))) + // Write the results to BigQuery. + .apply("WriteTeamScoreSums", + new WriteWindowedToBigQuery>( + options.getLeaderBoardTableName() + "_team", configureWindowedTableWrite())); + gameEvents + .apply( + "CalculateUserScores", + new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness()))) + // Write the results to BigQuery. + .apply( + "WriteUserScoreSums", + new WriteToBigQuery>( + options.getLeaderBoardTableName() + "_user", configureGlobalWindowBigQueryWrite())); + + // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the + // command line. + PipelineResult result = pipeline.run(); + exampleUtils.waitToFinish(result); + } + + /** + * Calculates scores for each team within the configured window duration. + */ + // [START DocInclude_WindowAndTrigger] + // Extract team/score pairs from the event stream, using hour-long windows by default. + @VisibleForTesting + static class CalculateTeamScores + extends PTransform, PCollection>> { + private final Duration teamWindowDuration; + private final Duration allowedLateness; + + CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) { + this.teamWindowDuration = teamWindowDuration; + this.allowedLateness = allowedLateness; + } + + @Override + public PCollection> expand(PCollection infos) { + return infos.apply("LeaderboardTeamFixedWindows", + Window.into(FixedWindows.of(teamWindowDuration)) + // We will get early (speculative) results as well as cumulative + // processing of late data. 
+ .triggering(AfterWatermark.pastEndOfWindow() + .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane() + .plusDelayOf(FIVE_MINUTES)) + .withLateFirings(AfterProcessingTime.pastFirstElementInPane() + .plusDelayOf(TEN_MINUTES))) + .withAllowedLateness(allowedLateness) + .accumulatingFiredPanes()) + // Extract and sum teamname/score pairs from the event data. + .apply("ExtractTeamScore", new ExtractAndSumScore("team")); + } + } + // [END DocInclude_WindowAndTrigger] + + // [START DocInclude_ProcTimeTrigger] + /** + * Extract user/score pairs from the event stream using processing time, via global windowing. + * Get periodic updates on all users' running scores. + */ + @VisibleForTesting + static class CalculateUserScores + extends PTransform, PCollection>> { + private final Duration allowedLateness; + + CalculateUserScores(Duration allowedLateness) { + this.allowedLateness = allowedLateness; + } + + @Override + public PCollection> expand(PCollection input) { + return input.apply("LeaderboardUserGlobalWindow", + Window.into(new GlobalWindows()) + // Get periodic results every ten minutes. + .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane() + .plusDelayOf(TEN_MINUTES))) + .accumulatingFiredPanes() + .withAllowedLateness(allowedLateness)) + // Extract and sum username/score pairs from the event data. 
+ .apply("ExtractUserScore", new ExtractAndSumScore("user")); + } + } + // [END DocInclude_ProcTimeTrigger] +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/README.md b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/README.md new file mode 100644 index 0000000000..25e31f55c9 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/README.md @@ -0,0 +1,131 @@ + + +# 'Gaming' examples + + +This directory holds a series of example Dataflow pipelines in a simple 'mobile +gaming' domain. They all require Java 8. Each pipeline successively introduces +new concepts, and gives some examples of using Java 8 syntax in constructing +Dataflow pipelines. Other than usage of Java 8 lambda expressions, the concepts +that are used apply equally well in Java 7. + +In the gaming scenario, many users play, as members of different teams, over +the course of a day, and their actions are logged for processing. Some of the +logged game events may be late-arriving, if users play on mobile devices and go +transiently offline for a period. + +The scenario includes not only "regular" users, but "robot users", which have a +higher click rate than the regular users, and may move from team to team. + +The first two pipelines in the series use pre-generated batch data samples. The +second two pipelines read from a [PubSub](https://cloud.google.com/pubsub/) +topic input. For these examples, you will also need to run the +`injector.Injector` program, which generates and publishes the gaming data to +PubSub. The javadocs for each pipeline have more detailed information on how to +run that pipeline. + +All of these pipelines write their results to BigQuery table(s). + + +## The pipelines in the 'gaming' series + +### UserScore + +The first pipeline in the series is `UserScore`. 
This pipeline does batch +processing of data collected from gaming events. It calculates the sum of +scores per user, over an entire batch of gaming data (collected, say, for each +day). The batch processing will not include any late data that arrives after +the day's cutoff point. + +### HourlyTeamScore + +The next pipeline in the series is `HourlyTeamScore`. This pipeline also +processes data collected from gaming events in batch. It builds on `UserScore`, +but uses [fixed windows](https://cloud.google.com/dataflow/model/windowing), by +default an hour in duration. It calculates the sum of scores per team, for each +window, optionally allowing specification of two timestamps before and after +which data is filtered out. This allows a model where late data collected after +the intended analysis window can be included in the analysis, and any late- +arriving data prior to the beginning of the analysis window can be removed as +well. + +By using windowing and adding element timestamps, we can do finer-grained +analysis than with the `UserScore` pipeline — we're now tracking scores for +each hour rather than over the course of a whole day. However, our batch +processing is high-latency, in that we don't get results from plays at the +beginning of the batch's time period until the complete batch is processed. + +### LeaderBoard + +The third pipeline in the series is `LeaderBoard`. This pipeline processes an +unbounded stream of 'game events' from a PubSub topic. The calculation of the +team scores uses fixed windowing based on event time (the time of the game play +event), not processing time (the time that an event is processed by the +pipeline). The pipeline calculates the sum of scores per team, for each window. +By default, the team scores are calculated using one-hour windows. + +In contrast — to demo another windowing option — the user scores are calculated +using a global window, which periodically (every ten minutes) emits cumulative +user score sums. 
+ +In contrast to the previous pipelines in the series, which used static, finite +input data, here we're using an unbounded data source, which lets us provide +_speculative_ results, and allows handling of late data, at much lower latency. +E.g., we could use the early/speculative results to keep a 'leaderboard' +updated in near-realtime. Our handling of late data lets us generate correct +results, e.g. for 'team prizes'. We're now outputting window results as they're +calculated, giving us much lower latency than with the previous batch examples. + +### GameStats + +The fourth pipeline in the series is `GameStats`. This pipeline builds +on the `LeaderBoard` functionality — supporting output of speculative and late +data — and adds some "business intelligence" analysis: identifying abuse +detection. The pipeline derives the Mean user score sum for a window, and uses +that information to identify likely spammers/robots. (The injector is designed +so that the "robots" have a higher click rate than the "real" users). The robot +users are then filtered out when calculating the team scores. + +Additionally, user sessions are tracked: that is, we find bursts of user +activity using session windows. Then, the mean session duration information is +recorded in the context of subsequent fixed windowing. (This could be used to +tell us what games are giving us greater user retention). + +### Running the PubSub Injector + +The `LeaderBoard` and `GameStats` example pipelines read unbounded data +from a PubSub topic. + +Use the `injector.Injector` program to generate this data and publish to a +PubSub topic. See the `Injector` javadocs for more information on how to run the +injector. Set up the injector before you start one of these pipelines. Then, +when you start the pipeline, pass as an argument the name of that PubSub topic. +See the pipeline javadocs for the details. + +## Viewing the results in BigQuery + +All of the pipelines write their results to BigQuery.
`UserScore` and +`HourlyTeamScore` each write one table, and `LeaderBoard` and +`GameStats` each write two. The pipelines have default table names that +you can override when you start up the pipeline if those tables already exist. + +Depending on the windowing intervals defined in a given pipeline, you may have +to wait for a while (more than an hour) before you start to see results written +to the BigQuery tables. diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java new file mode 100644 index 0000000000..d500658694 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game; + +import java.util.HashMap; +import java.util.Map; +import org.apache.avro.reflect.Nullable; +import ${package}.complete.game.utils.WriteToBigQuery; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.coders.AvroCoder; +import org.apache.beam.sdk.coders.DefaultCoder; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.Validation; +import org.apache.beam.sdk.transforms.Aggregator; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Sum; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain. + * Concepts: batch processing; reading input from Google Cloud Storage and writing output to + * BigQuery; using standalone DoFns; use of the sum by key transform; examples of + * Java 8 lambda syntax. + * + *

In this gaming scenario, many users play, as members of different teams, over the course of a + * day, and their actions are logged for processing. Some of the logged game events may be late- + * arriving, if users play on mobile devices and go transiently offline for a period. + * + *

This pipeline does batch processing of data collected from gaming events. It calculates the + * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The + * batch processing will not include any late data that arrives after the day's cutoff point. + * + *

To execute this pipeline using the Dataflow service and static example input data, specify + * the pipeline configuration like this: + *

{@code
+ *   --project=YOUR_PROJECT_ID
+ *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
+ *   --runner=BlockingDataflowRunner
+ *   --dataset=YOUR-DATASET
+ * }
+ * 
+ * where the BigQuery dataset you specify must already exist. + * + *

Optionally include the --input argument to specify a batch input file. + * See the --input default value for example batch data file, or use {@code injector.Injector} to + * generate your own batch data. + */ +public class UserScore { + + /** + * Class to hold info about a game event. + */ + @DefaultCoder(AvroCoder.class) + static class GameActionInfo { + @Nullable String user; + @Nullable String team; + @Nullable Integer score; + @Nullable Long timestamp; + + public GameActionInfo() {} + + public GameActionInfo(String user, String team, Integer score, Long timestamp) { + this.user = user; + this.team = team; + this.score = score; + this.timestamp = timestamp; + } + + public String getUser() { + return this.user; + } + public String getTeam() { + return this.team; + } + public Integer getScore() { + return this.score; + } + public String getKey(String keyname) { + if (keyname.equals("team")) { + return this.team; + } else { // return username as default + return this.user; + } + } + public Long getTimestamp() { + return this.timestamp; + } + } + + + /** + * Parses the raw game event info into GameActionInfo objects. Each event line has the following + * format: username,teamname,score,timestamp_in_ms,readable_time + * e.g.: + * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224 + * The human-readable time string is not used here. + */ + static class ParseEventFn extends DoFn { + + // Log and count parse errors. 
+ private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class); + private final Aggregator numParseErrors = + createAggregator("ParseErrors", Sum.ofLongs()); + + @ProcessElement + public void processElement(ProcessContext c) { + String[] components = c.element().split(","); + try { + String user = components[0].trim(); + String team = components[1].trim(); + Integer score = Integer.parseInt(components[2].trim()); + Long timestamp = Long.parseLong(components[3].trim()); + GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp); + c.output(gInfo); + } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) { + numParseErrors.addValue(1L); + LOG.info("Parse error on " + c.element() + ", " + e.getMessage()); + } + } + } + + /** + * A transform to extract key/score information from GameActionInfo, and sum the scores. The + * constructor arg determines whether 'team' or 'user' info is extracted. + */ + // [START DocInclude_USExtractXform] + public static class ExtractAndSumScore + extends PTransform, PCollection>> { + + private final String field; + + ExtractAndSumScore(String field) { + this.field = field; + } + + @Override + public PCollection> expand( + PCollection gameInfo) { + + return gameInfo + .apply(MapElements + .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore())) + .withOutputType( + TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))) + .apply(Sum.integersPerKey()); + } + } + // [END DocInclude_USExtractXform] + + + /** + * Options supported by {@link UserScore}. + */ + public interface Options extends PipelineOptions { + + @Description("Path to the data file(s) containing game data.") + // The default maps to two large Google Cloud Storage files (each ~12GB) holding two subsequent + // day's worth (roughly) of data. 
+ @Default.String("gs://apache-beam-samples/game/gaming_data*.csv") + String getInput(); + void setInput(String value); + + @Description("BigQuery Dataset to write tables to. Must already exist.") + @Validation.Required + String getDataset(); + void setDataset(String value); + + @Description("The BigQuery table name. Should not already exist.") + @Default.String("user_score") + String getUserScoreTableName(); + void setUserScoreTableName(String value); + } + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is passed to the {@link WriteToBigQuery} constructor to write user score sums. + */ + protected static Map>> + configureBigQueryWrite() { + Map>> tableConfigure = + new HashMap>>(); + tableConfigure.put( + "user", + new WriteToBigQuery.FieldInfo>( + "STRING", (c, w) -> c.element().getKey())); + tableConfigure.put( + "total_score", + new WriteToBigQuery.FieldInfo>( + "INTEGER", (c, w) -> c.element().getValue())); + return tableConfigure; + } + + + /** + * Run a batch pipeline. + */ + // [START DocInclude_USMain] + public static void main(String[] args) throws Exception { + // Begin constructing a pipeline configured by commandline flags. + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + Pipeline pipeline = Pipeline.create(options); + + // Read events from a text file and parse them. + pipeline.apply(TextIO.Read.from(options.getInput())) + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) + // Extract and sum username/score pairs from the event data. + .apply("ExtractUserScore", new ExtractAndSumScore("user")) + .apply("WriteUserScoreSums", + new WriteToBigQuery>(options.getUserScoreTableName(), + configureBigQueryWrite())); + + // Run the batch pipeline. 
+ pipeline.run().waitUntilFinish(); + } + // [END DocInclude_USMain] + +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java new file mode 100644 index 0000000000..c8531c15d1 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java @@ -0,0 +1,414 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game.injector; + +import com.google.api.services.pubsub.Pubsub; +import com.google.api.services.pubsub.model.PublishRequest; +import com.google.api.services.pubsub.model.PubsubMessage; +import com.google.common.collect.ImmutableMap; +import java.io.BufferedOutputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.TimeZone; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + + +/** + * This is a generator that simulates usage data from a mobile game, and either publishes the data + * to a pubsub topic or writes it to a file. + * + *

The general model used by the generator is the following. There is a set of teams with team + * members. Each member is scoring points for their team. After some period, a team will dissolve + * and a new one will be created in its place. There is also a set of 'Robots', or spammer users. + * They hop from team to team. The robots are set to have a higher 'click rate' (generate more + * events) than the regular team members. + * + *

Each generated line of data has the following form: + * username,teamname,score,timestamp_in_ms,readable_time + * e.g.: + * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224 + * + *

The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if + * specified. It takes the following arguments: + * {@code Injector project-name (topic-name|none) (filename|none)}. + * + *

To run the Injector in the mode where it publishes to PubSub, you will need to authenticate + * locally using project-based service account credentials to avoid running over PubSub + * quota. + * See https://developers.google.com/identity/protocols/application-default-credentials + * for more information on using service account credentials. Set the GOOGLE_APPLICATION_CREDENTIALS + * environment variable to point to your downloaded service account credentials before starting the + * program, e.g.: + * {@code export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/credentials-key.json}. + * If you do not do this, then your injector will only run for a few minutes on your + * 'user account' credentials before you will start to see quota error messages like: + * "Request throttled due to user QPS limit being reached", and see this exception: + * "com.google.api.client.googleapis.json.GoogleJsonResponseException: 429 Too Many Requests". + * Once you've set up your credentials, run the Injector like this: + *

{@code
+ * Injector <project-name> <topic-name> none
+ * }
+ * 
+ * The pubsub topic will be created if it does not exist. + * + *

To run the injector in write-to-file-mode, set the topic name to "none" and specify the + * filename: + *

{@code
+ * Injector <project-name> none <filename>
+ * }
+ * 
+ */ +class Injector { + private static Pubsub pubsub; + private static Random random = new Random(); + private static String topic; + private static String project; + private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; + + // QPS ranges from 800 to 1000. + private static final int MIN_QPS = 800; + private static final int QPS_RANGE = 200; + // How long to sleep, in ms, between creation of the threads that make API requests to PubSub. + private static final int THREAD_SLEEP_MS = 500; + + // Lists used to generate random team names. + private static final ArrayList COLORS = + new ArrayList(Arrays.asList( + "Magenta", "AliceBlue", "Almond", "Amaranth", "Amber", + "Amethyst", "AndroidGreen", "AntiqueBrass", "Fuchsia", "Ruby", "AppleGreen", + "Apricot", "Aqua", "ArmyGreen", "Asparagus", "Auburn", "Azure", "Banana", + "Beige", "Bisque", "BarnRed", "BattleshipGrey")); + + private static final ArrayList ANIMALS = + new ArrayList(Arrays.asList( + "Echidna", "Koala", "Wombat", "Marmot", "Quokka", "Kangaroo", "Dingo", "Numbat", "Emu", + "Wallaby", "CaneToad", "Bilby", "Possum", "Cassowary", "Kookaburra", "Platypus", + "Bandicoot", "Cockatoo", "Antechinus")); + + // The list of live teams. + private static ArrayList liveTeams = new ArrayList(); + + private static DateTimeFormatter fmt = + DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + + + // The total number of robots in the system. + private static final int NUM_ROBOTS = 20; + // Determines the chance that a team will have a robot team member. 
+ private static final int ROBOT_PROBABILITY = 3; + private static final int NUM_LIVE_TEAMS = 15; + private static final int BASE_MEMBERS_PER_TEAM = 5; + private static final int MEMBERS_PER_TEAM = 15; + private static final int MAX_SCORE = 20; + private static final int LATE_DATA_RATE = 5 * 60 * 2; // Every 10 minutes + private static final int BASE_DELAY_IN_MILLIS = 5 * 60 * 1000; // 5-10 minute delay + private static final int FUZZY_DELAY_IN_MILLIS = 5 * 60 * 1000; + + // The minimum time a 'team' can live. + private static final int BASE_TEAM_EXPIRATION_TIME_IN_MINS = 20; + private static final int TEAM_EXPIRATION_TIME_IN_MINS = 20; + + + /** + * A class for holding team info: the name of the team, when it started, + * and the current team members. Teams may but need not include one robot team member. + */ + private static class TeamInfo { + String teamName; + long startTimeInMillis; + int expirationPeriod; + // The team might but need not include 1 robot. Will be non-null if so. + String robot; + int numMembers; + + private TeamInfo(String teamName, long startTimeInMillis, String robot) { + this.teamName = teamName; + this.startTimeInMillis = startTimeInMillis; + // How long until this team is dissolved. + this.expirationPeriod = random.nextInt(TEAM_EXPIRATION_TIME_IN_MINS) + + BASE_TEAM_EXPIRATION_TIME_IN_MINS; + this.robot = robot; + // Determine the number of team members. 
+ numMembers = random.nextInt(MEMBERS_PER_TEAM) + BASE_MEMBERS_PER_TEAM; + } + + String getTeamName() { + return teamName; + } + String getRobot() { + return robot; + } + + long getStartTimeInMillis() { + return startTimeInMillis; + } + long getEndTimeInMillis() { + return startTimeInMillis + (expirationPeriod * 60 * 1000); + } + String getRandomUser() { + int userNum = random.nextInt(numMembers); + return "user" + userNum + "_" + teamName; + } + + int numMembers() { + return numMembers; + } + + @Override + public String toString() { + return "(" + teamName + ", num members: " + numMembers() + ", starting at: " + + startTimeInMillis + ", expires in: " + expirationPeriod + ", robot: " + robot + ")"; + } + } + + /** Utility to grab a random element from an array of Strings. */ + private static String randomElement(ArrayList list) { + int index = random.nextInt(list.size()); + return list.get(index); + } + + /** + * Get and return a random team. If the selected team is too old w.r.t its expiration, remove + * it, replacing it with a new team. + */ + private static TeamInfo randomTeam(ArrayList list) { + int index = random.nextInt(list.size()); + TeamInfo team = list.get(index); + // If the selected team is expired, remove it and return a new team. + long currTime = System.currentTimeMillis(); + if ((team.getEndTimeInMillis() < currTime) || team.numMembers() == 0) { + System.out.println("\nteam " + team + " is too old; replacing."); + System.out.println("start time: " + team.getStartTimeInMillis() + + ", end time: " + team.getEndTimeInMillis() + + ", current time:" + currTime); + removeTeam(index); + // Add a new team in its stead. + return (addLiveTeam()); + } else { + return team; + } + } + + /** + * Create and add a team. Possibly add a robot to the team. + */ + private static synchronized TeamInfo addLiveTeam() { + String teamName = randomElement(COLORS) + randomElement(ANIMALS); + String robot = null; + // Decide if we want to add a robot to the team. 
+ if (random.nextInt(ROBOT_PROBABILITY) == 0) { + robot = "Robot-" + random.nextInt(NUM_ROBOTS); + } + // Create the new team. + TeamInfo newTeam = new TeamInfo(teamName, System.currentTimeMillis(), robot); + liveTeams.add(newTeam); + System.out.println("[+" + newTeam + "]"); + return newTeam; + } + + /** + * Remove a specific team. + */ + private static synchronized void removeTeam(int teamIndex) { + TeamInfo removedTeam = liveTeams.remove(teamIndex); + System.out.println("[-" + removedTeam + "]"); + } + + /** Generate a user gaming event. */ + private static String generateEvent(Long currTime, int delayInMillis) { + TeamInfo team = randomTeam(liveTeams); + String teamName = team.getTeamName(); + String user; + final int parseErrorRate = 900000; + + String robot = team.getRobot(); + // If the team has an associated robot team member... + if (robot != null) { + // Then use that robot for the message with some probability. + // Set this probability to higher than that used to select any of the 'regular' team + // members, so that if there is a robot on the team, it has a higher click rate. + if (random.nextInt(team.numMembers() / 2) == 0) { + user = robot; + } else { + user = team.getRandomUser(); + } + } else { // No robot. + user = team.getRandomUser(); + } + String event = user + "," + teamName + "," + random.nextInt(MAX_SCORE); + // Randomly introduce occasional parse errors. You can see a custom counter tracking the number + // of such errors in the Dataflow Monitoring UI, as the example pipeline runs. + if (random.nextInt(parseErrorRate) == 0) { + System.out.println("Introducing a parse error."); + event = "THIS LINE REPRESENTS CORRUPT DATA AND WILL CAUSE A PARSE ERROR"; + } + return addTimeInfoToEvent(event, currTime, delayInMillis); + } + + /** + * Add time info to a generated gaming event. 
+ */ + private static String addTimeInfoToEvent(String message, Long currTime, int delayInMillis) { + String eventTimeString = + Long.toString((currTime - delayInMillis) / 1000 * 1000); + // Add a (redundant) 'human-readable' date string to make the data semantics more clear. + String dateString = fmt.print(currTime); + message = message + "," + eventTimeString + "," + dateString; + return message; + } + + /** + * Publish 'numMessages' arbitrary events from live users with the provided delay, to a + * PubSub topic. + */ + public static void publishData(int numMessages, int delayInMillis) + throws IOException { + List pubsubMessages = new ArrayList<>(); + + for (int i = 0; i < Math.max(1, numMessages); i++) { + Long currTime = System.currentTimeMillis(); + String message = generateEvent(currTime, delayInMillis); + PubsubMessage pubsubMessage = new PubsubMessage() + .encodeData(message.getBytes("UTF-8")); + pubsubMessage.setAttributes( + ImmutableMap.of(TIMESTAMP_ATTRIBUTE, + Long.toString((currTime - delayInMillis) / 1000 * 1000))); + if (delayInMillis != 0) { + System.out.println(pubsubMessage.getAttributes()); + System.out.println("late data for: " + message); + } + pubsubMessages.add(pubsubMessage); + } + + PublishRequest publishRequest = new PublishRequest(); + publishRequest.setMessages(pubsubMessages); + pubsub.projects().topics().publish(topic, publishRequest).execute(); + } + + /** + * Publish generated events to a file. 
+ */ + public static void publishDataToFile(String fileName, int numMessages, int delayInMillis) + throws IOException { + PrintWriter out = new PrintWriter(new OutputStreamWriter( + new BufferedOutputStream(new FileOutputStream(fileName, true)), "UTF-8")); + + try { + for (int i = 0; i < Math.max(1, numMessages); i++) { + Long currTime = System.currentTimeMillis(); + String message = generateEvent(currTime, delayInMillis); + out.println(message); + } + } catch (Exception e) { + e.printStackTrace(); + } finally { + if (out != null) { + out.flush(); + out.close(); + } + } + } + + + public static void main(String[] args) throws IOException, InterruptedException { + if (args.length < 3) { + System.out.println("Usage: Injector project-name (topic-name|none) (filename|none)"); + System.exit(1); + } + boolean writeToFile = false; + boolean writeToPubsub = true; + project = args[0]; + String topicName = args[1]; + String fileName = args[2]; + // The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if + // specified; otherwise, it will try to write to a file. + if (topicName.equalsIgnoreCase("none")) { + writeToFile = true; + writeToPubsub = false; + } + if (writeToPubsub) { + // Create the PubSub client. + pubsub = InjectorUtils.getClient(); + // Create the PubSub topic as necessary. + topic = InjectorUtils.getFullyQualifiedTopicName(project, topicName); + InjectorUtils.createTopic(pubsub, topic); + System.out.println("Injecting to topic: " + topic); + } else { + if (fileName.equalsIgnoreCase("none")) { + System.out.println("Filename not specified."); + System.exit(1); + } + System.out.println("Writing to file: " + fileName); + } + System.out.println("Starting Injector"); + + // Start off with some random live teams. + while (liveTeams.size() < NUM_LIVE_TEAMS) { + addLiveTeam(); + } + + // Publish messages at a rate determined by the QPS and Thread sleep settings. 
+ for (int i = 0; true; i++) { + if (Thread.activeCount() > 10) { + System.err.println("I'm falling behind!"); + } + + // Decide if this should be a batch of late data. + final int numMessages; + final int delayInMillis; + if (i % LATE_DATA_RATE == 0) { + // Insert delayed data for one user (one message only) + delayInMillis = BASE_DELAY_IN_MILLIS + random.nextInt(FUZZY_DELAY_IN_MILLIS); + numMessages = 1; + System.out.println("DELAY(" + delayInMillis + ", " + numMessages + ")"); + } else { + System.out.print("."); + delayInMillis = 0; + numMessages = MIN_QPS + random.nextInt(QPS_RANGE); + } + + if (writeToFile) { // Won't use threading for the file write. + publishDataToFile(fileName, numMessages, delayInMillis); + } else { // Write to PubSub. + // Start a thread to inject some data. + new Thread(){ + @Override + public void run() { + try { + publishData(numMessages, delayInMillis); + } catch (IOException e) { + System.err.println(e); + } + } + }.start(); + } + + // Wait before creating another injector thread. + Thread.sleep(THREAD_SLEEP_MS); + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java new file mode 100644 index 0000000000..55e8c7a8c3 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.complete.game.injector; + +import static com.google.common.base.Preconditions.checkNotNull; + +import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; +import com.google.api.client.googleapis.json.GoogleJsonResponseException; +import com.google.api.client.googleapis.util.Utils; +import com.google.api.client.http.HttpRequestInitializer; +import com.google.api.client.http.HttpStatusCodes; +import com.google.api.client.http.HttpTransport; +import com.google.api.client.json.JsonFactory; +import com.google.api.services.pubsub.Pubsub; +import com.google.api.services.pubsub.PubsubScopes; +import com.google.api.services.pubsub.model.Topic; +import java.io.IOException; + +class InjectorUtils { + + private static final String APP_NAME = "injector"; + + /** + * Builds a new Pubsub client and returns it. + */ + public static Pubsub getClient(final HttpTransport httpTransport, + final JsonFactory jsonFactory) + throws IOException { + checkNotNull(httpTransport); + checkNotNull(jsonFactory); + GoogleCredential credential = + GoogleCredential.getApplicationDefault(httpTransport, jsonFactory); + if (credential.createScopedRequired()) { + credential = credential.createScoped(PubsubScopes.all()); + } + if (credential.getClientAuthentication() != null) { + System.out.println("\n***Warning! 
You are not using service account credentials to " + + "authenticate.\nYou need to use service account credentials for this example," + + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run " + + "out of PubSub quota very quickly.\nSee " + + "https://developers.google.com/identity/protocols/application-default-credentials."); + System.exit(1); + } + HttpRequestInitializer initializer = + new RetryHttpInitializerWrapper(credential); + return new Pubsub.Builder(httpTransport, jsonFactory, initializer) + .setApplicationName(APP_NAME) + .build(); + } + + /** + * Builds a new Pubsub client with default HttpTransport and + * JsonFactory and returns it. + */ + public static Pubsub getClient() throws IOException { + return getClient(Utils.getDefaultTransport(), + Utils.getDefaultJsonFactory()); + } + + + /** + * Returns the fully qualified topic name for Pub/Sub. + */ + public static String getFullyQualifiedTopicName( + final String project, final String topic) { + return String.format("projects/%s/topics/%s", project, topic); + } + + /** + * Create a topic if it doesn't exist. 
+ */ + public static void createTopic(Pubsub client, String fullTopicName) + throws IOException { + try { + client.projects().topics().get(fullTopicName).execute(); + } catch (GoogleJsonResponseException e) { + if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) { + Topic topic = client.projects().topics() + .create(fullTopicName, new Topic()) + .execute(); + System.out.printf("Topic %s was created.\n", topic.getName()); + } + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java new file mode 100644 index 0000000000..5d0cc68763 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game.injector; + +import static com.google.common.base.Preconditions.checkNotNull; + +import com.google.api.client.auth.oauth2.Credential; +import com.google.api.client.http.HttpBackOffIOExceptionHandler; +import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestInitializer; +import com.google.api.client.http.HttpResponse; +import com.google.api.client.http.HttpUnsuccessfulResponseHandler; +import com.google.api.client.util.ExponentialBackOff; +import com.google.api.client.util.Sleeper; +import java.io.IOException; +import java.util.logging.Logger; + +/** + * RetryHttpInitializerWrapper will automatically retry upon RPC + * failures, preserving the auto-refresh behavior of the Google + * Credentials. + */ +public class RetryHttpInitializerWrapper implements HttpRequestInitializer { + + /** + * A private logger. + */ + private static final Logger LOG = + Logger.getLogger(RetryHttpInitializerWrapper.class.getName()); + + /** + * One minutes in miliseconds. + */ + private static final int ONEMINITUES = 60000; + + /** + * Intercepts the request for filling in the "Authorization" + * header field, as well as recovering from certain unsuccessful + * error codes wherein the Credential must refresh its token for a + * retry. + */ + private final Credential wrappedCredential; + + /** + * A sleeper; you can replace it with a mock in your test. + */ + private final Sleeper sleeper; + + /** + * A constructor. + * + * @param wrappedCredential Credential which will be wrapped and + * used for providing auth header. + */ + public RetryHttpInitializerWrapper(final Credential wrappedCredential) { + this(wrappedCredential, Sleeper.DEFAULT); + } + + /** + * A protected constructor only for testing. + * + * @param wrappedCredential Credential which will be wrapped and + * used for providing auth header. 
+ * @param sleeper Sleeper for easy testing. + */ + RetryHttpInitializerWrapper( + final Credential wrappedCredential, final Sleeper sleeper) { + this.wrappedCredential = checkNotNull(wrappedCredential); + this.sleeper = sleeper; + } + + /** + * Initializes the given request. + */ + @Override + public final void initialize(final HttpRequest request) { + request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout + final HttpUnsuccessfulResponseHandler backoffHandler = + new HttpBackOffUnsuccessfulResponseHandler( + new ExponentialBackOff()) + .setSleeper(sleeper); + request.setInterceptor(wrappedCredential); + request.setUnsuccessfulResponseHandler( + new HttpUnsuccessfulResponseHandler() { + @Override + public boolean handleResponse( + final HttpRequest request, + final HttpResponse response, + final boolean supportsRetry) throws IOException { + if (wrappedCredential.handleResponse( + request, response, supportsRetry)) { + // If credential decides it can handle it, + // the return code or message indicated + // something specific to authentication, + // and no backoff is desired. + return true; + } else if (backoffHandler.handleResponse( + request, response, supportsRetry)) { + // Otherwise, we defer to the judgement of + // our internal backoff handler. 
+ LOG.info("Retrying " + + request.getUrl().toString()); + return true; + } else { + return false; + } + } + }); + request.setIOExceptionHandler( + new HttpBackOffIOExceptionHandler(new ExponentialBackOff()) + .setSleeper(sleeper)); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java new file mode 100644 index 0000000000..0e800623ab --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game.utils; + +import com.google.api.services.bigquery.model.TableFieldSchema; +import com.google.api.services.bigquery.model.TableReference; +import com.google.api.services.bigquery.model.TableRow; +import com.google.api.services.bigquery.model.TableSchema; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import ${package}.complete.game.UserScore; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; +import org.apache.beam.sdk.options.GcpOptions; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; + +/** + * Generate, format, and write BigQuery table row information. Use provided information about + * the field names and types, as well as lambda functions that describe how to generate their + * values. + */ +public class WriteToBigQuery + extends PTransform, PDone> { + + protected String tableName; + protected Map> fieldInfo; + + public WriteToBigQuery() { + } + + public WriteToBigQuery(String tableName, + Map> fieldInfo) { + this.tableName = tableName; + this.fieldInfo = fieldInfo; + } + + /** + * A {@link Serializable} function from a {@link DoFn.ProcessContext} + * and {@link BoundedWindow} to the value for that field. + */ + public interface FieldFn extends Serializable { + Object apply(DoFn.ProcessContext context, BoundedWindow window); + } + + /** Define a class to hold information about output table field definitions. 
*/ + public static class FieldInfo implements Serializable { + // The BigQuery 'type' of the field + private String fieldType; + // A lambda function to generate the field value + private FieldFn fieldFn; + + public FieldInfo(String fieldType, + FieldFn fieldFn) { + this.fieldType = fieldType; + this.fieldFn = fieldFn; + } + + String getFieldType() { + return this.fieldType; + } + + FieldFn getFieldFn() { + return this.fieldFn; + } + } + /** Convert each key/score pair into a BigQuery TableRow as specified by fieldFn. */ + protected class BuildRowFn extends DoFn { + + @ProcessElement + public void processElement(ProcessContext c, BoundedWindow window) { + + TableRow row = new TableRow(); + for (Map.Entry> entry : fieldInfo.entrySet()) { + String key = entry.getKey(); + FieldInfo fcnInfo = entry.getValue(); + FieldFn fcn = fcnInfo.getFieldFn(); + row.set(key, fcn.apply(c, window)); + } + c.output(row); + } + } + + /** Build the output table schema. */ + protected TableSchema getSchema() { + List fields = new ArrayList<>(); + for (Map.Entry> entry : fieldInfo.entrySet()) { + String key = entry.getKey(); + FieldInfo fcnInfo = entry.getValue(); + String bqType = fcnInfo.getFieldType(); + fields.add(new TableFieldSchema().setName(key).setType(bqType)); + } + return new TableSchema().setFields(fields); + } + + @Override + public PDone expand(PCollection teamAndScore) { + return teamAndScore + .apply("ConvertToRow", ParDo.of(new BuildRowFn())) + .apply(BigQueryIO.Write + .to(getTable(teamAndScore.getPipeline(), + tableName)) + .withSchema(getSchema()) + .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) + .withWriteDisposition(WriteDisposition.WRITE_APPEND)); + } + + /** Utility to construct an output table reference. 
*/ + static TableReference getTable(Pipeline pipeline, String tableName) { + PipelineOptions options = pipeline.getOptions(); + TableReference table = new TableReference(); + table.setDatasetId(options.as(UserScore.Options.class).getDataset()); + table.setProjectId(options.as(GcpOptions.class).getProject()); + table.setTableId(tableName); + return table; + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java new file mode 100644 index 0000000000..839650f02d --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game.utils; + +import com.google.api.services.bigquery.model.TableRow; +import java.util.Map; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; +import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; + +/** + * Generate, format, and write BigQuery table row information. Subclasses {@link WriteToBigQuery} + * to require windowing; so this subclass may be used for writes that require access to the + * context's window information. + */ +public class WriteWindowedToBigQuery + extends WriteToBigQuery { + + public WriteWindowedToBigQuery(String tableName, + Map> fieldInfo) { + super(tableName, fieldInfo); + } + + /** Convert each key/score pair into a BigQuery TableRow. 
*/ + protected class BuildRowFn extends DoFn { + @ProcessElement + public void processElement(ProcessContext c, BoundedWindow window) { + + TableRow row = new TableRow(); + for (Map.Entry> entry : fieldInfo.entrySet()) { + String key = entry.getKey(); + FieldInfo fcnInfo = entry.getValue(); + row.set(key, fcnInfo.getFieldFn().apply(c, window)); + } + c.output(row); + } + } + + @Override + public PDone expand(PCollection teamAndScore) { + return teamAndScore + .apply("ConvertToRow", ParDo.of(new BuildRowFn())) + .apply(BigQueryIO.Write + .to(getTable(teamAndScore.getPipeline(), + tableName)) + .withSchema(getSchema()) + .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) + .withWriteDisposition(WriteDisposition.WRITE_APPEND)); + } + +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java new file mode 100644 index 0000000000..155242d996 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}; + +import com.google.common.io.Files; +import java.io.File; +import java.nio.charset.StandardCharsets; +import ${package}.DebuggingWordCount.WordCountOptions; +import org.apache.beam.sdk.testing.TestPipeline; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Smoke test for {@link DebuggingWordCount}: runs main() on a small temporary input file. + */ +@RunWith(JUnit4.class) +public class DebuggingWordCountTest { + @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); + + @Test + public void testDebuggingWordCount() throws Exception { + File inputFile = tmpFolder.newFile(); + File outputFile = tmpFolder.newFile(); + Files.write( + "stomach secret Flourish message Flourish here Flourish", + inputFile, + StandardCharsets.UTF_8); + WordCountOptions options = + TestPipeline.testingPipelineOptions().as(WordCountOptions.class); + options.setInputFile(inputFile.getAbsolutePath()); + options.setOutput(outputFile.getAbsolutePath()); + DebuggingWordCount.main(TestPipeline.convertToArgs(options)); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java new file mode 100644 index 0000000000..bfca71bcf9 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}; + +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.io.Serializable; +import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; +import java.nio.file.Files; +import java.nio.file.StandardOpenOption; +import java.util.Arrays; +import java.util.List; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.options.GcsOptions; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Count; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.FlatMapElements; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.util.GcsUtil; +import org.apache.beam.sdk.util.gcsfs.GcsPath; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; + +/** + * To keep {@link MinimalWordCountJava8} simple, it is not factored or testable. This test + * file should be maintained with a copy of its code for a basic smoke test. 
+ */ +@RunWith(JUnit4.class) +public class MinimalWordCountJava8Test implements Serializable { + + @Rule + public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); + + /** + * Smoke test: builds the pipeline with a mocked {@link GcsUtil} and never calls run(). + */ + @Test + public void testMinimalWordCountJava8() throws Exception { + p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil()); + + p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*")) + .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))) + .withOutputType(TypeDescriptors.strings())) + .apply(Filter.by((String word) -> !word.isEmpty())) + .apply(Count.perElement()) + .apply(MapElements + .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()) + .withOutputType(TypeDescriptors.strings())) + .apply(TextIO.Write.to("gs://your-output-bucket/and-output-prefix")); + } + + private GcsUtil buildMockGcsUtil() throws IOException { + GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class); + + // Any request to open gets a new channel backed by a throwaway temp file + Mockito + .when(mockGcsUtil.open(Mockito.any(GcsPath.class))) + .then(new Answer() { + @Override + public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable { + return FileChannel.open( + Files.createTempFile("channel-", ".tmp"), + StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE); + } + }); + + // Any request for expansion returns a list containing the original GcsPath + // This is required to pass validation that occurs in TextIO during apply() + Mockito + .when(mockGcsUtil.expand(Mockito.any(GcsPath.class))) + .then(new Answer>() { + @Override + public List answer(InvocationOnMock invocation) throws Throwable { + return ImmutableList.of((GcsPath) invocation.getArguments()[0]); + } + }); + + return mockGcsUtil; + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java
b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java new file mode 100644 index 0000000000..e9621032e5 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}; + +import java.util.Arrays; +import java.util.List; +import ${package}.WordCount.CountWords; +import ${package}.WordCount.ExtractWordsFn; +import ${package}.WordCount.FormatAsTextFn; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.RunnableOnService; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.DoFnTester; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollection; +import org.hamcrest.CoreMatchers; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests of {@link WordCount}: one DoFn-level test and one in-memory PTransform test. + */ +@RunWith(JUnit4.class) +public class WordCountTest { + + /** Example test that exercises one {@link DoFn} ({@link ExtractWordsFn}) via DoFnTester. */ + @Test + public void testExtractWordsFn() throws Exception { + DoFnTester extractWordsFn = + DoFnTester.of(new ExtractWordsFn()); + + Assert.assertThat(extractWordsFn.processBundle(" some input words "), + CoreMatchers.hasItems("some", "input", "words")); + Assert.assertThat(extractWordsFn.processBundle(" "), + CoreMatchers.hasItems()); /* NOTE(review): empty hasItems() may be vacuous - confirm */ + Assert.assertThat(extractWordsFn.processBundle(" some ", " input", " words"), + CoreMatchers.hasItems("some", "input", "words")); + } + + static final String[] WORDS_ARRAY = new String[] { + "hi there", "hi", "hi sue bob", + "hi sue", "", "bob hi"}; + + static final List WORDS = Arrays.asList(WORDS_ARRAY); + + static final String[] COUNTS_ARRAY = new String[] { + "hi: 5", "there: 1", "sue: 2", "bob: 2"}; + + @Rule + public TestPipeline p = TestPipeline.create(); + + /** Example test that runs CountWords on the in-memory WORDS input and checks COUNTS_ARRAY.
*/ + @Test + @Category(RunnableOnService.class) + public void testCountWords() throws Exception { + PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); + + PCollection output = input.apply(new CountWords()) + .apply(MapElements.via(new FormatAsTextFn())); + + PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY); + p.run().waitUntilFinish(); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java new file mode 100644 index 0000000000..3823bea673 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package ${package}.complete.game; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import ${package}.complete.game.GameStats.CalculateSpammyUsers; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.RunnableOnService; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests of {@link GameStats}. + * Because the pipeline was designed for easy readability and explanations, it lacks good + * modularity for testing. See our testing documentation for better ideas: + * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline. + */ +@RunWith(JUnit4.class) +public class GameStatsTest implements Serializable { + + // Per-user input scores; the two Robot users score far above everyone else + static final List> USER_SCORES = Arrays.asList( + KV.of("Robot-2", 66), KV.of("Robot-1", 116), KV.of("user7_AndroidGreenKookaburra", 23), + KV.of("user7_AndroidGreenKookaburra", 1), + KV.of("user19_BisqueBilby", 14), KV.of("user13_ApricotQuokka", 15), + KV.of("user18_BananaEmu", 25), KV.of("user6_AmberEchidna", 8), + KV.of("user2_AmberQuokka", 6), KV.of("user0_MagentaKangaroo", 4), + KV.of("user0_MagentaKangaroo", 3), KV.of("user2_AmberCockatoo", 13), + KV.of("user7_AlmondWallaby", 15), KV.of("user6_AmberNumbat", 11), + KV.of("user6_AmberQuokka", 4)); + + // The expected list of 'spammers'. + static final List> SPAMMERS = Arrays.asList( + KV.of("Robot-2", 66), KV.of("Robot-1", 116)); + + @Rule + public TestPipeline p = TestPipeline.create(); + + /** Checks that CalculateSpammyUsers reduces USER_SCORES to exactly the SPAMMERS list.
*/ + @Test + @Category(RunnableOnService.class) + public void testCalculateSpammyUsers() throws Exception { + PCollection> input = p.apply(Create.of(USER_SCORES)); + PCollection> output = input.apply(new CalculateSpammyUsers()); + + // Check the set of spammers. + PAssert.that(output).containsInAnyOrder(SPAMMERS); + + p.run().waitUntilFinish(); + } + + @Test + public void testGameStatsOptions() { + PipelineOptionsFactory.as(GameStats.Options.class); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java new file mode 100644 index 0000000000..c777ffb2fb --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package ${package}.complete.game; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import ${package}.complete.game.UserScore.GameActionInfo; +import ${package}.complete.game.UserScore.ParseEventFn; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.RunnableOnService; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.joda.time.Instant; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests of HourlyTeamScore. + * Because the pipeline was designed for easy readability and explanations, it lacks good + * modularity for testing. See our testing documentation for better ideas: + * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline. 
+ */ +@RunWith(JUnit4.class) +public class HourlyTeamScoreTest implements Serializable { + + static final String[] GAME_EVENTS_ARRAY = new String[] { + "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444", + "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444", + "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444", + "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444", + "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444", + "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444", + "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444", + "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444", + // time gap: the events below carry the later timestamp 1447965690000 + "user0_AndroidGreenEchidna,AndroidGreenEchidna,0,1447965690000,2015-11-19 12:41:31.053", + "user0_MagentaKangaroo,MagentaKangaroo,4,1447965690000,2015-11-19 12:41:31.053", + "user2_AmberCockatoo,AmberCockatoo,13,1447965690000,2015-11-19 12:41:31.053", + "user18_BananaEmu,BananaEmu,7,1447965690000,2015-11-19 12:41:31.053", + "user3_BananaEmu,BananaEmu,17,1447965690000,2015-11-19 12:41:31.053", + "user18_BananaEmu,BananaEmu,1,1447965690000,2015-11-19 12:41:31.053", + "user18_ApricotCaneToad,ApricotCaneToad,14,1447965690000,2015-11-19 12:41:31.053" + }; + + + static final List GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY); + + + // Expected (user, score) pairs: only the events after the time gap pass the filter. + static final KV[] FILTERED_EVENTS = new KV[] { + KV.of("user0_AndroidGreenEchidna", 0), KV.of("user0_MagentaKangaroo", 4), + KV.of("user2_AmberCockatoo", 13), + KV.of("user18_BananaEmu", 7), KV.of("user3_BananaEmu", 17), + KV.of("user18_BananaEmu", 1), KV.of("user18_ApricotCaneToad", 14) + }; + + @Rule + public TestPipeline p = TestPipeline.create(); + + /** Checks that only events timestamped after startMinTimestamp survive the filter.
*/ + @Test + @Category(RunnableOnService.class) + public void testUserScoresFilter() throws Exception { + + final Instant startMinTimestamp = new Instant(1447965680000L); + + PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); + + PCollection> output = input + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) + + .apply("FilterStartTime", Filter.by( + (GameActionInfo gInfo) + -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) + // run a map to access the fields in the result. + .apply(MapElements + .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())) + .withOutputType( + TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))); + + PAssert.that(output).containsInAnyOrder(FILTERED_EVENTS); + + p.run().waitUntilFinish(); + } + + @Test /* NOTE(review): name says UserScore, but this checks HourlyTeamScore.Options */ + public void testUserScoreOptions() { + PipelineOptionsFactory.as(HourlyTeamScore.Options.class); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java new file mode 100644 index 0000000000..b6d673a9db --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java @@ -0,0 +1,366 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ${package}.complete.game; + +import static org.hamcrest.Matchers.hasItem; +import static org.junit.Assert.assertThat; + +import com.google.common.collect.ImmutableMap; +import java.io.Serializable; +import ${package}.complete.game.LeaderBoard.CalculateTeamScores; +import ${package}.complete.game.LeaderBoard.CalculateUserScores; +import ${package}.complete.game.UserScore.GameActionInfo; +import org.apache.beam.sdk.coders.AvroCoder; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.GlobalWindow; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TimestampedValue; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests for {@link LeaderBoard}. 
+ */ +@RunWith(JUnit4.class) +public class LeaderBoardTest implements Serializable { + private static final Duration ALLOWED_LATENESS = Duration.standardHours(1); + private static final Duration TEAM_WINDOW_DURATION = Duration.standardMinutes(20); + private Instant baseTime = new Instant(0); + + @Rule + public TestPipeline p = TestPipeline.create(); + /** + * Example users: two on the red team and two on the blue team. + */ + private enum TestUser { + RED_ONE("scarlet", "red"), RED_TWO("burgundy", "red"), + BLUE_ONE("navy", "blue"), BLUE_TWO("sky", "blue"); + + private final String userName; + private final String teamName; + + TestUser(String userName, String teamName) { + this.userName = userName; + this.teamName = teamName; + } + + public String getUser() { + return userName; + } + + public String getTeam() { + return teamName; + } + } + + /** + * A test of the {@link CalculateTeamScores} {@link PTransform} when all of the elements arrive + * on time (ahead of the watermark). + */ + @Test + public void testTeamScoresOnTime() { + + TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) + // Start at the epoch + .advanceWatermarkTo(baseTime) + // Add some elements ahead of the watermark + .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), + event(TestUser.BLUE_ONE, 2, Duration.standardMinutes(1)), + event(TestUser.RED_TWO, 3, Duration.standardSeconds(22)), + event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(3))) + // The watermark advances slightly, but not past the end of the window + .advanceWatermarkTo(baseTime.plus(Duration.standardMinutes(3))) + // Add some more on time elements + .addElements(event(TestUser.RED_ONE, 1, Duration.standardMinutes(4)), + event(TestUser.BLUE_ONE, 2, Duration.standardSeconds(270))) + // The window should close and emit an ON_TIME pane + .advanceWatermarkToInfinity(); + + PCollection> teamScores = p.apply(createEvents) + .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); + +
String blueTeam = TestUser.BLUE_ONE.getTeam(); + String redTeam = TestUser.RED_ONE.getTeam(); + PAssert.that(teamScores) + .inOnTimePane(new IntervalWindow(baseTime, TEAM_WINDOW_DURATION)) + .containsInAnyOrder(KV.of(blueTeam, 12), KV.of(redTeam, 4)); + + p.run().waitUntilFinish(); + } + + /** + * A test of the {@link CalculateTeamScores} {@link PTransform} when all of the elements arrive + * on time, and the processing time advances far enough for speculative panes. + */ + @Test + public void testTeamScoresSpeculative() { + + TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) + // Start at the epoch + .advanceWatermarkTo(baseTime) + .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), + event(TestUser.BLUE_ONE, 2, Duration.standardMinutes(1))) + // Some time passes within the runner, which causes a speculative pane containing the blue + // team's score to be emitted + .advanceProcessingTime(Duration.standardMinutes(10)) + .addElements(event(TestUser.RED_TWO, 5, Duration.standardMinutes(3))) + // Some additional time passes and we get a speculative pane for the red team + .advanceProcessingTime(Duration.standardMinutes(12)) + .addElements(event(TestUser.BLUE_TWO, 3, Duration.standardSeconds(22))) + // More time passes and a speculative pane containing a refined value for the blue team is + // emitted + .advanceProcessingTime(Duration.standardMinutes(10)) + // Some more events occur + .addElements(event(TestUser.RED_ONE, 4, Duration.standardMinutes(4)), + event(TestUser.BLUE_TWO, 2, Duration.standardMinutes(2))) + // The window closes and we get an ON_TIME pane that contains all of the updates + .advanceWatermarkToInfinity(); + + PCollection> teamScores = p.apply(createEvents) + .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); + + String blueTeam = TestUser.BLUE_ONE.getTeam(); + String redTeam = TestUser.RED_ONE.getTeam(); + IntervalWindow window = new IntervalWindow(baseTime,
TEAM_WINDOW_DURATION); + // The window contains speculative panes alongside the on-time pane + PAssert.that(teamScores) + .inWindow(window) + .containsInAnyOrder(KV.of(blueTeam, 10) /* The on-time blue pane */, + KV.of(redTeam, 9) /* The on-time red pane */, + KV.of(blueTeam, 5) /* The first blue speculative pane */, + KV.of(blueTeam, 8) /* The second blue speculative pane */, + KV.of(redTeam, 5) /* The red speculative pane */); + PAssert.that(teamScores) + .inOnTimePane(window) + .containsInAnyOrder(KV.of(blueTeam, 10), KV.of(redTeam, 9)); + + p.run().waitUntilFinish(); + } + + /** + * A test where elements arrive behind the watermark (late data), but before the end of the + * window. These elements are emitted on time. + */ + @Test + public void testTeamScoresUnobservablyLate() { + + BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION); + TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) + .advanceWatermarkTo(baseTime) + .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), + event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8)), + event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), + event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5))) + .advanceWatermarkTo(baseTime.plus(TEAM_WINDOW_DURATION).minus(Duration.standardMinutes(1))) + // These events are late, but the window hasn't closed yet, so the elements are in the + // on-time pane + .addElements(event(TestUser.RED_TWO, 2, Duration.ZERO), + event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)), + event(TestUser.BLUE_TWO, 2, Duration.standardSeconds(90)), + event(TestUser.RED_TWO, 3, Duration.standardMinutes(3))) + .advanceWatermarkTo(baseTime.plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1))) + .advanceWatermarkToInfinity(); + PCollection> teamScores = p.apply(createEvents) + .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); + + String blueTeam = TestUser.BLUE_ONE.getTeam(); + String redTeam =
TestUser.RED_ONE.getTeam(); + // The on-time pane contains the late elements that arrived before the end of the window + PAssert.that(teamScores) + .inOnTimePane(window) + .containsInAnyOrder(KV.of(redTeam, 14), KV.of(blueTeam, 13)); + + p.run().waitUntilFinish(); + } + + /** + * A test where elements arrive behind the watermark (late data) after the watermark passes the + * end of the window, but before the maximum allowed lateness. These elements are emitted in a + * late pane. + */ + @Test + public void testTeamScoresObservablyLate() { + + Instant firstWindowCloses = baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION); + TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) + .advanceWatermarkTo(baseTime) + .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), + event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8))) + .advanceProcessingTime(Duration.standardMinutes(10)) + .advanceWatermarkTo(baseTime.plus(Duration.standardMinutes(3))) + .addElements(event(TestUser.RED_ONE, 3, Duration.standardMinutes(1)), + event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), + event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5))) + .advanceWatermarkTo(firstWindowCloses.minus(Duration.standardMinutes(1))) + // These events are late but should still appear in a late pane + .addElements(event(TestUser.RED_TWO, 2, Duration.ZERO), + event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)), + event(TestUser.RED_TWO, 3, Duration.standardMinutes(3))) + // A late refinement is emitted due to the advance in processing time, but the window has + // not yet closed because the watermark has not advanced + .advanceProcessingTime(Duration.standardMinutes(12)) + // These elements should appear in the final pane + .addElements(event(TestUser.RED_TWO, 9, Duration.standardMinutes(1)), + event(TestUser.RED_TWO, 1, Duration.standardMinutes(3))) + .advanceWatermarkToInfinity(); + + PCollection> teamScores = p.apply(createEvents) +
.apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); + + BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION); + String blueTeam = TestUser.BLUE_ONE.getTeam(); + String redTeam = TestUser.RED_ONE.getTeam(); + PAssert.that(teamScores) + .inWindow(window) + .satisfies((SerializableFunction>, Void>) input -> { + // The final sums need not exist in the same pane, but must appear in the output + // PCollection + assertThat(input, hasItem(KV.of(blueTeam, 11))); + assertThat(input, hasItem(KV.of(redTeam, 27))); + return null; + }); + PAssert.thatMap(teamScores) + // The closing behavior of CalculateTeamScores precludes an inFinalPane matcher + .inOnTimePane(window) + .isEqualTo(ImmutableMap.builder().put(redTeam, 7) + .put(blueTeam, 11) + .build()); + + // No final pane is emitted for the blue team, as all of their updates have been taken into + // account in earlier panes + PAssert.that(teamScores).inFinalPane(window).containsInAnyOrder(KV.of(redTeam, 27)); + + p.run().waitUntilFinish(); + } + + /** + * A test where elements arrive beyond the maximum allowed lateness. These elements are dropped + * within {@link CalculateTeamScores} and do not impact the final result.
+ */ + @Test + public void testTeamScoresDroppablyLate() { + + BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION); + TestStream infos = TestStream.create(AvroCoder.of(GameActionInfo.class)) + .addElements(event(TestUser.BLUE_ONE, 12, Duration.ZERO), + event(TestUser.RED_ONE, 3, Duration.ZERO)) + .advanceWatermarkTo(window.maxTimestamp()) + .addElements(event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), + event(TestUser.BLUE_TWO, 3, Duration.ZERO), + event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3))) + // Move the watermark past the end of the allowed lateness plus the end of the window + .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS) + .plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1))) + // These elements within the expired window are droppably late, and will not appear in the + // output + .addElements( + event(TestUser.BLUE_TWO, 3, TEAM_WINDOW_DURATION.minus(Duration.standardSeconds(5))), + event(TestUser.RED_ONE, 7, Duration.standardMinutes(4))) + .advanceWatermarkToInfinity(); + PCollection> teamScores = p.apply(infos) + .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); + + String blueTeam = TestUser.BLUE_ONE.getTeam(); + String redTeam = TestUser.RED_ONE.getTeam(); + // Only one on-time pane and no late panes should be emitted + PAssert.that(teamScores) + .inWindow(window) + .containsInAnyOrder(KV.of(redTeam, 7), KV.of(blueTeam, 18)); + // No elements are added before the watermark passes the end of the window plus the allowed + // lateness, so no refinement should be emitted + PAssert.that(teamScores).inFinalPane(window).empty(); + + p.run().waitUntilFinish(); + } + + /** + * A test where elements arrive both on-time and late in {@link CalculateUserScores}, which emits + * output into the {@link GlobalWindow}. All elements that arrive should be taken into account, + * even if they arrive later than the maximum allowed lateness.
+ */ + @Test + public void testUserScore() { + + TestStream infos = + TestStream.create(AvroCoder.of(GameActionInfo.class)) + .addElements( + event(TestUser.BLUE_ONE, 12, Duration.ZERO), + event(TestUser.RED_ONE, 3, Duration.ZERO)) + .advanceProcessingTime(Duration.standardMinutes(7)) + .addElements( + event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), + event(TestUser.BLUE_TWO, 3, Duration.ZERO), + event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3))) + .advanceProcessingTime(Duration.standardMinutes(5)) + .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS).plus(Duration.standardHours(12))) + // Late elements are always observable within the global window - they arrive before + // the window closes, so they will appear in a pane, even if they arrive after the + // allowed lateness, and are taken into account alongside on-time elements + .addElements( + event(TestUser.RED_ONE, 3, Duration.standardMinutes(7)), + event(TestUser.RED_ONE, 2, (ALLOWED_LATENESS).plus(Duration.standardHours(13)))) + .advanceProcessingTime(Duration.standardMinutes(6)) + .addElements(event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(12))) + .advanceProcessingTime(Duration.standardMinutes(20)) + .advanceWatermarkToInfinity(); + + PCollection> userScores = + p.apply(infos).apply(new CalculateUserScores(ALLOWED_LATENESS)); + + // User scores are emitted in speculative panes in the Global Window - this matcher choice + // ensures that panes emitted by the watermark advancing to positive infinity are not included, + // as that will not occur outside of tests + PAssert.that(userScores) + .inEarlyGlobalWindowPanes() + .containsInAnyOrder(KV.of(TestUser.BLUE_ONE.getUser(), 15), + KV.of(TestUser.RED_ONE.getUser(), 7), + KV.of(TestUser.RED_ONE.getUser(), 12), + KV.of(TestUser.BLUE_TWO.getUser(), 3), + KV.of(TestUser.BLUE_TWO.getUser(), 8)); + + p.run().waitUntilFinish(); + } + + @Test + public void testLeaderBoardOptions() { + PipelineOptionsFactory.as(LeaderBoard.Options.class); + } + +
private TimestampedValue event( /* element and timestamp both use baseTime + baseTimeOffset */ + TestUser user, + int score, + Duration baseTimeOffset) { + return TimestampedValue.of(new GameActionInfo(user.getUser(), + user.getTeam(), + score, + baseTime.plus(baseTimeOffset).getMillis()), baseTime.plus(baseTimeOffset)); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java new file mode 100644 index 0000000000..25987ee2cc --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package ${package}.complete.game; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import ${package}.complete.game.UserScore.ExtractAndSumScore; +import ${package}.complete.game.UserScore.GameActionInfo; +import ${package}.complete.game.UserScore.ParseEventFn; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.RunnableOnService; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.DoFnTester; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests of UserScore. 
+ */ +@RunWith(JUnit4.class) +public class UserScoreTest implements Serializable { + + static final String[] GAME_EVENTS_ARRAY = new String[] { + "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444", + "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444", + "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444", + "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444", + "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444", + "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444", + "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444", + "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444", + "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444", + "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444" + }; + + static final String[] GAME_EVENTS_ARRAY2 = new String[] { + "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444", + "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444", + "user13_BisqueBilby,BisqueBilby,xxx,1447955630000,2015-11-19 09:53:53.444" + }; + + static final List GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY); + static final List GAME_EVENTS2 = Arrays.asList(GAME_EVENTS_ARRAY2); + + static final List> USER_SUMS = Arrays.asList( + KV.of("user0_MagentaKangaroo", 3), KV.of("user13_ApricotQuokka", 15), + KV.of("user6_AmberNumbat", 11), KV.of("user7_AlmondWallaby", 15), + KV.of("user7_AndroidGreenKookaburra", 23), + KV.of("user19_BisqueBilby", 14)); + + static final List> TEAM_SUMS = Arrays.asList( + KV.of("MagentaKangaroo", 3), KV.of("ApricotQuokka", 15), + KV.of("AmberNumbat", 11), KV.of("AlmondWallaby", 15), + KV.of("AndroidGreenKookaburra", 23), + KV.of("BisqueBilby", 14)); + + @Rule + public TestPipeline p = TestPipeline.create(); + + /** Test the {@link ParseEventFn} {@link org.apache.beam.sdk.transforms.DoFn}. 
*/ + @Test + public void testParseEventFn() throws Exception { + DoFnTester parseEventFn = + DoFnTester.of(new ParseEventFn()); + + List results = parseEventFn.processBundle(GAME_EVENTS_ARRAY); + Assert.assertEquals(results.size(), 8); + Assert.assertEquals(results.get(0).getUser(), "user0_MagentaKangaroo"); + Assert.assertEquals(results.get(0).getTeam(), "MagentaKangaroo"); + Assert.assertEquals(results.get(0).getScore(), new Integer(3)); + } + + /** Tests ExtractAndSumScore("user"). */ + @Test + @Category(RunnableOnService.class) + public void testUserScoreSums() throws Exception { + + PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); + + PCollection> output = input + .apply(ParDo.of(new ParseEventFn())) + // Extract and sum username/score pairs from the event data. + .apply("ExtractUserScore", new ExtractAndSumScore("user")); + + // Check the user score sums. + PAssert.that(output).containsInAnyOrder(USER_SUMS); + + p.run().waitUntilFinish(); + } + + /** Tests ExtractAndSumScore("team"). */ + @Test + @Category(RunnableOnService.class) + public void testTeamScoreSums() throws Exception { + + PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); + + PCollection> output = input + .apply(ParDo.of(new ParseEventFn())) + // Extract and sum teamname/score pairs from the event data. + .apply("ExtractTeamScore", new ExtractAndSumScore("team")); + + // Check the team score sums. + PAssert.that(output).containsInAnyOrder(TEAM_SUMS); + + p.run().waitUntilFinish(); + } + + /** Test that bad input data is dropped appropriately. 
*/ + @Test + @Category(RunnableOnService.class) + public void testUserScoresBadInput() throws Exception { + + PCollection input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of())); + + PCollection> extract = input + .apply(ParDo.of(new ParseEventFn())) + .apply( + MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())) + .withOutputType( + TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))); + + PAssert.that(extract).empty(); + + p.run().waitUntilFinish(); + } +} diff --git a/maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties new file mode 100644 index 0000000000..b0195b3f16 --- /dev/null +++ b/maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties @@ -0,0 +1,19 @@ +# Copyright (C) 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +package=it.pkg +version=0.1 +groupId=archetype.it +artifactId=basic +targetPlatform=1.8 diff --git a/maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt b/maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt new file mode 100644 index 0000000000..0b5987362f --- /dev/null +++ b/maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt @@ -0,0 +1 @@ +verify diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 80d03bde09..a534e91393 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -15,13 +15,14 @@ ~ the License. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~--> + 4.0.0 com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-parent + google-cloud-dataflow-java-archetypes-parent 2.0.0-beta3-SNAPSHOT - ../../pom.xml + ../pom.xml google-cloud-dataflow-java-archetypes-examples @@ -41,5 +42,38 @@ 2.4 + + + + + maven-archetype-plugin + 2.4 + + + org.apache.maven.shared + maven-invoker + 2.2 + + + + + + default-integration-test + install + + integration-test + + + + + +
+ diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index e1ce538e52..389064a05f 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -23,6 +23,8 @@ ${artifactId} ${version} + jar + UTF-8 @@ -41,8 +43,6 @@ - jar - @@ -55,10 +55,32 @@ + + org.apache.maven.plugins + maven-surefire-plugin + 2.19.1 + + all + 4 + true + + + + org.apache.maven.surefire + surefire-junit47 + 2.19.1 + + + + + org.apache.maven.plugins maven-shade-plugin - 2.3 + 2.4.1 package @@ -67,43 +89,24 @@ ${project.artifactId}-bundled-${project.version} - - - *:* - - *:* + META-INF/LICENSE META-INF/*.SF META-INF/*.DSA META-INF/*.RSA + + + - - - org.apache.maven.plugins - maven-surefire-plugin - 2.19.1 - - all - 4 - true - - - - org.apache.maven.surefire - surefire-junit47 - 2.19.1 - - - @@ -128,6 +131,7 @@ 2.0.0-beta3-SNAPSHOT + com.google.api-client google-api-client @@ -142,7 +146,6 @@ - com.google.apis google-api-services-bigquery @@ -194,7 +197,7 @@ com.google.guava guava - 19.0 + 20.0 diff --git a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties index 1f3c9c5178..8a76657024 100644 --- a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties +++ b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties @@ -13,7 +13,7 @@ # the License. 
package=it.pkg -version=0.1-SNAPSHOT +version=0.1 groupId=archetype.it artifactId=basic targetPlatform=1.7 diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml new file mode 100644 index 0000000000..5e323f6506 --- /dev/null +++ b/maven-archetypes/pom.xml @@ -0,0 +1,38 @@ + + + + + 4.0.0 + + + com.google.cloud.dataflow + google-cloud-dataflow-java-sdk-parent + 2.0.0-beta3-SNAPSHOT + ../pom.xml + + + google-cloud-dataflow-java-archetypes-parent + pom + + Google Cloud Dataflow SDK for Java - Maven Archetypes + + + starter + examples + examples-java8 + + diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index eee032d419..70c82c4e98 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -15,13 +15,14 @@ ~ the License. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~--> + 4.0.0 com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-parent + google-cloud-dataflow-java-archetypes-parent 2.0.0-beta3-SNAPSHOT - ../../pom.xml + ../pom.xml google-cloud-dataflow-java-archetypes-starter @@ -41,5 +42,44 @@ 2.4 + + + + + maven-archetype-plugin + 2.4 + + + org.apache.maven.shared + maven-invoker + 2.2 + + + + + + default-integration-test + install + + integration-test + + + + true + + + + + + + diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index 2ba93d7291..19e0e7acfe 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -41,8 +41,6 @@ - jar - diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties index 1f3c9c5178..8a76657024 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties +++ 
b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties @@ -13,7 +13,7 @@ # the License. package=it.pkg -version=0.1-SNAPSHOT +version=0.1 groupId=archetype.it artifactId=basic targetPlatform=1.7 diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index 1f06820c65..a363653e24 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -21,7 +21,7 @@ archetype.it basic - 0.1-SNAPSHOT + 0.1 UTF-8 @@ -41,8 +41,6 @@ - jar - diff --git a/pom.xml b/pom.xml index 8af9511fc5..2e954c12f7 100644 --- a/pom.xml +++ b/pom.xml @@ -103,7 +103,7 @@ ${maven.build.timestamp} yyyy-MM-dd HH:mm - 0.5.0 + 0.6.0 Google Cloud Dataflow SDK for Java ${project.version}-20170202 @@ -114,8 +114,7 @@ sdk examples - maven-archetypes/starter - maven-archetypes/examples + maven-archetypes From 39e87383964d93d99ab5b720568f7fba40eaca14 Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Fri, 17 Mar 2017 16:02:03 -0700 Subject: [PATCH 27/77] [maven-release-plugin] prepare branch release-2.0.0-beta3 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 2e954c12f7..f1d44cbc7c 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.0.0-beta3 From 317aebad878f9045218cfe77357259bf3546eb58 Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Fri, 17 Mar 2017 16:02:07 -0700 Subject: [PATCH 28/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples-java8/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- 
maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 5bddc98bf4..f5ae58ad8a 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples-java8/pom.xml b/maven-archetypes/examples-java8/pom.xml index 3c19164933..4056cf6f8f 100644 --- a/maven-archetypes/examples-java8/pom.xml +++ b/maven-archetypes/examples-java8/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index a534e91393..8cf6cd34ca 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 5e323f6506..b698ec84b8 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 70c82c4e98..2e61c542c1 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index f1d44cbc7c..a7e091a86e 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT @@ -54,7 +54,7 @@ 
scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.0.0-beta3 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index b43e014a63..14b8061c1b 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT google-cloud-dataflow-java-sdk-all From f0ccc77f8f255e0b3cc5b2d570ce43e347d7dabe Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Fri, 17 Mar 2017 16:17:04 -0700 Subject: [PATCH 29/77] Update archetype versioning after cutting 2.0.0-beta3 release branch Signed-off-by: Jason Kuster --- .../src/main/resources/archetype-resources/pom.xml | 2 +- .../examples/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/main/resources/archetype-resources/pom.xml | 2 +- .../starter/src/test/resources/projects/basic/reference/pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml index 0f5c2d13aa..e7c1647069 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -128,7 +128,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 389064a05f..0ee80d0539 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -128,7 +128,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 
2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index 19e0e7acfe..b392af2991 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -72,7 +72,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index a363653e24..6aff824959 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -72,7 +72,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta3-SNAPSHOT + 2.0.0-beta4-SNAPSHOT From 328a0d44588bfda2d78e46960ddbb61ebb2ec65e Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Fri, 17 Mar 2017 16:20:32 -0700 Subject: [PATCH 30/77] Update worker container image for 2.0.0-beta4-SNAPSHOT Signed-off-by: Jason Kuster --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a7e091a86e..7908489603 100644 --- a/pom.xml +++ b/pom.xml @@ -106,7 +106,7 @@ 0.6.0 Google Cloud Dataflow SDK for Java - ${project.version}-20170202 + ${project.version}-20170317 6 From 4ede2806b898157e381995242384e3c515e8795f Mon Sep 17 00:00:00 2001 From: Anil Muppalla Date: Mon, 3 Apr 2017 22:05:16 -0400 Subject: [PATCH 31/77] edited doc --- .../cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java 
b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java index 5fd8554c90..4b4184063d 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/windowing/AfterWatermark.java @@ -58,7 +58,7 @@ * *

Additionaly firings before or after the watermark can be requested by calling * {@code AfterWatermark.pastEndOfWindow.withEarlyFirings(OnceTrigger)} or - * {@code AfterWatermark.pastEndOfWindow.withEarlyFirings(OnceTrigger)}. + * {@code AfterWatermark.pastEndOfWindow.withLateFirings(OnceTrigger)}. * * @param {@link BoundedWindow} subclass used to represent the windows used. */ From 1cb04a638781943af15975590c4cabb6995fafd7 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Fri, 7 Apr 2017 15:35:25 -0700 Subject: [PATCH 32/77] DataflowPipelineJob: gracefully handle cancellatoin concurrent with termination This is a backport of BEAM-1880 https://github.com/apache/beam/pull/2428 --- .../sdk/runners/DataflowPipelineJob.java | 32 ++++++++++++++++-- .../sdk/runners/DataflowPipelineJobTest.java | 33 +++++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java index 4a68755565..ced2759561 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJob.java @@ -292,9 +292,35 @@ public void cancel() throws IOException { content.setProjectId(projectId); content.setId(jobId); content.setRequestedState("JOB_STATE_CANCELLED"); - dataflowClient.projects().jobs() - .update(projectId, jobId, content) - .execute(); + try { + dataflowClient.projects().jobs() + .update(projectId, jobId, content) + .execute(); + } catch (IOException e) { + State state = getState(); + if (state.isTerminal()) { + LOG.warn("Cancel failed because job {} is already terminated in state {}.", jobId, state); + } else if (e.getMessage().contains("has terminated")) { + // This handles the case where the getState() call above returns RUNNING but the cancel + // was rejected because the job is in fact done. 
Hopefully, someday we can delete this + // code if there is better consistency between the State and whether Cancel succeeds. + // + // Example message: + // Workflow modification failed. Causes: (7603adc9e9bff51e): Cannot perform + // operation 'cancel' on Job: 2017-04-01_22_50_59-9269855660514862348. Job has + // terminated in state SUCCESS: Workflow job: 2017-04-01_22_50_59-9269855660514862348 + // succeeded. + LOG.warn("Cancel failed because job {} is already terminated.", jobId, e); + } else { + String errorMsg = String.format( + "Failed to cancel job in state %s, " + + "please go to the Developers Console to cancel it manually: %s", + state, + MonitoringUtil.getJobMonitoringPageURL(getProjectId(), getJobId())); + LOG.warn(errorMsg); + throw new IOException(errorMsg, e); + } + } } @Override diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJobTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJobTest.java index 1d6ccc66ab..9d1172bb7b 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJobTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineJobTest.java @@ -26,6 +26,7 @@ import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.lessThanOrEqualTo; import static org.junit.Assert.assertEquals; +import static org.mockito.Matchers.any; import static org.mockito.Matchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -42,6 +43,7 @@ import com.google.api.services.dataflow.model.MetricUpdate; import com.google.cloud.dataflow.sdk.PipelineResult.State; import com.google.cloud.dataflow.sdk.runners.dataflow.DataflowAggregatorTransforms; +import com.google.cloud.dataflow.sdk.testing.ExpectedLogs; import com.google.cloud.dataflow.sdk.testing.FastNanoClockAndSleeper; import com.google.cloud.dataflow.sdk.transforms.Aggregator; import 
com.google.cloud.dataflow.sdk.transforms.AppliedPTransform; @@ -91,6 +93,9 @@ public class DataflowPipelineJobTest { @Rule public ExpectedException thrown = ExpectedException.none(); + @Rule + public ExpectedLogs expectedLogs = ExpectedLogs.none(DataflowPipelineJob.class); + @Before public void setup() { MockitoAnnotations.initMocks(this); @@ -193,6 +198,34 @@ public void testWaitToFinishCancelled() throws Exception { assertEquals(State.CANCELLED, mockWaitToFinishInState(State.CANCELLED)); } + /** + * Test that {@link DataflowPipelineJob#cancel} doesn't throw if the Dataflow service returns + * non-terminal state even though the cancel API call failed, which can happen in practice. + * + *

TODO: delete this code if the API calls become consistent. + */ + @Test + public void testCancelTerminatedJobWithStaleState() throws IOException { + Dataflow.Projects.Jobs.Get statusRequest = + mock(Dataflow.Projects.Jobs.Get.class); + + Job statusResponse = new Job(); + statusResponse.setCurrentState("JOB_STATE_RUNNING"); + when(mockJobs.get(PROJECT_ID, JOB_ID)).thenReturn(statusRequest); + when(statusRequest.execute()).thenReturn(statusResponse); + + Dataflow.Projects.Jobs.Update update = mock( + Dataflow.Projects.Jobs.Update.class); + when(mockJobs.update(eq(PROJECT_ID), eq(JOB_ID), any(Job.class))) + .thenReturn(update); + when(update.execute()).thenThrow(new IOException("Job has terminated in state SUCCESS")); + + DataflowPipelineJob job = new DataflowPipelineJob( + PROJECT_ID, JOB_ID, mockWorkflowClient, null); + job.cancel(); + expectedLogs.verifyWarn("Cancel failed because job " + JOB_ID + " is already terminated."); + } + /** * Tests that the {@link DataflowPipelineJob} understands that the {@link State#FAILED FAILED} * state is terminal. 
From 7cecf6e249015dac02a1a0e66b9225b03077611e Mon Sep 17 00:00:00 2001 From: Eugene Kirpichov Date: Wed, 19 Apr 2017 10:27:12 -0700 Subject: [PATCH 33/77] Cache result of BigQuerySourceBase.split --- .../cloud/dataflow/sdk/io/BigQueryIO.java | 39 ++++++++++++------- .../cloud/dataflow/sdk/io/BigQueryIOTest.java | 7 +++- 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java index f844f49aed..ba5fc3bb4a 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java @@ -1211,6 +1211,8 @@ private abstract static class BigQuerySourceBase extends BoundedSource protected final BigQueryServices bqServices; protected final ValueProvider executingProject; + private List> cachedSplitResult; + private BigQuerySourceBase( String jobIdToken, String extractDestinationDir, @@ -1225,19 +1227,30 @@ private BigQuerySourceBase( @Override public List> splitIntoBundles( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { - BigQueryOptions bqOptions = options.as(BigQueryOptions.class); - TableReference tableToExtract = getTableToExtract(bqOptions); - JobService jobService = bqServices.getJobService(bqOptions); - String extractJobId = getExtractJobId(jobIdToken); - List tempFiles = executeExtract(extractJobId, tableToExtract, jobService); - - TableSchema tableSchema = bqServices.getDatasetService(bqOptions).getTable( - tableToExtract.getProjectId(), - tableToExtract.getDatasetId(), - tableToExtract.getTableId()).getSchema(); - - cleanupTempResource(bqOptions); - return createSources(tempFiles, tableSchema); + // splitIntoBundles() can be called multiple times, e.g. Dataflow runner may call it multiple + // times with different desiredBundleSizeBytes in case the splitIntoBundles() call produces + // too many sources. 
We ignore desiredBundleSizeBytes anyway, however in any case, we should + // not initiate another BigQuery extract job for the repeated splitIntoBundles() calls. + if (cachedSplitResult == null) { + BigQueryOptions bqOptions = options.as(BigQueryOptions.class); + TableReference tableToExtract = getTableToExtract(bqOptions); + JobService jobService = bqServices.getJobService(bqOptions); + String extractJobId = getExtractJobId(jobIdToken); + List tempFiles = executeExtract(extractJobId, tableToExtract, jobService); + + TableSchema tableSchema = + bqServices + .getDatasetService(bqOptions) + .getTable( + tableToExtract.getProjectId(), + tableToExtract.getDatasetId(), + tableToExtract.getTableId()) + .getSchema(); + + cleanupTempResource(bqOptions); + cachedSplitResult = createSources(tempFiles, tableSchema); + } + return cachedSplitResult; } protected abstract TableReference getTableToExtract(BigQueryOptions bqOptions) throws Exception; diff --git a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java index 7f9d2e95ce..1e4f733096 100644 --- a/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java +++ b/sdk/src/test/java/com/google/cloud/dataflow/sdk/io/BigQueryIOTest.java @@ -30,6 +30,7 @@ import static org.mockito.Matchers.eq; import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.when; import com.google.api.client.util.Data; @@ -1130,10 +1131,14 @@ public void testBigQueryTableSourceInitSplit() throws Exception { List> sources = bqSource.splitIntoBundles(100, options); assertEquals(1, sources.size()); + // Simulate a repeated call to splitIntoBundles(), like a Dataflow worker will sometimes do. 
+ sources = bqSource.splitIntoBundles(200, options); + assertEquals(1, sources.size()); BoundedSource actual = sources.get(0); assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class)); - Mockito.verify(mockJobService) + // A repeated call to splitIntoBundles() should not have caused a duplicate extract job. + Mockito.verify(mockJobService, times(1)) .startExtractJob(Mockito.any(), Mockito.any()); } From 52e593a18b92ead25341244922b5334307daa70f Mon Sep 17 00:00:00 2001 From: Eugene Kirpichov Date: Fri, 21 Apr 2017 13:25:58 -0700 Subject: [PATCH 34/77] Makes cachedSplitResult transient in BigQuerySourceBase --- .../main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java index ba5fc3bb4a..d55dfd5910 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/io/BigQueryIO.java @@ -1211,7 +1211,7 @@ private abstract static class BigQuerySourceBase extends BoundedSource protected final BigQueryServices bqServices; protected final ValueProvider executingProject; - private List> cachedSplitResult; + private transient List> cachedSplitResult; private BigQuerySourceBase( String jobIdToken, From 0b189833709c3bb35327e31e29f0dc87152c1c46 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Fri, 28 Apr 2017 10:32:20 -0700 Subject: [PATCH 35/77] Archetypes: version management at the top-level outside archetype source code --- .../resources/archetype-resources/pom.xml | 2 +- .../resources/archetype-resources/pom.xml | 2 +- maven-archetypes/pom.xml | 34 +++++++++++++++++++ .../resources/archetype-resources/pom.xml | 2 +- pom.xml | 8 +++++ 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml 
b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml index e7c1647069..a8677aeb3e 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -128,7 +128,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta4-SNAPSHOT + @archetype.sdk_version_dependency@ diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 0ee80d0539..55eac300f1 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -128,7 +128,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta4-SNAPSHOT + @archetype.sdk_version_dependency@ diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index b698ec84b8..837c2afad2 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -35,4 +35,38 @@ examples examples-java8 + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + @ + + false + + + + + + + + src/main/resources + true + + archetype-resources/pom.xml + + + + src/main/resources + false + + archetype-resources/pom.xml + + + + diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index b392af2991..e7f2e59c1f 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -72,7 +72,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta4-SNAPSHOT + @archetype.sdk_version_dependency@ diff --git a/pom.xml b/pom.xml index 7908489603..5c5e993397 100644 --- a/pom.xml +++ b/pom.xml @@ -108,6 +108,8 @@ Google Cloud Dataflow SDK for Java 
${project.version}-20170317 6 + + 2.0.0-beta4-SNAPSHOT pom @@ -262,6 +264,12 @@ + + org.apache.maven.plugins + maven-resources-plugin + 3.0.2 + + org.apache.maven.plugins maven-dependency-plugin From f89d619f68c94ac54971265b7ecf0b7778f072fe Mon Sep 17 00:00:00 2001 From: Daniel Halperin Date: Fri, 28 Apr 2017 16:10:25 -0700 Subject: [PATCH 36/77] Fix a typo in Count.java --- .../java/com/google/cloud/dataflow/sdk/transforms/Count.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java index ffa11d13a3..99d5fb8c86 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/Count.java @@ -21,7 +21,7 @@ import com.google.cloud.dataflow.sdk.values.PCollection; /** - * {@code PTransorm}s to count the elements in a {@link PCollection}. + * {@code PTransform}s to count the elements in a {@link PCollection}. * *

{@link Count#perElement()} can be used to count the number of occurrences of each * distinct element in the PCollection, {@link Count#perKey()} can be used to count the From 3bedec34db0c965d199c2139eeeb9aa8fb4ec936 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Wed, 3 May 2017 23:56:59 -0700 Subject: [PATCH 37/77] Add comment to pom.xml explaining archetype.sdk_version_dependency --- pom.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pom.xml b/pom.xml index 5c5e993397..e6efd5c2d7 100644 --- a/pom.xml +++ b/pom.xml @@ -109,6 +109,11 @@ ${project.version}-20170317 6 + 2.0.0-beta4-SNAPSHOT From 0e93387d9e9f6d29fe75f53a98d39163ac56d29b Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Wed, 3 May 2017 23:57:14 -0700 Subject: [PATCH 38/77] Add comment to pom.xml explaining "archetype.sdk_version_dependency" From 8c35b75454ffb714e9663b5c0d4b9b65dcf047bf Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Fri, 5 May 2017 16:36:44 -0700 Subject: [PATCH 39/77] Starter archetype: automate version management in the reference code --- maven-archetypes/starter/pom.xml | 8 ++++++++ .../src/test/resources/projects/basic/reference/pom.xml | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 2e61c542c1..5832a55772 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -43,6 +43,14 @@ + + + + src/test/resources + true + + + diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index 6aff824959..19732871d0 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -72,7 +72,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - 2.0.0-beta4-SNAPSHOT + @project.version@ From 
5c096caa20b6d1800a318eeeafe68056346214b1 Mon Sep 17 00:00:00 2001 From: jasonkuster Date: Mon, 22 May 2017 19:24:04 -0700 Subject: [PATCH 40/77] Upgrade to Apache Beam, version 2.0.0 (#571) * Upgrade to Apache Beam, version 2.0.0 Signed-off-by: Jason Kuster * Revert enforcer plugin changes since they seem to be beam-only. Signed-off-by: Jason Kuster * Pull request comments. Signed-off-by: Jason Kuster --- maven-archetypes/examples-java8/pom.xml | 1 + .../resources/archetype-resources/pom.xml | 14 +- .../src/main/java/DebuggingWordCount.java | 24 ++- .../src/main/java/MinimalWordCount.java | 7 +- .../src/main/java/MinimalWordCountJava8.java | 14 +- .../src/main/java/WindowedWordCount.java | 43 ++-- .../src/main/java/WordCount.java | 20 +- .../common/ExampleBigQueryTableOptions.java | 2 +- ...mplePubsubTopicAndSubscriptionOptions.java | 2 +- .../common/ExamplePubsubTopicOptions.java | 2 +- .../src/main/java/common/ExampleUtils.java | 74 ++++++- .../java/common/WriteOneFilePerWindow.java | 114 +++++++++++ .../java/common/WriteWindowedFilesDoFn.java | 77 -------- .../main/java/complete/game/GameStats.java | 49 ++--- .../java/complete/game/HourlyTeamScore.java | 60 +++--- .../main/java/complete/game/LeaderBoard.java | 70 +++++-- .../src/main/java/complete/game/README.md | 131 ------------- .../main/java/complete/game/UserScore.java | 87 ++++----- .../java/complete/game/injector/Injector.java | 3 +- .../complete/game/utils/WriteToBigQuery.java | 31 +-- .../java/complete/game/utils/WriteToText.java | 184 ++++++++++++++++++ .../game/utils/WriteWindowedToBigQuery.java | 14 +- .../test/java/MinimalWordCountJava8Test.java | 15 +- .../src/test/java/WordCountTest.java | 4 +- .../java/complete/game/GameStatsTest.java | 6 +- .../complete/game/HourlyTeamScoreTest.java | 11 +- .../java/complete/game/UserScoreTest.java | 14 +- maven-archetypes/examples/pom.xml | 1 + .../resources/archetype-resources/pom.xml | 16 +- .../src/main/java/DebuggingWordCount.java | 24 ++- 
.../src/main/java/MinimalWordCount.java | 7 +- .../src/main/java/WindowedWordCount.java | 43 ++-- .../src/main/java/WordCount.java | 20 +- .../common/ExampleBigQueryTableOptions.java | 2 +- ...mplePubsubTopicAndSubscriptionOptions.java | 2 +- .../common/ExamplePubsubTopicOptions.java | 2 +- .../src/main/java/common/ExampleUtils.java | 74 ++++++- .../java/common/WriteOneFilePerWindow.java | 114 +++++++++++ .../java/common/WriteWindowedFilesDoFn.java | 77 -------- .../src/test/java/WordCountTest.java | 4 +- maven-archetypes/pom.xml | 51 +++-- pom.xml | 10 +- 42 files changed, 883 insertions(+), 637 deletions(-) create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/README.md create mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java create mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java diff --git a/maven-archetypes/examples-java8/pom.xml b/maven-archetypes/examples-java8/pom.xml index 4056cf6f8f..b9ae4738d9 100644 --- a/maven-archetypes/examples-java8/pom.xml +++ b/maven-archetypes/examples-java8/pom.xml @@ -74,6 +74,7 @@ + diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml index a8677aeb3e..275e2581a2 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ 
b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -27,6 +27,7 @@ UTF-8 + 2.20 @@ -58,7 +59,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.19.1 + ${surefire-plugin.version} all 4 @@ -68,11 +69,18 @@ org.apache.maven.surefire surefire-junit47 - 2.19.1 + ${surefire-plugin.version} + + + org.apache.maven.plugins + maven-jar-plugin + + - -# 'Gaming' examples - - -This directory holds a series of example Dataflow pipelines in a simple 'mobile -gaming' domain. They all require Java 8. Each pipeline successively introduces -new concepts, and gives some examples of using Java 8 syntax in constructing -Dataflow pipelines. Other than usage of Java 8 lambda expressions, the concepts -that are used apply equally well in Java 7. - -In the gaming scenario, many users play, as members of different teams, over -the course of a day, and their actions are logged for processing. Some of the -logged game events may be late-arriving, if users play on mobile devices and go -transiently offline for a period. - -The scenario includes not only "regular" users, but "robot users", which have a -higher click rate than the regular users, and may move from team to team. - -The first two pipelines in the series use pre-generated batch data samples. The -second two pipelines read from a [PubSub](https://cloud.google.com/pubsub/) -topic input. For these examples, you will also need to run the -`injector.Injector` program, which generates and publishes the gaming data to -PubSub. The javadocs for each pipeline have more detailed information on how to -run that pipeline. - -All of these pipelines write their results to BigQuery table(s). - - -## The pipelines in the 'gaming' series - -### UserScore - -The first pipeline in the series is `UserScore`. This pipeline does batch -processing of data collected from gaming events. It calculates the sum of -scores per user, over an entire batch of gaming data (collected, say, for each -day). 
The batch processing will not include any late data that arrives after -the day's cutoff point. - -### HourlyTeamScore - -The next pipeline in the series is `HourlyTeamScore`. This pipeline also -processes data collected from gaming events in batch. It builds on `UserScore`, -but uses [fixed windows](https://cloud.google.com/dataflow/model/windowing), by -default an hour in duration. It calculates the sum of scores per team, for each -window, optionally allowing specification of two timestamps before and after -which data is filtered out. This allows a model where late data collected after -the intended analysis window can be included in the analysis, and any late- -arriving data prior to the beginning of the analysis window can be removed as -well. - -By using windowing and adding element timestamps, we can do finer-grained -analysis than with the `UserScore` pipeline — we're now tracking scores for -each hour rather than over the course of a whole day. However, our batch -processing is high-latency, in that we don't get results from plays at the -beginning of the batch's time period until the complete batch is processed. - -### LeaderBoard - -The third pipeline in the series is `LeaderBoard`. This pipeline processes an -unbounded stream of 'game events' from a PubSub topic. The calculation of the -team scores uses fixed windowing based on event time (the time of the game play -event), not processing time (the time that an event is processed by the -pipeline). The pipeline calculates the sum of scores per team, for each window. -By default, the team scores are calculated using one-hour windows. - -In contrast — to demo another windowing option — the user scores are calculated -using a global window, which periodically (every ten minutes) emits cumulative -user score sums. 
- -In contrast to the previous pipelines in the series, which used static, finite -input data, here we're using an unbounded data source, which lets us provide -_speculative_ results, and allows handling of late data, at much lower latency. -E.g., we could use the early/speculative results to keep a 'leaderboard' -updated in near-realtime. Our handling of late data lets us generate correct -results, e.g. for 'team prizes'. We're now outputing window results as they're -calculated, giving us much lower latency than with the previous batch examples. - -### GameStats - -The fourth pipeline in the series is `GameStats`. This pipeline builds -on the `LeaderBoard` functionality — supporting output of speculative and late -data — and adds some "business intelligence" analysis: identifying abuse -detection. The pipeline derives the Mean user score sum for a window, and uses -that information to identify likely spammers/robots. (The injector is designed -so that the "robots" have a higher click rate than the "real" users). The robot -users are then filtered out when calculating the team scores. - -Additionally, user sessions are tracked: that is, we find bursts of user -activity using session windows. Then, the mean session duration information is -recorded in the context of subsequent fixed windowing. (This could be used to -tell us what games are giving us greater user retention). - -### Running the PubSub Injector - -The `LeaderBoard` and `GameStats` example pipelines read unbounded data -from a PubSub topic. - -Use the `injector.Injector` program to generate this data and publish to a -PubSub topic. See the `Injector`javadocs for more information on how to run the -injector. Set up the injector before you start one of these pipelines. Then, -when you start the pipeline, pass as an argument the name of that PubSub topic. -See the pipeline javadocs for the details. - -## Viewing the results in BigQuery - -All of the pipelines write their results to BigQuery. 
`UserScore` and -`HourlyTeamScore` each write one table, and `LeaderBoard` and -`GameStats` each write two. The pipelines have default table names that -you can override when you start up the pipeline if those tables already exist. - -Depending on the windowing intervals defined in a given pipeline, you may have -to wait for a while (more than an hour) before you start to see results written -to the BigQuery tables. diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java index d500658694..c693614c57 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java @@ -20,17 +20,18 @@ import java.util.HashMap; import java.util.Map; import org.apache.avro.reflect.Nullable; -import ${package}.complete.game.utils.WriteToBigQuery; +import ${package}.complete.game.utils.WriteToText; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.AvroCoder; import org.apache.beam.sdk.coders.DefaultCoder; import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.Validation; -import org.apache.beam.sdk.transforms.Aggregator; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; @@ -44,8 +45,8 @@ /** * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain. 
- * Concepts: batch processing; reading input from Google Cloud Storage and writing output to - * BigQuery; using standalone DoFns; use of the sum by key transform; examples of + * Concepts: batch processing, reading input from text files, writing output to + * text files, using standalone DoFns, use of the sum per key transform, and use of * Java 8 lambda syntax. * *

In this gaming scenario, many users play, as members of different teams, over the course of a @@ -56,16 +57,14 @@ * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The * batch processing will not include any late data that arrives after the day's cutoff point. * - *

To execute this pipeline using the Dataflow service and static example input data, specify - * the pipeline configuration like this: + *

To execute this pipeline, specify the pipeline configuration like this: *

{@code
- *   --project=YOUR_PROJECT_ID
- *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
- *   --runner=BlockingDataflowRunner
- *   --dataset=YOUR-DATASET
+ *   --tempLocation=YOUR_TEMP_DIRECTORY
+ *   --runner=YOUR_RUNNER
+ *   --output=YOUR_OUTPUT_DIRECTORY
+ *   (possibly options specific to your runner or permissions for your temp/output locations)
  * }
  * 
- * where the BigQuery dataset you specify must already exist. * *

Optionally include the --input argument to specify a batch input file. * See the --input default value for example batch data file, or use {@code injector.Injector} to @@ -125,8 +124,7 @@ static class ParseEventFn extends DoFn { // Log and count parse errors. private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class); - private final Aggregator numParseErrors = - createAggregator("ParseErrors", Sum.ofLongs()); + private final Counter numParseErrors = Metrics.counter("main", "ParseErrors"); @ProcessElement public void processElement(ProcessContext c) { @@ -139,7 +137,7 @@ public void processElement(ProcessContext c) { GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp); c.output(gInfo); } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) { - numParseErrors.addValue(1L); + numParseErrors.inc(); LOG.info("Parse error on " + c.element() + ", " + e.getMessage()); } } @@ -165,9 +163,8 @@ public PCollection> expand( return gameInfo .apply(MapElements - .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore())) - .withOutputType( - TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))) + .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) + .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))) .apply(Sum.integersPerKey()); } } @@ -186,37 +183,26 @@ public interface Options extends PipelineOptions { String getInput(); void setInput(String value); - @Description("BigQuery Dataset to write tables to. Must already exist.") + // Set this required option to specify where to write the output. + @Description("Path of the file to write to.") @Validation.Required - String getDataset(); - void setDataset(String value); - - @Description("The BigQuery table name. 
Should not already exist.") - @Default.String("user_score") - String getUserScoreTableName(); - void setUserScoreTableName(String value); + String getOutput(); + void setOutput(String value); } /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is passed to the {@link WriteToBigQuery} constructor to write user score sums. + * Create a map of information that describes how to write pipeline output to text. This map + * is passed to the {@link WriteToText} constructor to write user score sums. */ - protected static Map>> - configureBigQueryWrite() { - Map>> tableConfigure = - new HashMap>>(); - tableConfigure.put( - "user", - new WriteToBigQuery.FieldInfo>( - "STRING", (c, w) -> c.element().getKey())); - tableConfigure.put( - "total_score", - new WriteToBigQuery.FieldInfo>( - "INTEGER", (c, w) -> c.element().getValue())); - return tableConfigure; + protected static Map>> + configureOutput() { + Map>> config = + new HashMap>>(); + config.put("user", (c, w) -> c.element().getKey()); + config.put("total_score", (c, w) -> c.element().getValue()); + return config; } - /** * Run a batch pipeline. */ @@ -227,17 +213,20 @@ public static void main(String[] args) throws Exception { Pipeline pipeline = Pipeline.create(options); // Read events from a text file and parse them. - pipeline.apply(TextIO.Read.from(options.getInput())) - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) - // Extract and sum username/score pairs from the event data. - .apply("ExtractUserScore", new ExtractAndSumScore("user")) - .apply("WriteUserScoreSums", - new WriteToBigQuery>(options.getUserScoreTableName(), - configureBigQueryWrite())); + pipeline + .apply(TextIO.read().from(options.getInput())) + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) + // Extract and sum username/score pairs from the event data. 
+ .apply("ExtractUserScore", new ExtractAndSumScore("user")) + .apply( + "WriteUserScoreSums", + new WriteToText>( + options.getOutput(), + configureOutput(), + false)); // Run the batch pipeline. pipeline.run().waitUntilFinish(); } // [END DocInclude_USMain] - } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java index c8531c15d1..4814ffb66f 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java @@ -260,8 +260,7 @@ private static String generateEvent(Long currTime, int delayInMillis) { user = team.getRandomUser(); } String event = user + "," + teamName + "," + random.nextInt(MAX_SCORE); - // Randomly introduce occasional parse errors. You can see a custom counter tracking the number - // of such errors in the Dataflow Monitoring UI, as the example pipeline runs. + // Randomly introduce occasional parse errors. 
if (random.nextInt(parseErrorRate) == 0) { System.out.println("Introducing a parse error."); event = "THIS LINE REPRESENTS CORRUPT DATA AND WILL CAUSE A PARSE ERROR"; diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java index 0e800623ab..984e958c50 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java @@ -25,13 +25,9 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import ${package}.complete.game.UserScore; -import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.options.GcpOptions; -import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; @@ -47,14 +43,21 @@ public class WriteToBigQuery extends PTransform, PDone> { + protected String projectId; + protected String datasetId; protected String tableName; protected Map> fieldInfo; public WriteToBigQuery() { } - public WriteToBigQuery(String tableName, + public WriteToBigQuery( + String projectId, + String datasetId, + String tableName, Map> fieldInfo) { + this.projectId = projectId; + this.datasetId = datasetId; this.tableName = tableName; this.fieldInfo = fieldInfo; } @@ -119,22 +122,22 @@ protected TableSchema getSchema() { @Override public PDone expand(PCollection teamAndScore) { - return teamAndScore - 
.apply("ConvertToRow", ParDo.of(new BuildRowFn())) - .apply(BigQueryIO.Write - .to(getTable(teamAndScore.getPipeline(), - tableName)) + teamAndScore + .apply("ConvertToRow", ParDo.of(new BuildRowFn())) + .apply( + BigQueryIO.writeTableRows() + .to(getTable(projectId, datasetId, tableName)) .withSchema(getSchema()) .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(WriteDisposition.WRITE_APPEND)); + return PDone.in(teamAndScore.getPipeline()); } /** Utility to construct an output table reference. */ - static TableReference getTable(Pipeline pipeline, String tableName) { - PipelineOptions options = pipeline.getOptions(); + static TableReference getTable(String projectId, String datasetId, String tableName) { TableReference table = new TableReference(); - table.setDatasetId(options.as(UserScore.Options.class).getDataset()); - table.setProjectId(options.as(GcpOptions.class).getProject()); + table.setDatasetId(datasetId); + table.setProjectId(projectId); table.setTableId(tableName); return table; } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java new file mode 100644 index 0000000000..6d4e1399b3 --- /dev/null +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.complete.game.utils; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Verify.verifyNotNull; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; +import java.util.stream.Collectors; +import org.apache.beam.sdk.io.FileBasedSink; +import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +/** + * Generate, format, and write rows. Use provided information about the field names and types, as + * well as lambda functions that describe how to generate their values. 
+ */ +public class WriteToText + extends PTransform, PDone> { + + private static final DateTimeFormatter formatter = + DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + + protected String filenamePrefix; + protected Map> fieldFn; + protected boolean windowed; + + public WriteToText() { + } + + public WriteToText( + String filenamePrefix, + Map> fieldFn, + boolean windowed) { + this.filenamePrefix = filenamePrefix; + this.fieldFn = fieldFn; + this.windowed = windowed; + } + + /** + * A {@link Serializable} function from a {@link DoFn.ProcessContext} + * and {@link BoundedWindow} to the value for that field. + */ + public interface FieldFn extends Serializable { + Object apply(DoFn.ProcessContext context, BoundedWindow window); + } + + /** Convert each key/score pair into a row as specified by fieldFn. */ + protected class BuildRowFn extends DoFn { + + @ProcessElement + public void processElement(ProcessContext c, BoundedWindow window) { + List fields = new ArrayList(); + for (Map.Entry> entry : fieldFn.entrySet()) { + String key = entry.getKey(); + FieldFn fcn = entry.getValue(); + fields.add(key + ": " + fcn.apply(c, window)); + } + String result = fields.stream().collect(Collectors.joining(", ")); + c.output(result); + } + } + + /** + * A {@link DoFn} that writes elements to files with names deterministically derived from the + * lower and upper bounds of their key (an {@link IntervalWindow}). + */ + protected class WriteOneFilePerWindow extends PTransform, PDone> { + + private final String filenamePrefix; + + public WriteOneFilePerWindow(String filenamePrefix) { + this.filenamePrefix = filenamePrefix; + } + + @Override + public PDone expand(PCollection input) { + // Verify that the input has a compatible window type. 
+ checkArgument( + input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder()); + + // filenamePrefix may contain a directory and a filename component. Pull out only the filename + // component from that path for the PerWindowFiles. + String prefix = ""; + ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); + if (!resource.isDirectory()) { + prefix = verifyNotNull( + resource.getFilename(), + "A non-directory resource should have a non-null filename: %s", + resource); + } + + return input.apply( + TextIO.write() + .to(resource.getCurrentDirectory()) + .withFilenamePolicy(new PerWindowFiles(prefix)) + .withWindowedWrites() + .withNumShards(3)); + } + } + + /** + * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data + * being written. This always includes the shard number and the total number of shards. For + * windowed writes, it also includes the window and pane index (a sequence number assigned to each + * trigger firing). 
+ */ + protected static class PerWindowFiles extends FilenamePolicy { + + private final String prefix; + + public PerWindowFiles(String prefix) { + this.prefix = prefix; + } + + public String filenamePrefixForWindow(IntervalWindow window) { + return String.format("%s-%s-%s", + prefix, formatter.print(window.start()), formatter.print(window.end())); + } + + @Override + public ResourceId windowedFilename( + ResourceId outputDirectory, WindowedContext context, String extension) { + IntervalWindow window = (IntervalWindow) context.getWindow(); + String filename = String.format( + "%s-%s-of-%s%s", + filenamePrefixForWindow(window), context.getShardNumber(), context.getNumShards(), + extension); + return outputDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE); + } + + @Override + public ResourceId unwindowedFilename( + ResourceId outputDirectory, Context context, String extension) { + throw new UnsupportedOperationException("Unsupported."); + } + } + + @Override + public PDone expand(PCollection teamAndScore) { + if (windowed) { + teamAndScore + .apply("ConvertToRow", ParDo.of(new BuildRowFn())) + .apply(new WriteToText.WriteOneFilePerWindow(filenamePrefix)); + } else { + teamAndScore + .apply("ConvertToRow", ParDo.of(new BuildRowFn())) + .apply(TextIO.write().to(filenamePrefix)); + } + return PDone.in(teamAndScore.getPipeline()); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java index 839650f02d..6aef88706d 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java @@ -36,9 +36,9 @@ public class 
WriteWindowedToBigQuery extends WriteToBigQuery { - public WriteWindowedToBigQuery(String tableName, - Map> fieldInfo) { - super(tableName, fieldInfo); + public WriteWindowedToBigQuery( + String projectId, String datasetId, String tableName, Map> fieldInfo) { + super(projectId, datasetId, tableName, fieldInfo); } /** Convert each key/score pair into a BigQuery TableRow. */ @@ -58,14 +58,14 @@ public void processElement(ProcessContext c, BoundedWindow window) { @Override public PDone expand(PCollection teamAndScore) { - return teamAndScore + teamAndScore .apply("ConvertToRow", ParDo.of(new BuildRowFn())) - .apply(BigQueryIO.Write - .to(getTable(teamAndScore.getPipeline(), - tableName)) + .apply(BigQueryIO.writeTableRows() + .to(getTable(projectId, datasetId, tableName)) .withSchema(getSchema()) .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) .withWriteDisposition(WriteDisposition.WRITE_APPEND)); + return PDone.in(teamAndScore.getPipeline()); } } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java index bfca71bcf9..af347c1c0a 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java @@ -26,8 +26,8 @@ import java.nio.file.StandardOpenOption; import java.util.Arrays; import java.util.List; +import org.apache.beam.sdk.extensions.gcp.options.GcsOptions; import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.GcsOptions; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Count; import org.apache.beam.sdk.transforms.Filter; @@ -62,15 +62,16 @@ public class MinimalWordCountJava8Test implements Serializable { public void 
testMinimalWordCountJava8() throws Exception { p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil()); - p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*")) - .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+"))) - .withOutputType(TypeDescriptors.strings())) + p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) + .apply(FlatMapElements + .into(TypeDescriptors.strings()) + .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))) .apply(Filter.by((String word) -> !word.isEmpty())) .apply(Count.perElement()) .apply(MapElements - .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()) - .withOutputType(TypeDescriptors.strings())) - .apply(TextIO.Write.to("gs://your-output-bucket/and-output-prefix")); + .into(TypeDescriptors.strings()) + .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())) + .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix")); } private GcsUtil buildMockGcsUtil() throws IOException { diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java index e9621032e5..b4e4124e26 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java @@ -24,8 +24,8 @@ import ${package}.WordCount.FormatAsTextFn; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.RunnableOnService; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.DoFnTester; @@ -73,7 +73,7 
@@ public void testExtractWordsFn() throws Exception { /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testCountWords() throws Exception { PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java index 3823bea673..5cbdc6244f 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java @@ -23,8 +23,8 @@ import ${package}.complete.game.GameStats.CalculateSpammyUsers; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.RunnableOnService; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; @@ -38,7 +38,7 @@ * Tests of GameStats. * Because the pipeline was designed for easy readability and explanations, it lacks good * modularity for testing. See our testing documentation for better ideas: - * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline. + * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ */ @RunWith(JUnit4.class) public class GameStatsTest implements Serializable { @@ -63,7 +63,7 @@ public class GameStatsTest implements Serializable { /** Test the calculation of 'spammy users'. 
*/ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testCalculateSpammyUsers() throws Exception { PCollection> input = p.apply(Create.of(USER_SCORES)); PCollection> output = input.apply(new CalculateSpammyUsers()); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java index c777ffb2fb..17d459df93 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java @@ -25,8 +25,8 @@ import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.RunnableOnService; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.Filter; import org.apache.beam.sdk.transforms.MapElements; @@ -45,7 +45,7 @@ * Tests of HourlyTeamScore. * Because the pipeline was designed for easy readability and explanations, it lacks good * modularity for testing. See our testing documentation for better ideas: - * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline. + * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ */ @RunWith(JUnit4.class) public class HourlyTeamScoreTest implements Serializable { @@ -86,7 +86,7 @@ public class HourlyTeamScoreTest implements Serializable { /** Test the filtering. 
*/ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testUserScoresFilter() throws Exception { final Instant startMinTimestamp = new Instant(1447965680000L); @@ -101,9 +101,8 @@ public void testUserScoresFilter() throws Exception { -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) // run a map to access the fields in the result. .apply(MapElements - .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())) - .withOutputType( - TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))); + .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) + .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))); PAssert.that(output).containsInAnyOrder(FILTERED_EVENTS); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java index 25987ee2cc..83b8821480 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java @@ -25,8 +25,8 @@ import ${package}.complete.game.UserScore.ParseEventFn; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.RunnableOnService; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFnTester; import org.apache.beam.sdk.transforms.MapElements; @@ -99,7 +99,7 @@ public void testParseEventFn() throws Exception { /** Tests ExtractAndSumScore("user"). 
*/ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testUserScoreSums() throws Exception { PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); @@ -117,7 +117,7 @@ public void testUserScoreSums() throws Exception { /** Tests ExtractAndSumScore("team"). */ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testTeamScoreSums() throws Exception { PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); @@ -135,7 +135,7 @@ public void testTeamScoreSums() throws Exception { /** Test that bad input data is dropped appropriately. */ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testUserScoresBadInput() throws Exception { PCollection input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of())); @@ -143,9 +143,9 @@ public void testUserScoresBadInput() throws Exception { PCollection> extract = input .apply(ParDo.of(new ParseEventFn())) .apply( - MapElements.via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())) - .withOutputType( - TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))); + MapElements + .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) + .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))); PAssert.that(extract).empty(); diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 8cf6cd34ca..8c16f5e547 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -74,6 +74,7 @@ + diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 55eac300f1..83165d572c 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ 
b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -27,6 +27,7 @@ UTF-8 + 2.20 @@ -58,7 +59,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.19.1 + ${surefire-plugin.version} all 4 @@ -68,11 +69,18 @@ org.apache.maven.surefire surefire-junit47 - 2.19.1 + ${surefire-plugin.version} + + + org.apache.maven.plugins + maven-jar-plugin + + org.hamcrest diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java index dd9b91decc..07870f2ed0 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java @@ -22,14 +22,14 @@ import java.util.regex.Pattern; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.transforms.Aggregator; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.Sum; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.slf4j.Logger; @@ -51,7 +51,7 @@ *

New Concepts: *

  *   1. Logging using SLF4J, even in a distributed environment
- *   2. Creating a custom aggregator (runners have varying levels of support)
+ *   2. Creating a custom metric (runners have varying levels of support)
  *   3. Testing your Pipeline via PAssert
  * 
* @@ -90,14 +90,12 @@ public FilterTextFn(String pattern) { } /** - * Concept #2: A custom aggregator can track values in your pipeline as it runs. Each - * runner provides varying levels of support for aggregators, and may expose them + * Concept #2: A custom metric can track values in your pipeline as it runs. Each + * runner provides varying levels of support for metrics, and may expose them * in a dashboard, etc. */ - private final Aggregator matchedWords = - createAggregator("matchedWords", Sum.ofLongs()); - private final Aggregator unmatchedWords = - createAggregator("unmatchedWords", Sum.ofLongs()); + private final Counter matchedWords = Metrics.counter(FilterTextFn.class, "matchedWords"); + private final Counter unmatchedWords = Metrics.counter(FilterTextFn.class, "unmatchedWords"); @ProcessElement public void processElement(ProcessContext c) { @@ -105,14 +103,14 @@ public void processElement(ProcessContext c) { // Log at the "DEBUG" level each element that we match. When executing this pipeline // these log lines will appear only if the log level is set to "DEBUG" or lower. LOG.debug("Matched: " + c.element().getKey()); - matchedWords.addValue(1L); + matchedWords.inc(); c.output(c.element()); } else { // Log at the "TRACE" level each element that is not matched. Different log levels // can be used to control the verbosity of logging providing an effective mechanism // to filter less important information. LOG.trace("Did not match: " + c.element().getKey()); - unmatchedWords.addValue(1L); + unmatchedWords.inc(); } } } @@ -138,7 +136,7 @@ public static void main(String[] args) { Pipeline p = Pipeline.create(options); PCollection> filteredWords = - p.apply("ReadLines", TextIO.Read.from(options.getInputFile())) + p.apply("ReadLines", TextIO.read().from(options.getInputFile())) .apply(new WordCount.CountWords()) .apply(ParDo.of(new FilterTextFn(options.getFilterPattern()))); @@ -151,7 +149,7 @@ public static void main(String[] args) { *

Below we verify that the set of filtered words matches our expected counts. Note * that PAssert does not provide any output and that successful completion of the * Pipeline implies that the expectations were met. Learn more at - * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test + * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test. */ List> expectedResults = Arrays.asList( diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java index 97bd8243b8..d6b08066db 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java @@ -17,6 +17,7 @@ */ package ${package}; +import ${package}.common.ExampleUtils; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.options.PipelineOptions; @@ -74,7 +75,7 @@ public static void main(String[] args) { // the input text (a set of Shakespeare's texts). // This example reads a public data set consisting of the complete works of Shakespeare. - p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*")) + p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a // DoFn (defined in-line) on each element that tokenizes the text line into individual words. 
@@ -83,7 +84,7 @@ public static void main(String[] args) { .apply("ExtractWords", ParDo.of(new DoFn() { @ProcessElement public void processElement(ProcessContext c) { - for (String word : c.element().split("[^a-zA-Z']+")) { + for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) { if (!word.isEmpty()) { c.output(word); } @@ -110,7 +111,7 @@ public String apply(KV input) { // formatted strings) to a series of text files. // // By default, it will write to a set of files with names like wordcount-00001-of-00005 - .apply(TextIO.Write.to("wordcounts")); + .apply(TextIO.write().to("wordcounts")); // Run the pipeline. p.run().waitUntilFinish(); diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java index 052d7b6a0e..6a1d07c485 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java @@ -21,7 +21,7 @@ import java.util.concurrent.ThreadLocalRandom; import ${package}.common.ExampleBigQueryTableOptions; import ${package}.common.ExampleOptions; -import ${package}.common.WriteWindowedFilesDoFn; +import ${package}.common.WriteOneFilePerWindow; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.io.TextIO; @@ -31,11 +31,9 @@ import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.GroupByKey; +import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import 
org.apache.beam.sdk.transforms.windowing.IntervalWindow; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; @@ -53,7 +51,7 @@ * *

Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples: * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally - * and using a selected runner; defining DoFns; creating a custom aggregator; + * and using a selected runner; defining DoFns; * user-defined PTransforms; defining PipelineOptions. * *

New Concepts: @@ -163,12 +161,15 @@ public interface Options extends WordCount.WordCountOptions, @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class) Long getMaxTimestampMillis(); void setMaxTimestampMillis(Long value); + + @Description("Fixed number of shards to produce per window, or null for runner-chosen sharding") + Integer getNumShards(); + void setNumShards(Integer numShards); } public static void main(String[] args) throws IOException { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); final String output = options.getOutput(); - final Duration windowSize = Duration.standardMinutes(options.getWindowSize()); final Instant minTimestamp = new Instant(options.getMinTimestampMillis()); final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis()); @@ -180,7 +181,7 @@ public static void main(String[] args) throws IOException { */ PCollection input = pipeline /** Read from the GCS file. */ - .apply(TextIO.Read.from(options.getInputFile())) + .apply(TextIO.read().from(options.getInputFile())) // Concept #2: Add an element timestamp, using an artificial time just to show windowing. // See AddTimestampFn for more detail on this. .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp))); @@ -203,33 +204,13 @@ public static void main(String[] args) throws IOException { PCollection> wordCounts = windowedWords.apply(new WordCount.CountWords()); /** - * Concept #5: Customize the output format using windowing information - * - *

At this point, the data is organized by window. We're writing text files and and have no - * late data, so for simplicity we can use the window as the key and {@link GroupByKey} to get - * one output file per window. (if we had late data this key would not be unique) - * - *

To access the window in a {@link DoFn}, add a {@link BoundedWindow} parameter. This will - * be automatically detected and populated with the window for the current element. - */ - PCollection>> keyedByWindow = - wordCounts.apply( - ParDo.of( - new DoFn, KV>>() { - @ProcessElement - public void processElement(ProcessContext context, IntervalWindow window) { - context.output(KV.of(window, context.element())); - } - })); - - /** - * Concept #6: Format the results and write to a sharded file partitioned by window, using a + * Concept #5: Format the results and write to a sharded file partitioned by window, using a * simple ParDo operation. Because there may be failures followed by retries, the * writes must be idempotent, but the details of writing to files is elided here. */ - keyedByWindow - .apply(GroupByKey.>create()) - .apply(ParDo.of(new WriteWindowedFilesDoFn(output))); + wordCounts + .apply(MapElements.via(new WordCount.FormatAsTextFn())) + .apply(new WriteOneFilePerWindow(output, options.getNumShards())); PipelineResult result = pipeline.run(); try { diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java index b3ef26c493..79b71403b9 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -17,21 +17,22 @@ */ package ${package}; +import ${package}.common.ExampleUtils; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import 
org.apache.beam.sdk.options.Validation.Required; -import org.apache.beam.sdk.transforms.Aggregator; import org.apache.beam.sdk.transforms.Count; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.transforms.Sum; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; @@ -44,8 +45,8 @@ * pipeline, for introduction of additional concepts. * *

For a detailed walkthrough of this example, see - * - * http://beam.apache.org/use/walkthroughs/ + * + * https://beam.apache.org/get-started/wordcount-example/ * * *

Basic concepts, also in the MinimalWordCount example: @@ -86,17 +87,16 @@ public class WordCount { * to a ParDo in the pipeline. */ static class ExtractWordsFn extends DoFn { - private final Aggregator emptyLines = - createAggregator("emptyLines", Sum.ofLongs()); + private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); @ProcessElement public void processElement(ProcessContext c) { if (c.element().trim().isEmpty()) { - emptyLines.addValue(1L); + emptyLines.inc(); } // Split the line into words. - String[] words = c.element().split("[^a-zA-Z']+"); + String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN); // Output each word encountered into the output PCollection. for (String word : words) { @@ -176,10 +176,10 @@ public static void main(String[] args) { // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the // static FormatAsTextFn() to the ParDo transform. - p.apply("ReadLines", TextIO.Read.from(options.getInputFile())) + p.apply("ReadLines", TextIO.read().from(options.getInputFile())) .apply(new CountWords()) .apply(MapElements.via(new FormatAsTextFn())) - .apply("WriteCounts", TextIO.Write.to(options.getOutput())); + .apply("WriteCounts", TextIO.write().to(options.getOutput())); p.run().waitUntilFinish(); } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java index 6b51074f44..57f1546e27 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java @@ -18,10 +18,10 @@ package ${package}.common; import com.google.api.services.bigquery.model.TableSchema; +import 
org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.DefaultValueFactory; import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.GcpOptions; import org.apache.beam.sdk.options.PipelineOptions; /** diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java index daeb398f7f..cf142a10fd 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java @@ -17,10 +17,10 @@ */ package ${package}.common; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.DefaultValueFactory; import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.GcpOptions; import org.apache.beam.sdk.options.PipelineOptions; /** diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java index 936bff5675..86784b06da 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java @@ -17,10 +17,10 @@ */ package ${package}.common; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.options.Default; import 
org.apache.beam.sdk.options.DefaultValueFactory; import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.GcpOptions; import org.apache.beam.sdk.options.PipelineOptions; /** diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java index 570b3827b7..78f3849b40 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java @@ -19,9 +19,7 @@ import com.google.api.client.googleapis.json.GoogleJsonResponseException; import com.google.api.client.googleapis.services.AbstractGoogleClientRequest; -import com.google.api.client.util.BackOff; -import com.google.api.client.util.BackOffUtils; -import com.google.api.client.util.Sleeper; +import com.google.api.client.http.HttpRequestInitializer; import com.google.api.services.bigquery.Bigquery; import com.google.api.services.bigquery.Bigquery.Datasets; import com.google.api.services.bigquery.Bigquery.Tables; @@ -33,6 +31,10 @@ import com.google.api.services.pubsub.Pubsub; import com.google.api.services.pubsub.model.Subscription; import com.google.api.services.pubsub.model.Topic; +import com.google.auth.Credentials; +import com.google.auth.http.HttpCredentialsAdapter; +import com.google.cloud.hadoop.util.ChainingHttpRequestInitializer; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.util.concurrent.Uninterruptibles; @@ -42,10 +44,15 @@ import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.options.BigQueryOptions; +import org.apache.beam.sdk.extensions.gcp.auth.NullCredentialInitializer; +import 
org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubOptions; import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PubsubOptions; +import org.apache.beam.sdk.util.BackOff; +import org.apache.beam.sdk.util.BackOffUtils; import org.apache.beam.sdk.util.FluentBackoff; +import org.apache.beam.sdk.util.RetryHttpRequestInitializer; +import org.apache.beam.sdk.util.Sleeper; import org.apache.beam.sdk.util.Transport; import org.joda.time.Duration; @@ -59,6 +66,14 @@ public class ExampleUtils { private static final int SC_NOT_FOUND = 404; + /** + * \p{L} denotes the category of Unicode letters, + * so this pattern will match on everything that is not a letter. + * + *

It is used for tokenizing strings in the wordcount examples. + */ + public static final String TOKENIZER_PATTERN = "[^\\p{L}]+"; + private final PipelineOptions options; private Bigquery bigQueryClient = null; private Pubsub pubsubClient = null; @@ -196,10 +211,49 @@ private void tearDown() { } } + /** + * Returns a BigQuery client builder using the specified {@link BigQueryOptions}. + */ + private static Bigquery.Builder newBigQueryClient(BigQueryOptions options) { + return new Bigquery.Builder(Transport.getTransport(), Transport.getJsonFactory(), + chainHttpRequestInitializer( + options.getGcpCredential(), + // Do not log 404. It clutters the output and is possibly even required by the caller. + new RetryHttpRequestInitializer(ImmutableList.of(404)))) + .setApplicationName(options.getAppName()) + .setGoogleClientRequestInitializer(options.getGoogleApiTrace()); + } + + /** + * Returns a Pubsub client builder using the specified {@link PubsubOptions}. + */ + private static Pubsub.Builder newPubsubClient(PubsubOptions options) { + return new Pubsub.Builder(Transport.getTransport(), Transport.getJsonFactory(), + chainHttpRequestInitializer( + options.getGcpCredential(), + // Do not log 404. It clutters the output and is possibly even required by the caller. 
+ new RetryHttpRequestInitializer(ImmutableList.of(404)))) + .setRootUrl(options.getPubsubRootUrl()) + .setApplicationName(options.getAppName()) + .setGoogleClientRequestInitializer(options.getGoogleApiTrace()); + } + + private static HttpRequestInitializer chainHttpRequestInitializer( + Credentials credential, HttpRequestInitializer httpRequestInitializer) { + if (credential == null) { + return new ChainingHttpRequestInitializer( + new NullCredentialInitializer(), httpRequestInitializer); + } else { + return new ChainingHttpRequestInitializer( + new HttpCredentialsAdapter(credential), + httpRequestInitializer); + } + } + private void setupBigQueryTable(String projectId, String datasetId, String tableId, TableSchema schema) throws IOException { if (bigQueryClient == null) { - bigQueryClient = Transport.newBigQueryClient(options.as(BigQueryOptions.class)).build(); + bigQueryClient = newBigQueryClient(options.as(BigQueryOptions.class)).build(); } Datasets datasetService = bigQueryClient.datasets(); @@ -224,7 +278,7 @@ private void setupBigQueryTable(String projectId, String datasetId, String table private void setupPubsubTopic(String topic) throws IOException { if (pubsubClient == null) { - pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); } if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null) { pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute(); @@ -233,7 +287,7 @@ private void setupPubsubTopic(String topic) throws IOException { private void setupPubsubSubscription(String topic, String subscription) throws IOException { if (pubsubClient == null) { - pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); } if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) == null) { 
Subscription subInfo = new Subscription() @@ -250,7 +304,7 @@ private void setupPubsubSubscription(String topic, String subscription) throws I */ private void deletePubsubTopic(String topic) throws IOException { if (pubsubClient == null) { - pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); } if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null) { pubsubClient.projects().topics().delete(topic).execute(); @@ -264,7 +318,7 @@ private void deletePubsubTopic(String topic) throws IOException { */ private void deletePubsubSubscription(String subscription) throws IOException { if (pubsubClient == null) { - pubsubClient = Transport.newPubsubClient(options.as(PubsubOptions.class)).build(); + pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); } if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) != null) { pubsubClient.projects().subscriptions().delete(subscription).execute(); diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java new file mode 100644 index 0000000000..fc314b9d7b --- /dev/null +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ${package}.common; + +import static com.google.common.base.Verify.verifyNotNull; + +import javax.annotation.Nullable; +import org.apache.beam.sdk.io.FileBasedSink; +import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.ISODateTimeFormat; + +/** + * A {@link DoFn} that writes elements to files with names deterministically derived from the lower + * and upper bounds of their key (an {@link IntervalWindow}). + * + *

This is test utility code, not for end-users, so examples can be focused on their primary + * lessons. + */ +public class WriteOneFilePerWindow extends PTransform, PDone> { + private static final DateTimeFormatter FORMATTER = ISODateTimeFormat.hourMinute(); + private String filenamePrefix; + @Nullable + private Integer numShards; + + public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) { + this.filenamePrefix = filenamePrefix; + this.numShards = numShards; + } + + @Override + public PDone expand(PCollection input) { + // filenamePrefix may contain a directory and a filename component. Pull out only the filename + // component from that path for the PerWindowFiles. + String prefix = ""; + ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); + if (!resource.isDirectory()) { + prefix = verifyNotNull( + resource.getFilename(), + "A non-directory resource should have a non-null filename: %s", + resource); + } + + + TextIO.Write write = TextIO.write() + .to(resource.getCurrentDirectory()) + .withFilenamePolicy(new PerWindowFiles(prefix)) + .withWindowedWrites(); + if (numShards != null) { + write = write.withNumShards(numShards); + } + return input.apply(write); + } + + /** + * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data + * being written. This always includes the shard number and the total number of shards. For + * windowed writes, it also includes the window and pane index (a sequence number assigned to each + * trigger firing). 
+ */ + public static class PerWindowFiles extends FilenamePolicy { + + private final String prefix; + + public PerWindowFiles(String prefix) { + this.prefix = prefix; + } + + public String filenamePrefixForWindow(IntervalWindow window) { + return String.format("%s-%s-%s", + prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end())); + } + + @Override + public ResourceId windowedFilename( + ResourceId outputDirectory, WindowedContext context, String extension) { + IntervalWindow window = (IntervalWindow) context.getWindow(); + String filename = String.format( + "%s-%s-of-%s%s", + filenamePrefixForWindow(window), context.getShardNumber(), context.getNumShards(), + extension); + return outputDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE); + } + + @Override + public ResourceId unwindowedFilename( + ResourceId outputDirectory, Context context, String extension) { + throw new UnsupportedOperationException("Unsupported."); + } + } +} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java deleted file mode 100644 index a08e6a9b0f..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteWindowedFilesDoFn.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import com.google.common.annotations.VisibleForTesting; -import java.io.OutputStream; -import java.nio.channels.Channels; -import java.nio.charset.StandardCharsets; -import org.apache.beam.sdk.coders.Coder; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.util.IOChannelFactory; -import org.apache.beam.sdk.util.IOChannelUtils; -import org.apache.beam.sdk.values.KV; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; - -/** - * A {@link DoFn} that writes elements to files with names deterministically derived from the lower - * and upper bounds of their key (an {@link IntervalWindow}). - * - *

This is test utility code, not for end-users, so examples can be focused - * on their primary lessons. - */ -public class WriteWindowedFilesDoFn - extends DoFn>>, Void> { - - static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8); - static final Coder STRING_CODER = StringUtf8Coder.of(); - - private static DateTimeFormatter formatter = ISODateTimeFormat.hourMinute(); - - private final String output; - - public WriteWindowedFilesDoFn(String output) { - this.output = output; - } - - @VisibleForTesting - public static String fileForWindow(String output, IntervalWindow window) { - return String.format( - "%s-%s-%s", output, formatter.print(window.start()), formatter.print(window.end())); - } - - @ProcessElement - public void processElement(ProcessContext context) throws Exception { - // Build a file name from the window - IntervalWindow window = context.element().getKey(); - String outputShard = fileForWindow(output, window); - - // Open the file and write all the values - IOChannelFactory factory = IOChannelUtils.getFactory(outputShard); - OutputStream out = Channels.newOutputStream(factory.create(outputShard, "text/plain")); - for (KV wordCount : context.element().getValue()) { - STRING_CODER.encode( - wordCount.getKey() + ": " + wordCount.getValue(), out, Coder.Context.OUTER); - out.write(NEWLINE); - } - out.close(); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java index e9621032e5..b4e4124e26 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java @@ -24,8 +24,8 @@ import ${package}.WordCount.FormatAsTextFn; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.testing.PAssert; -import 
org.apache.beam.sdk.testing.RunnableOnService; import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.ValidatesRunner; import org.apache.beam.sdk.transforms.Create; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.DoFnTester; @@ -73,7 +73,7 @@ public void testExtractWordsFn() throws Exception { /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ @Test - @Category(RunnableOnService.class) + @Category(ValidatesRunner.class) public void testCountWords() throws Exception { PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 837c2afad2..98c39c2f53 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -37,22 +37,8 @@ - - - - org.apache.maven.plugins - maven-resources-plugin - - - @ - - false - - - - - + src/main/resources true @@ -60,6 +46,7 @@ archetype-resources/pom.xml + src/main/resources false @@ -68,5 +55,39 @@ + + + + + org.apache.maven.plugins + maven-resources-plugin + + + @ + + false + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + default-jar + none + + + default-test-jar + none + + + + diff --git a/pom.xml b/pom.xml index e6efd5c2d7..3db8b7cff0 100644 --- a/pom.xml +++ b/pom.xml @@ -95,7 +95,7 @@ - 3.0.3 + 3.2 @@ -103,7 +103,7 @@ ${maven.build.timestamp} yyyy-MM-dd HH:mm - 0.6.0 + 2.0.0 Google Cloud Dataflow SDK for Java ${project.version}-20170317 @@ -164,7 +164,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.6.0 + 3.6.1 1.7 1.7 @@ -254,7 +254,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.3 + 2.10.4 false @@ -294,7 +294,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.19.1 + 2.20 From dfe74966052f98bafc7425e65b8e4aa4f10b6de2 Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Mon, 22 May 2017 22:31:52 -0700 Subject: [PATCH 41/77] [maven-release-plugin] prepare branch release-2.0.0 --- pom.xml 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3db8b7cff0..5957289ac4 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.0.0 From d40555732e08c272b30103b603d9518bd85eb69f Mon Sep 17 00:00:00 2001 From: Jason Kuster Date: Mon, 22 May 2017 22:32:01 -0700 Subject: [PATCH 42/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples-java8/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index f5ae58ad8a..aa7e2c7fe2 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples-java8/pom.xml b/maven-archetypes/examples-java8/pom.xml index b9ae4738d9..c88a2c90b2 100644 --- a/maven-archetypes/examples-java8/pom.xml +++ b/maven-archetypes/examples-java8/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 8c16f5e547..5a389df83b 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 98c39c2f53..083654bf2e 100644 --- a/maven-archetypes/pom.xml +++ 
b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 5832a55772..e8cc2e3bac 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 5957289ac4..db89ee4386 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.0.0 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index 14b8061c1b..8513702e47 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.0.0-beta4-SNAPSHOT + 2.1.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From f671e6aaa372fa724bd326156677a7a85af62187 Mon Sep 17 00:00:00 2001 From: jasonkuster Date: Tue, 23 May 2017 10:31:16 -0700 Subject: [PATCH 43/77] Update versioning after cutting 2.0.0 release branch. (#574) * Update archetype versioning after cutting 2.0.0 release branch. Signed-off-by: Jason Kuster * Update worker container image for 2.1.0-SNAPSHOT. Signed-off-by: Jason Kuster * Pin all versions to project version. 
Signed-off-by: Jason Kuster --- .../src/main/resources/archetype-resources/pom.xml | 2 +- .../src/main/resources/archetype-resources/pom.xml | 2 +- .../src/main/resources/archetype-resources/pom.xml | 2 +- pom.xml | 9 +-------- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml index 275e2581a2..baa1508b2b 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -136,7 +136,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - @archetype.sdk_version_dependency@ + @project.version@ diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 83165d572c..fbf5994c53 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -136,7 +136,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - @archetype.sdk_version_dependency@ + @project.version@ diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index e7f2e59c1f..a92f922f24 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -72,7 +72,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-all - @archetype.sdk_version_dependency@ + @project.version@ diff --git a/pom.xml b/pom.xml index db89ee4386..db4451e1e6 100644 --- a/pom.xml +++ b/pom.xml @@ -106,15 +106,8 @@ 2.0.0 Google Cloud Dataflow SDK for Java - ${project.version}-20170317 + ${project.version}-20170517 6 - - - 
2.0.0-beta4-SNAPSHOT pom From e08f5c0a18422afba35ba34ec7d27f0a49ea0313 Mon Sep 17 00:00:00 2001 From: Dan Halperin Date: Tue, 23 May 2017 11:46:07 -0700 Subject: [PATCH 44/77] dataflow.properties: update for new configuration in Apache Beam 2.0.0 See https://github.com/apache/beam/commit/b6508977e56e18903f2224e3a2a4ad9816ea1c89\#diff-2dcc904003e8cbcbf02e330b8be4c7db --- pom.xml | 3 ++- .../org/apache/beam/runners/dataflow/dataflow.properties | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index db4451e1e6..de3af83152 100644 --- a/pom.xml +++ b/pom.xml @@ -107,7 +107,8 @@ Google Cloud Dataflow SDK for Java ${project.version}-20170517 - 6 + 6 + 1 pom diff --git a/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties b/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties index d499ad6699..699990047c 100644 --- a/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties +++ b/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow.properties @@ -12,7 +12,6 @@ # License for the specific language governing permissions and limitations under # the License. 
-environment.major.version=${dataflow.environment_major_version} - -worker.image.batch=dataflow.gcr.io/v1beta3/beam-java-batch:${dataflow.container_version} -worker.image.streaming=dataflow.gcr.io/v1beta3/beam-java-streaming:${dataflow.container_version} +legacy.environment.major.version=${dataflow.legacy_environment_major_version} +fnapi.environment.major.version=${dataflow.fnapi_environment_major_version} +container.version=${dataflow.container_version} From 7cdb503880682c2c4765d58f2ce791ad5134d306 Mon Sep 17 00:00:00 2001 From: Davor Bonaci Date: Tue, 30 May 2017 16:12:15 -0700 Subject: [PATCH 45/77] Update README.md for Dataflow SDK 2.0.0 release (#578) Update README.md for Dataflow SDK 2.0.0 release --- README.md | 80 +++++++++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index a644049218..112df59d01 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,11 @@ underlying source code is hosted in the [Apache Beam repository](https://github.com/apache/beam). [General usage](https://cloud.google.com/dataflow/getting-started) of Google -Cloud Dataflow does **not** require use of this repository. Instead: +Cloud Dataflow does **not** require use of this repository. Instead, you can do +any one of the following: -1. depend directly on a specific -[version](https://cloud.google.com/dataflow/release-notes/java) of the SDK in +1. Depend directly on a specific +[version](https://cloud.google.com/dataflow/downloads) of the SDK in the [Maven Central Repository](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.google.cloud.dataflow%22) by adding the following dependency to development environments like Eclipse or Apache Maven: @@ -41,69 +42,52 @@ environments like Eclipse or Apache Maven: version_number -1. download the example pipelines from the separate +1. 
Download the example pipelines from the separate [DataflowJavaSDK-examples](https://github.com/GoogleCloudPlatform/DataflowJavaSDK-examples) repository. - +[Cloud Dataflow Plugin for Eclipse](https://cloud.google.com/dataflow/docs/quickstarts/quickstart-java-eclipse) +provides tools to create and execute Dataflow pipelines inside Eclipse. -## Status [![Build Status](https://travis-ci.org/GoogleCloudPlatform/DataflowJavaSDK.svg?branch=v2)](https://travis-ci.org/GoogleCloudPlatform/DataflowJavaSDK) +## Status [![Build Status](https://api.travis-ci.org/GoogleCloudPlatform/DataflowJavaSDK.svg?branch=master)](https://travis-ci.org/GoogleCloudPlatform/DataflowJavaSDK) -This branch is a work-in-progress for the Dataflow SDK for Java, version 2.0.0. -It is currently supported on the Cloud Dataflow service in Beta. +Both the SDK and the Dataflow Service are generally available and considered +stable and fully qualified for production use. - +This [`master`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/) branch +contains code to build Dataflow SDK 2.0.0 and newer, as a distribution of Apache +Beam. Pre-Beam SDKs, versions 1.x, are maintained in the +[`master-1.x`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/tree/master-1.x) +branch. ## Overview The key concepts in this programming model are: -* [`PCollection`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk/src/main/java/com/google/cloud/dataflow/sdk/values/PCollection.java): -represents a collection of data, which could be bounded or unbounded in size. -* [`PTransform`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk/src/main/java/com/google/cloud/dataflow/sdk/transforms/PTransform.java): -represents a computation that transforms input PCollections into output -PCollections. 
-* [`Pipeline`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk/src/main/java/com/google/cloud/dataflow/sdk/Pipeline.java): -manages a directed acyclic graph of PTransforms and PCollections that is ready -for execution. -* [`PipelineRunner`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/PipelineRunner.java): -specifies where and how the pipeline should execute. +* `PCollection`: represents a collection of data, which could be bounded or +unbounded in size. +* `PTransform`: represents a computation that transforms input PCollections +into output PCollections. +* `Pipeline`: manages a directed acyclic graph of PTransforms and PCollections +that is ready for execution. +* `PipelineRunner`: specifies where and how the pipeline should execute. We provide two runners: - 1. The [`DirectRunner`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DirectPipelineRunner.java) -runs the pipeline on your local machine. - 1. The [`DataflowRunner`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/DataflowPipelineRunner.java) -submits the pipeline to the Dataflow Service, where it runs using managed -resources in the [Google Cloud Platform](https://cloud.google.com) (GCP). + 1. The `DirectRunner` runs the pipeline on your local machine. + 1. The `DataflowRunner` submits the pipeline to the Cloud Dataflow Service, +where it runs using managed resources in the +[Google Cloud Platform](https://cloud.google.com). The SDK is built to be extensible and support additional execution environments beyond local execution and the Google Cloud Dataflow Service. Apache Beam -contains additional SDKs, runners, IO connectors, etc. +contains additional SDKs, runners, and IO connectors. 
## Getting Started -This repository consists of the following parts: - -* The [`sdk`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/sdk) -module provides a set of basic Java APIs to program against. -* The [`examples`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples) -module provides a few samples to get started. We recommend starting with the -`WordCount` example. - -The following command will build both the `sdk` and `example` modules and -install them in your local Maven repository: - - mvn clean install - -After building and installing, you can execute the `WordCount` and other -example pipelines by following the instructions in this -[README](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/blob/master/examples/README.md). +Please try our [Quickstarts](https://cloud.google.com/dataflow/docs/quickstarts). ## Contact Us @@ -117,5 +101,7 @@ on GitHub to report any bugs, comments or questions regarding SDK development. * [Google Cloud Dataflow](https://cloud.google.com/dataflow/) * [Apache Beam](https://beam.apache.org/) -* [Dataflow Concepts and Programming Model](https://cloud.google.com/dataflow/model/programming-model) -* [Java API Reference](https://cloud.google.com/dataflow/java-sdk/JavaDoc/index) +* [Dataflow Concepts and Programming Model](https://beam.apache.org/documentation/programming-guide/) +* [Java API Reference](https://beam.apache.org/documentation/sdks/javadoc/) + +_Apache, Apache Beam and the orange letter B logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and/or other countries._ From 3b9f029d3ad209fd8b29ce1c3786e214b232d590 Mon Sep 17 00:00:00 2001 From: Bill Neubauer Date: Wed, 23 Aug 2017 12:48:27 -0700 Subject: [PATCH 46/77] Update to Apache Beam, v2.1.0. 
--- .../src/main/resources/archetype-resources/pom.xml | 5 +++-- .../src/test/java/complete/game/LeaderBoardTest.java | 2 ++ .../examples/src/main/resources/archetype-resources/pom.xml | 5 +++-- .../starter/src/main/resources/archetype-resources/pom.xml | 4 ++-- .../src/test/resources/projects/basic/reference/pom.xml | 4 ++-- pom.xml | 2 +- 6 files changed, 13 insertions(+), 9 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml index baa1508b2b..f33914d476 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -49,7 +49,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.5.1 + 3.6.1 1.8 1.8 @@ -79,6 +79,7 @@ org.apache.maven.plugins maven-jar-plugin + 3.0.2 + -Xlint:-processing true @@ -181,18 +185,29 @@ checkstyle 6.19 + + org.apache.beam + beam-sdks-java-build-tools + ${beam.version} + - sdk/checkstyle.xml + beam/checkstyle.xml sdk/suppressions.xml true true - true + false true + - verify + test-compile check @@ -425,6 +440,13 @@ beam-examples-java8 ${beam.version} + + + junit + junit + ${junit.version} + test + diff --git a/sdk/checkstyle.xml b/sdk/checkstyle.xml deleted file mode 100644 index 1769c8bed9..0000000000 --- a/sdk/checkstyle.xml +++ /dev/null @@ -1,458 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/sdk/pom.xml b/sdk/pom.xml index 1e86beaf72..33f2255f82 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -60,5 +60,11 @@ org.apache.beam beam-runners-google-cloud-dataflow-java + + + junit + junit + test + diff --git a/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java b/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java index 44aa72285a..5088a00cfc 100644 --- a/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java +++ b/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java @@ -1,3 +1,18 @@ +/* + * Copyright (C) 2017 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. 
+ */ package org.apache.beam.runners.dataflow; import static org.junit.Assert.assertEquals; @@ -26,8 +41,8 @@ public class DataflowRunnerInfoOverrideTest { @Test public void testDataflowDistributionOverride() throws Exception { - try (InputStream in - = DataflowRunnerInfo.class.getResourceAsStream(DATAFLOW_DISTRIBUTION_PROPERTIES_PATH)) { + try (InputStream in = + DataflowRunnerInfo.class.getResourceAsStream(DATAFLOW_DISTRIBUTION_PROPERTIES_PATH)) { Properties properties = new Properties(); properties.load(in); diff --git a/sdk/suppressions.xml b/sdk/suppressions.xml index c3635c9bb8..4d707ab291 100644 --- a/sdk/suppressions.xml +++ b/sdk/suppressions.xml @@ -19,9 +19,8 @@ "http://www.puppycrawl.com/dtds/suppressions_1_1.dtd"> - - - + + From 053003a277d34b5afce70f281e8673c8dd2fdd1f Mon Sep 17 00:00:00 2001 From: Luke Cwik Date: Tue, 26 Sep 2017 17:31:48 -0700 Subject: [PATCH 51/77] Update Dataflow archetypes to be compatible with Apache Beam 2.2.0 changes. --- .../java/common/WriteOneFilePerWindow.java | 85 +++++++++-------- .../java/complete/game/utils/WriteToText.java | 95 +++++++++---------- .../java/common/WriteOneFilePerWindow.java | 59 ++++++------ 3 files changed, 122 insertions(+), 117 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java index fc314b9d7b..59b6ce3015 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java @@ -17,21 +17,24 @@ */ package ${package}.common; -import static com.google.common.base.Verify.verifyNotNull; + import static com.google.common.base.MoreObjects.firstNonNull; -import javax.annotation.Nullable; -import 
org.apache.beam.sdk.io.FileBasedSink; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; + import javax.annotation.Nullable; + import org.apache.beam.sdk.io.FileBasedSink; + import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; + import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; + import org.apache.beam.sdk.io.TextIO; + import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; + import org.apache.beam.sdk.io.fs.ResourceId; + import org.apache.beam.sdk.transforms.DoFn; + import org.apache.beam.sdk.transforms.PTransform; + import org.apache.beam.sdk.transforms.windowing.BoundedWindow; + import org.apache.beam.sdk.transforms.windowing.IntervalWindow; + import org.apache.beam.sdk.transforms.windowing.PaneInfo; + import org.apache.beam.sdk.values.PCollection; + import org.apache.beam.sdk.values.PDone; + import org.joda.time.format.DateTimeFormatter; + import org.joda.time.format.ISODateTimeFormat; /** * A {@link DoFn} that writes elements to files with names deterministically derived from the lower @@ -53,22 +56,12 @@ public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) { @Override public PDone expand(PCollection input) { - // filenamePrefix may contain a directory and a filename component. Pull out only the filename - // component from that path for the PerWindowFiles. 
- String prefix = ""; ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); - if (!resource.isDirectory()) { - prefix = verifyNotNull( - resource.getFilename(), - "A non-directory resource should have a non-null filename: %s", - resource); - } - - - TextIO.Write write = TextIO.write() - .to(resource.getCurrentDirectory()) - .withFilenamePolicy(new PerWindowFiles(prefix)) - .withWindowedWrites(); + TextIO.Write write = + TextIO.write() + .to(new PerWindowFiles(resource)) + .withTempDirectory(resource.getCurrentDirectory()) + .withWindowedWrites(); if (numShards != null) { write = write.withNumShards(numShards); } @@ -83,31 +76,41 @@ public PDone expand(PCollection input) { */ public static class PerWindowFiles extends FilenamePolicy { - private final String prefix; + private final ResourceId baseFilename; - public PerWindowFiles(String prefix) { - this.prefix = prefix; + public PerWindowFiles(ResourceId baseFilename) { + this.baseFilename = baseFilename; } public String filenamePrefixForWindow(IntervalWindow window) { + String prefix = + baseFilename.isDirectory() ? 
"" : firstNonNull(baseFilename.getFilename(), ""); return String.format("%s-%s-%s", prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end())); } @Override - public ResourceId windowedFilename( - ResourceId outputDirectory, WindowedContext context, String extension) { - IntervalWindow window = (IntervalWindow) context.getWindow(); - String filename = String.format( - "%s-%s-of-%s%s", - filenamePrefixForWindow(window), context.getShardNumber(), context.getNumShards(), - extension); - return outputDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE); + public ResourceId windowedFilename(int shardNumber, + int numShards, + BoundedWindow window, + PaneInfo paneInfo, + OutputFileHints outputFileHints) { + IntervalWindow intervalWindow = (IntervalWindow) window; + String filename = + String.format( + "%s-%s-of-%s%s", + filenamePrefixForWindow(intervalWindow), + shardNumber, + numShards, + outputFileHints.getSuggestedFilenameSuffix()); + return baseFilename + .getCurrentDirectory() + .resolve(filename, StandardResolveOptions.RESOLVE_FILE); } @Override public ResourceId unwindowedFilename( - ResourceId outputDirectory, Context context, String extension) { + int shardNumber, int numShards, OutputFileHints outputFileHints) { throw new UnsupportedOperationException("Unsupported."); } } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java index 6d4e1399b3..c5828996ad 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java @@ -17,30 +17,31 @@ */ package ${package}.complete.game.utils; -import static 
com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Verify.verifyNotNull; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; -import java.util.stream.Collectors; -import org.apache.beam.sdk.io.FileBasedSink; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; + import static com.google.common.base.Preconditions.checkArgument; + + import java.io.Serializable; + import java.util.ArrayList; + import java.util.List; + import java.util.Map; + import java.util.TimeZone; + import java.util.stream.Collectors; + import org.apache.beam.sdk.io.FileBasedSink; + import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; + import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; + import org.apache.beam.sdk.io.TextIO; + import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; + import org.apache.beam.sdk.io.fs.ResourceId; + import org.apache.beam.sdk.transforms.DoFn; + import org.apache.beam.sdk.transforms.PTransform; + import org.apache.beam.sdk.transforms.ParDo; + import org.apache.beam.sdk.transforms.windowing.BoundedWindow; + import org.apache.beam.sdk.transforms.windowing.IntervalWindow; + import org.apache.beam.sdk.transforms.windowing.PaneInfo; + import 
org.apache.beam.sdk.values.PCollection; + import org.apache.beam.sdk.values.PDone; + import org.joda.time.DateTimeZone; + import org.joda.time.format.DateTimeFormat; + import org.joda.time.format.DateTimeFormatter; /** * Generate, format, and write rows. Use provided information about the field names and types, as @@ -111,21 +112,12 @@ public PDone expand(PCollection input) { checkArgument( input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder()); - // filenamePrefix may contain a directory and a filename component. Pull out only the filename - // component from that path for the PerWindowFiles. - String prefix = ""; ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); - if (!resource.isDirectory()) { - prefix = verifyNotNull( - resource.getFilename(), - "A non-directory resource should have a non-null filename: %s", - resource); - } return input.apply( TextIO.write() - .to(resource.getCurrentDirectory()) - .withFilenamePolicy(new PerWindowFiles(prefix)) + .to(new PerWindowFiles(resource)) + .withTempDirectory(resource.getCurrentDirectory()) .withWindowedWrites() .withNumShards(3)); } @@ -139,31 +131,38 @@ public PDone expand(PCollection input) { */ protected static class PerWindowFiles extends FilenamePolicy { - private final String prefix; + private final ResourceId prefix; - public PerWindowFiles(String prefix) { + public PerWindowFiles(ResourceId prefix) { this.prefix = prefix; } public String filenamePrefixForWindow(IntervalWindow window) { - return String.format("%s-%s-%s", - prefix, formatter.print(window.start()), formatter.print(window.end())); + String filePrefix = prefix.isDirectory() ? 
"" : prefix.getFilename(); + return String.format( + "%s-%s-%s", filePrefix, formatter.print(window.start()), formatter.print(window.end())); } @Override - public ResourceId windowedFilename( - ResourceId outputDirectory, WindowedContext context, String extension) { - IntervalWindow window = (IntervalWindow) context.getWindow(); - String filename = String.format( - "%s-%s-of-%s%s", - filenamePrefixForWindow(window), context.getShardNumber(), context.getNumShards(), - extension); - return outputDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE); + public ResourceId windowedFilename(int shardNumber, + int numShards, + BoundedWindow window, + PaneInfo paneInfo, + OutputFileHints outputFileHints) { + IntervalWindow intervalWindow = (IntervalWindow) window; + String filename = + String.format( + "%s-%s-of-%s%s", + filenamePrefixForWindow(intervalWindow), + shardNumber, + numShards, + outputFileHints.getSuggestedFilenameSuffix()); + return prefix.getCurrentDirectory().resolve(filename, StandardResolveOptions.RESOLVE_FILE); } @Override public ResourceId unwindowedFilename( - ResourceId outputDirectory, Context context, String extension) { + int shardNumber, int numShards, OutputFileHints outputFileHints) { throw new UnsupportedOperationException("Unsupported."); } } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java index fc314b9d7b..9796d647b5 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java @@ -17,17 +17,20 @@ */ package ${package}.common; -import static com.google.common.base.Verify.verifyNotNull; +import static com.google.common.base.MoreObjects.firstNonNull; import 
javax.annotation.Nullable; import org.apache.beam.sdk.io.FileBasedSink; import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; +import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.PaneInfo; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PDone; import org.joda.time.format.DateTimeFormatter; @@ -53,22 +56,12 @@ public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) { @Override public PDone expand(PCollection input) { - // filenamePrefix may contain a directory and a filename component. Pull out only the filename - // component from that path for the PerWindowFiles. 
- String prefix = ""; ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); - if (!resource.isDirectory()) { - prefix = verifyNotNull( - resource.getFilename(), - "A non-directory resource should have a non-null filename: %s", - resource); - } - - - TextIO.Write write = TextIO.write() - .to(resource.getCurrentDirectory()) - .withFilenamePolicy(new PerWindowFiles(prefix)) - .withWindowedWrites(); + TextIO.Write write = + TextIO.write() + .to(new PerWindowFiles(resource)) + .withTempDirectory(resource.getCurrentDirectory()) + .withWindowedWrites(); if (numShards != null) { write = write.withNumShards(numShards); } @@ -83,31 +76,41 @@ public PDone expand(PCollection input) { */ public static class PerWindowFiles extends FilenamePolicy { - private final String prefix; + private final ResourceId baseFilename; - public PerWindowFiles(String prefix) { - this.prefix = prefix; + public PerWindowFiles(ResourceId baseFilename) { + this.baseFilename = baseFilename; } public String filenamePrefixForWindow(IntervalWindow window) { + String prefix = + baseFilename.isDirectory() ? 
"" : firstNonNull(baseFilename.getFilename(), ""); return String.format("%s-%s-%s", prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end())); } @Override - public ResourceId windowedFilename( - ResourceId outputDirectory, WindowedContext context, String extension) { - IntervalWindow window = (IntervalWindow) context.getWindow(); - String filename = String.format( - "%s-%s-of-%s%s", - filenamePrefixForWindow(window), context.getShardNumber(), context.getNumShards(), - extension); - return outputDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE); + public ResourceId windowedFilename(int shardNumber, + int numShards, + BoundedWindow window, + PaneInfo paneInfo, + OutputFileHints outputFileHints) { + IntervalWindow intervalWindow = (IntervalWindow) window; + String filename = + String.format( + "%s-%s-of-%s%s", + filenamePrefixForWindow(intervalWindow), + shardNumber, + numShards, + outputFileHints.getSuggestedFilenameSuffix()); + return baseFilename + .getCurrentDirectory() + .resolve(filename, StandardResolveOptions.RESOLVE_FILE); } @Override public ResourceId unwindowedFilename( - ResourceId outputDirectory, Context context, String extension) { + int shardNumber, int numShards, OutputFileHints outputFileHints) { throw new UnsupportedOperationException("Unsupported."); } } From 9f5fef097989c64372af4a22dda76c54fc0a2506 Mon Sep 17 00:00:00 2001 From: Luke Cwik Date: Thu, 28 Sep 2017 09:08:20 -0700 Subject: [PATCH 52/77] fixup! Address PR comments. 
--- .../java/common/WriteOneFilePerWindow.java | 34 ++++++------- .../java/complete/game/utils/WriteToText.java | 50 +++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java index 59b6ce3015..c7296162b6 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java @@ -17,24 +17,24 @@ */ package ${package}.common; - import static com.google.common.base.MoreObjects.firstNonNull; +import static com.google.common.base.MoreObjects.firstNonNull; - import javax.annotation.Nullable; - import org.apache.beam.sdk.io.FileBasedSink; - import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; - import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; - import org.apache.beam.sdk.io.TextIO; - import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; - import org.apache.beam.sdk.io.fs.ResourceId; - import org.apache.beam.sdk.transforms.DoFn; - import org.apache.beam.sdk.transforms.PTransform; - import org.apache.beam.sdk.transforms.windowing.BoundedWindow; - import org.apache.beam.sdk.transforms.windowing.IntervalWindow; - import org.apache.beam.sdk.transforms.windowing.PaneInfo; - import org.apache.beam.sdk.values.PCollection; - import org.apache.beam.sdk.values.PDone; - import org.joda.time.format.DateTimeFormatter; - import org.joda.time.format.ISODateTimeFormat; +import javax.annotation.Nullable; +import org.apache.beam.sdk.io.FileBasedSink; +import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; +import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; +import org.apache.beam.sdk.io.TextIO; 
+import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.PaneInfo; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.ISODateTimeFormat; /** * A {@link DoFn} that writes elements to files with names deterministically derived from the lower diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java index c5828996ad..7d8d19f70d 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java @@ -17,31 +17,31 @@ */ package ${package}.complete.game.utils; - import static com.google.common.base.Preconditions.checkArgument; - - import java.io.Serializable; - import java.util.ArrayList; - import java.util.List; - import java.util.Map; - import java.util.TimeZone; - import java.util.stream.Collectors; - import org.apache.beam.sdk.io.FileBasedSink; - import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; - import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; - import org.apache.beam.sdk.io.TextIO; - import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; - import org.apache.beam.sdk.io.fs.ResourceId; - import org.apache.beam.sdk.transforms.DoFn; - import org.apache.beam.sdk.transforms.PTransform; - import 
org.apache.beam.sdk.transforms.ParDo; - import org.apache.beam.sdk.transforms.windowing.BoundedWindow; - import org.apache.beam.sdk.transforms.windowing.IntervalWindow; - import org.apache.beam.sdk.transforms.windowing.PaneInfo; - import org.apache.beam.sdk.values.PCollection; - import org.apache.beam.sdk.values.PDone; - import org.joda.time.DateTimeZone; - import org.joda.time.format.DateTimeFormat; - import org.joda.time.format.DateTimeFormatter; +import static com.google.common.base.Preconditions.checkArgument; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TimeZone; +import java.util.stream.Collectors; +import org.apache.beam.sdk.io.FileBasedSink; +import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; +import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; +import org.apache.beam.sdk.io.TextIO; +import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; +import org.apache.beam.sdk.io.fs.ResourceId; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.PTransform; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.PaneInfo; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PDone; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; /** * Generate, format, and write rows. Use provided information about the field names and types, as From c45ff300ad12b8a625ed81fd5a42a0549456fd4a Mon Sep 17 00:00:00 2001 From: Bill Neubauer Date: Thu, 5 Oct 2017 15:08:59 -0700 Subject: [PATCH 53/77] Refactor versioning information for archetypes. This makes versioning consistent with the Beam POMs. 
The top-level POM contains the version numbers, and versions are plumbed into the archetypes via properties. --- .../resources/archetype-resources/pom.xml | 47 ++++++++++++------- .../resources/archetype-resources/pom.xml | 44 +++++++++++------ .../resources/archetype-resources/pom.xml | 11 +++-- .../projects/basic/reference/pom.xml | 11 +++-- pom.xml | 13 +++++ 5 files changed, 87 insertions(+), 39 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml index f33914d476..12d2783a0f 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml @@ -27,7 +27,22 @@ UTF-8 - 2.20 + + @bigquery.version@ + @google-clients.version@ + @guava.version@ + @hamcrest.version@ + @jackson.version@ + @joda.version@ + @junit.version@ + @maven-compiler-plugin.version@ + @maven-exec-plugin.version@ + @maven-jar-plugin.version@ + @maven-shade-plugin.version@ + @mockito.version@ + @pubsub.version@ + @slf4j.version@ + @surefire-plugin.version@ @@ -49,7 +64,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.6.1 + ${maven-compiler-plugin.version} 1.8 1.8 @@ -79,7 +94,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.0.2 + ${maven-jar-plugin.version} @@ -158,7 +173,7 @@ com.google.apis google-api-services-bigquery - v2-rev295-1.22.0 + ${bigquery.version} @@ -172,7 +187,7 @@ com.google.http-client google-http-client - 1.22.0 + ${google-clients.version} @@ -186,7 +201,7 @@ com.google.apis google-api-services-pubsub - v1-rev10-1.22.0 + ${pubsub.version} @@ -200,26 +215,26 @@ joda-time joda-time - 2.4 + ${joda.version} com.google.guava guava - 20.0 + ${guava.version} org.slf4j slf4j-api - 1.7.14 + ${slf4j.version} org.slf4j slf4j-jdk14 - 1.7.14 + ${slf4j.version} runtime @@ -229,19 +244,19 @@ org.hamcrest hamcrest-all - 1.3 + ${hamcrest.version} 
junit junit - 4.12 + ${junit.version} org.mockito mockito-all - 1.9.5 + ${mockito.version} test diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 28ae0db9fe..2c2b8d3fc6 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -27,7 +27,21 @@ UTF-8 - 2.20 + + @bigquery.version@ + @google-clients.version@ + @guava.version@ + @hamcrest.version@ + @jackson.version@ + @joda.version@ + @junit.version@ + @maven-compiler-plugin.version@ + @maven-exec-plugin.version@ + @maven-jar-plugin.version@ + @maven-shade-plugin.version@ + @pubsub.version@ + @slf4j.version@ + @surefire-plugin.version@ @@ -49,7 +63,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.6.1 + ${maven-compiler-plugin.version} ${targetPlatform} ${targetPlatform} @@ -79,7 +93,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.0.2 + ${maven-jar-plugin.version} @@ -158,7 +172,7 @@ com.google.apis google-api-services-bigquery - v2-rev295-1.22.0 + ${bigquery.version} @@ -172,7 +186,7 @@ com.google.http-client google-http-client - 1.22.0 + ${google-clients.version} @@ -186,7 +200,7 @@ com.google.apis google-api-services-pubsub - v1-rev10-1.22.0 + ${pubsub.version} @@ -200,26 +214,26 @@ joda-time joda-time - 2.4 + ${joda.version} com.google.guava guava - 20.0 + ${guava.version} org.slf4j slf4j-api - 1.7.14 + ${slf4j.version} org.slf4j slf4j-jdk14 - 1.7.14 + ${slf4j.version} runtime @@ -229,13 +243,13 @@ org.hamcrest hamcrest-all - 1.3 + ${hamcrest.version} junit junit - 4.12 + ${junit.version} diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index 75eaaade81..22f717f97a 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ 
b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -25,6 +25,9 @@ UTF-8 + @maven-compiler-plugin.version@ + @maven-exec-plugin.version@ + @slf4j.version@ @@ -46,7 +49,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.6.1 + ${maven-compiler-plugin.version} ${targetPlatform} ${targetPlatform} @@ -59,7 +62,7 @@ org.codehaus.mojo exec-maven-plugin - 1.5.0 + ${maven-exec-plugin.version} false @@ -79,12 +82,12 @@ org.slf4j slf4j-api - 1.7.14 + ${slf4j.version} org.slf4j slf4j-jdk14 - 1.7.14 + ${slf4j.version} diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index fc0940bf2d..8e4edbd29e 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -25,6 +25,9 @@ UTF-8 + @maven-compiler-plugin.version@ + @maven-exec-plugin.version@ + @slf4j.version@ @@ -46,7 +49,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.6.1 + ${maven-compiler-plugin.version} 1.7 1.7 @@ -59,7 +62,7 @@ org.codehaus.mojo exec-maven-plugin - 1.5.0 + ${maven-exec-plugin.version} false @@ -79,12 +82,12 @@ org.slf4j slf4j-api - 1.7.14 + ${slf4j.version} org.slf4j slf4j-jdk14 - 1.7.14 + ${slf4j.version} diff --git a/pom.xml b/pom.xml index f9a662c658..a54ec1c5e9 100644 --- a/pom.xml +++ b/pom.xml @@ -110,7 +110,20 @@ 6 1 + v2-rev295-1.22.0 + 1.22.0 + 20.0 + 1.3 + 2.4 4.12 + 3.6.1 + 1.4.0 + 3.0.2 + 3.0.0 + 1.9.5 + v1-rev10-1.22.0 + 1.7.14 + 2.20 pom From 63cd3ae0c0ed1c6f4b820759d5666be27f241c29 Mon Sep 17 00:00:00 2001 From: Bill Neubauer Date: Wed, 11 Oct 2017 11:56:07 -0700 Subject: [PATCH 54/77] Set executor version to 1.5.0 Leaving the executor version where it was, rather than changing it to match Beam. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a54ec1c5e9..e4b917f0cf 100644 --- a/pom.xml +++ b/pom.xml @@ -117,7 +117,7 @@ 2.4 4.12 3.6.1 - 1.4.0 + 1.5.0 3.0.2 3.0.0 1.9.5 From 8a1b42be31802e40571566dc285bf1b37f6555bd Mon Sep 17 00:00:00 2001 From: Kenneth Knowles Date: Wed, 6 Dec 2017 15:13:20 -0800 Subject: [PATCH 55/77] Upgrade Beam to version 2.2.0 --- .../src/main/java/WordCount.java | 4 ++++ .../main/java/common/WriteOneFilePerWindow.java | 8 ++++---- .../java/complete/game/injector/Injector.java | 2 +- .../complete/game/injector/InjectorUtils.java | 2 +- .../java/complete/game/utils/WriteToText.java | 8 ++++---- .../src/test/java/DebuggingWordCountTest.java | 11 +++++++++-- .../src/main/java/WordCount.java | 4 ++++ .../src/test/java/DebuggingWordCountTest.java | 11 +++++++++-- pom.xml | 15 ++++++++------- 9 files changed, 44 insertions(+), 21 deletions(-) diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java index 79b71403b9..9947a26eda 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -21,6 +21,7 @@ import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; @@ -88,9 +89,12 @@ public class WordCount { */ static class ExtractWordsFn extends DoFn { private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); + private final Distribution lineLenDist = Metrics.distribution( + ExtractWordsFn.class, 
"lineLenDistro"); @ProcessElement public void processElement(ProcessContext c) { + lineLenDist.update(c.element().length()); if (c.element().trim().isEmpty()) { emptyLines.inc(); } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java index c7296162b6..9796d647b5 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java @@ -91,10 +91,10 @@ public String filenamePrefixForWindow(IntervalWindow window) { @Override public ResourceId windowedFilename(int shardNumber, - int numShards, - BoundedWindow window, - PaneInfo paneInfo, - OutputFileHints outputFileHints) { + int numShards, + BoundedWindow window, + PaneInfo paneInfo, + OutputFileHints outputFileHints) { IntervalWindow intervalWindow = (IntervalWindow) window; String filename = String.format( diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java index 4814ffb66f..980966e0ce 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java @@ -167,7 +167,7 @@ long getStartTimeInMillis() { return startTimeInMillis; } long getEndTimeInMillis() { - return startTimeInMillis + (expirationPeriod * 60 * 1000); + return startTimeInMillis + (expirationPeriod * 60L * 1000L); } String getRandomUser() { int userNum = 
random.nextInt(numMembers); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java index 55e8c7a8c3..ddcbff4f41 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java @@ -93,7 +93,7 @@ public static void createTopic(Pubsub client, String fullTopicName) Topic topic = client.projects().topics() .create(fullTopicName, new Topic()) .execute(); - System.out.printf("Topic %s was created.\n", topic.getName()); + System.out.printf("Topic %s was created.%n", topic.getName()); } } } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java index 7d8d19f70d..dbd5e39977 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java @@ -145,10 +145,10 @@ public String filenamePrefixForWindow(IntervalWindow window) { @Override public ResourceId windowedFilename(int shardNumber, - int numShards, - BoundedWindow window, - PaneInfo paneInfo, - OutputFileHints outputFileHints) { + int numShards, + BoundedWindow window, + PaneInfo paneInfo, + OutputFileHints outputFileHints) { IntervalWindow intervalWindow = (IntervalWindow) window; String filename = String.format( diff --git 
a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java index 155242d996..26e1498d71 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java +++ b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java @@ -35,6 +35,13 @@ public class DebuggingWordCountTest { @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); + private String getFilePath(String filePath) { + if (filePath.contains(":")) { + return filePath.replace("\\", "/").split(":")[1]; + } + return filePath; + } + @Test public void testDebuggingWordCount() throws Exception { File inputFile = tmpFolder.newFile(); @@ -45,8 +52,8 @@ public void testDebuggingWordCount() throws Exception { StandardCharsets.UTF_8); WordCountOptions options = TestPipeline.testingPipelineOptions().as(WordCountOptions.class); - options.setInputFile(inputFile.getAbsolutePath()); - options.setOutput(outputFile.getAbsolutePath()); + options.setInputFile(getFilePath(inputFile.getAbsolutePath())); + options.setOutput(getFilePath(outputFile.getAbsolutePath())); DebuggingWordCount.main(TestPipeline.convertToArgs(options)); } } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java index 79b71403b9..9947a26eda 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -21,6 +21,7 @@ import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.metrics.Counter; +import 
org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; @@ -88,9 +89,12 @@ public class WordCount { */ static class ExtractWordsFn extends DoFn { private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); + private final Distribution lineLenDist = Metrics.distribution( + ExtractWordsFn.class, "lineLenDistro"); @ProcessElement public void processElement(ProcessContext c) { + lineLenDist.update(c.element().length()); if (c.element().trim().isEmpty()) { emptyLines.inc(); } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java index 155242d996..26e1498d71 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java @@ -35,6 +35,13 @@ public class DebuggingWordCountTest { @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); + private String getFilePath(String filePath) { + if (filePath.contains(":")) { + return filePath.replace("\\", "/").split(":")[1]; + } + return filePath; + } + @Test public void testDebuggingWordCount() throws Exception { File inputFile = tmpFolder.newFile(); @@ -45,8 +52,8 @@ public void testDebuggingWordCount() throws Exception { StandardCharsets.UTF_8); WordCountOptions options = TestPipeline.testingPipelineOptions().as(WordCountOptions.class); - options.setInputFile(inputFile.getAbsolutePath()); - options.setOutput(outputFile.getAbsolutePath()); + options.setInputFile(getFilePath(inputFile.getAbsolutePath())); + options.setOutput(getFilePath(outputFile.getAbsolutePath())); DebuggingWordCount.main(TestPipeline.convertToArgs(options)); } } diff 
--git a/pom.xml b/pom.xml index e4b917f0cf..a3ce24f916 100644 --- a/pom.xml +++ b/pom.xml @@ -103,27 +103,28 @@ ${maven.build.timestamp} yyyy-MM-dd HH:mm - 2.2.0-SNAPSHOT + 2.2.0 Google Cloud Dataflow SDK for Java ${project.version}-20170517 6 1 - v2-rev295-1.22.0 + v2-rev355-1.22.0 1.22.0 20.0 1.3 2.4 4.12 - 3.6.1 - 1.5.0 - 3.0.2 - 3.0.0 1.9.5 v1-rev10-1.22.0 - 1.7.14 + 1.7.25 + 2.20 + 3.6.2 + 1.6.0 + 3.0.2 + 3.0.0 pom From a363bb33af4a8de572de15c1344fbc3e4133220a Mon Sep 17 00:00:00 2001 From: Batkhuyag Batsaikhan Date: Mon, 26 Feb 2018 17:51:59 -0800 Subject: [PATCH 56/77] Upgrade to Apache Beam, version 2.3.0 --- examples/pom.xml | 5 - .../dataflow/sdk/ExamplesDependencies.java | 4 +- maven-archetypes/examples-java8/pom.xml | 80 ---- .../META-INF/maven/archetype-metadata.xml | 38 -- .../examples-java8/src/main/resources/NOTICE | 5 - .../resources/archetype-resources/pom.xml | 263 ------------ .../src/main/java/DebuggingWordCount.java | 162 ------- .../src/main/java/MinimalWordCount.java | 119 ----- .../src/main/java/MinimalWordCountJava8.java | 72 ---- .../src/main/java/WindowedWordCount.java | 223 ---------- .../src/main/java/WordCount.java | 190 -------- .../common/ExampleBigQueryTableOptions.java | 55 --- ...mplePubsubTopicAndSubscriptionOptions.java | 45 -- .../common/ExamplePubsubTopicOptions.java | 45 -- .../src/main/java/common/ExampleUtils.java | 406 ------------------ .../java/common/WriteOneFilePerWindow.java | 117 ----- .../src/test/java/DebuggingWordCountTest.java | 59 --- .../src/test/java/WordCountTest.java | 86 ---- .../projects/basic/archetype.properties | 19 - .../test/resources/projects/basic/goal.txt | 1 - maven-archetypes/examples/pom.xml | 6 +- .../META-INF/maven/archetype-metadata.xml | 2 +- .../resources/archetype-resources/pom.xml | 13 +- .../src/main/java/MinimalWordCount.java | 88 ++-- .../src/main/java/WindowedWordCount.java | 7 +- .../src/main/java/WordCount.java | 3 +- .../src/main/java/common/ExampleUtils.java | 63 +-- 
.../main/java/complete/game/GameStats.java | 172 ++++---- .../java/complete/game/HourlyTeamScore.java | 80 ++-- .../main/java/complete/game/LeaderBoard.java | 50 +-- .../java/complete/game/StatefulTeamScore.java | 227 ++++++++++ .../main/java/complete/game/UserScore.java | 18 +- .../java/complete/game/injector/Injector.java | 91 ++-- .../complete/game/injector/InjectorUtils.java | 0 .../injector/RetryHttpInitializerWrapper.java | 42 +- .../complete/game/utils/GameConstants.java} | 26 +- .../complete/game/utils/WriteToBigQuery.java | 0 .../java/complete/game/utils/WriteToText.java | 2 +- .../game/utils/WriteWindowedToBigQuery.java | 0 .../src/test/java/MinimalWordCountTest.java} | 57 +-- .../src/test/java/WordCountTest.java | 3 +- .../java/complete/game/GameStatsTest.java | 0 .../complete/game/HourlyTeamScoreTest.java | 0 .../java/complete/game/LeaderBoardTest.java | 16 +- .../complete/game/StatefulTeamScoreTest.java | 208 +++++++++ .../java/complete/game/UserScoreTest.java | 0 .../projects/basic/archetype.properties | 2 +- maven-archetypes/pom.xml | 1 - maven-archetypes/starter/pom.xml | 6 +- .../META-INF/maven/archetype-metadata.xml | 2 +- .../projects/basic/archetype.properties | 2 +- .../projects/basic/reference/pom.xml | 4 +- pom.xml | 85 ++-- 53 files changed, 840 insertions(+), 2430 deletions(-) delete mode 100644 maven-archetypes/examples-java8/pom.xml delete mode 100644 maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml delete mode 100644 maven-archetypes/examples-java8/src/main/resources/NOTICE delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java delete mode 100644 
maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java delete mode 100644 maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java delete mode 100644 maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties delete mode 100644 maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java (68%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java (75%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java (87%) create mode 100644 
maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java (94%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java (89%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java (100%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java (71%) rename maven-archetypes/{examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java => examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java} (55%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java (100%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java (99%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java (100%) rename maven-archetypes/{examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java => examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java} (61%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java (100%) rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java (100%) rename maven-archetypes/{examples-java8 => 
examples}/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java (97%) create mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java rename maven-archetypes/{examples-java8 => examples}/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java (100%) diff --git a/examples/pom.xml b/examples/pom.xml index f87ae36b1d..75a1d92174 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -42,10 +42,5 @@ org.apache.beam beam-examples-java - - - org.apache.beam - beam-examples-java8 - diff --git a/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java b/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java index 827aff8395..c51e527edb 100644 --- a/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java +++ b/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java @@ -15,7 +15,7 @@ */ package com.google.cloud.dataflow.sdk; -import org.apache.beam.examples.MinimalWordCountJava8; +import org.apache.beam.examples.MinimalWordCount; import org.apache.beam.examples.WordCount; /** @@ -25,5 +25,5 @@ class ExamplesDependencies { SdkDependencies sdkDependencies; WordCount wordCount; - MinimalWordCountJava8 minimalWordCount; + MinimalWordCount minimalWordCount; } diff --git a/maven-archetypes/examples-java8/pom.xml b/maven-archetypes/examples-java8/pom.xml deleted file mode 100644 index 463c66f1d1..0000000000 --- a/maven-archetypes/examples-java8/pom.xml +++ /dev/null @@ -1,80 +0,0 @@ - - - - - 4.0.0 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-archetypes-parent - 2.2.0-SNAPSHOT - ../pom.xml - - - google-cloud-dataflow-java-archetypes-examples-java8 - Google Cloud Dataflow SDK for Java - Java 8 Examples Archetype - Google Cloud Dataflow SDK for Java is a distribution of Apache - Beam designed to simplify usage of Apache Beam on Google Cloud 
Dataflow - service. This archetype creates a project containing all the example - pipelines targeting Java 8. - - maven-archetype - - - - - org.apache.maven.archetype - archetype-packaging - 2.4 - - - - - - - maven-archetype-plugin - 2.4 - - - org.apache.maven.shared - maven-invoker - 2.2 - - - - - - default-integration-test - install - - integration-test - - - - - - - - - - diff --git a/maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml deleted file mode 100644 index 326fdaa528..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/META-INF/maven/archetype-metadata.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - src/main/java - - **/*.java - - - - - src/test/java - - **/*.java - - - - diff --git a/maven-archetypes/examples-java8/src/main/resources/NOTICE b/maven-archetypes/examples-java8/src/main/resources/NOTICE deleted file mode 100644 index 981fde5a9e..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/NOTICE +++ /dev/null @@ -1,5 +0,0 @@ -Google Cloud Dataflow SDK for Java -Copyright 2017, Google Inc. - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). 
diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml deleted file mode 100644 index 12d2783a0f..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml +++ /dev/null @@ -1,263 +0,0 @@ - - - - 4.0.0 - - ${groupId} - ${artifactId} - ${version} - - jar - - - UTF-8 - - @bigquery.version@ - @google-clients.version@ - @guava.version@ - @hamcrest.version@ - @jackson.version@ - @joda.version@ - @junit.version@ - @maven-compiler-plugin.version@ - @maven-exec-plugin.version@ - @maven-jar-plugin.version@ - @maven-shade-plugin.version@ - @mockito.version@ - @pubsub.version@ - @slf4j.version@ - @surefire-plugin.version@ - - - - - ossrh.snapshots - Sonatype OSS Repository Hosting - https://oss.sonatype.org/content/repositories/snapshots/ - - false - - - true - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - 1.8 - 1.8 - - - - - org.apache.maven.plugins - maven-surefire-plugin - ${surefire-plugin.version} - - all - 4 - true - - - - org.apache.maven.surefire - surefire-junit47 - ${surefire-plugin.version} - - - - - - - org.apache.maven.plugins - maven-jar-plugin - ${maven-jar-plugin.version} - - - - - org.apache.maven.plugins - maven-shade-plugin - ${maven-shade-plugin.version} - - - package - - shade - - - ${project.artifactId}-bundled-${project.version} - - - *:* - - META-INF/LICENSE - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - - - - - - - - org.codehaus.mojo - exec-maven-plugin - ${maven-exec-plugin.version} - - false - - - - - - - - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - @project.version@ - - - - - com.google.api-client - google-api-client - ${google-clients.version} - - - - com.google.guava - guava-jdk5 - - - - - - com.google.apis - google-api-services-bigquery - ${bigquery.version} - - - - com.google.guava - 
guava-jdk5 - - - - - - com.google.http-client - google-http-client - ${google-clients.version} - - - - com.google.guava - guava-jdk5 - - - - - - com.google.apis - google-api-services-pubsub - ${pubsub.version} - - - - com.google.guava - guava-jdk5 - - - - - - joda-time - joda-time - ${joda.version} - - - - com.google.guava - guava - ${guava.version} - - - - - org.slf4j - slf4j-api - ${slf4j.version} - - - - org.slf4j - slf4j-jdk14 - ${slf4j.version} - - runtime - - - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - - - - junit - junit - ${junit.version} - - - - org.mockito - mockito-all - ${mockito.version} - test - - - diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java deleted file mode 100644 index 07870f2ed0..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}; - -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -/** - * An example that verifies word counts in Shakespeare and includes Beam best practices. - * - *

This class, {@link DebuggingWordCount}, is the third in a series of four successively more - * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount} - * and {@link WordCount}. After you've looked at this example, then see the - * {@link WindowedWordCount} pipeline, for introduction of additional concepts. - * - *

Basic concepts, also in the MinimalWordCount and WordCount examples: - * Reading text files; counting a PCollection; executing a Pipeline both locally - * and using a selected runner; defining DoFns. - * - *

New Concepts: - *

- *   1. Logging using SLF4J, even in a distributed environment
- *   2. Creating a custom metric (runners have varying levels of support)
- *   3. Testing your Pipeline via PAssert
- * 
- * - *

To execute this pipeline locally, specify general pipeline configuration: - *

{@code
- *   --project=YOUR_PROJECT_ID
- * }
- * 
- * - *

To change the runner, specify: - *

{@code
- *   --runner=YOUR_SELECTED_RUNNER
- * }
- * 
- * - *

The input file defaults to a public data set containing the text of of King Lear, - * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. - * - */ -public class DebuggingWordCount { - /** A DoFn that filters for a specific key based upon a regular expression. */ - public static class FilterTextFn extends DoFn, KV> { - /** - * Concept #1: The logger below uses the fully qualified class name of FilterTextFn as the - * logger. Depending on your SLF4J configuration, log statements will likely be qualified by - * this name. - * - *

Note that this is entirely standard SLF4J usage. Some runners may provide a default SLF4J - * configuration that is most appropriate for their logging integration. - */ - private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class); - - private final Pattern filter; - public FilterTextFn(String pattern) { - filter = Pattern.compile(pattern); - } - - /** - * Concept #2: A custom metric can track values in your pipeline as it runs. Each - * runner provides varying levels of support for metrics, and may expose them - * in a dashboard, etc. - */ - private final Counter matchedWords = Metrics.counter(FilterTextFn.class, "matchedWords"); - private final Counter unmatchedWords = Metrics.counter(FilterTextFn.class, "unmatchedWords"); - - @ProcessElement - public void processElement(ProcessContext c) { - if (filter.matcher(c.element().getKey()).matches()) { - // Log at the "DEBUG" level each element that we match. When executing this pipeline - // these log lines will appear only if the log level is set to "DEBUG" or lower. - LOG.debug("Matched: " + c.element().getKey()); - matchedWords.inc(); - c.output(c.element()); - } else { - // Log at the "TRACE" level each element that is not matched. Different log levels - // can be used to control the verbosity of logging providing an effective mechanism - // to filter less important information. - LOG.trace("Did not match: " + c.element().getKey()); - unmatchedWords.inc(); - } - } - } - - /** - * Options supported by {@link DebuggingWordCount}. - * - *

Inherits standard configuration options and all options defined in - * {@link WordCount.WordCountOptions}. - */ - public interface WordCountOptions extends WordCount.WordCountOptions { - - @Description("Regex filter pattern to use in DebuggingWordCount. " - + "Only words matching this pattern will be counted.") - @Default.String("Flourish|stomach") - String getFilterPattern(); - void setFilterPattern(String value); - } - - public static void main(String[] args) { - WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() - .as(WordCountOptions.class); - Pipeline p = Pipeline.create(options); - - PCollection> filteredWords = - p.apply("ReadLines", TextIO.read().from(options.getInputFile())) - .apply(new WordCount.CountWords()) - .apply(ParDo.of(new FilterTextFn(options.getFilterPattern()))); - - /** - * Concept #3: PAssert is a set of convenient PTransforms in the style of - * Hamcrest's collection matchers that can be used when writing Pipeline level tests - * to validate the contents of PCollections. PAssert is best used in unit tests - * with small data sets but is demonstrated here as a teaching tool. - * - *

Below we verify that the set of filtered words matches our expected counts. Note - * that PAssert does not provide any output and that successful completion of the - * Pipeline implies that the expectations were met. Learn more at - * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test - * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test. - */ - List> expectedResults = Arrays.asList( - KV.of("Flourish", 3L), - KV.of("stomach", 1L)); - PAssert.that(filteredWords).containsInAnyOrder(expectedResults); - - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java deleted file mode 100644 index d6b08066db..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}; - -import ${package}.common.ExampleUtils; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.values.KV; - - -/** - * An example that counts words in Shakespeare. - * - *

This class, {@link MinimalWordCount}, is the first in a series of four successively more - * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or - * argument processing, and focus on construction of the pipeline, which chains together the - * application of core transforms. - * - *

Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally the - * {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional - * concepts. - * - *

Concepts: - * - *

- *   1. Reading data from text files
- *   2. Specifying 'inline' transforms
- *   3. Counting items in a PCollection
- *   4. Writing data to text files
- * 
- * - *

No arguments are required to run this pipeline. It will be executed with the DirectRunner. You - * can see the results in the output files in your current working directory, with names like - * "wordcounts-00001-of-00005. When running on a distributed service, you would use an appropriate - * file service. - */ -public class MinimalWordCount { - - public static void main(String[] args) { - // Create a PipelineOptions object. This object lets us set various execution - // options for our pipeline, such as the runner you wish to use. This example - // will run with the DirectRunner by default, based on the class path configured - // in its dependencies. - PipelineOptions options = PipelineOptionsFactory.create(); - - // Create the Pipeline object with the options we defined above. - Pipeline p = Pipeline.create(options); - - // Apply the pipeline's transforms. - - // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set - // of input text files. TextIO.Read returns a PCollection where each element is one line from - // the input text (a set of Shakespeare's texts). - - // This example reads a public data set consisting of the complete works of Shakespeare. - p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) - - // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a - // DoFn (defined in-line) on each element that tokenizes the text line into individual words. - // The ParDo returns a PCollection, where each element is an individual word in - // Shakespeare's collected texts. - .apply("ExtractWords", ParDo.of(new DoFn() { - @ProcessElement - public void processElement(ProcessContext c) { - for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) { - if (!word.isEmpty()) { - c.output(word); - } - } - } - })) - - // Concept #3: Apply the Count transform to our PCollection of individual words. 
The Count - // transform returns a new PCollection of key/value pairs, where each key represents a unique - // word in the text. The associated value is the occurrence count for that word. - .apply(Count.perElement()) - - // Apply a MapElements transform that formats our PCollection of word counts into a printable - // string, suitable for writing to an output file. - .apply("FormatResults", MapElements.via(new SimpleFunction, String>() { - @Override - public String apply(KV input) { - return input.getKey() + ": " + input.getValue(); - } - })) - - // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline. - // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of - // formatted strings) to a series of text files. - // - // By default, it will write to a set of files with names like wordcount-00001-of-00005 - .apply(TextIO.write().to("wordcounts")); - - // Run the pipeline. - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java deleted file mode 100644 index e635a885b7..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import java.util.Arrays; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.Filter; -import org.apache.beam.sdk.transforms.FlatMapElements; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.TypeDescriptors; - -/** - * An example that counts words in Shakespeare, using Java 8 language features. - * - *

See {@link MinimalWordCount} for a comprehensive explanation. - */ -public class MinimalWordCountJava8 { - - public static void main(String[] args) { - PipelineOptions options = PipelineOptionsFactory.create(); - // In order to run your pipeline, you need to make following runner specific changes: - // - // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner - // or FlinkRunner. - // CHANGE 2/3: Specify runner-required options. - // For BlockingDataflowRunner, set project and temp location as follows: - // DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class); - // dataflowOptions.setRunner(BlockingDataflowRunner.class); - // dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE"); - // dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY"); - // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions} - // for more details. - // options.as(FlinkPipelineOptions.class) - // .setRunner(FlinkRunner.class); - - Pipeline p = Pipeline.create(options); - - p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) - .apply(FlatMapElements - .into(TypeDescriptors.strings()) - .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+")))) - .apply(Filter.by((String word) -> !word.isEmpty())) - .apply(Count.perElement()) - .apply(MapElements - .into(TypeDescriptors.strings()) - .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())) - // CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to. 
- .apply(TextIO.write().to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX")); - - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java deleted file mode 100644 index 6a1d07c485..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}; - -import java.io.IOException; -import java.util.concurrent.ThreadLocalRandom; -import ${package}.common.ExampleBigQueryTableOptions; -import ${package}.common.ExampleOptions; -import ${package}.common.WriteOneFilePerWindow; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.joda.time.Duration; -import org.joda.time.Instant; - - -/** - * An example that counts words in text, and can run over either unbounded or bounded input - * collections. - * - *

This class, {@link WindowedWordCount}, is the last in a series of four successively more - * detailed 'word count' examples. First take a look at {@link MinimalWordCount}, - * {@link WordCount}, and {@link DebuggingWordCount}. - * - *

Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples: - * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally - * and using a selected runner; defining DoFns; - * user-defined PTransforms; defining PipelineOptions. - * - *

New Concepts: - *

- *   1. Unbounded and bounded pipeline input modes
- *   2. Adding timestamps to data
- *   3. Windowing
- *   4. Re-using PTransforms over windowed PCollections
- *   5. Accessing the window of an element
- *   6. Writing data to per-window text files
- * 
- * - *

By default, the examples will run with the {@code DirectRunner}. - * To change the runner, specify: - *

{@code
- *   --runner=YOUR_SELECTED_RUNNER
- * }
- * 
- * See examples/java/README.md for instructions about how to configure different runners. - * - *

To execute this pipeline locally, specify a local output file (if using the - * {@code DirectRunner}) or output prefix on a supported distributed file system. - *

{@code
- *   --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
- * }
- * - *

The input file defaults to a public data set containing the text of of King Lear, - * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. - * - *

By default, the pipeline will do fixed windowing, on 1-minute windows. You can - * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10} - * for 10-minute windows. - * - *

The example will try to cancel the pipeline on the signal to terminate the process (CTRL-C). - */ -public class WindowedWordCount { - static final int WINDOW_SIZE = 10; // Default window duration in minutes - /** - * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for - * this example, for the bounded data case. - * - *

Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate - * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a - * 2-hour period. - */ - static class AddTimestampFn extends DoFn { - private static final Duration RAND_RANGE = Duration.standardHours(1); - private final Instant minTimestamp; - private final Instant maxTimestamp; - - AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) { - this.minTimestamp = minTimestamp; - this.maxTimestamp = maxTimestamp; - } - - @ProcessElement - public void processElement(ProcessContext c) { - Instant randomTimestamp = - new Instant( - ThreadLocalRandom.current() - .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis())); - - /** - * Concept #2: Set the data element with that timestamp. - */ - c.outputWithTimestamp(c.element(), new Instant(randomTimestamp)); - } - } - - /** A {@link DefaultValueFactory} that returns the current system time. */ - public static class DefaultToCurrentSystemTime implements DefaultValueFactory { - @Override - public Long create(PipelineOptions options) { - return System.currentTimeMillis(); - } - } - - /** A {@link DefaultValueFactory} that returns the minimum timestamp plus one hour. */ - public static class DefaultToMinTimestampPlusOneHour implements DefaultValueFactory { - @Override - public Long create(PipelineOptions options) { - return options.as(Options.class).getMinTimestampMillis() - + Duration.standardHours(1).getMillis(); - } - } - - /** - * Options for {@link WindowedWordCount}. - * - *

Inherits standard example configuration options, which allow specification of the - * runner, as well as the {@link WordCount.WordCountOptions} support for - * specification of the input and output files. - */ - public interface Options extends WordCount.WordCountOptions, - ExampleOptions, ExampleBigQueryTableOptions { - @Description("Fixed window duration, in minutes") - @Default.Integer(WINDOW_SIZE) - Integer getWindowSize(); - void setWindowSize(Integer value); - - @Description("Minimum randomly assigned timestamp, in milliseconds-since-epoch") - @Default.InstanceFactory(DefaultToCurrentSystemTime.class) - Long getMinTimestampMillis(); - void setMinTimestampMillis(Long value); - - @Description("Maximum randomly assigned timestamp, in milliseconds-since-epoch") - @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class) - Long getMaxTimestampMillis(); - void setMaxTimestampMillis(Long value); - - @Description("Fixed number of shards to produce per window, or null for runner-chosen sharding") - Integer getNumShards(); - void setNumShards(Integer numShards); - } - - public static void main(String[] args) throws IOException { - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - final String output = options.getOutput(); - final Instant minTimestamp = new Instant(options.getMinTimestampMillis()); - final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis()); - - Pipeline pipeline = Pipeline.create(options); - - /** - * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or - * unbounded input source. - */ - PCollection input = pipeline - /** Read from the GCS file. */ - .apply(TextIO.read().from(options.getInputFile())) - // Concept #2: Add an element timestamp, using an artificial time just to show windowing. - // See AddTimestampFn for more detail on this. - .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp))); - - /** - * Concept #3: Window into fixed windows. 
The fixed window size for this example defaults to 1 - * minute (you can change this with a command-line option). See the documentation for more - * information on how fixed windows work, and for information on the other types of windowing - * available (e.g., sliding windows). - */ - PCollection windowedWords = - input.apply( - Window.into( - FixedWindows.of(Duration.standardMinutes(options.getWindowSize())))); - - /** - * Concept #4: Re-use our existing CountWords transform that does not have knowledge of - * windows over a PCollection containing windowed values. - */ - PCollection> wordCounts = windowedWords.apply(new WordCount.CountWords()); - - /** - * Concept #5: Format the results and write to a sharded file partitioned by window, using a - * simple ParDo operation. Because there may be failures followed by retries, the - * writes must be idempotent, but the details of writing to files is elided here. - */ - wordCounts - .apply(MapElements.via(new WordCount.FormatAsTextFn())) - .apply(new WriteOneFilePerWindow(output, options.getNumShards())); - - PipelineResult result = pipeline.run(); - try { - result.waitUntilFinish(); - } catch (Exception exc) { - result.cancel(); - } - } - -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java deleted file mode 100644 index 9947a26eda..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import ${package}.common.ExampleUtils; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Distribution; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.Validation.Required; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; - -/** - * An example that counts words in Shakespeare and includes Beam best practices. - * - *

This class, {@link WordCount}, is the second in a series of four successively more detailed - * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. - * After you've looked at this example, then see the {@link DebuggingWordCount} - * pipeline, for introduction of additional concepts. - * - *

For a detailed walkthrough of this example, see - * - * https://beam.apache.org/get-started/wordcount-example/ - * - * - *

Basic concepts, also in the MinimalWordCount example: - * Reading text files; counting a PCollection; writing to text files - * - *

New Concepts: - *

- *   1. Executing a Pipeline both locally and using the selected runner
- *   2. Using ParDo with static DoFns defined out-of-line
- *   3. Building a composite transform
- *   4. Defining your own pipeline options
- * 
- * - *

Concept #1: you can execute this pipeline either locally or using by selecting another runner. - * These are now command-line options and not hard-coded as they were in the MinimalWordCount - * example. - * - *

To change the runner, specify: - *

{@code
- *   --runner=YOUR_SELECTED_RUNNER
- * }
- * 
- * - *

To execute this pipeline, specify a local output file (if using the - * {@code DirectRunner}) or output prefix on a supported distributed file system. - *

{@code
- *   --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
- * }
- * - *

The input file defaults to a public data set containing the text of of King Lear, - * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. - */ -public class WordCount { - - /** - * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns - * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it - * to a ParDo in the pipeline. - */ - static class ExtractWordsFn extends DoFn { - private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); - private final Distribution lineLenDist = Metrics.distribution( - ExtractWordsFn.class, "lineLenDistro"); - - @ProcessElement - public void processElement(ProcessContext c) { - lineLenDist.update(c.element().length()); - if (c.element().trim().isEmpty()) { - emptyLines.inc(); - } - - // Split the line into words. - String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN); - - // Output each word encountered into the output PCollection. - for (String word : words) { - if (!word.isEmpty()) { - c.output(word); - } - } - } - } - - /** A SimpleFunction that converts a Word and Count into a printable string. */ - public static class FormatAsTextFn extends SimpleFunction, String> { - @Override - public String apply(KV input) { - return input.getKey() + ": " + input.getValue(); - } - } - - /** - * A PTransform that converts a PCollection containing lines of text into a PCollection of - * formatted word counts. - * - *

Concept #3: This is a custom composite transform that bundles two transforms (ParDo and - * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse, - * modular testing, and an improved monitoring experience. - */ - public static class CountWords extends PTransform, - PCollection>> { - @Override - public PCollection> expand(PCollection lines) { - - // Convert lines of text into individual words. - PCollection words = lines.apply( - ParDo.of(new ExtractWordsFn())); - - // Count the number of times each word occurs. - PCollection> wordCounts = - words.apply(Count.perElement()); - - return wordCounts; - } - } - - /** - * Options supported by {@link WordCount}. - * - *

Concept #4: Defining your own configuration options. Here, you can add your own arguments - * to be processed by the command-line parser, and specify default values for them. You can then - * access the options values in your pipeline code. - * - *

Inherits standard configuration options. - */ - public interface WordCountOptions extends PipelineOptions { - - /** - * By default, this example reads from a public dataset containing the text of - * King Lear. Set this option to choose a different input file or glob. - */ - @Description("Path of the file to read from") - @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") - String getInputFile(); - void setInputFile(String value); - - /** - * Set this required option to specify where to write the output. - */ - @Description("Path of the file to write to") - @Required - String getOutput(); - void setOutput(String value); - } - - public static void main(String[] args) { - WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() - .as(WordCountOptions.class); - Pipeline p = Pipeline.create(options); - - // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the - // static FormatAsTextFn() to the ParDo transform. - p.apply("ReadLines", TextIO.read().from(options.getInputFile())) - .apply(new CountWords()) - .apply(MapElements.via(new FormatAsTextFn())) - .apply("WriteCounts", TextIO.write().to(options.getOutput())); - - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java deleted file mode 100644 index 57f1546e27..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import com.google.api.services.bigquery.model.TableSchema; -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure BigQuery tables in Beam examples. - * The project defaults to the project being used to run the example. - */ -public interface ExampleBigQueryTableOptions extends GcpOptions { - @Description("BigQuery dataset name") - @Default.String("beam_examples") - String getBigQueryDataset(); - void setBigQueryDataset(String dataset); - - @Description("BigQuery table name") - @Default.InstanceFactory(BigQueryTableFactory.class) - String getBigQueryTable(); - void setBigQueryTable(String table); - - @Description("BigQuery table schema") - TableSchema getBigQuerySchema(); - void setBigQuerySchema(TableSchema schema); - - /** - * Returns the job name as the default BigQuery table name. 
- */ - class BigQueryTableFactory implements DefaultValueFactory { - @Override - public String create(PipelineOptions options) { - return options.getJobName().replace('-', '_'); - } - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java deleted file mode 100644 index cf142a10fd..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure Pub/Sub topic/subscription in Beam examples. 
- */ -public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions { - @Description("Pub/Sub subscription") - @Default.InstanceFactory(PubsubSubscriptionFactory.class) - String getPubsubSubscription(); - void setPubsubSubscription(String subscription); - - /** - * Returns a default Pub/Sub subscription based on the project and the job names. - */ - class PubsubSubscriptionFactory implements DefaultValueFactory { - @Override - public String create(PipelineOptions options) { - return "projects/" + options.as(GcpOptions.class).getProject() - + "/subscriptions/" + options.getJobName(); - } - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java deleted file mode 100644 index 86784b06da..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.common; - -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure Pub/Sub topic in Beam examples. - */ -public interface ExamplePubsubTopicOptions extends GcpOptions { - @Description("Pub/Sub topic") - @Default.InstanceFactory(PubsubTopicFactory.class) - String getPubsubTopic(); - void setPubsubTopic(String topic); - - /** - * Returns a default Pub/Sub topic based on the project and the job names. - */ - class PubsubTopicFactory implements DefaultValueFactory { - @Override - public String create(PipelineOptions options) { - return "projects/" + options.as(GcpOptions.class).getProject() - + "/topics/" + options.getJobName(); - } - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java deleted file mode 100644 index 78f3849b40..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import com.google.api.client.googleapis.json.GoogleJsonResponseException; -import com.google.api.client.googleapis.services.AbstractGoogleClientRequest; -import com.google.api.client.http.HttpRequestInitializer; -import com.google.api.services.bigquery.Bigquery; -import com.google.api.services.bigquery.Bigquery.Datasets; -import com.google.api.services.bigquery.Bigquery.Tables; -import com.google.api.services.bigquery.model.Dataset; -import com.google.api.services.bigquery.model.DatasetReference; -import com.google.api.services.bigquery.model.Table; -import com.google.api.services.bigquery.model.TableReference; -import com.google.api.services.bigquery.model.TableSchema; -import com.google.api.services.pubsub.Pubsub; -import com.google.api.services.pubsub.model.Subscription; -import com.google.api.services.pubsub.model.Topic; -import com.google.auth.Credentials; -import com.google.auth.http.HttpCredentialsAdapter; -import com.google.cloud.hadoop.util.ChainingHttpRequestInitializer; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.common.util.concurrent.Uninterruptibles; -import java.io.IOException; -import java.util.Collection; -import java.util.List; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.extensions.gcp.auth.NullCredentialInitializer; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; -import 
org.apache.beam.sdk.io.gcp.pubsub.PubsubOptions; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.sdk.util.BackOffUtils; -import org.apache.beam.sdk.util.FluentBackoff; -import org.apache.beam.sdk.util.RetryHttpRequestInitializer; -import org.apache.beam.sdk.util.Sleeper; -import org.apache.beam.sdk.util.Transport; -import org.joda.time.Duration; - -/** - * The utility class that sets up and tears down external resources, - * and cancels the streaming pipelines once the program terminates. - * - *

It is used to run Beam examples. - */ -public class ExampleUtils { - - private static final int SC_NOT_FOUND = 404; - - /** - * \p{L} denotes the category of Unicode letters, - * so this pattern will match on everything that is not a letter. - * - *

It is used for tokenizing strings in the wordcount examples. - */ - public static final String TOKENIZER_PATTERN = "[^\\p{L}]+"; - - private final PipelineOptions options; - private Bigquery bigQueryClient = null; - private Pubsub pubsubClient = null; - private Set pipelinesToCancel = Sets.newHashSet(); - private List pendingMessages = Lists.newArrayList(); - - /** - * Do resources and runner options setup. - */ - public ExampleUtils(PipelineOptions options) { - this.options = options; - } - - /** - * Sets up external resources that are required by the example, - * such as Pub/Sub topics and BigQuery tables. - * - * @throws IOException if there is a problem setting up the resources - */ - public void setup() throws IOException { - Sleeper sleeper = Sleeper.DEFAULT; - BackOff backOff = - FluentBackoff.DEFAULT - .withMaxRetries(3).withInitialBackoff(Duration.millis(200)).backoff(); - Throwable lastException = null; - try { - do { - try { - setupPubsub(); - setupBigQueryTable(); - return; - } catch (GoogleJsonResponseException e) { - lastException = e; - } - } while (BackOffUtils.next(sleeper, backOff)); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - // Ignore InterruptedException - } - throw new RuntimeException(lastException); - } - - /** - * Sets up the Google Cloud Pub/Sub topic. - * - *

If the topic doesn't exist, a new topic with the given name will be created. - * - * @throws IOException if there is a problem setting up the Pub/Sub topic - */ - public void setupPubsub() throws IOException { - ExamplePubsubTopicAndSubscriptionOptions pubsubOptions = - options.as(ExamplePubsubTopicAndSubscriptionOptions.class); - if (!pubsubOptions.getPubsubTopic().isEmpty()) { - pendingMessages.add("**********************Set Up Pubsub************************"); - setupPubsubTopic(pubsubOptions.getPubsubTopic()); - pendingMessages.add("The Pub/Sub topic has been set up for this example: " - + pubsubOptions.getPubsubTopic()); - - if (!pubsubOptions.getPubsubSubscription().isEmpty()) { - setupPubsubSubscription( - pubsubOptions.getPubsubTopic(), pubsubOptions.getPubsubSubscription()); - pendingMessages.add("The Pub/Sub subscription has been set up for this example: " - + pubsubOptions.getPubsubSubscription()); - } - } - } - - /** - * Sets up the BigQuery table with the given schema. - * - *

If the table already exists, the schema has to match the given one. Otherwise, the example - * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema - * will be created. - * - * @throws IOException if there is a problem setting up the BigQuery table - */ - public void setupBigQueryTable() throws IOException { - ExampleBigQueryTableOptions bigQueryTableOptions = - options.as(ExampleBigQueryTableOptions.class); - if (bigQueryTableOptions.getBigQueryDataset() != null - && bigQueryTableOptions.getBigQueryTable() != null - && bigQueryTableOptions.getBigQuerySchema() != null) { - pendingMessages.add("******************Set Up Big Query Table*******************"); - setupBigQueryTable(bigQueryTableOptions.getProject(), - bigQueryTableOptions.getBigQueryDataset(), - bigQueryTableOptions.getBigQueryTable(), - bigQueryTableOptions.getBigQuerySchema()); - pendingMessages.add("The BigQuery table has been set up for this example: " - + bigQueryTableOptions.getProject() - + ":" + bigQueryTableOptions.getBigQueryDataset() - + "." + bigQueryTableOptions.getBigQueryTable()); - } - } - - /** - * Tears down external resources that can be deleted upon the example's completion. 
- */ - private void tearDown() { - pendingMessages.add("*************************Tear Down*************************"); - ExamplePubsubTopicAndSubscriptionOptions pubsubOptions = - options.as(ExamplePubsubTopicAndSubscriptionOptions.class); - if (!pubsubOptions.getPubsubTopic().isEmpty()) { - try { - deletePubsubTopic(pubsubOptions.getPubsubTopic()); - pendingMessages.add("The Pub/Sub topic has been deleted: " - + pubsubOptions.getPubsubTopic()); - } catch (IOException e) { - pendingMessages.add("Failed to delete the Pub/Sub topic : " - + pubsubOptions.getPubsubTopic()); - } - if (!pubsubOptions.getPubsubSubscription().isEmpty()) { - try { - deletePubsubSubscription(pubsubOptions.getPubsubSubscription()); - pendingMessages.add("The Pub/Sub subscription has been deleted: " - + pubsubOptions.getPubsubSubscription()); - } catch (IOException e) { - pendingMessages.add("Failed to delete the Pub/Sub subscription : " - + pubsubOptions.getPubsubSubscription()); - } - } - } - - ExampleBigQueryTableOptions bigQueryTableOptions = - options.as(ExampleBigQueryTableOptions.class); - if (bigQueryTableOptions.getBigQueryDataset() != null - && bigQueryTableOptions.getBigQueryTable() != null - && bigQueryTableOptions.getBigQuerySchema() != null) { - pendingMessages.add("The BigQuery table might contain the example's output, " - + "and it is not deleted automatically: " - + bigQueryTableOptions.getProject() - + ":" + bigQueryTableOptions.getBigQueryDataset() - + "." + bigQueryTableOptions.getBigQueryTable()); - pendingMessages.add("Please go to the Developers Console to delete it manually." - + " Otherwise, you may be charged for its usage."); - } - } - - /** - * Returns a BigQuery client builder using the specified {@link BigQueryOptions}. 
- */ - private static Bigquery.Builder newBigQueryClient(BigQueryOptions options) { - return new Bigquery.Builder(Transport.getTransport(), Transport.getJsonFactory(), - chainHttpRequestInitializer( - options.getGcpCredential(), - // Do not log 404. It clutters the output and is possibly even required by the caller. - new RetryHttpRequestInitializer(ImmutableList.of(404)))) - .setApplicationName(options.getAppName()) - .setGoogleClientRequestInitializer(options.getGoogleApiTrace()); - } - - /** - * Returns a Pubsub client builder using the specified {@link PubsubOptions}. - */ - private static Pubsub.Builder newPubsubClient(PubsubOptions options) { - return new Pubsub.Builder(Transport.getTransport(), Transport.getJsonFactory(), - chainHttpRequestInitializer( - options.getGcpCredential(), - // Do not log 404. It clutters the output and is possibly even required by the caller. - new RetryHttpRequestInitializer(ImmutableList.of(404)))) - .setRootUrl(options.getPubsubRootUrl()) - .setApplicationName(options.getAppName()) - .setGoogleClientRequestInitializer(options.getGoogleApiTrace()); - } - - private static HttpRequestInitializer chainHttpRequestInitializer( - Credentials credential, HttpRequestInitializer httpRequestInitializer) { - if (credential == null) { - return new ChainingHttpRequestInitializer( - new NullCredentialInitializer(), httpRequestInitializer); - } else { - return new ChainingHttpRequestInitializer( - new HttpCredentialsAdapter(credential), - httpRequestInitializer); - } - } - - private void setupBigQueryTable(String projectId, String datasetId, String tableId, - TableSchema schema) throws IOException { - if (bigQueryClient == null) { - bigQueryClient = newBigQueryClient(options.as(BigQueryOptions.class)).build(); - } - - Datasets datasetService = bigQueryClient.datasets(); - if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) { - Dataset newDataset = new Dataset().setDatasetReference( - new 
DatasetReference().setProjectId(projectId).setDatasetId(datasetId)); - datasetService.insert(projectId, newDataset).execute(); - } - - Tables tableService = bigQueryClient.tables(); - Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId)); - if (table == null) { - Table newTable = new Table().setSchema(schema).setTableReference( - new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId)); - tableService.insert(projectId, datasetId, newTable).execute(); - } else if (!table.getSchema().equals(schema)) { - throw new RuntimeException( - "Table exists and schemas do not match, expecting: " + schema.toPrettyString() - + ", actual: " + table.getSchema().toPrettyString()); - } - } - - private void setupPubsubTopic(String topic) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null) { - pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute(); - } - } - - private void setupPubsubSubscription(String topic, String subscription) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) == null) { - Subscription subInfo = new Subscription() - .setAckDeadlineSeconds(60) - .setTopic(topic); - pubsubClient.projects().subscriptions().create(subscription, subInfo).execute(); - } - } - - /** - * Deletes the Google Cloud Pub/Sub topic. 
- * - * @throws IOException if there is a problem deleting the Pub/Sub topic - */ - private void deletePubsubTopic(String topic) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null) { - pubsubClient.projects().topics().delete(topic).execute(); - } - } - - /** - * Deletes the Google Cloud Pub/Sub subscription. - * - * @throws IOException if there is a problem deleting the Pub/Sub subscription - */ - private void deletePubsubSubscription(String subscription) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) != null) { - pubsubClient.projects().subscriptions().delete(subscription).execute(); - } - } - - /** - * Waits for the pipeline to finish and cancels it before the program exists. - */ - public void waitToFinish(PipelineResult result) { - pipelinesToCancel.add(result); - if (!options.as(ExampleOptions.class).getKeepJobsRunning()) { - addShutdownHook(pipelinesToCancel); - } - try { - result.waitUntilFinish(); - } catch (UnsupportedOperationException e) { - // Do nothing if the given PipelineResult doesn't support waitUntilFinish(), - // such as EvaluationResults returned by DirectRunner. 
- tearDown(); - printPendingMessages(); - } catch (Exception e) { - throw new RuntimeException("Failed to wait the pipeline until finish: " + result); - } - } - - private void addShutdownHook(final Collection pipelineResults) { - Runtime.getRuntime().addShutdownHook(new Thread() { - @Override - public void run() { - tearDown(); - printPendingMessages(); - for (PipelineResult pipelineResult : pipelineResults) { - try { - pipelineResult.cancel(); - } catch (IOException e) { - System.out.println("Failed to cancel the job."); - System.out.println(e.getMessage()); - } - } - - for (PipelineResult pipelineResult : pipelineResults) { - boolean cancellationVerified = false; - for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) { - if (pipelineResult.getState().isTerminal()) { - cancellationVerified = true; - break; - } else { - System.out.println( - "The example pipeline is still running. Verifying the cancellation."); - } - Uninterruptibles.sleepUninterruptibly(10, TimeUnit.SECONDS); - } - if (!cancellationVerified) { - System.out.println("Failed to verify the cancellation for job: " + pipelineResult); - } - } - } - }); - } - - private void printPendingMessages() { - System.out.println(); - System.out.println("***********************************************************"); - System.out.println("***********************************************************"); - for (String message : pendingMessages) { - System.out.println(message); - } - System.out.println("***********************************************************"); - System.out.println("***********************************************************"); - } - - private static T executeNullIfNotFound( - AbstractGoogleClientRequest request) throws IOException { - try { - return request.execute(); - } catch (GoogleJsonResponseException e) { - if (e.getStatusCode() == SC_NOT_FOUND) { - return null; - } else { - throw e; - } - } - } -} diff --git 
a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java deleted file mode 100644 index 9796d647b5..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.common; - -import static com.google.common.base.MoreObjects.firstNonNull; - -import javax.annotation.Nullable; -import org.apache.beam.sdk.io.FileBasedSink; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.PaneInfo; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; - -/** - * A {@link DoFn} that writes elements to files with names deterministically derived from the lower - * and upper bounds of their key (an {@link IntervalWindow}). - * - *

This is test utility code, not for end-users, so examples can be focused on their primary - * lessons. - */ -public class WriteOneFilePerWindow extends PTransform, PDone> { - private static final DateTimeFormatter FORMATTER = ISODateTimeFormat.hourMinute(); - private String filenamePrefix; - @Nullable - private Integer numShards; - - public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) { - this.filenamePrefix = filenamePrefix; - this.numShards = numShards; - } - - @Override - public PDone expand(PCollection input) { - ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); - TextIO.Write write = - TextIO.write() - .to(new PerWindowFiles(resource)) - .withTempDirectory(resource.getCurrentDirectory()) - .withWindowedWrites(); - if (numShards != null) { - write = write.withNumShards(numShards); - } - return input.apply(write); - } - - /** - * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data - * being written. This always includes the shard number and the total number of shards. For - * windowed writes, it also includes the window and pane index (a sequence number assigned to each - * trigger firing). - */ - public static class PerWindowFiles extends FilenamePolicy { - - private final ResourceId baseFilename; - - public PerWindowFiles(ResourceId baseFilename) { - this.baseFilename = baseFilename; - } - - public String filenamePrefixForWindow(IntervalWindow window) { - String prefix = - baseFilename.isDirectory() ? 
"" : firstNonNull(baseFilename.getFilename(), ""); - return String.format("%s-%s-%s", - prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end())); - } - - @Override - public ResourceId windowedFilename(int shardNumber, - int numShards, - BoundedWindow window, - PaneInfo paneInfo, - OutputFileHints outputFileHints) { - IntervalWindow intervalWindow = (IntervalWindow) window; - String filename = - String.format( - "%s-%s-of-%s%s", - filenamePrefixForWindow(intervalWindow), - shardNumber, - numShards, - outputFileHints.getSuggestedFilenameSuffix()); - return baseFilename - .getCurrentDirectory() - .resolve(filename, StandardResolveOptions.RESOLVE_FILE); - } - - @Override - public ResourceId unwindowedFilename( - int shardNumber, int numShards, OutputFileHints outputFileHints) { - throw new UnsupportedOperationException("Unsupported."); - } - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java deleted file mode 100644 index 26e1498d71..0000000000 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import com.google.common.io.Files; -import java.io.File; -import java.nio.charset.StandardCharsets; -import ${package}.DebuggingWordCount.WordCountOptions; -import org.apache.beam.sdk.testing.TestPipeline; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests for {@link DebuggingWordCount}. - */ -@RunWith(JUnit4.class) -public class DebuggingWordCountTest { - @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); - - private String getFilePath(String filePath) { - if (filePath.contains(":")) { - return filePath.replace("\\", "/").split(":")[1]; - } - return filePath; - } - - @Test - public void testDebuggingWordCount() throws Exception { - File inputFile = tmpFolder.newFile(); - File outputFile = tmpFolder.newFile(); - Files.write( - "stomach secret Flourish message Flourish here Flourish", - inputFile, - StandardCharsets.UTF_8); - WordCountOptions options = - TestPipeline.testingPipelineOptions().as(WordCountOptions.class); - options.setInputFile(getFilePath(inputFile.getAbsolutePath())); - options.setOutput(getFilePath(outputFile.getAbsolutePath())); - DebuggingWordCount.main(TestPipeline.convertToArgs(options)); - } -} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java deleted file mode 100644 index b4e4124e26..0000000000 --- 
a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/WordCountTest.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import java.util.Arrays; -import java.util.List; -import ${package}.WordCount.CountWords; -import ${package}.WordCount.ExtractWordsFn; -import ${package}.WordCount.FormatAsTextFn; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.ValidatesRunner; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.DoFnTester; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.values.PCollection; -import org.hamcrest.CoreMatchers; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests of WordCount. - */ -@RunWith(JUnit4.class) -public class WordCountTest { - - /** Example test that tests a specific {@link DoFn}. 
*/ - @Test - public void testExtractWordsFn() throws Exception { - DoFnTester extractWordsFn = - DoFnTester.of(new ExtractWordsFn()); - - Assert.assertThat(extractWordsFn.processBundle(" some input words "), - CoreMatchers.hasItems("some", "input", "words")); - Assert.assertThat(extractWordsFn.processBundle(" "), - CoreMatchers.hasItems()); - Assert.assertThat(extractWordsFn.processBundle(" some ", " input", " words"), - CoreMatchers.hasItems("some", "input", "words")); - } - - static final String[] WORDS_ARRAY = new String[] { - "hi there", "hi", "hi sue bob", - "hi sue", "", "bob hi"}; - - static final List WORDS = Arrays.asList(WORDS_ARRAY); - - static final String[] COUNTS_ARRAY = new String[] { - "hi: 5", "there: 1", "sue: 2", "bob: 2"}; - - @Rule - public TestPipeline p = TestPipeline.create(); - - /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ - @Test - @Category(ValidatesRunner.class) - public void testCountWords() throws Exception { - PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); - - PCollection output = input.apply(new CountWords()) - .apply(MapElements.via(new FormatAsTextFn())); - - PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY); - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties deleted file mode 100644 index b0195b3f16..0000000000 --- a/maven-archetypes/examples-java8/src/test/resources/projects/basic/archetype.properties +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. 
You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -package=it.pkg -version=0.1 -groupId=archetype.it -artifactId=basic -targetPlatform=1.8 diff --git a/maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt b/maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt deleted file mode 100644 index 0b5987362f..0000000000 --- a/maven-archetypes/examples-java8/src/test/resources/projects/basic/goal.txt +++ /dev/null @@ -1 +0,0 @@ -verify diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index cfe47d4d8a..5ff4872335 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -39,7 +39,7 @@ org.apache.maven.archetype archetype-packaging - 2.4 + ${archetype-packaging.version} @@ -47,12 +47,12 @@ maven-archetype-plugin - 2.4 + ${maven-archetype-plugin.version} org.apache.maven.shared maven-invoker - 2.2 + ${maven-invoker.version} diff --git a/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml index 2b9eb52d80..29f8605cce 100644 --- a/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml +++ b/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml @@ -22,7 +22,7 @@ - 1.7 + 1.8 diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 2c2b8d3fc6..dcbedafd76 100644 --- 
a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -32,13 +32,13 @@ @google-clients.version@ @guava.version@ @hamcrest.version@ - @jackson.version@ @joda.version@ @junit.version@ @maven-compiler-plugin.version@ - @maven-exec-plugin.version@ + @exec-maven-plugin.version@ @maven-jar-plugin.version@ @maven-shade-plugin.version@ + @mockito.version@ @pubsub.version@ @slf4j.version@ @surefire-plugin.version@ @@ -137,7 +137,7 @@ org.codehaus.mojo exec-maven-plugin - ${maven-exec-plugin.version} + ${exec-maven-plugin.version} false @@ -251,5 +251,12 @@ junit ${junit.version} + + + org.mockito + mockito-all + ${mockito.version} + test + diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java index d6b08066db..f1bd8bfaa8 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java @@ -17,18 +17,17 @@ */ package ${package}; -import ${package}.common.ExampleUtils; +import java.util.Arrays; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.Filter; +import org.apache.beam.sdk.transforms.FlatMapElements; import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; import org.apache.beam.sdk.values.KV; - +import org.apache.beam.sdk.values.TypeDescriptors; /** * An example that counts words in Shakespeare. 
@@ -59,16 +58,30 @@ public class MinimalWordCount { public static void main(String[] args) { + // Create a PipelineOptions object. This object lets us set various execution // options for our pipeline, such as the runner you wish to use. This example // will run with the DirectRunner by default, based on the class path configured // in its dependencies. PipelineOptions options = PipelineOptionsFactory.create(); - // Create the Pipeline object with the options we defined above. - Pipeline p = Pipeline.create(options); + // In order to run your pipeline, you need to make following runner specific changes: + // + // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner + // or FlinkRunner. + // CHANGE 2/3: Specify runner-required options. + // For BlockingDataflowRunner, set project and temp location as follows: + // DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class); + // dataflowOptions.setRunner(BlockingDataflowRunner.class); + // dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE"); + // dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY"); + // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions} + // for more details. + // options.as(FlinkPipelineOptions.class) + // .setRunner(FlinkRunner.class); - // Apply the pipeline's transforms. + // Create the Pipeline object with the options we defined above + Pipeline p = Pipeline.create(options); // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set // of input text files. TextIO.Read returns a PCollection where each element is one line from @@ -77,43 +90,30 @@ public static void main(String[] args) { // This example reads a public data set consisting of the complete works of Shakespeare. p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) - // Concept #2: Apply a ParDo transform to our PCollection of text lines. 
This ParDo invokes a - // DoFn (defined in-line) on each element that tokenizes the text line into individual words. - // The ParDo returns a PCollection, where each element is an individual word in - // Shakespeare's collected texts. - .apply("ExtractWords", ParDo.of(new DoFn() { - @ProcessElement - public void processElement(ProcessContext c) { - for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) { - if (!word.isEmpty()) { - c.output(word); - } - } - } - })) - - // Concept #3: Apply the Count transform to our PCollection of individual words. The Count - // transform returns a new PCollection of key/value pairs, where each key represents a unique - // word in the text. The associated value is the occurrence count for that word. - .apply(Count.perElement()) - - // Apply a MapElements transform that formats our PCollection of word counts into a printable - // string, suitable for writing to an output file. - .apply("FormatResults", MapElements.via(new SimpleFunction, String>() { - @Override - public String apply(KV input) { - return input.getKey() + ": " + input.getValue(); - } - })) - - // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline. - // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of - // formatted strings) to a series of text files. - // - // By default, it will write to a set of files with names like wordcount-00001-of-00005 - .apply(TextIO.write().to("wordcounts")); + // Concept #2: Apply a FlatMapElements transform to the PCollection of text lines. + // This transform splits the lines in PCollection, where each element is an + // individual word in Shakespeare's collected texts. 
+ .apply(FlatMapElements + .into(TypeDescriptors.strings()) + .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+")))) + // We use a Filter transform to drop empty words + .apply(Filter.by((String word) -> !word.isEmpty())) + // Concept #3: Apply the Count transform to our PCollection of individual words. The Count + // transform returns a new PCollection of key/value pairs, where each key represents a + // unique word in the text. The associated value is the occurrence count for that word. + .apply(Count.perElement()) + // Apply a MapElements transform that formats our PCollection of word counts into a + // printable string, suitable for writing to an output file. + .apply(MapElements + .into(TypeDescriptors.strings()) + .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())) + // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline. + // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of + // formatted strings) to a series of text files. + // + // By default, it will write to a set of files with names like wordcounts-00001-of-00005 + .apply(TextIO.write().to("wordcounts")); - // Run the pipeline. p.run().waitUntilFinish(); } } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java index 6a1d07c485..501ac27881 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java @@ -40,7 +40,6 @@ import org.joda.time.Duration; import org.joda.time.Instant; - /** * An example that counts words in text, and can run over either unbounded or bounded input * collections. @@ -98,7 +97,6 @@ public class WindowedWordCount { * 2-hour period. 
*/ static class AddTimestampFn extends DoFn { - private static final Duration RAND_RANGE = Duration.standardHours(1); private final Instant minTimestamp; private final Instant maxTimestamp; @@ -162,7 +160,7 @@ public interface Options extends WordCount.WordCountOptions, Long getMaxTimestampMillis(); void setMaxTimestampMillis(Long value); - @Description("Fixed number of shards to produce per window, or null for runner-chosen sharding") + @Description("Fixed number of shards to produce per window") Integer getNumShards(); void setNumShards(Integer numShards); } @@ -194,8 +192,7 @@ public static void main(String[] args) throws IOException { */ PCollection windowedWords = input.apply( - Window.into( - FixedWindows.of(Duration.standardMinutes(options.getWindowSize())))); + Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize())))); /** * Concept #4: Re-use our existing CountWords transform that does not have knowledge of diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java index 9947a26eda..33f7b39f19 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -137,8 +137,7 @@ public PCollection> expand(PCollection lines) { ParDo.of(new ExtractWordsFn())); // Count the number of times each word occurs. 
- PCollection> wordCounts = - words.apply(Count.perElement()); + PCollection> wordCounts = words.apply(Count.perElement()); return wordCounts; } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java index 78f3849b40..e1159b9018 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java @@ -346,38 +346,39 @@ public void waitToFinish(PipelineResult result) { } private void addShutdownHook(final Collection pipelineResults) { - Runtime.getRuntime().addShutdownHook(new Thread() { - @Override - public void run() { - tearDown(); - printPendingMessages(); - for (PipelineResult pipelineResult : pipelineResults) { - try { - pipelineResult.cancel(); - } catch (IOException e) { - System.out.println("Failed to cancel the job."); - System.out.println(e.getMessage()); - } - } + Runtime.getRuntime() + .addShutdownHook( + new Thread( + () -> { + tearDown(); + printPendingMessages(); + for (PipelineResult pipelineResult : pipelineResults) { + try { + pipelineResult.cancel(); + } catch (IOException e) { + System.out.println("Failed to cancel the job."); + System.out.println(e.getMessage()); + } + } - for (PipelineResult pipelineResult : pipelineResults) { - boolean cancellationVerified = false; - for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) { - if (pipelineResult.getState().isTerminal()) { - cancellationVerified = true; - break; - } else { - System.out.println( - "The example pipeline is still running. 
Verifying the cancellation."); - } - Uninterruptibles.sleepUninterruptibly(10, TimeUnit.SECONDS); - } - if (!cancellationVerified) { - System.out.println("Failed to verify the cancellation for job: " + pipelineResult); - } - } - } - }); + for (PipelineResult pipelineResult : pipelineResults) { + boolean cancellationVerified = false; + for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) { + if (pipelineResult.getState().isTerminal()) { + cancellationVerified = true; + break; + } else { + System.out.println( + "The example pipeline is still running. Verifying the cancellation."); + } + Uninterruptibles.sleepUninterruptibly(10, TimeUnit.SECONDS); + } + if (!cancellationVerified) { + System.out.println( + "Failed to verify the cancellation for job: " + pipelineResult); + } + } + })); } private void printPendingMessages() { diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java similarity index 68% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java index a286811293..3cb04bd2e4 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java @@ -19,8 +19,8 @@ import java.util.HashMap; import java.util.Map; -import java.util.TimeZone; import ${package}.common.ExampleUtils; +import ${package}.complete.game.utils.GameConstants; import ${package}.complete.game.utils.WriteWindowedToBigQuery; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; @@ -50,11 +50,8 @@ import 
org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TypeDescriptors; -import org.joda.time.DateTimeZone; import org.joda.time.Duration; import org.joda.time.Instant; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -93,14 +90,8 @@ */ public class GameStats extends LeaderBoard { - private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; - - private static DateTimeFormatter fmt = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); - /** - * Filter out all but those users with a high clickrate, which we will consider as 'spammy' uesrs. + * Filter out all users but those with a high clickrate, which we will consider as 'spammy' users. * We do this by finding the mean total score per user, then using that information as a side * input to filter out all but those user scores that are larger than * {@code (mean * SCORE_WEIGHT)}. @@ -115,12 +106,12 @@ public static class CalculateSpammyUsers public PCollection> expand(PCollection> userScores) { // Get the sum of scores for each user. - PCollection> sumScores = userScores - .apply("UserSum", Sum.integersPerKey()); + PCollection> sumScores = + userScores.apply("UserSum", Sum.integersPerKey()); // Extract the score from each element, and use it to find the global mean. - final PCollectionView globalMeanScore = sumScores.apply(Values.create()) - .apply(Mean.globally().asSingletonView()); + final PCollectionView globalMeanScore = + sumScores.apply(Values.create()).apply(Mean.globally().asSingletonView()); // Filter the user sums using the global mean. 
PCollection> filtered = sumScores @@ -193,27 +184,24 @@ interface Options extends LeaderBoard.Options { protected static Map>> configureWindowedWrite() { Map>> tableConfigure = - new HashMap>>(); + new HashMap<>(); tableConfigure.put( - "team", - new WriteWindowedToBigQuery.FieldInfo>( - "STRING", (c, w) -> c.element().getKey())); + "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); tableConfigure.put( "total_score", - new WriteWindowedToBigQuery.FieldInfo>( - "INTEGER", (c, w) -> c.element().getValue())); + new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); tableConfigure.put( "window_start", - new WriteWindowedToBigQuery.FieldInfo>( + new WriteWindowedToBigQuery.FieldInfo<>( "STRING", (c, w) -> { IntervalWindow window = (IntervalWindow) w; - return fmt.print(window.start()); + return GameConstants.DATE_TIME_FORMATTER.print(window.start()); })); tableConfigure.put( "processing_time", - new WriteWindowedToBigQuery.FieldInfo>( - "STRING", (c, w) -> fmt.print(Instant.now()))); + new WriteWindowedToBigQuery.FieldInfo<>( + "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); return tableConfigure; } @@ -224,19 +212,17 @@ interface Options extends LeaderBoard.Options { protected static Map> configureSessionWindowWrite() { - Map> tableConfigure = - new HashMap>(); + Map> tableConfigure = new HashMap<>(); tableConfigure.put( "window_start", - new WriteWindowedToBigQuery.FieldInfo( + new WriteWindowedToBigQuery.FieldInfo<>( "STRING", (c, w) -> { IntervalWindow window = (IntervalWindow) w; - return fmt.print(window.start()); + return GameConstants.DATE_TIME_FORMATTER.print(window.start()); })); tableConfigure.put( - "mean_duration", - new WriteWindowedToBigQuery.FieldInfo("FLOAT", (c, w) -> c.element())); + "mean_duration", new WriteWindowedToBigQuery.FieldInfo<>("FLOAT", (c, w) -> c.element())); return tableConfigure; } @@ -253,7 +239,8 @@ public static void main(String[] 
args) throws Exception { // Read Events from Pub/Sub using custom timestamps PCollection rawEvents = pipeline .apply(PubsubIO.readStrings() - .withTimestampAttribute(TIMESTAMP_ATTRIBUTE).fromTopic(options.getTopic())) + .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) + .fromTopic(options.getTopic())) .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); // Extract username/score pairs from the event stream @@ -265,16 +252,19 @@ public static void main(String[] args) throws Exception { // Calculate the total score per user over fixed windows, and // cumulative updates for late data. - final PCollectionView> spammersView = userEvents - .apply("FixedWindowsUser", Window.>into( - FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) - - // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. - // These might be robots/spammers. - .apply("CalculateSpammyUsers", new CalculateSpammyUsers()) - // Derive a view from the collection of spammer users. It will be used as a side input - // in calculating the team score sums, below. - .apply("CreateSpammersView", View.asMap()); + final PCollectionView> spammersView = + userEvents + .apply( + "FixedWindowsUser", + Window.into( + FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) + + // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. + // These might be robots/spammers. + .apply("CalculateSpammyUsers", new CalculateSpammyUsers()) + // Derive a view from the collection of spammer users. It will be used as a side input + // in calculating the team score sums, below. + .apply("CreateSpammersView", View.asMap()); // [START DocInclude_FilterAndCalc] // Calculate the total score per team over fixed windows, @@ -282,29 +272,35 @@ public static void main(String[] args) throws Exception { // suspected robots-- to filter out scores from those users from the sum. // Write the results to BigQuery. 
rawEvents - .apply("WindowIntoFixedWindows", Window.into( - FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) - // Filter out the detected spammer users, using the side input derived above. - .apply("FilterOutSpammers", ParDo - .of(new DoFn() { - @ProcessElement - public void processElement(ProcessContext c) { - // If the user is not in the spammers Map, output the data element. - if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) { - c.output(c.element()); - } - } - }).withSideInputs(spammersView)) + .apply( + "WindowIntoFixedWindows", + Window.into( + FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) + // Filter out the detected spammer users, using the side input derived above. + .apply( + "FilterOutSpammers", + ParDo.of( + new DoFn() { + @ProcessElement + public void processElement(ProcessContext c) { + // If the user is not in the spammers Map, output the data element. + if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) { + c.output(c.element()); + } + } + }) + .withSideInputs(spammersView)) // Extract and sum teamname/score pairs from the event data. 
- .apply("ExtractTeamScore", new ExtractAndSumScore("team")) - // [END DocInclude_FilterAndCalc] - // Write the result to BigQuery - .apply("WriteTeamSums", - new WriteWindowedToBigQuery>( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getGameStatsTablePrefix() + "_team", configureWindowedWrite())); - + .apply("ExtractTeamScore", new ExtractAndSumScore("team")) + // [END DocInclude_FilterAndCalc] + // Write the result to BigQuery + .apply( + "WriteTeamSums", + new WriteWindowedToBigQuery<>( + options.as(GcpOptions.class).getProject(), + options.getDataset(), + options.getGameStatsTablePrefix() + "_team", + configureWindowedWrite())); // [START DocInclude_SessionCalc] // Detect user sessions-- that is, a burst of activity separated by a gap from further @@ -312,27 +308,33 @@ public void processElement(ProcessContext c) { // This information could help the game designers track the changing user engagement // as their set of games changes. userEvents - .apply("WindowIntoSessions", Window.>into( - Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))) - .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW)) - // For this use, we care only about the existence of the session, not any particular - // information aggregated over it, so the following is an efficient way to do that. - .apply(Combine.perKey(x -> 0)) - // Get the duration per session. - .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) - // [END DocInclude_SessionCalc] - // [START DocInclude_Rewindow] - // Re-window to process groups of session sums according to when the sessions complete. - .apply("WindowToExtractSessionMean", Window.into( - FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration())))) - // Find the mean session duration in each window. - .apply(Mean.globally().withoutDefaults()) - // Write this info to a BigQuery table. 
- .apply("WriteAvgSessionLength", - new WriteWindowedToBigQuery( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getGameStatsTablePrefix() + "_sessions", configureSessionWindowWrite())); + .apply( + "WindowIntoSessions", + Window.>into( + Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))) + .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW)) + // For this use, we care only about the existence of the session, not any particular + // information aggregated over it, so the following is an efficient way to do that. + .apply(Combine.perKey(x -> 0)) + // Get the duration per session. + .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) + // [END DocInclude_SessionCalc] + // [START DocInclude_Rewindow] + // Re-window to process groups of session sums according to when the sessions complete. + .apply( + "WindowToExtractSessionMean", + Window.into( + FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration())))) + // Find the mean session duration in each window. + .apply(Mean.globally().withoutDefaults()) + // Write this info to a BigQuery table. 
+ .apply( + "WriteAvgSessionLength", + new WriteWindowedToBigQuery<>( + options.as(GcpOptions.class).getProject(), + options.getDataset(), + options.getGameStatsTablePrefix() + "_sessions", + configureSessionWindowWrite())); // [END DocInclude_Rewindow] diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java similarity index 75% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java index e60af492e4..fe1fe99da7 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java @@ -20,6 +20,7 @@ import java.util.HashMap; import java.util.Map; import java.util.TimeZone; +import ${package}.complete.game.utils.GameConstants; import ${package}.complete.game.utils.WriteToText; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.TextIO; @@ -73,9 +74,6 @@ */ public class HourlyTeamScore extends UserScore { - private static DateTimeFormatter fmt = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); private static DateTimeFormatter minFmt = DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm") .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); @@ -113,15 +111,14 @@ interface Options extends UserScore.Options { */ protected static Map>> configureOutput() { - Map>> config = - new HashMap>>(); + Map>> config = new HashMap<>(); config.put("team", (c, w) -> c.element().getKey()); config.put("total_score", (c, w) -> 
c.element().getValue()); config.put( "window_start", (c, w) -> { IntervalWindow window = (IntervalWindow) w; - return fmt.print(window.start()); + return GameConstants.DATE_TIME_FORMATTER.print(window.start()); }); return config; } @@ -140,40 +137,43 @@ public static void main(String[] args) throws Exception { final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin())); // Read 'gaming' events from a text file. - pipeline.apply(TextIO.read().from(options.getInput())) - // Parse the incoming data. - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) - - // Filter out data before and after the given times so that it is not included - // in the calculations. As we collect data in batches (say, by day), the batch for the day - // that we want to analyze could potentially include some late-arriving data from the previous - // day. If so, we want to weed it out. Similarly, if we include data from the following day - // (to scoop up late-arriving events from the day we're analyzing), we need to weed out events - // that fall after the time period we want to analyze. - // [START DocInclude_HTSFilters] - .apply("FilterStartTime", Filter.by( - (GameActionInfo gInfo) - -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) - .apply("FilterEndTime", Filter.by( - (GameActionInfo gInfo) - -> gInfo.getTimestamp() < stopMinTimestamp.getMillis())) - // [END DocInclude_HTSFilters] - - // [START DocInclude_HTSAddTsAndWindow] - // Add an element timestamp based on the event log, and apply fixed windowing. - .apply("AddEventTimestamps", - WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp()))) - .apply("FixedWindowsTeam", Window.into( - FixedWindows.of(Duration.standardMinutes(options.getWindowDuration())))) - // [END DocInclude_HTSAddTsAndWindow] - - // Extract and sum teamname/score pairs from the event data. 
- .apply("ExtractTeamScore", new ExtractAndSumScore("team")) - .apply("WriteTeamScoreSums", - new WriteToText>( - options.getOutput(), - configureOutput(), - true)); + pipeline + .apply(TextIO.read().from(options.getInput())) + // Parse the incoming data. + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) + + // Filter out data before and after the given times so that it is not included + // in the calculations. As we collect data in batches (say, by day), the batch for the day + // that we want to analyze could potentially include some late-arriving data from the + // previous day. + // If so, we want to weed it out. Similarly, if we include data from the following day + // (to scoop up late-arriving events from the day we're analyzing), we need to weed out + // events that fall after the time period we want to analyze. + // [START DocInclude_HTSFilters] + .apply( + "FilterStartTime", + Filter.by( + (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) + .apply( + "FilterEndTime", + Filter.by( + (GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis())) + // [END DocInclude_HTSFilters] + + // [START DocInclude_HTSAddTsAndWindow] + // Add an element timestamp based on the event log, and apply fixed windowing. + .apply( + "AddEventTimestamps", + WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp()))) + .apply( + "FixedWindowsTeam", + Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowDuration())))) + // [END DocInclude_HTSAddTsAndWindow] + + // Extract and sum teamname/score pairs from the event data. 
+ .apply("ExtractTeamScore", new ExtractAndSumScore("team")) + .apply( + "WriteTeamScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), true)); pipeline.run().waitUntilFinish(); } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java similarity index 87% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java index 4f0ee28128..ae32637e15 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java @@ -20,9 +20,9 @@ import com.google.common.annotations.VisibleForTesting; import java.util.HashMap; import java.util.Map; -import java.util.TimeZone; import ${package}.common.ExampleOptions; import ${package}.common.ExampleUtils; +import ${package}.complete.game.utils.GameConstants; import ${package}.complete.game.utils.WriteToBigQuery; import ${package}.complete.game.utils.WriteWindowedToBigQuery; import org.apache.beam.sdk.Pipeline; @@ -45,11 +45,8 @@ import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; -import org.joda.time.DateTimeZone; import org.joda.time.Duration; import org.joda.time.Instant; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; /** * This class is the third in a series of four pipelines that tell a story in a 'gaming' domain, @@ -92,11 +89,6 @@ */ public class LeaderBoard extends HourlyTeamScore { - private static final String TIMESTAMP_ATTRIBUTE = 
"timestamp_ms"; - - private static DateTimeFormatter fmt = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); static final Duration FIVE_MINUTES = Duration.standardMinutes(5); static final Duration TEN_MINUTES = Duration.standardMinutes(10); @@ -140,30 +132,27 @@ interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOpti configureWindowedTableWrite() { Map>> tableConfigure = - new HashMap>>(); + new HashMap<>(); tableConfigure.put( - "team", - new WriteWindowedToBigQuery.FieldInfo>( - "STRING", (c, w) -> c.element().getKey())); + "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); tableConfigure.put( "total_score", - new WriteWindowedToBigQuery.FieldInfo>( - "INTEGER", (c, w) -> c.element().getValue())); + new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); tableConfigure.put( "window_start", - new WriteWindowedToBigQuery.FieldInfo>( + new WriteWindowedToBigQuery.FieldInfo<>( "STRING", (c, w) -> { IntervalWindow window = (IntervalWindow) w; - return fmt.print(window.start()); + return GameConstants.DATE_TIME_FORMATTER.print(window.start()); })); tableConfigure.put( "processing_time", - new WriteWindowedToBigQuery.FieldInfo>( - "STRING", (c, w) -> fmt.print(Instant.now()))); + new WriteWindowedToBigQuery.FieldInfo<>( + "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); tableConfigure.put( "timing", - new WriteWindowedToBigQuery.FieldInfo>( + new WriteWindowedToBigQuery.FieldInfo<>( "STRING", (c, w) -> c.pane().getTiming().toString())); return tableConfigure; } @@ -175,16 +164,12 @@ interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOpti */ protected static Map>> configureBigQueryWrite() { - Map>> tableConfigure = - new HashMap>>(); + Map>> tableConfigure = new HashMap<>(); tableConfigure.put( - "user", - new WriteToBigQuery.FieldInfo>( - "STRING", 
(c, w) -> c.element().getKey())); + "user", new WriteToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); tableConfigure.put( "total_score", - new WriteToBigQuery.FieldInfo>( - "INTEGER", (c, w) -> c.element().getValue())); + new WriteToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); return tableConfigure; } @@ -200,8 +185,8 @@ interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOpti configureBigQueryWrite(); tableConfigure.put( "processing_time", - new WriteToBigQuery.FieldInfo>( - "STRING", (c, w) -> fmt.print(Instant.now()))); + new WriteToBigQuery.FieldInfo<>( + "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); return tableConfigure; } @@ -218,7 +203,8 @@ public static void main(String[] args) throws Exception { // data elements, and parse the data. PCollection gameEvents = pipeline .apply(PubsubIO.readStrings() - .withTimestampAttribute(TIMESTAMP_ATTRIBUTE).fromTopic(options.getTopic())) + .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) + .fromTopic(options.getTopic())) .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); gameEvents @@ -230,7 +216,7 @@ public static void main(String[] args) throws Exception { // Write the results to BigQuery. .apply( "WriteTeamScoreSums", - new WriteWindowedToBigQuery>( + new WriteWindowedToBigQuery<>( options.as(GcpOptions.class).getProject(), options.getDataset(), options.getLeaderBoardTableName() + "_team", @@ -242,7 +228,7 @@ public static void main(String[] args) throws Exception { // Write the results to BigQuery. 
.apply( "WriteUserScoreSums", - new WriteToBigQuery>( + new WriteToBigQuery<>( options.as(GcpOptions.class).getProject(), options.getDataset(), options.getLeaderBoardTableName() + "_user", diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java new file mode 100644 index 0000000000..c0a7bc8e17 --- /dev/null +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package ${package}.complete.game; + +import static com.google.common.base.MoreObjects.firstNonNull; + +import com.google.common.annotations.VisibleForTesting; +import java.util.HashMap; +import java.util.Map; +import ${package}.common.ExampleUtils; +import ${package}.complete.game.utils.GameConstants; +import ${package}.complete.game.utils.WriteToBigQuery.FieldInfo; +import ${package}.complete.game.utils.WriteWindowedToBigQuery; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.VarIntCoder; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.state.StateSpec; +import org.apache.beam.sdk.state.StateSpecs; +import org.apache.beam.sdk.state.ValueState; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.sdk.values.TypeDescriptors; +import org.joda.time.Instant; + +/** + * This class is part of a series of pipelines that tell a story in a gaming domain. Concepts + * include: stateful processing. + * + *

This pipeline processes an unbounded stream of 'game events'. It uses stateful processing to + * aggregate team scores per team and outputs team name and its total score every time the team + * passes a new multiple of a threshold score. For example, multiples of the threshold could be the + * corresponding scores required to pass each level of the game. By default, this threshold is set + * to 5000. + * + *

Stateful processing allows us to write pipelines that output based on a runtime state (when + * a team reaches a certain score, every 100 game events, etc.) without time triggers. See + * https://beam.apache.org/blog/2017/02/13/stateful-processing.html for more information on using + * stateful processing. + * + *

Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector + * documentation provides more detail on how to do this. + * + *

To execute this pipeline, specify the pipeline configuration like this: + *

{@code
+ *   --project=YOUR_PROJECT_ID
+ *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
+ *   --runner=YOUR_RUNNER
+ *   --dataset=YOUR-DATASET
+ *   --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
+ * }
+ * 
+ * + *

The BigQuery dataset you specify must already exist. The PubSub topic you specify should be + * the same topic to which the Injector is publishing. + */ +public class StatefulTeamScore extends LeaderBoard { + + /** + * Options supported by {@link StatefulTeamScore}. + */ + interface Options extends LeaderBoard.Options { + + @Description("Numeric value, multiple of which is used as threshold for outputting team score.") + @Default.Integer(5000) + Integer getThresholdScore(); + + void setThresholdScore(Integer value); + } + + /** + * Create a map of information that describes how to write pipeline output to BigQuery. This map + * is used to write team score sums. + */ + private static Map>> configureCompleteWindowedTableWrite() { + + Map>> tableConfigure = + new HashMap<>(); + tableConfigure.put( + "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); + tableConfigure.put( + "total_score", + new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); + tableConfigure.put( + "processing_time", + new WriteWindowedToBigQuery.FieldInfo<>( + "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); + return tableConfigure; + } + + + public static void main(String[] args) throws Exception { + + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + // Enforce that this pipeline is always run in streaming mode. + options.setStreaming(true); + ExampleUtils exampleUtils = new ExampleUtils(options); + Pipeline pipeline = Pipeline.create(options); + + pipeline + // Read game events from Pub/Sub using custom timestamps, which are extracted from the + // pubsub data elements, and parse the data. + .apply( + PubsubIO.readStrings() + .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) + .fromTopic(options.getTopic())) + .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) + // Create mapping. UpdateTeamScore uses team name as key. 
+ .apply( + "MapTeamAsKey", + MapElements.into( + TypeDescriptors.kvs( + TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class))) + .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo))) + // Outputs a team's score every time it passes a new multiple of the threshold. + .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore()))) + // Write the results to BigQuery. + .apply( + "WriteTeamLeaders", + new WriteWindowedToBigQuery<>( + options.as(GcpOptions.class).getProject(), + options.getDataset(), + options.getLeaderBoardTableName() + "_team_leader", + configureCompleteWindowedTableWrite())); + + // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the + // command line. + PipelineResult result = pipeline.run(); + exampleUtils.waitToFinish(result); + } + + /** + * Tracks each team's score separately in a single state cell and outputs the score every time it + * passes a new multiple of a threshold. + * + *

We use stateful {@link DoFn} because: + *

    + *
  • State is key-partitioned. Therefore, the score is calculated per team.
  • + *
  • Stateful {@link DoFn} can determine when to output based on the state. This only allows + * outputting when a team's score passes a given threshold.
  • + *
+ */ + @VisibleForTesting + public static class UpdateTeamScoreFn + extends DoFn, KV> { + + private static final String TOTAL_SCORE = "totalScore"; + private final int thresholdScore; + + public UpdateTeamScoreFn(int thresholdScore) { + this.thresholdScore = thresholdScore; + } + + /** + * Describes the state for storing team score. Let's break down this statement. + * + * {@link StateSpec} configures the state cell, which is provided by a runner during pipeline + * execution. + * + * {@link org.apache.beam.sdk.transforms.DoFn.StateId} annotation assigns an identifier to the + * state, which is used to refer the state in + * {@link org.apache.beam.sdk.transforms.DoFn.ProcessElement}. + * + *

A {@link ValueState} stores a single value per key and per window. Because our pipeline is + * globally windowed in this example, this {@link ValueState} is just key partitioned, with one + * score per team. Any other class that extends {@link org.apache.beam.sdk.state.State} can be + * used.

+ * + *

In order to store the value, the state must be encoded. Therefore, we provide a coder, in + * this case the {@link VarIntCoder}. If the coder is not provided as in + * {@code StateSpecs.value()}, Beam's coder inference will try to provide a coder automatically. + *

+ */ + @StateId(TOTAL_SCORE) + private final StateSpec> totalScoreSpec = + StateSpecs.value(VarIntCoder.of()); + + /** + * To use a state cell, annotate a parameter with + * {@link org.apache.beam.sdk.transforms.DoFn.StateId} that matches the state declaration. The + * type of the parameter should match the {@link StateSpec} type. + */ + @ProcessElement + public void processElement( + ProcessContext c, + @StateId(TOTAL_SCORE) ValueState totalScore) { + String teamName = c.element().getKey(); + GameActionInfo gInfo = c.element().getValue(); + + // ValueState cells do not contain a default value. If the state is possibly not written, make + // sure to check for null on read. + int oldTotalScore = firstNonNull(totalScore.read(), 0); + totalScore.write(oldTotalScore + gInfo.score); + + // Since there are no negative scores, the easiest way to check whether a team just passed a + // new multiple of the threshold score is to compare the quotients of dividing total scores by + // threshold before and after this aggregation. For example, if the total score was 1999, + // the new total is 2002, and the threshold is 1000, 1999 / 1000 = 1, 2002 / 1000 = 2. + // Therefore, this team passed the threshold. 
+ if (oldTotalScore / this.thresholdScore < totalScore.read() / this.thresholdScore) { + c.output(KV.of(teamName, totalScore.read())); + } + } + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java similarity index 94% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java index c693614c57..f7aa8ff8c0 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java @@ -162,10 +162,11 @@ public PCollection> expand( PCollection gameInfo) { return gameInfo - .apply(MapElements - .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) - .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))) - .apply(Sum.integersPerKey()); + .apply( + MapElements.into( + TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) + .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))) + .apply(Sum.integersPerKey()); } } // [END DocInclude_USExtractXform] @@ -196,8 +197,7 @@ public interface Options extends PipelineOptions { */ protected static Map>> configureOutput() { - Map>> config = - new HashMap>>(); + Map>> config = new HashMap<>(); config.put("user", (c, w) -> c.element().getKey()); config.put("total_score", (c, w) -> c.element().getValue()); return config; @@ -219,11 +219,7 @@ public static void main(String[] args) throws Exception { // Extract and sum username/score pairs from the event data. 
.apply("ExtractUserScore", new ExtractAndSumScore("user")) .apply( - "WriteUserScoreSums", - new WriteToText>( - options.getOutput(), - configureOutput(), - false)); + "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false)); // Run the batch pipeline. pipeline.run().waitUntilFinish(); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java similarity index 89% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java index 980966e0ce..952cb6fc34 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java @@ -30,11 +30,7 @@ import java.util.Arrays; import java.util.List; import java.util.Random; -import java.util.TimeZone; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; - +import ${package}.complete.game.utils.GameConstants; /** * This is a generator that simulates usage data from a mobile game, and either publishes the data @@ -86,7 +82,6 @@ class Injector { private static Random random = new Random(); private static String topic; private static String project; - private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; // QPS ranges from 800 to 1000. private static final int MIN_QPS = 800; @@ -96,25 +91,56 @@ class Injector { // Lists used to generate random team names. 
private static final ArrayList COLORS = - new ArrayList(Arrays.asList( - "Magenta", "AliceBlue", "Almond", "Amaranth", "Amber", - "Amethyst", "AndroidGreen", "AntiqueBrass", "Fuchsia", "Ruby", "AppleGreen", - "Apricot", "Aqua", "ArmyGreen", "Asparagus", "Auburn", "Azure", "Banana", - "Beige", "Bisque", "BarnRed", "BattleshipGrey")); + new ArrayList<>( + Arrays.asList( + "Magenta", + "AliceBlue", + "Almond", + "Amaranth", + "Amber", + "Amethyst", + "AndroidGreen", + "AntiqueBrass", + "Fuchsia", + "Ruby", + "AppleGreen", + "Apricot", + "Aqua", + "ArmyGreen", + "Asparagus", + "Auburn", + "Azure", + "Banana", + "Beige", + "Bisque", + "BarnRed", + "BattleshipGrey")); private static final ArrayList ANIMALS = - new ArrayList(Arrays.asList( - "Echidna", "Koala", "Wombat", "Marmot", "Quokka", "Kangaroo", "Dingo", "Numbat", "Emu", - "Wallaby", "CaneToad", "Bilby", "Possum", "Cassowary", "Kookaburra", "Platypus", - "Bandicoot", "Cockatoo", "Antechinus")); + new ArrayList<>( + Arrays.asList( + "Echidna", + "Koala", + "Wombat", + "Marmot", + "Quokka", + "Kangaroo", + "Dingo", + "Numbat", + "Emu", + "Wallaby", + "CaneToad", + "Bilby", + "Possum", + "Cassowary", + "Kookaburra", + "Platypus", + "Bandicoot", + "Cockatoo", + "Antechinus")); // The list of live teams. - private static ArrayList liveTeams = new ArrayList(); - - private static DateTimeFormatter fmt = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); - + private static ArrayList liveTeams = new ArrayList<>(); // The total number of robots in the system. private static final int NUM_ROBOTS = 20; @@ -275,7 +301,7 @@ private static String addTimeInfoToEvent(String message, Long currTime, int dela String eventTimeString = Long.toString((currTime - delayInMillis) / 1000 * 1000); // Add a (redundant) 'human-readable' date string to make the data semantics more clear. 
- String dateString = fmt.print(currTime); + String dateString = GameConstants.DATE_TIME_FORMATTER.print(currTime); message = message + "," + eventTimeString + "," + dateString; return message; } @@ -294,7 +320,7 @@ public static void publishData(int numMessages, int delayInMillis) PubsubMessage pubsubMessage = new PubsubMessage() .encodeData(message.getBytes("UTF-8")); pubsubMessage.setAttributes( - ImmutableMap.of(TIMESTAMP_ATTRIBUTE, + ImmutableMap.of(GameConstants.TIMESTAMP_ATTRIBUTE, Long.toString((currTime - delayInMillis) / 1000 * 1000))); if (delayInMillis != 0) { System.out.println(pubsubMessage.getAttributes()); @@ -394,16 +420,15 @@ public static void main(String[] args) throws IOException, InterruptedException publishDataToFile(fileName, numMessages, delayInMillis); } else { // Write to PubSub. // Start a thread to inject some data. - new Thread(){ - @Override - public void run() { - try { - publishData(numMessages, delayInMillis); - } catch (IOException e) { - System.err.println(e); - } - } - }.start(); + new Thread( + () -> { + try { + publishData(numMessages, delayInMillis); + } catch (IOException e) { + System.err.println(e); + } + }) + .start(); } // Wait before creating another injector thread. 
diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java similarity index 100% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java similarity index 71% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java index 5d0cc68763..e90fbcc18e 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java @@ -24,11 +24,9 @@ import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler; import com.google.api.client.http.HttpRequest; import com.google.api.client.http.HttpRequestInitializer; -import com.google.api.client.http.HttpResponse; import com.google.api.client.http.HttpUnsuccessfulResponseHandler; import com.google.api.client.util.ExponentialBackOff; import com.google.api.client.util.Sleeper; -import java.io.IOException; import java.util.logging.Logger; /** @@ -96,32 +94,20 @@ public final 
void initialize(final HttpRequest request) { new ExponentialBackOff()) .setSleeper(sleeper); request.setInterceptor(wrappedCredential); - request.setUnsuccessfulResponseHandler( - new HttpUnsuccessfulResponseHandler() { - @Override - public boolean handleResponse( - final HttpRequest request, - final HttpResponse response, - final boolean supportsRetry) throws IOException { - if (wrappedCredential.handleResponse( - request, response, supportsRetry)) { - // If credential decides it can handle it, - // the return code or message indicated - // something specific to authentication, - // and no backoff is desired. - return true; - } else if (backoffHandler.handleResponse( - request, response, supportsRetry)) { - // Otherwise, we defer to the judgement of - // our internal backoff handler. - LOG.info("Retrying " - + request.getUrl().toString()); - return true; - } else { - return false; - } - } - }); + request.setUnsuccessfulResponseHandler( + (request1, response, supportsRetry) -> { + if (wrappedCredential.handleResponse(request1, response, supportsRetry)) { + // If credential decides it can handle it, the return code or message indicated + // something specific to authentication, and no backoff is desired. + return true; + } else if (backoffHandler.handleResponse(request1, response, supportsRetry)) { + // Otherwise, we defer to the judgement of our internal backoff handler. 
+ LOG.info("Retrying " + request1.getUrl().toString()); + return true; + } else { + return false; + } + }); request.setIOExceptionHandler( new HttpBackOffIOExceptionHandler(new ExponentialBackOff()) .setSleeper(sleeper)); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java similarity index 55% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java index 90f935c3ce..93da132690 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java @@ -15,23 +15,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package ${package}.common; +package ${package}.complete.game.utils; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; +import java.util.TimeZone; +import org.joda.time.DateTimeZone; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; /** - * Options that can be used to configure the Beam examples. + * Shared constants between game series classes. 
*/ -public interface ExampleOptions extends PipelineOptions { - @Description("Whether to keep jobs running after local process exit") - @Default.Boolean(false) - boolean getKeepJobsRunning(); - void setKeepJobsRunning(boolean keepJobsRunning); +public class GameConstants { - @Description("Number of workers to use when executing the injector pipeline") - @Default.Integer(1) - int getInjectorNumWorkers(); - void setInjectorNumWorkers(int numWorkers); + public static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; + + public static final DateTimeFormatter DATE_TIME_FORMATTER = + DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java similarity index 100% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java similarity index 99% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java index dbd5e39977..45135fb059 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java +++ 
b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java @@ -83,7 +83,7 @@ protected class BuildRowFn extends DoFn { @ProcessElement public void processElement(ProcessContext c, BoundedWindow window) { - List fields = new ArrayList(); + List fields = new ArrayList<>(); for (Map.Entry> entry : fieldFn.entrySet()) { String key = entry.getKey(); FieldFn fcn = entry.getValue(); diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java similarity index 100% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java similarity index 61% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java index af347c1c0a..f4c8b160d7 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/MinimalWordCountJava8Test.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java @@ -21,11 +21,9 @@ import java.io.IOException; import java.io.Serializable; import java.nio.channels.FileChannel; -import java.nio.channels.SeekableByteChannel; import java.nio.file.Files; import 
java.nio.file.StandardOpenOption; import java.util.Arrays; -import java.util.List; import org.apache.beam.sdk.extensions.gcp.options.GcsOptions; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.testing.TestPipeline; @@ -42,15 +40,13 @@ import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.mockito.Mockito; -import org.mockito.invocation.InvocationOnMock; -import org.mockito.stubbing.Answer; /** - * To keep {@link MinimalWordCountJava8} simple, it is not factored or testable. This test + * To keep {@link MinimalWordCount} simple, it is not factored or testable. This test * file should be maintained with a copy of its code for a basic smoke test. */ @RunWith(JUnit4.class) -public class MinimalWordCountJava8Test implements Serializable { +public class MinimalWordCountTest implements Serializable { @Rule public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); @@ -59,46 +55,39 @@ public class MinimalWordCountJava8Test implements Serializable { * A basic smoke test that ensures there is no crash at pipeline construction time. 
*/ @Test - public void testMinimalWordCountJava8() throws Exception { + public void testMinimalWordCount() throws Exception { p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil()); p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) - .apply(FlatMapElements - .into(TypeDescriptors.strings()) - .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))) - .apply(Filter.by((String word) -> !word.isEmpty())) - .apply(Count.perElement()) - .apply(MapElements - .into(TypeDescriptors.strings()) - .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())) - .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix")); + .apply( + FlatMapElements.into(TypeDescriptors.strings()) + .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))) + .apply(Filter.by((String word) -> !word.isEmpty())) + .apply(Count.perElement()) + .apply( + MapElements.into(TypeDescriptors.strings()) + .via( + (KV wordCount) -> + wordCount.getKey() + ": " + wordCount.getValue())) + .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix")); } private GcsUtil buildMockGcsUtil() throws IOException { GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class); // Any request to open gets a new bogus channel - Mockito - .when(mockGcsUtil.open(Mockito.any(GcsPath.class))) - .then(new Answer() { - @Override - public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable { - return FileChannel.open( - Files.createTempFile("channel-", ".tmp"), - StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE); - } - }); + Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class))) + .then( + invocation -> + FileChannel.open( + Files.createTempFile("channel-", ".tmp"), + StandardOpenOption.CREATE, + StandardOpenOption.DELETE_ON_CLOSE)); // Any request for expansion returns a list containing the original GcsPath // This is required to pass validation that occurs in TextIO during apply() - Mockito - 
.when(mockGcsUtil.expand(Mockito.any(GcsPath.class))) - .then(new Answer>() { - @Override - public List answer(InvocationOnMock invocation) throws Throwable { - return ImmutableList.of((GcsPath) invocation.getArguments()[0]); - } - }); + Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class))) + .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0])); return mockGcsUtil; } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java index b4e4124e26..91a1bf8edc 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java @@ -53,8 +53,7 @@ public void testExtractWordsFn() throws Exception { Assert.assertThat(extractWordsFn.processBundle(" some input words "), CoreMatchers.hasItems("some", "input", "words")); - Assert.assertThat(extractWordsFn.processBundle(" "), - CoreMatchers.hasItems()); + Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems()); Assert.assertThat(extractWordsFn.processBundle(" some ", " input", " words"), CoreMatchers.hasItems("some", "input", "words")); } diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java similarity index 100% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java 
b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java similarity index 100% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java similarity index 97% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java rename to maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java index 6075c564b7..2478c07fa8 100644 --- a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java @@ -32,7 +32,6 @@ import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.TestStream; import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.transforms.windowing.GlobalWindow; import org.apache.beam.sdk.transforms.windowing.IntervalWindow; @@ -240,13 +239,14 @@ public void testTeamScoresObservablyLate() { String redTeam = TestUser.RED_ONE.getTeam(); PAssert.that(teamScores) .inWindow(window) - .satisfies((SerializableFunction>, Void>) input -> { - // The final sums need not exist in the same pane, but must appear in the output - // PCollection - assertThat(input, hasItem(KV.of(blueTeam, 11))); - 
assertThat(input, hasItem(KV.of(redTeam, 27))); - return null; - }); + .satisfies( + input -> { + // The final sums need not exist in the same pane, but must appear in the output + // PCollection + assertThat(input, hasItem(KV.of(blueTeam, 11))); + assertThat(input, hasItem(KV.of(redTeam, 27))); + return null; + }); PAssert.thatMap(teamScores) // The closing behavior of CalculateTeamScores precludes an inFinalPane matcher .inOnTimePane(window) diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java new file mode 100644 index 0000000000..d48b450547 --- /dev/null +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ${package}.complete.game; + +import ${package}.complete.game.StatefulTeamScore.UpdateTeamScoreFn; +import ${package}.complete.game.UserScore.GameActionInfo; +import org.apache.beam.sdk.coders.AvroCoder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestStream; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.windowing.FixedWindows; +import org.apache.beam.sdk.transforms.windowing.GlobalWindow; +import org.apache.beam.sdk.transforms.windowing.IntervalWindow; +import org.apache.beam.sdk.transforms.windowing.Window; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TimestampedValue; +import org.joda.time.Duration; +import org.joda.time.Instant; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** + * Tests for {@link StatefulTeamScore}. + */ +@RunWith(JUnit4.class) +public class StatefulTeamScoreTest { + + private static final Duration ALLOWED_LATENESS = Duration.standardHours(1); + private static final Duration TEAM_WINDOW_DURATION = Duration.standardMinutes(20); + private Instant baseTime = new Instant(0); + + @Rule + public TestPipeline p = TestPipeline.create(); + + /** + * Some example users, on two separate teams. 
+ */ + private enum TestUser { + RED_ONE("scarlet", "red"), RED_TWO("burgundy", "red"), + BLUE_ONE("navy", "blue"), BLUE_TWO("sky", "blue"); + + private final String userName; + private final String teamName; + + TestUser(String userName, String teamName) { + this.userName = userName; + this.teamName = teamName; + } + + public String getUser() { + return userName; + } + + public String getTeam() { + return teamName; + } + } + + /** + * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs + * correctly for one team. + */ + @Test + public void testScoreUpdatesOneTeam() { + + TestStream> createEvents = TestStream.create(KvCoder.of( + StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class))) + .advanceWatermarkTo(baseTime) + .addElements( + event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)), + event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)), + event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)), + event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)), + event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)) + ) + .advanceWatermarkToInfinity(); + + PCollection> teamScores = p.apply(createEvents) + .apply(ParDo.of(new UpdateTeamScoreFn(100))); + + String redTeam = TestUser.RED_ONE.getTeam(); + + PAssert.that(teamScores) + .inWindow(GlobalWindow.INSTANCE) + .containsInAnyOrder( + KV.of(redTeam, 100), + KV.of(redTeam, 200), + KV.of(redTeam, 401) + ); + + p.run().waitUntilFinish(); + } + + /** + * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs + * correctly for multiple teams. 
+ */ + @Test + public void testScoreUpdatesPerTeam() { + + TestStream> createEvents = TestStream.create(KvCoder.of( + StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class))) + .advanceWatermarkTo(baseTime) + .addElements( + event(TestUser.RED_ONE, 50, Duration.standardSeconds(10)), + event(TestUser.RED_TWO, 50, Duration.standardSeconds(20)), + event(TestUser.BLUE_ONE, 70, Duration.standardSeconds(30)), + event(TestUser.BLUE_TWO, 80, Duration.standardSeconds(40)), + event(TestUser.BLUE_TWO, 50, Duration.standardSeconds(50)) + ) + .advanceWatermarkToInfinity(); + + PCollection> teamScores = p.apply(createEvents) + .apply(ParDo.of(new UpdateTeamScoreFn(100))); + + String redTeam = TestUser.RED_ONE.getTeam(); + String blueTeam = TestUser.BLUE_ONE.getTeam(); + + PAssert.that(teamScores) + .inWindow(GlobalWindow.INSTANCE) + .containsInAnyOrder( + KV.of(redTeam, 100), + KV.of(blueTeam, 150), + KV.of(blueTeam, 200) + ); + + p.run().waitUntilFinish(); + } + + /** + * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs + * correctly per window and per key. 
+ */ + @Test + public void testScoreUpdatesPerWindow() { + + TestStream> createEvents = TestStream.create(KvCoder.of( + StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class))) + .advanceWatermarkTo(baseTime) + .addElements( + event(TestUser.RED_ONE, 50, Duration.standardMinutes(1)), + event(TestUser.RED_TWO, 50, Duration.standardMinutes(2)), + event(TestUser.RED_ONE, 50, Duration.standardMinutes(3)), + event(TestUser.RED_ONE, 60, Duration.standardMinutes(6)), + event(TestUser.RED_TWO, 60, Duration.standardMinutes(7)) + ) + .advanceWatermarkToInfinity(); + + Duration teamWindowDuration = Duration.standardMinutes(5); + + PCollection> teamScores = p + .apply(createEvents) + .apply(Window.>into(FixedWindows.of(teamWindowDuration))) + .apply(ParDo.of(new UpdateTeamScoreFn(100))); + + String redTeam = TestUser.RED_ONE.getTeam(); + String blueTeam = TestUser.BLUE_ONE.getTeam(); + + IntervalWindow window1 = new IntervalWindow(baseTime, teamWindowDuration); + IntervalWindow window2 = new IntervalWindow(window1.end(), teamWindowDuration); + + PAssert.that(teamScores) + .inWindow(window1) + .containsInAnyOrder( + KV.of(redTeam, 100) + ); + + PAssert.that(teamScores) + .inWindow(window2) + .containsInAnyOrder( + KV.of(redTeam, 120) + ); + + p.run().waitUntilFinish(); + } + + private TimestampedValue> event( + TestUser user, + int score, + Duration baseTimeOffset) { + return TimestampedValue.of(KV.of(user.getTeam(), new GameActionInfo(user.getUser(), + user.getTeam(), + score, + baseTime.plus(baseTimeOffset).getMillis())), baseTime.plus(baseTimeOffset)); + } +} diff --git a/maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java similarity index 100% rename from maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java rename to 
maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java diff --git a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties index 8a76657024..b0195b3f16 100644 --- a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties +++ b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties @@ -16,4 +16,4 @@ package=it.pkg version=0.1 groupId=archetype.it artifactId=basic -targetPlatform=1.7 +targetPlatform=1.8 diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 7e5eb44cc9..53eeaf01b8 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -33,7 +33,6 @@ starter examples - examples-java8 diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 34bfd076bb..000e743cd2 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -39,7 +39,7 @@ org.apache.maven.archetype archetype-packaging - 2.4 + ${archetype-packaging.version} @@ -55,12 +55,12 @@ maven-archetype-plugin - 2.4 + ${maven-archetype-plugin.version} org.apache.maven.shared maven-invoker - 2.2 + ${maven-invoker.version} diff --git a/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml index 4c22d5d68b..428c74aa4a 100644 --- a/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml +++ b/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml @@ -21,7 +21,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> - 1.7 + 1.8 diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties index 8a76657024..b0195b3f16 100644 --- 
a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties +++ b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties @@ -16,4 +16,4 @@ package=it.pkg version=0.1 groupId=archetype.it artifactId=basic -targetPlatform=1.7 +targetPlatform=1.8 diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index 8e4edbd29e..506665830a 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -51,8 +51,8 @@ maven-compiler-plugin ${maven-compiler-plugin.version} - 1.7 - 1.7 + 1.8 + 1.8 diff --git a/pom.xml b/pom.xml index a3ce24f916..320ac544c3 100644 --- a/pom.xml +++ b/pom.xml @@ -99,18 +99,21 @@ + 1.8 + UTF-8 ${maven.build.timestamp} yyyy-MM-dd HH:mm - 2.2.0 + 2.3.0 Google Cloud Dataflow SDK for Java - ${project.version}-20170517 + ${beam.version} 6 1 v2-rev355-1.22.0 + 6.19 1.22.0 20.0 1.3 @@ -120,11 +123,22 @@ v1-rev10-1.22.0 1.7.25 + 2.4 + 1.6.0 2.20 + 2.4 + 2.17 + 3.0.0 3.6.2 - 1.6.0 + 3.0.1 + 2.2 3.0.2 + 3.0.0-M1 + 2.5.3 + 3.0.2 3.0.0 + 2.20 + 3.0.1 pom @@ -137,47 +151,19 @@ - - org.apache.maven.plugins - maven-enforcer-plugin - 1.4.1 - - - enforce-java - - enforce - - - - - - [1.8.0,) - - - - - - - org.apache.maven.plugins maven-clean-plugin - 3.0.0 + ${maven-clean-plugin.version} org.apache.maven.plugins maven-compiler-plugin - 3.6.2 + ${maven-compiler-plugin.version} - 1.7 - 1.7 + ${java.version} + ${java.version} -Xlint:all -Werror @@ -192,12 +178,12 @@ org.apache.maven.plugins maven-checkstyle-plugin - 2.17 + ${maven-checkstyle-plugin.version} com.puppycrawl.tools checkstyle - 6.19 + ${checkstyle.version} org.apache.beam @@ -232,7 +218,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.0.2 + ${maven-jar-plugin.version} true @@ -255,7 +241,7 @@ org.apache.maven.plugins maven-source-plugin - 
3.0.1 + ${maven-source-plugin.version} attach-sources @@ -277,7 +263,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 2.10.4 + ${maven-javadoc-plugin.version} false @@ -295,13 +281,13 @@ org.apache.maven.plugins maven-resources-plugin - 3.0.2 + ${maven-resources-plugin.version} org.apache.maven.plugins maven-dependency-plugin - 3.0.0 + ${maven-dependency-plugin.version} @@ -317,18 +303,18 @@ org.apache.maven.plugins maven-surefire-plugin - 2.20 + ${maven-surefire-plugin.version} org.apache.maven.plugins maven-archetype-plugin - 2.4 + ${maven-archetype-plugin.version} org.apache.maven.shared maven-invoker - 2.2 + ${maven-invoker.version} @@ -354,7 +340,7 @@ org.apache.maven.plugins maven-release-plugin - 2.5.3 + ${maven-release-plugin} true true @@ -365,7 +351,7 @@ org.codehaus.mojo exec-maven-plugin - 1.5.0 + ${exec-maven-plugin.version} false @@ -374,11 +360,6 @@ - - org.apache.maven.plugins - maven-enforcer-plugin - - org.apache.maven.plugins maven-compiler-plugin From 8090e858362899a7a6682b2f46dea882820626b3 Mon Sep 17 00:00:00 2001 From: Batkhuyag Batsaikhan Date: Tue, 27 Feb 2018 18:45:50 -0800 Subject: [PATCH 57/77] removed java8 example --- pom.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pom.xml b/pom.xml index 320ac544c3..e24580f9e9 100644 --- a/pom.xml +++ b/pom.xml @@ -430,12 +430,6 @@ ${beam.version} - - org.apache.beam - beam-examples-java8 - ${beam.version} - - junit junit From 5b20661f8ae21f390aa764daf9a3b14da060bce0 Mon Sep 17 00:00:00 2001 From: Batkhuyag Batsaikhan Date: Tue, 27 Feb 2018 19:41:41 -0800 Subject: [PATCH 58/77] [maven-release-plugin] prepare for the current development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 2 +- sdk/pom.xml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 75a1d92174..ef6d842c47 100644 --- a/examples/pom.xml +++ 
b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 5ff4872335..477a670dcb 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 53eeaf01b8..9774b06575 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 000e743cd2..3d51805b04 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index e24580f9e9..937f0aa2ff 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT diff --git a/sdk/pom.xml b/sdk/pom.xml index 33f2255f82..d6ede19a5d 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.2.0-SNAPSHOT + 2.3.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From 0e8ae93463edfc32019a01369e440edec922e91a Mon Sep 17 00:00:00 2001 From: Batkhuyag Batsaikhan Date: Tue, 27 Feb 2018 19:43:58 -0800 Subject: [PATCH 59/77] [maven-release-plugin] prepare branch release-2.3.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 937f0aa2ff..11621568fc 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ 
scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.3.0 From 3f896cb574abf90f1fcad7c46dce0488c8e69cb2 Mon Sep 17 00:00:00 2001 From: Batkhuyag Batsaikhan Date: Tue, 27 Feb 2018 19:43:58 -0800 Subject: [PATCH 60/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index ef6d842c47..14d46dc1ef 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 477a670dcb..13b0d6114d 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 9774b06575..70bd4462cd 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 3d51805b04..cb80d97dc8 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 11621568fc..8fa0890f60 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 
@@ http://cloud.google.com/dataflow 2013 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.3.0 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index d6ede19a5d..c17eacadca 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From 16729af118a52b0fb2a64b85e3962e7689a18518 Mon Sep 17 00:00:00 2001 From: tvalentyn Date: Thu, 22 Mar 2018 16:17:33 -0700 Subject: [PATCH 61/77] Use beam-x.y.z containers for DF SDK releases moving forward. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8fa0890f60..f22890e0b9 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,7 @@ 2.3.0 Google Cloud Dataflow SDK for Java - ${beam.version} + beam-${beam.version} 6 1 From 3e015a8884604b5e22ccbf2e1fb576a934917720 Mon Sep 17 00:00:00 2001 From: tvalentyn Date: Thu, 22 Mar 2018 16:17:33 -0700 Subject: [PATCH 62/77] Use beam-x.y.z containers for DF SDK releases moving forward. 
--- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8fa0890f60..f22890e0b9 100644 --- a/pom.xml +++ b/pom.xml @@ -108,7 +108,7 @@ 2.3.0 Google Cloud Dataflow SDK for Java - ${beam.version} + beam-${beam.version} 6 1 From b391210d013538d55c5ea85f3b78d640ab031d9e Mon Sep 17 00:00:00 2001 From: akedin Date: Tue, 27 Mar 2018 15:49:43 -0700 Subject: [PATCH 63/77] Upgrade to Apache Beam version 2.4.0 --- maven-archetypes/examples/pom.xml | 28 +++++++++++++++++++ .../resources/archetype-resources/pom.xml | 2 +- .../complete/game/utils/WriteToBigQuery.java | 1 + .../complete/game/StatefulTeamScoreTest.java | 2 -- maven-archetypes/pom.xml | 2 +- pom.xml | 19 +++++++------ 6 files changed, 41 insertions(+), 13 deletions(-) diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 13b0d6114d..cc50502e5e 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -71,6 +71,34 @@ + + + + org.eclipse.m2e + lifecycle-mapping + ${eclipse-m2e.version} + + + + + + org.codehaus.mojo + exec-maven-plugin + [1.5.0,) + + exec + + + + + false + + + + + + + diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index dcbedafd76..04d70618d9 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -254,7 +254,7 @@ org.mockito - mockito-all + mockito-core ${mockito.version} test diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java index 984e958c50..d35a4ffcfc 100644 --- 
a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java @@ -91,6 +91,7 @@ FieldFn getFieldFn() { return this.fieldFn; } } + /** Convert each key/score pair into a BigQuery TableRow as specified by fieldFn. */ protected class BuildRowFn extends DoFn { diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java index d48b450547..c80c57f4fc 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java @@ -47,8 +47,6 @@ @RunWith(JUnit4.class) public class StatefulTeamScoreTest { - private static final Duration ALLOWED_LATENESS = Duration.standardHours(1); - private static final Duration TEAM_WINDOW_DURATION = Duration.standardMinutes(20); private Instant baseTime = new Instant(0); @Rule diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 70bd4462cd..1d0500729c 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -69,7 +69,7 @@
- + diff --git a/pom.xml b/pom.xml index f22890e0b9..be1fba9dd6 100644 --- a/pom.xml +++ b/pom.xml @@ -105,15 +105,16 @@ ${maven.build.timestamp} yyyy-MM-dd HH:mm - 2.3.0 + 2.4.0 Google Cloud Dataflow SDK for Java beam-${beam.version} 6 1 - v2-rev355-1.22.0 - 6.19 + v2-rev374-1.22.0 + 8.7 + 1.0.0 1.22.0 20.0 1.3 @@ -125,19 +126,19 @@ 2.4 1.6.0 - 2.20 + 2.20.1 2.4 - 2.17 + 3.0.0 3.0.0 - 3.6.2 - 3.0.1 + 3.7.0 + 3.0.2 2.2 3.0.2 3.0.0-M1 2.5.3 3.0.2 - 3.0.0 - 2.20 + 3.1.0 + 2.20.1 3.0.1 From f75991996c8cb39473519a7138c814e2b62b7919 Mon Sep 17 00:00:00 2001 From: Anton Kedin Date: Wed, 28 Mar 2018 10:13:58 -0700 Subject: [PATCH 64/77] [maven-release-plugin] prepare branch release-2.4.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index be1fba9dd6..621f2bc192 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.4.0 From 46b74c852ad97fbfedfc2b3e4d7d46143546d662 Mon Sep 17 00:00:00 2001 From: Anton Kedin Date: Wed, 28 Mar 2018 10:13:58 -0700 Subject: [PATCH 65/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 14d46dc1ef..54b3613909 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.4.0-SNAPSHOT + 2.5.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index cc50502e5e..2c4a6eb9fe 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ 
-21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.4.0-SNAPSHOT + 2.5.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 1d0500729c..95f647d608 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.4.0-SNAPSHOT + 2.5.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index cb80d97dc8..4e91b8de88 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.4.0-SNAPSHOT + 2.5.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 621f2bc192..4b3108219d 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.4.0-SNAPSHOT + 2.5.0-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.4.0 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index c17eacadca..475af49854 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.4.0-SNAPSHOT + 2.5.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From fb0d662af48359d2d123a4a7ec4b63ce92ede1dd Mon Sep 17 00:00:00 2001 From: Chanseok Oh Date: Wed, 2 May 2018 14:00:07 -0400 Subject: [PATCH 66/77] Fix undefined property bug --- .../starter/src/main/resources/archetype-resources/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml index 22f717f97a..da443b16fa 100644 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ 
b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml @@ -26,7 +26,7 @@ UTF-8 @maven-compiler-plugin.version@ - @maven-exec-plugin.version@ + @exec-maven-plugin.version@ @slf4j.version@ @@ -62,7 +62,7 @@ org.codehaus.mojo exec-maven-plugin - ${maven-exec-plugin.version} + ${exec-maven-plugin.version} false From bf7770d222ac2a9b89633e8287e26e7c84196d17 Mon Sep 17 00:00:00 2001 From: Chanseok Oh Date: Wed, 2 May 2018 14:08:17 -0400 Subject: [PATCH 67/77] Fix test --- .../src/test/resources/projects/basic/reference/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml index 506665830a..daf87595b7 100644 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml @@ -26,7 +26,7 @@ UTF-8 @maven-compiler-plugin.version@ - @maven-exec-plugin.version@ + @exec-maven-plugin.version@ @slf4j.version@ @@ -62,7 +62,7 @@ org.codehaus.mojo exec-maven-plugin - ${maven-exec-plugin.version} + ${exec-maven-plugin.version} false From 568a4ed5eb3c6908a5f882281da98572826d9512 Mon Sep 17 00:00:00 2001 From: Pablo Date: Mon, 25 Jun 2018 13:34:41 -0700 Subject: [PATCH 68/77] Dataflow SDK Release 2.5.0 --- .../src/main/java/DebuggingWordCount.java | 11 ++++++--- .../src/main/java/WindowedWordCount.java | 13 +++++++---- .../src/main/java/WordCount.java | 21 ++++++++++------- .../main/java/complete/game/GameStats.java | 2 +- .../java/complete/game/HourlyTeamScore.java | 4 ++-- .../main/java/complete/game/LeaderBoard.java | 2 +- .../main/java/complete/game/UserScore.java | 5 ++-- .../java/complete/game/injector/Injector.java | 13 ++++++----- .../complete/game/injector/InjectorUtils.java | 1 + .../complete/game/utils/GameConstants.java | 2 +- .../java/complete/game/utils/WriteToText.java | 4 
++-- .../src/test/java/DebuggingWordCountTest.java | 4 ++-- .../java/complete/game/UserScoreTest.java | 8 +++---- pom.xml | 23 ++++++++++++------- 14 files changed, 69 insertions(+), 44 deletions(-) diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java index 07870f2ed0..0ae31d575d 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java @@ -130,9 +130,7 @@ public interface WordCountOptions extends WordCount.WordCountOptions { void setFilterPattern(String value); } - public static void main(String[] args) { - WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() - .as(WordCountOptions.class); + static void runDebuggingWordCount(WordCountOptions options) { Pipeline p = Pipeline.create(options); PCollection> filteredWords = @@ -159,4 +157,11 @@ public static void main(String[] args) { p.run().waitUntilFinish(); } + + public static void main(String[] args) { + WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() + .as(WordCountOptions.class); + + runDebuggingWordCount(options); + } } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java index 501ac27881..5798f290eb 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java @@ -106,7 +106,7 @@ static class AddTimestampFn extends DoFn { } @ProcessElement - public void processElement(ProcessContext c) { + public 
void processElement(@Element String element, OutputReceiver receiver) { Instant randomTimestamp = new Instant( ThreadLocalRandom.current() @@ -115,7 +115,7 @@ public void processElement(ProcessContext c) { /** * Concept #2: Set the data element with that timestamp. */ - c.outputWithTimestamp(c.element(), new Instant(randomTimestamp)); + receiver.outputWithTimestamp(element, new Instant(randomTimestamp)); } } @@ -165,8 +165,7 @@ public interface Options extends WordCount.WordCountOptions, void setNumShards(Integer numShards); } - public static void main(String[] args) throws IOException { - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + static void runWindowedWordCount(Options options) throws IOException { final String output = options.getOutput(); final Instant minTimestamp = new Instant(options.getMinTimestampMillis()); final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis()); @@ -217,4 +216,10 @@ public static void main(String[] args) throws IOException { } } + public static void main(String[] args) throws IOException { + Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); + + runWindowedWordCount(options); + } + } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java index 33f7b39f19..d4302ed67a 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java @@ -93,19 +93,19 @@ static class ExtractWordsFn extends DoFn { ExtractWordsFn.class, "lineLenDistro"); @ProcessElement - public void processElement(ProcessContext c) { - lineLenDist.update(c.element().length()); - if (c.element().trim().isEmpty()) { + public void processElement(@Element String element, OutputReceiver 
receiver) { + lineLenDist.update(element.length()); + if (element.trim().isEmpty()) { emptyLines.inc(); } // Split the line into words. - String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN); + String[] words = element.split(ExampleUtils.TOKENIZER_PATTERN, -1); // Output each word encountered into the output PCollection. for (String word : words) { if (!word.isEmpty()) { - c.output(word); + receiver.output(word); } } } @@ -172,9 +172,7 @@ public interface WordCountOptions extends PipelineOptions { void setOutput(String value); } - public static void main(String[] args) { - WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() - .as(WordCountOptions.class); + static void runWordCount(WordCountOptions options) { Pipeline p = Pipeline.create(options); // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the @@ -186,4 +184,11 @@ public static void main(String[] args) { p.run().waitUntilFinish(); } + + public static void main(String[] args) { + WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() + .as(WordCountOptions.class); + + runWordCount(options); + } } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java index 3cb04bd2e4..2660cdac2b 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java @@ -153,7 +153,7 @@ public void processElement(ProcessContext c, BoundedWindow window) { /** * Options supported by {@link GameStats}. 
*/ - interface Options extends LeaderBoard.Options { + public interface Options extends LeaderBoard.Options { @Description("Numeric value of fixed window duration for user analysis, in minutes") @Default.Integer(60) Integer getFixedWindowDuration(); diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java index fe1fe99da7..05455219fc 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java @@ -76,13 +76,13 @@ public class HourlyTeamScore extends UserScore { private static DateTimeFormatter minFmt = DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); /** * Options supported by {@link HourlyTeamScore}. */ - interface Options extends UserScore.Options { + public interface Options extends UserScore.Options { @Description("Numeric value of fixed window duration, in minutes") @Default.Integer(60) diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java index ae32637e15..b5983fa789 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java @@ -96,7 +96,7 @@ public class LeaderBoard extends HourlyTeamScore { /** * Options supported by {@link LeaderBoard}. 
*/ - interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOptions { + public interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOptions { @Description("BigQuery Dataset to write tables to. Must already exist.") @Validation.Required diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java index f7aa8ff8c0..3459d043f5 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java @@ -101,7 +101,7 @@ public Integer getScore() { return this.score; } public String getKey(String keyname) { - if (keyname.equals("team")) { + if ("team".equals(keyname)) { return this.team; } else { // return username as default return this.user; @@ -128,7 +128,8 @@ static class ParseEventFn extends DoFn { @ProcessElement public void processElement(ProcessContext c) { - String[] components = c.element().split(","); + System.out.println("GOT " + c.element()); + String[] components = c.element().split(",", -1); try { String user = components[0].trim(); String team = components[1].trim(); diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java index 952cb6fc34..c21ec2e319 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java @@ -90,6 +90,8 @@ class Injector { private static final int THREAD_SLEEP_MS = 500; // Lists used to 
generate random team names. + // If COLORS is changed, please also make changes in + // release/src/main/groovy/MobileGamingCommands.COLORS private static final ArrayList COLORS = new ArrayList<>( Arrays.asList( @@ -349,12 +351,11 @@ public static void publishDataToFile(String fileName, int numMessages, int delay out.println(message); } } catch (Exception e) { + System.err.print("Error in writing generated events to file"); e.printStackTrace(); } finally { - if (out != null) { - out.flush(); - out.close(); - } + out.flush(); + out.close(); } } @@ -371,7 +372,7 @@ public static void main(String[] args) throws IOException, InterruptedException String fileName = args[2]; // The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if // specified; otherwise, it will try to write to a file. - if (topicName.equalsIgnoreCase("none")) { + if ("none".equalsIgnoreCase(topicName)) { writeToFile = true; writeToPubsub = false; } @@ -383,7 +384,7 @@ public static void main(String[] args) throws IOException, InterruptedException InjectorUtils.createTopic(pubsub, topic); System.out.println("Injecting to topic: " + topic); } else { - if (fileName.equalsIgnoreCase("none")) { + if ("none".equalsIgnoreCase(fileName)) { System.out.println("Filename not specified."); System.exit(1); } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java index ddcbff4f41..5a0cf0166e 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java @@ -86,6 +86,7 @@ public static String getFullyQualifiedTopicName( */ public static void createTopic(Pubsub client, String fullTopicName) 
throws IOException { + System.out.println("fullTopicName " + fullTopicName); try { client.projects().topics().get(fullTopicName).execute(); } catch (GoogleJsonResponseException e) { diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java index 93da132690..dc28ad72ea 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java @@ -31,5 +31,5 @@ public class GameConstants { public static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java index 45135fb059..76fa3ff075 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java @@ -52,7 +52,7 @@ public class WriteToText private static final DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); + .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); protected String filenamePrefix; protected Map> fieldFn; @@ -98,7 +98,7 @@ public void processElement(ProcessContext c, BoundedWindow window) 
{ * A {@link DoFn} that writes elements to files with names deterministically derived from the * lower and upper bounds of their key (an {@link IntervalWindow}). */ - protected class WriteOneFilePerWindow extends PTransform, PDone> { + protected static class WriteOneFilePerWindow extends PTransform, PDone> { private final String filenamePrefix; diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java index 26e1498d71..0fbee20cb5 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java @@ -37,7 +37,7 @@ public class DebuggingWordCountTest { private String getFilePath(String filePath) { if (filePath.contains(":")) { - return filePath.replace("\\", "/").split(":")[1]; + return filePath.replace("\\", "/").split(":", -1)[1]; } return filePath; } @@ -54,6 +54,6 @@ public void testDebuggingWordCount() throws Exception { TestPipeline.testingPipelineOptions().as(WordCountOptions.class); options.setInputFile(getFilePath(inputFile.getAbsolutePath())); options.setOutput(getFilePath(outputFile.getAbsolutePath())); - DebuggingWordCount.main(TestPipeline.convertToArgs(options)); + DebuggingWordCount.runDebuggingWordCount(options); } } diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java index 83b8821480..b691a0cbd5 100644 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java +++ b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java @@ 
-91,10 +91,10 @@ public void testParseEventFn() throws Exception { DoFnTester.of(new ParseEventFn()); List results = parseEventFn.processBundle(GAME_EVENTS_ARRAY); - Assert.assertEquals(results.size(), 8); - Assert.assertEquals(results.get(0).getUser(), "user0_MagentaKangaroo"); - Assert.assertEquals(results.get(0).getTeam(), "MagentaKangaroo"); - Assert.assertEquals(results.get(0).getScore(), new Integer(3)); + Assert.assertEquals(8, results.size()); + Assert.assertEquals("user0_MagentaKangaroo", results.get(0).getUser()); + Assert.assertEquals("MagentaKangaroo", results.get(0).getTeam()); + Assert.assertEquals(Integer.valueOf(3), results.get(0).getScore()); } /** Tests ExtractAndSumScore("user"). */ diff --git a/pom.xml b/pom.xml index 4b3108219d..6cf1ef6b6c 100644 --- a/pom.xml +++ b/pom.xml @@ -105,23 +105,24 @@ ${maven.build.timestamp} yyyy-MM-dd HH:mm - 2.4.0 + 2.5.0 Google Cloud Dataflow SDK for Java beam-${beam.version} 6 1 - v2-rev374-1.22.0 + v2-rev374-1.23.0 8.7 1.0.0 - 1.22.0 + 1.23.0 20.0 1.3 2.4 4.12 + 1.0.0 1.9.5 - v1-rev10-1.22.0 + v1-rev382-1.23.0 1.7.25 2.4 @@ -129,16 +130,16 @@ 2.20.1 2.4 3.0.0 - 3.0.0 + 3.1.0 3.7.0 - 3.0.2 + 3.1.1 2.2 3.0.2 3.0.0-M1 2.5.3 - 3.0.2 + 3.1.0 3.1.0 - 2.20.1 + 2.21.0 3.0.1 @@ -431,6 +432,12 @@ ${beam.version} + + org.apache.beam + beam-sdks-java-io-kafka + ${beam.version} + + junit junit From 0e6e1039a276ed183320e51228944daa5542b697 Mon Sep 17 00:00:00 2001 From: Pablo Date: Mon, 25 Jun 2018 14:25:39 -0700 Subject: [PATCH 69/77] [maven-release-plugin] prepare branch release-2.5.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6cf1ef6b6c..43b148a3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.5.0 From eef52e96134085ecbf618b247308b9be0b567789 Mon Sep 17 
00:00:00 2001 From: Pablo Date: Mon, 25 Jun 2018 14:25:39 -0700 Subject: [PATCH 70/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 54b3613909..468d87505e 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 2c4a6eb9fe..792eb40c92 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 95f647d608..f995770ea6 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 4e91b8de88..643cfa4096 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 43b148a3fe..2924ff9fa2 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - 
release-2.5.0 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index 475af49854..21e2a2df0f 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From 422c8eb2b6e0959627af33375ddcdfe8c98cab8c Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 26 Jun 2018 10:13:30 -0700 Subject: [PATCH 71/77] Adding Kafka IO to Dataflow SDK dependencies --- sdk/pom.xml | 5 +++++ .../java/com/google/cloud/dataflow/sdk/SdkDependencies.java | 2 ++ 2 files changed, 7 insertions(+) diff --git a/sdk/pom.xml b/sdk/pom.xml index 21e2a2df0f..0bd69dc58c 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -61,6 +61,11 @@ beam-runners-google-cloud-dataflow-java + + org.apache.beam + beam-sdks-java-io-kafka + + junit junit diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java index 7bbfbe3729..df3fd76ae6 100644 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java +++ b/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java @@ -19,6 +19,7 @@ import org.apache.beam.runners.direct.DirectRunner; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; +import org.apache.beam.sdk.io.kafka.KafkaIO; /** * Mark the dependencies as used at compile time. @@ -26,6 +27,7 @@ class SdkDependencies { private Pipeline p; private BigQueryIO bigQueryIO; + private KafkaIO kafkaIO; private DirectRunner directRunner; private DataflowRunner dataflowRunner; } From 0968379b3d63e3a3e181d7229cba2e8a2b6c3290 Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 26 Jun 2018 11:15:47 -0700 Subject: [PATCH 72/77] Revert "[maven-release-plugin] prepare for next development iteration" This reverts commit eef52e96134085ecbf618b247308b9be0b567789. 
--- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 468d87505e..54b3613909 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.6.0-SNAPSHOT + 2.5.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 792eb40c92..2c4a6eb9fe 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.6.0-SNAPSHOT + 2.5.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index f995770ea6..95f647d608 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.6.0-SNAPSHOT + 2.5.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 643cfa4096..4e91b8de88 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.6.0-SNAPSHOT + 2.5.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 2924ff9fa2..43b148a3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.6.0-SNAPSHOT + 2.5.0-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.5.0 diff --git a/sdk/pom.xml b/sdk/pom.xml index 0bd69dc58c..d54342364c 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 
@@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.6.0-SNAPSHOT + 2.5.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From 279fda33114999da2d9158a51a79acdc36a4108c Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 26 Jun 2018 11:17:54 -0700 Subject: [PATCH 73/77] Revert "[maven-release-plugin] prepare branch release-2.5.0" This reverts commit 0e6e1039a276ed183320e51228944daa5542b697. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 43b148a3fe..6cf1ef6b6c 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.5.0 + HEAD From e598e911b554e0150853aefff06a510fdbc1fe69 Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 26 Jun 2018 13:04:23 -0700 Subject: [PATCH 74/77] [maven-release-plugin] prepare branch release-2.5.0 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6cf1ef6b6c..43b148a3fe 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD + release-2.5.0 From 0668997c78639ee05d41a2c1ba79899ebba37649 Mon Sep 17 00:00:00 2001 From: Pablo Date: Tue, 26 Jun 2018 13:04:23 -0700 Subject: [PATCH 75/77] [maven-release-plugin] prepare for next development iteration --- examples/pom.xml | 2 +- maven-archetypes/examples/pom.xml | 2 +- maven-archetypes/pom.xml | 2 +- maven-archetypes/starter/pom.xml | 2 +- pom.xml | 4 ++-- sdk/pom.xml | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/pom.xml b/examples/pom.xml index 54b3613909..468d87505e 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow 
google-cloud-dataflow-java-sdk-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT google-cloud-dataflow-java-examples-all diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml index 2c4a6eb9fe..792eb40c92 100644 --- a/maven-archetypes/examples/pom.xml +++ b/maven-archetypes/examples/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml index 95f647d608..f995770ea6 100644 --- a/maven-archetypes/pom.xml +++ b/maven-archetypes/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT ../pom.xml diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml index 4e91b8de88..643cfa4096 100644 --- a/maven-archetypes/starter/pom.xml +++ b/maven-archetypes/starter/pom.xml @@ -21,7 +21,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-archetypes-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 43b148a3fe..2924ff9fa2 100644 --- a/pom.xml +++ b/pom.xml @@ -33,7 +33,7 @@ http://cloud.google.com/dataflow 2013 - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT @@ -54,7 +54,7 @@ scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - release-2.5.0 + HEAD diff --git a/sdk/pom.xml b/sdk/pom.xml index d54342364c..0bd69dc58c 100644 --- a/sdk/pom.xml +++ b/sdk/pom.xml @@ -20,7 +20,7 @@ com.google.cloud.dataflow google-cloud-dataflow-java-sdk-parent - 2.5.0-SNAPSHOT + 2.6.0-SNAPSHOT google-cloud-dataflow-java-sdk-all From 913d720638a37f5ea150cde5734c731192e5ab16 Mon Sep 17 00:00:00 2001 From: Ahmet Altay Date: Wed, 25 Jul 2018 10:40:55 -0700 Subject: [PATCH 76/77] Clean up, and point to Beam locations. 
--- .gitattributes | 40 -- .gitignore | 30 -- .travis.yml | 54 --- CONTRIBUTING.md | 51 -- LICENSE | 202 -------- NOTICE | 5 - examples/pom.xml | 46 -- .../dataflow/sdk/ExamplesDependencies.java | 29 -- maven-archetypes/examples/pom.xml | 108 ----- .../META-INF/maven/archetype-metadata.xml | 44 -- .../examples/src/main/resources/NOTICE | 5 - .../resources/archetype-resources/pom.xml | 262 ---------- .../src/main/java/DebuggingWordCount.java | 167 ------- .../src/main/java/MinimalWordCount.java | 119 ----- .../src/main/java/WindowedWordCount.java | 225 --------- .../src/main/java/WordCount.java | 194 -------- .../common/ExampleBigQueryTableOptions.java | 55 --- .../src/main/java/common/ExampleOptions.java | 37 -- ...mplePubsubTopicAndSubscriptionOptions.java | 45 -- .../common/ExamplePubsubTopicOptions.java | 45 -- .../src/main/java/common/ExampleUtils.java | 407 ---------------- .../java/common/WriteOneFilePerWindow.java | 117 ----- .../main/java/complete/game/GameStats.java | 346 -------------- .../java/complete/game/HourlyTeamScore.java | 182 ------- .../main/java/complete/game/LeaderBoard.java | 306 ------------ .../java/complete/game/StatefulTeamScore.java | 227 --------- .../main/java/complete/game/UserScore.java | 229 --------- .../java/complete/game/injector/Injector.java | 439 ----------------- .../complete/game/injector/InjectorUtils.java | 101 ---- .../injector/RetryHttpInitializerWrapper.java | 115 ----- .../complete/game/utils/GameConstants.java | 35 -- .../complete/game/utils/WriteToBigQuery.java | 145 ------ .../java/complete/game/utils/WriteToText.java | 183 ------- .../game/utils/WriteWindowedToBigQuery.java | 71 --- .../src/test/java/DebuggingWordCountTest.java | 59 --- .../src/test/java/MinimalWordCountTest.java | 94 ---- .../src/test/java/WordCountTest.java | 85 ---- .../java/complete/game/GameStatsTest.java | 81 ---- .../complete/game/HourlyTeamScoreTest.java | 116 ----- .../java/complete/game/LeaderBoardTest.java | 368 -------------- 
.../complete/game/StatefulTeamScoreTest.java | 206 -------- .../java/complete/game/UserScoreTest.java | 154 ------ .../projects/basic/archetype.properties | 19 - .../test/resources/projects/basic/goal.txt | 1 - maven-archetypes/pom.xml | 92 ---- maven-archetypes/starter/pom.xml | 93 ---- .../META-INF/maven/archetype-metadata.xml | 36 -- .../starter/src/main/resources/NOTICE | 5 - .../resources/archetype-resources/pom.xml | 93 ---- .../src/main/java/StarterPipeline.java | 69 --- .../projects/basic/archetype.properties | 19 - .../test/resources/projects/basic/goal.txt | 1 - .../projects/basic/reference/pom.xml | 93 ---- .../src/main/java/it/pkg/StarterPipeline.java | 69 --- pom.xml | 449 ------------------ sdk/pom.xml | 75 --- .../cloud/dataflow/sdk/SdkDependencies.java | 33 -- .../dataflow/dataflow-distribution.properties | 20 - .../DataflowRunnerInfoOverrideTest.java | 57 --- sdk/suppressions.xml | 30 -- 60 files changed, 7083 deletions(-) delete mode 100644 .gitattributes delete mode 100644 .gitignore delete mode 100644 .travis.yml delete mode 100644 CONTRIBUTING.md delete mode 100644 LICENSE delete mode 100644 NOTICE delete mode 100644 examples/pom.xml delete mode 100644 examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java delete mode 100644 maven-archetypes/examples/pom.xml delete mode 100644 maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml delete mode 100644 maven-archetypes/examples/src/main/resources/NOTICE delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java delete mode 100644 
maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java delete mode 100644 
maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java delete mode 100644 maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java delete mode 100644 maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties delete mode 100644 maven-archetypes/examples/src/test/resources/projects/basic/goal.txt delete mode 100644 maven-archetypes/pom.xml delete mode 100644 maven-archetypes/starter/pom.xml delete mode 100644 maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml delete mode 100644 
maven-archetypes/starter/src/main/resources/NOTICE delete mode 100644 maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml delete mode 100644 maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java delete mode 100644 maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties delete mode 100644 maven-archetypes/starter/src/test/resources/projects/basic/goal.txt delete mode 100644 maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml delete mode 100644 maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java delete mode 100644 pom.xml delete mode 100644 sdk/pom.xml delete mode 100644 sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java delete mode 100644 sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow-distribution.properties delete mode 100644 sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java delete mode 100644 sdk/suppressions.xml diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index c39158cf00..0000000000 --- a/.gitattributes +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -# The default behavior, which overrides 'core.autocrlf', is to use Git's -# built-in heuristics to determine whether a particular file is text or binary. 
-# Text files are automatically normalized to the user's platforms. -* text=auto - -# Explicitly declare text files that should always be normalized and converted -# to native line endings. -.gitattributes text -.gitignore text -LICENSE text -*.avsc text -*.html text -*.java text -*.md text -*.properties text -*.proto text -*.py text -*.sh text -*.xml text -*.yml text - -# Declare files that will always have CRLF line endings on checkout. -# *.sln text eol=crlf - -# Explicitly denote all files that are truly binary and should not be modified. -# *.jpg binary diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 2a27023c28..0000000000 --- a/.gitignore +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -target/ - -# Ignore IntelliJ files. -.idea/ -*.iml -*.ipr -*.iws - -# Ignore Eclipse files. -.classpath -.project -.settings/ - -# The build process generates the dependency-reduced POM, but it shouldn't be -# committed. -dependency-reduced-pom.xml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 8fa5d9a932..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. 
You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -language: java - -sudo: false - -notifications: - email: - # Group email notifications are disabled for now, since we cannot do it on a per-branch basis. - # Right now, it would trigger a notification for each fork, which generates a lot of spam. - # recipients: - # - dataflow-sdk-build-notifications+travis@google.com - on_success: change - on_failure: always - -matrix: - include: - # On OSX, run with default JDK only. - - os: osx - # On Linux, run with specific JDKs only. - - os: linux - env: CUSTOM_JDK="oraclejdk8" - # The distribution does not build with Java 7 by design. We need to rewrite these tests - # to, for example, build and install with Java 8 and then test examples with Java 7. - # - os: linux - # env: CUSTOM_JDK="oraclejdk7" - # - os: linux - # env: CUSTOM_JDK="openjdk7" - -before_install: - - if [ "$TRAVIS_OS_NAME" == "osx" ]; then export JAVA_HOME=$(/usr/libexec/java_home); fi - - if [ "$TRAVIS_OS_NAME" == "linux" ]; then jdk_switcher use "$CUSTOM_JDK"; fi - -install: - - travis_retry mvn install clean -U -DskipTests=true - -script: - # Verify that the project can be built and installed. - - mvn install - # Verify that starter and examples archetypes have the correct version of the NOTICE file. - - diff -q NOTICE maven-archetypes/starter/src/main/resources/NOTICE - - diff -q NOTICE maven-archetypes/examples/src/main/resources/NOTICE diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index 9b616e5fe3..0000000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,51 +0,0 @@ - - -Want to contribute? Great! 
First, read this page (including the small print at -the end). - -Google Cloud Dataflow SDK is a distribution of Apache Beam. If you'd like to -change anything under the `org.apache.beam.*` namespace, please submit that -change directly to the [Apache Beam](https://github.com/apache/beam) project. - -This repository contains code to build the Dataflow distribution of Beam, and -some Dataflow-specific code. Only changes to how the distribution is built, or -the Dataflow-specific code under the `com.google.cloud.dataflow.*` namespace, -can be merged here. - -### Before you contribute -Before we can use your code, you must sign the -[Google Individual Contributor License Agreement](https://developers.google.com/open-source/cla/individual?csw=1) -(CLA), which you can do online. The CLA is necessary mainly because you own the -copyright to your changes, even after your contribution becomes part of our -codebase, so we need your permission to use and distribute your code. We also -need to be sure of various other things. For instance that you'll tell us if you -know that your code infringes on other people's patents. You don't have to sign -the CLA until after you've submitted your code for review and a member has -approved it, but you must do it before we can put your code into our codebase. - -Before you start working on a larger contribution, we recommend to get in touch -with us first through the issue tracker with your idea so that we can help out -and possibly guide you. Coordinating up front makes it much easier to avoid -frustration later on. - -### Code reviews -All submissions, including submissions by project members, require review. We -use GitHub pull requests for this purpose. - -### The small print -Contributions made by corporations are covered by a different agreement than -the one above, the Software Grant and Corporate Contributor License Agreement. 
diff --git a/LICENSE b/LICENSE deleted file mode 100644 index d645695673..0000000000 --- a/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. 
Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative 
Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/NOTICE b/NOTICE deleted file mode 100644 index 981fde5a9e..0000000000 --- a/NOTICE +++ /dev/null @@ -1,5 +0,0 @@ -Google Cloud Dataflow SDK for Java -Copyright 2017, Google Inc. - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). diff --git a/examples/pom.xml b/examples/pom.xml deleted file mode 100644 index 468d87505e..0000000000 --- a/examples/pom.xml +++ /dev/null @@ -1,46 +0,0 @@ - - - - 4.0.0 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-parent - 2.6.0-SNAPSHOT - - - google-cloud-dataflow-java-examples-all - Google Cloud Dataflow Java Examples - All - Google Cloud Dataflow SDK for Java is a distribution of Apache - Beam designed to simplify usage of Apache Beam on Google Cloud Dataflow - service. 
This artifact includes all Dataflow Java SDK - examples. - - jar - - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - - - - org.apache.beam - beam-examples-java - - - diff --git a/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java b/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java deleted file mode 100644 index c51e527edb..0000000000 --- a/examples/src/main/java/com/google/cloud/dataflow/sdk/ExamplesDependencies.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2017 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package com.google.cloud.dataflow.sdk; - -import org.apache.beam.examples.MinimalWordCount; -import org.apache.beam.examples.WordCount; - -/** - * Mark the examples dependencies as used at compile time. This is also needed - * to produce some content in the final JAR file. 
- */ -class ExamplesDependencies { - SdkDependencies sdkDependencies; - WordCount wordCount; - MinimalWordCount minimalWordCount; -} diff --git a/maven-archetypes/examples/pom.xml b/maven-archetypes/examples/pom.xml deleted file mode 100644 index 792eb40c92..0000000000 --- a/maven-archetypes/examples/pom.xml +++ /dev/null @@ -1,108 +0,0 @@ - - - - - 4.0.0 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-archetypes-parent - 2.6.0-SNAPSHOT - ../pom.xml - - - google-cloud-dataflow-java-archetypes-examples - Google Cloud Dataflow SDK for Java - Examples Archetype - Google Cloud Dataflow SDK for Java is a distribution of Apache - Beam designed to simplify usage of Apache Beam on Google Cloud Dataflow - service. This archetype creates a project containing all the example - pipelines. - - maven-archetype - - - - - org.apache.maven.archetype - archetype-packaging - ${archetype-packaging.version} - - - - - - - maven-archetype-plugin - ${maven-archetype-plugin.version} - - - org.apache.maven.shared - maven-invoker - ${maven-invoker.version} - - - - - - default-integration-test - install - - integration-test - - - - - - - - org.eclipse.m2e - lifecycle-mapping - ${eclipse-m2e.version} - - - - - - org.codehaus.mojo - exec-maven-plugin - [1.5.0,) - - exec - - - - - false - - - - - - - - - - - - - diff --git a/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml deleted file mode 100644 index 29f8605cce..0000000000 --- a/maven-archetypes/examples/src/main/resources/META-INF/maven/archetype-metadata.xml +++ /dev/null @@ -1,44 +0,0 @@ - - - - - - - 1.8 - - - - - - src/main/java - - **/*.java - - - - - src/test/java - - **/*.java - - - - diff --git a/maven-archetypes/examples/src/main/resources/NOTICE b/maven-archetypes/examples/src/main/resources/NOTICE deleted file mode 100644 index 981fde5a9e..0000000000 --- 
a/maven-archetypes/examples/src/main/resources/NOTICE +++ /dev/null @@ -1,5 +0,0 @@ -Google Cloud Dataflow SDK for Java -Copyright 2017, Google Inc. - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml deleted file mode 100644 index 04d70618d9..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ /dev/null @@ -1,262 +0,0 @@ - - - - 4.0.0 - - ${groupId} - ${artifactId} - ${version} - - jar - - - UTF-8 - - @bigquery.version@ - @google-clients.version@ - @guava.version@ - @hamcrest.version@ - @joda.version@ - @junit.version@ - @maven-compiler-plugin.version@ - @exec-maven-plugin.version@ - @maven-jar-plugin.version@ - @maven-shade-plugin.version@ - @mockito.version@ - @pubsub.version@ - @slf4j.version@ - @surefire-plugin.version@ - - - - - ossrh.snapshots - Sonatype OSS Repository Hosting - https://oss.sonatype.org/content/repositories/snapshots/ - - false - - - true - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${targetPlatform} - ${targetPlatform} - - - - - org.apache.maven.plugins - maven-surefire-plugin - ${surefire-plugin.version} - - all - 4 - true - - - - org.apache.maven.surefire - surefire-junit47 - ${surefire-plugin.version} - - - - - - - org.apache.maven.plugins - maven-jar-plugin - ${maven-jar-plugin.version} - - - - - org.apache.maven.plugins - maven-shade-plugin - ${maven-shade-plugin.version} - - - package - - shade - - - ${project.artifactId}-bundled-${project.version} - - - *:* - - META-INF/LICENSE - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - - - - - - - - - org.codehaus.mojo - exec-maven-plugin - ${exec-maven-plugin.version} - - false - - - - - - - - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - 
@project.version@ - - - - - com.google.api-client - google-api-client - ${google-clients.version} - - - - com.google.guava - guava-jdk5 - - - - - - com.google.apis - google-api-services-bigquery - ${bigquery.version} - - - - com.google.guava - guava-jdk5 - - - - - - com.google.http-client - google-http-client - ${google-clients.version} - - - - com.google.guava - guava-jdk5 - - - - - - com.google.apis - google-api-services-pubsub - ${pubsub.version} - - - - com.google.guava - guava-jdk5 - - - - - - joda-time - joda-time - ${joda.version} - - - - com.google.guava - guava - ${guava.version} - - - - - org.slf4j - slf4j-api - ${slf4j.version} - - - - org.slf4j - slf4j-jdk14 - ${slf4j.version} - - runtime - - - - - org.hamcrest - hamcrest-all - ${hamcrest.version} - - - - junit - junit - ${junit.version} - - - - org.mockito - mockito-core - ${mockito.version} - test - - - diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java deleted file mode 100644 index 0ae31d575d..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -/** - * An example that verifies word counts in Shakespeare and includes Beam best practices. - * - *

This class, {@link DebuggingWordCount}, is the third in a series of four successively more - * detailed 'word count' examples. You may first want to take a look at {@link MinimalWordCount} - * and {@link WordCount}. After you've looked at this example, then see the - * {@link WindowedWordCount} pipeline, for introduction of additional concepts. - * - *

Basic concepts, also in the MinimalWordCount and WordCount examples: - * Reading text files; counting a PCollection; executing a Pipeline both locally - * and using a selected runner; defining DoFns. - * - *

New Concepts: - *

- *   1. Logging using SLF4J, even in a distributed environment
- *   2. Creating a custom metric (runners have varying levels of support)
- *   3. Testing your Pipeline via PAssert
- * 
- * - *

To execute this pipeline locally, specify general pipeline configuration: - *

{@code
- *   --project=YOUR_PROJECT_ID
- * }
- * 
- * - *

To change the runner, specify: - *

{@code
- *   --runner=YOUR_SELECTED_RUNNER
- * }
- * 
- * - *

The input file defaults to a public data set containing the text of of King Lear, - * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. - * - */ -public class DebuggingWordCount { - /** A DoFn that filters for a specific key based upon a regular expression. */ - public static class FilterTextFn extends DoFn, KV> { - /** - * Concept #1: The logger below uses the fully qualified class name of FilterTextFn as the - * logger. Depending on your SLF4J configuration, log statements will likely be qualified by - * this name. - * - *

Note that this is entirely standard SLF4J usage. Some runners may provide a default SLF4J - * configuration that is most appropriate for their logging integration. - */ - private static final Logger LOG = LoggerFactory.getLogger(FilterTextFn.class); - - private final Pattern filter; - public FilterTextFn(String pattern) { - filter = Pattern.compile(pattern); - } - - /** - * Concept #2: A custom metric can track values in your pipeline as it runs. Each - * runner provides varying levels of support for metrics, and may expose them - * in a dashboard, etc. - */ - private final Counter matchedWords = Metrics.counter(FilterTextFn.class, "matchedWords"); - private final Counter unmatchedWords = Metrics.counter(FilterTextFn.class, "unmatchedWords"); - - @ProcessElement - public void processElement(ProcessContext c) { - if (filter.matcher(c.element().getKey()).matches()) { - // Log at the "DEBUG" level each element that we match. When executing this pipeline - // these log lines will appear only if the log level is set to "DEBUG" or lower. - LOG.debug("Matched: " + c.element().getKey()); - matchedWords.inc(); - c.output(c.element()); - } else { - // Log at the "TRACE" level each element that is not matched. Different log levels - // can be used to control the verbosity of logging providing an effective mechanism - // to filter less important information. - LOG.trace("Did not match: " + c.element().getKey()); - unmatchedWords.inc(); - } - } - } - - /** - * Options supported by {@link DebuggingWordCount}. - * - *

Inherits standard configuration options and all options defined in - * {@link WordCount.WordCountOptions}. - */ - public interface WordCountOptions extends WordCount.WordCountOptions { - - @Description("Regex filter pattern to use in DebuggingWordCount. " - + "Only words matching this pattern will be counted.") - @Default.String("Flourish|stomach") - String getFilterPattern(); - void setFilterPattern(String value); - } - - static void runDebuggingWordCount(WordCountOptions options) { - Pipeline p = Pipeline.create(options); - - PCollection> filteredWords = - p.apply("ReadLines", TextIO.read().from(options.getInputFile())) - .apply(new WordCount.CountWords()) - .apply(ParDo.of(new FilterTextFn(options.getFilterPattern()))); - - /** - * Concept #3: PAssert is a set of convenient PTransforms in the style of - * Hamcrest's collection matchers that can be used when writing Pipeline level tests - * to validate the contents of PCollections. PAssert is best used in unit tests - * with small data sets but is demonstrated here as a teaching tool. - * - *

Below we verify that the set of filtered words matches our expected counts. Note - * that PAssert does not provide any output and that successful completion of the - * Pipeline implies that the expectations were met. Learn more at - * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test - * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test. - */ - List> expectedResults = Arrays.asList( - KV.of("Flourish", 3L), - KV.of("stomach", 1L)); - PAssert.that(filteredWords).containsInAnyOrder(expectedResults); - - p.run().waitUntilFinish(); - } - - public static void main(String[] args) { - WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() - .as(WordCountOptions.class); - - runDebuggingWordCount(options); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java deleted file mode 100644 index f1bd8bfaa8..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import java.util.Arrays; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.Filter; -import org.apache.beam.sdk.transforms.FlatMapElements; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.TypeDescriptors; - -/** - * An example that counts words in Shakespeare. - * - *

This class, {@link MinimalWordCount}, is the first in a series of four successively more - * detailed 'word count' examples. Here, for simplicity, we don't show any error-checking or - * argument processing, and focus on construction of the pipeline, which chains together the - * application of core transforms. - * - *

Next, see the {@link WordCount} pipeline, then the {@link DebuggingWordCount}, and finally the - * {@link WindowedWordCount} pipeline, for more detailed examples that introduce additional - * concepts. - * - *

Concepts: - * - *

- *   1. Reading data from text files
- *   2. Specifying 'inline' transforms
- *   3. Counting items in a PCollection
- *   4. Writing data to text files
- * 
- * - *

No arguments are required to run this pipeline. It will be executed with the DirectRunner. You - * can see the results in the output files in your current working directory, with names like - * "wordcounts-00001-of-00005. When running on a distributed service, you would use an appropriate - * file service. - */ -public class MinimalWordCount { - - public static void main(String[] args) { - - // Create a PipelineOptions object. This object lets us set various execution - // options for our pipeline, such as the runner you wish to use. This example - // will run with the DirectRunner by default, based on the class path configured - // in its dependencies. - PipelineOptions options = PipelineOptionsFactory.create(); - - // In order to run your pipeline, you need to make following runner specific changes: - // - // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner - // or FlinkRunner. - // CHANGE 2/3: Specify runner-required options. - // For BlockingDataflowRunner, set project and temp location as follows: - // DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class); - // dataflowOptions.setRunner(BlockingDataflowRunner.class); - // dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE"); - // dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY"); - // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions} - // for more details. - // options.as(FlinkPipelineOptions.class) - // .setRunner(FlinkRunner.class); - - // Create the Pipeline object with the options we defined above - Pipeline p = Pipeline.create(options); - - // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set - // of input text files. TextIO.Read returns a PCollection where each element is one line from - // the input text (a set of Shakespeare's texts). - - // This example reads a public data set consisting of the complete works of Shakespeare. 
- p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) - - // Concept #2: Apply a FlatMapElements transform the PCollection of text lines. - // This transform splits the lines in PCollection, where each element is an - // individual word in Shakespeare's collected texts. - .apply(FlatMapElements - .into(TypeDescriptors.strings()) - .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+")))) - // We use a Filter transform to avoid empty word - .apply(Filter.by((String word) -> !word.isEmpty())) - // Concept #3: Apply the Count transform to our PCollection of individual words. The Count - // transform returns a new PCollection of key/value pairs, where each key represents a - // unique word in the text. The associated value is the occurrence count for that word. - .apply(Count.perElement()) - // Apply a MapElements transform that formats our PCollection of word counts into a - // printable string, suitable for writing to an output file. - .apply(MapElements - .into(TypeDescriptors.strings()) - .via((KV wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())) - // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline. - // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of - // formatted strings) to a series of text files. 
- // - // By default, it will write to a set of files with names like wordcounts-00001-of-00005 - .apply(TextIO.write().to("wordcounts")); - - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java deleted file mode 100644 index 5798f290eb..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}; - -import java.io.IOException; -import java.util.concurrent.ThreadLocalRandom; -import ${package}.common.ExampleBigQueryTableOptions; -import ${package}.common.ExampleOptions; -import ${package}.common.WriteOneFilePerWindow; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.joda.time.Duration; -import org.joda.time.Instant; - -/** - * An example that counts words in text, and can run over either unbounded or bounded input - * collections. - * - *

This class, {@link WindowedWordCount}, is the last in a series of four successively more - * detailed 'word count' examples. First take a look at {@link MinimalWordCount}, - * {@link WordCount}, and {@link DebuggingWordCount}. - * - *

Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples: - * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally - * and using a selected runner; defining DoFns; - * user-defined PTransforms; defining PipelineOptions. - * - *

New Concepts: - *

- *   1. Unbounded and bounded pipeline input modes
- *   2. Adding timestamps to data
- *   3. Windowing
- *   4. Re-using PTransforms over windowed PCollections
- *   5. Accessing the window of an element
- *   6. Writing data to per-window text files
- * 
- * - *

By default, the examples will run with the {@code DirectRunner}. - * To change the runner, specify: - *

{@code
- *   --runner=YOUR_SELECTED_RUNNER
- * }
- * 
- * See examples/java/README.md for instructions about how to configure different runners. - * - *

To execute this pipeline locally, specify a local output file (if using the - * {@code DirectRunner}) or output prefix on a supported distributed file system. - *

{@code
- *   --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
- * }
- * - *

The input file defaults to a public data set containing the text of of King Lear, - * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. - * - *

By default, the pipeline will do fixed windowing, on 1-minute windows. You can - * change this interval by setting the {@code --windowSize} parameter, e.g. {@code --windowSize=10} - * for 10-minute windows. - * - *

The example will try to cancel the pipeline on the signal to terminate the process (CTRL-C). - */ -public class WindowedWordCount { - static final int WINDOW_SIZE = 10; // Default window duration in minutes - /** - * Concept #2: A DoFn that sets the data element timestamp. This is a silly method, just for - * this example, for the bounded data case. - * - *

Imagine that many ghosts of Shakespeare are all typing madly at the same time to recreate - * his masterworks. Each line of the corpus will get a random associated timestamp somewhere in a - * 2-hour period. - */ - static class AddTimestampFn extends DoFn { - private final Instant minTimestamp; - private final Instant maxTimestamp; - - AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) { - this.minTimestamp = minTimestamp; - this.maxTimestamp = maxTimestamp; - } - - @ProcessElement - public void processElement(@Element String element, OutputReceiver receiver) { - Instant randomTimestamp = - new Instant( - ThreadLocalRandom.current() - .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis())); - - /** - * Concept #2: Set the data element with that timestamp. - */ - receiver.outputWithTimestamp(element, new Instant(randomTimestamp)); - } - } - - /** A {@link DefaultValueFactory} that returns the current system time. */ - public static class DefaultToCurrentSystemTime implements DefaultValueFactory { - @Override - public Long create(PipelineOptions options) { - return System.currentTimeMillis(); - } - } - - /** A {@link DefaultValueFactory} that returns the minimum timestamp plus one hour. */ - public static class DefaultToMinTimestampPlusOneHour implements DefaultValueFactory { - @Override - public Long create(PipelineOptions options) { - return options.as(Options.class).getMinTimestampMillis() - + Duration.standardHours(1).getMillis(); - } - } - - /** - * Options for {@link WindowedWordCount}. - * - *

Inherits standard example configuration options, which allow specification of the - * runner, as well as the {@link WordCount.WordCountOptions} support for - * specification of the input and output files. - */ - public interface Options extends WordCount.WordCountOptions, - ExampleOptions, ExampleBigQueryTableOptions { - @Description("Fixed window duration, in minutes") - @Default.Integer(WINDOW_SIZE) - Integer getWindowSize(); - void setWindowSize(Integer value); - - @Description("Minimum randomly assigned timestamp, in milliseconds-since-epoch") - @Default.InstanceFactory(DefaultToCurrentSystemTime.class) - Long getMinTimestampMillis(); - void setMinTimestampMillis(Long value); - - @Description("Maximum randomly assigned timestamp, in milliseconds-since-epoch") - @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class) - Long getMaxTimestampMillis(); - void setMaxTimestampMillis(Long value); - - @Description("Fixed number of shards to produce per window") - Integer getNumShards(); - void setNumShards(Integer numShards); - } - - static void runWindowedWordCount(Options options) throws IOException { - final String output = options.getOutput(); - final Instant minTimestamp = new Instant(options.getMinTimestampMillis()); - final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis()); - - Pipeline pipeline = Pipeline.create(options); - - /** - * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or - * unbounded input source. - */ - PCollection input = pipeline - /** Read from the GCS file. */ - .apply(TextIO.read().from(options.getInputFile())) - // Concept #2: Add an element timestamp, using an artificial time just to show windowing. - // See AddTimestampFn for more detail on this. - .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp))); - - /** - * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1 - * minute (you can change this with a command-line option). 
See the documentation for more - * information on how fixed windows work, and for information on the other types of windowing - * available (e.g., sliding windows). - */ - PCollection windowedWords = - input.apply( - Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize())))); - - /** - * Concept #4: Re-use our existing CountWords transform that does not have knowledge of - * windows over a PCollection containing windowed values. - */ - PCollection> wordCounts = windowedWords.apply(new WordCount.CountWords()); - - /** - * Concept #5: Format the results and write to a sharded file partitioned by window, using a - * simple ParDo operation. Because there may be failures followed by retries, the - * writes must be idempotent, but the details of writing to files is elided here. - */ - wordCounts - .apply(MapElements.via(new WordCount.FormatAsTextFn())) - .apply(new WriteOneFilePerWindow(output, options.getNumShards())); - - PipelineResult result = pipeline.run(); - try { - result.waitUntilFinish(); - } catch (Exception exc) { - result.cancel(); - } - } - - public static void main(String[] args) throws IOException { - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - - runWindowedWordCount(options); - } - -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java deleted file mode 100644 index d4302ed67a..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/WordCount.java +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import ${package}.common.ExampleUtils; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Distribution; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.Validation.Required; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; - -/** - * An example that counts words in Shakespeare and includes Beam best practices. - * - *

This class, {@link WordCount}, is the second in a series of four successively more detailed - * 'word count' examples. You may first want to take a look at {@link MinimalWordCount}. - * After you've looked at this example, then see the {@link DebuggingWordCount} - * pipeline, for introduction of additional concepts. - * - *

For a detailed walkthrough of this example, see - * - * https://beam.apache.org/get-started/wordcount-example/ - * - * - *

Basic concepts, also in the MinimalWordCount example: - * Reading text files; counting a PCollection; writing to text files - * - *

New Concepts: - *

- *   1. Executing a Pipeline both locally and using the selected runner
- *   2. Using ParDo with static DoFns defined out-of-line
- *   3. Building a composite transform
- *   4. Defining your own pipeline options
- * 
- * - *

Concept #1: you can execute this pipeline either locally or using by selecting another runner. - * These are now command-line options and not hard-coded as they were in the MinimalWordCount - * example. - * - *

To change the runner, specify: - *

{@code
- *   --runner=YOUR_SELECTED_RUNNER
- * }
- * 
- * - *

To execute this pipeline, specify a local output file (if using the - * {@code DirectRunner}) or output prefix on a supported distributed file system. - *

{@code
- *   --output=[YOUR_LOCAL_FILE | YOUR_OUTPUT_PREFIX]
- * }
- * - *

The input file defaults to a public data set containing the text of of King Lear, - * by William Shakespeare. You can override it and choose your own input with {@code --inputFile}. - */ -public class WordCount { - - /** - * Concept #2: You can make your pipeline assembly code less verbose by defining your DoFns - * statically out-of-line. This DoFn tokenizes lines of text into individual words; we pass it - * to a ParDo in the pipeline. - */ - static class ExtractWordsFn extends DoFn { - private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines"); - private final Distribution lineLenDist = Metrics.distribution( - ExtractWordsFn.class, "lineLenDistro"); - - @ProcessElement - public void processElement(@Element String element, OutputReceiver receiver) { - lineLenDist.update(element.length()); - if (element.trim().isEmpty()) { - emptyLines.inc(); - } - - // Split the line into words. - String[] words = element.split(ExampleUtils.TOKENIZER_PATTERN, -1); - - // Output each word encountered into the output PCollection. - for (String word : words) { - if (!word.isEmpty()) { - receiver.output(word); - } - } - } - } - - /** A SimpleFunction that converts a Word and Count into a printable string. */ - public static class FormatAsTextFn extends SimpleFunction, String> { - @Override - public String apply(KV input) { - return input.getKey() + ": " + input.getValue(); - } - } - - /** - * A PTransform that converts a PCollection containing lines of text into a PCollection of - * formatted word counts. - * - *

Concept #3: This is a custom composite transform that bundles two transforms (ParDo and - * Count) as a reusable PTransform subclass. Using composite transforms allows for easy reuse, - * modular testing, and an improved monitoring experience. - */ - public static class CountWords extends PTransform, - PCollection>> { - @Override - public PCollection> expand(PCollection lines) { - - // Convert lines of text into individual words. - PCollection words = lines.apply( - ParDo.of(new ExtractWordsFn())); - - // Count the number of times each word occurs. - PCollection> wordCounts = words.apply(Count.perElement()); - - return wordCounts; - } - } - - /** - * Options supported by {@link WordCount}. - * - *

Concept #4: Defining your own configuration options. Here, you can add your own arguments - * to be processed by the command-line parser, and specify default values for them. You can then - * access the options values in your pipeline code. - * - *

Inherits standard configuration options. - */ - public interface WordCountOptions extends PipelineOptions { - - /** - * By default, this example reads from a public dataset containing the text of - * King Lear. Set this option to choose a different input file or glob. - */ - @Description("Path of the file to read from") - @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt") - String getInputFile(); - void setInputFile(String value); - - /** - * Set this required option to specify where to write the output. - */ - @Description("Path of the file to write to") - @Required - String getOutput(); - void setOutput(String value); - } - - static void runWordCount(WordCountOptions options) { - Pipeline p = Pipeline.create(options); - - // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the - // static FormatAsTextFn() to the ParDo transform. - p.apply("ReadLines", TextIO.read().from(options.getInputFile())) - .apply(new CountWords()) - .apply(MapElements.via(new FormatAsTextFn())) - .apply("WriteCounts", TextIO.write().to(options.getOutput())); - - p.run().waitUntilFinish(); - } - - public static void main(String[] args) { - WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation() - .as(WordCountOptions.class); - - runWordCount(options); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java deleted file mode 100644 index 57f1546e27..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import com.google.api.services.bigquery.model.TableSchema; -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure BigQuery tables in Beam examples. - * The project defaults to the project being used to run the example. - */ -public interface ExampleBigQueryTableOptions extends GcpOptions { - @Description("BigQuery dataset name") - @Default.String("beam_examples") - String getBigQueryDataset(); - void setBigQueryDataset(String dataset); - - @Description("BigQuery table name") - @Default.InstanceFactory(BigQueryTableFactory.class) - String getBigQueryTable(); - void setBigQueryTable(String table); - - @Description("BigQuery table schema") - TableSchema getBigQuerySchema(); - void setBigQuerySchema(TableSchema schema); - - /** - * Returns the job name as the default BigQuery table name. 
- */ - class BigQueryTableFactory implements DefaultValueFactory { - @Override - public String create(PipelineOptions options) { - return options.getJobName().replace('-', '_'); - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java deleted file mode 100644 index 90f935c3ce..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleOptions.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure the Beam examples. 
- */ -public interface ExampleOptions extends PipelineOptions { - @Description("Whether to keep jobs running after local process exit") - @Default.Boolean(false) - boolean getKeepJobsRunning(); - void setKeepJobsRunning(boolean keepJobsRunning); - - @Description("Number of workers to use when executing the injector pipeline") - @Default.Integer(1) - int getInjectorNumWorkers(); - void setInjectorNumWorkers(int numWorkers); -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java deleted file mode 100644 index cf142a10fd..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.common; - -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure Pub/Sub topic/subscription in Beam examples. - */ -public interface ExamplePubsubTopicAndSubscriptionOptions extends ExamplePubsubTopicOptions { - @Description("Pub/Sub subscription") - @Default.InstanceFactory(PubsubSubscriptionFactory.class) - String getPubsubSubscription(); - void setPubsubSubscription(String subscription); - - /** - * Returns a default Pub/Sub subscription based on the project and the job names. - */ - class PubsubSubscriptionFactory implements DefaultValueFactory { - @Override - public String create(PipelineOptions options) { - return "projects/" + options.as(GcpOptions.class).getProject() - + "/subscriptions/" + options.getJobName(); - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java deleted file mode 100644 index 86784b06da..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.DefaultValueFactory; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; - -/** - * Options that can be used to configure Pub/Sub topic in Beam examples. - */ -public interface ExamplePubsubTopicOptions extends GcpOptions { - @Description("Pub/Sub topic") - @Default.InstanceFactory(PubsubTopicFactory.class) - String getPubsubTopic(); - void setPubsubTopic(String topic); - - /** - * Returns a default Pub/Sub topic based on the project and the job names. - */ - class PubsubTopicFactory implements DefaultValueFactory { - @Override - public String create(PipelineOptions options) { - return "projects/" + options.as(GcpOptions.class).getProject() - + "/topics/" + options.getJobName(); - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java deleted file mode 100644 index e1159b9018..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/ExampleUtils.java +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.common; - -import com.google.api.client.googleapis.json.GoogleJsonResponseException; -import com.google.api.client.googleapis.services.AbstractGoogleClientRequest; -import com.google.api.client.http.HttpRequestInitializer; -import com.google.api.services.bigquery.Bigquery; -import com.google.api.services.bigquery.Bigquery.Datasets; -import com.google.api.services.bigquery.Bigquery.Tables; -import com.google.api.services.bigquery.model.Dataset; -import com.google.api.services.bigquery.model.DatasetReference; -import com.google.api.services.bigquery.model.Table; -import com.google.api.services.bigquery.model.TableReference; -import com.google.api.services.bigquery.model.TableSchema; -import com.google.api.services.pubsub.Pubsub; -import com.google.api.services.pubsub.model.Subscription; -import com.google.api.services.pubsub.model.Topic; -import com.google.auth.Credentials; -import com.google.auth.http.HttpCredentialsAdapter; -import com.google.cloud.hadoop.util.ChainingHttpRequestInitializer; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.common.util.concurrent.Uninterruptibles; -import java.io.IOException; -import java.util.Collection; -import java.util.List; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import org.apache.beam.sdk.PipelineResult; 
-import org.apache.beam.sdk.extensions.gcp.auth.NullCredentialInitializer; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryOptions; -import org.apache.beam.sdk.io.gcp.pubsub.PubsubOptions; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.sdk.util.BackOffUtils; -import org.apache.beam.sdk.util.FluentBackoff; -import org.apache.beam.sdk.util.RetryHttpRequestInitializer; -import org.apache.beam.sdk.util.Sleeper; -import org.apache.beam.sdk.util.Transport; -import org.joda.time.Duration; - -/** - * The utility class that sets up and tears down external resources, - * and cancels the streaming pipelines once the program terminates. - * - *

It is used to run Beam examples. - */ -public class ExampleUtils { - - private static final int SC_NOT_FOUND = 404; - - /** - * \p{L} denotes the category of Unicode letters, - * so this pattern will match on everything that is not a letter. - * - *

It is used for tokenizing strings in the wordcount examples. - */ - public static final String TOKENIZER_PATTERN = "[^\\p{L}]+"; - - private final PipelineOptions options; - private Bigquery bigQueryClient = null; - private Pubsub pubsubClient = null; - private Set pipelinesToCancel = Sets.newHashSet(); - private List pendingMessages = Lists.newArrayList(); - - /** - * Do resources and runner options setup. - */ - public ExampleUtils(PipelineOptions options) { - this.options = options; - } - - /** - * Sets up external resources that are required by the example, - * such as Pub/Sub topics and BigQuery tables. - * - * @throws IOException if there is a problem setting up the resources - */ - public void setup() throws IOException { - Sleeper sleeper = Sleeper.DEFAULT; - BackOff backOff = - FluentBackoff.DEFAULT - .withMaxRetries(3).withInitialBackoff(Duration.millis(200)).backoff(); - Throwable lastException = null; - try { - do { - try { - setupPubsub(); - setupBigQueryTable(); - return; - } catch (GoogleJsonResponseException e) { - lastException = e; - } - } while (BackOffUtils.next(sleeper, backOff)); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - // Ignore InterruptedException - } - throw new RuntimeException(lastException); - } - - /** - * Sets up the Google Cloud Pub/Sub topic. - * - *

If the topic doesn't exist, a new topic with the given name will be created. - * - * @throws IOException if there is a problem setting up the Pub/Sub topic - */ - public void setupPubsub() throws IOException { - ExamplePubsubTopicAndSubscriptionOptions pubsubOptions = - options.as(ExamplePubsubTopicAndSubscriptionOptions.class); - if (!pubsubOptions.getPubsubTopic().isEmpty()) { - pendingMessages.add("**********************Set Up Pubsub************************"); - setupPubsubTopic(pubsubOptions.getPubsubTopic()); - pendingMessages.add("The Pub/Sub topic has been set up for this example: " - + pubsubOptions.getPubsubTopic()); - - if (!pubsubOptions.getPubsubSubscription().isEmpty()) { - setupPubsubSubscription( - pubsubOptions.getPubsubTopic(), pubsubOptions.getPubsubSubscription()); - pendingMessages.add("The Pub/Sub subscription has been set up for this example: " - + pubsubOptions.getPubsubSubscription()); - } - } - } - - /** - * Sets up the BigQuery table with the given schema. - * - *

If the table already exists, the schema has to match the given one. Otherwise, the example - * will throw a RuntimeException. If the table doesn't exist, a new table with the given schema - * will be created. - * - * @throws IOException if there is a problem setting up the BigQuery table - */ - public void setupBigQueryTable() throws IOException { - ExampleBigQueryTableOptions bigQueryTableOptions = - options.as(ExampleBigQueryTableOptions.class); - if (bigQueryTableOptions.getBigQueryDataset() != null - && bigQueryTableOptions.getBigQueryTable() != null - && bigQueryTableOptions.getBigQuerySchema() != null) { - pendingMessages.add("******************Set Up Big Query Table*******************"); - setupBigQueryTable(bigQueryTableOptions.getProject(), - bigQueryTableOptions.getBigQueryDataset(), - bigQueryTableOptions.getBigQueryTable(), - bigQueryTableOptions.getBigQuerySchema()); - pendingMessages.add("The BigQuery table has been set up for this example: " - + bigQueryTableOptions.getProject() - + ":" + bigQueryTableOptions.getBigQueryDataset() - + "." + bigQueryTableOptions.getBigQueryTable()); - } - } - - /** - * Tears down external resources that can be deleted upon the example's completion. 
- */ - private void tearDown() { - pendingMessages.add("*************************Tear Down*************************"); - ExamplePubsubTopicAndSubscriptionOptions pubsubOptions = - options.as(ExamplePubsubTopicAndSubscriptionOptions.class); - if (!pubsubOptions.getPubsubTopic().isEmpty()) { - try { - deletePubsubTopic(pubsubOptions.getPubsubTopic()); - pendingMessages.add("The Pub/Sub topic has been deleted: " - + pubsubOptions.getPubsubTopic()); - } catch (IOException e) { - pendingMessages.add("Failed to delete the Pub/Sub topic : " - + pubsubOptions.getPubsubTopic()); - } - if (!pubsubOptions.getPubsubSubscription().isEmpty()) { - try { - deletePubsubSubscription(pubsubOptions.getPubsubSubscription()); - pendingMessages.add("The Pub/Sub subscription has been deleted: " - + pubsubOptions.getPubsubSubscription()); - } catch (IOException e) { - pendingMessages.add("Failed to delete the Pub/Sub subscription : " - + pubsubOptions.getPubsubSubscription()); - } - } - } - - ExampleBigQueryTableOptions bigQueryTableOptions = - options.as(ExampleBigQueryTableOptions.class); - if (bigQueryTableOptions.getBigQueryDataset() != null - && bigQueryTableOptions.getBigQueryTable() != null - && bigQueryTableOptions.getBigQuerySchema() != null) { - pendingMessages.add("The BigQuery table might contain the example's output, " - + "and it is not deleted automatically: " - + bigQueryTableOptions.getProject() - + ":" + bigQueryTableOptions.getBigQueryDataset() - + "." + bigQueryTableOptions.getBigQueryTable()); - pendingMessages.add("Please go to the Developers Console to delete it manually." - + " Otherwise, you may be charged for its usage."); - } - } - - /** - * Returns a BigQuery client builder using the specified {@link BigQueryOptions}. 
- */ - private static Bigquery.Builder newBigQueryClient(BigQueryOptions options) { - return new Bigquery.Builder(Transport.getTransport(), Transport.getJsonFactory(), - chainHttpRequestInitializer( - options.getGcpCredential(), - // Do not log 404. It clutters the output and is possibly even required by the caller. - new RetryHttpRequestInitializer(ImmutableList.of(404)))) - .setApplicationName(options.getAppName()) - .setGoogleClientRequestInitializer(options.getGoogleApiTrace()); - } - - /** - * Returns a Pubsub client builder using the specified {@link PubsubOptions}. - */ - private static Pubsub.Builder newPubsubClient(PubsubOptions options) { - return new Pubsub.Builder(Transport.getTransport(), Transport.getJsonFactory(), - chainHttpRequestInitializer( - options.getGcpCredential(), - // Do not log 404. It clutters the output and is possibly even required by the caller. - new RetryHttpRequestInitializer(ImmutableList.of(404)))) - .setRootUrl(options.getPubsubRootUrl()) - .setApplicationName(options.getAppName()) - .setGoogleClientRequestInitializer(options.getGoogleApiTrace()); - } - - private static HttpRequestInitializer chainHttpRequestInitializer( - Credentials credential, HttpRequestInitializer httpRequestInitializer) { - if (credential == null) { - return new ChainingHttpRequestInitializer( - new NullCredentialInitializer(), httpRequestInitializer); - } else { - return new ChainingHttpRequestInitializer( - new HttpCredentialsAdapter(credential), - httpRequestInitializer); - } - } - - private void setupBigQueryTable(String projectId, String datasetId, String tableId, - TableSchema schema) throws IOException { - if (bigQueryClient == null) { - bigQueryClient = newBigQueryClient(options.as(BigQueryOptions.class)).build(); - } - - Datasets datasetService = bigQueryClient.datasets(); - if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) { - Dataset newDataset = new Dataset().setDatasetReference( - new 
DatasetReference().setProjectId(projectId).setDatasetId(datasetId)); - datasetService.insert(projectId, newDataset).execute(); - } - - Tables tableService = bigQueryClient.tables(); - Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId)); - if (table == null) { - Table newTable = new Table().setSchema(schema).setTableReference( - new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId)); - tableService.insert(projectId, datasetId, newTable).execute(); - } else if (!table.getSchema().equals(schema)) { - throw new RuntimeException( - "Table exists and schemas do not match, expecting: " + schema.toPrettyString() - + ", actual: " + table.getSchema().toPrettyString()); - } - } - - private void setupPubsubTopic(String topic) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) == null) { - pubsubClient.projects().topics().create(topic, new Topic().setName(topic)).execute(); - } - } - - private void setupPubsubSubscription(String topic, String subscription) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) == null) { - Subscription subInfo = new Subscription() - .setAckDeadlineSeconds(60) - .setTopic(topic); - pubsubClient.projects().subscriptions().create(subscription, subInfo).execute(); - } - } - - /** - * Deletes the Google Cloud Pub/Sub topic. 
- * - * @throws IOException if there is a problem deleting the Pub/Sub topic - */ - private void deletePubsubTopic(String topic) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().topics().get(topic)) != null) { - pubsubClient.projects().topics().delete(topic).execute(); - } - } - - /** - * Deletes the Google Cloud Pub/Sub subscription. - * - * @throws IOException if there is a problem deleting the Pub/Sub subscription - */ - private void deletePubsubSubscription(String subscription) throws IOException { - if (pubsubClient == null) { - pubsubClient = newPubsubClient(options.as(PubsubOptions.class)).build(); - } - if (executeNullIfNotFound(pubsubClient.projects().subscriptions().get(subscription)) != null) { - pubsubClient.projects().subscriptions().delete(subscription).execute(); - } - } - - /** - * Waits for the pipeline to finish and cancels it before the program exists. - */ - public void waitToFinish(PipelineResult result) { - pipelinesToCancel.add(result); - if (!options.as(ExampleOptions.class).getKeepJobsRunning()) { - addShutdownHook(pipelinesToCancel); - } - try { - result.waitUntilFinish(); - } catch (UnsupportedOperationException e) { - // Do nothing if the given PipelineResult doesn't support waitUntilFinish(), - // such as EvaluationResults returned by DirectRunner. 
- tearDown(); - printPendingMessages(); - } catch (Exception e) { - throw new RuntimeException("Failed to wait the pipeline until finish: " + result); - } - } - - private void addShutdownHook(final Collection pipelineResults) { - Runtime.getRuntime() - .addShutdownHook( - new Thread( - () -> { - tearDown(); - printPendingMessages(); - for (PipelineResult pipelineResult : pipelineResults) { - try { - pipelineResult.cancel(); - } catch (IOException e) { - System.out.println("Failed to cancel the job."); - System.out.println(e.getMessage()); - } - } - - for (PipelineResult pipelineResult : pipelineResults) { - boolean cancellationVerified = false; - for (int retryAttempts = 6; retryAttempts > 0; retryAttempts--) { - if (pipelineResult.getState().isTerminal()) { - cancellationVerified = true; - break; - } else { - System.out.println( - "The example pipeline is still running. Verifying the cancellation."); - } - Uninterruptibles.sleepUninterruptibly(10, TimeUnit.SECONDS); - } - if (!cancellationVerified) { - System.out.println( - "Failed to verify the cancellation for job: " + pipelineResult); - } - } - })); - } - - private void printPendingMessages() { - System.out.println(); - System.out.println("***********************************************************"); - System.out.println("***********************************************************"); - for (String message : pendingMessages) { - System.out.println(message); - } - System.out.println("***********************************************************"); - System.out.println("***********************************************************"); - } - - private static T executeNullIfNotFound( - AbstractGoogleClientRequest request) throws IOException { - try { - return request.execute(); - } catch (GoogleJsonResponseException e) { - if (e.getStatusCode() == SC_NOT_FOUND) { - return null; - } else { - throw e; - } - } - } -} diff --git 
a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java deleted file mode 100644 index 9796d647b5..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/common/WriteOneFilePerWindow.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.common; - -import static com.google.common.base.MoreObjects.firstNonNull; - -import javax.annotation.Nullable; -import org.apache.beam.sdk.io.FileBasedSink; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.PaneInfo; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; -import org.joda.time.format.DateTimeFormatter; -import org.joda.time.format.ISODateTimeFormat; - -/** - * A {@link DoFn} that writes elements to files with names deterministically derived from the lower - * and upper bounds of their key (an {@link IntervalWindow}). - * - *

This is test utility code, not for end-users, so examples can be focused on their primary - * lessons. - */ -public class WriteOneFilePerWindow extends PTransform, PDone> { - private static final DateTimeFormatter FORMATTER = ISODateTimeFormat.hourMinute(); - private String filenamePrefix; - @Nullable - private Integer numShards; - - public WriteOneFilePerWindow(String filenamePrefix, Integer numShards) { - this.filenamePrefix = filenamePrefix; - this.numShards = numShards; - } - - @Override - public PDone expand(PCollection input) { - ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); - TextIO.Write write = - TextIO.write() - .to(new PerWindowFiles(resource)) - .withTempDirectory(resource.getCurrentDirectory()) - .withWindowedWrites(); - if (numShards != null) { - write = write.withNumShards(numShards); - } - return input.apply(write); - } - - /** - * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data - * being written. This always includes the shard number and the total number of shards. For - * windowed writes, it also includes the window and pane index (a sequence number assigned to each - * trigger firing). - */ - public static class PerWindowFiles extends FilenamePolicy { - - private final ResourceId baseFilename; - - public PerWindowFiles(ResourceId baseFilename) { - this.baseFilename = baseFilename; - } - - public String filenamePrefixForWindow(IntervalWindow window) { - String prefix = - baseFilename.isDirectory() ? 
"" : firstNonNull(baseFilename.getFilename(), ""); - return String.format("%s-%s-%s", - prefix, FORMATTER.print(window.start()), FORMATTER.print(window.end())); - } - - @Override - public ResourceId windowedFilename(int shardNumber, - int numShards, - BoundedWindow window, - PaneInfo paneInfo, - OutputFileHints outputFileHints) { - IntervalWindow intervalWindow = (IntervalWindow) window; - String filename = - String.format( - "%s-%s-of-%s%s", - filenamePrefixForWindow(intervalWindow), - shardNumber, - numShards, - outputFileHints.getSuggestedFilenameSuffix()); - return baseFilename - .getCurrentDirectory() - .resolve(filename, StandardResolveOptions.RESOLVE_FILE); - } - - @Override - public ResourceId unwindowedFilename( - int shardNumber, int numShards, OutputFileHints outputFileHints) { - throw new UnsupportedOperationException("Unsupported."); - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java deleted file mode 100644 index 2660cdac2b..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/GameStats.java +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game; - -import java.util.HashMap; -import java.util.Map; -import ${package}.common.ExampleUtils; -import ${package}.complete.game.utils.GameConstants; -import ${package}.complete.game.utils.WriteWindowedToBigQuery; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Combine; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.Mean; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.Sum; -import org.apache.beam.sdk.transforms.Values; -import org.apache.beam.sdk.transforms.View; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.Sessions; -import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; 
-import org.apache.beam.sdk.values.PCollectionView; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class is the fourth in a series of four pipelines that tell a story in a 'gaming' - * domain, following {@link UserScore}, {@link HourlyTeamScore}, and {@link LeaderBoard}. - * New concepts: session windows and finding session duration; use of both - * singleton and non-singleton side inputs. - * - *

This pipeline builds on the {@link LeaderBoard} functionality, and adds some "business - * intelligence" analysis: abuse detection and usage patterns. The pipeline derives the Mean user - * score sum for a window, and uses that information to identify likely spammers/robots. (The robots - * have a higher click rate than the human users). The 'robot' users are then filtered out when - * calculating the team scores. - * - *

Additionally, user sessions are tracked: that is, we find bursts of user activity using - * session windows. Then, the mean session duration information is recorded in the context of - * subsequent fixed windowing. (This could be used to tell us what games are giving us greater - * user retention). - * - *

Run {@code org.apache.beam.examples.complete.game.injector.Injector} to generate - * pubsub data for this pipeline. The {@code Injector} documentation provides more detail. - * - *

To execute this pipeline, specify the pipeline configuration like this: - *

{@code
- *   --project=YOUR_PROJECT_ID
- *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
- *   --runner=YOUR_RUNNER
- *   --dataset=YOUR-DATASET
- *   --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
- * }
- * 
- * - *

The BigQuery dataset you specify must already exist. The PubSub topic you specify should - * be the same topic to which the Injector is publishing. - */ -public class GameStats extends LeaderBoard { - - /** - * Filter out all users but those with a high clickrate, which we will consider as 'spammy' users. - * We do this by finding the mean total score per user, then using that information as a side - * input to filter out all but those user scores that are larger than - * {@code (mean * SCORE_WEIGHT)}. - */ - // [START DocInclude_AbuseDetect] - public static class CalculateSpammyUsers - extends PTransform>, PCollection>> { - private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class); - private static final double SCORE_WEIGHT = 2.5; - - @Override - public PCollection> expand(PCollection> userScores) { - - // Get the sum of scores for each user. - PCollection> sumScores = - userScores.apply("UserSum", Sum.integersPerKey()); - - // Extract the score from each element, and use it to find the global mean. - final PCollectionView globalMeanScore = - sumScores.apply(Values.create()).apply(Mean.globally().asSingletonView()); - - // Filter the user sums using the global mean. - PCollection> filtered = sumScores - .apply("ProcessAndFilter", ParDo - // use the derived mean total score as a side input - .of(new DoFn, KV>() { - private final Counter numSpammerUsers = Metrics.counter("main", "SpammerUsers"); - @ProcessElement - public void processElement(ProcessContext c) { - Integer score = c.element().getValue(); - Double gmc = c.sideInput(globalMeanScore); - if (score > (gmc * SCORE_WEIGHT)) { - LOG.info("user " + c.element().getKey() + " spammer score " + score - + " with mean " + gmc); - numSpammerUsers.inc(); - c.output(c.element()); - } - } - }).withSideInputs(globalMeanScore)); - return filtered; - } - } - // [END DocInclude_AbuseDetect] - - /** - * Calculate and output an element's session duration. 
- */ - private static class UserSessionInfoFn extends DoFn, Integer> { - @ProcessElement - public void processElement(ProcessContext c, BoundedWindow window) { - IntervalWindow w = (IntervalWindow) window; - int duration = new Duration( - w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes(); - c.output(duration); - } - } - - - /** - * Options supported by {@link GameStats}. - */ - public interface Options extends LeaderBoard.Options { - @Description("Numeric value of fixed window duration for user analysis, in minutes") - @Default.Integer(60) - Integer getFixedWindowDuration(); - void setFixedWindowDuration(Integer value); - - @Description("Numeric value of gap between user sessions, in minutes") - @Default.Integer(5) - Integer getSessionGap(); - void setSessionGap(Integer value); - - @Description("Numeric value of fixed window for finding mean of user session duration, " - + "in minutes") - @Default.Integer(30) - Integer getUserActivityWindowDuration(); - void setUserActivityWindowDuration(Integer value); - - @Description("Prefix used for the BigQuery table names") - @Default.String("game_stats") - String getGameStatsTablePrefix(); - void setGameStatsTablePrefix(String value); - } - - - /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is used to write information about team score sums. 
- */ - protected static Map>> - configureWindowedWrite() { - Map>> tableConfigure = - new HashMap<>(); - tableConfigure.put( - "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); - tableConfigure.put( - "total_score", - new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); - tableConfigure.put( - "window_start", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", - (c, w) -> { - IntervalWindow window = (IntervalWindow) w; - return GameConstants.DATE_TIME_FORMATTER.print(window.start()); - })); - tableConfigure.put( - "processing_time", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); - return tableConfigure; - } - - /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is used to write information about mean user session time. - */ - protected static Map> - configureSessionWindowWrite() { - - Map> tableConfigure = new HashMap<>(); - tableConfigure.put( - "window_start", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", - (c, w) -> { - IntervalWindow window = (IntervalWindow) w; - return GameConstants.DATE_TIME_FORMATTER.print(window.start()); - })); - tableConfigure.put( - "mean_duration", new WriteWindowedToBigQuery.FieldInfo<>("FLOAT", (c, w) -> c.element())); - return tableConfigure; - } - - - - public static void main(String[] args) throws Exception { - - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - // Enforce that this pipeline is always run in streaming mode. 
- options.setStreaming(true); - ExampleUtils exampleUtils = new ExampleUtils(options); - Pipeline pipeline = Pipeline.create(options); - - // Read Events from Pub/Sub using custom timestamps - PCollection rawEvents = pipeline - .apply(PubsubIO.readStrings() - .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) - .fromTopic(options.getTopic())) - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); - - // Extract username/score pairs from the event stream - PCollection> userEvents = - rawEvents.apply("ExtractUserScore", - MapElements - .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) - .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))); - - // Calculate the total score per user over fixed windows, and - // cumulative updates for late data. - final PCollectionView> spammersView = - userEvents - .apply( - "FixedWindowsUser", - Window.into( - FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) - - // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. - // These might be robots/spammers. - .apply("CalculateSpammyUsers", new CalculateSpammyUsers()) - // Derive a view from the collection of spammer users. It will be used as a side input - // in calculating the team score sums, below. - .apply("CreateSpammersView", View.asMap()); - - // [START DocInclude_FilterAndCalc] - // Calculate the total score per team over fixed windows, - // and emit cumulative updates for late data. Uses the side input derived above-- the set of - // suspected robots-- to filter out scores from those users from the sum. - // Write the results to BigQuery. - rawEvents - .apply( - "WindowIntoFixedWindows", - Window.into( - FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) - // Filter out the detected spammer users, using the side input derived above. 
- .apply( - "FilterOutSpammers", - ParDo.of( - new DoFn() { - @ProcessElement - public void processElement(ProcessContext c) { - // If the user is not in the spammers Map, output the data element. - if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) { - c.output(c.element()); - } - } - }) - .withSideInputs(spammersView)) - // Extract and sum teamname/score pairs from the event data. - .apply("ExtractTeamScore", new ExtractAndSumScore("team")) - // [END DocInclude_FilterAndCalc] - // Write the result to BigQuery - .apply( - "WriteTeamSums", - new WriteWindowedToBigQuery<>( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getGameStatsTablePrefix() + "_team", - configureWindowedWrite())); - - // [START DocInclude_SessionCalc] - // Detect user sessions-- that is, a burst of activity separated by a gap from further - // activity. Find and record the mean session lengths. - // This information could help the game designers track the changing user engagement - // as their set of games changes. - userEvents - .apply( - "WindowIntoSessions", - Window.>into( - Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))) - .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW)) - // For this use, we care only about the existence of the session, not any particular - // information aggregated over it, so the following is an efficient way to do that. - .apply(Combine.perKey(x -> 0)) - // Get the duration per session. - .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) - // [END DocInclude_SessionCalc] - // [START DocInclude_Rewindow] - // Re-window to process groups of session sums according to when the sessions complete. - .apply( - "WindowToExtractSessionMean", - Window.into( - FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration())))) - // Find the mean session duration in each window. 
- .apply(Mean.globally().withoutDefaults()) - // Write this info to a BigQuery table. - .apply( - "WriteAvgSessionLength", - new WriteWindowedToBigQuery<>( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getGameStatsTablePrefix() + "_sessions", - configureSessionWindowWrite())); - // [END DocInclude_Rewindow] - - - // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the - // command line. - PipelineResult result = pipeline.run(); - exampleUtils.waitToFinish(result); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java deleted file mode 100644 index 05455219fc..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/HourlyTeamScore.java +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.complete.game; - -import java.util.HashMap; -import java.util.Map; -import java.util.TimeZone; -import ${package}.complete.game.utils.GameConstants; -import ${package}.complete.game.utils.WriteToText; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Filter; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.WithTimestamps; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.KV; -import org.joda.time.DateTimeZone; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; - -/** - * This class is the second in a series of four pipelines that tell a story in a 'gaming' - * domain, following {@link UserScore}. In addition to the concepts introduced in {@link UserScore}, - * new concepts include: windowing and element timestamps; use of {@code Filter.by()}. - * - *

This pipeline processes data collected from gaming events in batch, building on {@link - * UserScore} but using fixed windows. It calculates the sum of scores per team, for each window, - * optionally allowing specification of two timestamps before and after which data is filtered out. - * This allows a model where late data collected after the intended analysis window can be included, - * and any late-arriving data prior to the beginning of the analysis window can be removed as well. - * By using windowing and adding element timestamps, we can do finer-grained analysis than with the - * {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get - * results from plays at the beginning of the batch's time period until the batch is processed. - * - *

To execute this pipeline, specify the pipeline configuration like this: - *

{@code
- *   --tempLocation=YOUR_TEMP_DIRECTORY
- *   --runner=YOUR_RUNNER
- *   --output=YOUR_OUTPUT_DIRECTORY
- *   (possibly options specific to your runner or permissions for your temp/output locations)
- * }
- * 
- * - *

Optionally include {@code --input} to specify the batch input file path. - * To indicate a time after which the data should be filtered out, include the - * {@code --stopMin} arg. E.g., {@code --stopMin=2015-10-18-23-59} indicates that any data - * timestamped after 23:59 PST on 2015-10-18 should not be included in the analysis. - * To indicate a time before which data should be filtered out, include the {@code --startMin} arg. - * If you're using the default input specified in {@link UserScore}, - * "gs://apache-beam-samples/game/gaming_data*.csv", then - * {@code --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values. - */ -public class HourlyTeamScore extends UserScore { - - private static DateTimeFormatter minFmt = - DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); - - - /** - * Options supported by {@link HourlyTeamScore}. - */ - public interface Options extends UserScore.Options { - - @Description("Numeric value of fixed window duration, in minutes") - @Default.Integer(60) - Integer getWindowDuration(); - void setWindowDuration(Integer value); - - @Description("String representation of the first minute after which to generate results," - + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST." - + "Any input data timestamped prior to that minute won't be included in the sums.") - @Default.String("1970-01-01-00-00") - String getStartMin(); - void setStartMin(String value); - - @Description("String representation of the first minute for which to not generate results," - + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST." - + "Any input data timestamped after that minute won't be included in the sums.") - @Default.String("2100-01-01-00-00") - String getStopMin(); - void setStopMin(String value); - } - - /** - * Create a map of information that describes how to write pipeline output to text. 
This map - * is passed to the {@link WriteToText} constructor to write team score sums and - * includes information about window start time. - */ - protected static Map>> - configureOutput() { - Map>> config = new HashMap<>(); - config.put("team", (c, w) -> c.element().getKey()); - config.put("total_score", (c, w) -> c.element().getValue()); - config.put( - "window_start", - (c, w) -> { - IntervalWindow window = (IntervalWindow) w; - return GameConstants.DATE_TIME_FORMATTER.print(window.start()); - }); - return config; - } - - - /** - * Run a batch pipeline to do windowed analysis of the data. - */ - // [START DocInclude_HTSMain] - public static void main(String[] args) throws Exception { - // Begin constructing a pipeline configured by commandline flags. - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - Pipeline pipeline = Pipeline.create(options); - - final Instant stopMinTimestamp = new Instant(minFmt.parseMillis(options.getStopMin())); - final Instant startMinTimestamp = new Instant(minFmt.parseMillis(options.getStartMin())); - - // Read 'gaming' events from a text file. - pipeline - .apply(TextIO.read().from(options.getInput())) - // Parse the incoming data. - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) - - // Filter out data before and after the given times so that it is not included - // in the calculations. As we collect data in batches (say, by day), the batch for the day - // that we want to analyze could potentially include some late-arriving data from the - // previous day. - // If so, we want to weed it out. Similarly, if we include data from the following day - // (to scoop up late-arriving events from the day we're analyzing), we need to weed out - // events that fall after the time period we want to analyze. 
- // [START DocInclude_HTSFilters] - .apply( - "FilterStartTime", - Filter.by( - (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) - .apply( - "FilterEndTime", - Filter.by( - (GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis())) - // [END DocInclude_HTSFilters] - - // [START DocInclude_HTSAddTsAndWindow] - // Add an element timestamp based on the event log, and apply fixed windowing. - .apply( - "AddEventTimestamps", - WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp()))) - .apply( - "FixedWindowsTeam", - Window.into(FixedWindows.of(Duration.standardMinutes(options.getWindowDuration())))) - // [END DocInclude_HTSAddTsAndWindow] - - // Extract and sum teamname/score pairs from the event data. - .apply("ExtractTeamScore", new ExtractAndSumScore("team")) - .apply( - "WriteTeamScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), true)); - - pipeline.run().waitUntilFinish(); - } - // [END DocInclude_HTSMain] - -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java deleted file mode 100644 index b5983fa789..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/LeaderBoard.java +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game; - -import com.google.common.annotations.VisibleForTesting; -import java.util.HashMap; -import java.util.Map; -import ${package}.common.ExampleOptions; -import ${package}.common.ExampleUtils; -import ${package}.complete.game.utils.GameConstants; -import ${package}.complete.game.utils.WriteToBigQuery; -import ${package}.complete.game.utils.WriteWindowedToBigQuery; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.StreamingOptions; -import org.apache.beam.sdk.options.Validation; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; -import org.apache.beam.sdk.transforms.windowing.AfterWatermark; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.GlobalWindows; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.Repeatedly; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.joda.time.Duration; -import org.joda.time.Instant; - -/** - * This class is the 
third in a series of four pipelines that tell a story in a 'gaming' domain, - * following {@link UserScore} and {@link HourlyTeamScore}. Concepts include: processing unbounded - * data using fixed windows; use of custom timestamps and event-time processing; generation of - * early/speculative results; using .accumulatingFiredPanes() to do cumulative processing of late- - * arriving data. - * - *

This pipeline processes an unbounded stream of 'game events'. The calculation of the team - * scores uses fixed windowing based on event time (the time of the game play event), not - * processing time (the time that an event is processed by the pipeline). The pipeline calculates - * the sum of scores per team, for each window. By default, the team scores are calculated using - * one-hour windows. - * - *

In contrast-- to demo another windowing option-- the user scores are calculated using a - * global window, which periodically (every ten minutes) emits cumulative user score sums. - * - *

In contrast to the previous pipelines in the series, which used static, finite input data, - * here we're using an unbounded data source, which lets us provide speculative results, and allows - * handling of late data, at much lower latency. We can use the early/speculative results to keep a - * 'leaderboard' updated in near-realtime. Our handling of late data lets us generate correct - * results, e.g. for 'team prizes'. We're now outputting window results as they're - * calculated, giving us much lower latency than with the previous batch examples. - * - *

Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector - * documentation provides more detail on how to do this. - * - *

To execute this pipeline, specify the pipeline configuration like this: - *

{@code
- *   --project=YOUR_PROJECT_ID
- *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
- *   --runner=YOUR_RUNNER
- *   --dataset=YOUR-DATASET
- *   --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
- * }
- * 
- * - *

The BigQuery dataset you specify must already exist. The PubSub topic you specify should be - * the same topic to which the Injector is publishing. - */ -public class LeaderBoard extends HourlyTeamScore { - - static final Duration FIVE_MINUTES = Duration.standardMinutes(5); - static final Duration TEN_MINUTES = Duration.standardMinutes(10); - - - /** - * Options supported by {@link LeaderBoard}. - */ - public interface Options extends HourlyTeamScore.Options, ExampleOptions, StreamingOptions { - - @Description("BigQuery Dataset to write tables to. Must already exist.") - @Validation.Required - String getDataset(); - void setDataset(String value); - - @Description("Pub/Sub topic to read from") - @Validation.Required - String getTopic(); - void setTopic(String value); - - @Description("Numeric value of fixed window duration for team analysis, in minutes") - @Default.Integer(60) - Integer getTeamWindowDuration(); - void setTeamWindowDuration(Integer value); - - @Description("Numeric value of allowed data lateness, in minutes") - @Default.Integer(120) - Integer getAllowedLateness(); - void setAllowedLateness(Integer value); - - @Description("Prefix used for the BigQuery table names") - @Default.String("leaderboard") - String getLeaderBoardTableName(); - void setLeaderBoardTableName(String value); - } - - /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is used to write team score sums and includes event timing information. 
- */ - protected static Map>> - configureWindowedTableWrite() { - - Map>> tableConfigure = - new HashMap<>(); - tableConfigure.put( - "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); - tableConfigure.put( - "total_score", - new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); - tableConfigure.put( - "window_start", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", - (c, w) -> { - IntervalWindow window = (IntervalWindow) w; - return GameConstants.DATE_TIME_FORMATTER.print(window.start()); - })); - tableConfigure.put( - "processing_time", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); - tableConfigure.put( - "timing", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", (c, w) -> c.pane().getTiming().toString())); - return tableConfigure; - } - - - /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is passed to the {@link WriteToBigQuery} constructor to write user score sums. - */ - protected static Map>> - configureBigQueryWrite() { - Map>> tableConfigure = new HashMap<>(); - tableConfigure.put( - "user", new WriteToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); - tableConfigure.put( - "total_score", - new WriteToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); - return tableConfigure; - } - - - /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is used to write user score sums. 
- */ - protected static Map>> - configureGlobalWindowBigQueryWrite() { - - Map>> tableConfigure = - configureBigQueryWrite(); - tableConfigure.put( - "processing_time", - new WriteToBigQuery.FieldInfo<>( - "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); - return tableConfigure; - } - - - public static void main(String[] args) throws Exception { - - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - // Enforce that this pipeline is always run in streaming mode. - options.setStreaming(true); - ExampleUtils exampleUtils = new ExampleUtils(options); - Pipeline pipeline = Pipeline.create(options); - - // Read game events from Pub/Sub using custom timestamps, which are extracted from the pubsub - // data elements, and parse the data. - PCollection gameEvents = pipeline - .apply(PubsubIO.readStrings() - .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) - .fromTopic(options.getTopic())) - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); - - gameEvents - .apply( - "CalculateTeamScores", - new CalculateTeamScores( - Duration.standardMinutes(options.getTeamWindowDuration()), - Duration.standardMinutes(options.getAllowedLateness()))) - // Write the results to BigQuery. - .apply( - "WriteTeamScoreSums", - new WriteWindowedToBigQuery<>( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getLeaderBoardTableName() + "_team", - configureWindowedTableWrite())); - gameEvents - .apply( - "CalculateUserScores", - new CalculateUserScores(Duration.standardMinutes(options.getAllowedLateness()))) - // Write the results to BigQuery. - .apply( - "WriteUserScoreSums", - new WriteToBigQuery<>( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getLeaderBoardTableName() + "_user", - configureGlobalWindowBigQueryWrite())); - - // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the - // command line. 
- PipelineResult result = pipeline.run(); - exampleUtils.waitToFinish(result); - } - - /** - * Calculates scores for each team within the configured window duration. - */ - // [START DocInclude_WindowAndTrigger] - // Extract team/score pairs from the event stream, using hour-long windows by default. - @VisibleForTesting - static class CalculateTeamScores - extends PTransform, PCollection>> { - private final Duration teamWindowDuration; - private final Duration allowedLateness; - - CalculateTeamScores(Duration teamWindowDuration, Duration allowedLateness) { - this.teamWindowDuration = teamWindowDuration; - this.allowedLateness = allowedLateness; - } - - @Override - public PCollection> expand(PCollection infos) { - return infos.apply("LeaderboardTeamFixedWindows", - Window.into(FixedWindows.of(teamWindowDuration)) - // We will get early (speculative) results as well as cumulative - // processing of late data. - .triggering(AfterWatermark.pastEndOfWindow() - .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane() - .plusDelayOf(FIVE_MINUTES)) - .withLateFirings(AfterProcessingTime.pastFirstElementInPane() - .plusDelayOf(TEN_MINUTES))) - .withAllowedLateness(allowedLateness) - .accumulatingFiredPanes()) - // Extract and sum teamname/score pairs from the event data. - .apply("ExtractTeamScore", new ExtractAndSumScore("team")); - } - } - // [END DocInclude_WindowAndTrigger] - - // [START DocInclude_ProcTimeTrigger] - /** - * Extract user/score pairs from the event stream using processing time, via global windowing. - * Get periodic updates on all users' running scores. 
- */ - @VisibleForTesting - static class CalculateUserScores - extends PTransform, PCollection>> { - private final Duration allowedLateness; - - CalculateUserScores(Duration allowedLateness) { - this.allowedLateness = allowedLateness; - } - - @Override - public PCollection> expand(PCollection input) { - return input.apply("LeaderboardUserGlobalWindow", - Window.into(new GlobalWindows()) - // Get periodic results every ten minutes. - .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane() - .plusDelayOf(TEN_MINUTES))) - .accumulatingFiredPanes() - .withAllowedLateness(allowedLateness)) - // Extract and sum username/score pairs from the event data. - .apply("ExtractUserScore", new ExtractAndSumScore("user")); - } - } - // [END DocInclude_ProcTimeTrigger] -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java deleted file mode 100644 index c0a7bc8e17..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/StatefulTeamScore.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game; - -import static com.google.common.base.MoreObjects.firstNonNull; - -import com.google.common.annotations.VisibleForTesting; -import java.util.HashMap; -import java.util.Map; -import ${package}.common.ExampleUtils; -import ${package}.complete.game.utils.GameConstants; -import ${package}.complete.game.utils.WriteToBigQuery.FieldInfo; -import ${package}.complete.game.utils.WriteWindowedToBigQuery; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.PipelineResult; -import org.apache.beam.sdk.coders.VarIntCoder; -import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; -import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.state.StateSpec; -import org.apache.beam.sdk.state.StateSpecs; -import org.apache.beam.sdk.state.ValueState; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.TypeDescriptor; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.joda.time.Instant; - -/** - * This class is part of a series of pipelines that tell a story in a gaming domain. Concepts - * include: stateful processing. - * - *

This pipeline processes an unbounded stream of 'game events'. It uses stateful processing to - * aggregate team scores per team and outputs team name and it's total score every time the team - * passes a new multiple of a threshold score. For example, multiples of the threshold could be the - * corresponding scores required to pass each level of the game. By default, this threshold is set - * to 5000. - * - *

Stateful processing allows us to write pipelines that output based on a runtime state (when - * a team reaches a certain score, in every 100 game events etc) without time triggers. See - * https://beam.apache.org/blog/2017/02/13/stateful-processing.html for more information on using - * stateful processing. - * - *

Run {@code injector.Injector} to generate pubsub data for this pipeline. The Injector - * documentation provides more detail on how to do this. - * - *

To execute this pipeline, specify the pipeline configuration like this: - *

{@code
- *   --project=YOUR_PROJECT_ID
- *   --tempLocation=gs://YOUR_TEMP_DIRECTORY
- *   --runner=YOUR_RUNNER
- *   --dataset=YOUR-DATASET
- *   --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC
- * }
- * 
- * - *

The BigQuery dataset you specify must already exist. The PubSub topic you specify should be - * the same topic to which the Injector is publishing. - */ -public class StatefulTeamScore extends LeaderBoard { - - /** - * Options supported by {@link StatefulTeamScore}. - */ - interface Options extends LeaderBoard.Options { - - @Description("Numeric value, multiple of which is used as threshold for outputting team score.") - @Default.Integer(5000) - Integer getThresholdScore(); - - void setThresholdScore(Integer value); - } - - /** - * Create a map of information that describes how to write pipeline output to BigQuery. This map - * is used to write team score sums. - */ - private static Map>> configureCompleteWindowedTableWrite() { - - Map>> tableConfigure = - new HashMap<>(); - tableConfigure.put( - "team", new WriteWindowedToBigQuery.FieldInfo<>("STRING", (c, w) -> c.element().getKey())); - tableConfigure.put( - "total_score", - new WriteWindowedToBigQuery.FieldInfo<>("INTEGER", (c, w) -> c.element().getValue())); - tableConfigure.put( - "processing_time", - new WriteWindowedToBigQuery.FieldInfo<>( - "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()))); - return tableConfigure; - } - - - public static void main(String[] args) throws Exception { - - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - // Enforce that this pipeline is always run in streaming mode. - options.setStreaming(true); - ExampleUtils exampleUtils = new ExampleUtils(options); - Pipeline pipeline = Pipeline.create(options); - - pipeline - // Read game events from Pub/Sub using custom timestamps, which are extracted from the - // pubsub data elements, and parse the data. - .apply( - PubsubIO.readStrings() - .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE) - .fromTopic(options.getTopic())) - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) - // Create mapping. UpdateTeamScore uses team name as key. 
- .apply( - "MapTeamAsKey", - MapElements.into( - TypeDescriptors.kvs( - TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class))) - .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo))) - // Outputs a team's score every time it passes a new multiple of the threshold. - .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore()))) - // Write the results to BigQuery. - .apply( - "WriteTeamLeaders", - new WriteWindowedToBigQuery<>( - options.as(GcpOptions.class).getProject(), - options.getDataset(), - options.getLeaderBoardTableName() + "_team_leader", - configureCompleteWindowedTableWrite())); - - // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the - // command line. - PipelineResult result = pipeline.run(); - exampleUtils.waitToFinish(result); - } - - /** - * Tracks each team's score separately in a single state cell and outputs the score every time it - * passes a new multiple of a threshold. - * - *

We use stateful {@link DoFn} because: - *

    - *
  • State is key-partitioned. Therefore, the score is calculated per team.
  • - *
  • Stateful {@link DoFn} can determine when to output based on the state. This only allows - * outputting when a team's score passes a given threshold.
  • - *
- */ - @VisibleForTesting - public static class UpdateTeamScoreFn - extends DoFn, KV> { - - private static final String TOTAL_SCORE = "totalScore"; - private final int thresholdScore; - - public UpdateTeamScoreFn(int thresholdScore) { - this.thresholdScore = thresholdScore; - } - - /** - * Describes the state for storing team score. Let's break down this statement. - * - * {@link StateSpec} configures the state cell, which is provided by a runner during pipeline - * execution. - * - * {@link org.apache.beam.sdk.transforms.DoFn.StateId} annotation assigns an identifier to the - * state, which is used to refer the state in - * {@link org.apache.beam.sdk.transforms.DoFn.ProcessElement}. - * - *

A {@link ValueState} stores single value per key and per window. Because our pipeline is - * globally windowed in this example, this {@link ValueState} is just key partitioned, with one - * score per team. Any other class that extends {@link org.apache.beam.sdk.state.State} can be - * used.

- * - *

In order to store the value, the state must be encoded. Therefore, we provide a coder, in - * this case the {@link VarIntCoder}. If the coder is not provided as in - * {@code StateSpecs.value()}, Beam's coder inference will try to provide a coder automatically. - *

- */ - @StateId(TOTAL_SCORE) - private final StateSpec> totalScoreSpec = - StateSpecs.value(VarIntCoder.of()); - - /** - * To use a state cell, annotate a parameter with - * {@link org.apache.beam.sdk.transforms.DoFn.StateId} that matches the state declaration. The - * type of the parameter should match the {@link StateSpec} type. - */ - @ProcessElement - public void processElement( - ProcessContext c, - @StateId(TOTAL_SCORE) ValueState totalScore) { - String teamName = c.element().getKey(); - GameActionInfo gInfo = c.element().getValue(); - - // ValueState cells do not contain a default value. If the state is possibly not written, make - // sure to check for null on read. - int oldTotalScore = firstNonNull(totalScore.read(), 0); - totalScore.write(oldTotalScore + gInfo.score); - - // Since there are no negative scores, the easiest way to check whether a team just passed a - // new multiple of the threshold score is to compare the quotients of dividing total scores by - // threshold before and after this aggregation. For example, if the total score was 1999, - // the new total is 2002, and the threshold is 1000, 1999 / 1000 = 1, 2002 / 1000 = 2. - // Therefore, this team passed the threshold. - if (oldTotalScore / this.thresholdScore < totalScore.read() / this.thresholdScore) { - c.output(KV.of(teamName, totalScore.read())); - } - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java deleted file mode 100644 index 3459d043f5..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/UserScore.java +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game; - -import java.util.HashMap; -import java.util.Map; -import org.apache.avro.reflect.Nullable; -import ${package}.complete.game.utils.WriteToText; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.DefaultCoder; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.metrics.Counter; -import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.Default; -import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.PipelineOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.options.Validation; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.Sum; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * This class is the first in a series of four pipelines that tell a story in a 'gaming' domain. 
- * Concepts: batch processing, reading input from text files, writing output to - * text files, using standalone DoFns, use of the sum per key transform, and use of - * Java 8 lambda syntax. - * - *

In this gaming scenario, many users play, as members of different teams, over the course of a - * day, and their actions are logged for processing. Some of the logged game events may be late- - * arriving, if users play on mobile devices and go transiently offline for a period. - * - *

This pipeline does batch processing of data collected from gaming events. It calculates the - * sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The - * batch processing will not include any late data that arrives after the day's cutoff point. - * - *

To execute this pipeline, specify the pipeline configuration like this: - *

{@code
- *   --tempLocation=YOUR_TEMP_DIRECTORY
- *   --runner=YOUR_RUNNER
- *   --output=YOUR_OUTPUT_DIRECTORY
- *   (possibly options specific to your runner or permissions for your temp/output locations)
- * }
- * 
- * - *

Optionally include the --input argument to specify a batch input file. - * See the --input default value for example batch data file, or use {@code injector.Injector} to - * generate your own batch data. - */ -public class UserScore { - - /** - * Class to hold info about a game event. - */ - @DefaultCoder(AvroCoder.class) - static class GameActionInfo { - @Nullable String user; - @Nullable String team; - @Nullable Integer score; - @Nullable Long timestamp; - - public GameActionInfo() {} - - public GameActionInfo(String user, String team, Integer score, Long timestamp) { - this.user = user; - this.team = team; - this.score = score; - this.timestamp = timestamp; - } - - public String getUser() { - return this.user; - } - public String getTeam() { - return this.team; - } - public Integer getScore() { - return this.score; - } - public String getKey(String keyname) { - if ("team".equals(keyname)) { - return this.team; - } else { // return username as default - return this.user; - } - } - public Long getTimestamp() { - return this.timestamp; - } - } - - - /** - * Parses the raw game event info into GameActionInfo objects. Each event line has the following - * format: username,teamname,score,timestamp_in_ms,readable_time - * e.g.: - * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224 - * The human-readable time string is not used here. - */ - static class ParseEventFn extends DoFn { - - // Log and count parse errors. 
- private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class); - private final Counter numParseErrors = Metrics.counter("main", "ParseErrors"); - - @ProcessElement - public void processElement(ProcessContext c) { - System.out.println("GOT " + c.element()); - String[] components = c.element().split(",", -1); - try { - String user = components[0].trim(); - String team = components[1].trim(); - Integer score = Integer.parseInt(components[2].trim()); - Long timestamp = Long.parseLong(components[3].trim()); - GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp); - c.output(gInfo); - } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) { - numParseErrors.inc(); - LOG.info("Parse error on " + c.element() + ", " + e.getMessage()); - } - } - } - - /** - * A transform to extract key/score information from GameActionInfo, and sum the scores. The - * constructor arg determines whether 'team' or 'user' info is extracted. - */ - // [START DocInclude_USExtractXform] - public static class ExtractAndSumScore - extends PTransform, PCollection>> { - - private final String field; - - ExtractAndSumScore(String field) { - this.field = field; - } - - @Override - public PCollection> expand( - PCollection gameInfo) { - - return gameInfo - .apply( - MapElements.into( - TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) - .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore()))) - .apply(Sum.integersPerKey()); - } - } - // [END DocInclude_USExtractXform] - - - /** - * Options supported by {@link UserScore}. - */ - public interface Options extends PipelineOptions { - - @Description("Path to the data file(s) containing game data.") - // The default maps to two large Google Cloud Storage files (each ~12GB) holding two subsequent - // day's worth (roughly) of data. 
- @Default.String("gs://apache-beam-samples/game/gaming_data*.csv") - String getInput(); - void setInput(String value); - - // Set this required option to specify where to write the output. - @Description("Path of the file to write to.") - @Validation.Required - String getOutput(); - void setOutput(String value); - } - - /** - * Create a map of information that describes how to write pipeline output to text. This map - * is passed to the {@link WriteToText} constructor to write user score sums. - */ - protected static Map>> - configureOutput() { - Map>> config = new HashMap<>(); - config.put("user", (c, w) -> c.element().getKey()); - config.put("total_score", (c, w) -> c.element().getValue()); - return config; - } - - /** - * Run a batch pipeline. - */ - // [START DocInclude_USMain] - public static void main(String[] args) throws Exception { - // Begin constructing a pipeline configured by commandline flags. - Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); - Pipeline pipeline = Pipeline.create(options); - - // Read events from a text file and parse them. - pipeline - .apply(TextIO.read().from(options.getInput())) - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) - // Extract and sum username/score pairs from the event data. - .apply("ExtractUserScore", new ExtractAndSumScore("user")) - .apply( - "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false)); - - // Run the batch pipeline. 
- pipeline.run().waitUntilFinish(); - } - // [END DocInclude_USMain] -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java deleted file mode 100644 index c21ec2e319..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/Injector.java +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.complete.game.injector; - -import com.google.api.services.pubsub.Pubsub; -import com.google.api.services.pubsub.model.PublishRequest; -import com.google.api.services.pubsub.model.PubsubMessage; -import com.google.common.collect.ImmutableMap; -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; -import ${package}.complete.game.utils.GameConstants; - -/** - * This is a generator that simulates usage data from a mobile game, and either publishes the data - * to a pubsub topic or writes it to a file. - * - *

The general model used by the generator is the following. There is a set of teams with team - * members. Each member is scoring points for their team. After some period, a team will dissolve - * and a new one will be created in its place. There is also a set of 'Robots', or spammer users. - * They hop from team to team. The robots are set to have a higher 'click rate' (generate more - * events) than the regular team members. - * - *

Each generated line of data has the following form: - * username,teamname,score,timestamp_in_ms,readable_time - * e.g.: - * user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224 - * - *

The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if - * specified. It takes the following arguments: - * {@code Injector project-name (topic-name|none) (filename|none)}. - * - *

To run the Injector in the mode where it publishes to PubSub, you will need to authenticate - * locally using project-based service account credentials to avoid running over PubSub - * quota. - * See https://developers.google.com/identity/protocols/application-default-credentials - * for more information on using service account credentials. Set the GOOGLE_APPLICATION_CREDENTIALS - * environment variable to point to your downloaded service account credentials before starting the - * program, e.g.: - * {@code export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/credentials-key.json}. - * If you do not do this, then your injector will only run for a few minutes on your - * 'user account' credentials before you will start to see quota error messages like: - * "Request throttled due to user QPS limit being reached", and see this exception: - * ".com.google.api.client.googleapis.json.GoogleJsonResponseException: 429 Too Many Requests". - * Once you've set up your credentials, run the Injector like this": - *

{@code
- * Injector   none
- * }
- * 
- * The pubsub topic will be created if it does not exist. - * - *

To run the injector in write-to-file-mode, set the topic name to "none" and specify the - * filename: - *

{@code
- * Injector  none 
- * }
- * 
- */ -class Injector { - private static Pubsub pubsub; - private static Random random = new Random(); - private static String topic; - private static String project; - - // QPS ranges from 800 to 1000. - private static final int MIN_QPS = 800; - private static final int QPS_RANGE = 200; - // How long to sleep, in ms, between creation of the threads that make API requests to PubSub. - private static final int THREAD_SLEEP_MS = 500; - - // Lists used to generate random team names. - // If COLORS is changed, please also make changes in - // release/src/main/groovy/MobileGamingCommands.COLORS - private static final ArrayList COLORS = - new ArrayList<>( - Arrays.asList( - "Magenta", - "AliceBlue", - "Almond", - "Amaranth", - "Amber", - "Amethyst", - "AndroidGreen", - "AntiqueBrass", - "Fuchsia", - "Ruby", - "AppleGreen", - "Apricot", - "Aqua", - "ArmyGreen", - "Asparagus", - "Auburn", - "Azure", - "Banana", - "Beige", - "Bisque", - "BarnRed", - "BattleshipGrey")); - - private static final ArrayList ANIMALS = - new ArrayList<>( - Arrays.asList( - "Echidna", - "Koala", - "Wombat", - "Marmot", - "Quokka", - "Kangaroo", - "Dingo", - "Numbat", - "Emu", - "Wallaby", - "CaneToad", - "Bilby", - "Possum", - "Cassowary", - "Kookaburra", - "Platypus", - "Bandicoot", - "Cockatoo", - "Antechinus")); - - // The list of live teams. - private static ArrayList liveTeams = new ArrayList<>(); - - // The total number of robots in the system. - private static final int NUM_ROBOTS = 20; - // Determines the chance that a team will have a robot team member. 
- private static final int ROBOT_PROBABILITY = 3; - private static final int NUM_LIVE_TEAMS = 15; - private static final int BASE_MEMBERS_PER_TEAM = 5; - private static final int MEMBERS_PER_TEAM = 15; - private static final int MAX_SCORE = 20; - private static final int LATE_DATA_RATE = 5 * 60 * 2; // Every 10 minutes - private static final int BASE_DELAY_IN_MILLIS = 5 * 60 * 1000; // 5-10 minute delay - private static final int FUZZY_DELAY_IN_MILLIS = 5 * 60 * 1000; - - // The minimum time a 'team' can live. - private static final int BASE_TEAM_EXPIRATION_TIME_IN_MINS = 20; - private static final int TEAM_EXPIRATION_TIME_IN_MINS = 20; - - - /** - * A class for holding team info: the name of the team, when it started, - * and the current team members. Teams may but need not include one robot team member. - */ - private static class TeamInfo { - String teamName; - long startTimeInMillis; - int expirationPeriod; - // The team might but need not include 1 robot. Will be non-null if so. - String robot; - int numMembers; - - private TeamInfo(String teamName, long startTimeInMillis, String robot) { - this.teamName = teamName; - this.startTimeInMillis = startTimeInMillis; - // How long until this team is dissolved. - this.expirationPeriod = random.nextInt(TEAM_EXPIRATION_TIME_IN_MINS) - + BASE_TEAM_EXPIRATION_TIME_IN_MINS; - this.robot = robot; - // Determine the number of team members. 
- numMembers = random.nextInt(MEMBERS_PER_TEAM) + BASE_MEMBERS_PER_TEAM; - } - - String getTeamName() { - return teamName; - } - String getRobot() { - return robot; - } - - long getStartTimeInMillis() { - return startTimeInMillis; - } - long getEndTimeInMillis() { - return startTimeInMillis + (expirationPeriod * 60L * 1000L); - } - String getRandomUser() { - int userNum = random.nextInt(numMembers); - return "user" + userNum + "_" + teamName; - } - - int numMembers() { - return numMembers; - } - - @Override - public String toString() { - return "(" + teamName + ", num members: " + numMembers() + ", starting at: " - + startTimeInMillis + ", expires in: " + expirationPeriod + ", robot: " + robot + ")"; - } - } - - /** Utility to grab a random element from an array of Strings. */ - private static String randomElement(ArrayList list) { - int index = random.nextInt(list.size()); - return list.get(index); - } - - /** - * Get and return a random team. If the selected team is too old w.r.t its expiration, remove - * it, replacing it with a new team. - */ - private static TeamInfo randomTeam(ArrayList list) { - int index = random.nextInt(list.size()); - TeamInfo team = list.get(index); - // If the selected team is expired, remove it and return a new team. - long currTime = System.currentTimeMillis(); - if ((team.getEndTimeInMillis() < currTime) || team.numMembers() == 0) { - System.out.println("\nteam " + team + " is too old; replacing."); - System.out.println("start time: " + team.getStartTimeInMillis() - + ", end time: " + team.getEndTimeInMillis() - + ", current time:" + currTime); - removeTeam(index); - // Add a new team in its stead. - return (addLiveTeam()); - } else { - return team; - } - } - - /** - * Create and add a team. Possibly add a robot to the team. - */ - private static synchronized TeamInfo addLiveTeam() { - String teamName = randomElement(COLORS) + randomElement(ANIMALS); - String robot = null; - // Decide if we want to add a robot to the team. 
- if (random.nextInt(ROBOT_PROBABILITY) == 0) { - robot = "Robot-" + random.nextInt(NUM_ROBOTS); - } - // Create the new team. - TeamInfo newTeam = new TeamInfo(teamName, System.currentTimeMillis(), robot); - liveTeams.add(newTeam); - System.out.println("[+" + newTeam + "]"); - return newTeam; - } - - /** - * Remove a specific team. - */ - private static synchronized void removeTeam(int teamIndex) { - TeamInfo removedTeam = liveTeams.remove(teamIndex); - System.out.println("[-" + removedTeam + "]"); - } - - /** Generate a user gaming event. */ - private static String generateEvent(Long currTime, int delayInMillis) { - TeamInfo team = randomTeam(liveTeams); - String teamName = team.getTeamName(); - String user; - final int parseErrorRate = 900000; - - String robot = team.getRobot(); - // If the team has an associated robot team member... - if (robot != null) { - // Then use that robot for the message with some probability. - // Set this probability to higher than that used to select any of the 'regular' team - // members, so that if there is a robot on the team, it has a higher click rate. - if (random.nextInt(team.numMembers() / 2) == 0) { - user = robot; - } else { - user = team.getRandomUser(); - } - } else { // No robot. - user = team.getRandomUser(); - } - String event = user + "," + teamName + "," + random.nextInt(MAX_SCORE); - // Randomly introduce occasional parse errors. - if (random.nextInt(parseErrorRate) == 0) { - System.out.println("Introducing a parse error."); - event = "THIS LINE REPRESENTS CORRUPT DATA AND WILL CAUSE A PARSE ERROR"; - } - return addTimeInfoToEvent(event, currTime, delayInMillis); - } - - /** - * Add time info to a generated gaming event. - */ - private static String addTimeInfoToEvent(String message, Long currTime, int delayInMillis) { - String eventTimeString = - Long.toString((currTime - delayInMillis) / 1000 * 1000); - // Add a (redundant) 'human-readable' date string to make the data semantics more clear. 
- String dateString = GameConstants.DATE_TIME_FORMATTER.print(currTime); - message = message + "," + eventTimeString + "," + dateString; - return message; - } - - /** - * Publish 'numMessages' arbitrary events from live users with the provided delay, to a - * PubSub topic. - */ - public static void publishData(int numMessages, int delayInMillis) - throws IOException { - List pubsubMessages = new ArrayList<>(); - - for (int i = 0; i < Math.max(1, numMessages); i++) { - Long currTime = System.currentTimeMillis(); - String message = generateEvent(currTime, delayInMillis); - PubsubMessage pubsubMessage = new PubsubMessage() - .encodeData(message.getBytes("UTF-8")); - pubsubMessage.setAttributes( - ImmutableMap.of(GameConstants.TIMESTAMP_ATTRIBUTE, - Long.toString((currTime - delayInMillis) / 1000 * 1000))); - if (delayInMillis != 0) { - System.out.println(pubsubMessage.getAttributes()); - System.out.println("late data for: " + message); - } - pubsubMessages.add(pubsubMessage); - } - - PublishRequest publishRequest = new PublishRequest(); - publishRequest.setMessages(pubsubMessages); - pubsub.projects().topics().publish(topic, publishRequest).execute(); - } - - /** - * Publish generated events to a file. 
- */ - public static void publishDataToFile(String fileName, int numMessages, int delayInMillis) - throws IOException { - PrintWriter out = new PrintWriter(new OutputStreamWriter( - new BufferedOutputStream(new FileOutputStream(fileName, true)), "UTF-8")); - - try { - for (int i = 0; i < Math.max(1, numMessages); i++) { - Long currTime = System.currentTimeMillis(); - String message = generateEvent(currTime, delayInMillis); - out.println(message); - } - } catch (Exception e) { - System.err.print("Error in writing generated events to file"); - e.printStackTrace(); - } finally { - out.flush(); - out.close(); - } - } - - - public static void main(String[] args) throws IOException, InterruptedException { - if (args.length < 3) { - System.out.println("Usage: Injector project-name (topic-name|none) (filename|none)"); - System.exit(1); - } - boolean writeToFile = false; - boolean writeToPubsub = true; - project = args[0]; - String topicName = args[1]; - String fileName = args[2]; - // The Injector writes either to a PubSub topic, or a file. It will use the PubSub topic if - // specified; otherwise, it will try to write to a file. - if ("none".equalsIgnoreCase(topicName)) { - writeToFile = true; - writeToPubsub = false; - } - if (writeToPubsub) { - // Create the PubSub client. - pubsub = InjectorUtils.getClient(); - // Create the PubSub topic as necessary. - topic = InjectorUtils.getFullyQualifiedTopicName(project, topicName); - InjectorUtils.createTopic(pubsub, topic); - System.out.println("Injecting to topic: " + topic); - } else { - if ("none".equalsIgnoreCase(fileName)) { - System.out.println("Filename not specified."); - System.exit(1); - } - System.out.println("Writing to file: " + fileName); - } - System.out.println("Starting Injector"); - - // Start off with some random live teams. - while (liveTeams.size() < NUM_LIVE_TEAMS) { - addLiveTeam(); - } - - // Publish messages at a rate determined by the QPS and Thread sleep settings. 
- for (int i = 0; true; i++) { - if (Thread.activeCount() > 10) { - System.err.println("I'm falling behind!"); - } - - // Decide if this should be a batch of late data. - final int numMessages; - final int delayInMillis; - if (i % LATE_DATA_RATE == 0) { - // Insert delayed data for one user (one message only) - delayInMillis = BASE_DELAY_IN_MILLIS + random.nextInt(FUZZY_DELAY_IN_MILLIS); - numMessages = 1; - System.out.println("DELAY(" + delayInMillis + ", " + numMessages + ")"); - } else { - System.out.print("."); - delayInMillis = 0; - numMessages = MIN_QPS + random.nextInt(QPS_RANGE); - } - - if (writeToFile) { // Won't use threading for the file write. - publishDataToFile(fileName, numMessages, delayInMillis); - } else { // Write to PubSub. - // Start a thread to inject some data. - new Thread( - () -> { - try { - publishData(numMessages, delayInMillis); - } catch (IOException e) { - System.err.println(e); - } - }) - .start(); - } - - // Wait before creating another injector thread. - Thread.sleep(THREAD_SLEEP_MS); - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java deleted file mode 100644 index 5a0cf0166e..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/InjectorUtils.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game.injector; - -import static com.google.common.base.Preconditions.checkNotNull; - -import com.google.api.client.googleapis.auth.oauth2.GoogleCredential; -import com.google.api.client.googleapis.json.GoogleJsonResponseException; -import com.google.api.client.googleapis.util.Utils; -import com.google.api.client.http.HttpRequestInitializer; -import com.google.api.client.http.HttpStatusCodes; -import com.google.api.client.http.HttpTransport; -import com.google.api.client.json.JsonFactory; -import com.google.api.services.pubsub.Pubsub; -import com.google.api.services.pubsub.PubsubScopes; -import com.google.api.services.pubsub.model.Topic; -import java.io.IOException; - -class InjectorUtils { - - private static final String APP_NAME = "injector"; - - /** - * Builds a new Pubsub client and returns it. - */ - public static Pubsub getClient(final HttpTransport httpTransport, - final JsonFactory jsonFactory) - throws IOException { - checkNotNull(httpTransport); - checkNotNull(jsonFactory); - GoogleCredential credential = - GoogleCredential.getApplicationDefault(httpTransport, jsonFactory); - if (credential.createScopedRequired()) { - credential = credential.createScoped(PubsubScopes.all()); - } - if (credential.getClientAuthentication() != null) { - System.out.println("\n***Warning! 
You are not using service account credentials to " - + "authenticate.\nYou need to use service account credentials for this example," - + "\nsince user-level credentials do not have enough pubsub quota,\nand so you will run " - + "out of PubSub quota very quickly.\nSee " - + "https://developers.google.com/identity/protocols/application-default-credentials."); - System.exit(1); - } - HttpRequestInitializer initializer = - new RetryHttpInitializerWrapper(credential); - return new Pubsub.Builder(httpTransport, jsonFactory, initializer) - .setApplicationName(APP_NAME) - .build(); - } - - /** - * Builds a new Pubsub client with default HttpTransport and - * JsonFactory and returns it. - */ - public static Pubsub getClient() throws IOException { - return getClient(Utils.getDefaultTransport(), - Utils.getDefaultJsonFactory()); - } - - - /** - * Returns the fully qualified topic name for Pub/Sub. - */ - public static String getFullyQualifiedTopicName( - final String project, final String topic) { - return String.format("projects/%s/topics/%s", project, topic); - } - - /** - * Create a topic if it doesn't exist. 
- */ - public static void createTopic(Pubsub client, String fullTopicName) - throws IOException { - System.out.println("fullTopicName " + fullTopicName); - try { - client.projects().topics().get(fullTopicName).execute(); - } catch (GoogleJsonResponseException e) { - if (e.getStatusCode() == HttpStatusCodes.STATUS_CODE_NOT_FOUND) { - Topic topic = client.projects().topics() - .create(fullTopicName, new Topic()) - .execute(); - System.out.printf("Topic %s was created.%n", topic.getName()); - } - } - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java deleted file mode 100644 index e90fbcc18e..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/injector/RetryHttpInitializerWrapper.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.complete.game.injector; - -import static com.google.common.base.Preconditions.checkNotNull; - -import com.google.api.client.auth.oauth2.Credential; -import com.google.api.client.http.HttpBackOffIOExceptionHandler; -import com.google.api.client.http.HttpBackOffUnsuccessfulResponseHandler; -import com.google.api.client.http.HttpRequest; -import com.google.api.client.http.HttpRequestInitializer; -import com.google.api.client.http.HttpUnsuccessfulResponseHandler; -import com.google.api.client.util.ExponentialBackOff; -import com.google.api.client.util.Sleeper; -import java.util.logging.Logger; - -/** - * RetryHttpInitializerWrapper will automatically retry upon RPC - * failures, preserving the auto-refresh behavior of the Google - * Credentials. - */ -public class RetryHttpInitializerWrapper implements HttpRequestInitializer { - - /** - * A private logger. - */ - private static final Logger LOG = - Logger.getLogger(RetryHttpInitializerWrapper.class.getName()); - - /** - * One minutes in miliseconds. - */ - private static final int ONEMINITUES = 60000; - - /** - * Intercepts the request for filling in the "Authorization" - * header field, as well as recovering from certain unsuccessful - * error codes wherein the Credential must refresh its token for a - * retry. - */ - private final Credential wrappedCredential; - - /** - * A sleeper; you can replace it with a mock in your test. - */ - private final Sleeper sleeper; - - /** - * A constructor. - * - * @param wrappedCredential Credential which will be wrapped and - * used for providing auth header. - */ - public RetryHttpInitializerWrapper(final Credential wrappedCredential) { - this(wrappedCredential, Sleeper.DEFAULT); - } - - /** - * A protected constructor only for testing. - * - * @param wrappedCredential Credential which will be wrapped and - * used for providing auth header. - * @param sleeper Sleeper for easy testing. 
- */ - RetryHttpInitializerWrapper( - final Credential wrappedCredential, final Sleeper sleeper) { - this.wrappedCredential = checkNotNull(wrappedCredential); - this.sleeper = sleeper; - } - - /** - * Initializes the given request. - */ - @Override - public final void initialize(final HttpRequest request) { - request.setReadTimeout(2 * ONEMINITUES); // 2 minutes read timeout - final HttpUnsuccessfulResponseHandler backoffHandler = - new HttpBackOffUnsuccessfulResponseHandler( - new ExponentialBackOff()) - .setSleeper(sleeper); - request.setInterceptor(wrappedCredential); - request.setUnsuccessfulResponseHandler( - (request1, response, supportsRetry) -> { - if (wrappedCredential.handleResponse(request1, response, supportsRetry)) { - // If credential decides it can handle it, the return code or message indicated - // something specific to authentication, and no backoff is desired. - return true; - } else if (backoffHandler.handleResponse(request1, response, supportsRetry)) { - // Otherwise, we defer to the judgement of our internal backoff handler. - LOG.info("Retrying " + request1.getUrl().toString()); - return true; - } else { - return false; - } - }); - request.setIOExceptionHandler( - new HttpBackOffIOExceptionHandler(new ExponentialBackOff()) - .setSleeper(sleeper)); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java deleted file mode 100644 index dc28ad72ea..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/GameConstants.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game.utils; - -import java.util.TimeZone; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; - -/** - * Shared constants between game series classes. - */ -public class GameConstants { - - public static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; - - public static final DateTimeFormatter DATE_TIME_FORMATTER = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java deleted file mode 100644 index d35a4ffcfc..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToBigQuery.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game.utils; - -import com.google.api.services.bigquery.model.TableFieldSchema; -import com.google.api.services.bigquery.model.TableReference; -import com.google.api.services.bigquery.model.TableRow; -import com.google.api.services.bigquery.model.TableSchema; -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; - -/** - * Generate, format, and write BigQuery table row information. Use provided information about - * the field names and types, as well as lambda functions that describe how to generate their - * values. 
- */ -public class WriteToBigQuery - extends PTransform, PDone> { - - protected String projectId; - protected String datasetId; - protected String tableName; - protected Map> fieldInfo; - - public WriteToBigQuery() { - } - - public WriteToBigQuery( - String projectId, - String datasetId, - String tableName, - Map> fieldInfo) { - this.projectId = projectId; - this.datasetId = datasetId; - this.tableName = tableName; - this.fieldInfo = fieldInfo; - } - - /** - * A {@link Serializable} function from a {@link DoFn.ProcessContext} - * and {@link BoundedWindow} to the value for that field. - */ - public interface FieldFn extends Serializable { - Object apply(DoFn.ProcessContext context, BoundedWindow window); - } - - /** Define a class to hold information about output table field definitions. */ - public static class FieldInfo implements Serializable { - // The BigQuery 'type' of the field - private String fieldType; - // A lambda function to generate the field value - private FieldFn fieldFn; - - public FieldInfo(String fieldType, - FieldFn fieldFn) { - this.fieldType = fieldType; - this.fieldFn = fieldFn; - } - - String getFieldType() { - return this.fieldType; - } - - FieldFn getFieldFn() { - return this.fieldFn; - } - } - - /** Convert each key/score pair into a BigQuery TableRow as specified by fieldFn. */ - protected class BuildRowFn extends DoFn { - - @ProcessElement - public void processElement(ProcessContext c, BoundedWindow window) { - - TableRow row = new TableRow(); - for (Map.Entry> entry : fieldInfo.entrySet()) { - String key = entry.getKey(); - FieldInfo fcnInfo = entry.getValue(); - FieldFn fcn = fcnInfo.getFieldFn(); - row.set(key, fcn.apply(c, window)); - } - c.output(row); - } - } - - /** Build the output table schema. 
*/ - protected TableSchema getSchema() { - List fields = new ArrayList<>(); - for (Map.Entry> entry : fieldInfo.entrySet()) { - String key = entry.getKey(); - FieldInfo fcnInfo = entry.getValue(); - String bqType = fcnInfo.getFieldType(); - fields.add(new TableFieldSchema().setName(key).setType(bqType)); - } - return new TableSchema().setFields(fields); - } - - @Override - public PDone expand(PCollection teamAndScore) { - teamAndScore - .apply("ConvertToRow", ParDo.of(new BuildRowFn())) - .apply( - BigQueryIO.writeTableRows() - .to(getTable(projectId, datasetId, tableName)) - .withSchema(getSchema()) - .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(WriteDisposition.WRITE_APPEND)); - return PDone.in(teamAndScore.getPipeline()); - } - - /** Utility to construct an output table reference. */ - static TableReference getTable(String projectId, String datasetId, String tableName) { - TableReference table = new TableReference(); - table.setDatasetId(datasetId); - table.setProjectId(projectId); - table.setTableId(tableName); - return table; - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java deleted file mode 100644 index 76fa3ff075..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteToText.java +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game.utils; - -import static com.google.common.base.Preconditions.checkArgument; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; -import java.util.stream.Collectors; -import org.apache.beam.sdk.io.FileBasedSink; -import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; -import org.apache.beam.sdk.io.FileBasedSink.OutputFileHints; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; -import org.apache.beam.sdk.io.fs.ResourceId; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.PaneInfo; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; -import org.joda.time.DateTimeZone; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; - -/** - * Generate, format, and write rows. Use provided information about the field names and types, as - * well as lambda functions that describe how to generate their values. 
- */ -public class WriteToText - extends PTransform, PDone> { - - private static final DateTimeFormatter formatter = - DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") - .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"))); - - protected String filenamePrefix; - protected Map> fieldFn; - protected boolean windowed; - - public WriteToText() { - } - - public WriteToText( - String filenamePrefix, - Map> fieldFn, - boolean windowed) { - this.filenamePrefix = filenamePrefix; - this.fieldFn = fieldFn; - this.windowed = windowed; - } - - /** - * A {@link Serializable} function from a {@link DoFn.ProcessContext} - * and {@link BoundedWindow} to the value for that field. - */ - public interface FieldFn extends Serializable { - Object apply(DoFn.ProcessContext context, BoundedWindow window); - } - - /** Convert each key/score pair into a row as specified by fieldFn. */ - protected class BuildRowFn extends DoFn { - - @ProcessElement - public void processElement(ProcessContext c, BoundedWindow window) { - List fields = new ArrayList<>(); - for (Map.Entry> entry : fieldFn.entrySet()) { - String key = entry.getKey(); - FieldFn fcn = entry.getValue(); - fields.add(key + ": " + fcn.apply(c, window)); - } - String result = fields.stream().collect(Collectors.joining(", ")); - c.output(result); - } - } - - /** - * A {@link DoFn} that writes elements to files with names deterministically derived from the - * lower and upper bounds of their key (an {@link IntervalWindow}). - */ - protected static class WriteOneFilePerWindow extends PTransform, PDone> { - - private final String filenamePrefix; - - public WriteOneFilePerWindow(String filenamePrefix) { - this.filenamePrefix = filenamePrefix; - } - - @Override - public PDone expand(PCollection input) { - // Verify that the input has a compatible window type. 
- checkArgument( - input.getWindowingStrategy().getWindowFn().windowCoder() == IntervalWindow.getCoder()); - - ResourceId resource = FileBasedSink.convertToFileResourceIfPossible(filenamePrefix); - - return input.apply( - TextIO.write() - .to(new PerWindowFiles(resource)) - .withTempDirectory(resource.getCurrentDirectory()) - .withWindowedWrites() - .withNumShards(3)); - } - } - - /** - * A {@link FilenamePolicy} produces a base file name for a write based on metadata about the data - * being written. This always includes the shard number and the total number of shards. For - * windowed writes, it also includes the window and pane index (a sequence number assigned to each - * trigger firing). - */ - protected static class PerWindowFiles extends FilenamePolicy { - - private final ResourceId prefix; - - public PerWindowFiles(ResourceId prefix) { - this.prefix = prefix; - } - - public String filenamePrefixForWindow(IntervalWindow window) { - String filePrefix = prefix.isDirectory() ? "" : prefix.getFilename(); - return String.format( - "%s-%s-%s", filePrefix, formatter.print(window.start()), formatter.print(window.end())); - } - - @Override - public ResourceId windowedFilename(int shardNumber, - int numShards, - BoundedWindow window, - PaneInfo paneInfo, - OutputFileHints outputFileHints) { - IntervalWindow intervalWindow = (IntervalWindow) window; - String filename = - String.format( - "%s-%s-of-%s%s", - filenamePrefixForWindow(intervalWindow), - shardNumber, - numShards, - outputFileHints.getSuggestedFilenameSuffix()); - return prefix.getCurrentDirectory().resolve(filename, StandardResolveOptions.RESOLVE_FILE); - } - - @Override - public ResourceId unwindowedFilename( - int shardNumber, int numShards, OutputFileHints outputFileHints) { - throw new UnsupportedOperationException("Unsupported."); - } - } - - @Override - public PDone expand(PCollection teamAndScore) { - if (windowed) { - teamAndScore - .apply("ConvertToRow", ParDo.of(new BuildRowFn())) - .apply(new 
WriteToText.WriteOneFilePerWindow(filenamePrefix)); - } else { - teamAndScore - .apply("ConvertToRow", ParDo.of(new BuildRowFn())) - .apply(TextIO.write().to(filenamePrefix)); - } - return PDone.in(teamAndScore.getPipeline()); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java deleted file mode 100644 index 6aef88706d..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/main/java/complete/game/utils/WriteWindowedToBigQuery.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.complete.game.utils; - -import com.google.api.services.bigquery.model.TableRow; -import java.util.Map; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.PDone; - -/** - * Generate, format, and write BigQuery table row information. Subclasses {@link WriteToBigQuery} - * to require windowing; so this subclass may be used for writes that require access to the - * context's window information. - */ -public class WriteWindowedToBigQuery - extends WriteToBigQuery { - - public WriteWindowedToBigQuery( - String projectId, String datasetId, String tableName, Map> fieldInfo) { - super(projectId, datasetId, tableName, fieldInfo); - } - - /** Convert each key/score pair into a BigQuery TableRow. 
*/ - protected class BuildRowFn extends DoFn { - @ProcessElement - public void processElement(ProcessContext c, BoundedWindow window) { - - TableRow row = new TableRow(); - for (Map.Entry> entry : fieldInfo.entrySet()) { - String key = entry.getKey(); - FieldInfo fcnInfo = entry.getValue(); - row.set(key, fcnInfo.getFieldFn().apply(c, window)); - } - c.output(row); - } - } - - @Override - public PDone expand(PCollection teamAndScore) { - teamAndScore - .apply("ConvertToRow", ParDo.of(new BuildRowFn())) - .apply(BigQueryIO.writeTableRows() - .to(getTable(projectId, datasetId, tableName)) - .withSchema(getSchema()) - .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(WriteDisposition.WRITE_APPEND)); - return PDone.in(teamAndScore.getPipeline()); - } - -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java deleted file mode 100644 index 0fbee20cb5..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/DebuggingWordCountTest.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import com.google.common.io.Files; -import java.io.File; -import java.nio.charset.StandardCharsets; -import ${package}.DebuggingWordCount.WordCountOptions; -import org.apache.beam.sdk.testing.TestPipeline; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests for {@link DebuggingWordCount}. - */ -@RunWith(JUnit4.class) -public class DebuggingWordCountTest { - @Rule public TemporaryFolder tmpFolder = new TemporaryFolder(); - - private String getFilePath(String filePath) { - if (filePath.contains(":")) { - return filePath.replace("\\", "/").split(":", -1)[1]; - } - return filePath; - } - - @Test - public void testDebuggingWordCount() throws Exception { - File inputFile = tmpFolder.newFile(); - File outputFile = tmpFolder.newFile(); - Files.write( - "stomach secret Flourish message Flourish here Flourish", - inputFile, - StandardCharsets.UTF_8); - WordCountOptions options = - TestPipeline.testingPipelineOptions().as(WordCountOptions.class); - options.setInputFile(getFilePath(inputFile.getAbsolutePath())); - options.setOutput(getFilePath(outputFile.getAbsolutePath())); - DebuggingWordCount.runDebuggingWordCount(options); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java deleted file mode 100644 index f4c8b160d7..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/MinimalWordCountTest.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import com.google.common.collect.ImmutableList; -import java.io.IOException; -import java.io.Serializable; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.StandardOpenOption; -import java.util.Arrays; -import org.apache.beam.sdk.extensions.gcp.options.GcsOptions; -import org.apache.beam.sdk.io.TextIO; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.transforms.Count; -import org.apache.beam.sdk.transforms.Filter; -import org.apache.beam.sdk.transforms.FlatMapElements; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.util.GcsUtil; -import org.apache.beam.sdk.util.gcsfs.GcsPath; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.junit.Rule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; -import org.mockito.Mockito; - -/** - * To keep {@link MinimalWordCount} simple, it is not factored or testable. This test - * file should be maintained with a copy of its code for a basic smoke test. 
- */ -@RunWith(JUnit4.class) -public class MinimalWordCountTest implements Serializable { - - @Rule - public TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); - - /** - * A basic smoke test that ensures there is no crash at pipeline construction time. - */ - @Test - public void testMinimalWordCount() throws Exception { - p.getOptions().as(GcsOptions.class).setGcsUtil(buildMockGcsUtil()); - - p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*")) - .apply( - FlatMapElements.into(TypeDescriptors.strings()) - .via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))) - .apply(Filter.by((String word) -> !word.isEmpty())) - .apply(Count.perElement()) - .apply( - MapElements.into(TypeDescriptors.strings()) - .via( - (KV wordCount) -> - wordCount.getKey() + ": " + wordCount.getValue())) - .apply(TextIO.write().to("gs://your-output-bucket/and-output-prefix")); - } - - private GcsUtil buildMockGcsUtil() throws IOException { - GcsUtil mockGcsUtil = Mockito.mock(GcsUtil.class); - - // Any request to open gets a new bogus channel - Mockito.when(mockGcsUtil.open(Mockito.any(GcsPath.class))) - .then( - invocation -> - FileChannel.open( - Files.createTempFile("channel-", ".tmp"), - StandardOpenOption.CREATE, - StandardOpenOption.DELETE_ON_CLOSE)); - - // Any request for expansion returns a list containing the original GcsPath - // This is required to pass validation that occurs in TextIO during apply() - Mockito.when(mockGcsUtil.expand(Mockito.any(GcsPath.class))) - .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0])); - - return mockGcsUtil; - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java deleted file mode 100644 index 91a1bf8edc..0000000000 --- 
a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/WordCountTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import java.util.Arrays; -import java.util.List; -import ${package}.WordCount.CountWords; -import ${package}.WordCount.ExtractWordsFn; -import ${package}.WordCount.FormatAsTextFn; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.ValidatesRunner; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.DoFnTester; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.values.PCollection; -import org.hamcrest.CoreMatchers; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests of WordCount. - */ -@RunWith(JUnit4.class) -public class WordCountTest { - - /** Example test that tests a specific {@link DoFn}. 
*/ - @Test - public void testExtractWordsFn() throws Exception { - DoFnTester extractWordsFn = - DoFnTester.of(new ExtractWordsFn()); - - Assert.assertThat(extractWordsFn.processBundle(" some input words "), - CoreMatchers.hasItems("some", "input", "words")); - Assert.assertThat(extractWordsFn.processBundle(" "), CoreMatchers.hasItems()); - Assert.assertThat(extractWordsFn.processBundle(" some ", " input", " words"), - CoreMatchers.hasItems("some", "input", "words")); - } - - static final String[] WORDS_ARRAY = new String[] { - "hi there", "hi", "hi sue bob", - "hi sue", "", "bob hi"}; - - static final List WORDS = Arrays.asList(WORDS_ARRAY); - - static final String[] COUNTS_ARRAY = new String[] { - "hi: 5", "there: 1", "sue: 2", "bob: 2"}; - - @Rule - public TestPipeline p = TestPipeline.create(); - - /** Example test that tests a PTransform by using an in-memory input and inspecting the output. */ - @Test - @Category(ValidatesRunner.class) - public void testCountWords() throws Exception { - PCollection input = p.apply(Create.of(WORDS).withCoder(StringUtf8Coder.of())); - - PCollection output = input.apply(new CountWords()) - .apply(MapElements.via(new FormatAsTextFn())); - - PAssert.that(output).containsInAnyOrder(COUNTS_ARRAY); - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java deleted file mode 100644 index 5cbdc6244f..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/GameStatsTest.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; -import ${package}.complete.game.GameStats.CalculateSpammyUsers; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.ValidatesRunner; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests of GameStats. - * Because the pipeline was designed for easy readability and explanations, it lacks good - * modularity for testing. 
See our testing documentation for better ideas: - * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ - */ -@RunWith(JUnit4.class) -public class GameStatsTest implements Serializable { - - // User scores - static final List> USER_SCORES = Arrays.asList( - KV.of("Robot-2", 66), KV.of("Robot-1", 116), KV.of("user7_AndroidGreenKookaburra", 23), - KV.of("user7_AndroidGreenKookaburra", 1), - KV.of("user19_BisqueBilby", 14), KV.of("user13_ApricotQuokka", 15), - KV.of("user18_BananaEmu", 25), KV.of("user6_AmberEchidna", 8), - KV.of("user2_AmberQuokka", 6), KV.of("user0_MagentaKangaroo", 4), - KV.of("user0_MagentaKangaroo", 3), KV.of("user2_AmberCockatoo", 13), - KV.of("user7_AlmondWallaby", 15), KV.of("user6_AmberNumbat", 11), - KV.of("user6_AmberQuokka", 4)); - - // The expected list of 'spammers'. - static final List> SPAMMERS = Arrays.asList( - KV.of("Robot-2", 66), KV.of("Robot-1", 116)); - - @Rule - public TestPipeline p = TestPipeline.create(); - - /** Test the calculation of 'spammy users'. */ - @Test - @Category(ValidatesRunner.class) - public void testCalculateSpammyUsers() throws Exception { - PCollection> input = p.apply(Create.of(USER_SCORES)); - PCollection> output = input.apply(new CalculateSpammyUsers()); - - // Check the set of spammers. 
- PAssert.that(output).containsInAnyOrder(SPAMMERS); - - p.run().waitUntilFinish(); - } - - @Test - public void testGameStatsOptions() { - PipelineOptionsFactory.as(GameStats.Options.class); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java deleted file mode 100644 index 17d459df93..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/HourlyTeamScoreTest.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package ${package}.complete.game; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; -import ${package}.complete.game.UserScore.GameActionInfo; -import ${package}.complete.game.UserScore.ParseEventFn; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.ValidatesRunner; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.Filter; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.joda.time.Instant; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests of HourlyTeamScore. - * Because the pipeline was designed for easy readability and explanations, it lacks good - * modularity for testing. 
See our testing documentation for better ideas: - * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ - */ -@RunWith(JUnit4.class) -public class HourlyTeamScoreTest implements Serializable { - - static final String[] GAME_EVENTS_ARRAY = new String[] { - "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444", - "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444", - "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444", - "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444", - "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444", - "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444", - "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444", - "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444", - // time gap... - "user0_AndroidGreenEchidna,AndroidGreenEchidna,0,1447965690000,2015-11-19 12:41:31.053", - "user0_MagentaKangaroo,MagentaKangaroo,4,1447965690000,2015-11-19 12:41:31.053", - "user2_AmberCockatoo,AmberCockatoo,13,1447965690000,2015-11-19 12:41:31.053", - "user18_BananaEmu,BananaEmu,7,1447965690000,2015-11-19 12:41:31.053", - "user3_BananaEmu,BananaEmu,17,1447965690000,2015-11-19 12:41:31.053", - "user18_BananaEmu,BananaEmu,1,1447965690000,2015-11-19 12:41:31.053", - "user18_ApricotCaneToad,ApricotCaneToad,14,1447965690000,2015-11-19 12:41:31.053" - }; - - - static final List GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY); - - - // Used to check the filtering. 
- static final KV[] FILTERED_EVENTS = new KV[] { - KV.of("user0_AndroidGreenEchidna", 0), KV.of("user0_MagentaKangaroo", 4), - KV.of("user2_AmberCockatoo", 13), - KV.of("user18_BananaEmu", 7), KV.of("user3_BananaEmu", 17), - KV.of("user18_BananaEmu", 1), KV.of("user18_ApricotCaneToad", 14) - }; - - @Rule - public TestPipeline p = TestPipeline.create(); - - /** Test the filtering. */ - @Test - @Category(ValidatesRunner.class) - public void testUserScoresFilter() throws Exception { - - final Instant startMinTimestamp = new Instant(1447965680000L); - - PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); - - PCollection> output = input - .apply("ParseGameEvent", ParDo.of(new ParseEventFn())) - - .apply("FilterStartTime", Filter.by( - (GameActionInfo gInfo) - -> gInfo.getTimestamp() > startMinTimestamp.getMillis())) - // run a map to access the fields in the result. - .apply(MapElements - .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) - .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))); - - PAssert.that(output).containsInAnyOrder(FILTERED_EVENTS); - - p.run().waitUntilFinish(); - } - - @Test - public void testUserScoreOptions() { - PipelineOptionsFactory.as(HourlyTeamScore.Options.class); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java deleted file mode 100644 index 2478c07fa8..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/LeaderBoardTest.java +++ /dev/null @@ -1,368 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package ${package}.complete.game; - -import static org.hamcrest.Matchers.hasItem; -import static org.junit.Assert.assertThat; - -import com.google.common.collect.ImmutableMap; -import java.io.Serializable; -import ${package}.complete.game.LeaderBoard.CalculateTeamScores; -import ${package}.complete.game.LeaderBoard.CalculateUserScores; -import ${package}.complete.game.UserScore.GameActionInfo; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestStream; -import org.apache.beam.sdk.transforms.PTransform; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; -import org.apache.beam.sdk.transforms.windowing.GlobalWindow; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.TimestampedValue; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.junit.Rule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests for {@link LeaderBoard}. 
- */ -@RunWith(JUnit4.class) -public class LeaderBoardTest implements Serializable { - private static final Duration ALLOWED_LATENESS = Duration.standardHours(1); - private static final Duration TEAM_WINDOW_DURATION = Duration.standardMinutes(20); - private Instant baseTime = new Instant(0); - - @Rule - public TestPipeline p = TestPipeline.create(); - /** - * Some example users, on two separate teams. - */ - private enum TestUser { - RED_ONE("scarlet", "red"), RED_TWO("burgundy", "red"), - BLUE_ONE("navy", "blue"), BLUE_TWO("sky", "blue"); - - private final String userName; - private final String teamName; - - TestUser(String userName, String teamName) { - this.userName = userName; - this.teamName = teamName; - } - - public String getUser() { - return userName; - } - - public String getTeam() { - return teamName; - } - } - - /** - * A test of the {@link CalculateTeamScores} {@link PTransform} when all of the elements arrive - * on time (ahead of the watermark). - */ - @Test - public void testTeamScoresOnTime() { - - TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) - // Start at the epoch - .advanceWatermarkTo(baseTime) - // add some elements ahead of the watermark - .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), - event(TestUser.BLUE_ONE, 2, Duration.standardMinutes(1)), - event(TestUser.RED_TWO, 3, Duration.standardSeconds(22)), - event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(3))) - // The watermark advances slightly, but not past the end of the window - .advanceWatermarkTo(baseTime.plus(Duration.standardMinutes(3))) - // Add some more on time elements - .addElements(event(TestUser.RED_ONE, 1, Duration.standardMinutes(4)), - event(TestUser.BLUE_ONE, 2, Duration.standardSeconds(270))) - // The window should close and emit an ON_TIME pane - .advanceWatermarkToInfinity(); - - PCollection> teamScores = p.apply(createEvents) - .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); - - 
String blueTeam = TestUser.BLUE_ONE.getTeam(); - String redTeam = TestUser.RED_ONE.getTeam(); - PAssert.that(teamScores) - .inOnTimePane(new IntervalWindow(baseTime, TEAM_WINDOW_DURATION)) - .containsInAnyOrder(KV.of(blueTeam, 12), KV.of(redTeam, 4)); - - p.run().waitUntilFinish(); - } - - /** - * A test of the {@link CalculateTeamScores} {@link PTransform} when all of the elements arrive - * on time, and the processing time advances far enough for speculative panes. - */ - @Test - public void testTeamScoresSpeculative() { - - TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) - // Start at the epoch - .advanceWatermarkTo(baseTime) - .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), - event(TestUser.BLUE_ONE, 2, Duration.standardMinutes(1))) - // Some time passes within the runner, which causes a speculative pane containing the blue - // team's score to be emitted - .advanceProcessingTime(Duration.standardMinutes(10)) - .addElements(event(TestUser.RED_TWO, 5, Duration.standardMinutes(3))) - // Some additional time passes and we get a speculative pane for the red team - .advanceProcessingTime(Duration.standardMinutes(12)) - .addElements(event(TestUser.BLUE_TWO, 3, Duration.standardSeconds(22))) - // More time passes and a speculative pane containing a refined value for the blue pane is - // emitted - .advanceProcessingTime(Duration.standardMinutes(10)) - // Some more events occur - .addElements(event(TestUser.RED_ONE, 4, Duration.standardMinutes(4)), - event(TestUser.BLUE_TWO, 2, Duration.standardMinutes(2))) - // The window closes and we get an ON_TIME pane that contains all of the updates - .advanceWatermarkToInfinity(); - - PCollection> teamScores = p.apply(createEvents) - .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); - - String blueTeam = TestUser.BLUE_ONE.getTeam(); - String redTeam = TestUser.RED_ONE.getTeam(); - IntervalWindow window = new IntervalWindow(baseTime, 
TEAM_WINDOW_DURATION); - // The window contains speculative panes alongside the on-time pane - PAssert.that(teamScores) - .inWindow(window) - .containsInAnyOrder(KV.of(blueTeam, 10) /* The on-time blue pane */, - KV.of(redTeam, 9) /* The on-time red pane */, - KV.of(blueTeam, 5) /* The first blue speculative pane */, - KV.of(blueTeam, 8) /* The second blue speculative pane */, - KV.of(redTeam, 5) /* The red speculative pane */); - PAssert.that(teamScores) - .inOnTimePane(window) - .containsInAnyOrder(KV.of(blueTeam, 10), KV.of(redTeam, 9)); - - p.run().waitUntilFinish(); - } - - /** - * A test where elements arrive behind the watermark (late data), but before the end of the - * window. These elements are emitted on time. - */ - @Test - public void testTeamScoresUnobservablyLate() { - - BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION); - TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) - .advanceWatermarkTo(baseTime) - .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), - event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8)), - event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), - event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5))) - .advanceWatermarkTo(baseTime.plus(TEAM_WINDOW_DURATION).minus(Duration.standardMinutes(1))) - // These events are late, but the window hasn't closed yet, so the elements are in the - // on-time pane - .addElements(event(TestUser.RED_TWO, 2, Duration.ZERO), - event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)), - event(TestUser.BLUE_TWO, 2, Duration.standardSeconds(90)), - event(TestUser.RED_TWO, 3, Duration.standardMinutes(3))) - .advanceWatermarkTo(baseTime.plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1))) - .advanceWatermarkToInfinity(); - PCollection> teamScores = p.apply(createEvents) - .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); - - String blueTeam = TestUser.BLUE_ONE.getTeam(); - String redTeam = 
TestUser.RED_ONE.getTeam(); - // The On Time pane contains the late elements that arrived before the end of the window - PAssert.that(teamScores) - .inOnTimePane(window) - .containsInAnyOrder(KV.of(redTeam, 14), KV.of(blueTeam, 13)); - - p.run().waitUntilFinish(); - } - - /** - * A test where elements arrive behind the watermark (late data) after the watermark passes the - * end of the window, but before the maximum allowed lateness. These elements are emitted in a - * late pane. - */ - @Test - public void testTeamScoresObservablyLate() { - - Instant firstWindowCloses = baseTime.plus(ALLOWED_LATENESS).plus(TEAM_WINDOW_DURATION); - TestStream createEvents = TestStream.create(AvroCoder.of(GameActionInfo.class)) - .advanceWatermarkTo(baseTime) - .addElements(event(TestUser.BLUE_ONE, 3, Duration.standardSeconds(3)), - event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(8))) - .advanceProcessingTime(Duration.standardMinutes(10)) - .advanceWatermarkTo(baseTime.plus(Duration.standardMinutes(3))) - .addElements(event(TestUser.RED_ONE, 3, Duration.standardMinutes(1)), - event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), - event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(5))) - .advanceWatermarkTo(firstWindowCloses.minus(Duration.standardMinutes(1))) - // These events are late but should still appear in a late pane - .addElements(event(TestUser.RED_TWO, 2, Duration.ZERO), - event(TestUser.RED_TWO, 5, Duration.standardMinutes(1)), - event(TestUser.RED_TWO, 3, Duration.standardMinutes(3))) - // A late refinement is emitted due to the advance in processing time, but the window has - // not yet closed because the watermark has not advanced - .advanceProcessingTime(Duration.standardMinutes(12)) - // These elements should appear in the final pane - .addElements(event(TestUser.RED_TWO, 9, Duration.standardMinutes(1)), - event(TestUser.RED_TWO, 1, Duration.standardMinutes(3))) - .advanceWatermarkToInfinity(); - - PCollection> teamScores = p.apply(createEvents) - 
.apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); - - BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION); - String blueTeam = TestUser.BLUE_ONE.getTeam(); - String redTeam = TestUser.RED_ONE.getTeam(); - PAssert.that(teamScores) - .inWindow(window) - .satisfies( - input -> { - // The final sums need not exist in the same pane, but must appear in the output - // PCollection - assertThat(input, hasItem(KV.of(blueTeam, 11))); - assertThat(input, hasItem(KV.of(redTeam, 27))); - return null; - }); - PAssert.thatMap(teamScores) - // The closing behavior of CalculateTeamScores precludes an inFinalPane matcher - .inOnTimePane(window) - .isEqualTo(ImmutableMap.builder().put(redTeam, 7) - .put(blueTeam, 11) - .build()); - - // No final pane is emitted for the blue team, as all of their updates have been taken into - // account in earlier panes - PAssert.that(teamScores).inFinalPane(window).containsInAnyOrder(KV.of(redTeam, 27)); - - p.run().waitUntilFinish(); - } - - /** - * A test where elements arrive beyond the maximum allowed lateness. These elements are dropped - * within {@link CalculateTeamScores} and do not impact the final result. 
- */ - @Test - public void testTeamScoresDroppablyLate() { - - BoundedWindow window = new IntervalWindow(baseTime, TEAM_WINDOW_DURATION); - TestStream infos = TestStream.create(AvroCoder.of(GameActionInfo.class)) - .addElements(event(TestUser.BLUE_ONE, 12, Duration.ZERO), - event(TestUser.RED_ONE, 3, Duration.ZERO)) - .advanceWatermarkTo(window.maxTimestamp()) - .addElements(event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), - event(TestUser.BLUE_TWO, 3, Duration.ZERO), - event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3))) - // Move the watermark to the end of the window to output on time - .advanceWatermarkTo(baseTime.plus(TEAM_WINDOW_DURATION)) - // Move the watermark past the end of the allowed lateness plus the end of the window - .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS) - .plus(TEAM_WINDOW_DURATION).plus(Duration.standardMinutes(1))) - // These elements within the expired window are droppably late, and will not appear in the - // output - .addElements( - event(TestUser.BLUE_TWO, 3, TEAM_WINDOW_DURATION.minus(Duration.standardSeconds(5))), - event(TestUser.RED_ONE, 7, Duration.standardMinutes(4))) - .advanceWatermarkToInfinity(); - PCollection> teamScores = p.apply(infos) - .apply(new CalculateTeamScores(TEAM_WINDOW_DURATION, ALLOWED_LATENESS)); - - String blueTeam = TestUser.BLUE_ONE.getTeam(); - String redTeam = TestUser.RED_ONE.getTeam(); - // Only one on-time pane and no late panes should be emitted - PAssert.that(teamScores) - .inWindow(window) - .containsInAnyOrder(KV.of(redTeam, 7), KV.of(blueTeam, 18)); - // No elements are added before the watermark passes the end of the window plus the allowed - // lateness, so no refinement should be emitted - PAssert.that(teamScores).inFinalPane(window).empty(); - - p.run().waitUntilFinish(); - } - - /** - * A test where elements arrive both on-time and late in {@link CalculateUserScores}, which emits - * output into the {@link GlobalWindow}. 
All elements that arrive should be taken into account, - * even if they arrive later than the maximum allowed lateness. - */ - @Test - public void testUserScore() { - - TestStream infos = - TestStream.create(AvroCoder.of(GameActionInfo.class)) - .addElements( - event(TestUser.BLUE_ONE, 12, Duration.ZERO), - event(TestUser.RED_ONE, 3, Duration.ZERO)) - .advanceProcessingTime(Duration.standardMinutes(7)) - .addElements( - event(TestUser.RED_ONE, 4, Duration.standardMinutes(2)), - event(TestUser.BLUE_TWO, 3, Duration.ZERO), - event(TestUser.BLUE_ONE, 3, Duration.standardMinutes(3))) - .advanceProcessingTime(Duration.standardMinutes(5)) - .advanceWatermarkTo(baseTime.plus(ALLOWED_LATENESS).plus(Duration.standardHours(12))) - // Late elements are always observable within the global window - they arrive before - // the window closes, so they will appear in a pane, even if they arrive after the - // allowed lateness, and are taken into account alongside on-time elements - .addElements( - event(TestUser.RED_ONE, 3, Duration.standardMinutes(7)), - event(TestUser.RED_ONE, 2, (ALLOWED_LATENESS).plus(Duration.standardHours(13)))) - .advanceProcessingTime(Duration.standardMinutes(6)) - .addElements(event(TestUser.BLUE_TWO, 5, Duration.standardMinutes(12))) - .advanceProcessingTime(Duration.standardMinutes(20)) - .advanceWatermarkToInfinity(); - - PCollection> userScores = - p.apply(infos).apply(new CalculateUserScores(ALLOWED_LATENESS)); - - // User scores are emitted in speculative panes in the Global Window - this matcher choice - // ensures that panes emitted by the watermark advancing to positive infinity are not included, - // as that will not occur outside of tests - PAssert.that(userScores) - .inEarlyGlobalWindowPanes() - .containsInAnyOrder(KV.of(TestUser.BLUE_ONE.getUser(), 15), - KV.of(TestUser.RED_ONE.getUser(), 7), - KV.of(TestUser.RED_ONE.getUser(), 12), - KV.of(TestUser.BLUE_TWO.getUser(), 3), - KV.of(TestUser.BLUE_TWO.getUser(), 8)); - - 
p.run().waitUntilFinish(); - } - - @Test - public void testLeaderBoardOptions() { - PipelineOptionsFactory.as(LeaderBoard.Options.class); - } - - private TimestampedValue event( - TestUser user, - int score, - Duration baseTimeOffset) { - return TimestampedValue.of(new GameActionInfo(user.getUser(), - user.getTeam(), - score, - baseTime.plus(baseTimeOffset).getMillis()), baseTime.plus(baseTimeOffset)); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java deleted file mode 100644 index c80c57f4fc..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/StatefulTeamScoreTest.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package ${package}.complete.game; - -import ${package}.complete.game.StatefulTeamScore.UpdateTeamScoreFn; -import ${package}.complete.game.UserScore.GameActionInfo; -import org.apache.beam.sdk.coders.AvroCoder; -import org.apache.beam.sdk.coders.KvCoder; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestStream; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.windowing.FixedWindows; -import org.apache.beam.sdk.transforms.windowing.GlobalWindow; -import org.apache.beam.sdk.transforms.windowing.IntervalWindow; -import org.apache.beam.sdk.transforms.windowing.Window; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.TimestampedValue; -import org.joda.time.Duration; -import org.joda.time.Instant; -import org.junit.Rule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests for {@link StatefulTeamScore}. - */ -@RunWith(JUnit4.class) -public class StatefulTeamScoreTest { - - private Instant baseTime = new Instant(0); - - @Rule - public TestPipeline p = TestPipeline.create(); - - /** - * Some example users, on two separate teams. - */ - private enum TestUser { - RED_ONE("scarlet", "red"), RED_TWO("burgundy", "red"), - BLUE_ONE("navy", "blue"), BLUE_TWO("sky", "blue"); - - private final String userName; - private final String teamName; - - TestUser(String userName, String teamName) { - this.userName = userName; - this.teamName = teamName; - } - - public String getUser() { - return userName; - } - - public String getTeam() { - return teamName; - } - } - - /** - * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs - * correctly for one team. 
- */ - @Test - public void testScoreUpdatesOneTeam() { - - TestStream> createEvents = TestStream.create(KvCoder.of( - StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class))) - .advanceWatermarkTo(baseTime) - .addElements( - event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)), - event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)), - event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)), - event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)), - event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)) - ) - .advanceWatermarkToInfinity(); - - PCollection> teamScores = p.apply(createEvents) - .apply(ParDo.of(new UpdateTeamScoreFn(100))); - - String redTeam = TestUser.RED_ONE.getTeam(); - - PAssert.that(teamScores) - .inWindow(GlobalWindow.INSTANCE) - .containsInAnyOrder( - KV.of(redTeam, 100), - KV.of(redTeam, 200), - KV.of(redTeam, 401) - ); - - p.run().waitUntilFinish(); - } - - /** - * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs - * correctly for multiple teams. 
- */ - @Test - public void testScoreUpdatesPerTeam() { - - TestStream> createEvents = TestStream.create(KvCoder.of( - StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class))) - .advanceWatermarkTo(baseTime) - .addElements( - event(TestUser.RED_ONE, 50, Duration.standardSeconds(10)), - event(TestUser.RED_TWO, 50, Duration.standardSeconds(20)), - event(TestUser.BLUE_ONE, 70, Duration.standardSeconds(30)), - event(TestUser.BLUE_TWO, 80, Duration.standardSeconds(40)), - event(TestUser.BLUE_TWO, 50, Duration.standardSeconds(50)) - ) - .advanceWatermarkToInfinity(); - - PCollection> teamScores = p.apply(createEvents) - .apply(ParDo.of(new UpdateTeamScoreFn(100))); - - String redTeam = TestUser.RED_ONE.getTeam(); - String blueTeam = TestUser.BLUE_ONE.getTeam(); - - PAssert.that(teamScores) - .inWindow(GlobalWindow.INSTANCE) - .containsInAnyOrder( - KV.of(redTeam, 100), - KV.of(blueTeam, 150), - KV.of(blueTeam, 200) - ); - - p.run().waitUntilFinish(); - } - - /** - * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs - * correctly per window and per key. 
- */ - @Test - public void testScoreUpdatesPerWindow() { - - TestStream> createEvents = TestStream.create(KvCoder.of( - StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class))) - .advanceWatermarkTo(baseTime) - .addElements( - event(TestUser.RED_ONE, 50, Duration.standardMinutes(1)), - event(TestUser.RED_TWO, 50, Duration.standardMinutes(2)), - event(TestUser.RED_ONE, 50, Duration.standardMinutes(3)), - event(TestUser.RED_ONE, 60, Duration.standardMinutes(6)), - event(TestUser.RED_TWO, 60, Duration.standardMinutes(7)) - ) - .advanceWatermarkToInfinity(); - - Duration teamWindowDuration = Duration.standardMinutes(5); - - PCollection> teamScores = p - .apply(createEvents) - .apply(Window.>into(FixedWindows.of(teamWindowDuration))) - .apply(ParDo.of(new UpdateTeamScoreFn(100))); - - String redTeam = TestUser.RED_ONE.getTeam(); - String blueTeam = TestUser.BLUE_ONE.getTeam(); - - IntervalWindow window1 = new IntervalWindow(baseTime, teamWindowDuration); - IntervalWindow window2 = new IntervalWindow(window1.end(), teamWindowDuration); - - PAssert.that(teamScores) - .inWindow(window1) - .containsInAnyOrder( - KV.of(redTeam, 100) - ); - - PAssert.that(teamScores) - .inWindow(window2) - .containsInAnyOrder( - KV.of(redTeam, 120) - ); - - p.run().waitUntilFinish(); - } - - private TimestampedValue> event( - TestUser user, - int score, - Duration baseTimeOffset) { - return TimestampedValue.of(KV.of(user.getTeam(), new GameActionInfo(user.getUser(), - user.getTeam(), - score, - baseTime.plus(baseTimeOffset).getMillis())), baseTime.plus(baseTimeOffset)); - } -} diff --git a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java b/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java deleted file mode 100644 index b691a0cbd5..0000000000 --- a/maven-archetypes/examples/src/main/resources/archetype-resources/src/test/java/complete/game/UserScoreTest.java +++ 
/dev/null @@ -1,154 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}.complete.game; - -import java.io.Serializable; -import java.util.Arrays; -import java.util.List; -import ${package}.complete.game.UserScore.ExtractAndSumScore; -import ${package}.complete.game.UserScore.GameActionInfo; -import ${package}.complete.game.UserScore.ParseEventFn; -import org.apache.beam.sdk.coders.StringUtf8Coder; -import org.apache.beam.sdk.testing.PAssert; -import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.ValidatesRunner; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.DoFnTester; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.values.KV; -import org.apache.beam.sdk.values.PCollection; -import org.apache.beam.sdk.values.TypeDescriptors; -import org.junit.Assert; -import org.junit.Rule; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests of UserScore. 
- */ -@RunWith(JUnit4.class) -public class UserScoreTest implements Serializable { - - static final String[] GAME_EVENTS_ARRAY = new String[] { - "user0_MagentaKangaroo,MagentaKangaroo,3,1447955630000,2015-11-19 09:53:53.444", - "user13_ApricotQuokka,ApricotQuokka,15,1447955630000,2015-11-19 09:53:53.444", - "user6_AmberNumbat,AmberNumbat,11,1447955630000,2015-11-19 09:53:53.444", - "user7_AlmondWallaby,AlmondWallaby,15,1447955630000,2015-11-19 09:53:53.444", - "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,12,1447955630000,2015-11-19 09:53:53.444", - "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444", - "user7_AndroidGreenKookaburra,AndroidGreenKookaburra,11,1447955630000,2015-11-19 09:53:53.444", - "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444", - "user19_BisqueBilby,BisqueBilby,6,1447955630000,2015-11-19 09:53:53.444", - "user19_BisqueBilby,BisqueBilby,8,1447955630000,2015-11-19 09:53:53.444" - }; - - static final String[] GAME_EVENTS_ARRAY2 = new String[] { - "user6_AliceBlueDingo,AliceBlueDingo,4,xxxxxxx,2015-11-19 09:53:53.444", - "THIS IS A PARSE ERROR,2015-11-19 09:53:53.444", - "user13_BisqueBilby,BisqueBilby,xxx,1447955630000,2015-11-19 09:53:53.444" - }; - - static final List GAME_EVENTS = Arrays.asList(GAME_EVENTS_ARRAY); - static final List GAME_EVENTS2 = Arrays.asList(GAME_EVENTS_ARRAY2); - - static final List> USER_SUMS = Arrays.asList( - KV.of("user0_MagentaKangaroo", 3), KV.of("user13_ApricotQuokka", 15), - KV.of("user6_AmberNumbat", 11), KV.of("user7_AlmondWallaby", 15), - KV.of("user7_AndroidGreenKookaburra", 23), - KV.of("user19_BisqueBilby", 14)); - - static final List> TEAM_SUMS = Arrays.asList( - KV.of("MagentaKangaroo", 3), KV.of("ApricotQuokka", 15), - KV.of("AmberNumbat", 11), KV.of("AlmondWallaby", 15), - KV.of("AndroidGreenKookaburra", 23), - KV.of("BisqueBilby", 14)); - - @Rule - public TestPipeline p = TestPipeline.create(); - - /** Test the {@link ParseEventFn} {@link org.apache.beam.sdk.transforms.DoFn}. 
*/ - @Test - public void testParseEventFn() throws Exception { - DoFnTester parseEventFn = - DoFnTester.of(new ParseEventFn()); - - List results = parseEventFn.processBundle(GAME_EVENTS_ARRAY); - Assert.assertEquals(8, results.size()); - Assert.assertEquals("user0_MagentaKangaroo", results.get(0).getUser()); - Assert.assertEquals("MagentaKangaroo", results.get(0).getTeam()); - Assert.assertEquals(Integer.valueOf(3), results.get(0).getScore()); - } - - /** Tests ExtractAndSumScore("user"). */ - @Test - @Category(ValidatesRunner.class) - public void testUserScoreSums() throws Exception { - - PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); - - PCollection> output = input - .apply(ParDo.of(new ParseEventFn())) - // Extract and sum username/score pairs from the event data. - .apply("ExtractUserScore", new ExtractAndSumScore("user")); - - // Check the user score sums. - PAssert.that(output).containsInAnyOrder(USER_SUMS); - - p.run().waitUntilFinish(); - } - - /** Tests ExtractAndSumScore("team"). */ - @Test - @Category(ValidatesRunner.class) - public void testTeamScoreSums() throws Exception { - - PCollection input = p.apply(Create.of(GAME_EVENTS).withCoder(StringUtf8Coder.of())); - - PCollection> output = input - .apply(ParDo.of(new ParseEventFn())) - // Extract and sum teamname/score pairs from the event data. - .apply("ExtractTeamScore", new ExtractAndSumScore("team")); - - // Check the team score sums. - PAssert.that(output).containsInAnyOrder(TEAM_SUMS); - - p.run().waitUntilFinish(); - } - - /** Test that bad input data is dropped appropriately. 
*/ - @Test - @Category(ValidatesRunner.class) - public void testUserScoresBadInput() throws Exception { - - PCollection input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of())); - - PCollection> extract = input - .apply(ParDo.of(new ParseEventFn())) - .apply( - MapElements - .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) - .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))); - - PAssert.that(extract).empty(); - - p.run().waitUntilFinish(); - } -} diff --git a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties deleted file mode 100644 index b0195b3f16..0000000000 --- a/maven-archetypes/examples/src/test/resources/projects/basic/archetype.properties +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. 
- -package=it.pkg -version=0.1 -groupId=archetype.it -artifactId=basic -targetPlatform=1.8 diff --git a/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt b/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt deleted file mode 100644 index 0b5987362f..0000000000 --- a/maven-archetypes/examples/src/test/resources/projects/basic/goal.txt +++ /dev/null @@ -1 +0,0 @@ -verify diff --git a/maven-archetypes/pom.xml b/maven-archetypes/pom.xml deleted file mode 100644 index f995770ea6..0000000000 --- a/maven-archetypes/pom.xml +++ /dev/null @@ -1,92 +0,0 @@ - - - - - 4.0.0 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-parent - 2.6.0-SNAPSHOT - ../pom.xml - - - google-cloud-dataflow-java-archetypes-parent - pom - - Google Cloud Dataflow SDK for Java - Maven Archetypes - - - starter - examples - - - - - - - src/main/resources - true - - archetype-resources/pom.xml - - - - - src/main/resources - false - - archetype-resources/pom.xml - - - - - - - - org.apache.maven.plugins - maven-resources-plugin - - - @ - - false - - - - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - default-jar - none - - - default-test-jar - none - - - - - - diff --git a/maven-archetypes/starter/pom.xml b/maven-archetypes/starter/pom.xml deleted file mode 100644 index 643cfa4096..0000000000 --- a/maven-archetypes/starter/pom.xml +++ /dev/null @@ -1,93 +0,0 @@ - - - - - 4.0.0 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-archetypes-parent - 2.6.0-SNAPSHOT - ../pom.xml - - - google-cloud-dataflow-java-archetypes-starter - Google Cloud Dataflow SDK for Java - Starter Archetype - Google Cloud Dataflow SDK for Java is a distribution of Apache - Beam designed to simplify usage of Apache Beam on Google Cloud Dataflow - service. This archetype creates a simple starter pipeline to get started - using the Google Cloud Dataflow SDK for Java. 
- - maven-archetype - - - - - org.apache.maven.archetype - archetype-packaging - ${archetype-packaging.version} - - - - - - - src/test/resources - true - - - - - - - maven-archetype-plugin - ${maven-archetype-plugin.version} - - - org.apache.maven.shared - maven-invoker - ${maven-invoker.version} - - - - - - default-integration-test - install - - integration-test - - - - true - - - - - - - - - diff --git a/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml b/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml deleted file mode 100644 index 428c74aa4a..0000000000 --- a/maven-archetypes/starter/src/main/resources/META-INF/maven/archetype-metadata.xml +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - 1.8 - - - - - - src/main/java - - **/*.java - - - - diff --git a/maven-archetypes/starter/src/main/resources/NOTICE b/maven-archetypes/starter/src/main/resources/NOTICE deleted file mode 100644 index 981fde5a9e..0000000000 --- a/maven-archetypes/starter/src/main/resources/NOTICE +++ /dev/null @@ -1,5 +0,0 @@ -Google Cloud Dataflow SDK for Java -Copyright 2017, Google Inc. - -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). 
diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml b/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml deleted file mode 100644 index da443b16fa..0000000000 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/pom.xml +++ /dev/null @@ -1,93 +0,0 @@ - - - - 4.0.0 - - ${groupId} - ${artifactId} - ${version} - - - UTF-8 - @maven-compiler-plugin.version@ - @exec-maven-plugin.version@ - @slf4j.version@ - - - - - ossrh.snapshots - Sonatype OSS Repository Hosting - https://oss.sonatype.org/content/repositories/snapshots/ - - false - - - true - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${targetPlatform} - ${targetPlatform} - - - - - - - - org.codehaus.mojo - exec-maven-plugin - ${exec-maven-plugin.version} - - false - - - - - - - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - @project.version@ - - - - - org.slf4j - slf4j-api - ${slf4j.version} - - - org.slf4j - slf4j-jdk14 - ${slf4j.version} - - - diff --git a/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java b/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java deleted file mode 100644 index d6afdecf11..0000000000 --- a/maven-archetypes/starter/src/main/resources/archetype-resources/src/main/java/StarterPipeline.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package ${package}; - -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A starter example for writing Beam programs. - * - *

The example takes two strings, converts them to their upper-case - * representation and logs them. - * - *

To run this starter example locally using DirectRunner, just - * execute it without any additional parameters from your favorite development - * environment. - * - *

To run this starter example using managed resource in Google Cloud - * Platform, you should specify the following command-line options: - * --project= - * --stagingLocation= - * --runner=DataflowRunner - */ -public class StarterPipeline { - private static final Logger LOG = LoggerFactory.getLogger(StarterPipeline.class); - - public static void main(String[] args) { - Pipeline p = Pipeline.create( - PipelineOptionsFactory.fromArgs(args).withValidation().create()); - - p.apply(Create.of("Hello", "World")) - .apply(MapElements.via(new SimpleFunction() { - @Override - public String apply(String input) { - return input.toUpperCase(); - } - })) - .apply(ParDo.of(new DoFn() { - @ProcessElement - public void processElement(ProcessContext c) { - LOG.info(c.element()); - } - })); - - p.run(); - } -} diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties b/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties deleted file mode 100644 index b0195b3f16..0000000000 --- a/maven-archetypes/starter/src/test/resources/projects/basic/archetype.properties +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. 
- -package=it.pkg -version=0.1 -groupId=archetype.it -artifactId=basic -targetPlatform=1.8 diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt b/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt deleted file mode 100644 index 0b5987362f..0000000000 --- a/maven-archetypes/starter/src/test/resources/projects/basic/goal.txt +++ /dev/null @@ -1 +0,0 @@ -verify diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml b/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml deleted file mode 100644 index daf87595b7..0000000000 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/pom.xml +++ /dev/null @@ -1,93 +0,0 @@ - - - - 4.0.0 - - archetype.it - basic - 0.1 - - - UTF-8 - @maven-compiler-plugin.version@ - @exec-maven-plugin.version@ - @slf4j.version@ - - - - - ossrh.snapshots - Sonatype OSS Repository Hosting - https://oss.sonatype.org/content/repositories/snapshots/ - - false - - - true - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - 1.8 - 1.8 - - - - - - - - org.codehaus.mojo - exec-maven-plugin - ${exec-maven-plugin.version} - - false - - - - - - - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - @project.version@ - - - - - org.slf4j - slf4j-api - ${slf4j.version} - - - org.slf4j - slf4j-jdk14 - ${slf4j.version} - - - diff --git a/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java b/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java deleted file mode 100644 index 4ae92e8ce6..0000000000 --- a/maven-archetypes/starter/src/test/resources/projects/basic/reference/src/main/java/it/pkg/StarterPipeline.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package it.pkg; - -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.options.PipelineOptionsFactory; -import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.MapElements; -import org.apache.beam.sdk.transforms.ParDo; -import org.apache.beam.sdk.transforms.SimpleFunction; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * A starter example for writing Beam programs. - * - *

The example takes two strings, converts them to their upper-case - * representation and logs them. - * - *

To run this starter example locally using DirectRunner, just - * execute it without any additional parameters from your favorite development - * environment. - * - *

To run this starter example using managed resource in Google Cloud - * Platform, you should specify the following command-line options: - * --project= - * --stagingLocation= - * --runner=DataflowRunner - */ -public class StarterPipeline { - private static final Logger LOG = LoggerFactory.getLogger(StarterPipeline.class); - - public static void main(String[] args) { - Pipeline p = Pipeline.create( - PipelineOptionsFactory.fromArgs(args).withValidation().create()); - - p.apply(Create.of("Hello", "World")) - .apply(MapElements.via(new SimpleFunction() { - @Override - public String apply(String input) { - return input.toUpperCase(); - } - })) - .apply(ParDo.of(new DoFn() { - @ProcessElement - public void processElement(ProcessContext c) { - LOG.info(c.element()); - } - })); - - p.run(); - } -} diff --git a/pom.xml b/pom.xml deleted file mode 100644 index 2924ff9fa2..0000000000 --- a/pom.xml +++ /dev/null @@ -1,449 +0,0 @@ - - - - 4.0.0 - - - com.google - google - 5 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-parent - Google Cloud Dataflow SDK for Java - Parent - Google Cloud Dataflow SDK for Java is a distribution of Apache - Beam designed to simplify usage of Apache Beam on Google Cloud Dataflow - service. This artifact includes the parent POM for other Dataflow SDK - artifacts. - http://cloud.google.com/dataflow - 2013 - - 2.6.0-SNAPSHOT - - - - Apache License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - - Google Inc. 
- http://www.google.com - - - - - scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - scm:git:git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - git@github.com:GoogleCloudPlatform/DataflowJavaSDK.git - HEAD - - - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - - - apache.staging - Apache Software Foundation Staging Repository - https://repository.apache.org/content/repositories/staging/ - - true - - - false - - - - - apache.snapshots - Apache Software Foundation Snapshot Repository - https://repository.apache.org/content/repositories/snapshots/ - - false - - - true - - - - - - 3.2 - - - - 1.8 - - UTF-8 - ${maven.build.timestamp} - yyyy-MM-dd HH:mm - - 2.5.0 - - Google Cloud Dataflow SDK for Java - beam-${beam.version} - 6 - 1 - - v2-rev374-1.23.0 - 8.7 - 1.0.0 - 1.23.0 - 20.0 - 1.3 - 2.4 - 4.12 - 1.0.0 - 1.9.5 - v1-rev382-1.23.0 - 1.7.25 - - 2.4 - 1.6.0 - 2.20.1 - 2.4 - 3.0.0 - 3.1.0 - 3.7.0 - 3.1.1 - 2.2 - 3.0.2 - 3.0.0-M1 - 2.5.3 - 3.1.0 - 3.1.0 - 2.21.0 - 3.0.1 - - - pom - - sdk - examples - maven-archetypes - - - - - - - org.apache.maven.plugins - maven-clean-plugin - ${maven-clean-plugin.version} - - - - org.apache.maven.plugins - maven-compiler-plugin - ${maven-compiler-plugin.version} - - ${java.version} - ${java.version} - - -Xlint:all - -Werror - -Xlint:-options - - -Xlint:-processing - - true - - - - - org.apache.maven.plugins - maven-checkstyle-plugin - ${maven-checkstyle-plugin.version} - - - com.puppycrawl.tools - checkstyle - ${checkstyle.version} - - - org.apache.beam - beam-sdks-java-build-tools - ${beam.version} - - - - beam/checkstyle.xml - sdk/suppressions.xml - true - true - false - true - - - - - test-compile - - check - - - - - - - org.apache.maven.plugins - maven-jar-plugin - ${maven-jar-plugin.version} - - true - - - - default-jar - - jar - - - - default-test-jar - - test-jar - - - - - - - org.apache.maven.plugins - 
maven-source-plugin - ${maven-source-plugin.version} - - - attach-sources - compile - - jar - - - - attach-test-sources - test-compile - - test-jar - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - ${maven-javadoc-plugin.version} - - false - - - - javadoc - package - - jar - - - - - - - org.apache.maven.plugins - maven-resources-plugin - ${maven-resources-plugin.version} - - - - org.apache.maven.plugins - maven-dependency-plugin - ${maven-dependency-plugin.version} - - - - analyze-only - - - true - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - - - org.apache.maven.plugins - maven-archetype-plugin - ${maven-archetype-plugin.version} - - - org.apache.maven.shared - maven-invoker - ${maven-invoker.version} - - - - - - default-integration-test - install - - integration-test - - - true - - - - - - - org.apache.maven.plugins - maven-release-plugin - ${maven-release-plugin} - - true - true - deploy - - - - - org.codehaus.mojo - exec-maven-plugin - ${exec-maven-plugin.version} - - false - - - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - - - org.apache.maven.plugins - maven-source-plugin - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - - org.apache.maven.plugins - maven-dependency-plugin - - - - org.apache.maven.plugins - maven-surefire-plugin - - - - org.apache.maven.plugins - maven-checkstyle-plugin - - - - - - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - ${project.version} - - - - org.apache.beam - beam-sdks-java-core - ${beam.version} - - - - org.apache.beam - beam-sdks-java-io-google-cloud-platform - ${beam.version} - - - - org.apache.beam - beam-runners-direct-java - ${beam.version} - - - - org.apache.beam - beam-runners-google-cloud-dataflow-java - ${beam.version} - - - - org.apache.beam - beam-examples-java - ${beam.version} - - - - org.apache.beam - beam-sdks-java-io-kafka - ${beam.version} - - - - junit - junit - ${junit.version} - test - - - 
- diff --git a/sdk/pom.xml b/sdk/pom.xml deleted file mode 100644 index 0bd69dc58c..0000000000 --- a/sdk/pom.xml +++ /dev/null @@ -1,75 +0,0 @@ - - - - 4.0.0 - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-parent - 2.6.0-SNAPSHOT - - - google-cloud-dataflow-java-sdk-all - Google Cloud Dataflow SDK for Java - All - Google Cloud Dataflow SDK for Java is a distribution of Apache - Beam designed to simplify usage of Apache Beam on Google Cloud Dataflow - service. This artifact includes entire Dataflow Java SDK. - - jar - - - - - src/main/resources - true - - - - - - - org.apache.beam - beam-sdks-java-core - - - - org.apache.beam - beam-sdks-java-io-google-cloud-platform - - - - org.apache.beam - beam-runners-direct-java - - - - org.apache.beam - beam-runners-google-cloud-dataflow-java - - - - org.apache.beam - beam-sdks-java-io-kafka - - - - junit - junit - test - - - diff --git a/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java b/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java deleted file mode 100644 index df3fd76ae6..0000000000 --- a/sdk/src/main/java/com/google/cloud/dataflow/sdk/SdkDependencies.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2017 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. 
- */ -package com.google.cloud.dataflow.sdk; - -import org.apache.beam.runners.dataflow.DataflowRunner; -import org.apache.beam.runners.direct.DirectRunner; -import org.apache.beam.sdk.Pipeline; -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; -import org.apache.beam.sdk.io.kafka.KafkaIO; - -/** - * Mark the dependencies as used at compile time. - */ -class SdkDependencies { - private Pipeline p; - private BigQueryIO bigQueryIO; - private KafkaIO kafkaIO; - private DirectRunner directRunner; - private DataflowRunner dataflowRunner; -} diff --git a/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow-distribution.properties b/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow-distribution.properties deleted file mode 100644 index 33ee76287a..0000000000 --- a/sdk/src/main/resources/org/apache/beam/runners/dataflow/dataflow-distribution.properties +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (C) 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. 
- -name=${dataflow.release_name} -version=${pom.version} -build.date=${timestamp} -legacy.environment.major.version=${dataflow.legacy_environment_major_version} -fnapi.environment.major.version=${dataflow.fnapi_environment_major_version} -container.version=${dataflow.container_version} diff --git a/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java b/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java deleted file mode 100644 index 5088a00cfc..0000000000 --- a/sdk/src/test/java/org/apache/beam/runners/dataflow/DataflowRunnerInfoOverrideTest.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (C) 2017 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ -package org.apache.beam.runners.dataflow; - -import static org.junit.Assert.assertEquals; - -import java.io.InputStream; -import java.util.Properties; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.JUnit4; - -/** - * Tests for {@link DataflowRunnerInfo} specifically validating that properties in - * this distrbution are correctly read. 
- */ -@RunWith(JUnit4.class) -public class DataflowRunnerInfoOverrideTest { - private static final String DATAFLOW_DISTRIBUTION_PROPERTIES_PATH = - "/org/apache/beam/runners/dataflow/dataflow-distribution.properties"; - - private static final String FNAPI_ENVIRONMENT_MAJOR_VERSION_KEY = - "fnapi.environment.major.version"; - private static final String LEGACY_ENVIRONMENT_MAJOR_VERSION_KEY = - "legacy.environment.major.version"; - private static final String CONTAINER_VERSION_KEY = "container.version"; - - - @Test - public void testDataflowDistributionOverride() throws Exception { - try (InputStream in = - DataflowRunnerInfo.class.getResourceAsStream(DATAFLOW_DISTRIBUTION_PROPERTIES_PATH)) { - Properties properties = new Properties(); - properties.load(in); - - assertEquals(properties.getProperty(FNAPI_ENVIRONMENT_MAJOR_VERSION_KEY), - DataflowRunnerInfo.getDataflowRunnerInfo().getFnApiEnvironmentMajorVersion()); - assertEquals(properties.getProperty(LEGACY_ENVIRONMENT_MAJOR_VERSION_KEY), - DataflowRunnerInfo.getDataflowRunnerInfo().getLegacyEnvironmentMajorVersion()); - assertEquals(properties.getProperty(CONTAINER_VERSION_KEY), - DataflowRunnerInfo.getDataflowRunnerInfo().getContainerVersion()); - } - } -} diff --git a/sdk/suppressions.xml b/sdk/suppressions.xml deleted file mode 100644 index 4d707ab291..0000000000 --- a/sdk/suppressions.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - - - - From 9f055d9b6e15f512491e6e8310abb31d73fc4cb0 Mon Sep 17 00:00:00 2001 From: Ahmet Altay Date: Wed, 25 Jul 2018 10:42:36 -0700 Subject: [PATCH 77/77] Clean up README --- README.md | 84 +++++++++---------------------------------------------- 1 file changed, 13 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 112df59d01..dfb630ad79 100644 --- a/README.md +++ b/README.md @@ -16,86 +16,28 @@ # Google Cloud Dataflow SDK for Java -[Google Cloud Dataflow](https://cloud.google.com/dataflow/) provides a simple, -powerful programming model for building 
both batch and streaming parallel data -processing pipelines. +[Google Cloud Dataflow](https://cloud.google.com/dataflow/) is a service for executing [Apache Beam](https://beam.apache.org) pipelines on Google Cloud Platform. -Dataflow SDK for Java is a distribution of a portion of the -[Apache Beam](https://beam.apache.org) project. This repository hosts the -code to build this distribution and any Dataflow-specific code/modules. The -underlying source code is hosted in the -[Apache Beam repository](https://github.com/apache/beam). - -[General usage](https://cloud.google.com/dataflow/getting-started) of Google -Cloud Dataflow does **not** require use of this repository. Instead, you can do -any one of the following: - -1. Depend directly on a specific -[version](https://cloud.google.com/dataflow/downloads) of the SDK in -the [Maven Central Repository](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.google.cloud.dataflow%22) -by adding the following dependency to development -environments like Eclipse or Apache Maven: - - - com.google.cloud.dataflow - google-cloud-dataflow-java-sdk-all - version_number - - -1. Download the example pipelines from the separate -[DataflowJavaSDK-examples](https://github.com/GoogleCloudPlatform/DataflowJavaSDK-examples) -repository. - -1. If you are using [Eclipse](https://eclipse.org/) integrated development -environment (IDE), the -[Cloud Dataflow Plugin for Eclipse](https://cloud.google.com/dataflow/docs/quickstarts/quickstart-java-eclipse) -provides tools to create and execute Dataflow pipelines inside Eclipse. - -## Status [![Build Status](https://api.travis-ci.org/GoogleCloudPlatform/DataflowJavaSDK.svg?branch=master)](https://travis-ci.org/GoogleCloudPlatform/DataflowJavaSDK) - -Both the SDK and the Dataflow Service are generally available and considered -stable and fully qualified for production use. 
- -This [`master`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/) branch -contains code to build Dataflow SDK 2.0.0 and newer, as a distribution of Apache -Beam. Pre-Beam SDKs, versions 1.x, are maintained in the -[`master-1.x`](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/tree/master-1.x) -branch. - -## Overview - -The key concepts in this programming model are: - -* `PCollection`: represents a collection of data, which could be bounded or -unbounded in size. -* `PTransform`: represents a computation that transforms input PCollections -into output PCollections. -* `Pipeline`: manages a directed acyclic graph of PTransforms and PCollections -that is ready for execution. -* `PipelineRunner`: specifies where and how the pipeline should execute. - -We provide two runners: - - 1. The `DirectRunner` runs the pipeline on your local machine. - 1. The `DataflowRunner` submits the pipeline to the Cloud Dataflow Service, -where it runs using managed resources in the -[Google Cloud Platform](https://cloud.google.com). +## Getting Started -The SDK is built to be extensible and support additional execution environments -beyond local execution and the Google Cloud Dataflow Service. Apache Beam -contains additional SDKs, runners, and IO connectors. +* [Quickstart Using Java](https://cloud.google.com/dataflow/docs/quickstarts/quickstart-java-maven) on Google Cloud Dataflow +* [Java API Reference](https://beam.apache.org/documentation/sdks/javadoc/) +* [Java Examples](https://github.com/apache/beam/tree/master/examples/java) -## Getting Started +## We moved to Apache Beam! +Apache Beam Java SDK and the code development moved to the [Apache Beam repo](https://github.com/apache/beam/tree/master/sdks/java). -Please try our [Quickstarts](https://cloud.google.com/dataflow/docs/quickstarts). +If you want to contribute to the project (please do!) 
use this [Apache Beam contributor's guide](https://beam.apache.org/contribution-guide/). ## Contact Us -We welcome all usage-related questions on [Stack Overflow](http://stackoverflow.com/questions/tagged/google-cloud-dataflow) +We welcome all usage-related questions on +[Stack Overflow](https://stackoverflow.com/questions/tagged/google-cloud-dataflow) tagged with `google-cloud-dataflow`. -Please use [issue tracker](https://github.com/GoogleCloudPlatform/DataflowJavaSDK/issues) -on GitHub to report any bugs, comments or questions regarding SDK development. +Please use the +[issue tracker](https://issues.apache.org/jira/browse/BEAM) +on Apache JIRA to report any bugs, comments or questions regarding SDK development. ## More Information