From ca1463461465e1a006f918c9ad093cb8554400e7 Mon Sep 17 00:00:00 2001 From: Pawel Leszczynski Date: Thu, 5 Sep 2024 15:07:17 +0200 Subject: [PATCH 1/3] [FLINK-34466] lineage interfaces for kafka connector --- .../kafka/lineage/LineageFacetProvider.java | 19 +++ .../connector/kafka/lineage/LineageUtil.java | 156 ++++++++++++++++++ .../lineage/facets/KafkaPropertiesFacet.java | 59 +++++++ .../lineage/facets/KafkaTopicListFacet.java | 59 +++++++ .../facets/KafkaTopicPatternFacet.java | 59 +++++++ .../lineage/facets/TypeInformationFacet.java | 64 +++++++ ...KafkaRecordSerializationSchemaBuilder.java | 68 +++++++- .../flink/connector/kafka/sink/KafkaSink.java | 22 ++- .../connector/kafka/source/KafkaSource.java | 28 +++- .../subscriber/PartitionSetSubscriber.java | 18 +- .../subscriber/TopicListSubscriber.java | 12 +- .../subscriber/TopicPatternSubscriber.java | 13 +- .../KafkaDeserializationSchemaWrapper.java | 13 +- ...ValueOnlyDeserializationSchemaWrapper.java | 13 +- .../KafkaValueOnlyDeserializerWrapper.java | 13 +- .../KafkaDeserializationSchemaWrapper.java | 14 +- .../kafka/lineage/LineageUtilTest.java | 152 +++++++++++++++++ ...aRecordSerializationSchemaBuilderTest.java | 40 +++++ .../connector/kafka/sink/KafkaSinkTest.java | 59 +++++++ .../kafka/source/KafkaSourceTest.java | 85 ++++++++++ .../subscriber/KafkaSubscriberTest.java | 9 + .../KafkaRecordDeserializationSchemaTest.java | 14 ++ 22 files changed, 978 insertions(+), 11 deletions(-) create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java create mode 100644 flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java create mode 100644 flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java create mode 100644 flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java new file mode 100644 index 000000000..24ea3a1d8 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java @@ -0,0 +1,19 @@ +package org.apache.flink.connector.kafka.lineage; + +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + +import java.util.Collection; + +/** + * Contains method which can be used for lineage schema facet extraction. Useful for classes like + * topic selectors or serialization schemas to extract dataset information from. + */ +public interface LineageFacetProvider { + + /** + * List of lineage dataset facets. 
+ * + * @return + */ + Collection getDatasetFacets(); +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java new file mode 100644 index 000000000..c526e83cf --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package org.apache.flink.connector.kafka.lineage; + +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; +import org.apache.flink.streaming.api.lineage.LineageDataset; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.streaming.api.lineage.LineageVertex; +import org.apache.flink.streaming.api.lineage.SourceLineageVertex; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Properties; +import java.util.stream.Collectors; + +/** Utility class with useful methods for managing dataset facets. */ +public class LineageUtil { + + private static final String KAFKA_DATASET_PREFIX = "kafka://"; + private static final String COMMA = ","; + private static final String SEMICOLON = ";"; + + /** + * Loads facet from any object implementing @link{DatasetFacetProvider} interface. + * + * @param object + * @return + */ + public static Collection facetsFrom(Object object) { + return Optional.of(object) + .filter(LineageFacetProvider.class::isInstance) + .map(LineageFacetProvider.class::cast) + .map(LineageFacetProvider::getDatasetFacets) + .orElse(Collections.emptyList()); + } + + /** + * Creates dataset from a list of facets. Uses {@link KafkaTopicListFacet} to extract dataset + * name from. 
Dataset per each element of topic list is created + * + * @param facets + * @return + */ + public static Collection datasetsFrom( + String namespace, Collection facets) { + // Check if topic list facet is available -> if so explode the list of facets + Optional topicList = + facets.stream() + .filter(KafkaTopicListFacet.class::isInstance) + .map(KafkaTopicListFacet.class::cast) + .findAny(); + + List datasets = new ArrayList<>(); + + // Explode list of other facets + if (topicList.isPresent()) { + List facetsWithoutTopicList = + facets.stream().filter(f -> !f.equals(topicList)).collect(Collectors.toList()); + + datasets.addAll( + topicList.get().topics.stream() + .map(t -> datasetOf(namespace, t, facetsWithoutTopicList)) + .collect(Collectors.toList())); + } + + // Check if topic pattern is present + // If so topic pattern will be used as a dataset name + datasets.addAll( + facets.stream() + .filter(KafkaTopicPatternFacet.class::isInstance) + .map(KafkaTopicPatternFacet.class::cast) + .map(f -> datasetOf(namespace, f.pattern.toString(), facets)) + .collect(Collectors.toList())); + return datasets; + } + + private static LineageDataset datasetOf( + String namespace, String name, Collection facets) { + return new LineageDataset() { + @Override + public String name() { + return name; + } + + @Override + public String namespace() { + return namespace; + } + + @Override + public Map facets() { + return facets.stream() + .distinct() + .collect(Collectors.toMap(LineageDatasetFacet::name, item -> item)); + } + }; + } + + public static String datasetNamespaceOf(Properties properties) { + String bootstrapServers = properties.getProperty("bootstrap.servers"); + + if (bootstrapServers.contains(COMMA)) { + bootstrapServers = bootstrapServers.split(COMMA)[0]; + } else if (bootstrapServers.contains(SEMICOLON)) { + bootstrapServers = bootstrapServers.split(SEMICOLON)[0]; + } + + return String.format(KAFKA_DATASET_PREFIX + bootstrapServers); + } + + public static SourceLineageVertex sourceLineageVertexOf(Collection datasets) { + return new SourceLineageVertex() { + @Override + public Boundedness boundedness() { + return Boundedness.CONTINUOUS_UNBOUNDED; + } + + @Override + public List datasets() { + return datasets.stream().collect(Collectors.toList()); + } + }; + } + + public static LineageVertex lineageVertexOf(Collection datasets) { + return new LineageVertex() { + @Override + public List datasets() { + return datasets.stream().collect(Collectors.toList()); + } + }; + } +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java new file mode 100644 index 000000000..29b9a0687 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.connector.kafka.lineage.facets; + +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + +import com.google.common.collect.ImmutableMap; + +import java.util.Objects; +import java.util.Properties; + +/** Facet containing Kafka properties. */ +public class KafkaPropertiesFacet implements LineageDatasetFacet { + + public static final String KAFKA_PROPERTIES_FACET_NAME = "kafkaProperties"; + public Properties properties; + + public KafkaPropertiesFacet(Properties properties) { + this.properties = new Properties(); + this.properties.putAll(ImmutableMap.copyOf(properties)); + } + + @Override + public String name() { + return KAFKA_PROPERTIES_FACET_NAME; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + KafkaPropertiesFacet that = (KafkaPropertiesFacet) o; + return Objects.equals(properties, that.properties); + } + + @Override + public int hashCode() { + return Objects.hash(properties); + } +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java new file mode 100644 index 000000000..1121673e1 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.connector.kafka.lineage.facets; + +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + +import java.util.List; +import java.util.Objects; + +/** + * Facet containing TypeInformation object. Can be used as an intermediate step for evaluating topic + * involved in data processing. 
+ */ +public class KafkaTopicListFacet implements LineageDatasetFacet { + + public static final String TOPIC_LIST_FACET_NAME = "topicList"; + public List topics; + + public KafkaTopicListFacet(List topics) { + this.topics = topics; + } + + @Override + public String name() { + return TOPIC_LIST_FACET_NAME; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + KafkaTopicListFacet that = (KafkaTopicListFacet) o; + return Objects.equals(topics, that.topics); + } + + @Override + public int hashCode() { + return Objects.hash(topics); + } +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java new file mode 100644 index 000000000..c43212500 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.connector.kafka.lineage.facets; + +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + +import java.util.Objects; +import java.util.regex.Pattern; + +/** + * Facet containing topic pattern. Can be used as an intermediate step for evaluating topics + * involved in data processing. + */ +public class KafkaTopicPatternFacet implements LineageDatasetFacet { + + public static final String TOPIC_PATTERN_FACET_NAME = "topicPattern"; + public Pattern pattern; + + public KafkaTopicPatternFacet(Pattern pattern) { + this.pattern = pattern; + } + + @Override + public String name() { + return TOPIC_PATTERN_FACET_NAME; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + KafkaTopicPatternFacet that = (KafkaTopicPatternFacet) o; + return Objects.equals(pattern.pattern(), that.pattern.pattern()); + } + + @Override + public int hashCode() { + return Objects.hash(pattern.pattern()); + } +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java new file mode 100644 index 000000000..2bb1dd7dc --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.connector.kafka.lineage.facets; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + +import java.util.Objects; + +/** + * Facet containing TypeInformation object. Can be used as an intermediate step for evaluating + * schema dataset facet. + */ +public class TypeInformationFacet implements LineageDatasetFacet { + + public static final String TYPE_INFORMATION_FACET_NAME = "typeInformation"; + + private TypeInformation typeInformation; + + public TypeInformationFacet(TypeInformation typeInformation) { + this.typeInformation = typeInformation; + } + + @Override + public String name() { + return TYPE_INFORMATION_FACET_NAME; + } + + public TypeInformation getTypeInformation() { + return typeInformation; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + TypeInformationFacet that = (TypeInformationFacet) o; + return Objects.equals(typeInformation, that.typeInformation); + } + + @Override + public int hashCode() { + return Objects.hash(typeInformation); + } +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java index e9fc413b2..29ceb5159 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java @@ -19,15 +19,28 @@ import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.serialization.SerializationSchema; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; +import com.google.common.reflect.Invokable; +import com.google.common.reflect.TypeToken; import org.apache.kafka.clients.producer.ProducerRecord; import org.apache.kafka.common.serialization.Serializer; import javax.annotation.Nullable; import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.OptionalInt; 
import java.util.function.Function; @@ -122,7 +135,8 @@ public KafkaRecordSerializationSchemaBuilder setPartitioner( public KafkaRecordSerializationSchemaBuilder setTopic(String topic) { checkState(this.topicSelector == null, "Topic selector already set."); checkNotNull(topic); - this.topicSelector = new CachingTopicSelector<>((e) -> topic); + + this.topicSelector = new ConstantTopicSelector<>(topic); return this; } @@ -283,6 +297,27 @@ private void checkKeySerializerNotSet() { checkState(keySerializationSchema == null, "Key serializer already set."); } + private static class ConstantTopicSelector + implements Function, Serializable, LineageFacetProvider { + + private String topic; + + ConstantTopicSelector(String topic) { + this.topic = topic; + } + + @Override + public String apply(IN in) { + return topic; + } + + @Override + public Collection getDatasetFacets() { + return Collections.singletonList( + new KafkaTopicListFacet(Collections.singletonList(topic))); + } + } + private static class CachingTopicSelector implements Function, Serializable { private static final int CACHE_RESET_SIZE = 5; @@ -306,7 +341,7 @@ public String apply(IN in) { } private static class KafkaRecordSerializationSchemaWrapper - implements KafkaRecordSerializationSchema { + implements LineageFacetProvider, KafkaRecordSerializationSchema { private final SerializationSchema valueSerializationSchema; private final Function topicSelector; private final KafkaPartitioner partitioner; @@ -369,5 +404,34 @@ public ProducerRecord serialize( value, headerProvider != null ? headerProvider.getHeaders(element) : null); } + + @Override + public List getDatasetFacets() { + List facets = new ArrayList<>(); + if (topicSelector instanceof LineageFacetProvider) { + facets.addAll(((LineageFacetProvider) topicSelector).getDatasetFacets()); + } + + if (this.valueSerializationSchema instanceof ResultTypeQueryable) { + facets.add( + new TypeInformationFacet( + ((ResultTypeQueryable) this.valueSerializationSchema) + .getProducedType())); + } else { + // gets type information from serialize method signature + Arrays.stream(this.valueSerializationSchema.getClass().getMethods()) + .map(m -> Invokable.from(m)) + .filter(m -> "serialize".equalsIgnoreCase(m.getName())) + .map(m -> m.getParameters().get(0)) + .filter(p -> !p.getType().equals(TypeToken.of(Object.class))) + .findFirst() + .map(p -> p.getType()) + .map(t -> TypeInformation.of(t.getRawType())) + .map(g -> new TypeInformationFacet(g)) + .ifPresent(f -> facets.add(f)); + } + + return facets; + } } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java index d5b1c3700..faee558cb 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java @@ -22,11 +22,18 @@ import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.connector.sink2.Committer; import org.apache.flink.connector.base.DeliveryGuarantee; +import org.apache.flink.connector.kafka.lineage.LineageUtil; +import org.apache.flink.connector.kafka.lineage.facets.KafkaPropertiesFacet; import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.streaming.api.lineage.LineageVertex; +import org.apache.flink.streaming.api.lineage.LineageVertexProvider; 
import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.List; import java.util.Properties; /** @@ -54,7 +61,8 @@ */ @PublicEvolving public class KafkaSink - implements TwoPhaseCommittingStatefulSink { + implements LineageVertexProvider, + TwoPhaseCommittingStatefulSink { private final DeliveryGuarantee deliveryGuarantee; @@ -132,4 +140,16 @@ public SimpleVersionedSerializer getWriterStateSerializer() { protected Properties getKafkaProducerConfig() { return kafkaProducerConfig; } + + @Override + public LineageVertex getLineageVertex() { + List facets = new ArrayList<>(); + + // add all the facets from deserialization schema and subscriber + facets.addAll(LineageUtil.facetsFrom(recordSerializer)); + facets.add(new KafkaPropertiesFacet(this.kafkaProducerConfig)); + + String namespace = LineageUtil.datasetNamespaceOf(this.kafkaProducerConfig); + return LineageUtil.lineageVertexOf(LineageUtil.datasetsFrom(namespace, facets)); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java index 54f5f856c..0a3f84a46 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java @@ -33,6 +33,9 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.lineage.LineageUtil; +import org.apache.flink.connector.kafka.lineage.facets.KafkaPropertiesFacet; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumState; import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumStateSerializer; import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumerator; @@ -48,6 +51,9 @@ import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitSerializer; import org.apache.flink.core.io.SimpleVersionedSerializer; import org.apache.flink.metrics.MetricGroup; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.streaming.api.lineage.LineageVertexProvider; +import org.apache.flink.streaming.api.lineage.SourceLineageVertex; import org.apache.flink.util.UserCodeClassLoader; import org.apache.flink.util.function.SerializableSupplier; @@ -56,12 +62,16 @@ import javax.annotation.Nullable; import java.io.IOException; +import java.util.ArrayList; import java.util.Collection; +import java.util.List; import java.util.Optional; import java.util.Properties; import java.util.function.Consumer; import java.util.function.Supplier; +import static org.apache.flink.connector.kafka.lineage.LineageUtil.sourceLineageVertexOf; + /** * The Source implementation of Kafka. Please use a {@link KafkaSourceBuilder} to construct a {@link * KafkaSource}. 
The following example shows how to create a KafkaSource emitting records of @@ -87,7 +97,8 @@ */ @PublicEvolving public class KafkaSource - implements Source, + implements LineageVertexProvider, + Source, ResultTypeQueryable { private static final long serialVersionUID = -8755372893283732098L; // Users can choose only one of the following ways to specify the topics to consume from. @@ -251,4 +262,19 @@ KafkaSubscriber getKafkaSubscriber() { OffsetsInitializer getStoppingOffsetsInitializer() { return stoppingOffsetsInitializer; } + + @Override + public SourceLineageVertex getLineageVertex() { + List facets = new ArrayList<>(); + + // add all the facets from deserialization schema and subscriber + facets.addAll(LineageUtil.facetsFrom(deserializationSchema)); + facets.addAll(LineageUtil.facetsFrom(subscriber)); + + facets.add(new TypeInformationFacet(getProducedType())); + facets.add(new KafkaPropertiesFacet(props)); + + String namespace = LineageUtil.datasetNamespaceOf(props); + return sourceLineageVertexOf(LineageUtil.datasetsFrom(namespace, facets)); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java index 3423b0f90..6e03a4b68 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java @@ -18,13 +18,19 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.TopicDescription; import org.apache.kafka.common.TopicPartition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -32,7 +38,7 @@ import static org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriberUtils.getTopicMetadata; /** A subscriber for a partition set. 
*/ -class PartitionSetSubscriber implements KafkaSubscriber { +class PartitionSetSubscriber implements LineageFacetProvider, KafkaSubscriber { private static final long serialVersionUID = 390970375272146036L; private static final Logger LOG = LoggerFactory.getLogger(PartitionSetSubscriber.class); private final Set subscribedPartitions; @@ -73,4 +79,14 @@ && partitionExistsInTopic( private boolean partitionExistsInTopic(TopicPartition partition, TopicDescription topic) { return topic.partitions().size() > partition.partition(); } + + @Override + public List getDatasetFacets() { + return Collections.singletonList( + new KafkaTopicListFacet( + subscribedPartitions.stream() + .map(TopicPartition::topic) + .distinct() + .collect(Collectors.toList()))); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java index b2ad844ab..a4f25531f 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java @@ -18,6 +18,10 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.TopicDescription; import org.apache.kafka.common.TopicPartition; @@ -25,6 +29,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -36,7 +41,7 @@ * A subscriber to a fixed list of topics. The subscribed topics must have existed in the Kafka * cluster, otherwise an exception will be thrown. 
*/ -class TopicListSubscriber implements KafkaSubscriber { +class TopicListSubscriber implements KafkaSubscriber, LineageFacetProvider { private static final long serialVersionUID = -6917603843104947866L; private static final Logger LOG = LoggerFactory.getLogger(TopicListSubscriber.class); private final List topics; @@ -60,4 +65,9 @@ public Set getSubscribedTopicPartitions(AdminClient adminClient) return subscribedPartitions; } + + @Override + public List getDatasetFacets() { + return Collections.singletonList(new KafkaTopicListFacet(topics)); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java index 985ca7137..ada3067bd 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java @@ -18,6 +18,10 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.TopicDescription; import org.apache.kafka.common.TopicPartition; @@ -25,7 +29,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -33,7 +39,7 @@ import static org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriberUtils.getTopicMetadata; /** A subscriber to a topic pattern. 
*/ -class TopicPatternSubscriber implements KafkaSubscriber { +class TopicPatternSubscriber implements KafkaSubscriber, LineageFacetProvider { private static final long serialVersionUID = -7471048577725467797L; private static final Logger LOG = LoggerFactory.getLogger(TopicPatternSubscriber.class); private final Pattern topicPattern; @@ -60,4 +66,9 @@ public Set getSubscribedTopicPartitions(AdminClient adminClient) return subscribedTopicPartitions; } + + @Override + public List getDatasetFacets() { + return Collections.singletonList(new KafkaTopicPatternFacet(topicPattern)); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java index 1cc7dde79..71a567166 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java @@ -20,12 +20,17 @@ import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema; import org.apache.flink.util.Collector; import org.apache.kafka.clients.consumer.ConsumerRecord; import java.io.IOException; +import java.util.Collections; +import java.util.List; /** * A wrapper class that wraps a {@link @@ -36,7 +41,8 @@ * @deprecated Remove with @{@link KafkaDeserializationSchema} */ @Deprecated -class KafkaDeserializationSchemaWrapper implements KafkaRecordDeserializationSchema { +class KafkaDeserializationSchemaWrapper + implements KafkaRecordDeserializationSchema, LineageFacetProvider { private static final long serialVersionUID = 1L; private final KafkaDeserializationSchema kafkaDeserializationSchema; @@ -64,4 +70,9 @@ public void deserialize(ConsumerRecord message, Collector out public TypeInformation getProducedType() { return kafkaDeserializationSchema.getProducedType(); } + + @Override + public List getDatasetFacets() { + return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java index 209f5e15c..30b760da1 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java @@ -20,11 +20,16 @@ import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; +import 
org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.util.Collector; import org.apache.kafka.clients.consumer.ConsumerRecord; import java.io.IOException; +import java.util.Collections; +import java.util.List; /** * A class that wraps a {@link DeserializationSchema} as the value deserializer for a {@link @@ -32,7 +37,8 @@ * * @param the return type of the deserialization. */ -class KafkaValueOnlyDeserializationSchemaWrapper implements KafkaRecordDeserializationSchema { +class KafkaValueOnlyDeserializationSchemaWrapper + implements KafkaRecordDeserializationSchema, LineageFacetProvider { private static final long serialVersionUID = 1L; private final DeserializationSchema deserializationSchema; @@ -55,4 +61,9 @@ public void deserialize(ConsumerRecord message, Collector out public TypeInformation getProducedType() { return deserializationSchema.getProducedType(); } + + @Override + public List getDatasetFacets() { + return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java index 8c8095b6b..3ab93f3df 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java @@ -21,6 +21,9 @@ import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.typeutils.TypeExtractor; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.util.Collector; import org.apache.flink.util.InstantiationUtil; import org.apache.flink.util.TemporaryClassLoaderContext; @@ -32,10 +35,13 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.Collections; +import java.util.List; import java.util.Map; /** A package private class to wrap {@link Deserializer}. 
*/ -class KafkaValueOnlyDeserializerWrapper implements KafkaRecordDeserializationSchema { +class KafkaValueOnlyDeserializerWrapper + implements KafkaRecordDeserializationSchema, LineageFacetProvider { private static final long serialVersionUID = 5409547407386004054L; @@ -103,4 +109,9 @@ public void deserialize(ConsumerRecord record, Collector coll public TypeInformation getProducedType() { return TypeExtractor.createTypeInfo(Deserializer.class, deserializerClass, 0, null, null); } + + @Override + public List getDatasetFacets() { + return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); + } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java index b754b4d09..f0005a097 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java @@ -20,11 +20,17 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema; import org.apache.flink.util.Collector; import org.apache.kafka.clients.consumer.ConsumerRecord; +import java.util.Collections; +import java.util.List; + /** * A simple wrapper for using the DeserializationSchema with the KafkaDeserializationSchema * interface. 
@@ -33,7 +39,8 @@ */ @Internal @Deprecated -public class KafkaDeserializationSchemaWrapper implements KafkaDeserializationSchema { +public class KafkaDeserializationSchemaWrapper + implements KafkaDeserializationSchema, LineageFacetProvider { private static final long serialVersionUID = 2651665280744549932L; @@ -68,4 +75,9 @@ public boolean isEndOfStream(T nextElement) { public TypeInformation getProducedType() { return deserializationSchema.getProducedType(); } + + @Override + public List getDatasetFacets() { + return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); + } } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java new file mode 100644 index 000000000..43a3083e3 --- /dev/null +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java @@ -0,0 +1,152 @@ +package org.apache.flink.connector.kafka.lineage; + +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; +import org.apache.flink.streaming.api.lineage.LineageDataset; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.streaming.api.lineage.SourceLineageVertex; + +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.regex.Pattern; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +/** Tests for {@link LineageUtil}. 
*/ +public class LineageUtilTest { + + @Test + public void testFromFacetsForNonDatasetFacetProvider() { + assertThat(LineageUtil.facetsFrom(new Object())).isEmpty(); + } + + @Test + public void testFromFacetsWhenNoFacetsReturned() { + LineageFacetProvider facetProvider = mock(LineageFacetProvider.class); + when(facetProvider.getDatasetFacets()).thenReturn(Collections.emptyList()); + + assertThat(LineageUtil.facetsFrom(facetProvider)).isEmpty(); + } + + @Test + public void testFromFacets() { + LineageDatasetFacet facet1 = mock(LineageDatasetFacet.class); + LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); + + LineageFacetProvider facetProvider = mock(LineageFacetProvider.class); + when(facetProvider.getDatasetFacets()).thenReturn(Arrays.asList(facet1, facet2)); + + assertThat(LineageUtil.facetsFrom(facetProvider)).containsExactly(facet1, facet2); + } + + @Test + public void testDatasetsFromWithTopicList() { + LineageDatasetFacet facet1 = new KafkaTopicListFacet(Arrays.asList("topic1", "topic2")); + LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); + LineageDatasetFacet facet3 = mock(LineageDatasetFacet.class); + + when(facet2.name()).thenReturn("facetName2"); + when(facet3.name()).thenReturn("facetName3"); + String namespace = "kafka://host"; + + List facets = Arrays.asList(facet1, facet2, facet3); + + Collection datasets = LineageUtil.datasetsFrom(namespace, facets); + + assertThat(datasets).hasSize(2); + + Optional topic1 = + datasets.stream().filter(e -> "topic1".equals(e.name())).findAny(); + assertThat(topic1).isPresent(); + assertThat(topic1.get().namespace()).isEqualTo(namespace); + assertThat(topic1.get().facets().get("facetName2")).isEqualTo(facet2); + assertThat(topic1.get().facets().get("facetName3")).isEqualTo(facet3); + + Optional topic2 = + datasets.stream().filter(e -> "topic2".equals(e.name())).findAny(); + assertThat(topic2).isPresent(); + assertThat(topic2.get().name()).isEqualTo("topic2"); + assertThat(topic2.get().namespace()).isEqualTo(namespace); + assertThat(topic2.get().facets().get("facetName2")).isEqualTo(facet2); + assertThat(topic2.get().facets().get("facetName3")).isEqualTo(facet3); + } + + @Test + public void testDatasetsFromWithTopicPattern() { + Pattern pattern = Pattern.compile("some-pattern"); + + LineageDatasetFacet facet1 = new KafkaTopicPatternFacet(pattern); + LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); + LineageDatasetFacet facet3 = mock(LineageDatasetFacet.class); + + when(facet2.name()).thenReturn("facetName2"); + when(facet3.name()).thenReturn("facetName3"); + String namespace = "kafka://host"; + + List facets = Arrays.asList(facet1, facet2, facet3); + + Collection datasets = LineageUtil.datasetsFrom(namespace, facets); + assertThat(datasets).hasSize(1); + + LineageDataset dataset = datasets.iterator().next(); + + assertThat(dataset.name()).isEqualTo("some-pattern"); + assertThat(dataset.namespace()).isEqualTo(namespace); + assertThat(dataset.facets().get("facetName2")).isEqualTo(facet2); + assertThat(dataset.facets().get("facetName3")).isEqualTo(facet3); + } + + @Test + public void testDatasetsWithNoTopicListNorPattern() { + LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); + LineageDatasetFacet facet3 = mock(LineageDatasetFacet.class); + + List facets = Arrays.asList(facet2, facet3); + + assertThat(LineageUtil.datasetsFrom("some-namespace", facets)).isEmpty(); + } + + @Test + public void testSourceLineageVertexOf() { + List datasets = Arrays.asList(Mockito.mock(LineageDataset.class)); + + 
SourceLineageVertex sourceLineageVertex = LineageUtil.sourceLineageVertexOf(datasets); + + assertThat(sourceLineageVertex.boundedness()).isEqualTo(Boundedness.CONTINUOUS_UNBOUNDED); + assertThat(sourceLineageVertex.datasets()).isEqualTo(datasets); + } + + @Test + public void testDatasetNamespaceOf() { + Properties properties = new Properties(); + properties.put("bootstrap.servers", "my-kafka-host"); + + assertThat(LineageUtil.datasetNamespaceOf(properties)).isEqualTo("kafka://my-kafka-host"); + } + + @Test + public void testDatasetNamespaceOfWithSemicolon() { + Properties properties = new Properties(); + properties.put("bootstrap.servers", "my-kafka-host1;my-kafka-host2"); + + assertThat(LineageUtil.datasetNamespaceOf(properties)).isEqualTo("kafka://my-kafka-host1"); + } + + @Test + public void testDatasetNamespaceOfWithComma() { + Properties properties = new Properties(); + properties.put("bootstrap.servers", "my-kafka-host1,my-kafka-host2"); + + assertThat(LineageUtil.datasetNamespaceOf(properties)).isEqualTo("kafka://my-kafka-host1"); + } +} diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java index 701f9c8aa..1326009e7 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java @@ -19,7 +19,12 @@ import org.apache.flink.api.common.serialization.SerializationSchema; import org.apache.flink.api.common.serialization.SimpleStringSchema; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; import org.apache.flink.connector.testutils.formats.DummyInitializationContext; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; import org.apache.flink.util.TestLogger; @@ -36,10 +41,12 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Function; @@ -256,6 +263,39 @@ public void testSerializeRecordWithTimestamp() { assertThat(recordWithInvalidTimestamp.timestamp()).isNull(); } + @Test + public void testGetLineageDatasetFacets() { + final SerializationSchema serializationSchema = new SimpleStringSchema(); + final KafkaRecordSerializationSchema schema = + KafkaRecordSerializationSchema.builder() + .setTopic(DEFAULT_TOPIC) + .setValueSerializationSchema(serializationSchema) + .setKeySerializationSchema(serializationSchema) + .build(); + + Collection facets = ((LineageFacetProvider) schema).getDatasetFacets(); + + assertThat(facets).hasSize(2); + + Optional kafkaTopicListFacet = + facets.stream() + .filter(f -> f instanceof KafkaTopicListFacet) + .map(f -> (KafkaTopicListFacet) f) + .findAny(); + assertThat(kafkaTopicListFacet).isPresent(); + assertThat(kafkaTopicListFacet.get()) + .hasFieldOrPropertyWithValue("topics", 
Arrays.asList(DEFAULT_TOPIC)); + + Optional typeInformationFacet = + facets.stream() + .filter(f -> f instanceof TypeInformationFacet) + .map(f -> (TypeInformationFacet) f) + .findAny(); + assertThat(typeInformationFacet).isPresent(); + assertThat(typeInformationFacet.get().getTypeInformation()) + .isEqualTo(BasicTypeInfo.STRING_TYPE_INFO); + } + private static void assertOnlyOneSerializerAllowed( List< Function< diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java new file mode 100644 index 000000000..998046e49 --- /dev/null +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java @@ -0,0 +1,59 @@ +package org.apache.flink.connector.kafka.sink; + +import org.apache.flink.connector.base.DeliveryGuarantee; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.streaming.api.lineage.LineageVertex; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Properties; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.mockito.Mockito.withSettings; + +/** Tests for {@link KafkaSink}. */ +public class KafkaSinkTest { + + @Test + public void testGetLineageVertex() { + LineageDatasetFacet facet1 = mock(LineageDatasetFacet.class); + LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); + when(facet1.name()).thenReturn("facet1"); + when(facet2.name()).thenReturn("facet2"); + LineageDatasetFacet topicSelector = + new KafkaTopicListFacet(Arrays.asList("topic1", "topic2")); + + KafkaRecordSerializationSchema schema = + mock( + KafkaRecordSerializationSchema.class, + withSettings().extraInterfaces(LineageFacetProvider.class)); + + when(((LineageFacetProvider) schema).getDatasetFacets()) + .thenReturn(Arrays.asList(facet1, facet2, topicSelector)); + Properties kafkaProperties = new Properties(); + kafkaProperties.put("bootstrap.servers", "host1;host2"); + KafkaSink sink = new KafkaSink(DeliveryGuarantee.EXACTLY_ONCE, kafkaProperties, "", schema); + + LineageVertex lineageVertex = sink.getLineageVertex(); + assertThat(lineageVertex.datasets()).hasSize(2); + + assertThat(lineageVertex.datasets().get(0).namespace()).isEqualTo("kafka://host1"); + assertThat(lineageVertex.datasets().get(0).name()).isEqualTo("topic1"); + + assertThat(lineageVertex.datasets().get(1).namespace()).isEqualTo("kafka://host1"); + assertThat(lineageVertex.datasets().get(1).name()).isEqualTo("topic2"); + + // facets shall be the same for both datasets + assertThat(lineageVertex.datasets().get(0).facets()) + .isEqualTo(lineageVertex.datasets().get(1).facets()); + + assertThat(lineageVertex.datasets().get(0).facets()) + .containsEntry("facet1", facet1) + .containsEntry("facet2", facet2); + } +} diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java new file mode 100644 index 000000000..c138d2fb6 --- /dev/null +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.connector.kafka.source; + +import org.apache.flink.api.connector.source.Boundedness; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.streaming.api.lineage.LineageVertex; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Properties; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.mockito.Mockito.withSettings; + +/** Tests for {@link KafkaSource}. */ +public class KafkaSourceTest { + + @Test + public void testGetLineageVertex() { + LineageDatasetFacet facet1 = mock(LineageDatasetFacet.class); + LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); + when(facet1.name()).thenReturn("facet1"); + when(facet2.name()).thenReturn("facet2"); + + KafkaRecordDeserializationSchema schema = + mock( + KafkaRecordDeserializationSchema.class, + withSettings().extraInterfaces(LineageFacetProvider.class)); + + when(((LineageFacetProvider) schema).getDatasetFacets()) + .thenReturn(Arrays.asList(facet1, facet2)); + Properties kafkaProperties = new Properties(); + + kafkaProperties.put("bootstrap.servers", "host1;host2"); + KafkaSource source = + new KafkaSource( + KafkaSubscriber.getTopicListSubscriber(Arrays.asList("topic1", "topic2")), + mock(OffsetsInitializer.class), + null, + Boundedness.CONTINUOUS_UNBOUNDED, + schema, + kafkaProperties, + null); + + LineageVertex lineageVertex = source.getLineageVertex(); + assertThat(lineageVertex.datasets()).hasSize(2); + + assertThat(lineageVertex.datasets().get(0).namespace()).isEqualTo("kafka://host1"); + assertThat(lineageVertex.datasets().get(0).name()).isEqualTo("topic1"); + + assertThat(lineageVertex.datasets().get(1).namespace()).isEqualTo("kafka://host1"); + assertThat(lineageVertex.datasets().get(1).name()).isEqualTo("topic2"); + + // facets shall be the same for both datasets + assertThat(lineageVertex.datasets().get(0).facets()) + .isEqualTo(lineageVertex.datasets().get(1).facets()); + + assertThat(lineageVertex.datasets().get(0).facets()) + .containsEntry("facet1", facet1) + .containsEntry("facet2", facet2); + } +} diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java 
b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java index 258c1c0ab..435570de8 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java @@ -18,6 +18,9 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; +import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; import org.apache.flink.connector.kafka.testutils.KafkaSourceTestEnv; import org.apache.kafka.clients.admin.AdminClient; @@ -71,6 +74,8 @@ public void testTopicListSubscriber() { new HashSet<>(KafkaSourceTestEnv.getPartitionsForTopics(topics)); assertThat(subscribedPartitions).isEqualTo(expectedSubscribedPartitions); + assertThat(((LineageFacetProvider) subscriber).getDatasetFacets()) + .containsExactly(new KafkaTopicListFacet(topics)); } @Test @@ -96,6 +101,8 @@ public void testTopicPatternSubscriber() { KafkaSourceTestEnv.getPartitionsForTopics(Collections.singleton(TOPIC2))); assertThat(subscribedPartitions).isEqualTo(expectedSubscribedPartitions); + assertThat(((LineageFacetProvider) subscriber).getDatasetFacets()) + .containsExactly(new KafkaTopicPatternFacet(Pattern.compile("pattern.*"))); } @Test @@ -111,6 +118,8 @@ public void testPartitionSetSubscriber() { subscriber.getSubscribedTopicPartitions(adminClient); assertThat(subscribedPartitions).isEqualTo(partitions); + assertThat(((LineageFacetProvider) subscriber).getDatasetFacets()) + .containsExactly(new KafkaTopicListFacet(topics)); } @Test diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java index b0ca63161..9bb7eb0c2 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java @@ -18,6 +18,10 @@ package org.apache.flink.connector.kafka.source.reader.deserializer; +import org.apache.flink.api.common.typeinfo.BasicTypeInfo; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; +import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; import org.apache.flink.connector.kafka.util.JacksonMapperFactory; import org.apache.flink.connector.testutils.formats.DummyInitializationContext; import org.apache.flink.connector.testutils.source.deserialization.TestingDeserializationContext; @@ -76,6 +80,8 @@ public void testKafkaDeserializationSchemaWrapper() throws Exception { assertThat(deserializedValue.get("metadata").get("topic").asText()).isEqualTo("topic#1"); assertThat(deserializedValue.get("metadata").get("offset").asInt()).isEqualTo(4); assertThat(deserializedValue.get("metadata").get("partition").asInt()).isEqualTo(3); + assertThat(((LineageFacetProvider) schema).getDatasetFacets()) + .containsExactly(new 
TypeInformationFacet(TypeInformation.of(ObjectNode.class))); } @Test @@ -102,6 +108,12 @@ public void testKafkaValueDeserializationSchemaWrapper() throws Exception { assertThat(deserializedValue.get("word").asText()).isEqualTo("world"); assertThat(deserializedValue.get("key")).isNull(); assertThat(deserializedValue.get("metadata")).isNull(); + assertThat(((LineageFacetProvider) schema).getDatasetFacets()) + .containsExactly( + new TypeInformationFacet( + TypeInformation.of( + org.apache.flink.shaded.jackson2.com.fasterxml.jackson + .databind.node.ObjectNode.class))); } @Test @@ -119,6 +131,8 @@ public void testKafkaValueDeserializerWrapper() throws Exception { assertThat(collector.list).hasSize(1); assertThat(collector.list.get(0)).isEqualTo("world"); + assertThat(((LineageFacetProvider) schema).getDatasetFacets()) + .containsExactly(new TypeInformationFacet(BasicTypeInfo.STRING_TYPE_INFO)); } @Test From 4bbff1795da04296d01481eef3b6ada8ab690258 Mon Sep 17 00:00:00 2001 From: Pawel Leszczynski Date: Thu, 17 Oct 2024 15:32:08 +0200 Subject: [PATCH 2/3] [FLINK-34466] create KafkaDatasetFacet Signed-off-by: Pawel Leszczynski --- .../lineage/DefaultKafkaDatasetFacet.java | 63 +++++++ .../DefaultKafkaDatasetIdentifier.java | 56 ++++++ .../lineage/DefaultTypeDatasetFacet.java | 42 +++++ .../kafka/lineage/KafkaDatasetFacet.java | 14 ++ .../lineage/KafkaDatasetFacetProvider.java | 15 ++ .../kafka/lineage/KafkaDatasetIdentifier.java | 28 +++ .../KafkaDatasetIdentifierProvider.java | 15 ++ .../kafka/lineage/LineageFacetProvider.java | 19 --- .../connector/kafka/lineage/LineageUtil.java | 84 +++------ .../kafka/lineage/TypeDatasetFacet.java | 9 + .../lineage/TypeDatasetFacetProvider.java | 15 ++ .../lineage/facets/KafkaPropertiesFacet.java | 59 ------- .../lineage/facets/KafkaTopicListFacet.java | 59 ------- .../facets/KafkaTopicPatternFacet.java | 59 ------- .../lineage/facets/TypeInformationFacet.java | 64 ------- ...KafkaRecordSerializationSchemaBuilder.java | 94 +++++++---- .../flink/connector/kafka/sink/KafkaSink.java | 56 ++++-- .../connector/kafka/source/KafkaSource.java | 43 +++-- .../subscriber/PartitionSetSubscriber.java | 16 +- .../subscriber/TopicListSubscriber.java | 13 +- .../subscriber/TopicPatternSubscriber.java | 14 +- .../KafkaDeserializationSchemaWrapper.java | 13 +- ...ValueOnlyDeserializationSchemaWrapper.java | 13 +- .../KafkaValueOnlyDeserializerWrapper.java | 13 +- .../KafkaDeserializationSchemaWrapper.java | 14 +- .../kafka/lineage/LineageUtilTest.java | 140 ++++----------- ...aRecordSerializationSchemaBuilderTest.java | 159 ++++++++++++++---- .../connector/kafka/sink/KafkaSinkTest.java | 151 +++++++++++++---- .../kafka/source/KafkaSourceTest.java | 146 ++++++++++++---- .../subscriber/KafkaSubscriberTest.java | 21 ++- .../KafkaRecordDeserializationSchemaTest.java | 14 -- 31 files changed, 837 insertions(+), 684 deletions(-) create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java create mode 100644 
flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java delete mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java create mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java delete mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java delete mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java delete mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java delete mode 100644 flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java new file mode 100644 index 000000000..cb1a4671c --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java @@ -0,0 +1,63 @@ +package org.apache.flink.connector.kafka.lineage; + +import org.apache.flink.connector.kafka.source.KafkaPropertiesUtil; + +import java.util.Objects; +import java.util.Properties; + +/** Default implementation of {@link KafkaDatasetFacet}. 
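+ *
+ * <p>For illustration only (the topic name and properties below are made up), a facet for a
+ * fixed-topic dataset could be built from the constructors declared in this class:
+ *
+ * <pre>{@code
+ * Properties props = new Properties();
+ * props.setProperty("bootstrap.servers", "localhost:9092");
+ * KafkaDatasetFacet facet =
+ *         new DefaultKafkaDatasetFacet(
+ *                 DefaultKafkaDatasetIdentifier.ofTopics(Collections.singletonList("orders")),
+ *                 props);
+ * }</pre>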
*/ +public class DefaultKafkaDatasetFacet implements KafkaDatasetFacet { + + public static final String KAFKA_FACET_NAME = "kafka"; + + private Properties properties; + + private final KafkaDatasetIdentifier topicIdentifier; + + public DefaultKafkaDatasetFacet(KafkaDatasetIdentifier topicIdentifier, Properties properties) { + this(topicIdentifier); + + this.properties = new Properties(); + KafkaPropertiesUtil.copyProperties(properties, this.properties); + } + + public DefaultKafkaDatasetFacet(KafkaDatasetIdentifier topicIdentifier) { + this.topicIdentifier = topicIdentifier; + } + + public void setProperties(Properties properties) { + this.properties = new Properties(); + KafkaPropertiesUtil.copyProperties(properties, this.properties); + } + + public Properties getProperties() { + return properties; + } + + public KafkaDatasetIdentifier getTopicIdentifier() { + return topicIdentifier; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + DefaultKafkaDatasetFacet that = (DefaultKafkaDatasetFacet) o; + return Objects.equals(properties, that.properties) + && Objects.equals(topicIdentifier, that.topicIdentifier); + } + + @Override + public int hashCode() { + return Objects.hash(properties, topicIdentifier); + } + + @Override + public String name() { + return KAFKA_FACET_NAME; + } +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java new file mode 100644 index 000000000..bd05cfd52 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java @@ -0,0 +1,56 @@ +package org.apache.flink.connector.kafka.lineage; + +import javax.annotation.Nullable; + +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.regex.Pattern; + +/** Default implementation of {@link KafkaDatasetIdentifier}. 
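+ *
+ * <p>A short sketch of the two factory methods defined below; the topic names and the pattern
+ * are illustrative:
+ *
+ * <pre>{@code
+ * // identifier backed by an explicit topic list
+ * DefaultKafkaDatasetIdentifier byTopics =
+ *         DefaultKafkaDatasetIdentifier.ofTopics(Arrays.asList("orders", "payments"));
+ *
+ * // identifier backed by a topic pattern
+ * DefaultKafkaDatasetIdentifier byPattern =
+ *         DefaultKafkaDatasetIdentifier.ofPattern(Pattern.compile("orders-.*"));
+ * }</pre>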
+ */
+public class DefaultKafkaDatasetIdentifier implements KafkaDatasetIdentifier {
+
+    @Nullable private final List<String> topics;
+    @Nullable private final Pattern topicPattern;
+
+    public DefaultKafkaDatasetIdentifier(List<String> fixedTopics, Pattern topicPattern) {
+        this.topics = fixedTopics;
+        this.topicPattern = topicPattern;
+    }
+
+    public static DefaultKafkaDatasetIdentifier ofPattern(Pattern pattern) {
+        return new DefaultKafkaDatasetIdentifier(Collections.emptyList(), pattern);
+    }
+
+    public static DefaultKafkaDatasetIdentifier ofTopics(List<String> fixedTopics) {
+        return new DefaultKafkaDatasetIdentifier(fixedTopics, null);
+    }
+
+    @Nullable
+    public List<String> getTopics() {
+        return topics;
+    }
+
+    @Nullable
+    public Pattern getTopicPattern() {
+        return topicPattern;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        DefaultKafkaDatasetIdentifier that = (DefaultKafkaDatasetIdentifier) o;
+        return Objects.equals(topics, that.topics)
+                && Objects.equals(topicPattern, that.topicPattern);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(topics, topicPattern);
+    }
+}
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java
new file mode 100644
index 000000000..69183e3a1
--- /dev/null
+++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java
@@ -0,0 +1,42 @@
+package org.apache.flink.connector.kafka.lineage;
+
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+
+import java.util.Objects;
+
+/** Default implementation of {@link TypeDatasetFacet}. */
+public class DefaultTypeDatasetFacet implements TypeDatasetFacet {
+
+    public static final String TYPE_FACET_NAME = "type";
+
+    private final TypeInformation typeInformation;
+
+    public DefaultTypeDatasetFacet(TypeInformation typeInformation) {
+        this.typeInformation = typeInformation;
+    }
+
+    public TypeInformation getTypeInformation() {
+        return typeInformation;
+    }
+
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        DefaultTypeDatasetFacet that = (DefaultTypeDatasetFacet) o;
+        return Objects.equals(typeInformation, that.typeInformation);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(typeInformation);
+    }
+
+    @Override
+    public String name() {
+        return TYPE_FACET_NAME;
+    }
+}
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java
new file mode 100644
index 000000000..22d14dd2c
--- /dev/null
+++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java
@@ -0,0 +1,14 @@
+package org.apache.flink.connector.kafka.lineage;
+
+import org.apache.flink.streaming.api.lineage.LineageDatasetFacet;
+
+import java.util.Properties;
+
+/** Facet definition to contain all Kafka specific information on Kafka sources and sinks.
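+ *
+ * <p>A sketch of how a sink or source can consume this facet; it mirrors the provider check used
+ * later in this change, and the {@code recordSerializer} and {@code kafkaProducerConfig} names
+ * are only illustrative:
+ *
+ * <pre>{@code
+ * if (recordSerializer instanceof KafkaDatasetFacetProvider) {
+ *     Optional<KafkaDatasetFacet> facet =
+ *             ((KafkaDatasetFacetProvider) recordSerializer).getKafkaDatasetFacet();
+ *     // enrich the facet with the producer properties before building lineage datasets
+ *     facet.ifPresent(f -> f.setProperties(kafkaProducerConfig));
+ * }
+ * }</pre>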
+ */
+public interface KafkaDatasetFacet extends LineageDatasetFacet {
+    Properties getProperties();
+
+    KafkaDatasetIdentifier getTopicIdentifier();
+
+    void setProperties(Properties properties);
+}
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java
new file mode 100644
index 000000000..0eed6f715
--- /dev/null
+++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java
@@ -0,0 +1,15 @@
+package org.apache.flink.connector.kafka.lineage;
+
+import java.util.Optional;
+
+/** Contains a method to extract a {@link KafkaDatasetFacet}. */
+public interface KafkaDatasetFacetProvider {
+
+    /**
+     * Returns a Kafka dataset facet, or {@code Optional.empty()} if the implementing class is not
+     * able to identify a dataset.
+     *
+     * @return the Kafka dataset facet, if one can be determined
+     */
+    Optional<KafkaDatasetFacet> getKafkaDatasetFacet();
+}
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java
new file mode 100644
index 000000000..0c43f8be9
--- /dev/null
+++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java
@@ -0,0 +1,28 @@
+package org.apache.flink.connector.kafka.lineage;
+
+import javax.annotation.Nullable;
+
+import java.util.List;
+import java.util.regex.Pattern;
+
+/** Kafka dataset identifier which can contain either a list of topics or a topic pattern. */
+public interface KafkaDatasetIdentifier {
+    @Nullable
+    List<String> getTopics();
+
+    @Nullable
+    Pattern getTopicPattern();
+
+    /**
+     * Returns the name to use for the lineage dataset: the topic pattern if one is present,
+     * otherwise a comma-separated list of topics.
+     *
+     * @return the lineage dataset name
+     */
+    default String toLineageName() {
+        if (getTopicPattern() != null) {
+            return getTopicPattern().toString();
+        }
+        return String.join(",", getTopics());
+    }
+}
diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java
new file mode 100644
index 000000000..36f8c4f2e
--- /dev/null
+++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java
@@ -0,0 +1,15 @@
+package org.apache.flink.connector.kafka.lineage;
+
+import java.util.Optional;
+
+/** Contains a method which allows extracting the topic identifier. */
+public interface KafkaDatasetIdentifierProvider {
+
+    /**
+     * Gets the Kafka dataset identifier, or empty in case the implementing class is not able to
+     * extract the dataset identifier.
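+     *
+     * <p>For illustration, a subscriber backed by a fixed topic list might implement this along
+     * the lines of the subscriber changes later in this patch ({@code topics} is assumed to be
+     * the subscribed topic list):
+     *
+     * <pre>{@code
+     * public Optional<DefaultKafkaDatasetIdentifier> getDatasetIdentifier() {
+     *     return Optional.of(DefaultKafkaDatasetIdentifier.ofTopics(topics));
+     * }
+     * }</pre>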
+ * + * @return + */ + Optional getDatasetIdentifier(); +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java deleted file mode 100644 index 24ea3a1d8..000000000 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageFacetProvider.java +++ /dev/null @@ -1,19 +0,0 @@ -package org.apache.flink.connector.kafka.lineage; - -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; - -import java.util.Collection; - -/** - * Contains method which can be used for lineage schema facet extraction. Useful for classes like - * topic selectors or serialization schemas to extract dataset information from. - */ -public interface LineageFacetProvider { - - /** - * List of lineage dataset facets. - * - * @return - */ - Collection getDatasetFacets(); -} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java index c526e83cf..779c167c6 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java @@ -20,89 +20,43 @@ package org.apache.flink.connector.kafka.lineage; import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; import org.apache.flink.streaming.api.lineage.LineageDataset; import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.api.lineage.LineageVertex; import org.apache.flink.streaming.api.lineage.SourceLineageVertex; -import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Properties; import java.util.stream.Collectors; -/** Utility class with useful methods for managing dataset facets. */ +/** Utility class with useful methods for managing lineage objects. */ public class LineageUtil { private static final String KAFKA_DATASET_PREFIX = "kafka://"; private static final String COMMA = ","; private static final String SEMICOLON = ";"; - /** - * Loads facet from any object implementing @link{DatasetFacetProvider} interface. - * - * @param object - * @return - */ - public static Collection facetsFrom(Object object) { - return Optional.of(object) - .filter(LineageFacetProvider.class::isInstance) - .map(LineageFacetProvider.class::cast) - .map(LineageFacetProvider::getDatasetFacets) - .orElse(Collections.emptyList()); + public static LineageDataset datasetOf(String namespace, KafkaDatasetFacet kafkaDatasetFacet) { + return datasetOf(namespace, kafkaDatasetFacet, Collections.emptyList()); } - /** - * Creates dataset from a list of facets. Uses {@link KafkaTopicListFacet} to extract dataset - * name from. 
Dataset per each element of topic list is created - * - * @param facets - * @return - */ - public static Collection datasetsFrom( - String namespace, Collection facets) { - // Check if topic list facet is available -> if so explode the list of facets - Optional topicList = - facets.stream() - .filter(KafkaTopicListFacet.class::isInstance) - .map(KafkaTopicListFacet.class::cast) - .findAny(); - - List datasets = new ArrayList<>(); - - // Explode list of other facets - if (topicList.isPresent()) { - List facetsWithoutTopicList = - facets.stream().filter(f -> !f.equals(topicList)).collect(Collectors.toList()); - - datasets.addAll( - topicList.get().topics.stream() - .map(t -> datasetOf(namespace, t, facetsWithoutTopicList)) - .collect(Collectors.toList())); - } - - // Check if topic pattern is present - // If so topic pattern will be used as a dataset name - datasets.addAll( - facets.stream() - .filter(KafkaTopicPatternFacet.class::isInstance) - .map(KafkaTopicPatternFacet.class::cast) - .map(f -> datasetOf(namespace, f.pattern.toString(), facets)) - .collect(Collectors.toList())); - return datasets; + public static LineageDataset datasetOf( + String namespace, KafkaDatasetFacet kafkaDatasetFacet, TypeDatasetFacet typeFacet) { + return datasetOf(namespace, kafkaDatasetFacet, Collections.singletonList(typeFacet)); } private static LineageDataset datasetOf( - String namespace, String name, Collection facets) { + String namespace, + KafkaDatasetFacet kafkaDatasetFacet, + List facets) { return new LineageDataset() { @Override public String name() { - return name; + return kafkaDatasetFacet.getTopicIdentifier().toLineageName(); } @Override @@ -112,16 +66,24 @@ public String namespace() { @Override public Map facets() { - return facets.stream() - .distinct() - .collect(Collectors.toMap(LineageDatasetFacet::name, item -> item)); + Map facetMap = new HashMap(); + facetMap.put(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME, kafkaDatasetFacet); + facetMap.putAll( + facets.stream() + .collect( + Collectors.toMap(LineageDatasetFacet::name, item -> item))); + return facetMap; } }; } - public static String datasetNamespaceOf(Properties properties) { + public static String namespaceOf(Properties properties) { String bootstrapServers = properties.getProperty("bootstrap.servers"); + if (bootstrapServers == null) { + return KAFKA_DATASET_PREFIX; + } + if (bootstrapServers.contains(COMMA)) { bootstrapServers = bootstrapServers.split(COMMA)[0]; } else if (bootstrapServers.contains(SEMICOLON)) { diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java new file mode 100644 index 000000000..4b4261c65 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java @@ -0,0 +1,9 @@ +package org.apache.flink.connector.kafka.lineage; + +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; + +/** Facet definition to contain type information of source and sink. 
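+ *
+ * <p>Illustrative only: a facet for a {@code String}-typed source or sink could be obtained via
+ * the default implementation introduced in this change:
+ *
+ * <pre>{@code
+ * TypeDatasetFacet typeFacet = new DefaultTypeDatasetFacet(BasicTypeInfo.STRING_TYPE_INFO);
+ * }</pre>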
*/ +public interface TypeDatasetFacet extends LineageDatasetFacet { + TypeInformation getTypeInformation(); +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java new file mode 100644 index 000000000..b2f0ea831 --- /dev/null +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java @@ -0,0 +1,15 @@ +package org.apache.flink.connector.kafka.lineage; + +import java.util.Optional; + +/** Contains method to extract {@link TypeDatasetFacet}. */ +public interface TypeDatasetFacetProvider { + + /** + * Returns a type dataset facet or `Optional.empty` in case an implementing class is not able to + * resolve type. + * + * @return + */ + Optional getTypeDatasetFacet(); +} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java deleted file mode 100644 index 29b9a0687..000000000 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaPropertiesFacet.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.flink.connector.kafka.lineage.facets; - -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; - -import com.google.common.collect.ImmutableMap; - -import java.util.Objects; -import java.util.Properties; - -/** Facet containing Kafka properties. 
*/ -public class KafkaPropertiesFacet implements LineageDatasetFacet { - - public static final String KAFKA_PROPERTIES_FACET_NAME = "kafkaProperties"; - public Properties properties; - - public KafkaPropertiesFacet(Properties properties) { - this.properties = new Properties(); - this.properties.putAll(ImmutableMap.copyOf(properties)); - } - - @Override - public String name() { - return KAFKA_PROPERTIES_FACET_NAME; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - KafkaPropertiesFacet that = (KafkaPropertiesFacet) o; - return Objects.equals(properties, that.properties); - } - - @Override - public int hashCode() { - return Objects.hash(properties); - } -} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java deleted file mode 100644 index 1121673e1..000000000 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicListFacet.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.flink.connector.kafka.lineage.facets; - -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; - -import java.util.List; -import java.util.Objects; - -/** - * Facet containing TypeInformation object. Can be used as an intermediate step for evaluating topic - * involved in data processing. - */ -public class KafkaTopicListFacet implements LineageDatasetFacet { - - public static final String TOPIC_LIST_FACET_NAME = "topicList"; - public List topics; - - public KafkaTopicListFacet(List topics) { - this.topics = topics; - } - - @Override - public String name() { - return TOPIC_LIST_FACET_NAME; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - KafkaTopicListFacet that = (KafkaTopicListFacet) o; - return Objects.equals(topics, that.topics); - } - - @Override - public int hashCode() { - return Objects.hash(topics); - } -} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java deleted file mode 100644 index c43212500..000000000 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/KafkaTopicPatternFacet.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.flink.connector.kafka.lineage.facets; - -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; - -import java.util.Objects; -import java.util.regex.Pattern; - -/** - * Facet containing topic pattern. Can be used as an intermediate step for evaluating topics - * involved in data processing. - */ -public class KafkaTopicPatternFacet implements LineageDatasetFacet { - - public static final String TOPIC_PATTERN_FACET_NAME = "topicPattern"; - public Pattern pattern; - - public KafkaTopicPatternFacet(Pattern pattern) { - this.pattern = pattern; - } - - @Override - public String name() { - return TOPIC_PATTERN_FACET_NAME; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - KafkaTopicPatternFacet that = (KafkaTopicPatternFacet) o; - return Objects.equals(pattern.pattern(), that.pattern.pattern()); - } - - @Override - public int hashCode() { - return Objects.hash(pattern.pattern()); - } -} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java deleted file mode 100644 index 2bb1dd7dc..000000000 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/facets/TypeInformationFacet.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.flink.connector.kafka.lineage.facets; - -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; - -import java.util.Objects; - -/** - * Facet containing TypeInformation object. Can be used as an intermediate step for evaluating - * schema dataset facet. 
- */ -public class TypeInformationFacet implements LineageDatasetFacet { - - public static final String TYPE_INFORMATION_FACET_NAME = "typeInformation"; - - private TypeInformation typeInformation; - - public TypeInformationFacet(TypeInformation typeInformation) { - this.typeInformation = typeInformation; - } - - @Override - public String name() { - return TYPE_INFORMATION_FACET_NAME; - } - - public TypeInformation getTypeInformation() { - return typeInformation; - } - - @Override - public boolean equals(Object o) { - if (this == o) { - return true; - } - if (o == null || getClass() != o.getClass()) { - return false; - } - TypeInformationFacet that = (TypeInformationFacet) o; - return Objects.equals(typeInformation, that.typeInformation); - } - - @Override - public int hashCode() { - return Objects.hash(typeInformation); - } -} diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java index 29ceb5159..0fba3a364 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilder.java @@ -21,27 +21,30 @@ import org.apache.flink.api.common.serialization.SerializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.typeutils.ResultTypeQueryable; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.DefaultTypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacetProvider; +import org.apache.flink.connector.kafka.source.KafkaSource; import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; -import com.google.common.reflect.Invokable; import com.google.common.reflect.TypeToken; import org.apache.kafka.clients.producer.ProducerRecord; import org.apache.kafka.common.serialization.Serializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.OptionalInt; import java.util.function.Function; @@ -92,6 +95,7 @@ */ @PublicEvolving public class KafkaRecordSerializationSchemaBuilder { + private static final Logger LOG = LoggerFactory.getLogger(KafkaSource.class); @Nullable private Function topicSelector; @Nullable private SerializationSchema valueSerializationSchema; @@ -298,7 +302,7 @@ private void 
checkKeySerializerNotSet() { } private static class ConstantTopicSelector - implements Function, Serializable, LineageFacetProvider { + implements Function, Serializable, KafkaDatasetIdentifierProvider { private String topic; @@ -312,13 +316,14 @@ public String apply(IN in) { } @Override - public Collection getDatasetFacets() { - return Collections.singletonList( - new KafkaTopicListFacet(Collections.singletonList(topic))); + public Optional getDatasetIdentifier() { + return Optional.of( + DefaultKafkaDatasetIdentifier.ofTopics(Collections.singletonList(topic))); } } - private static class CachingTopicSelector implements Function, Serializable { + private static class CachingTopicSelector + implements Function, KafkaDatasetIdentifierProvider, Serializable { private static final int CACHE_RESET_SIZE = 5; private final Map cache; @@ -338,10 +343,21 @@ public String apply(IN in) { } return topic; } + + @Override + public Optional getDatasetIdentifier() { + if (topicSelector instanceof KafkaDatasetIdentifierProvider) { + return ((KafkaDatasetIdentifierProvider) topicSelector).getDatasetIdentifier(); + } else { + return Optional.empty(); + } + } } private static class KafkaRecordSerializationSchemaWrapper - implements LineageFacetProvider, KafkaRecordSerializationSchema { + implements KafkaDatasetFacetProvider, + KafkaRecordSerializationSchema, + TypeDatasetFacetProvider { private final SerializationSchema valueSerializationSchema; private final Function topicSelector; private final KafkaPartitioner partitioner; @@ -406,32 +422,44 @@ public ProducerRecord serialize( } @Override - public List getDatasetFacets() { - List facets = new ArrayList<>(); - if (topicSelector instanceof LineageFacetProvider) { - facets.addAll(((LineageFacetProvider) topicSelector).getDatasetFacets()); + public Optional getKafkaDatasetFacet() { + if (!(topicSelector instanceof KafkaDatasetIdentifierProvider)) { + LOG.info("Cannot identify topics. 
Not an TopicsIdentifierProvider"); + return Optional.empty(); + } + + Optional topicsIdentifier = + ((KafkaDatasetIdentifierProvider) (topicSelector)).getDatasetIdentifier(); + + if (!topicsIdentifier.isPresent()) { + LOG.info("No topics' identifiers provided"); + return Optional.empty(); } + return Optional.of(new DefaultKafkaDatasetFacet(topicsIdentifier.get())); + } + + @Override + public Optional getTypeDatasetFacet() { if (this.valueSerializationSchema instanceof ResultTypeQueryable) { - facets.add( - new TypeInformationFacet( + return Optional.of( + new DefaultTypeDatasetFacet( ((ResultTypeQueryable) this.valueSerializationSchema) .getProducedType())); } else { // gets type information from serialize method signature - Arrays.stream(this.valueSerializationSchema.getClass().getMethods()) - .map(m -> Invokable.from(m)) - .filter(m -> "serialize".equalsIgnoreCase(m.getName())) - .map(m -> m.getParameters().get(0)) - .filter(p -> !p.getType().equals(TypeToken.of(Object.class))) - .findFirst() - .map(p -> p.getType()) - .map(t -> TypeInformation.of(t.getRawType())) - .map(g -> new TypeInformationFacet(g)) - .ifPresent(f -> facets.add(f)); + TypeToken serializationSchemaType = + TypeToken.of(valueSerializationSchema.getClass()); + Class parameterType = + serializationSchemaType + .resolveType(SerializationSchema.class.getTypeParameters()[0]) + .getRawType(); + if (parameterType != Object.class) { + return Optional.of( + new DefaultTypeDatasetFacet(TypeInformation.of(parameterType))); + } } - - return facets; + return Optional.empty(); } } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java index faee558cb..0f89e5bb2 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java @@ -22,18 +22,22 @@ import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.connector.sink2.Committer; import org.apache.flink.connector.base.DeliveryGuarantee; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider; import org.apache.flink.connector.kafka.lineage.LineageUtil; -import org.apache.flink.connector.kafka.lineage.facets.KafkaPropertiesFacet; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacetProvider; import org.apache.flink.core.io.SimpleVersionedSerializer; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.api.lineage.LineageVertex; import org.apache.flink.streaming.api.lineage.LineageVertexProvider; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.List; +import java.util.Optional; import java.util.Properties; /** @@ -63,7 +67,7 @@ public class KafkaSink implements LineageVertexProvider, TwoPhaseCommittingStatefulSink { - + private static final Logger LOG = LoggerFactory.getLogger(KafkaSink.class); private final DeliveryGuarantee deliveryGuarantee; private final KafkaRecordSerializationSchema recordSerializer; @@ -143,13 +147,39 @@ protected Properties getKafkaProducerConfig() { @Override public LineageVertex getLineageVertex() { - List facets = 
new ArrayList<>(); - - // add all the facets from deserialization schema and subscriber - facets.addAll(LineageUtil.facetsFrom(recordSerializer)); - facets.add(new KafkaPropertiesFacet(this.kafkaProducerConfig)); - - String namespace = LineageUtil.datasetNamespaceOf(this.kafkaProducerConfig); - return LineageUtil.lineageVertexOf(LineageUtil.datasetsFrom(namespace, facets)); + // enrich dataset facet with properties + Optional kafkaDatasetFacet; + if (recordSerializer instanceof KafkaDatasetFacetProvider) { + kafkaDatasetFacet = + ((KafkaDatasetFacetProvider) recordSerializer).getKafkaDatasetFacet(); + + if (!kafkaDatasetFacet.isPresent()) { + LOG.info("Provided did not return kafka dataset facet"); + return LineageUtil.sourceLineageVertexOf(Collections.emptyList()); + } + kafkaDatasetFacet.get().setProperties(this.kafkaProducerConfig); + } else { + LOG.info( + "recordSerializer does not implement KafkaDatasetFacetProvider: {}", + recordSerializer); + return LineageUtil.sourceLineageVertexOf(Collections.emptyList()); + } + + String namespace = LineageUtil.namespaceOf(kafkaProducerConfig); + + Optional typeDatasetFacet = Optional.empty(); + if (recordSerializer instanceof TypeDatasetFacetProvider) { + typeDatasetFacet = ((TypeDatasetFacetProvider) recordSerializer).getTypeDatasetFacet(); + } + + if (typeDatasetFacet.isPresent()) { + return LineageUtil.sourceLineageVertexOf( + Collections.singleton( + LineageUtil.datasetOf( + namespace, kafkaDatasetFacet.get(), typeDatasetFacet.get()))); + } + + return LineageUtil.sourceLineageVertexOf( + Collections.singleton(LineageUtil.datasetOf(namespace, kafkaDatasetFacet.get()))); } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java index 0a3f84a46..39302751c 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/KafkaSource.java @@ -33,9 +33,11 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.base.source.reader.synchronization.FutureCompletingBlockingQueue; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.DefaultTypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; import org.apache.flink.connector.kafka.lineage.LineageUtil; -import org.apache.flink.connector.kafka.lineage.facets.KafkaPropertiesFacet; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumState; import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumStateSerializer; import org.apache.flink.connector.kafka.source.enumerator.KafkaSourceEnumerator; @@ -51,27 +53,25 @@ import org.apache.flink.connector.kafka.source.split.KafkaPartitionSplitSerializer; import org.apache.flink.core.io.SimpleVersionedSerializer; import org.apache.flink.metrics.MetricGroup; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.api.lineage.LineageVertexProvider; import org.apache.flink.streaming.api.lineage.SourceLineageVertex; import 
org.apache.flink.util.UserCodeClassLoader; import org.apache.flink.util.function.SerializableSupplier; import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; -import java.util.List; +import java.util.Collections; import java.util.Optional; import java.util.Properties; import java.util.function.Consumer; import java.util.function.Supplier; -import static org.apache.flink.connector.kafka.lineage.LineageUtil.sourceLineageVertexOf; - /** * The Source implementation of Kafka. Please use a {@link KafkaSourceBuilder} to construct a {@link * KafkaSource}. The following example shows how to create a KafkaSource emitting records of @@ -100,6 +100,7 @@ public class KafkaSource implements LineageVertexProvider, Source, ResultTypeQueryable { + private static final Logger LOG = LoggerFactory.getLogger(KafkaSource.class); private static final long serialVersionUID = -8755372893283732098L; // Users can choose only one of the following ways to specify the topics to consume from. private final KafkaSubscriber subscriber; @@ -265,16 +266,28 @@ OffsetsInitializer getStoppingOffsetsInitializer() { @Override public SourceLineageVertex getLineageVertex() { - List facets = new ArrayList<>(); + if (!(subscriber instanceof KafkaDatasetIdentifierProvider)) { + LOG.info("unable to determine topic identifier"); + return LineageUtil.sourceLineageVertexOf(Collections.emptyList()); + } + + Optional topicsIdentifier = + ((KafkaDatasetIdentifierProvider) subscriber).getDatasetIdentifier(); - // add all the facets from deserialization schema and subscriber - facets.addAll(LineageUtil.facetsFrom(deserializationSchema)); - facets.addAll(LineageUtil.facetsFrom(subscriber)); + if (!topicsIdentifier.isPresent()) { + LOG.info("No topics' identifier returned from subscriber"); + return LineageUtil.sourceLineageVertexOf(Collections.emptyList()); + } - facets.add(new TypeInformationFacet(getProducedType())); - facets.add(new KafkaPropertiesFacet(props)); + DefaultKafkaDatasetFacet kafkaDatasetFacet = + new DefaultKafkaDatasetFacet(topicsIdentifier.get(), props); - String namespace = LineageUtil.datasetNamespaceOf(props); - return sourceLineageVertexOf(LineageUtil.datasetsFrom(namespace, facets)); + String namespace = LineageUtil.namespaceOf(props); + return LineageUtil.sourceLineageVertexOf( + Collections.singletonList( + LineageUtil.datasetOf( + namespace, + kafkaDatasetFacet, + new DefaultTypeDatasetFacet(getProducedType())))); } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java index 6e03a4b68..3ea6f9a5a 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java @@ -18,9 +18,8 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import 
org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.TopicDescription; @@ -28,17 +27,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; import static org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriberUtils.getTopicMetadata; /** A subscriber for a partition set. */ -class PartitionSetSubscriber implements LineageFacetProvider, KafkaSubscriber { +class PartitionSetSubscriber implements KafkaDatasetIdentifierProvider, KafkaSubscriber { private static final long serialVersionUID = 390970375272146036L; private static final Logger LOG = LoggerFactory.getLogger(PartitionSetSubscriber.class); private final Set subscribedPartitions; @@ -81,9 +79,9 @@ private boolean partitionExistsInTopic(TopicPartition partition, TopicDescriptio } @Override - public List getDatasetFacets() { - return Collections.singletonList( - new KafkaTopicListFacet( + public Optional getDatasetIdentifier() { + return Optional.of( + DefaultKafkaDatasetIdentifier.ofTopics( subscribedPartitions.stream() .map(TopicPartition::topic) .distinct() diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java index a4f25531f..e86ade0fa 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicListSubscriber.java @@ -18,9 +18,8 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.TopicDescription; @@ -29,10 +28,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import static org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriberUtils.getTopicMetadata; @@ -41,7 +40,7 @@ * A subscriber to a fixed list of topics. The subscribed topics must have existed in the Kafka * cluster, otherwise an exception will be thrown. 
*/ -class TopicListSubscriber implements KafkaSubscriber, LineageFacetProvider { +class TopicListSubscriber implements KafkaSubscriber, KafkaDatasetIdentifierProvider { private static final long serialVersionUID = -6917603843104947866L; private static final Logger LOG = LoggerFactory.getLogger(TopicListSubscriber.class); private final List topics; @@ -67,7 +66,7 @@ public Set getSubscribedTopicPartitions(AdminClient adminClient) } @Override - public List getDatasetFacets() { - return Collections.singletonList(new KafkaTopicListFacet(topics)); + public Optional getDatasetIdentifier() { + return Optional.of(DefaultKafkaDatasetIdentifier.ofTopics(topics)); } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java index ada3067bd..208959e27 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/TopicPatternSubscriber.java @@ -18,9 +18,8 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; import org.apache.kafka.clients.admin.AdminClient; import org.apache.kafka.clients.admin.TopicDescription; @@ -29,17 +28,16 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collections; import java.util.HashSet; -import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.regex.Pattern; import static org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriberUtils.getTopicMetadata; /** A subscriber to a topic pattern. 
*/ -class TopicPatternSubscriber implements KafkaSubscriber, LineageFacetProvider { +class TopicPatternSubscriber implements KafkaSubscriber, KafkaDatasetIdentifierProvider { private static final long serialVersionUID = -7471048577725467797L; private static final Logger LOG = LoggerFactory.getLogger(TopicPatternSubscriber.class); private final Pattern topicPattern; @@ -68,7 +66,7 @@ public Set getSubscribedTopicPartitions(AdminClient adminClient) } @Override - public List getDatasetFacets() { - return Collections.singletonList(new KafkaTopicPatternFacet(topicPattern)); + public Optional getDatasetIdentifier() { + return Optional.of(DefaultKafkaDatasetIdentifier.ofPattern(topicPattern)); } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java index 71a567166..1cc7dde79 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaDeserializationSchemaWrapper.java @@ -20,17 +20,12 @@ import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema; import org.apache.flink.util.Collector; import org.apache.kafka.clients.consumer.ConsumerRecord; import java.io.IOException; -import java.util.Collections; -import java.util.List; /** * A wrapper class that wraps a {@link @@ -41,8 +36,7 @@ * @deprecated Remove with @{@link KafkaDeserializationSchema} */ @Deprecated -class KafkaDeserializationSchemaWrapper - implements KafkaRecordDeserializationSchema, LineageFacetProvider { +class KafkaDeserializationSchemaWrapper implements KafkaRecordDeserializationSchema { private static final long serialVersionUID = 1L; private final KafkaDeserializationSchema kafkaDeserializationSchema; @@ -70,9 +64,4 @@ public void deserialize(ConsumerRecord message, Collector out public TypeInformation getProducedType() { return kafkaDeserializationSchema.getProducedType(); } - - @Override - public List getDatasetFacets() { - return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); - } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java index 30b760da1..209f5e15c 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializationSchemaWrapper.java @@ -20,16 +20,11 @@ import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; -import 
org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.util.Collector; import org.apache.kafka.clients.consumer.ConsumerRecord; import java.io.IOException; -import java.util.Collections; -import java.util.List; /** * A class that wraps a {@link DeserializationSchema} as the value deserializer for a {@link @@ -37,8 +32,7 @@ * * @param the return type of the deserialization. */ -class KafkaValueOnlyDeserializationSchemaWrapper - implements KafkaRecordDeserializationSchema, LineageFacetProvider { +class KafkaValueOnlyDeserializationSchemaWrapper implements KafkaRecordDeserializationSchema { private static final long serialVersionUID = 1L; private final DeserializationSchema deserializationSchema; @@ -61,9 +55,4 @@ public void deserialize(ConsumerRecord message, Collector out public TypeInformation getProducedType() { return deserializationSchema.getProducedType(); } - - @Override - public List getDatasetFacets() { - return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); - } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java index 3ab93f3df..8c8095b6b 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaValueOnlyDeserializerWrapper.java @@ -21,9 +21,6 @@ import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.typeutils.TypeExtractor; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.util.Collector; import org.apache.flink.util.InstantiationUtil; import org.apache.flink.util.TemporaryClassLoaderContext; @@ -35,13 +32,10 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.Collections; -import java.util.List; import java.util.Map; /** A package private class to wrap {@link Deserializer}. 
*/ -class KafkaValueOnlyDeserializerWrapper - implements KafkaRecordDeserializationSchema, LineageFacetProvider { +class KafkaValueOnlyDeserializerWrapper implements KafkaRecordDeserializationSchema { private static final long serialVersionUID = 5409547407386004054L; @@ -109,9 +103,4 @@ public void deserialize(ConsumerRecord record, Collector coll public TypeInformation getProducedType() { return TypeExtractor.createTypeInfo(Deserializer.class, deserializerClass, 0, null, null); } - - @Override - public List getDatasetFacets() { - return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); - } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java b/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java index f0005a097..b754b4d09 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/streaming/connectors/kafka/internals/KafkaDeserializationSchemaWrapper.java @@ -20,17 +20,11 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.serialization.DeserializationSchema; import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema; import org.apache.flink.util.Collector; import org.apache.kafka.clients.consumer.ConsumerRecord; -import java.util.Collections; -import java.util.List; - /** * A simple wrapper for using the DeserializationSchema with the KafkaDeserializationSchema * interface. 
@@ -39,8 +33,7 @@ */ @Internal @Deprecated -public class KafkaDeserializationSchemaWrapper - implements KafkaDeserializationSchema, LineageFacetProvider { +public class KafkaDeserializationSchemaWrapper implements KafkaDeserializationSchema { private static final long serialVersionUID = 2651665280744549932L; @@ -75,9 +68,4 @@ public boolean isEndOfStream(T nextElement) { public TypeInformation getProducedType() { return deserializationSchema.getProducedType(); } - - @Override - public List getDatasetFacets() { - return Collections.singletonList(new TypeInformationFacet(this.getProducedType())); - } } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java index 43a3083e3..869399896 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/lineage/LineageUtilTest.java @@ -1,129 +1,28 @@ package org.apache.flink.connector.kafka.lineage; import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; import org.apache.flink.streaming.api.lineage.LineageDataset; import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.api.lineage.SourceLineageVertex; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; -import java.util.Arrays; -import java.util.Collection; import java.util.Collections; -import java.util.List; -import java.util.Optional; +import java.util.Map; import java.util.Properties; -import java.util.regex.Pattern; import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; /** Tests for {@link LineageUtil}. 
*/ public class LineageUtilTest { - - @Test - public void testFromFacetsForNonDatasetFacetProvider() { - assertThat(LineageUtil.facetsFrom(new Object())).isEmpty(); - } - - @Test - public void testFromFacetsWhenNoFacetsReturned() { - LineageFacetProvider facetProvider = mock(LineageFacetProvider.class); - when(facetProvider.getDatasetFacets()).thenReturn(Collections.emptyList()); - - assertThat(LineageUtil.facetsFrom(facetProvider)).isEmpty(); - } - - @Test - public void testFromFacets() { - LineageDatasetFacet facet1 = mock(LineageDatasetFacet.class); - LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); - - LineageFacetProvider facetProvider = mock(LineageFacetProvider.class); - when(facetProvider.getDatasetFacets()).thenReturn(Arrays.asList(facet1, facet2)); - - assertThat(LineageUtil.facetsFrom(facetProvider)).containsExactly(facet1, facet2); - } - - @Test - public void testDatasetsFromWithTopicList() { - LineageDatasetFacet facet1 = new KafkaTopicListFacet(Arrays.asList("topic1", "topic2")); - LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); - LineageDatasetFacet facet3 = mock(LineageDatasetFacet.class); - - when(facet2.name()).thenReturn("facetName2"); - when(facet3.name()).thenReturn("facetName3"); - String namespace = "kafka://host"; - - List facets = Arrays.asList(facet1, facet2, facet3); - - Collection datasets = LineageUtil.datasetsFrom(namespace, facets); - - assertThat(datasets).hasSize(2); - - Optional topic1 = - datasets.stream().filter(e -> "topic1".equals(e.name())).findAny(); - assertThat(topic1).isPresent(); - assertThat(topic1.get().namespace()).isEqualTo(namespace); - assertThat(topic1.get().facets().get("facetName2")).isEqualTo(facet2); - assertThat(topic1.get().facets().get("facetName3")).isEqualTo(facet3); - - Optional topic2 = - datasets.stream().filter(e -> "topic2".equals(e.name())).findAny(); - assertThat(topic2).isPresent(); - assertThat(topic2.get().name()).isEqualTo("topic2"); - assertThat(topic2.get().namespace()).isEqualTo(namespace); - assertThat(topic2.get().facets().get("facetName2")).isEqualTo(facet2); - assertThat(topic2.get().facets().get("facetName3")).isEqualTo(facet3); - } - - @Test - public void testDatasetsFromWithTopicPattern() { - Pattern pattern = Pattern.compile("some-pattern"); - - LineageDatasetFacet facet1 = new KafkaTopicPatternFacet(pattern); - LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); - LineageDatasetFacet facet3 = mock(LineageDatasetFacet.class); - - when(facet2.name()).thenReturn("facetName2"); - when(facet3.name()).thenReturn("facetName3"); - String namespace = "kafka://host"; - - List facets = Arrays.asList(facet1, facet2, facet3); - - Collection datasets = LineageUtil.datasetsFrom(namespace, facets); - assertThat(datasets).hasSize(1); - - LineageDataset dataset = datasets.iterator().next(); - - assertThat(dataset.name()).isEqualTo("some-pattern"); - assertThat(dataset.namespace()).isEqualTo(namespace); - assertThat(dataset.facets().get("facetName2")).isEqualTo(facet2); - assertThat(dataset.facets().get("facetName3")).isEqualTo(facet3); - } - - @Test - public void testDatasetsWithNoTopicListNorPattern() { - LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); - LineageDatasetFacet facet3 = mock(LineageDatasetFacet.class); - - List facets = Arrays.asList(facet2, facet3); - - assertThat(LineageUtil.datasetsFrom("some-namespace", facets)).isEmpty(); - } - @Test public void testSourceLineageVertexOf() { - List datasets = Arrays.asList(Mockito.mock(LineageDataset.class)); - - 
SourceLineageVertex sourceLineageVertex = LineageUtil.sourceLineageVertexOf(datasets); + LineageDataset dataset = new TestingLineageDataset(); + SourceLineageVertex sourceLineageVertex = + LineageUtil.sourceLineageVertexOf(Collections.singletonList(dataset)); assertThat(sourceLineageVertex.boundedness()).isEqualTo(Boundedness.CONTINUOUS_UNBOUNDED); - assertThat(sourceLineageVertex.datasets()).isEqualTo(datasets); + assertThat(sourceLineageVertex.datasets()).containsExactly(dataset); } @Test @@ -131,7 +30,7 @@ public void testDatasetNamespaceOf() { Properties properties = new Properties(); properties.put("bootstrap.servers", "my-kafka-host"); - assertThat(LineageUtil.datasetNamespaceOf(properties)).isEqualTo("kafka://my-kafka-host"); + assertThat(LineageUtil.namespaceOf(properties)).isEqualTo("kafka://my-kafka-host"); } @Test @@ -139,7 +38,7 @@ public void testDatasetNamespaceOfWithSemicolon() { Properties properties = new Properties(); properties.put("bootstrap.servers", "my-kafka-host1;my-kafka-host2"); - assertThat(LineageUtil.datasetNamespaceOf(properties)).isEqualTo("kafka://my-kafka-host1"); + assertThat(LineageUtil.namespaceOf(properties)).isEqualTo("kafka://my-kafka-host1"); } @Test @@ -147,6 +46,29 @@ public void testDatasetNamespaceOfWithComma() { Properties properties = new Properties(); properties.put("bootstrap.servers", "my-kafka-host1,my-kafka-host2"); - assertThat(LineageUtil.datasetNamespaceOf(properties)).isEqualTo("kafka://my-kafka-host1"); + assertThat(LineageUtil.namespaceOf(properties)).isEqualTo("kafka://my-kafka-host1"); + } + + @Test + public void testDatasetNamespaceWhenNoBootstrapServersProperty() { + Properties properties = new Properties(); + assertThat(LineageUtil.namespaceOf(properties)).isEqualTo("kafka://"); + } + + private static class TestingLineageDataset implements LineageDataset { + @Override + public String name() { + return null; + } + + @Override + public String namespace() { + return null; + } + + @Override + public Map facets() { + return null; + } } } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java index 1326009e7..50b1abfbe 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java @@ -20,11 +20,14 @@ import org.apache.flink.api.common.serialization.SerializationSchema; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.api.common.typeinfo.BasicTypeInfo; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.java.typeutils.ResultTypeQueryable; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacetProvider; import 
org.apache.flink.connector.testutils.formats.DummyInitializationContext; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; import org.apache.flink.util.TestLogger; @@ -41,7 +44,6 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; -import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -60,6 +62,12 @@ public class KafkaRecordSerializationSchemaBuilderTest extends TestLogger { private static Map configurableConfiguration; private static Map configuration; + + private interface TestingTopicSelector extends TopicSelector, KafkaDatasetIdentifierProvider {} + + private interface SerializationSchemaWithResultQueryable + extends SerializationSchema, ResultTypeQueryable {} + private static boolean isKeySerializer; @Before @@ -264,35 +272,128 @@ public void testSerializeRecordWithTimestamp() { } @Test - public void testGetLineageDatasetFacets() { - final SerializationSchema serializationSchema = new SimpleStringSchema(); - final KafkaRecordSerializationSchema schema = + public void testGetLineageDatasetFacetsWhenTopicSelectorNotKafkaTopicsIdentifierProvider() { + SerializationSchema serializationSchema = new SimpleStringSchema(); + KafkaRecordSerializationSchema schema = KafkaRecordSerializationSchema.builder() - .setTopic(DEFAULT_TOPIC) + .setTopicSelector((TopicSelector) o -> DEFAULT_TOPIC) + .setValueSerializationSchema(serializationSchema) + .setKeySerializationSchema(serializationSchema) + .build(); + + assertThat(((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet()).isEmpty(); + } + + @Test + public void testGetLineageDatasetFacetsWhenNoTopicsIdentifiersFound() { + SerializationSchema serializationSchema = new SimpleStringSchema(); + KafkaRecordSerializationSchema schema = + KafkaRecordSerializationSchema.builder() + .setTopicSelector( + new TestingTopicSelector() { + @Override + public Optional + getDatasetIdentifier() { + return Optional.empty(); + } + + @Override + public String apply(Object o) { + return DEFAULT_TOPIC; + } + }) .setValueSerializationSchema(serializationSchema) .setKeySerializationSchema(serializationSchema) .build(); + assertThat(((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet()).isEmpty(); + } + + @Test + public void testGetLineageDatasetFacetsValueSerializationSchemaIsResultTypeQueryable() { + TypeInformation stringTypeInformation = TypeInformation.of(String.class); + SerializationSchemaWithResultQueryable serializationSchema = + new SerializationSchemaWithResultQueryable() { + + @Override + public TypeInformation getProducedType() { + return stringTypeInformation; + } + + @Override + public byte[] serialize(Object o) { + return new byte[0]; + } + }; + + KafkaRecordSerializationSchema schema = + KafkaRecordSerializationSchema.builder() + .setTopicSelector( + new TestingTopicSelector() { + @Override + public Optional + getDatasetIdentifier() { + return Optional.of( + DefaultKafkaDatasetIdentifier.ofTopics( + Arrays.asList("topic1", "topic2"))); + } + + @Override + public Object apply(Object o) { + return DEFAULT_TOPIC; + } + }) + .setValueSerializationSchema(serializationSchema) + .setKeySerializationSchema(serializationSchema) + .build(); + + Optional kafkaDatasetFacet = + ((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet(); + + assertThat(kafkaDatasetFacet).isPresent(); + assertThat(kafkaDatasetFacet.get().getTopicIdentifier().getTopics()) + .containsExactly("topic1", 
"topic2"); + assertThat( + ((TypeDatasetFacetProvider) schema) + .getTypeDatasetFacet() + .get() + .getTypeInformation()) + .isEqualTo(stringTypeInformation); + } + + @Test + public void testGetLineageDatasetFacets() { + KafkaRecordSerializationSchema schema = + KafkaRecordSerializationSchema.builder() + .setTopicSelector( + new TestingTopicSelector() { + @Override + public Optional + getDatasetIdentifier() { + return Optional.of( + DefaultKafkaDatasetIdentifier.ofTopics( + Arrays.asList("topic1", "topic2"))); + } + + @Override + public Object apply(Object o) { + return DEFAULT_TOPIC; + } + }) + .setValueSerializationSchema(new SimpleStringSchema()) + .setKeySerializationSchema(new SimpleStringSchema()) + .build(); - Collection facets = ((LineageFacetProvider) schema).getDatasetFacets(); - - assertThat(facets).hasSize(2); - - Optional kafkaTopicListFacet = - facets.stream() - .filter(f -> f instanceof KafkaTopicListFacet) - .map(f -> (KafkaTopicListFacet) f) - .findAny(); - assertThat(kafkaTopicListFacet).isPresent(); - assertThat(kafkaTopicListFacet.get()) - .hasFieldOrPropertyWithValue("topics", Arrays.asList(DEFAULT_TOPIC)); - - Optional typeInformationFacet = - facets.stream() - .filter(f -> f instanceof TypeInformationFacet) - .map(f -> (TypeInformationFacet) f) - .findAny(); - assertThat(typeInformationFacet).isPresent(); - assertThat(typeInformationFacet.get().getTypeInformation()) + Optional kafkaDatasetFacet = + ((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet(); + + assertThat(kafkaDatasetFacet).isPresent(); + assertThat(kafkaDatasetFacet.get().getTopicIdentifier().getTopics()) + .containsExactly("topic1", "topic2"); + assertThat( + ((TypeDatasetFacetProvider) schema) + .getTypeDatasetFacet() + .get() + .getTypeInformation()) .isEqualTo(BasicTypeInfo.STRING_TYPE_INFO); } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java index 998046e49..1efb6ec7d 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaSinkTest.java @@ -1,59 +1,144 @@ package org.apache.flink.connector.kafka.sink; +import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.connector.base.DeliveryGuarantee; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.DefaultTypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacetProvider; import org.apache.flink.streaming.api.lineage.LineageVertex; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.jetbrains.annotations.Nullable; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.util.Arrays; +import java.util.Collections; +import java.util.Optional; import java.util.Properties; import static 
org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; -import static org.mockito.Mockito.withSettings; /** Tests for {@link KafkaSink}. */ public class KafkaSinkTest { + Properties kafkaProperties; + + @BeforeEach + void setup() { + kafkaProperties = new Properties(); + kafkaProperties.put("bootstrap.servers", "host1;host2"); + } + + @Test + public void testGetLineageVertexWhenSerializerNotAnKafkaDatasetFacetProvider() { + KafkaRecordSerializationSchema recordSerializer = + new KafkaRecordSerializationSchemaWithoutKafkaDatasetProvider(); + KafkaSink sink = + new KafkaSink( + DeliveryGuarantee.EXACTLY_ONCE, new Properties(), "", recordSerializer); + + assertThat(sink.getLineageVertex().datasets()).isEmpty(); + } + + @Test + public void testGetLineageVertexWhenNoKafkaDatasetFacetReturnedFromSerializer() { + KafkaRecordSerializationSchema recordSerializer = + new KafkaRecordSerializationSchemaWithEmptyKafkaDatasetProvider(); + + KafkaSink sink = + new KafkaSink( + DeliveryGuarantee.EXACTLY_ONCE, new Properties(), "", recordSerializer); + + assertThat(sink.getLineageVertex().datasets()).isEmpty(); + } + @Test public void testGetLineageVertex() { - LineageDatasetFacet facet1 = mock(LineageDatasetFacet.class); - LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); - when(facet1.name()).thenReturn("facet1"); - when(facet2.name()).thenReturn("facet2"); - LineageDatasetFacet topicSelector = - new KafkaTopicListFacet(Arrays.asList("topic1", "topic2")); - - KafkaRecordSerializationSchema schema = - mock( - KafkaRecordSerializationSchema.class, - withSettings().extraInterfaces(LineageFacetProvider.class)); - - when(((LineageFacetProvider) schema).getDatasetFacets()) - .thenReturn(Arrays.asList(facet1, facet2, topicSelector)); - Properties kafkaProperties = new Properties(); - kafkaProperties.put("bootstrap.servers", "host1;host2"); - KafkaSink sink = new KafkaSink(DeliveryGuarantee.EXACTLY_ONCE, kafkaProperties, "", schema); + KafkaRecordSerializationSchema recordSerializer = + new TestingKafkaRecordSerializationSchema(); + + KafkaSink sink = + new KafkaSink( + DeliveryGuarantee.EXACTLY_ONCE, kafkaProperties, "", recordSerializer); LineageVertex lineageVertex = sink.getLineageVertex(); - assertThat(lineageVertex.datasets()).hasSize(2); assertThat(lineageVertex.datasets().get(0).namespace()).isEqualTo("kafka://host1"); assertThat(lineageVertex.datasets().get(0).name()).isEqualTo("topic1"); - assertThat(lineageVertex.datasets().get(1).namespace()).isEqualTo("kafka://host1"); - assertThat(lineageVertex.datasets().get(1).name()).isEqualTo("topic2"); + assertThat( + lineageVertex + .datasets() + .get(0) + .facets() + .get(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME)) + .hasFieldOrPropertyWithValue("properties", kafkaProperties) + .hasFieldOrPropertyWithValue( + "topicIdentifier", + DefaultKafkaDatasetIdentifier.ofTopics( + Collections.singletonList("topic1"))); + + assertThat( + lineageVertex + .datasets() + .get(0) + .facets() + .get(DefaultTypeDatasetFacet.TYPE_FACET_NAME)) + .hasFieldOrPropertyWithValue("typeInformation", TypeInformation.of(String.class)); + } + + private static class KafkaRecordSerializationSchemaWithoutKafkaDatasetProvider + implements KafkaRecordSerializationSchema { + @Nullable + @Override + public ProducerRecord serialize( + Object element, KafkaSinkContext context, Long timestamp) { + return null; + } + } + + private static class KafkaRecordSerializationSchemaWithEmptyKafkaDatasetProvider + implements 
KafkaRecordSerializationSchema, KafkaDatasetFacetProvider { + @Nullable + @Override + public ProducerRecord serialize( + Object element, KafkaSinkContext context, Long timestamp) { + return null; + } + + @Override + public Optional getKafkaDatasetFacet() { + return Optional.empty(); + } + } + + private static class TestingKafkaRecordSerializationSchema + implements KafkaRecordSerializationSchema, + KafkaDatasetFacetProvider, + TypeDatasetFacetProvider { + + @Override + public Optional getKafkaDatasetFacet() { + return Optional.of( + new DefaultKafkaDatasetFacet( + DefaultKafkaDatasetIdentifier.ofTopics( + Collections.singletonList("topic1")))); + } - // facets shall be the same for both datasets - assertThat(lineageVertex.datasets().get(0).facets()) - .isEqualTo(lineageVertex.datasets().get(1).facets()); + @Nullable + @Override + public ProducerRecord serialize( + Object element, KafkaSinkContext context, Long timestamp) { + return null; + } - assertThat(lineageVertex.datasets().get(0).facets()) - .containsEntry("facet1", facet1) - .containsEntry("facet2", facet2); + @Override + public Optional getTypeDatasetFacet() { + return Optional.of(new DefaultTypeDatasetFacet(TypeInformation.of(String.class))); + } } } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java index c138d2fb6..f755edc9e 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java @@ -17,69 +17,149 @@ package org.apache.flink.connector.kafka.source; +import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.connector.source.Boundedness; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.DefaultTypeDatasetFacet; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; -import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import org.apache.flink.streaming.api.lineage.LineageVertex; +import org.apache.flink.util.Collector; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.common.TopicPartition; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.util.Arrays; +import java.io.IOException; +import java.util.Collections; +import java.util.Optional; import java.util.Properties; +import java.util.Set; import static org.assertj.core.api.Assertions.assertThat; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; -import static org.mockito.Mockito.withSettings; /** Tests for {@link KafkaSource}. 
*/ public class KafkaSourceTest { + Properties kafkaProperties; + + private interface TestingKafkaSubscriber + extends KafkaSubscriber, KafkaDatasetIdentifierProvider {} + + @BeforeEach + void setup() { + kafkaProperties = new Properties(); + kafkaProperties.put("bootstrap.servers", "host1;host2"); + } @Test - public void testGetLineageVertex() { - LineageDatasetFacet facet1 = mock(LineageDatasetFacet.class); - LineageDatasetFacet facet2 = mock(LineageDatasetFacet.class); - when(facet1.name()).thenReturn("facet1"); - when(facet2.name()).thenReturn("facet2"); + public void testGetLineageVertexWhenSubscriberNotAnKafkaDatasetFacetProvider() { + KafkaSource source = + new KafkaSource( + new KafkaSubscriber() { + @Override + public Set getSubscribedTopicPartitions( + AdminClient adminClient) { + return null; + } + }, + null, + null, + Boundedness.CONTINUOUS_UNBOUNDED, + null, + kafkaProperties, + null); + assertThat(source.getLineageVertex().datasets()).isEmpty(); + } - KafkaRecordDeserializationSchema schema = - mock( - KafkaRecordDeserializationSchema.class, - withSettings().extraInterfaces(LineageFacetProvider.class)); + @Test + public void testGetLineageVertexWhenNoKafkaTopicsIdentifier() { + KafkaSource source = + new KafkaSource( + new TestingKafkaSubscriber() { + @Override + public Optional getDatasetIdentifier() { + return Optional.empty(); + } - when(((LineageFacetProvider) schema).getDatasetFacets()) - .thenReturn(Arrays.asList(facet1, facet2)); - Properties kafkaProperties = new Properties(); + @Override + public Set getSubscribedTopicPartitions( + AdminClient adminClient) { + return null; + } + }, + null, + null, + Boundedness.CONTINUOUS_UNBOUNDED, + null, + kafkaProperties, + null); + assertThat(source.getLineageVertex().datasets()).isEmpty(); + assertThat(source.getLineageVertex().datasets()).isEmpty(); + } - kafkaProperties.put("bootstrap.servers", "host1;host2"); + @Test + public void testGetLineageVertex() { + TypeInformation typeInformation = TypeInformation.of(String.class); KafkaSource source = new KafkaSource( - KafkaSubscriber.getTopicListSubscriber(Arrays.asList("topic1", "topic2")), - mock(OffsetsInitializer.class), + new TestingKafkaSubscriber() { + @Override + public Optional getDatasetIdentifier() { + return Optional.of( + DefaultKafkaDatasetIdentifier.ofTopics( + Collections.singletonList("topic1"))); + } + + @Override + public Set getSubscribedTopicPartitions( + AdminClient adminClient) { + return null; + } + }, + null, null, Boundedness.CONTINUOUS_UNBOUNDED, - schema, + new KafkaRecordDeserializationSchema() { + @Override + public void deserialize(ConsumerRecord record, Collector out) + throws IOException {} + + @Override + public TypeInformation getProducedType() { + return typeInformation; + } + }, kafkaProperties, null); LineageVertex lineageVertex = source.getLineageVertex(); - assertThat(lineageVertex.datasets()).hasSize(2); + assertThat(lineageVertex.datasets()).hasSize(1); assertThat(lineageVertex.datasets().get(0).namespace()).isEqualTo("kafka://host1"); assertThat(lineageVertex.datasets().get(0).name()).isEqualTo("topic1"); - assertThat(lineageVertex.datasets().get(1).namespace()).isEqualTo("kafka://host1"); - assertThat(lineageVertex.datasets().get(1).name()).isEqualTo("topic2"); - - // facets shall be the same for both datasets - assertThat(lineageVertex.datasets().get(0).facets()) - .isEqualTo(lineageVertex.datasets().get(1).facets()); + assertThat( + lineageVertex + .datasets() + .get(0) + .facets() + .get(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME)) + 
.hasFieldOrPropertyWithValue("properties", kafkaProperties) + .hasFieldOrPropertyWithValue( + "topicIdentifier", + DefaultKafkaDatasetIdentifier.ofTopics( + Collections.singletonList("topic1"))); - assertThat(lineageVertex.datasets().get(0).facets()) - .containsEntry("facet1", facet1) - .containsEntry("facet2", facet2); + assertThat( + lineageVertex + .datasets() + .get(0) + .facets() + .get(DefaultTypeDatasetFacet.TYPE_FACET_NAME)) + .hasFieldOrPropertyWithValue("typeInformation", TypeInformation.of(String.class)); } } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java index 435570de8..4c5a50243 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriberTest.java @@ -18,9 +18,8 @@ package org.apache.flink.connector.kafka.source.enumerator.subscriber; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicListFacet; -import org.apache.flink.connector.kafka.lineage.facets.KafkaTopicPatternFacet; +import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; +import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; import org.apache.flink.connector.kafka.testutils.KafkaSourceTestEnv; import org.apache.kafka.clients.admin.AdminClient; @@ -74,8 +73,8 @@ public void testTopicListSubscriber() { new HashSet<>(KafkaSourceTestEnv.getPartitionsForTopics(topics)); assertThat(subscribedPartitions).isEqualTo(expectedSubscribedPartitions); - assertThat(((LineageFacetProvider) subscriber).getDatasetFacets()) - .containsExactly(new KafkaTopicListFacet(topics)); + assertThat(((KafkaDatasetIdentifierProvider) subscriber).getDatasetIdentifier().get()) + .isEqualTo(DefaultKafkaDatasetIdentifier.ofTopics(topics)); } @Test @@ -91,8 +90,8 @@ public void testNonExistingTopic() { @Test public void testTopicPatternSubscriber() { - KafkaSubscriber subscriber = - KafkaSubscriber.getTopicPatternSubscriber(Pattern.compile("pattern.*")); + Pattern pattern = Pattern.compile("pattern.*"); + KafkaSubscriber subscriber = KafkaSubscriber.getTopicPatternSubscriber(pattern); final Set subscribedPartitions = subscriber.getSubscribedTopicPartitions(adminClient); @@ -101,8 +100,8 @@ public void testTopicPatternSubscriber() { KafkaSourceTestEnv.getPartitionsForTopics(Collections.singleton(TOPIC2))); assertThat(subscribedPartitions).isEqualTo(expectedSubscribedPartitions); - assertThat(((LineageFacetProvider) subscriber).getDatasetFacets()) - .containsExactly(new KafkaTopicPatternFacet(Pattern.compile("pattern.*"))); + assertThat(((KafkaDatasetIdentifierProvider) subscriber).getDatasetIdentifier().get()) + .isEqualTo(DefaultKafkaDatasetIdentifier.ofPattern(pattern)); } @Test @@ -118,8 +117,8 @@ public void testPartitionSetSubscriber() { subscriber.getSubscribedTopicPartitions(adminClient); assertThat(subscribedPartitions).isEqualTo(partitions); - assertThat(((LineageFacetProvider) subscriber).getDatasetFacets()) - .containsExactly(new KafkaTopicListFacet(topics)); + assertThat(((KafkaDatasetIdentifierProvider) subscriber).getDatasetIdentifier().get()) + .isEqualTo(DefaultKafkaDatasetIdentifier.ofTopics(topics)); } 
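To make the subscriber-side contract exercised by these tests concrete, a minimal user-side sketch follows. It is illustrative only and not part of this patch: the class name FixedTopicsSubscriber and the topics "orders"/"payments" are invented, and partition resolution is elided. The idea is simply that a custom KafkaSubscriber can additionally implement KafkaDatasetIdentifierProvider so that the source can report its topics as lineage datasets.

    // Illustrative sketch only -- not part of this patch; class and topic names are invented.
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;
    import java.util.Optional;
    import java.util.Set;

    import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier;
    import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider;
    import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber;
    import org.apache.kafka.clients.admin.AdminClient;
    import org.apache.kafka.common.TopicPartition;

    /** A custom subscriber that opts into lineage by also implementing KafkaDatasetIdentifierProvider. */
    class FixedTopicsSubscriber implements KafkaSubscriber, KafkaDatasetIdentifierProvider {

        private final List<String> topics = Arrays.asList("orders", "payments");

        @Override
        public Set<TopicPartition> getSubscribedTopicPartitions(AdminClient adminClient) {
            // A real implementation resolves partitions through the admin client,
            // as the built-in TopicListSubscriber does; elided in this sketch.
            return Collections.emptySet();
        }

        @Override
        public Optional<DefaultKafkaDatasetIdentifier> getDatasetIdentifier() {
            // Picked up when the source builds its lineage vertex, as the tests above assert.
            return Optional.of(DefaultKafkaDatasetIdentifier.ofTopics(topics));
        }
    }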
@Test diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java index 9bb7eb0c2..b0ca63161 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/reader/deserializer/KafkaRecordDeserializationSchemaTest.java @@ -18,10 +18,6 @@ package org.apache.flink.connector.kafka.source.reader.deserializer; -import org.apache.flink.api.common.typeinfo.BasicTypeInfo; -import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.connector.kafka.lineage.LineageFacetProvider; -import org.apache.flink.connector.kafka.lineage.facets.TypeInformationFacet; import org.apache.flink.connector.kafka.util.JacksonMapperFactory; import org.apache.flink.connector.testutils.formats.DummyInitializationContext; import org.apache.flink.connector.testutils.source.deserialization.TestingDeserializationContext; @@ -80,8 +76,6 @@ public void testKafkaDeserializationSchemaWrapper() throws Exception { assertThat(deserializedValue.get("metadata").get("topic").asText()).isEqualTo("topic#1"); assertThat(deserializedValue.get("metadata").get("offset").asInt()).isEqualTo(4); assertThat(deserializedValue.get("metadata").get("partition").asInt()).isEqualTo(3); - assertThat(((LineageFacetProvider) schema).getDatasetFacets()) - .containsExactly(new TypeInformationFacet(TypeInformation.of(ObjectNode.class))); } @Test @@ -108,12 +102,6 @@ public void testKafkaValueDeserializationSchemaWrapper() throws Exception { assertThat(deserializedValue.get("word").asText()).isEqualTo("world"); assertThat(deserializedValue.get("key")).isNull(); assertThat(deserializedValue.get("metadata")).isNull(); - assertThat(((LineageFacetProvider) schema).getDatasetFacets()) - .containsExactly( - new TypeInformationFacet( - TypeInformation.of( - org.apache.flink.shaded.jackson2.com.fasterxml.jackson - .databind.node.ObjectNode.class))); } @Test @@ -131,8 +119,6 @@ public void testKafkaValueDeserializerWrapper() throws Exception { assertThat(collector.list).hasSize(1); assertThat(collector.list.get(0)).isEqualTo("world"); - assertThat(((LineageFacetProvider) schema).getDatasetFacets()) - .containsExactly(new TypeInformationFacet(BasicTypeInfo.STRING_TYPE_INFO)); } @Test From 4f7acc0d717d6c7d0ed6b9eb3b00b361049ca7b0 Mon Sep 17 00:00:00 2001 From: Pawel Leszczynski Date: Tue, 12 Nov 2024 08:22:19 +0100 Subject: [PATCH 3/3] code review changes Signed-off-by: Pawel Leszczynski --- .../lineage/DefaultKafkaDatasetFacet.java | 2 + .../DefaultKafkaDatasetIdentifier.java | 9 +- .../lineage/DefaultTypeDatasetFacet.java | 2 + .../kafka/lineage/KafkaDatasetFacet.java | 2 + .../lineage/KafkaDatasetFacetProvider.java | 9 +- .../kafka/lineage/KafkaDatasetIdentifier.java | 8 +- .../KafkaDatasetIdentifierProvider.java | 5 +- .../connector/kafka/lineage/LineageUtil.java | 2 +- .../kafka/lineage/TypeDatasetFacet.java | 2 + .../lineage/TypeDatasetFacetProvider.java | 5 +- .../sink/KafkaRecordSerializationSchema.java | 5 +- .../flink/connector/kafka/sink/KafkaSink.java | 2 +- .../subscriber/KafkaSubscriber.java | 4 + .../subscriber/PartitionSetSubscriber.java | 2 +- ...aRecordSerializationSchemaBuilderTest.java | 53 +++-- 
.../kafka/source/KafkaSourceTest.java | 206 ++++++++++-------- 16 files changed, 182 insertions(+), 136 deletions(-) diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java index cb1a4671c..e1c682345 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetFacet.java @@ -1,11 +1,13 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.connector.kafka.source.KafkaPropertiesUtil; import java.util.Objects; import java.util.Properties; /** Default implementation of {@link KafkaDatasetFacet}. */ +@PublicEvolving public class DefaultKafkaDatasetFacet implements KafkaDatasetFacet { public static final String KAFKA_FACET_NAME = "kafka"; diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java index bd05cfd52..cd97b7ff4 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultKafkaDatasetIdentifier.java @@ -1,25 +1,28 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; + import javax.annotation.Nullable; -import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.regex.Pattern; /** Default implementation of {@link KafkaDatasetIdentifier}. */ +@PublicEvolving public class DefaultKafkaDatasetIdentifier implements KafkaDatasetIdentifier { @Nullable private final List topics; @Nullable private final Pattern topicPattern; - public DefaultKafkaDatasetIdentifier(List fixedTopics, Pattern topicPattern) { + private DefaultKafkaDatasetIdentifier( + @Nullable List fixedTopics, @Nullable Pattern topicPattern) { this.topics = fixedTopics; this.topicPattern = topicPattern; } public static DefaultKafkaDatasetIdentifier ofPattern(Pattern pattern) { - return new DefaultKafkaDatasetIdentifier(Collections.emptyList(), pattern); + return new DefaultKafkaDatasetIdentifier(null, pattern); } public static DefaultKafkaDatasetIdentifier ofTopics(List fixedTopics) { diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java index 69183e3a1..d9475d77a 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/DefaultTypeDatasetFacet.java @@ -1,10 +1,12 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.typeinfo.TypeInformation; import java.util.Objects; /** Default implementation of {@link KafkaDatasetFacet}. 
*/ +@PublicEvolving public class DefaultTypeDatasetFacet implements TypeDatasetFacet { public static final String TYPE_FACET_NAME = "type"; diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java index 22d14dd2c..c0d3d0b73 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacet.java @@ -1,10 +1,12 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; import java.util.Properties; /** Facet definition to contain all Kafka specific information on Kafka sources and sinks. */ +@PublicEvolving public interface KafkaDatasetFacet extends LineageDatasetFacet { Properties getProperties(); diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java index 0eed6f715..76fe41b82 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetFacetProvider.java @@ -1,15 +1,16 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; + import java.util.Optional; /** Contains method to extract {@link KafkaDatasetFacet}. */ +@PublicEvolving public interface KafkaDatasetFacetProvider { /** - * Returns a Kafka dataset facet or `Optional.empty` in case an implementing class is not able - * to identify a dataset. - * - * @return + * Returns a Kafka dataset facet or empty in case an implementing class is not able to identify + * a dataset. */ Optional getKafkaDatasetFacet(); } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java index 0c43f8be9..19f7082e2 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifier.java @@ -1,11 +1,15 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; + import javax.annotation.Nullable; import java.util.List; +import java.util.Objects; import java.util.regex.Pattern; /** Kafka dataset identifier which can contain either a list of topics or a topic pattern. */ +@PublicEvolving public interface KafkaDatasetIdentifier { @Nullable List getTopics(); @@ -16,13 +20,11 @@ public interface KafkaDatasetIdentifier { /** * Assigns lineage dataset's name which is topic pattern if it is present or comma separated * list of topics. 
- * - * @return */ default String toLineageName() { if (getTopicPattern() != null) { return getTopicPattern().toString(); } - return String.join(",", getTopics()); + return String.join(",", Objects.requireNonNull(getTopics())); } } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java index 36f8c4f2e..1389fea58 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/KafkaDatasetIdentifierProvider.java @@ -1,15 +1,16 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; + import java.util.Optional; /** Contains method which allows extracting topic identifier. */ +@PublicEvolving public interface KafkaDatasetIdentifierProvider { /** * Gets Kafka dataset identifier or empty in case a class implementing is not able to extract * dataset identifier. - * - * @return */ Optional getDatasetIdentifier(); } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java index 779c167c6..086303e09 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/LineageUtil.java @@ -66,7 +66,7 @@ public String namespace() { @Override public Map facets() { - Map facetMap = new HashMap(); + Map facetMap = new HashMap<>(); facetMap.put(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME, kafkaDatasetFacet); facetMap.putAll( facets.stream() diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java index 4b4261c65..1e64f5819 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacet.java @@ -1,9 +1,11 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.streaming.api.lineage.LineageDatasetFacet; /** Facet definition to contain type information of source and sink. */ +@PublicEvolving public interface TypeDatasetFacet extends LineageDatasetFacet { TypeInformation getTypeInformation(); } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java index b2f0ea831..016a1bb84 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/lineage/TypeDatasetFacetProvider.java @@ -1,15 +1,16 @@ package org.apache.flink.connector.kafka.lineage; +import org.apache.flink.annotation.PublicEvolving; + import java.util.Optional; /** Contains method to extract {@link TypeDatasetFacet}. 
*/ +@PublicEvolving public interface TypeDatasetFacetProvider { /** * Returns a type dataset facet or `Optional.empty` in case an implementing class is not able to * resolve type. - * - * @return */ Optional getTypeDatasetFacet(); } diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchema.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchema.java index 9d081c755..f56a7da54 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchema.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchema.java @@ -29,7 +29,10 @@ /** * A serialization schema which defines how to convert a value of type {@code T} to {@link - * ProducerRecord}. + * ProducerRecord}. {@link org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider} can + * be implemented to provide Kafka specific lineage metadata, while {@link + * org.apache.flink.connector.kafka.lineage.TypeDatasetFacetProvider} can be implemented to provide + * lineage metadata with type information. * * @param the type of values being serialized */ diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java index 0f89e5bb2..d3d3c89df 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/sink/KafkaSink.java @@ -154,7 +154,7 @@ public LineageVertex getLineageVertex() { ((KafkaDatasetFacetProvider) recordSerializer).getKafkaDatasetFacet(); if (!kafkaDatasetFacet.isPresent()) { - LOG.info("Provided did not return kafka dataset facet"); + LOG.info("Provider did not return kafka dataset facet"); return LineageUtil.sourceLineageVertexOf(Collections.emptyList()); } kafkaDatasetFacet.get().setProperties(this.kafkaProducerConfig); diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriber.java index 1b819fb23..37de884af 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/KafkaSubscriber.java @@ -39,6 +39,10 @@ * *

The KafkaSubscriber provides a unified interface for the Kafka source to support all these * three types of subscribing mode. + * + *

When implementing a subscriber, {@link + * org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider} can be implemented to + * provide lineage metadata with source topics. */ @PublicEvolving public interface KafkaSubscriber extends Serializable { diff --git a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java index 3ea6f9a5a..9cd50fb20 100644 --- a/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java +++ b/flink-connector-kafka/src/main/java/org/apache/flink/connector/kafka/source/enumerator/subscriber/PartitionSetSubscriber.java @@ -36,7 +36,7 @@ import static org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriberUtils.getTopicMetadata; /** A subscriber for a partition set. */ -class PartitionSetSubscriber implements KafkaDatasetIdentifierProvider, KafkaSubscriber { +class PartitionSetSubscriber implements KafkaSubscriber, KafkaDatasetIdentifierProvider { private static final long serialVersionUID = 390970375272146036L; private static final Logger LOG = LoggerFactory.getLogger(PartitionSetSubscriber.class); private final Set subscribedPartitions; diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java index 50b1abfbe..4d1437288 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/sink/KafkaRecordSerializationSchemaBuilderTest.java @@ -26,6 +26,7 @@ import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacet; import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider; import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; +import org.apache.flink.connector.kafka.lineage.TypeDatasetFacet; import org.apache.flink.connector.kafka.lineage.TypeDatasetFacetProvider; import org.apache.flink.connector.testutils.formats.DummyInitializationContext; import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; @@ -39,6 +40,7 @@ import org.apache.kafka.common.serialization.Deserializer; import org.apache.kafka.common.serialization.StringDeserializer; import org.apache.kafka.common.serialization.StringSerializer; +import org.assertj.core.api.InstanceOfAssertFactories; import org.junit.Before; import org.junit.Test; @@ -63,10 +65,11 @@ public class KafkaRecordSerializationSchemaBuilderTest extends TestLogger { private static Map configurableConfiguration; private static Map configuration; - private interface TestingTopicSelector extends TopicSelector, KafkaDatasetIdentifierProvider {} + private interface TestingTopicSelector + extends TopicSelector, KafkaDatasetIdentifierProvider {} - private interface SerializationSchemaWithResultQueryable - extends SerializationSchema, ResultTypeQueryable {} + private interface SerializationSchemaWithResultQueryable + extends SerializationSchema, ResultTypeQueryable {} private static boolean isKeySerializer; @@ -281,7 +284,9 @@ public void testGetLineageDatasetFacetsWhenTopicSelectorNotKafkaTopicsIdentifier 
.setKeySerializationSchema(serializationSchema) .build(); - assertThat(((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet()).isEmpty(); + assertThat(schema) + .asInstanceOf(InstanceOfAssertFactories.type(KafkaDatasetFacetProvider.class)) + .returns(Optional.empty(), KafkaDatasetFacetProvider::getKafkaDatasetFacet); } @Test @@ -290,7 +295,7 @@ public void testGetLineageDatasetFacetsWhenNoTopicsIdentifiersFound() { KafkaRecordSerializationSchema schema = KafkaRecordSerializationSchema.builder() .setTopicSelector( - new TestingTopicSelector() { + new TestingTopicSelector() { @Override public Optional getDatasetIdentifier() { @@ -305,22 +310,24 @@ public String apply(Object o) { .setValueSerializationSchema(serializationSchema) .setKeySerializationSchema(serializationSchema) .build(); - assertThat(((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet()).isEmpty(); + assertThat(schema) + .asInstanceOf(InstanceOfAssertFactories.type(KafkaDatasetFacetProvider.class)) + .returns(Optional.empty(), KafkaDatasetFacetProvider::getKafkaDatasetFacet); } @Test public void testGetLineageDatasetFacetsValueSerializationSchemaIsResultTypeQueryable() { TypeInformation stringTypeInformation = TypeInformation.of(String.class); - SerializationSchemaWithResultQueryable serializationSchema = - new SerializationSchemaWithResultQueryable() { + SerializationSchemaWithResultQueryable serializationSchema = + new SerializationSchemaWithResultQueryable() { @Override - public TypeInformation getProducedType() { + public TypeInformation getProducedType() { return stringTypeInformation; } @Override - public byte[] serialize(Object o) { + public byte[] serialize(String o) { return new byte[0]; } }; @@ -328,7 +335,7 @@ public byte[] serialize(Object o) { KafkaRecordSerializationSchema schema = KafkaRecordSerializationSchema.builder() .setTopicSelector( - new TestingTopicSelector() { + new TestingTopicSelector() { @Override public Optional getDatasetIdentifier() { @@ -338,7 +345,7 @@ public byte[] serialize(Object o) { } @Override - public Object apply(Object o) { + public String apply(Object o) { return DEFAULT_TOPIC; } }) @@ -352,11 +359,10 @@ public Object apply(Object o) { assertThat(kafkaDatasetFacet).isPresent(); assertThat(kafkaDatasetFacet.get().getTopicIdentifier().getTopics()) .containsExactly("topic1", "topic2"); - assertThat( - ((TypeDatasetFacetProvider) schema) - .getTypeDatasetFacet() - .get() - .getTypeInformation()) + assertThat(((TypeDatasetFacetProvider) schema).getTypeDatasetFacet()) + .isPresent() + .get() + .extracting(TypeDatasetFacet::getTypeInformation) .isEqualTo(stringTypeInformation); } @@ -365,7 +371,7 @@ public void testGetLineageDatasetFacets() { KafkaRecordSerializationSchema schema = KafkaRecordSerializationSchema.builder() .setTopicSelector( - new TestingTopicSelector() { + new TestingTopicSelector() { @Override public Optional getDatasetIdentifier() { @@ -375,7 +381,7 @@ public void testGetLineageDatasetFacets() { } @Override - public Object apply(Object o) { + public String apply(Object o) { return DEFAULT_TOPIC; } }) @@ -389,11 +395,10 @@ public Object apply(Object o) { assertThat(kafkaDatasetFacet).isPresent(); assertThat(kafkaDatasetFacet.get().getTopicIdentifier().getTopics()) .containsExactly("topic1", "topic2"); - assertThat( - ((TypeDatasetFacetProvider) schema) - .getTypeDatasetFacet() - .get() - .getTypeInformation()) + assertThat(((TypeDatasetFacetProvider) schema).getTypeDatasetFacet()) + .isPresent() + .get() + .extracting(TypeDatasetFacet::getTypeInformation) 
.isEqualTo(BasicTypeInfo.STRING_TYPE_INFO); } diff --git a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java index f755edc9e..259668c5d 100644 --- a/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java +++ b/flink-connector-kafka/src/test/java/org/apache/flink/connector/kafka/source/KafkaSourceTest.java @@ -18,13 +18,14 @@ package org.apache.flink.connector.kafka.source; import org.apache.flink.api.common.typeinfo.TypeInformation; -import org.apache.flink.api.connector.source.Boundedness; import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetFacet; import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier; import org.apache.flink.connector.kafka.lineage.DefaultTypeDatasetFacet; import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider; +import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber; import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; +import org.apache.flink.streaming.api.lineage.LineageDataset; import org.apache.flink.streaming.api.lineage.LineageVertex; import org.apache.flink.util.Collector; @@ -46,9 +47,6 @@ public class KafkaSourceTest { Properties kafkaProperties; - private interface TestingKafkaSubscriber - extends KafkaSubscriber, KafkaDatasetIdentifierProvider {} - @BeforeEach void setup() { kafkaProperties = new Properties(); @@ -57,109 +55,129 @@ void setup() { @Test public void testGetLineageVertexWhenSubscriberNotAnKafkaDatasetFacetProvider() { - KafkaSource source = - new KafkaSource( - new KafkaSubscriber() { - @Override - public Set getSubscribedTopicPartitions( - AdminClient adminClient) { - return null; - } - }, - null, - null, - Boundedness.CONTINUOUS_UNBOUNDED, - null, - kafkaProperties, - null); - assertThat(source.getLineageVertex().datasets()).isEmpty(); + KafkaSource source = + new KafkaSourceBuilder() + .setKafkaSubscriber( + new KafkaSubscriber() { + @Override + public Set getSubscribedTopicPartitions( + AdminClient adminClient) { + return null; + } + }) + .setProperties(kafkaProperties) + .setGroupId("") + .setDeserializer( + new KafkaRecordDeserializationSchema() { + @Override + public TypeInformation getProducedType() { + return null; + } + + @Override + public void deserialize( + ConsumerRecord record, + Collector out) + throws IOException {} + }) + .setUnbounded(OffsetsInitializer.committedOffsets()) + .build(); + + assertThat(source.getLineageVertex()) + .extracting(LineageVertex::datasets) + .asList() + .isEmpty(); } @Test public void testGetLineageVertexWhenNoKafkaTopicsIdentifier() { - KafkaSource source = - new KafkaSource( - new TestingKafkaSubscriber() { - @Override - public Optional getDatasetIdentifier() { - return Optional.empty(); - } - - @Override - public Set getSubscribedTopicPartitions( - AdminClient adminClient) { - return null; - } - }, - null, - null, - Boundedness.CONTINUOUS_UNBOUNDED, - null, - kafkaProperties, - null); - assertThat(source.getLineageVertex().datasets()).isEmpty(); - assertThat(source.getLineageVertex().datasets()).isEmpty(); + KafkaSource source = + new KafkaSourceBuilder() + .setKafkaSubscriber( + new TestingKafkaSubscriber() { + @Override + public Optional + getDatasetIdentifier() { + return 
Optional.empty(); + } + }) + .setProperties(kafkaProperties) + .setGroupId("") + .setDeserializer( + new KafkaRecordDeserializationSchema() { + @Override + public void deserialize( + ConsumerRecord record, + Collector out) + throws IOException {} + + @Override + public TypeInformation getProducedType() { + return TypeInformation.of(String.class); + } + }) + .setUnbounded(OffsetsInitializer.committedOffsets()) + .build(); + assertThat(source.getLineageVertex()) + .extracting(LineageVertex::datasets) + .asList() + .isEmpty(); } @Test public void testGetLineageVertex() { TypeInformation typeInformation = TypeInformation.of(String.class); - KafkaSource source = - new KafkaSource( - new TestingKafkaSubscriber() { - @Override - public Optional getDatasetIdentifier() { - return Optional.of( - DefaultKafkaDatasetIdentifier.ofTopics( - Collections.singletonList("topic1"))); - } - - @Override - public Set getSubscribedTopicPartitions( - AdminClient adminClient) { - return null; - } - }, - null, - null, - Boundedness.CONTINUOUS_UNBOUNDED, - new KafkaRecordDeserializationSchema() { - @Override - public void deserialize(ConsumerRecord record, Collector out) - throws IOException {} - - @Override - public TypeInformation getProducedType() { - return typeInformation; - } - }, - kafkaProperties, - null); + KafkaSource source = + new KafkaSourceBuilder() + .setKafkaSubscriber(new TestingKafkaSubscriber()) + .setProperties(kafkaProperties) + .setGroupId("") + .setDeserializer( + new KafkaRecordDeserializationSchema() { + @Override + public void deserialize( + ConsumerRecord record, + Collector out) + throws IOException {} + + @Override + public TypeInformation getProducedType() { + return typeInformation; + } + }) + .setUnbounded(OffsetsInitializer.committedOffsets()) + .build(); LineageVertex lineageVertex = source.getLineageVertex(); assertThat(lineageVertex.datasets()).hasSize(1); + LineageDataset dataset = lineageVertex.datasets().get(0); + + assertThat(dataset.namespace()).isEqualTo("kafka://host1"); + assertThat(dataset.name()).isEqualTo("topic1"); + + assertThat(dataset.facets()).containsKey(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME); + DefaultKafkaDatasetFacet kafkaFacet = + (DefaultKafkaDatasetFacet) + dataset.facets().get(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME); - assertThat(lineageVertex.datasets().get(0).namespace()).isEqualTo("kafka://host1"); - assertThat(lineageVertex.datasets().get(0).name()).isEqualTo("topic1"); - - assertThat( - lineageVertex - .datasets() - .get(0) - .facets() - .get(DefaultKafkaDatasetFacet.KAFKA_FACET_NAME)) - .hasFieldOrPropertyWithValue("properties", kafkaProperties) - .hasFieldOrPropertyWithValue( - "topicIdentifier", - DefaultKafkaDatasetIdentifier.ofTopics( - Collections.singletonList("topic1"))); - - assertThat( - lineageVertex - .datasets() - .get(0) - .facets() - .get(DefaultTypeDatasetFacet.TYPE_FACET_NAME)) + assertThat(kafkaFacet.getProperties()).containsEntry("bootstrap.servers", "host1;host2"); + + assertThat(dataset.facets()).containsKey(DefaultTypeDatasetFacet.TYPE_FACET_NAME); + assertThat(dataset.facets().get(DefaultTypeDatasetFacet.TYPE_FACET_NAME)) .hasFieldOrPropertyWithValue("typeInformation", TypeInformation.of(String.class)); } + + private static class TestingKafkaSubscriber + implements KafkaSubscriber, KafkaDatasetIdentifierProvider { + @Override + public Optional getDatasetIdentifier() { + return Optional.of( + DefaultKafkaDatasetIdentifier.ofTopics(Collections.singletonList("topic1"))); + } + + @Override + public Set 
getSubscribedTopicPartitions(AdminClient adminClient) { + return null; + } + } }
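
To illustrate the subscriber-side contract described in the KafkaSubscriber javadoc and exercised by TestingKafkaSubscriber above, here is a minimal sketch of a custom subscriber that also implements KafkaDatasetIdentifierProvider. The class name FixedTopicSubscriber, the fixed topic, and the generic parameters are illustrative assumptions only; the hunks above render the signatures with their type parameters stripped.

import java.util.Collections;
import java.util.Optional;
import java.util.Set;

import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier;
import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider;
import org.apache.flink.connector.kafka.source.enumerator.subscriber.KafkaSubscriber;

import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.common.TopicPartition;

/** Sketch: a subscriber for a single fixed topic that also exposes that topic for lineage. */
public class FixedTopicSubscriber implements KafkaSubscriber, KafkaDatasetIdentifierProvider {

    private static final String TOPIC = "orders";

    @Override
    public Set<TopicPartition> getSubscribedTopicPartitions(AdminClient adminClient) {
        // A real subscriber would resolve the partition set through the admin client;
        // kept trivial here to keep the focus on the lineage side.
        return Collections.singleton(new TopicPartition(TOPIC, 0));
    }

    @Override
    public Optional<DefaultKafkaDatasetIdentifier> getDatasetIdentifier() {
        // Lets KafkaSource#getLineageVertex report "orders" as a source dataset.
        return Optional.of(
                DefaultKafkaDatasetIdentifier.ofTopics(Collections.singletonList(TOPIC)));
    }
}

As the KafkaSourceTest assertions above show, a KafkaSource built with such a subscriber exposes one lineage dataset per topic, named after the topic and namespaced from bootstrap.servers (e.g. kafka://host1), together with the Kafka properties facet and the type facet of the deserializer.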
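
On the sink side, the KafkaRecordSerializationSchemaBuilderTest changes exercise the same pattern through the topic selector. A comparable sketch follows, again with an assumed class name (LineageAwareTopicSelector) and assumed generics; the built schema then answers getKafkaDatasetFacet() as the tests above verify.

import java.util.Collections;
import java.util.Optional;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.lineage.DefaultKafkaDatasetIdentifier;
import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacet;
import org.apache.flink.connector.kafka.lineage.KafkaDatasetFacetProvider;
import org.apache.flink.connector.kafka.lineage.KafkaDatasetIdentifierProvider;
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
import org.apache.flink.connector.kafka.sink.TopicSelector;

/** Sketch: a topic selector that also exposes the topic it writes to for lineage. */
public class LineageAwareTopicSelector
        implements TopicSelector<String>, KafkaDatasetIdentifierProvider {

    private static final String TOPIC = "orders";

    @Override
    public String apply(String element) {
        // Every record goes to a single, fixed topic in this sketch.
        return TOPIC;
    }

    @Override
    public Optional<DefaultKafkaDatasetIdentifier> getDatasetIdentifier() {
        // Tells the sink which topics to report as lineage datasets.
        return Optional.of(
                DefaultKafkaDatasetIdentifier.ofTopics(Collections.singletonList(TOPIC)));
    }

    public static void main(String[] args) {
        KafkaRecordSerializationSchema<String> schema =
                KafkaRecordSerializationSchema.<String>builder()
                        .setTopicSelector(new LineageAwareTopicSelector())
                        .setValueSerializationSchema(new SimpleStringSchema())
                        .build();

        // The built schema also implements KafkaDatasetFacetProvider (and
        // TypeDatasetFacetProvider when the value schema is ResultTypeQueryable),
        // so the sink can query it for the facet derived from the topic selector.
        Optional<KafkaDatasetFacet> facet =
                ((KafkaDatasetFacetProvider) schema).getKafkaDatasetFacet();
        System.out.println(facet);
    }
}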