|
2 | 2 |
|
3 | 3 | import static com.linkedin.venice.ConfigKeys.KAFKA_CONFIG_PREFIX; |
4 | 4 | import static com.linkedin.venice.meta.Store.UNLIMITED_STORAGE_QUOTA; |
| 5 | +import static com.linkedin.venice.spark.SparkConstants.CHUNKED_KEY_SUFFIX_COLUMN_NAME; |
5 | 6 | import static com.linkedin.venice.spark.SparkConstants.KEY_COLUMN_NAME; |
| 7 | +import static com.linkedin.venice.spark.SparkConstants.MESSAGE_TYPE_COLUMN_NAME; |
| 8 | +import static com.linkedin.venice.spark.SparkConstants.OFFSET_COLUMN_NAME; |
6 | 9 | import static com.linkedin.venice.spark.SparkConstants.RMD_COLUMN_NAME; |
| 10 | +import static com.linkedin.venice.spark.SparkConstants.RMD_VERSION_ID_COLUMN_NAME; |
| 11 | +import static com.linkedin.venice.spark.SparkConstants.SCHEMA_ID_COLUMN_NAME; |
7 | 12 | import static com.linkedin.venice.spark.SparkConstants.SPARK_APP_NAME_CONFIG; |
8 | 13 | import static com.linkedin.venice.spark.SparkConstants.SPARK_DATA_WRITER_CONF_PREFIX; |
9 | 14 | import static com.linkedin.venice.spark.SparkConstants.SPARK_SESSION_CONF_PREFIX; |
10 | 15 | import static com.linkedin.venice.spark.SparkConstants.VALUE_COLUMN_NAME; |
11 | 16 | import static com.linkedin.venice.vpj.VenicePushJobConstants.DEFAULT_KEY_FIELD_PROP; |
12 | 17 | import static com.linkedin.venice.vpj.VenicePushJobConstants.DEFAULT_VALUE_FIELD_PROP; |
13 | 18 | import static org.apache.spark.sql.types.DataTypes.BinaryType; |
| 19 | +import static org.apache.spark.sql.types.DataTypes.IntegerType; |
| 20 | +import static org.apache.spark.sql.types.DataTypes.LongType; |
14 | 21 | import static org.apache.spark.sql.types.DataTypes.StringType; |
15 | 22 | import static org.mockito.ArgumentMatchers.eq; |
16 | 23 | import static org.mockito.Mockito.mock; |
@@ -129,6 +136,70 @@ public void testValidateDataFrameWithValidRmdType() { |
129 | 136 | dataWriterSparkJob.validateDataFrame(mockDataset); |
130 | 137 | } |
131 | 138 |
|
| 139 | + @Test |
| 140 | + public void testValidateDataFrameWithChunkedKifColumns() { |
| 141 | + PushJobSetting kafkaSetting = new PushJobSetting(); |
| 142 | + kafkaSetting.isSourceKafka = true; |
| 143 | + |
| 144 | + AbstractDataWriterSparkJob dataWriterSparkJob = spy(AbstractDataWriterSparkJob.class); |
| 145 | + when(dataWriterSparkJob.getPushJobSetting()).thenReturn(kafkaSetting); |
| 146 | + |
| 147 | + // Schema matching chunked KIF repush input: key, value, rmd + internal columns for chunk assembly |
| 148 | + StructType chunkedKifSchema = new StructType( |
| 149 | + new StructField[] { new StructField(KEY_COLUMN_NAME, BinaryType, false, Metadata.empty()), |
| 150 | + new StructField(VALUE_COLUMN_NAME, BinaryType, true, Metadata.empty()), |
| 151 | + new StructField(RMD_COLUMN_NAME, BinaryType, true, Metadata.empty()), |
| 152 | + new StructField(SCHEMA_ID_COLUMN_NAME, IntegerType, false, Metadata.empty()), |
| 153 | + new StructField(RMD_VERSION_ID_COLUMN_NAME, IntegerType, false, Metadata.empty()), |
| 154 | + new StructField(OFFSET_COLUMN_NAME, LongType, false, Metadata.empty()), |
| 155 | + new StructField(MESSAGE_TYPE_COLUMN_NAME, IntegerType, false, Metadata.empty()), |
| 156 | + new StructField(CHUNKED_KEY_SUFFIX_COLUMN_NAME, BinaryType, true, Metadata.empty()) }); |
| 157 | + |
| 158 | + Dataset<Row> mockDataset = mock(Dataset.class); |
| 159 | + when(mockDataset.schema()).thenReturn(chunkedKifSchema); |
| 160 | + dataWriterSparkJob.validateDataFrame(mockDataset); |
| 161 | + } |
| 162 | + |
| 163 | + @Test(expectedExceptions = VeniceInvalidInputException.class, expectedExceptionsMessageRegExp = ".*must not have fields that start with an underscore.*__schema_id__.*") |
| 164 | + public void testValidateDataFrameRejectsInternalColumnsForNonKifJob() { |
| 165 | + PushJobSetting hdfsSetting = new PushJobSetting(); |
| 166 | + hdfsSetting.isSourceKafka = false; |
| 167 | + |
| 168 | + AbstractDataWriterSparkJob dataWriterSparkJob = spy(AbstractDataWriterSparkJob.class); |
| 169 | + when(dataWriterSparkJob.getPushJobSetting()).thenReturn(hdfsSetting); |
| 170 | + |
| 171 | + // Same chunked KIF schema but on a non-KIF job — should be rejected |
| 172 | + StructType chunkedKifSchema = new StructType( |
| 173 | + new StructField[] { new StructField(KEY_COLUMN_NAME, BinaryType, false, Metadata.empty()), |
| 174 | + new StructField(VALUE_COLUMN_NAME, BinaryType, true, Metadata.empty()), |
| 175 | + new StructField(RMD_COLUMN_NAME, BinaryType, true, Metadata.empty()), |
| 176 | + new StructField(SCHEMA_ID_COLUMN_NAME, IntegerType, false, Metadata.empty()) }); |
| 177 | + |
| 178 | + Dataset<Row> mockDataset = mock(Dataset.class); |
| 179 | + when(mockDataset.schema()).thenReturn(chunkedKifSchema); |
| 180 | + dataWriterSparkJob.validateDataFrame(mockDataset); |
| 181 | + } |
| 182 | + |
| 183 | + @Test(expectedExceptions = VeniceInvalidInputException.class, expectedExceptionsMessageRegExp = ".*must not have fields that start with an underscore.*_unknown_internal.*") |
| 184 | + public void testValidateDataFrameRejectsUnknownUnderscoreColumnsForKifJob() { |
| 185 | + PushJobSetting kafkaSetting = new PushJobSetting(); |
| 186 | + kafkaSetting.isSourceKafka = true; |
| 187 | + |
| 188 | + AbstractDataWriterSparkJob dataWriterSparkJob = spy(AbstractDataWriterSparkJob.class); |
| 189 | + when(dataWriterSparkJob.getPushJobSetting()).thenReturn(kafkaSetting); |
| 190 | + |
| 191 | + // KIF job but with an unknown underscore column — should still be rejected |
| 192 | + StructType schemaWithUnknownInternalCol = new StructType( |
| 193 | + new StructField[] { new StructField(KEY_COLUMN_NAME, BinaryType, false, Metadata.empty()), |
| 194 | + new StructField(VALUE_COLUMN_NAME, BinaryType, true, Metadata.empty()), |
| 195 | + new StructField(RMD_COLUMN_NAME, BinaryType, true, Metadata.empty()), |
| 196 | + new StructField("_unknown_internal", StringType, true, Metadata.empty()) }); |
| 197 | + |
| 198 | + Dataset<Row> mockDataset = mock(Dataset.class); |
| 199 | + when(mockDataset.schema()).thenReturn(schemaWithUnknownInternalCol); |
| 200 | + dataWriterSparkJob.validateDataFrame(mockDataset); |
| 201 | + } |
| 202 | + |
132 | 203 | @Test |
133 | 204 | public void testValidateDataFrameSchema() throws IOException { |
134 | 205 | File inputDir = TestWriteUtils.getTempDataDirectory(); |
|
0 commit comments