Commit db8dade

[FLINK-35825][hive] HiveTableSource supports report statistics for text file
1 parent: b95c118

File tree: 3 files changed (+212 −3)

Diff for: flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveTableSource.java (+7 −3)
@@ -34,6 +34,7 @@
 import org.apache.flink.connectors.hive.read.HivePartitionFetcherContextBase;
 import org.apache.flink.connectors.hive.read.HiveSourceSplit;
 import org.apache.flink.connectors.hive.util.HivePartitionUtils;
+import org.apache.flink.connectors.hive.util.TextFormatStatisticsReportUtil;
 import org.apache.flink.core.fs.Path;
 import org.apache.flink.formats.parquet.utils.ParquetFormatStatisticsReportUtil;
 import org.apache.flink.orc.util.OrcFormatStatisticsReportUtil;
@@ -379,7 +380,7 @@ private TableStats getMapRedInputFormatStatistics(
         Preconditions.checkArgument(
                 statisticsThreadNum >= 1,
                 TABLE_EXEC_HIVE_READ_STATISTICS_THREAD_NUM.key() + " cannot be less than 1");
-        // Now we only support Parquet, Orc formats.
+        // Now we only support Parquet, Orc and Text formats.
         if (serializationLib.contains("parquet")) {
             return ParquetFormatStatisticsReportUtil.getTableStatistics(
                     files,
@@ -390,10 +391,13 @@ private TableStats getMapRedInputFormatStatistics(
         } else if (serializationLib.contains("orc")) {
             return OrcFormatStatisticsReportUtil.getTableStatistics(
                     files, producedDataType, jobConf, statisticsThreadNum);
+        } else if (serializationLib.contains("simple")) {
+            return TextFormatStatisticsReportUtil.estimateTableStatistics(
+                    files, producedDataType, jobConf);
         } else {
-            // Now, only support Orc and Parquet Formats.
+            // Now, only support Orc, Parquet and Text formats.
             LOG.info(
-                    "Now for hive table source, reporting statistics only support Orc and Parquet formats.");
+                    "Now for hive table source, reporting statistics only supports Orc, Parquet and Text formats.");
             return TableStats.UNKNOWN;
         }
     }
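A note on the dispatch above: serializationLib holds the fully qualified class name of the table's Hive serde, so the substring checks key off the default serde per storage format — ParquetHiveSerDe for Parquet, OrcSerde for ORC, and LazySimpleSerDe for plain text tables, which is what contains("simple") matches. A minimal sketch of the same dispatch; formatFor and its return values are illustrative assumptions, not part of the commit:

// Hedged sketch of the serde-name dispatch; formatFor is a hypothetical helper.
static String formatFor(String serializationLib) {
    if (serializationLib.contains("parquet")) {
        // e.g. org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
        return "parquet";
    } else if (serializationLib.contains("orc")) {
        // e.g. org.apache.hadoop.hive.ql.io.orc.OrcSerde
        return "orc";
    } else if (serializationLib.contains("simple")) {
        // e.g. org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe (text tables)
        return "text";
    } else {
        return "unknown"; // statistics reporting falls back to TableStats.UNKNOWN
    }
}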
Diff for: flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/util/TextFormatStatisticsReportUtil.java (+109, new file)
@@ -0,0 +1,109 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.connectors.hive.util;

import org.apache.flink.core.fs.Path;
import org.apache.flink.table.plan.stats.TableStats;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.ArrayType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.LogicalTypeRoot;
import org.apache.flink.table.types.logical.MapType;
import org.apache.flink.table.types.logical.RowType;

import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;

/** Utils for text format statistics report. */
public class TextFormatStatisticsReportUtil {
    private static final Logger LOG = LoggerFactory.getLogger(TextFormatStatisticsReportUtil.class);

    public static TableStats estimateTableStatistics(
            List<Path> files, DataType producedDataType, Configuration hadoopConfig) {
        try {
            long rowCount;
            RowType rowType = (RowType) producedDataType.getLogicalType();
            double totalFileSize = 0.0;
            for (Path file : files) {
                totalFileSize += getTextFileSize(hadoopConfig, file);
            }
            rowCount = (long) (totalFileSize / estimateRowSize(rowType));
            return new TableStats(rowCount);
        } catch (Exception e) {
            LOG.warn("Estimating statistics failed for text format: {}", e.getMessage());
            return TableStats.UNKNOWN;
        }
    }

    private static int estimateRowSize(RowType rowType) {
        int rowSize = 0;
        for (int index = 0; index < rowType.getFieldCount(); ++index) {
            LogicalType logicalType = rowType.getTypeAt(index);
            rowSize += getAverageTypeValueSize(logicalType);
        }
        return rowSize;
    }

    /** Estimation rules based on Hive field types. */
    private static double getAverageTypeValueSize(LogicalType logicalType) {
        LogicalTypeRoot typeRoot = logicalType.getTypeRoot();
        switch (typeRoot) {
            case CHAR:
            case TINYINT:
                return 1;
            case VARCHAR:
            case DATE:
            case TIMESTAMP_WITHOUT_TIME_ZONE:
            case DECIMAL:
                return 12;
            case SMALLINT:
                return 2;
            case INTEGER:
            case FLOAT:
            case INTERVAL_DAY_TIME:
                return 4;
            case BIGINT:
            case DOUBLE:
            case INTERVAL_YEAR_MONTH:
                return 8;
            case VARBINARY:
                return 16;
            case ARRAY:
                return getAverageTypeValueSize(((ArrayType) logicalType).getElementType()) * 16;
            case MAP:
                return (getAverageTypeValueSize(((MapType) logicalType).getKeyType())
                                + getAverageTypeValueSize(((MapType) logicalType).getValueType()))
                        * 16;
            case ROW:
                return estimateRowSize((RowType) logicalType);
            default:
                // For unknown data types, we use a smaller data size for estimation.
                return 8;
        }
    }

    private static long getTextFileSize(Configuration hadoopConfig, Path file) throws IOException {
        org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(file.toUri());
        return hadoopPath.getFileSystem(hadoopConfig).getContentSummary(hadoopPath).getLength();
    }
}
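To make the heuristic concrete: the estimated row count is simply the total file size divided by the per-row byte estimate from the type weights above. A hedged sketch of that arithmetic, using a hypothetical schema (the weights mirror getAverageTypeValueSize, which is private and cannot be called directly):

// Hedged sketch: the row-size arithmetic behind estimateTableStatistics,
// for a hypothetical schema id BIGINT (8) + name VARCHAR (12) + score DOUBLE (8).
import org.apache.flink.table.types.logical.BigIntType;
import org.apache.flink.table.types.logical.DoubleType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;

class RowSizeSketch {
    public static void main(String[] args) {
        RowType rowType =
                RowType.of(
                        new LogicalType[] {new BigIntType(), new VarCharType(), new DoubleType()},
                        new String[] {"id", "name", "score"});
        double estimatedRowSize = 8 + 12 + 8; // 28 bytes per row under the weights above
        long rowCount = (long) (280 / estimatedRowSize); // a 280-byte text file -> 10 rows
        System.out.println(rowType + " -> estimated rows: " + rowCount);
    }
}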
Diff for: flink-connectors/flink-connector-hive/src/test/java/org/apache/flink/connectors/hive/util/TextFormatStatisticsReportUtilTest.java (+96, new file)
@@ -0,0 +1,96 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.connectors.hive.util;

import org.apache.flink.core.fs.Path;
import org.apache.flink.table.plan.stats.TableStats;
import org.apache.flink.table.types.AtomicDataType;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.flink.testutils.junit.utils.TempDirUtils;

import org.apache.hadoop.conf.Configuration;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;

import static java.nio.file.StandardOpenOption.APPEND;
import static org.junit.jupiter.api.Assertions.assertEquals;

/** Tests for {@link TextFormatStatisticsReportUtil}. */
class TextFormatStatisticsReportUtilTest {
    private Configuration hadoopConfig;
    private DataType producedDataType;

    @TempDir private java.nio.file.Path temporaryFolder;

    @BeforeEach
    void setUp() {
        hadoopConfig = new Configuration();
        // Create a sample producedDataType with a RowType
        List<RowType.RowField> fields = new ArrayList<>();
        fields.add(new RowType.RowField("field1", new VarCharType()));
        fields.add(new RowType.RowField("field2", new VarCharType()));
        fields.add(new RowType.RowField("field3", new VarCharType()));
        producedDataType = new AtomicDataType(new RowType(fields));
    }

    @Test
    void testEstimateTableStatisticsCase1() throws IOException {
        // Create sample files for testing
        File tempFile = TempDirUtils.newFile(temporaryFolder, "flink_test_file.txt");

        List<Path> files = new ArrayList<>();
        files.add(new Path(tempFile.toURI()));

        String sampleString = "sample data";
        Files.write(tempFile.toPath(), sampleString.getBytes());
        TableStats stats =
                TextFormatStatisticsReportUtil.estimateTableStatistics(
                        files, producedDataType, hadoopConfig);
        assertEquals(0, stats.getRowCount());
        for (int i = 0; i < 10; ++i) {
            Files.write(tempFile.toPath(), sampleString.getBytes(), APPEND);
        }
        stats =
                TextFormatStatisticsReportUtil.estimateTableStatistics(
                        files, producedDataType, hadoopConfig);
        assertEquals(3, stats.getRowCount());
    }

    @Test
    void testEstimateFailedToUnknown() {
        List<Path> files = new ArrayList<>();
        files.add(new Path(URI.create("file:///non_existent_file.txt")));
        // Estimate table statistics
        TableStats stats =
                TextFormatStatisticsReportUtil.estimateTableStatistics(
                        files, producedDataType, hadoopConfig);
        // Verify that it returns TableStats.UNKNOWN on failure
        assertEquals(TableStats.UNKNOWN, stats);
    }
}
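For reference, the expected counts in testEstimateTableStatisticsCase1 follow directly from the heuristic above: three VarCharType fields estimate to 3 × 12 = 36 bytes per row; the initial 11-byte file ("sample data") yields (long) (11 / 36) = 0 rows, and after ten more 11-byte appends the 121-byte file yields (long) (121 / 36) = 3 rows.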
