diff --git a/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java b/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java index 180c6e70d6b6..28e93f48777e 100644 --- a/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java +++ b/itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.io.OutputStream; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; @@ -296,7 +297,7 @@ public void setInputFile(String queryFile) throws IOException { } public void setInputFile(File qf) throws IOException { - String query = FileUtils.readFileToString(qf); + String query = FileUtils.readFileToString(qf, StandardCharsets.UTF_8); inputFile = qf; inputContent = query; qTestResultProcessor.init(query); @@ -518,7 +519,7 @@ public void cleanUp() throws Exception { private void cleanupFromFile() throws IOException { File cleanupFile = new File(cleanupScript); if (cleanupFile.isFile()) { - String cleanupCommands = FileUtils.readFileToString(cleanupFile); + String cleanupCommands = FileUtils.readFileToString(cleanupFile, StandardCharsets.UTF_8); LOG.info("Cleanup (" + cleanupScript + "):\n" + cleanupCommands); try { @@ -553,7 +554,7 @@ private void initFromScript() throws IOException { return; } - String initCommands = FileUtils.readFileToString(scriptFile); + String initCommands = FileUtils.readFileToString(scriptFile, StandardCharsets.UTF_8); LOG.info("Initial setup (" + initScript + "):\n" + initCommands); try { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/ConstantVectorExpression.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/ConstantVectorExpression.java index 3dca014d2f69..ff295becbc14 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/ConstantVectorExpression.java +++ 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/ConstantVectorExpression.java @@ -92,13 +92,13 @@ public ConstantVectorExpression(int outputColumnNum, byte[] value, TypeInfo outp public ConstantVectorExpression(int outputColumnNum, HiveChar value, TypeInfo outputTypeInfo) throws HiveException { this(outputColumnNum, outputTypeInfo); - setBytesValue(value.getStrippedValue().getBytes()); + setBytesValue(value.getStrippedValue().getBytes(StandardCharsets.UTF_8)); } public ConstantVectorExpression(int outputColumnNum, HiveVarchar value, TypeInfo outputTypeInfo) throws HiveException { this(outputColumnNum, outputTypeInfo); - setBytesValue(value.getValue().getBytes()); + setBytesValue(value.getValue().getBytes(StandardCharsets.UTF_8)); } // Include type name for precision/scale. diff --git a/ql/src/test/queries/clientpositive/chinese_utf8_characters.q b/ql/src/test/queries/clientpositive/chinese_utf8_characters.q index aa9286b82110..150fb068c881 100644 --- a/ql/src/test/queries/clientpositive/chinese_utf8_characters.q +++ b/ql/src/test/queries/clientpositive/chinese_utf8_characters.q @@ -1,29 +1,50 @@ -CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string); -INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀'); +-- This test case checks the behavior of Chinese characters with different serdes and scenarios. +-- Note that it still passes without the corresponding fixes (HIVE-28544); the incorrect behavior only shows up +-- if the qtest is run while mimicking a different default charset, e.g. US-ASCII, +-- which can be achieved by adding this to the command line while running the qtest: +-- -Dmaven.test.jvm.args="-Dfile.encoding=US-ASCII" -CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) +CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b varchar(100), c char(100), d string); +INSERT INTO tbl_chinese_chars values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3'); + +CREATE EXTERNAL TABLE 
tbl_chinese_chars_multidelimitserde (a int, b varchar(100), c char(100), d string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') STORED AS TEXTFILE; -INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2'); +INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3'); + +CREATE EXTERNAL TABLE tbl_chinese_chars_orc (a int, b varchar(100), c char(100), d string) +STORED AS ORC; +INSERT INTO TABLE tbl_chinese_chars_orc values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3'); set hive.fetch.task.conversion=more; -EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'; -SELECT * FROM default.tbl_chinese_chars where b='北京'; +EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; +SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; set hive.fetch.task.conversion=none; -EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'; -SELECT * FROM default.tbl_chinese_chars where b='北京'; +EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; +SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; set hive.fetch.task.conversion=more; SELECT * FROM default.tbl_chinese_chars_multidelimitserde; -EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; -SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; +EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; +SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; set hive.fetch.task.conversion=none; SELECT * FROM default.tbl_chinese_chars_multidelimitserde; -EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; -SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; +EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; +SELECT 
* FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; + + +set hive.fetch.task.conversion=more; +SELECT * FROM default.tbl_chinese_chars_orc; +EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; +SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; + +set hive.fetch.task.conversion=none; +SELECT * FROM default.tbl_chinese_chars_orc; +EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; +SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; \ No newline at end of file diff --git a/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out b/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out index 911d1802cc46..a22dd5848412 100644 --- a/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out +++ b/ql/src/test/results/clientpositive/llap/chinese_utf8_characters.q.out @@ -1,51 +1,76 @@ -PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string) +PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b varchar(100), c char(100), d string) PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@tbl_chinese_chars -POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string) +POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b varchar(100), c char(100), d string) POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@tbl_chinese_chars -PREHOOK: query: INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀') +PREHOOK: query: INSERT INTO tbl_chinese_chars values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tbl_chinese_chars -POSTHOOK: query: INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀') +POSTHOOK: query: INSERT INTO tbl_chinese_chars values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3') 
POSTHOOK: type: QUERY POSTHOOK: Input: _dummy_database@_dummy_table POSTHOOK: Output: default@tbl_chinese_chars POSTHOOK: Lineage: tbl_chinese_chars.a SCRIPT [] POSTHOOK: Lineage: tbl_chinese_chars.b SCRIPT [] POSTHOOK: Lineage: tbl_chinese_chars.c SCRIPT [] -PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) +POSTHOOK: Lineage: tbl_chinese_chars.d SCRIPT [] +PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (a int, b varchar(100), c char(100), d string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') STORED AS TEXTFILE PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@tbl_chinese_chars_multidelimitserde -POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) +POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (a int, b varchar(100), c char(100), d string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') STORED AS TEXTFILE POSTHOOK: type: CREATETABLE POSTHOOK: Output: database:default POSTHOOK: Output: default@tbl_chinese_chars_multidelimitserde -PREHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2') +PREHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3') PREHOOK: type: QUERY PREHOOK: Input: _dummy_database@_dummy_table PREHOOK: Output: default@tbl_chinese_chars_multidelimitserde -POSTHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2') +POSTHOOK: query: INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3') POSTHOOK: type: QUERY POSTHOOK: Input: 
_dummy_database@_dummy_table POSTHOOK: Output: default@tbl_chinese_chars_multidelimitserde -POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.col1 SCRIPT [] -POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.col2 SCRIPT [] -PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京' +POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.a SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.b SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.c SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_multidelimitserde.d SCRIPT [] +PREHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_orc (a int, b varchar(100), c char(100), d string) +STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tbl_chinese_chars_orc +POSTHOOK: query: CREATE EXTERNAL TABLE tbl_chinese_chars_orc (a int, b varchar(100), c char(100), d string) +STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tbl_chinese_chars_orc +PREHOOK: query: INSERT INTO TABLE tbl_chinese_chars_orc values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@tbl_chinese_chars_orc +POSTHOOK: query: INSERT INTO TABLE tbl_chinese_chars_orc values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@tbl_chinese_chars_orc +POSTHOOK: Lineage: tbl_chinese_chars_orc.a SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_orc.b SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_orc.c SCRIPT [] +POSTHOOK: Lineage: tbl_chinese_chars_orc.d SCRIPT [] +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### -POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars 
where b='北京' +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### @@ -59,28 +84,28 @@ STAGE PLANS: Processor Tree: TableScan alias: tbl_chinese_chars - filterExpr: (b = '北京') (type: boolean) + filterExpr: (b = '上海1_1') (type: boolean) Filter Operator - predicate: (b = '北京') (type: boolean) + predicate: (b = '上海1_1') (type: boolean) Select Operator - expressions: a (type: int), '北京' (type: string), c (type: string) - outputColumnNames: _col0, _col1, _col2 + expressions: a (type: int), '上海1_1' (type: varchar(100)), c (type: char(100)), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 ListSink -PREHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='北京' +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### -POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='北京' +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### -2 北京 海淀 -PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京' +1 上海1_1 徐汇1_2 徐上1_3 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### -POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京' +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### @@ -97,18 +122,18 @@ STAGE PLANS: Map Operator Tree: TableScan alias: tbl_chinese_chars - filterExpr: (b = '北京') (type: boolean) - Statistics: Num rows: 2 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + 
filterExpr: (b = '上海1_1') (type: boolean) + Statistics: Num rows: 2 Data size: 542 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (b = '北京') (type: boolean) - Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + predicate: (b = '上海1_1') (type: boolean) + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: a (type: int), '北京' (type: string), c (type: string) - outputColumnNames: _col0, _col1, _col2 - Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + expressions: a (type: int), '上海1_1' (type: varchar(100)), c (type: char(100)), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -122,15 +147,15 @@ STAGE PLANS: Processor Tree: ListSink -PREHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='北京' +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### -POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='北京' +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars where b='上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars #### A masked pattern was here #### -2 北京 海淀 +1 上海1_1 徐汇1_2 徐上1_3 PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde @@ -139,12 +164,13 @@ POSTHOOK: query: SELECT * FROM 
default.tbl_chinese_chars_multidelimitserde POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -测试1 测试2 -PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +1 上海1_1 徐汇1_2 徐上1_3 +2 北京2_1 海淀2_2 徐上2_3 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### @@ -158,23 +184,23 @@ STAGE PLANS: Processor Tree: TableScan alias: tbl_chinese_chars_multidelimitserde - filterExpr: (col1 = '测试1') (type: boolean) + filterExpr: (b = '上海1_1') (type: boolean) Filter Operator - predicate: (col1 = '测试1') (type: boolean) + predicate: (b = '上海1_1') (type: boolean) Select Operator - expressions: '测试1' (type: varchar(100)), col2 (type: varchar(100)) - outputColumnNames: _col0, _col1 + expressions: a (type: int), '上海1_1' (type: varchar(100)), c (type: char(100)), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 ListSink -PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: 
default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -测试1 测试2 +1 上海1_1 徐汇1_2 徐上1_3 PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde @@ -183,12 +209,13 @@ POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -测试1 测试2 -PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +1 上海1_1 徐汇1_2 徐上1_3 +2 北京2_1 海淀2_2 徐上2_3 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### @@ -205,18 +232,18 @@ STAGE PLANS: Map Operator Tree: TableScan alias: tbl_chinese_chars_multidelimitserde - filterExpr: (col1 = '测试1') (type: boolean) - Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + filterExpr: (b = '上海1_1') (type: boolean) + Statistics: Num rows: 2 Data size: 542 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator - predicate: (col1 = '测试1') (type: boolean) - Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + predicate: (b = '上海1_1') (type: boolean) + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: '测试1' (type: varchar(100)), col2 (type: varchar(100)) - outputColumnNames: _col0, _col1 - Statistics: Num rows: 1 Data size: 174 Basic stats: 
COMPLETE Column stats: COMPLETE + expressions: a (type: int), '上海1_1' (type: varchar(100)), c (type: char(100)), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 174 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -230,12 +257,122 @@ STAGE PLANS: Processor Tree: ListSink -PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' PREHOOK: type: QUERY PREHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1' +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1' POSTHOOK: type: QUERY POSTHOOK: Input: default@tbl_chinese_chars_multidelimitserde #### A masked pattern was here #### -测试1 测试2 +1 上海1_1 徐汇1_2 徐上1_3 +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +1 上海1_1 徐汇1_2 徐上1_3 +2 北京2_1 海淀2_2 徐上2_3 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +POSTHOOK: type: 
QUERY +POSTHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: tbl_chinese_chars_orc + filterExpr: (b = '上海1_1') (type: boolean) + Filter Operator + predicate: (b = '上海1_1') (type: boolean) + Select Operator + expressions: a (type: int), '上海1_1' (type: varchar(100)), c (type: char(100)), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + ListSink + +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +1 上海1_1 徐汇1_2 徐上1_3 +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +1 上海1_1 徐汇1_2 徐上1_3 +2 北京2_1 海淀2_2 徐上2_3 +PREHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tbl_chinese_chars_orc + filterExpr: (b = '上海1_1') (type: boolean) + 
Statistics: Num rows: 2 Data size: 542 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (b = '上海1_1') (type: boolean) + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: a (type: int), '上海1_1' (type: varchar(100)), c (type: char(100)), d (type: string) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 271 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +PREHOOK: type: QUERY +PREHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +POSTHOOK: query: SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tbl_chinese_chars_orc +#### A masked pattern was here #### +1 上海1_1 徐汇1_2 徐上1_3