Skip to content

Commit

Permalink
HIVE-28544: Ensure using UTF-8 encoding in some String/Char/Varchar related operations (apache#5479) (Laszlo Bodor reviewed by Simhadri Govindappa)
Browse files Browse the repository at this point in the history
  • Loading branch information
abstractdog authored Oct 2, 2024
1 parent 6e261a3 commit 799b5cf
Show file tree
Hide file tree
Showing 4 changed files with 232 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -296,7 +297,7 @@ public void setInputFile(String queryFile) throws IOException {
}

public void setInputFile(File qf) throws IOException {
String query = FileUtils.readFileToString(qf);
String query = FileUtils.readFileToString(qf, StandardCharsets.UTF_8);
inputFile = qf;
inputContent = query;
qTestResultProcessor.init(query);
Expand Down Expand Up @@ -518,7 +519,7 @@ public void cleanUp() throws Exception {
private void cleanupFromFile() throws IOException {
File cleanupFile = new File(cleanupScript);
if (cleanupFile.isFile()) {
String cleanupCommands = FileUtils.readFileToString(cleanupFile);
String cleanupCommands = FileUtils.readFileToString(cleanupFile, StandardCharsets.UTF_8);
LOG.info("Cleanup (" + cleanupScript + "):\n" + cleanupCommands);

try {
Expand Down Expand Up @@ -553,7 +554,7 @@ private void initFromScript() throws IOException {
return;
}

String initCommands = FileUtils.readFileToString(scriptFile);
String initCommands = FileUtils.readFileToString(scriptFile, StandardCharsets.UTF_8);
LOG.info("Initial setup (" + initScript + "):\n" + initCommands);

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ public ConstantVectorExpression(int outputColumnNum, byte[] value, TypeInfo outp
/**
 * Builds a constant expression from a {@link HiveChar} value: delegates to the
 * (outputColumnNum, outputTypeInfo) constructor, then hands the bytes of the value's
 * stripped string to setBytesValue.
 *
 * NOTE(review): this text is a rendered diff without +/- markers, so the two
 * setBytesValue lines below are presumably the removed and the added version of a single
 * statement rather than two consecutive calls -- confirm against the actual file.
 *
 * @param outputColumnNum passed through to the delegated constructor
 * @param value the char constant whose stripped value is encoded to bytes
 * @param outputTypeInfo passed through to the delegated constructor
 * @throws HiveException declared by this constructor; the throwing call is not visible here
 */
public ConstantVectorExpression(int outputColumnNum, HiveChar value, TypeInfo outputTypeInfo)
throws HiveException {
this(outputColumnNum, outputTypeInfo);
// Presumably the removed line: getBytes() with no argument encodes using the JVM's
// default charset, so the resulting bytes depend on the runtime's file.encoding.
setBytesValue(value.getStrippedValue().getBytes());
// Presumably the added line: the encoding is pinned to UTF-8 regardless of the default.
setBytesValue(value.getStrippedValue().getBytes(StandardCharsets.UTF_8));
}

/**
 * Builds a constant expression from a {@link HiveVarchar} value: delegates to the
 * (outputColumnNum, outputTypeInfo) constructor, then hands the bytes of the value's
 * string to setBytesValue.
 *
 * NOTE(review): this text is a rendered diff without +/- markers, so the two
 * setBytesValue lines below are presumably the removed and the added version of a single
 * statement rather than two consecutive calls -- confirm against the actual file.
 *
 * @param outputColumnNum passed through to the delegated constructor
 * @param value the varchar constant whose string value is encoded to bytes
 * @param outputTypeInfo passed through to the delegated constructor
 * @throws HiveException declared by this constructor; the throwing call is not visible here
 */
public ConstantVectorExpression(int outputColumnNum, HiveVarchar value, TypeInfo outputTypeInfo)
throws HiveException {
this(outputColumnNum, outputTypeInfo);
// Presumably the removed line: getBytes() with no argument encodes using the JVM's
// default charset, so the resulting bytes depend on the runtime's file.encoding.
setBytesValue(value.getValue().getBytes());
// Presumably the added line: the encoding is pinned to UTF-8 regardless of the default.
setBytesValue(value.getValue().getBytes(StandardCharsets.UTF_8));
}

// Include type name for precision/scale.
Expand Down
45 changes: 33 additions & 12 deletions ql/src/test/queries/clientpositive/chinese_utf8_characters.q
Original file line number Diff line number Diff line change
@@ -1,29 +1,50 @@
CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string);
INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀');
-- This test case checks the behavior of Mandarin characters with different SerDes and scenarios.
-- Although it still passes without the corresponding fixes (HIVE-28544), the behavior is not
-- correct when the qtest is run while mimicking a different default charset, such as US-ASCII.
-- That can be achieved by adding this to the command line when running the qtest:
-- -Dmaven.test.jvm.args="-Dfile.encoding=US-ASCII"

CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100))
CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b varchar(100), c char(100), d string);
INSERT INTO tbl_chinese_chars values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3');

CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (a int, b varchar(100), c char(100), d string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8')
STORED AS TEXTFILE;
INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2');
INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3');

CREATE EXTERNAL TABLE tbl_chinese_chars_orc (a int, b varchar(100), c char(100), d string)
STORED AS ORC;
INSERT INTO TABLE tbl_chinese_chars_orc values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3');


set hive.fetch.task.conversion=more;
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京';
SELECT * FROM default.tbl_chinese_chars where b='北京';
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1';
SELECT * FROM default.tbl_chinese_chars where b='上海1_1';

set hive.fetch.task.conversion=none;
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京';
SELECT * FROM default.tbl_chinese_chars where b='北京';
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1';
SELECT * FROM default.tbl_chinese_chars where b='上海1_1';


set hive.fetch.task.conversion=more;
SELECT * FROM default.tbl_chinese_chars_multidelimitserde;
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1';
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1';


set hive.fetch.task.conversion=none;
SELECT * FROM default.tbl_chinese_chars_multidelimitserde;
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1';
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1';
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1';


set hive.fetch.task.conversion=more;
SELECT * FROM default.tbl_chinese_chars_orc;
EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1';
SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1';

set hive.fetch.task.conversion=none;
SELECT * FROM default.tbl_chinese_chars_orc;
EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1';
SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1';
Loading

0 comments on commit 799b5cf

Please sign in to comment.