forked from apache/hive
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
HIVE-28544: Ensure using UTF-8 encoding in some String/Char/Varchar related operations (apache#5479) (Laszlo Bodor reviewed by Simhadri Govindappa)
- Loading branch information
1 parent
6e261a3
commit 799b5cf
Showing
4 changed files
with
232 additions
and
73 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
45 changes: 33 additions & 12 deletions
45
ql/src/test/queries/clientpositive/chinese_utf8_characters.q
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,50 @@ | ||
CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b string, c string); | ||
INSERT INTO tbl_chinese_chars values(1,'上海','徐汇'),(2,'北京','海淀'); | ||
-- This test case checks the behavior of Mandarin characters with different SerDes and scenarios. | ||
-- Although it still passes without the corresponding fixes (HIVE-28544), the behavior is not correct | ||
-- if the qtest is run while mimicking a different default charset, such as US-ASCII. | ||
-- This can be achieved by adding the following to the command line when running the qtest: | ||
-- -Dmaven.test.jvm.args="-Dfile.encoding=US-ASCII" | ||
|
||
CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (col1 varchar(100), col2 varchar(100)) | ||
CREATE EXTERNAL TABLE tbl_chinese_chars(a int, b varchar(100), c char(100), d string); | ||
INSERT INTO tbl_chinese_chars values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3'); | ||
|
||
CREATE EXTERNAL TABLE tbl_chinese_chars_multidelimitserde (a int, b varchar(100), c char(100), d string) | ||
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe' | ||
WITH SERDEPROPERTIES ('field.delim'='|~|', 'serialization.encoding'='UTF-8') | ||
STORED AS TEXTFILE; | ||
INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values('测试1','测试2'); | ||
INSERT INTO TABLE tbl_chinese_chars_multidelimitserde values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3'); | ||
|
||
CREATE EXTERNAL TABLE tbl_chinese_chars_orc (a int, b varchar(100), c char(100), d string) | ||
STORED AS ORC; | ||
INSERT INTO TABLE tbl_chinese_chars_orc values(1,'上海1_1','徐汇1_2', '徐上1_3'),(2,'北京2_1','海淀2_2', '徐上2_3'); | ||
|
||
|
||
set hive.fetch.task.conversion=more; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'; | ||
SELECT * FROM default.tbl_chinese_chars where b='北京'; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; | ||
SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; | ||
|
||
set hive.fetch.task.conversion=none; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='北京'; | ||
SELECT * FROM default.tbl_chinese_chars where b='北京'; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; | ||
SELECT * FROM default.tbl_chinese_chars where b='上海1_1'; | ||
|
||
|
||
set hive.fetch.task.conversion=more; | ||
SELECT * FROM default.tbl_chinese_chars_multidelimitserde; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; | ||
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; | ||
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; | ||
|
||
|
||
set hive.fetch.task.conversion=none; | ||
SELECT * FROM default.tbl_chinese_chars_multidelimitserde; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; | ||
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where col1 = '测试1'; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; | ||
SELECT * FROM default.tbl_chinese_chars_multidelimitserde where b = '上海1_1'; | ||
|
||
|
||
set hive.fetch.task.conversion=more; | ||
SELECT * FROM default.tbl_chinese_chars_orc; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; | ||
SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; | ||
|
||
set hive.fetch.task.conversion=none; | ||
SELECT * FROM default.tbl_chinese_chars_orc; | ||
EXPLAIN SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; | ||
SELECT * FROM default.tbl_chinese_chars_orc where b = '上海1_1'; |
Oops, something went wrong.