Skip to content

Commit 40b7d5f

Browse files
authored
Refactor SQL expression cleaning to preserve charset introducers for functional index compatibility (#599)
1 parent e4d29f2 commit 40b7d5f

2 files changed

Lines changed: 14 additions & 12 deletions

File tree

dao-impl/ebean-dao/src/main/java/com/linkedin/metadata/dao/utils/SchemaValidatorUtil.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -119,22 +119,24 @@ public boolean indexExists(@Nonnull String tableName, @Nonnull String indexName)
119119

120120

121121
/**
122-
* Cleans SQL expression by removing MySQL-specific encoding artifacts that otherwise result in unrecognized syntax.
123-
* Removes _utf8mb4 and _utf8mb3 charset prefix, unescapes quotes, and removes newlines.
124-
* MySQL team is the POC for questions about this since there is preprocessing needed to transform the as-is
122+
* Cleans a SQL expression by unescaping quotes and removing newlines while preserving the original structure.
123+
* IMPORTANT: Preserves charset introducers (e.g., _utf8mb4) exactly as they appear in the index definition
124+
* to ensure byte-for-byte matching with functional index expressions. MySQL requires exact expression matching
125+
* for functional indexes - any mismatch (even just a missing charset introducer) causes MySQL to ignore the
126+
* index and perform a full table scan instead.
127+
*
128+
* <p>MySQL team is the POC for questions about this since there is preprocessing needed to transform the as-is
125129
* index expression from the index table to a (string) expression that is usable directly in an indexed query.
126130
*
127-
* @param expression Raw SQL expression from database
128-
* @return Cleaned expression string, with enclosing parentheses
131+
* @param expression Raw SQL expression from database (e.g., _utf8mb4\'$.aspect.model_group_urn\')
132+
* @return Cleaned expression string with unescaped quotes and enclosing parentheses (e.g., (_utf8mb4'$.aspect.model_group_urn')), or null if the input expression is null
129133
*/
130134
public static String cleanIndexExpression(@Nullable String expression) {
131135
if (expression == null) {
132136
return null;
133137
}
134138

135139
return "(" + expression
136-
.replace("_utf8mb4\\'", "'")
137-
.replace("_utf8mb3\\'", "'")
138140
.replace("\\'", "'")
139141
.replace("\\\"", "\"")
140142
.replace("\n", "") + ")";

dao-impl/ebean-dao/src/test/java/com/linkedin/metadata/dao/utils/SchemaValidatorUtilTest.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,20 +89,20 @@ public void testCheckIndexExists() {
8989
@Test
9090
@SuppressWarnings("checkstyle:LineLength")
9191
public void testCleanIndexExpression() {
92-
assertEquals("(cast(json_unquote(json_extract(`a_azkabanjobinfo`,'$.aspect.project.clusterInfo.hadoopCluster')) as char(255) charset utf8mb4))",
92+
assertEquals("(cast(json_unquote(json_extract(`a_azkabanjobinfo`,_utf8mb4'$.aspect.project.clusterInfo.hadoopCluster')) as char(255) charset utf8mb4))",
9393
SchemaValidatorUtil.cleanIndexExpression("cast(json_unquote(json_extract(`a_azkabanjobinfo`,_utf8mb4\\'$.aspect.project.clusterInfo.hadoopCluster\\')) as char(255) charset utf8mb4)"));
9494

95-
assertEquals("(cast(json_unquote(json_extract(`a_urn`,'$.\"\\\\\\\\/azkabanFlowUrn\"')) as char(255) charset utf8mb4))",
95+
assertEquals("(cast(json_unquote(json_extract(`a_urn`,_utf8mb4'$.\"\\\\\\\\/azkabanFlowUrn\"')) as char(255) charset utf8mb4))",
9696
SchemaValidatorUtil.cleanIndexExpression("cast(json_unquote(json_extract(`a_urn`,_utf8mb4\\'$.\"\\\\\\\\/azkabanFlowUrn\"\\')) as char(255) charset utf8mb4)"));
9797

98-
assertEquals("(cast(replace(json_unquote(json_extract(`a_datapolicyinfo`,'$.aspect.annotation.ontologyIris[*]')),'\"','') as char(255) charset utf8mb4))",
98+
assertEquals("(cast(replace(json_unquote(json_extract(`a_datapolicyinfo`,_utf8mb3'$.aspect.annotation.ontologyIris[*]')),_utf8mb4'\"',_utf8mb3'') as char(255) charset utf8mb4))",
9999
SchemaValidatorUtil.cleanIndexExpression("cast(replace(json_unquote(json_extract(`a_datapolicyinfo`,_utf8mb3\\'$.aspect.annotation.ontologyIris[*]\\')),_utf8mb4\\'\"\\',_utf8mb3\\'\\') as char(255) charset utf8mb4)"));
100100

101-
assertEquals("(cast(json_unquote(json_extract(`a_urn`,'$.\"\\\\\\\\/dataset\\\\\\\\/platform\\\\\\\\/platformName\"')) as char(255) charset utf8mb4))",
101+
assertEquals("(cast(json_unquote(json_extract(`a_urn`,_utf8mb3'$.\"\\\\\\\\/dataset\\\\\\\\/platform\\\\\\\\/platformName\"')) as char(255) charset utf8mb4))",
102102
SchemaValidatorUtil.cleanIndexExpression("cast(json_unquote(json_extract(`a_urn`,_utf8mb3\\'$.\"\\\\\\\\/dataset\\\\\\\\/platform\\\\\\\\/platformName\"\\')) as char(255) charset utf8mb4)"));
103103

104104
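// Complex AIM use case: concat() of several json_unquote(json_extract(...)) values joined by '.', mixing _utf8mb3 and _utf8mb4 introducers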
105-
assertEquals("(cast(concat(json_unquote(json_extract(`a_model_instance_info`,'$.aspect.multi_product_version.major')),'.',json_unquote(json_extract(`a_model_instance_info`,'$.aspect.multi_product_version.minor')),'.',json_unquote(json_extract(`a_model_instance_info`,'$.aspect.multi_product_version.patch'))) as char(255) charset utf8mb4))",
105+
assertEquals("(cast(concat(json_unquote(json_extract(`a_model_instance_info`,_utf8mb3'$.aspect.multi_product_version.major')),_utf8mb4'.',json_unquote(json_extract(`a_model_instance_info`,_utf8mb3'$.aspect.multi_product_version.minor')),_utf8mb4'.',json_unquote(json_extract(`a_model_instance_info`,_utf8mb3'$.aspect.multi_product_version.patch'))) as char(255) charset utf8mb4))",
106106
SchemaValidatorUtil.cleanIndexExpression("cast(concat(json_unquote(json_extract(`a_model_instance_info`,_utf8mb3\\'$.aspect.multi_product_version.major\\')),_utf8mb4\\'.\\',json_unquote(json_extract(`a_model_instance_info`,_utf8mb3\\'$.aspect.multi_product_version.minor\\')),_utf8mb4\\'.\\',json_unquote(json_extract(`a_model_instance_info`,_utf8mb3\\'$.aspect.multi_product_version.patch\\'))) as char(255) charset utf8mb4)"));
107107

108108
assertNull(SchemaValidatorUtil.cleanIndexExpression(null));

0 commit comments

Comments
 (0)