From 6b36f2e09b96d1528bc1d382fa9fe5df001ad62c Mon Sep 17 00:00:00 2001 From: Ralph Ursprung Date: Fri, 10 Jan 2025 10:49:25 +0100 Subject: [PATCH] `phone-search` analyzer: don't emit int'l prefix this was an oversight in the initial implementation: if the tokenizer emits the international calling prefix in the search analyzer then all documents with the same international calling prefix will match. e.g. when searching for `+1-555-123-4567` not only documents with this number would match but also any other document with a `1` token (i.e. any other number with this prefix). thus the search functionality is currently broken for this analyzer, making it useless. the test coverage has now been extended to cover these and other use-cases. Signed-off-by: Ralph Ursprung --- CHANGELOG.md | 1 + .../phone/PhoneNumberTermTokenizer.java | 7 +- .../phone/PhoneNumberAnalyzerTests.java | 10 +-- .../test/analysis-phone/20_search.yml | 67 +++++++++++++++++++ 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9aabbbf75f00c..eeeb752eabdd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,6 +91,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix remote shards balance ([#15335](https://github.com/opensearch-project/OpenSearch/pull/15335)) - Always use `constant_score` query for `match_only_text` field ([#16964](https://github.com/opensearch-project/OpenSearch/pull/16964)) - Fix Shallow copy snapshot failures on closed index ([#16868](https://github.com/opensearch-project/OpenSearch/pull/16868)) +- The `phone-search` analyzer no longer emits the international calling code as a token ([#16993](https://github.com/opensearch-project/OpenSearch/pull/16993)) ### Security diff --git a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java index 6b95594204eb4..90f1c38bd133b 100644 --- a/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java +++ b/plugins/analysis-phonenumber/src/main/java/org/opensearch/analysis/phone/PhoneNumberTermTokenizer.java @@ -128,8 +128,11 @@ private Set getTokens() throws IOException { countryCode = Optional.of(String.valueOf(numberProto.getCountryCode())); input = String.valueOf(numberProto.getNationalNumber()); - // Add Country code, extension, and the number as tokens - tokens.add(countryCode.get()); + if (addNgrams) { + // Consider the country code as an ngram - it makes no sense in the search analyzer as it'd match all values with the same country code + tokens.add(countryCode.get()); + } + // Add extension, and the number as tokens tokens.add(countryCode.get() + input); if (!Strings.isEmpty(numberProto.getExtension())) { tokens.add(numberProto.getExtension()); diff --git a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java index 332f6d21f47d6..1dc2f32e84921 100644 --- a/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java +++ b/plugins/analysis-phonenumber/src/test/java/org/opensearch/analysis/phone/PhoneNumberAnalyzerTests.java @@ -90,7 +90,7 @@ public void testEuropeDetailledSearch() throws IOException { assertTokensAreInAnyOrder( phoneSearchAnalyzer, "tel:+441344840400", - Arrays.asList("tel:+441344840400", "tel:", "441344840400", "44", "1344840400") + Arrays.asList("tel:+441344840400", "tel:", "441344840400", "1344840400") ); } @@ -189,21 +189,21 @@ public void testLocalNumberWithCH() throws IOException { } public void testSearchInternationalPrefixWithZZ() throws IOException { - assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensInclude(phoneSearchAnalyzer, "+41583161010", Arrays.asList("41583161010", "583161010")); } public void testSearchInternationalPrefixWithCH() throws IOException { - assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensInclude(phoneSearchCHAnalyzer, "+41583161010", Arrays.asList("41583161010", "583161010")); } public void testSearchNationalPrefixWithCH() throws IOException { // + is equivalent to 00 in Switzerland - assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensInclude(phoneSearchCHAnalyzer, "0041583161010", Arrays.asList("41583161010", "583161010")); } public void testSearchLocalNumberWithCH() throws IOException { // when omitting the international prefix swiss numbers must start with '0' - assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41", "41583161010", "583161010")); + assertTokensInclude(phoneSearchCHAnalyzer, "0583161010", Arrays.asList("41583161010", "583161010")); } /** diff --git a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml index 0bd7d2c371bfc..91560f3960e53 100644 --- a/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml +++ b/plugins/analysis-phonenumber/src/yamlRestTest/resources/rest-api-spec/test/analysis-phone/20_search.yml @@ -32,9 +32,20 @@ index: test id: 1 body: { "phone": "+41 58 316 10 10", "phone-ch": "058 316 10 10" } + - do: + index: + index: test + id: 2 + body: { "phone": "+41 58 316 99 99", "phone-ch": "058 316 99 99" } + - do: + index: + index: test + id: 2 + body: { "phone": "+1-888-280-4331", "phone-ch": "+1-888-280-4331" } - do: indices.refresh: {} + # international format in document & search will always work - do: search: rest_total_hits_as_int: true @@ -45,6 +56,7 @@ "phone": "+41583161010" - match: { hits.total: 1 } + # correct national format & international format in search will always work - do: search: rest_total_hits_as_int: true @@ -54,3 +66,58 @@ match: "phone-ch": "+41583161010" - match: { hits.total: 1 } + + # national format without country specified won't work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "0583161010" + - match: { hits.total: 0 } + + # correct national format with country specified in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "0583161010" + - match: { hits.total: 1 } + + # international format in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "+1 888 280 4331" + - match: { hits.total: 1 } + + # international format in document & search will always work + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone-ch": "+1 888 280 4331" + - match: { hits.total: 1 } + + # national format in search won't work if no country is specified + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + match: + "phone": "888 280 4331" + - match: { hits.total: 0 }