From f6e40b3b4510172f8884aa33dd4e145949aa2611 Mon Sep 17 00:00:00 2001 From: Josh Mock Date: Wed, 16 Apr 2025 15:13:36 -0500 Subject: [PATCH 1/4] Add several missing token filter types --- .../_types/analysis/token_filters.ts | 106 +++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/specification/_types/analysis/token_filters.ts b/specification/_types/analysis/token_filters.ts index 5035e5680b..d0b84582f8 100644 --- a/specification/_types/analysis/token_filters.ts +++ b/specification/_types/analysis/token_filters.ts @@ -342,6 +342,94 @@ export class UppercaseTokenFilter extends TokenFilterBase { type: 'uppercase' } +export class ApostropheTokenFilter extends TokenFilterBase { + type: 'apostrophe' +} + +export class ArabicNormalizationTokenFilter extends TokenFilterBase { + type: 'arabic_normalization' +} + +export enum CjkBigramIgnoredScript { + han, + hangul, + hiragana, + katakana +} + +export class CjkBigramTokenFilter extends TokenFilterBase { + type: 'cjk_bigram' + /** Array of character scripts for which to disable bigrams. */ + ignored_scripts?: CjkBigramIgnoredScript[] + /** If `true`, emit tokens in both bigram and unigram form. If `false`, a CJK character is output in unigram form when it has no adjacent characters. Defaults to `false`. */ + output_unigrams?: boolean +} + +export class CjkWidthTokenFilter extends TokenFilterBase { + type: 'cjk_width' +} + +export class ClassicTokenFilter extends TokenFilterBase { + type: 'classic' +} + +export class DecimalDigitTokenFilter extends TokenFilterBase { + type: 'decimal_digit' +} + +export class FlattenGraphTokenFilter extends TokenFilterBase { + type: 'flatten_graph' +} + +export class GermanNormalizationTokenFilter extends TokenFilterBase { + type: 'german_normalization' +} + +export class HindiNormalizationTokenFilter extends TokenFilterBase { + type: 'hindi_normalization' +} + +export class IndicNormalizationTokenFilter extends TokenFilterBase { + type: 'indic_normalization' +} + +export class KeywordRepeatTokenFilter extends TokenFilterBase { + type: 'keyword_repeat' +} + +export class MinHashTokenFilter extends TokenFilterBase { + type: 'min_hash' + /** Number of buckets to which hashes are assigned. Defaults to `512`. */ + bucket_count?: integer + /** Number of ways to hash each token in the stream. Defaults to `1`. */ + hash_count?: integer + /** Number of hashes to keep from each bucket. Defaults to `1`. + * Hashes are retained by ascending size, starting with the bucket’s smallest hash first. */ + hash_set_size?: integer + /** If `true`, the filter fills empty buckets with the value of the first non-empty bucket to its circular right if the `hash_set_size` is `1`. If the `bucket_count` argument is greater than 1, this parameter defaults to `true`. Otherwise, this parameter defaults to `false`. 
*/ + with_rotation?: boolean +} + +export class PersianNormalizationTokenFilter extends TokenFilterBase { + type: 'persian_normalization' +} + +export class ScandinavianFoldingTokenFilter extends TokenFilterBase { + type: 'scandinavian_folding' +} + +export class ScandinavianNormalizationTokenFilter extends TokenFilterBase { + type: 'scandinavian_normalization' +} + +export class SerbianNormalizationTokenFilter extends TokenFilterBase { + type: 'serbian_normalization' +} + +export class SoraniNormalizationTokenFilter extends TokenFilterBase { + type: 'sorani_normalization' +} + /** * @codegen_names name, definition * @ext_doc_id analysis-tokenfilters @@ -354,34 +442,50 @@ export type TokenFilter = string | TokenFilterDefinition * @non_exhaustive */ export type TokenFilterDefinition = + | ApostropheTokenFilter + | ArabicNormalizationTokenFilter | AsciiFoldingTokenFilter + | CjkBigramTokenFilter + | CjkWidthTokenFilter + | ClassicTokenFilter | CommonGramsTokenFilter | ConditionTokenFilter + | DecimalDigitTokenFilter | DelimitedPayloadTokenFilter - //DictionaryDecompounderTokenFilter | | EdgeNGramTokenFilter | ElisionTokenFilter | FingerprintTokenFilter + | FlattenGraphTokenFilter + | GermanNormalizationTokenFilter + | HindiNormalizationTokenFilter | HunspellTokenFilter | HyphenationDecompounderTokenFilter + | IndicNormalizationTokenFilter | KeepTypesTokenFilter | KeepWordsTokenFilter | KeywordMarkerTokenFilter + | KeywordRepeatTokenFilter | KStemTokenFilter | LengthTokenFilter | LimitTokenCountTokenFilter | LowercaseTokenFilter + | MinHashTokenFilter | MultiplexerTokenFilter | NGramTokenFilter | NoriPartOfSpeechTokenFilter | PatternCaptureTokenFilter | PatternReplaceTokenFilter + | PersianNormalizationTokenFilter | PorterStemTokenFilter | PredicateTokenFilter | RemoveDuplicatesTokenFilter | ReverseTokenFilter + | ScandinavianFoldingTokenFilter + | ScandinavianNormalizationTokenFilter + | SerbianNormalizationTokenFilter | ShingleTokenFilter | SnowballTokenFilter + | SoraniNormalizationTokenFilter | StemmerOverrideTokenFilter | StemmerTokenFilter | StopTokenFilter From c8b4de54f9fb66123a0a9a81d191e9fe70f64d9e Mon Sep 17 00:00:00 2001 From: Josh Mock Date: Wed, 16 Apr 2025 15:23:55 -0500 Subject: [PATCH 2/4] Lots of docstrings and combining redundant definitions into parent classes --- .../_types/analysis/token_filters.ts | 102 ++++++++++++------ 1 file changed, 72 insertions(+), 30 deletions(-) diff --git a/specification/_types/analysis/token_filters.ts b/specification/_types/analysis/token_filters.ts index d0b84582f8..66fcdd9b0c 100644 --- a/specification/_types/analysis/token_filters.ts +++ b/specification/_types/analysis/token_filters.ts @@ -41,12 +41,20 @@ export class TokenFilterBase { } export class CompoundWordTokenFilterBase extends TokenFilterBase { - hyphenation_patterns_path?: string + /** Maximum subword character length. Longer subword tokens are excluded from the output. Defaults to `15`. */ max_subword_size?: integer + /** Minimum subword character length. Shorter subword tokens are excluded from the output. Defaults to `2`. */ min_subword_size?: integer + /** Minimum word character length. Shorter word tokens are excluded from the output. Defaults to `5`. */ min_word_size?: integer + /** If `true`, only include the longest matching subword. Defaults to `false`. */ only_longest_match?: boolean + /** A list of subwords to look for in the token stream. If found, the subword is included in the token output. 
+ * Either this parameter or `word_list_path` must be specified.*/ word_list?: string[] + /** Path to a file that contains a list of subwords to find in the token stream. If found, the subword is included in the token output. + * This path must be absolute or relative to the config location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. + * Either this parameter or `word_list` must be specified. */ word_list_path?: string } @@ -56,6 +64,13 @@ export class DictionaryDecompounderTokenFilter extends CompoundWordTokenFilterBa export class HyphenationDecompounderTokenFilter extends CompoundWordTokenFilterBase { type: 'hyphenation_decompounder' + /** Path to an Apache FOP (Formatting Objects Processor) XML hyphenation pattern file. + * This path must be absolute or relative to the `config` location. Only FOP v1.2 compatible files are supported. */ + hyphenation_patterns_path: string + /** If `true`, do not match sub tokens in tokens that are in the word list. Defaults to `false`. */ + no_sub_matches?: boolean + /** If `true`, do not allow overlapping tokens. Defaults to `false`. */ + no_overlapping_matches?: boolean } export enum DelimitedPayloadEncoding { @@ -66,7 +81,9 @@ export enum DelimitedPayloadEncoding { export class DelimitedPayloadTokenFilter extends TokenFilterBase { type: 'delimited_payload' + /** Character used to separate tokens from payloads. Defaults to `|`. */ delimiter?: string + /** Data type for the stored payload. */ encoding?: DelimitedPayloadEncoding } @@ -77,27 +94,42 @@ export enum EdgeNGramSide { export class EdgeNGramTokenFilter extends TokenFilterBase { type: 'edge_ngram' + /** Maximum character length of a gram. For custom token filters, defaults to `2`. For the built-in edge_ngram filter, defaults to `1`. */ max_gram?: integer + /** Minimum character length of a gram. Defaults to `1`. */ min_gram?: integer + /** Indicates whether to truncate tokens from the `front` or `back`. Defaults to `front`. */ side?: EdgeNGramSide + /** Emits original token when set to `true`. Defaults to `false`. */ preserve_original?: Stringified } export class ShingleTokenFilter extends TokenFilterBase { type: 'shingle' + /** String used in shingles as a replacement for empty positions that do not contain a token. This filler token is only used in shingles, not original unigrams. Defaults to an underscore (`_`). */ filler_token?: string - max_shingle_size?: integer | string // TODO: should be only int - min_shingle_size?: integer | string // TODO: should be only int + /** Maximum number of tokens to concatenate when creating shingles. Defaults to `2`. */ + max_shingle_size?: Stringified + /** Minimum number of tokens to concatenate when creating shingles. Defaults to `2`. */ + min_shingle_size?: Stringified + /** If `true`, the output includes the original input tokens. If `false`, the output only includes shingles; the original input tokens are removed. Defaults to `true`. */ output_unigrams?: boolean + /** If `true`, the output includes the original input tokens only if no shingles are produced; if shingles are produced, the output only includes shingles. Defaults to `false`. */ output_unigrams_if_no_shingles?: boolean + /** Separator used to concatenate adjacent tokens to form a shingle. Defaults to a space (`" "`). */ token_separator?: string } export class StopTokenFilter extends TokenFilterBase { type: 'stop' + /** If `true`, stop word matching is case insensitive. 
For example, if `true`, a stop word of `the` matches and removes `The`, `THE`, or `the`. Defaults to `false`. */ ignore_case?: boolean + /** If `true`, the last token of a stream is removed if it’s a stop word. Defaults to `true`. */ remove_trailing?: boolean + /** Language value, such as `_arabic_` or `_thai_`. Defaults to `_english_`. */ stopwords?: StopWords + /** Path to a file that contains a list of stop words to remove. + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each stop word in the file must be separated by a line break. */ stopwords_path?: string } @@ -106,64 +138,74 @@ export enum SynonymFormat { wordnet } -export class SynonymGraphTokenFilter extends TokenFilterBase { - type: 'synonym_graph' +export class SynonymTokenFilterBase extends TokenFilterBase { + /** Expands definitions for equivalent synonym rules. Defaults to `true`. */ expand?: boolean + /** Sets the synonym rules format. */ format?: SynonymFormat + /** If `true`, ignores errors while parsing the synonym rules. Note that only those synonym rules that cannot be parsed are ignored. Defaults to the value of the `updateable` setting. */ lenient?: boolean + /** Used to define inline synonyms. */ synonyms?: string[] + /** Used to provide a synonym file. This path must be absolute or relative to the `config` location. */ synonyms_path?: string + /** Provide a synonym set created via Synonyms Management APIs. */ synonyms_set?: string + /** Controls the tokenizer that will be used to tokenize the synonyms. This parameter is for backwards compatibility with indices created before 6.0. + * @deprecated 6.0.0 */ tokenizer?: string + /** If `true`, allows reloading search analyzers to pick up changes to synonym files. Only to be used for search analyzers. Defaults to `false`. */ updateable?: boolean } -export class SynonymTokenFilter extends TokenFilterBase { +export class SynonymGraphTokenFilter extends SynonymTokenFilterBase { + type: 'synonym_graph' +} + +export class SynonymTokenFilter extends SynonymTokenFilterBase { type: 'synonym' - expand?: boolean - format?: SynonymFormat - lenient?: boolean - synonyms?: string[] - synonyms_path?: string - synonyms_set?: string - tokenizer?: string - updateable?: boolean } -export class WordDelimiterTokenFilter extends TokenFilterBase { - type: 'word_delimiter' +export class WordDelimiterTokenFilterBase extends TokenFilterBase { + /** If `true`, the filter produces catenated tokens for chains of alphanumeric characters separated by non-alphabetic delimiters. Defaults to `false`. */ catenate_all?: boolean + /** If `true`, the filter produces catenated tokens for chains of numeric characters separated by non-alphabetic delimiters. Defaults to `false`. */ catenate_numbers?: boolean + /** If `true`, the filter produces catenated tokens for chains of alphabetical characters separated by non-alphabetic delimiters. Defaults to `false`. */ catenate_words?: boolean + /** If `true`, the filter includes tokens consisting of only numeric characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. */ generate_number_parts?: boolean + /** If `true`, the filter includes tokens consisting of only alphabetical characters in the output. If `false`, the filter excludes these tokens from the output. Defaults to `true`. */ generate_word_parts?: boolean + /** If `true`, the filter includes the original version of any split tokens in the output.
This original version includes non-alphanumeric delimiters. Defaults to `false`. */ preserve_original?: Stringified + /** Array of tokens the filter won’t split. */ protected_words?: string[] + /** Path to a file that contains a list of tokens the filter won’t split. + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. */ protected_words_path?: string + /** If `true`, the filter splits tokens at letter case transitions. For example: camelCase -> [ camel, Case ]. Defaults to `true`. */ split_on_case_change?: boolean + /** If `true`, the filter splits tokens at letter-number transitions. For example: j2se -> [ j, 2, se ]. Defaults to `true`. */ split_on_numerics?: boolean + /** If `true`, the filter removes the English possessive (`'s`) from the end of each token. For example: O'Neil's -> [ O, Neil ]. Defaults to `true`. */ stem_english_possessive?: boolean + /** Array of custom type mappings for characters. This allows you to map non-alphanumeric characters as numeric or alphanumeric to avoid splitting on those characters. */ type_table?: string[] + /** Path to a file that contains custom type mappings for characters. This allows you to map non-alphanumeric characters as numeric or alphanumeric to avoid splitting on those characters. */ type_table_path?: string } -export class WordDelimiterGraphTokenFilter extends TokenFilterBase { +export class WordDelimiterTokenFilter extends WordDelimiterTokenFilterBase { + type: 'word_delimiter' +} + +export class WordDelimiterGraphTokenFilter extends WordDelimiterTokenFilterBase { type: 'word_delimiter_graph' + /** If `true`, the filter adjusts the offsets of split or catenated tokens to better reflect their actual position in the token stream. Defaults to `true`. */ adjust_offsets?: boolean - catenate_all?: boolean - catenate_numbers?: boolean - catenate_words?: boolean - generate_number_parts?: boolean - generate_word_parts?: boolean + /** If `true`, the filter skips tokens with a keyword attribute of true. Defaults to `false`. */ ignore_keywords?: boolean - preserve_original?: Stringified - protected_words?: string[] - protected_words_path?: string - split_on_case_change?: boolean - split_on_numerics?: boolean - stem_english_possessive?: boolean - type_table?: string[] - type_table_path?: string } export class AsciiFoldingTokenFilter extends TokenFilterBase { From f1101b08014a4b823bb412135d609f0f27966b8f Mon Sep 17 00:00:00 2001 From: Josh Mock Date: Thu, 17 Apr 2025 13:25:53 -0500 Subject: [PATCH 3/4] Token filter updates --- specification/_types/analysis/StopWords.ts | 41 ++++++++- .../_types/analysis/kuromoji-plugin.ts | 6 ++ specification/_types/analysis/languages.ts | 5 + specification/_types/analysis/nori-plugin.ts | 8 ++ .../_types/analysis/token_filters.ts | 92 ++++++++++++++++--- 5 files changed, 137 insertions(+), 15 deletions(-) diff --git a/specification/_types/analysis/StopWords.ts b/specification/_types/analysis/StopWords.ts index 03fa0490a3..c9c78d3f58 100644 --- a/specification/_types/analysis/StopWords.ts +++ b/specification/_types/analysis/StopWords.ts @@ -17,10 +17,49 @@ * under the License. 
*/ +export enum StopWord { + _arabic_, + _armenian_, + _basque_, + _bengali_, + _brazilian_, + _bulgarian_, + _catalan_, + _cjk_, + _czech_, + _danish_, + _dutch_, + _english_, + _estonian_, + _finnish_, + _french_, + _galician_, + _german_, + _greek_, + _hindi_, + _hungarian_, + _indonesian_, + _irish_, + _italian_, + _latvian_, + _lithuanian_, + _norwegian_, + _persian_, + _portuguese_, + _romanian_, + _russian_, + _serbian_, + _sorani_, + _spanish_, + _swedish_, + _thai_, + _turkish_ +} + /** * Language value, such as _arabic_ or _thai_. Defaults to _english_. * Each language value corresponds to a predefined list of stop words in Lucene. See Stop words by language for supported language values and their stop words. * Also accepts an array of stop words. * @class_serializer: StopWordsFormatter */ -export type StopWords = string | string[] +export type StopWords = StopWord | StopWord[] diff --git a/specification/_types/analysis/kuromoji-plugin.ts b/specification/_types/analysis/kuromoji-plugin.ts index 4a04242cbf..dfc17fe826 100644 --- a/specification/_types/analysis/kuromoji-plugin.ts +++ b/specification/_types/analysis/kuromoji-plugin.ts @@ -21,6 +21,7 @@ import { integer } from '@_types/Numeric' import { CharFilterBase } from './char_filters' import { TokenizerBase } from './tokenizers' import { TokenFilterBase } from './token_filters' +import { StopWords } from './StopWords' export class KuromojiAnalyzer { type: 'kuromoji' @@ -28,6 +29,11 @@ export class KuromojiAnalyzer { user_dictionary?: string } +export class JaStopTokenFilter extends TokenFilterBase { + type: 'ja_stop' + stopwords?: StopWords +} + export class KuromojiIterationMarkCharFilter extends CharFilterBase { type: 'kuromoji_iteration_mark' normalize_kana: boolean diff --git a/specification/_types/analysis/languages.ts b/specification/_types/analysis/languages.ts index 427848a45d..2aceb3918a 100644 --- a/specification/_types/analysis/languages.ts +++ b/specification/_types/analysis/languages.ts @@ -18,25 +18,30 @@ */ export enum SnowballLanguage { + Arabic, Armenian, Basque, Catalan, Danish, Dutch, English, + Estonian, Finnish, French, German, German2, Hungarian, Italian, + Irish, Kp, + Lithuanian, Lovins, Norwegian, Porter, Portuguese, Romanian, Russian, + Serbian, Spanish, Swedish, Turkish diff --git a/specification/_types/analysis/nori-plugin.ts b/specification/_types/analysis/nori-plugin.ts index b245996e72..18d71be2c4 100644 --- a/specification/_types/analysis/nori-plugin.ts +++ b/specification/_types/analysis/nori-plugin.ts @@ -18,6 +18,7 @@ */ import { TokenizerBase } from './tokenizers' +import { TokenFilterBase } from './token_filters' export enum NoriDecompoundMode { discard, @@ -32,3 +33,10 @@ export class NoriTokenizer extends TokenizerBase { user_dictionary?: string user_dictionary_rules?: string[] } + +export class NoriPartOfSpeechTokenFilter extends TokenFilterBase { + type: 'nori_part_of_speech' + /** An array of part-of-speech tags that should be removed. 
*/ + stoptags?: string[] +} + diff --git a/specification/_types/analysis/token_filters.ts b/specification/_types/analysis/token_filters.ts index 66fcdd9b0c..0e8212f968 100644 --- a/specification/_types/analysis/token_filters.ts +++ b/specification/_types/analysis/token_filters.ts @@ -30,8 +30,10 @@ import { import { KuromojiPartOfSpeechTokenFilter, KuromojiReadingFormTokenFilter, - KuromojiStemmerTokenFilter + KuromojiStemmerTokenFilter, + JaStopTokenFilter } from './kuromoji-plugin' +import { NoriPartOfSpeechTokenFilter } from './nori-plugin' import { SnowballLanguage } from './languages' import { PhoneticTokenFilter } from './phonetic-plugin' import { StopWords } from './StopWords' @@ -210,49 +212,73 @@ export class WordDelimiterGraphTokenFilter extends WordDelimiterTokenFilterBase export class AsciiFoldingTokenFilter extends TokenFilterBase { type: 'asciifolding' + /** If `true`, emit both original tokens and folded tokens. Defaults to `false`. */ preserve_original?: Stringified } export class CommonGramsTokenFilter extends TokenFilterBase { type: 'common_grams' + /** A list of tokens. The filter generates bigrams for these tokens. + * Either this or the `common_words_path` parameter is required. */ common_words?: string[] + /** Path to a file containing a list of tokens. The filter generates bigrams for these tokens. + * This path must be absolute or relative to the `config` location. The file must be UTF-8 encoded. Each token in the file must be separated by a line break. + * Either this or the `common_words` parameter is required. */ common_words_path?: string + /** If `true`, matching for common words is case-insensitive. Defaults to `false`. */ ignore_case?: boolean + /** If `true`, the filter excludes the following tokens from the output: + * - Unigrams for common words + * - Unigrams for terms followed by common words + * Defaults to `false`. We recommend enabling this parameter for search analyzers. */ query_mode?: boolean } export class ConditionTokenFilter extends TokenFilterBase { type: 'condition' + /** Array of token filters. If a token matches the predicate script in the `script` parameter, these filters are applied to the token in the order provided. */ filter: string[] + /** Predicate script used to apply token filters. If a token matches this script, the filters in the `filter` parameter are applied to the token. */ script: Script } export class ElisionTokenFilter extends TokenFilterBase { type: 'elision' + /** List of elisions to remove. + * To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed. + * For custom `elision` filters, either this parameter or `articles_path` must be specified. */ articles?: string[] + /** Path to a file that contains a list of elisions to remove. + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each elision in the file must be separated by a line break. + * To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed. + * For custom `elision` filters, either this parameter or `articles` must be specified. */ articles_path?: string + /** If `true`, elision matching is case insensitive. If `false`, elision matching is case sensitive. Defaults to `false`.
*/ + articles_case?: Stringified } export class FingerprintTokenFilter extends TokenFilterBase { type: 'fingerprint' + /** Maximum character length, including whitespace, of the output token. Defaults to `255`. Concatenated tokens longer than this will result in no token output. */ max_output_size?: integer + /** Character to use to concatenate the token stream input. Defaults to a space. */ separator?: string } export class HunspellTokenFilter extends TokenFilterBase { type: 'hunspell' + /** If `true`, duplicate tokens are removed from the filter’s output. Defaults to `true`. */ dedup?: boolean + /** One or more `.dic` files (e.g., `en_US.dic`, `my_custom.dic`) to use for the Hunspell dictionary. + * By default, the `hunspell` filter uses all `.dic` files in the `<$ES_PATH_CONF>/hunspell/` directory specified using the `lang`, `language`, or `locale` parameter. */ dictionary?: string + /** Locale directory used to specify the `.aff` and `.dic` files for a Hunspell dictionary. + * @aliases lang, language */ locale: string + /** If `true`, only the longest stemmed version of each token is included in the output. If `false`, all stemmed versions of the token are included. Defaults to `false`. */ longest_only?: boolean } -export class JaStopTokenFilter extends TokenFilterBase { - type: 'ja_stop' - stopwords?: StopWords -} - export enum KeepTypesMode { include, exclude } export class KeepTypesTokenFilter extends TokenFilterBase { type: 'keep_types' + /** Indicates whether to keep or remove the specified token types. */ mode?: KeepTypesMode - types?: string[] + /** List of token types to keep or remove. */ + types: string[] } export class KeepWordsTokenFilter extends TokenFilterBase { type: 'keep' + /** List of words to keep. Only tokens that match words in this list are included in the output. + * Either this parameter or `keep_words_path` must be specified. */ keep_words?: string[] + /** If `true`, lowercase all keep words. Defaults to `false`. */ keep_words_case?: boolean + /** Path to a file that contains a list of words to keep. Only tokens that match words in this list are included in the output. + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break. + * Either this parameter or `keep_words` must be specified. */ keep_words_path?: string } export class KeywordMarkerTokenFilter extends TokenFilterBase { type: 'keyword_marker' + /** If `true`, matching for the `keywords` and `keywords_path` parameters ignores letter case. Defaults to `false`. */ ignore_case?: boolean + /** Array of keywords. Tokens that match these keywords are not stemmed. + * This parameter, `keywords_path`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */ keywords?: string | string[] + /** Path to a file that contains a list of keywords. Tokens that match these keywords are not stemmed. + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break. + * This parameter, `keywords`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */ keywords_path?: string + /** Java regular expression used to match tokens. Tokens that match this expression are marked as keywords and not stemmed. + * This parameter, `keywords`, or `keywords_path` must be specified.
You cannot specify this parameter and `keywords` or `keywords_path`. */ keywords_pattern?: string } @@ -285,50 +327,65 @@ export class KStemTokenFilter extends TokenFilterBase { export class LengthTokenFilter extends TokenFilterBase { type: 'length' + /** Maximum character length of a token. Longer tokens are excluded from the output. Defaults to `Integer.MAX_VALUE`, which is `2^31-1` or `2147483647`. */ max?: integer + /** Minimum character length of a token. Shorter tokens are excluded from the output. Defaults to `0`. */ min?: integer } export class LimitTokenCountTokenFilter extends TokenFilterBase { type: 'limit' + /** If `true`, the limit filter exhausts the token stream, even if the `max_token_count` has already been reached. Defaults to `false`. */ consume_all_tokens?: boolean + /** Maximum number of tokens to keep. Once this limit is reached, any remaining tokens are excluded from the output. Defaults to `1`. */ max_token_count?: Stringified } +export enum LowercaseTokenFilterLanguages { + greek, + irish, + turkish +} + export class LowercaseTokenFilter extends TokenFilterBase { type: 'lowercase' - language?: string + /** Language-specific lowercase token filter to use. */ + language?: LowercaseTokenFilterLanguages } export class MultiplexerTokenFilter extends TokenFilterBase { type: 'multiplexer' + /** A list of token filters to apply to incoming tokens. */ filters: string[] + /** If `true` (the default), emit the original token in addition to the filtered tokens. */ preserve_original?: Stringified } export class NGramTokenFilter extends TokenFilterBase { type: 'ngram' + /** Maximum length of characters in a gram. Defaults to `2`. */ max_gram?: integer + /** Minimum length of characters in a gram. Defaults to `1`. */ min_gram?: integer + /** Emits original token when set to `true`. Defaults to `false`. */ preserve_original?: Stringified } -export class NoriPartOfSpeechTokenFilter extends TokenFilterBase { - type: 'nori_part_of_speech' - stoptags?: string[] -} - export class PatternCaptureTokenFilter extends TokenFilterBase { type: 'pattern_capture' + /** A list of regular expressions to match. */ patterns: string[] + /** If set to `true` (the default), it will emit the original token. */ preserve_original?: Stringified } export class PatternReplaceTokenFilter extends TokenFilterBase { type: 'pattern_replace' + /** If `true`, all substrings matching the pattern parameter’s regular expression are replaced. If `false`, the filter replaces only the first matching substring in each token. Defaults to `true`. */ all?: boolean - flags?: string + /** Regular expression, written in Java’s regular expression syntax. The filter replaces token substrings matching this pattern with the substring in the `replacement` parameter. */ pattern: string + /** Replacement substring. Defaults to an empty substring (`""`). */ replacement?: string } @@ -338,6 +395,7 @@ export class PorterStemTokenFilter extends TokenFilterBase { export class PredicateTokenFilter extends TokenFilterBase { type: 'predicate_token_filter' + /** Script containing a condition used to filter incoming tokens. Only tokens that match this script are included in the output. */ script: Script } @@ -351,12 +409,15 @@ export class ReverseTokenFilter extends TokenFilterBase { export class SnowballTokenFilter extends TokenFilterBase { type: 'snowball' + /** Controls the language used by the stemmer.
*/ language?: SnowballLanguage } export class StemmerOverrideTokenFilter extends TokenFilterBase { type: 'stemmer_override' + /** A list of mapping rules to use. */ rules?: string[] + /** A path (either relative to `config` location, or absolute) to a list of mappings. */ rules_path?: string } @@ -372,11 +433,13 @@ export class TrimTokenFilter extends TokenFilterBase { export class TruncateTokenFilter extends TokenFilterBase { type: 'truncate' + /** Character limit for each token. Tokens exceeding this limit are truncated. Defaults to `10`. */ length?: integer } export class UniqueTokenFilter extends TokenFilterBase { type: 'unique' + /** If `true`, only remove duplicate tokens in the same position. Defaults to `false`. */ only_on_same_position?: boolean } @@ -539,6 +602,7 @@ export type TokenFilterDefinition = | UppercaseTokenFilter | WordDelimiterGraphTokenFilter | WordDelimiterTokenFilter + | JaStopTokenFilter | KuromojiStemmerTokenFilter | KuromojiReadingFormTokenFilter | KuromojiPartOfSpeechTokenFilter From 585bb865bf14b8c1f71cd6584abe5cad67b2edb9 Mon Sep 17 00:00:00 2001 From: Josh Mock Date: Thu, 17 Apr 2025 13:39:17 -0500 Subject: [PATCH 4/4] Appease the linter --- .../_types/analysis/kuromoji-plugin.ts | 2 +- specification/_types/analysis/nori-plugin.ts | 1 - .../_types/analysis/token_filters.ts | 60 +++++++++---------- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/specification/_types/analysis/kuromoji-plugin.ts b/specification/_types/analysis/kuromoji-plugin.ts index dfc17fe826..adaf90a370 100644 --- a/specification/_types/analysis/kuromoji-plugin.ts +++ b/specification/_types/analysis/kuromoji-plugin.ts @@ -19,9 +19,9 @@ import { integer } from '@_types/Numeric' import { CharFilterBase } from './char_filters' +import { StopWords } from './StopWords' import { TokenizerBase } from './tokenizers' import { TokenFilterBase } from './token_filters' -import { StopWords } from './StopWords' export class KuromojiAnalyzer { type: 'kuromoji' diff --git a/specification/_types/analysis/nori-plugin.ts b/specification/_types/analysis/nori-plugin.ts index 18d71be2c4..282e376bc8 100644 --- a/specification/_types/analysis/nori-plugin.ts +++ b/specification/_types/analysis/nori-plugin.ts @@ -39,4 +39,3 @@ export class NoriPartOfSpeechTokenFilter extends TokenFilterBase { /** An array of part-of-speech tags that should be removed. */ stoptags?: string[] } - diff --git a/specification/_types/analysis/token_filters.ts b/specification/_types/analysis/token_filters.ts index 0e8212f968..f75f2db1dc 100644 --- a/specification/_types/analysis/token_filters.ts +++ b/specification/_types/analysis/token_filters.ts @@ -28,13 +28,13 @@ import { IcuTransformTokenFilter } from './icu-plugin' import { + JaStopTokenFilter, KuromojiPartOfSpeechTokenFilter, KuromojiReadingFormTokenFilter, - KuromojiStemmerTokenFilter, - JaStopTokenFilter + KuromojiStemmerTokenFilter } from './kuromoji-plugin' -import { NoriPartOfSpeechTokenFilter } from './nori-plugin' import { SnowballLanguage } from './languages' +import { NoriPartOfSpeechTokenFilter } from './nori-plugin' import { PhoneticTokenFilter } from './phonetic-plugin' import { StopWords } from './StopWords' @@ -52,11 +52,11 @@ export class CompoundWordTokenFilterBase extends TokenFilterBase { /** If `true`, only include the longest matching subword. Defaults to `false`. */ only_longest_match?: boolean /** A list of subwords to look for in the token stream. If found, the subword is included in the token output. 
- * Either this parameter or `word_list_path` must be specified.*/ + * Either this parameter or `word_list_path` must be specified.*/ word_list?: string[] /** Path to a file that contains a list of subwords to find in the token stream. If found, the subword is included in the token output. - * This path must be absolute or relative to the config location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. - * Either this parameter or `word_list` must be specified. */ + * This path must be absolute or relative to the config location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. + * Either this parameter or `word_list` must be specified. */ word_list_path?: string } @@ -67,7 +67,7 @@ export class DictionaryDecompounderTokenFilter extends CompoundWordTokenFilterBa export class HyphenationDecompounderTokenFilter extends CompoundWordTokenFilterBase { type: 'hyphenation_decompounder' /** Path to an Apache FOP (Formatting Objects Processor) XML hyphenation pattern file. - * This path must be absolute or relative to the `config` location. Only FOP v1.2 compatible files are supported. */ + * This path must be absolute or relative to the `config` location. Only FOP v1.2 compatible files are supported. */ hyphenation_patterns_path: string /** If `true`, do not match sub tokens in tokens that are in the word list. Defaults to `false`. */ no_sub_matches?: boolean @@ -131,7 +131,7 @@ export class StopTokenFilter extends TokenFilterBase { /** Language value, such as `_arabic_` or `_thai_`. Defaults to `_english_`. */ stopwords?: StopWords /** Path to a file that contains a list of stop words to remove. - * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each stop word in the file must be separated by a line break. */ + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each stop word in the file must be separated by a line break. */ stopwords_path?: string } @@ -184,7 +184,7 @@ export class WordDelimiterTokenFilterBase extends TokenFilterBase { /** Array of tokens the filter won’t split. */ protected_words?: string[] /** Path to a file that contains a list of tokens the filter won’t split. - * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. */ + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each token in the file must be separated by a line break. */ protected_words_path?: string /** If `true`, the filter splits tokens at letter case transitions. For example: camelCase -> [ camel, Case ]. Defaults to `true`. */ split_on_case_change?: boolean @@ -219,18 +219,18 @@ export class AsciiFoldingTokenFilter extends TokenFilterBase { export class CommonGramsTokenFilter extends TokenFilterBase { type: 'common_grams' /** A list of tokens. The filter generates bigrams for these tokens. - * Either this or the `common_words_path` parameter is required. */ + * Either this or the `common_words_path` parameter is required. */ common_words?: string[] /** Path to a file containing a list of tokens. The filter generates bigrams for these tokens. - * This path must be absolute or relative to the `config` location. The file must be UTF-8 encoded. Each token in the file must be separated by a line break. - * Either this or the `common_words` parameter is required. 
*/ + * This path must be absolute or relative to the `config` location. The file must be UTF-8 encoded. Each token in the file must be separated by a line break. + * Either this or the `common_words` parameter is required. */ common_words_path?: string /** If `true`, matching for common words is case-insensitive. Defaults to `false`. */ ignore_case?: boolean /** If `true`, the filter excludes the following tokens from the output: - * - Unigrams for common words - * - Unigrams for terms followed by common words - * Defaults to `false`. We recommend enabling this parameter for search analyzers. */ + * - Unigrams for common words + * - Unigrams for terms followed by common words + * Defaults to `false`. We recommend enabling this parameter for search analyzers. */ query_mode?: boolean } @@ -245,13 +245,13 @@ export class ConditionTokenFilter extends TokenFilterBase { export class ElisionTokenFilter extends TokenFilterBase { type: 'elision' /** List of elisions to remove. - * To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed. - * For custom `elision` filters, either this parameter or `articles_path` must be specified. */ + * To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed. + * For custom `elision` filters, either this parameter or `articles_path` must be specified. */ articles?: string[] /** Path to a file that contains a list of elisions to remove. - * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each elision in the file must be separated by a line break. - * To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed. - * For custom `elision` filters, either this parameter or `articles` must be specified. */ + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each elision in the file must be separated by a line break. + * To be removed, the elision must be at the beginning of a token and be immediately followed by an apostrophe. Both the elision and apostrophe are removed. + * For custom `elision` filters, either this parameter or `articles` must be specified. */ articles_path?: string /** If `true`, elision matching is case insensitive. If `false`, elision matching is case sensitive. Defaults to `false`. */ articles_case?: Stringified @@ -270,10 +270,10 @@ export class HunspellTokenFilter extends TokenFilterBase { /** If `true`, duplicate tokens are removed from the filter’s output. Defaults to `true`. */ dedup?: boolean /** One or more `.dic` files (e.g., `en_US.dic`, `my_custom.dic`) to use for the Hunspell dictionary. - * By default, the `hunspell` filter uses all `.dic` files in the `<$ES_PATH_CONF>/hunspell/` directory specified using the `lang`, `language`, or `locale` parameter. */ + * By default, the `hunspell` filter uses all `.dic` files in the `<$ES_PATH_CONF>/hunspell/` directory specified using the `lang`, `language`, or `locale` parameter. */ dictionary?: string /** Locale directory used to specify the `.aff` and `.dic` files for a Hunspell dictionary. - * @aliases lang, language */ + * @aliases lang, language */ locale: string /** If `true`, only the longest stemmed version of each token is included in the output.
If `false`, all stemmed versions of the token are included. Defaults to `false`. */ longest_only?: boolean } @@ -295,13 +295,13 @@ export class KeepTypesTokenFilter extends TokenFilterBase { export class KeepWordsTokenFilter extends TokenFilterBase { type: 'keep' /** List of words to keep. Only tokens that match words in this list are included in the output. - * Either this parameter or `keep_words_path` must be specified. */ + * Either this parameter or `keep_words_path` must be specified. */ keep_words?: string[] /** If `true`, lowercase all keep words. Defaults to `false`. */ keep_words_case?: boolean /** Path to a file that contains a list of words to keep. Only tokens that match words in this list are included in the output. - * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break. - * Either this parameter or `keep_words` must be specified. */ + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break. + * Either this parameter or `keep_words` must be specified. */ keep_words_path?: string } @@ -310,14 +310,14 @@ export class KeywordMarkerTokenFilter extends TokenFilterBase { /** If `true`, matching for the `keywords` and `keywords_path` parameters ignores letter case. Defaults to `false`. */ ignore_case?: boolean /** Array of keywords. Tokens that match these keywords are not stemmed. - * This parameter, `keywords_path`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */ + * This parameter, `keywords_path`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */ keywords?: string | string[] /** Path to a file that contains a list of keywords. Tokens that match these keywords are not stemmed. - * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break. - * This parameter, `keywords`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */ + * This path must be absolute or relative to the `config` location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break. + * This parameter, `keywords`, or `keywords_pattern` must be specified. You cannot specify this parameter and `keywords_pattern`. */ keywords_path?: string /** Java regular expression used to match tokens. Tokens that match this expression are marked as keywords and not stemmed. - * This parameter, `keywords`, or `keywords_path` must be specified. You cannot specify this parameter and `keywords` or `keywords_path`. */ + * This parameter, `keywords`, or `keywords_path` must be specified. You cannot specify this parameter and `keywords` or `keywords_path`. */ keywords_pattern?: string } @@ -509,7 +509,7 @@ export class MinHashTokenFilter extends TokenFilterBase { /** Number of ways to hash each token in the stream. Defaults to `1`. */ hash_count?: integer /** Number of hashes to keep from each bucket. Defaults to `1`. - * Hashes are retained by ascending size, starting with the bucket’s smallest hash first. */ + * Hashes are retained by ascending size, starting with the bucket’s smallest hash first.
*/ hash_set_size?: integer /** If `true`, the filter fills empty buckets with the value of the first non-empty bucket to its circular right if the `hash_set_size` is `1`. If the `bucket_count` argument is greater than 1, this parameter defaults to `true`. Otherwise, this parameter defaults to `false`. */ with_rotation?: boolean
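
A quick way to sanity-check the new definitions is to write out a payload they should accept. The sketch below is illustrative only and not part of the patch: the filter names (`my_minhash`, `my_cjk_bigram`) and the surrounding settings object are hypothetical, while the `type` values, field names, and defaults are taken directly from the classes added above.

// Hypothetical index-settings fragment exercising two of the new filter types.
const settings = {
  analysis: {
    filter: {
      // MinHashTokenFilter: all fields are optional; bucket_count, hash_count,
      // and hash_set_size are shown at their documented defaults.
      my_minhash: {
        type: 'min_hash',
        bucket_count: 512, // number of buckets to which hashes are assigned
        hash_count: 1, // number of ways to hash each token in the stream
        hash_set_size: 1, // hashes kept per bucket, smallest first
        with_rotation: true // fill empty buckets from the circular right
      },
      // CjkBigramTokenFilter: disable bigrams for two scripts and also emit unigrams.
      my_cjk_bigram: {
        type: 'cjk_bigram',
        ignored_scripts: ['hiragana', 'katakana'],
        output_unigrams: true
      }
    }
  }
}

Note that `with_rotation` already defaults to `true` whenever `bucket_count` is greater than 1, so setting it explicitly here only pins the documented behavior for a `hash_set_size` of `1`.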