From 5d1391f62b80549cfcbc07569695a5d4b6b7615f Mon Sep 17 00:00:00 2001 From: JimBim Date: Sun, 1 Mar 2026 20:25:47 +0000 Subject: [PATCH 1/3] Improve Arabic preprocessors --- .../language/ar/arabic-text-preprocessors.js | 50 ++++++++++++++++++- ext/js/language/language-descriptors.js | 3 ++ types/ext/language-descriptors.d.ts | 2 + 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js index a1d1b4ee42..167350dc12 100644 --- a/ext/js/language/ar/arabic-text-preprocessors.js +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -15,6 +15,45 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ +/** + * Generates all possible combinations of replacing or not replacing + * each occurrence of a pattern within a string. + * + * For a pattern that matches `n` times, this function returns `2^n` + * strings representing every possible combination of replacements. + * + * Note: this implementation only works for n < 31, as bitwise shifting is used, and + * JavaScript operations operate on 32-bit signed integers. However, this function should + * not be used if such large values of n are expected anyway, due to its inherent + * exponential growth. + * @param {string} str + * @param {string|RegExp} pattern + * @param {string} replacement + * @returns {string[]} + */ +function generateReplacementCombinations(str, pattern, replacement) { + const regex = pattern instanceof RegExp ? pattern : new RegExp(pattern, 'g'); + const matches = [...str.matchAll(regex)]; + const n = matches.length; + // Total of 2^n possible combinations + const total = 1 << n; + + const results = []; + for (let mask = 0; mask < total; mask++) { + let i = 0; + + const result = str.replaceAll(regex, (match) => { + // Only replace ith occurrence if ith bit in bitmask is set to 1 + const shouldReplace = mask & (1 << i++); + return shouldReplace ? 
replacement : match; + }); + + results.push(result); + } + + return results; +} + const optionalDiacritics = [ '\u0618', // Small Fatha '\u0619', // Small Damma @@ -61,14 +100,21 @@ export const normalizeUnicode = { export const addHamzaTop = { name: 'Add Hamza to top of Alif', description: 'اكبر → أكبر', - process: (text) => [text, text.replace('ا', 'أ')], + process: (text) => generateReplacementCombinations(text, 'ا', 'أ'), }; /** @type {import('language').TextProcessor} */ export const addHamzaBottom = { name: 'Add Hamza to bottom of Alif', description: 'اسلام → إسلام', - process: (text) => [text, text.replace('ا', 'إ')], + process: (text) => generateReplacementCombinations(text, 'ا', 'إ'), +}; + +/** @type {import('language').TextProcessor} */ +export const addMadd = { + name: 'Add Madd to Alif', + description: 'الان → الآن', + process: (text) => generateReplacementCombinations(text, 'ا', 'آ'), }; /** @type {import('language').TextProcessor} */ diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 4343b3e057..f2178f2435 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -19,6 +19,7 @@ import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-prep import { addHamzaBottom, addHamzaTop, + addMadd, convertAlifMaqsuraToYaa, convertHaToTaMarbuta, normalizeUnicode, @@ -94,6 +95,7 @@ const languageDescriptors = [ normalizeUnicode, addHamzaTop, addHamzaBottom, + addMadd, convertAlifMaqsuraToYaa, }, languageTransforms: arabicTransforms, @@ -109,6 +111,7 @@ const languageDescriptors = [ normalizeUnicode, addHamzaTop, addHamzaBottom, + addMadd, convertAlifMaqsuraToYaa, convertHaToTaMarbuta, }, diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index b3c44e9fef..aa277526cf 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -82,6 +82,7 @@ type AllTextProcessors = { normalizeUnicode: 
TextProcessor; addHamzaTop: TextProcessor; addHamzaBottom: TextProcessor; + addMadd: TextProcessor; convertAlifMaqsuraToYaa: TextProcessor; }; }; @@ -92,6 +93,7 @@ type AllTextProcessors = { normalizeUnicode: TextProcessor; addHamzaTop: TextProcessor; addHamzaBottom: TextProcessor; + addMadd: TextProcessor; convertAlifMaqsuraToYaa: TextProcessor; convertHaToTaMarbuta: TextProcessor; }; From 3df9bec7f094a61cdb06cf17269bfcd275c4782a Mon Sep 17 00:00:00 2001 From: JimBim Date: Sat, 11 Apr 2026 16:33:57 +0100 Subject: [PATCH 2/3] Combine alif substitutions into single preprocessor --- .../language/ar/arabic-text-preprocessors.js | 57 ++++++++----------- ext/js/language/language-descriptors.js | 12 +--- types/ext/language-descriptors.d.ts | 8 +-- 3 files changed, 28 insertions(+), 49 deletions(-) diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js index 167350dc12..24411418e9 100644 --- a/ext/js/language/ar/arabic-text-preprocessors.js +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -16,36 +16,39 @@ */ /** - * Generates all possible combinations of replacing or not replacing - * each occurrence of a pattern within a string. + * Generates all possible combinations of replacing each occurrence of a + * pattern with each of the options provided in replacements. * - * For a pattern that matches `n` times, this function returns `2^n` - * strings representing every possible combination of replacements. + * For a pattern that matches `n` times, and a list of `m` replacements, this + * function returns `m^n` strings representing every possible combination of replacements. * - * Note: this implementation only works for n < 31, as bitwise shifting is used, and - * JavaScript operations operate on 32-bit signed integers. 
However, this function should - * not be used if such large values of n are expected anyway, due to its inherent + * Note: This function should not be used for large values of n and m, due to its inherent * exponential growth. * @param {string} str * @param {string|RegExp} pattern - * @param {string} replacement + * @param {string[]} replacements * @returns {string[]} */ -function generateReplacementCombinations(str, pattern, replacement) { +function generateReplacementCombinations(str, pattern, replacements) { const regex = pattern instanceof RegExp ? pattern : new RegExp(pattern, 'g'); + const matches = [...str.matchAll(regex)]; const n = matches.length; - // Total of 2^n possible combinations - const total = 1 << n; + + const m = replacements.length; // number of choices per match + const total = m ** n; // m^n combinations const results = []; - for (let mask = 0; mask < total; mask++) { - let i = 0; - const result = str.replaceAll(regex, (match) => { - // Only replace ith occurrence if ith bit in bitmask is set to 1 - const shouldReplace = mask & (1 << i++); - return shouldReplace ? 
replacement : match; + for (let combo = 0; combo < total; combo++) { + let current = combo; + + const result = str.replaceAll(regex, (_) => { + // Pick option using base-m digit + const choiceIndex = current % m; + current = Math.floor(current / m); + + return replacements[choiceIndex]; }); results.push(result); @@ -97,24 +100,10 @@ export const normalizeUnicode = { }; /** @type {import('language').TextProcessor} */ -export const addHamzaTop = { - name: 'Add Hamza to top of Alif', +export const substituteAlif = { + name: 'Substitutes plain alifs with its variations (alif with hamza, alif with madd)', description: 'اكبر → أكبر', - process: (text) => generateReplacementCombinations(text, 'ا', 'أ'), -}; - -/** @type {import('language').TextProcessor} */ -export const addHamzaBottom = { - name: 'Add Hamza to bottom of Alif', - description: 'اسلام → إسلام', - process: (text) => generateReplacementCombinations(text, 'ا', 'إ'), -}; - -/** @type {import('language').TextProcessor} */ -export const addMadd = { - name: 'Add Madd to Alif', - description: 'الان → الآن', - process: (text) => generateReplacementCombinations(text, 'ا', 'آ'), + process: (text) => generateReplacementCombinations(text, 'ا', ['ا', 'أ', 'إ', 'آ']), }; /** @type {import('language').TextProcessor} */ diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 4de14ef3a0..c1fb249413 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -17,9 +17,7 @@ import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-preprocessors.js'; import { - addHamzaBottom, - addHamzaTop, - addMadd, + substituteAlif, convertAlifMaqsuraToYaa, convertHaToTaMarbuta, normalizeUnicode, @@ -94,9 +92,7 @@ const languageDescriptors = [ removeArabicScriptDiacritics, removeTatweel, normalizeUnicode, - addHamzaTop, - addHamzaBottom, - addMadd, + substituteAlif, convertAlifMaqsuraToYaa, }, languageTransforms: arabicTransforms, @@ 
-110,9 +106,7 @@ const languageDescriptors = [ removeArabicScriptDiacritics, removeTatweel, normalizeUnicode, - addHamzaTop, - addHamzaBottom, - addMadd, + substituteAlif, convertAlifMaqsuraToYaa, convertHaToTaMarbuta, }, diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 5056b188ad..803ad1bbfe 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -80,9 +80,7 @@ type AllTextProcessors = { removeArabicScriptDiacritics: TextProcessor; removeTatweel: TextProcessor; normalizeUnicode: TextProcessor; - addHamzaTop: TextProcessor; - addHamzaBottom: TextProcessor; - addMadd: TextProcessor; + substituteAlif: TextProcessor; convertAlifMaqsuraToYaa: TextProcessor; }; }; @@ -91,9 +89,7 @@ type AllTextProcessors = { removeArabicScriptDiacritics: TextProcessor; removeTatweel: TextProcessor; normalizeUnicode: TextProcessor; - addHamzaTop: TextProcessor; - addHamzaBottom: TextProcessor; - addMadd: TextProcessor; + substituteAlif: TextProcessor; convertAlifMaqsuraToYaa: TextProcessor; convertHaToTaMarbuta: TextProcessor; }; From b72a9fae07e8503a46e18ef4d10e92fa0fedefa1 Mon Sep 17 00:00:00 2001 From: JimBim Date: Sat, 11 Apr 2026 17:04:04 +0100 Subject: [PATCH 3/3] Update comments --- ext/js/language/ar/arabic-text-preprocessors.js | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js index a68813e738..c0f08787d8 100644 --- a/ext/js/language/ar/arabic-text-preprocessors.js +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -40,12 +40,17 @@ function generateReplacementCombinations(str, pattern, replacements) { const results = []; - for (let combo = 0; combo < total; combo++) { - let current = combo; - + for (let combination = 0; combination < total; combination++) { + // Treat `combination` as a base-m number with n digits. 
+ // Starting from the least significant digit (position 0) and working up to the most + // significant digit (position n-1), we loop through each digit of `combination`. The digit + // at position i is between 0 and m-1 and selects which of the m replacement choices is + // substituted for the ith occurrence of `pattern`. + let current = combination; const result = str.replaceAll(regex, (_) => { - // Pick option using base-m digit + // Pick replacement choice using the value of the current least significant digit const choiceIndex = current % m; + // Pop the least significant digit current = Math.floor(current / m); return replacements[choiceIndex];