Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 50 additions & 10 deletions ext/js/language/ar/arabic-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,53 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

/**
 * Default upper bound on the number of variants produced by
 * `generateReplacementCombinations`. The full result set grows as `m^n`, so an
 * unbounded enumeration can stall or exhaust memory on inputs with many matches.
 */
const DEFAULT_MAX_REPLACEMENT_COMBINATIONS = 1024;

/**
 * Generates combinations of replacing each occurrence of a pattern with each of
 * the options provided in `replacements`.
 *
 * For a pattern that matches `n` times, and a list of `m` replacements, there are
 * `m^n` possible combinations. At most `maxCombinations` of them are returned.
 *
 * Combinations are enumerated as base-`m` numbers with `n` digits, where the least
 * significant digit selects the replacement for the first match. When the result
 * is truncated, the returned prefix therefore varies the earliest matches first,
 * while later matches stay at `replacements[0]`.
 *
 * Replacement strings are inserted verbatim; sequences such as `$&` are not
 * interpreted.
 * @param {string} str Text to search.
 * @param {string|RegExp} pattern A RegExp, or a string that is used as regular
 *   expression source. Every match in `str` is considered regardless of whether a
 *   supplied RegExp has the `g` flag; the supplied RegExp object is not modified.
 * @param {string[]} replacements Choices for each match.
 * @param {number} [maxCombinations] Maximum number of strings to return.
 * @returns {string[]} Unchanged `str` as the only element when there is no match
 *   (and `maxCombinations` is at least 1); an empty array when there are matches
 *   but `replacements` is empty.
 */
function generateReplacementCombinations(str, pattern, replacements, maxCombinations = DEFAULT_MAX_REPLACEMENT_COMBINATIONS) {
    // Always work on a fresh, global regex:
    // - `matchAll` throws a TypeError when given a RegExp without the `g` flag.
    // - A caller-owned RegExp may carry a non-zero `lastIndex`, which would make
    //   matching start in the middle of `str`.
    let regex;
    if (pattern instanceof RegExp) {
        const flags = pattern.flags.includes('g') ? pattern.flags : `${pattern.flags}g`;
        regex = new RegExp(pattern.source, flags);
    } else {
        regex = new RegExp(pattern, 'g');
    }

    // Split `str` once into the literal text preceding each match, plus the text after
    // the last match, so that each combination is built by plain concatenation.
    const segments = [];
    let lastEnd = 0;
    for (const match of str.matchAll(regex)) {
        // `index` is always set on `matchAll` results; the fallback only satisfies
        // typings that declare it as optional.
        const start = match.index ?? lastEnd;
        segments.push(str.slice(lastEnd, start));
        lastEnd = start + match[0].length;
    }
    const tail = str.slice(lastEnd);

    const n = segments.length; // number of matches
    const m = replacements.length; // number of choices per match
    // `m ** n` may be huge (or Infinity); clamp it before looping.
    const total = Math.min(m ** n, maxCombinations);

    const results = [];
    for (let combination = 0; combination < total; combination++) {
        // Treat `combination` as a base-m number with n digits. The digit at position i
        // (0 = least significant) is the index of the replacement used for the ith match.
        let current = combination;
        let result = '';
        for (const segment of segments) {
            // Pick the choice from the current least significant digit, then pop that digit.
            result += segment + replacements[current % m];
            current = Math.floor(current / m);
        }
        results.push(result + tail);
    }

    return results;
}

const optionalDiacritics = [
'\u0618', // Small Fatha
'\u0619', // Small Damma
Expand Down Expand Up @@ -58,17 +105,10 @@ export const normalizeUnicode = {
};

/** @type {import('language').TextProcessor} */
export const addHamzaTop = {
name: 'Add Hamza to top of Alif',
export const substituteAlif = {
name: 'Substitutes plain alifs with its variations (alif with hamza, alif with madd)',
description: 'اكبر → أكبر',
process: (text) => [text, text.replace('ا', 'أ')],
};

/** @type {import('language').TextProcessor} */
export const addHamzaBottom = {
name: 'Add Hamza to bottom of Alif',
description: 'اسلام → إسلام',
process: (text) => [text, text.replace('ا', 'إ')],
process: (text) => generateReplacementCombinations(text, 'ا', ['ا', 'أ', 'إ', 'آ']),
};

/** @type {import('language').TextProcessor} */
Expand Down
9 changes: 3 additions & 6 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@

import {removeSyriacScriptDiacritics} from './aii/assyrian-neo-aramaic-text-preprocessors.js';
import {
addHamzaBottom,
addHamzaTop,
substituteAlif,
convertAlifMaqsuraToYaa,
convertHaToTaMarbuta,
normalizeUnicode,
Expand Down Expand Up @@ -101,8 +100,7 @@ const languageDescriptors = [
removeArabicScriptDiacritics,
removeTatweel,
normalizeUnicode,
addHamzaTop,
addHamzaBottom,
substituteAlif,
convertAlifMaqsuraToYaa,
},
languageTransforms: arabicTransforms,
Expand All @@ -116,8 +114,7 @@ const languageDescriptors = [
removeArabicScriptDiacritics,
removeTatweel,
normalizeUnicode,
addHamzaTop,
addHamzaBottom,
substituteAlif,
convertAlifMaqsuraToYaa,
convertHaToTaMarbuta,
},
Expand Down
6 changes: 2 additions & 4 deletions types/ext/language-descriptors.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,7 @@ type AllTextProcessors = {
removeArabicScriptDiacritics: TextProcessor;
removeTatweel: TextProcessor;
normalizeUnicode: TextProcessor;
addHamzaTop: TextProcessor;
addHamzaBottom: TextProcessor;
substituteAlif: TextProcessor;
convertAlifMaqsuraToYaa: TextProcessor;
};
};
Expand All @@ -91,8 +90,7 @@ type AllTextProcessors = {
removeArabicScriptDiacritics: TextProcessor;
removeTatweel: TextProcessor;
normalizeUnicode: TextProcessor;
addHamzaTop: TextProcessor;
addHamzaBottom: TextProcessor;
substituteAlif: TextProcessor;
convertAlifMaqsuraToYaa: TextProcessor;
convertHaToTaMarbuta: TextProcessor;
};
Expand Down
Loading