From 21a5c8742b88b34af31655e1c04a940ab20f8c9f Mon Sep 17 00:00:00 2001 From: Cibu Johny Date: Mon, 10 Jun 2024 11:58:15 -0700 Subject: [PATCH] Tamil visual normalization rules added for flipped two-part vowel signs. Example: SIGN AA + SIGN EE -> SIGN OO PiperOrigin-RevId: 641977604 --- .../brahmic/data/Taml/visual_rewrite.textproto | 15 +++++++++++++++ .../brahmic/testdata/visual_norm.textproto | 4 ++++ 2 files changed, 19 insertions(+) diff --git a/nisaba/scripts/brahmic/data/Taml/visual_rewrite.textproto b/nisaba/scripts/brahmic/data/Taml/visual_rewrite.textproto index ccf2e9e3..e642818d 100644 --- a/nisaba/scripts/brahmic/data/Taml/visual_rewrite.textproto +++ b/nisaba/scripts/brahmic/data/Taml/visual_rewrite.textproto @@ -106,3 +106,18 @@ item { uname: ["SIGN AU", "SIGN I"] raw: "ௌி" to_uname: ["SIGN E", "LLA", "SIGN I"] to_raw: "ெளி" } + +# Flipped two-part vowel signs. +# The non-flipped sequence is covered by NFC. +item { + uname: ["SIGN AA", "SIGN E"] raw: "ாெ" + to_uname: ["SIGN O"] to_raw: "ொ" +} +item { + uname: ["SIGN AA", "SIGN EE"] raw: "ாே" + to_uname: ["SIGN OO"] to_raw: "ோ" +} +item { + uname: ["AU LENGTH MARK", "SIGN E"] raw: "ௗெ" + to_uname: ["SIGN AU"] to_raw: "ௌ" +} diff --git a/nisaba/scripts/brahmic/testdata/visual_norm.textproto b/nisaba/scripts/brahmic/testdata/visual_norm.textproto index e90eb288..ea1948f3 100644 --- a/nisaba/scripts/brahmic/testdata/visual_norm.textproto +++ b/nisaba/scripts/brahmic/testdata/visual_norm.textproto @@ -44,6 +44,10 @@ rewrite { rule: "SINH" input: "අපේ‍්‍රල්" output: "අප් # rewrite { rule: "TAML" input: "தமி​ழர்‌கள்‍" output: "தமிழர்கள்" } rewrite { rule: "TAML" input: "ஆக்‌ஷன்" output: "ஆக்‌ஷன்" } +rewrite { rule: "TAML" input: "காெள்" output: "கொள்" } +rewrite { rule: "TAML" input: "ப்ராேஷன்" output: "ப்ரோஷன்" } +rewrite { rule: "TAML" input: "சௗெந்தர்யம்" output: "சௌந்தர்யம்" } + rewrite { rule: "DEVA" input: "श्रीमान्‌को" output: "श्रीमान्‌को" } rewrite { rule: "DEVA" input: "गोल्‍डबर्ग" output: 
"गोल्डबर्ग" }