From 163cf07699f2ec7ef54fa3b55161198d5533e00b Mon Sep 17 00:00:00 2001 From: Justin Date: Sat, 2 Nov 2024 11:17:15 -0400 Subject: [PATCH 1/5] applied black formatting --- plugins/lute-thai/README_PyPi.md | 35 +++++++ plugins/lute-thai/definition.yaml | 19 ++++ .../lute-thai/lute_thai_parser/__init__.py | 5 + plugins/lute-thai/lute_thai_parser/parser.py | 71 ++++++++++++++ plugins/lute-thai/pyproject.toml | 24 +++++ plugins/lute-thai/requirements.txt | 5 + plugins/lute-thai/tests/__init__.py | 0 plugins/lute-thai/tests/conftest.py | 37 ++++++++ plugins/lute-thai/tests/test_ThaiParser.py | 94 +++++++++++++++++++ 9 files changed, 290 insertions(+) create mode 100644 plugins/lute-thai/README_PyPi.md create mode 100644 plugins/lute-thai/definition.yaml create mode 100644 plugins/lute-thai/lute_thai_parser/__init__.py create mode 100644 plugins/lute-thai/lute_thai_parser/parser.py create mode 100644 plugins/lute-thai/pyproject.toml create mode 100644 plugins/lute-thai/requirements.txt create mode 100644 plugins/lute-thai/tests/__init__.py create mode 100644 plugins/lute-thai/tests/conftest.py create mode 100644 plugins/lute-thai/tests/test_ThaiParser.py diff --git a/plugins/lute-thai/README_PyPi.md b/plugins/lute-thai/README_PyPi.md new file mode 100644 index 000000000..b8fab8359 --- /dev/null +++ b/plugins/lute-thai/README_PyPi.md @@ -0,0 +1,35 @@ +# `lute3-mandarin` + +A Mandarin parser for Lute (`lute3`) using the `jieba` library, and +`pypinyin` for readings. + +## Installation + +See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html). + +## Usage + +When this parser is installed, you can add "Mandarin Chinese" as a +language to Lute, which comes with a simple story. + +## Parsing exceptions + +Sometimes `jieba` groups too many characters together when parsing. +For example, it returns "清华大学" as a single word of four +characters, which might not be correct. 
+ +You can specify how Lute should correct these cases by adding some +simple "rules" to the file +`plugins/lute_mandarin/parser_exceptions.txt` found in your Lute +`data` directory. This file is automatically created when Lute +starts. Each rule contains the characters of the word as parsed by +`jieba`, with regular commas added where the word should be split. + +Some examples: + +| File content | Results when parsing "清华大学" | +| --- | --- | +| (empty file) | "清华大学" | +|
清华,大学
| Two tokens, "清华" and "大学" (the single token is split in two) | +|
清,华,大,学
| Four tokens, "清", "华", "大", "学" | +|
清华,大学
大,学
| Three tokens, "清华", "大, "学" (results are recursively broken down if rules are found) | diff --git a/plugins/lute-thai/definition.yaml b/plugins/lute-thai/definition.yaml new file mode 100644 index 000000000..35221db41 --- /dev/null +++ b/plugins/lute-thai/definition.yaml @@ -0,0 +1,19 @@ +name: Thai +dictionaries: + - for: terms + type: embedded + url: https://dict.com/thai-english/### + - for: terms + type: popup + url: https://glosbe.com/th/en/### + - for: sentences + type: popup + url: https://www.bing.com/translator/?from=th&to=en&text=### +show_romanization: true +# right_to_left: + +parser_type: lute_thai +# character_substitutions: +split_sentences: ฯ! +# split_sentence_exceptions: +word_chars: ก-๛ diff --git a/plugins/lute-thai/lute_thai_parser/__init__.py b/plugins/lute-thai/lute_thai_parser/__init__.py new file mode 100644 index 000000000..d7f4171d6 --- /dev/null +++ b/plugins/lute-thai/lute_thai_parser/__init__.py @@ -0,0 +1,5 @@ +""" +Lute Thai Parser +""" + +__version__ = "0.0.3" diff --git a/plugins/lute-thai/lute_thai_parser/parser.py b/plugins/lute-thai/lute_thai_parser/parser.py new file mode 100644 index 000000000..4018c7aec --- /dev/null +++ b/plugins/lute-thai/lute_thai_parser/parser.py @@ -0,0 +1,71 @@ +""" +Parsing using pythainlp + +Includes classes: + +- ThaiParser + +""" + +import re +import os +import pythainlp + +from typing import List + +from pythainlp.transliterate import romanize + +from lute.parse.base import ParsedToken, AbstractParser + + +class ThaiParser(AbstractParser): + """ + A parser for Thai that uses the pythainlp library for text segmentation. + + The user can add some exceptions to the "parsing_exceptions.txt" + data file. + """ + + @classmethod + def name(cls): + return "Lute Thai" + + @classmethod + def uses_data_directory(cls): + "Uses the data_directory (defined in the AbstractParser)." + return False + + # @classmethod + # def init_data_directory(cls): + # "Set up necessary files." 
+ # pass + + def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]: + """ + Returns ParsedToken array for given language. + """ + text = text.replace("\r\n", "\n") + + words = pythainlp.word_tokenize(text) + tokens = [] + pattern = f"[{language.word_characters}]" + for word in words: + is_word_char = re.match(pattern, word) is not None + is_end_of_sentence = word in language.regexp_split_sentences + if is_end_of_sentence: + is_word_char = False + if word == "\n": + word = "¶" + if word == "¶": + is_word_char = False + is_end_of_sentence = True + t = ParsedToken(word, is_word_char, is_end_of_sentence) + tokens.append(t) + return tokens + + def get_reading(self, text: str): # pylint: disable=unused-argument + """ + Get the pronunciation for the given text. For most + languages, this can't be automated. + """ + return None diff --git a/plugins/lute-thai/pyproject.toml b/plugins/lute-thai/pyproject.toml new file mode 100644 index 000000000..ca68c3483 --- /dev/null +++ b/plugins/lute-thai/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["flit_core >=3.2,<4"] +build-backend = "flit_core.buildapi" + +[tool.flit.module] +name = "lute_thai_parser" + +[project] +name = "lute3-thai" +dynamic = ['version'] +description = "Learning Using Texts - Thai Parser" +requires-python = ">=3.8" +authors = [ + {name = "Justin Dom"} +] +readme = "README_PyPi.md" + +dependencies = [ + "lute3>=3.4.2", + "pythainlp==5.0.4" +] + +[project.entry-points."lute.plugin.parse"] +lute_thai = "lute_thai_parser.parser:ThaiParser" diff --git a/plugins/lute-thai/requirements.txt b/plugins/lute-thai/requirements.txt new file mode 100644 index 000000000..2143a93c8 --- /dev/null +++ b/plugins/lute-thai/requirements.txt @@ -0,0 +1,5 @@ +# Required dependency for base classes. +lute3>=3.4.2 + +# TODO -- extra requirements here. 
+pythainlp==5.0.4 diff --git a/plugins/lute-thai/tests/__init__.py b/plugins/lute-thai/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/plugins/lute-thai/tests/conftest.py b/plugins/lute-thai/tests/conftest.py new file mode 100644 index 000000000..b2dd6ddc2 --- /dev/null +++ b/plugins/lute-thai/tests/conftest.py @@ -0,0 +1,37 @@ +""" +Common fixtures used by many tests. +""" + +import os +import yaml +import pytest + + +from lute.parse.registry import init_parser_plugins + +from lute.models.language import Language + + +def pytest_sessionstart(session): # pylint: disable=unused-argument + """ + Initialize parser list + """ + init_parser_plugins() + + +def _get_test_language(): + """ + Retrieve the language definition file for testing this plugin from definition.yaml + """ + thisdir = os.path.dirname(os.path.realpath(__file__)) + definition_file = os.path.join(thisdir, "..", "definition.yaml") + with open(definition_file, "r", encoding="utf-8") as df: + d = yaml.safe_load(df) + lang = Language.from_dict(d) + return lang + + +# TODO fix name +@pytest.fixture(name="thai") +def fixture_thai(): + return _get_test_language() diff --git a/plugins/lute-thai/tests/test_ThaiParser.py b/plugins/lute-thai/tests/test_ThaiParser.py new file mode 100644 index 000000000..697d00b8c --- /dev/null +++ b/plugins/lute-thai/tests/test_ThaiParser.py @@ -0,0 +1,94 @@ +""" +ThaiParser tests. +""" + +# TODO fix names, activate tests. + +import pytest + +# pylint: disable=wrong-import-order +from lute.models.term import Term +from lute.parse.base import ParsedToken + +# TODO fix name +from lute_thai_parser.parser import ThaiParser + + +def test_dummy_test(): + "A dummy test so that pytest doesn't complain in github ci." + s = "Hello" + assert s == "Hello", "TODO - fix these tests for your parser :-)" + + +def test_token_count(thai): + """ + token_count checks. 
+ """ + cases = [ + ("สวัสดี", 1), + ("ลาก่อน", 1), + ("ฉันรักคุณ", 3), + ("ฉันกำลังเรียนภาษาไทย", 4), + ] + for text, expected_count in cases: + t = Term(thai, text) + assert t.token_count == expected_count, text + assert t.text_lc == t.text, "case" + + +def assert_tokens_equals(text, lang, expected): + """ + Parsing a text using a language should give the expected parsed tokens. + + expected is given as array of: + [ original_text, is_word, is_end_of_sentence ] + """ + p = ThaiParser() + actual = p.get_parsed_tokens(text, lang) + expected = [ParsedToken(*a) for a in expected] + assert [str(a) for a in actual] == [str(e) for e in expected] + + +def test_end_of_sentence_stored_in_parsed_tokens(thai): + """ + ParsedToken is marked as EOS=True at ends of sentences. + """ + s = "สวัสดีทุกคน! ฉันเรียนภาษาไทยมา2เดือนแล้วฯ" + + expected = [ + ("สวัสดี", True), + ("ทุกคน", True), + ("!", False, True), + (" ", False), + ("ฉัน", True), + ("เรียน", True), + ("ภาษาไทย", True), + ("มา", True), + ("2", False), + ("เดือน", True), + ("แล้ว", True, False), + ("ฯ", False, True), + ] + assert_tokens_equals(s, thai, expected) + + +def test_carriage_returns_treated_as_reverse_p_character(thai): + """ + Returns need to be marked with the backwards P for rendering etc. 
+ """ + s = "สวัสดีทุกคน!\nฉันเรียนภาษาไทยมา2เดือนแล้ว" + + expected = [ + ("สวัสดี", True), + ("ทุกคน", True), + ("!", False, True), + ("¶", False, True), + ("ฉัน", True), + ("เรียน", True), + ("ภาษาไทย", True), + ("มา", True), + ("2", False), + ("เดือน", True), + ("แล้ว", True, False), + ] + assert_tokens_equals(s, thai, expected) From 451b3b77659669f33f7721a282f0af76820a6c8f Mon Sep 17 00:00:00 2001 From: Justin Date: Sat, 2 Nov 2024 11:33:53 -0400 Subject: [PATCH 2/5] added question mark sentence delimiter --- plugins/lute-thai/definition.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/lute-thai/definition.yaml b/plugins/lute-thai/definition.yaml index 35221db41..a21313dab 100644 --- a/plugins/lute-thai/definition.yaml +++ b/plugins/lute-thai/definition.yaml @@ -14,6 +14,6 @@ show_romanization: true parser_type: lute_thai # character_substitutions: -split_sentences: ฯ! +split_sentences: ฯ!? # split_sentence_exceptions: word_chars: ก-๛ From 96eb4d861f909062ff1f53d5fa03f95dd8be1389 Mon Sep 17 00:00:00 2001 From: Justin Date: Sat, 2 Nov 2024 11:39:22 -0400 Subject: [PATCH 3/5] edited README files --- plugins/lute-thai/README.md | 5 ++++ plugins/lute-thai/README_PyPi.md | 29 ++------------------ plugins/lute-thai/lute_thai_parser/parser.py | 2 -- plugins/lute-thai/tests/test_ThaiParser.py | 8 ------ 4 files changed, 8 insertions(+), 36 deletions(-) create mode 100644 plugins/lute-thai/README.md diff --git a/plugins/lute-thai/README.md b/plugins/lute-thai/README.md new file mode 100644 index 000000000..2c2efd006 --- /dev/null +++ b/plugins/lute-thai/README.md @@ -0,0 +1,5 @@ +The Lute Thai parser. + +See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes. + +See the [Pypi readme](./README_PyPi.md) for extra config notes. 
diff --git a/plugins/lute-thai/README_PyPi.md b/plugins/lute-thai/README_PyPi.md index b8fab8359..fc8d9ace9 100644 --- a/plugins/lute-thai/README_PyPi.md +++ b/plugins/lute-thai/README_PyPi.md @@ -1,7 +1,6 @@ -# `lute3-mandarin` +# `lute3-thai` -A Mandarin parser for Lute (`lute3`) using the `jieba` library, and -`pypinyin` for readings. +A Thai parser for Lute (`lute3`) using the `pythainlp` library. ## Installation @@ -9,27 +8,5 @@ See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html ## Usage -When this parser is installed, you can add "Mandarin Chinese" as a +When this parser is installed, you can add "Thai" as a language to Lute, which comes with a simple story. - -## Parsing exceptions - -Sometimes `jieba` groups too many characters together when parsing. -For example, it returns "清华大学" as a single word of four -characters, which might not be correct. - -You can specify how Lute should correct these cases by adding some -simple "rules" to the file -`plugins/lute_mandarin/parser_exceptions.txt` found in your Lute -`data` directory. This file is automatically created when Lute -starts. Each rule contains the characters of the word as parsed by -`jieba`, with regular commas added where the word should be split. - -Some examples: - -| File content | Results when parsing "清华大学" | -| --- | --- | -| (empty file) | "清华大学" | -|
清华,大学
| Two tokens, "清华" and "大学" (the single token is split in two) | -|
清,华,大,学
| Four tokens, "清", "华", "大", "学" | -|
清华,大学
大,学
| Three tokens, "清华", "大, "学" (results are recursively broken down if rules are found) | diff --git a/plugins/lute-thai/lute_thai_parser/parser.py b/plugins/lute-thai/lute_thai_parser/parser.py index 4018c7aec..b1ca5a06a 100644 --- a/plugins/lute-thai/lute_thai_parser/parser.py +++ b/plugins/lute-thai/lute_thai_parser/parser.py @@ -13,8 +13,6 @@ from typing import List -from pythainlp.transliterate import romanize - from lute.parse.base import ParsedToken, AbstractParser diff --git a/plugins/lute-thai/tests/test_ThaiParser.py b/plugins/lute-thai/tests/test_ThaiParser.py index 697d00b8c..dc98121ea 100644 --- a/plugins/lute-thai/tests/test_ThaiParser.py +++ b/plugins/lute-thai/tests/test_ThaiParser.py @@ -2,7 +2,6 @@ ThaiParser tests. """ -# TODO fix names, activate tests. import pytest @@ -10,16 +9,9 @@ from lute.models.term import Term from lute.parse.base import ParsedToken -# TODO fix name from lute_thai_parser.parser import ThaiParser -def test_dummy_test(): - "A dummy test so that pytest doesn't complain in github ci." - s = "Hello" - assert s == "Hello", "TODO - fix these tests for your parser :-)" - - def test_token_count(thai): """ token_count checks. From da2f4917199ba40f0bc5bd67defeeff907983ae6 Mon Sep 17 00:00:00 2001 From: Justin Date: Sat, 2 Nov 2024 11:48:54 -0400 Subject: [PATCH 4/5] removed TODO comments --- plugins/lute-thai/requirements.txt | 2 +- plugins/lute-thai/tests/conftest.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/plugins/lute-thai/requirements.txt b/plugins/lute-thai/requirements.txt index 2143a93c8..883c5110a 100644 --- a/plugins/lute-thai/requirements.txt +++ b/plugins/lute-thai/requirements.txt @@ -1,5 +1,5 @@ # Required dependency for base classes. lute3>=3.4.2 -# TODO -- extra requirements here. +# extra requirements here. 
pythainlp==5.0.4 diff --git a/plugins/lute-thai/tests/conftest.py b/plugins/lute-thai/tests/conftest.py index b2dd6ddc2..d40d2cf04 100644 --- a/plugins/lute-thai/tests/conftest.py +++ b/plugins/lute-thai/tests/conftest.py @@ -31,7 +31,6 @@ def _get_test_language(): return lang -# TODO fix name @pytest.fixture(name="thai") def fixture_thai(): return _get_test_language() From 52c681ebed19360e9fde538d1407b774eb46c437 Mon Sep 17 00:00:00 2001 From: Justin Date: Sat, 2 Nov 2024 15:56:07 -0400 Subject: [PATCH 5/5] added wiktionary --- plugins/lute-thai/definition.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/plugins/lute-thai/definition.yaml b/plugins/lute-thai/definition.yaml index a21313dab..a6e8a2fe3 100644 --- a/plugins/lute-thai/definition.yaml +++ b/plugins/lute-thai/definition.yaml @@ -3,6 +3,9 @@ dictionaries: - for: terms type: embedded url: https://dict.com/thai-english/### + - for: terms + type: embedded + url: https://en.wiktionary.org/wiki/### - for: terms type: popup url: https://glosbe.com/th/en/###