LuteOrg · jzohrab · Nov 2, 2024 · Nov 2, 2024 · Nov 2, 2024 · Nov 2, 2024
diff --git a/plugins/lute-thai/README.md b/plugins/lute-thai/README.md
@@ -0,0 +1,5 @@
+The Lute Thai parser.
+
+See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes.
+
+See the [PyPI README](./README_PyPi.md) for extra config notes.
diff --git a/plugins/lute-thai/README_PyPi.md b/plugins/lute-thai/README_PyPi.md
@@ -0,0 +1,12 @@
+# `lute3-thai`
+
+A Thai parser for Lute (`lute3`) using the `pythainlp` library.
+
+## Installation
+
+See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html).
+
+## Usage
+
+When this parser is installed, you can add "Thai" as a
+language to Lute, which comes with a simple story.
diff --git a/plugins/lute-thai/definition.yaml b/plugins/lute-thai/definition.yaml
@@ -0,0 +1,22 @@
+name: Thai
+dictionaries:
+  - for: terms
+    type: embedded
+    url: https://dict.com/thai-english/###
+  - for: terms
+    type: embedded
+    url: https://en.wiktionary.org/wiki/###
+  - for: terms
+    type: popup
+    url: https://glosbe.com/th/en/###
+  - for: sentences
+    type: popup
+    url: https://www.bing.com/translator/?from=th&to=en&text=###
+show_romanization: true
+# right_to_left:
+
+parser_type: lute_thai
+# character_substitutions:
+split_sentences: ฯ!?
+# split_sentence_exceptions:
+word_chars: ก-๛
diff --git a/plugins/lute-thai/lute_thai_parser/__init__.py b/plugins/lute-thai/lute_thai_parser/__init__.py
@@ -0,0 +1,5 @@
+"""
+Lute Thai Parser
+"""
+
+__version__ = "0.0.3"
diff --git a/plugins/lute-thai/lute_thai_parser/parser.py b/plugins/lute-thai/lute_thai_parser/parser.py
@@ -0,0 +1,69 @@
+"""
+Parsing using pythainlp
+
+Includes classes:
+
+- ThaiParser
+
+"""
+
+import re
+import os
+import pythainlp
+
+from typing import List
+
+from lute.parse.base import ParsedToken, AbstractParser
+
+
+class ThaiParser(AbstractParser):
+    """
+    A parser for Thai that uses the pythainlp library for text segmentation.
+
+    The user can add some exceptions to the "parsing_exceptions.txt"
+    data file.
+    """
+
+    @classmethod
+    def name(cls):
+        return "Lute Thai"
+
+    @classmethod
+    def uses_data_directory(cls):
+        "Uses the data_directory (defined in the AbstractParser)."
+        return False
+
+    # @classmethod
+    # def init_data_directory(cls):
+    #     "Set up necessary files."
+    #     pass
+
+    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
+        """
+        Returns ParsedToken array for given language.
+        """
+        text = text.replace("\r\n", "\n")
+
+        words = pythainlp.word_tokenize(text)
+        tokens = []
+        pattern = f"[{language.word_characters}]"
+        for word in words:
+            is_word_char = re.match(pattern, word) is not None
+            is_end_of_sentence = word in language.regexp_split_sentences
+            if is_end_of_sentence:
+                is_word_char = False
+            if word == "\n":
+                word = "¶"
+            if word == "¶":
+                is_word_char = False
+                is_end_of_sentence = True
+            t = ParsedToken(word, is_word_char, is_end_of_sentence)
+            tokens.append(t)
+        return tokens
+
+    def get_reading(self, text: str):  # pylint: disable=unused-argument
+        """
+        Get the pronunciation for the given text.  For most
+        languages, this can't be automated.
+        """
+        return None
diff --git a/plugins/lute-thai/pyproject.toml b/plugins/lute-thai/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = ["flit_core >=3.2,<4"]
+build-backend = "flit_core.buildapi"
+
+[tool.flit.module]
+name = "lute_thai_parser"
+
+[project]
+name = "lute3-thai"
+dynamic = ['version']
+description = "Learning Using Texts - Thai Parser"
+requires-python = ">=3.8"
+authors = [
+  {name = "Justin Dom"}
+]
+readme = "README_PyPi.md"
+
+dependencies = [
+  "lute3>=3.4.2",
+  "pythainlp==5.0.4"
+]
+
+[project.entry-points."lute.plugin.parse"]
+lute_thai = "lute_thai_parser.parser:ThaiParser"
diff --git a/plugins/lute-thai/requirements.txt b/plugins/lute-thai/requirements.txt
@@ -0,0 +1,5 @@
+# Required dependency for base classes.
+lute3>=3.4.2
+
+# extra requirements here.
+pythainlp==5.0.4
diff --git a/plugins/lute-thai/tests/__init__.py b/plugins/lute-thai/tests/__init__.py
diff --git a/plugins/lute-thai/tests/conftest.py b/plugins/lute-thai/tests/conftest.py
@@ -0,0 +1,36 @@
+"""
+Common fixtures used by many tests.
+"""
+
+import os
+import yaml
+import pytest
+
+
+from lute.parse.registry import init_parser_plugins
+
+from lute.models.language import Language
+
+
+def pytest_sessionstart(session):  # pylint: disable=unused-argument
+    """
+    Initialize parser list
+    """
+    init_parser_plugins()
+
+
+def _get_test_language():
+    """
+    Retrieve the language definition file for testing ths plugin from definition.yaml
+    """
+    thisdir = os.path.dirname(os.path.realpath(__file__))
+    definition_file = os.path.join(thisdir, "..", "definition.yaml")
+    with open(definition_file, "r", encoding="utf-8") as df:
+        d = yaml.safe_load(df)
+    lang = Language.from_dict(d)
+    return lang
+
+
+@pytest.fixture(name="thai")
+def fixture_thai():
+    return _get_test_language()
diff --git a/plugins/lute-thai/tests/test_ThaiParser.py b/plugins/lute-thai/tests/test_ThaiParser.py
@@ -0,0 +1,86 @@
+"""
+ThaiParser tests.
+"""
+
+
+import pytest
+
+# pylint: disable=wrong-import-order
+from lute.models.term import Term
+from lute.parse.base import ParsedToken
+
+from lute_thai_parser.parser import ThaiParser
+
+
+def test_token_count(thai):
+    """
+    token_count checks.
+    """
+    cases = [
+        ("สวัสดี", 1),
+        ("ลาก่อน", 1),
+        ("ฉันรักคุณ", 3),
+        ("ฉันกำลังเรียนภาษาไทย", 4),
+    ]
+    for text, expected_count in cases:
+        t = Term(thai, text)
+        assert t.token_count == expected_count, text
+        assert t.text_lc == t.text, "case"
+
+
+def assert_tokens_equals(text, lang, expected):
+    """
+    Parsing a text using a language should give the expected parsed tokens.
+
+    expected is given as array of:
+    [ original_text, is_word, is_end_of_sentence ]
+    """
+    p = ThaiParser()
+    actual = p.get_parsed_tokens(text, lang)
+    expected = [ParsedToken(*a) for a in expected]
+    assert [str(a) for a in actual] == [str(e) for e in expected]
+
+
+def test_end_of_sentence_stored_in_parsed_tokens(thai):
+    """
+    ParsedToken is marked as EOS=True at ends of sentences.
+    """
+    s = "สวัสดีทุกคน! ฉันเรียนภาษาไทยมา2เดือนแล้วฯ"
+
+    expected = [
+        ("สวัสดี", True),
+        ("ทุกคน", True),
+        ("!", False, True),
+        (" ", False),
+        ("ฉัน", True),
+        ("เรียน", True),
+        ("ภาษาไทย", True),
+        ("มา", True),
+        ("2", False),
+        ("เดือน", True),
+        ("แล้ว", True, False),
+        ("ฯ", False, True),
+    ]
+    assert_tokens_equals(s, thai, expected)
+
+
+def test_carriage_returns_treated_as_reverse_p_character(thai):
+    """
+    Returns need to be marked with the backwards P for rendering etc.
+    """
+    s = "สวัสดีทุกคน!\nฉันเรียนภาษาไทยมา2เดือนแล้ว"
+
+    expected = [
+        ("สวัสดี", True),
+        ("ทุกคน", True),
+        ("!", False, True),
+        ("¶", False, True),
+        ("ฉัน", True),
+        ("เรียน", True),
+        ("ภาษาไทย", True),
+        ("มา", True),
+        ("2", False),
+        ("เดือน", True),
+        ("แล้ว", True, False),
+    ]
+    assert_tokens_equals(s, thai, expected)