Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jdom/plugin/thai #510

Merged
merged 6 commits into from
Nov 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions plugins/lute-thai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The Lute Thai parser.

See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes.

See the [PyPI README](./README_PyPi.md) for extra configuration notes.
12 changes: 12 additions & 0 deletions plugins/lute-thai/README_PyPi.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# `lute3-thai`

A Thai parser for Lute (`lute3`) using the `pythainlp` library.

## Installation

See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html).

## Usage

When this parser is installed, you can add "Thai" as a
language to Lute, which comes with a simple story.
22 changes: 22 additions & 0 deletions plugins/lute-thai/definition.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Language definition for Thai, parsed by the lute_thai parser plugin.
name: Thai
dictionaries:
- for: terms
type: embedded
url: https://dict.com/thai-english/###
- for: terms
type: embedded
url: https://en.wiktionary.org/wiki/###
- for: terms
type: popup
url: https://glosbe.com/th/en/###
- for: sentences
type: popup
url: https://www.bing.com/translator/?from=th&to=en&text=###
show_romanization: true
# right_to_left:

# Matches the "lute_thai" entry point registered under "lute.plugin.parse"
# in pyproject.toml.
parser_type: lute_thai
# character_substitutions:
# Sentence-ending characters: the Thai mark "ฯ" plus "!" and "?".
split_sentences: ฯ!?
# split_sentence_exceptions:
# Character range U+0E01 (ก) to U+0E5B (๛).  This range also contains "ฯ"
# (U+0E2F), so the parser explicitly treats sentence-ending tokens as
# non-words.
word_chars: ก-๛
5 changes: 5 additions & 0 deletions plugins/lute-thai/lute_thai_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
Lute Thai Parser
"""

# Package version.  pyproject.toml declares ``version`` as dynamic, so the
# build backend (flit) is expected to read it from this attribute.
__version__ = "0.0.3"
69 changes: 69 additions & 0 deletions plugins/lute-thai/lute_thai_parser/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Parsing using pythainlp

Includes classes:

- ThaiParser

"""

import re
import os
import pythainlp

from typing import List

from lute.parse.base import ParsedToken, AbstractParser


class ThaiParser(AbstractParser):
    """
    A parser for Thai that uses the pythainlp library for text segmentation.

    Thai text is not space-delimited, so tokenization is delegated to
    ``pythainlp.word_tokenize``.  Each token is then classified as a
    word / non-word and as an end-of-sentence marker using the language's
    ``word_characters`` and ``regexp_split_sentences`` settings.

    This parser does not read or write any files in a data directory.
    """

    @classmethod
    def name(cls):
        "Return the display name of this parser."
        return "Lute Thai"

    @classmethod
    def uses_data_directory(cls):
        """
        Report whether the data_directory (defined in the AbstractParser)
        is used.  It is not, so no data-directory setup is implemented.
        """
        return False

    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
        """
        Return the ParsedToken list for the given text.

        Args:
            text: the raw text to parse.
            language: the language settings.  ``word_characters`` is used
                as the inside of a regex character class, and
                ``regexp_split_sentences`` holds the sentence-ending
                characters.

        Returns:
            One ParsedToken per pythainlp token, in text order.  Newlines
            are emitted as "¶" tokens, which are non-words and end the
            sentence.
        """
        text = text.replace("\r\n", "\n")

        words = pythainlp.word_tokenize(text)

        # Compile once instead of rebuilding the pattern for every token.
        # re.match anchors at the start, so a token is a "word" when its
        # first character is a word character.
        word_char_re = re.compile(f"[{language.word_characters}]")
        sentence_enders = language.regexp_split_sentences

        tokens = []
        for word in words:
            is_word_char = word_char_re.match(word) is not None
            is_end_of_sentence = word in sentence_enders
            if is_end_of_sentence:
                # The Thai sentence mark "ฯ" falls inside the configured
                # word-character range, so without this override it would
                # be reported as a word.  Punctuation that ends a sentence
                # should never be a word, hence the hard-coded exception.
                is_word_char = False

            if word in ("\n", "¶"):
                # Paragraph breaks are represented by the pilcrow for
                # rendering, and always close the current sentence.
                word = "¶"
                is_word_char = False
                is_end_of_sentence = True

            tokens.append(ParsedToken(word, is_word_char, is_end_of_sentence))
        return tokens

    def get_reading(self, text: str):  # pylint: disable=unused-argument
        """
        Get the pronunciation for the given text.  For most
        languages, this can't be automated.

        Always returns None: no automatic reading is provided for Thai.
        """
        return None
24 changes: 24 additions & 0 deletions plugins/lute-thai/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[tool.flit.module]
name = "lute_thai_parser"

[project]
name = "lute3-thai"
dynamic = ['version']
description = "Learning Using Texts - Thai Parser"
requires-python = ">=3.8"
authors = [
{name = "Justin Dom"}
]
readme = "README_PyPi.md"

dependencies = [
"lute3>=3.4.2",
"pythainlp==5.0.4"
]

[project.entry-points."lute.plugin.parse"]
lute_thai = "lute_thai_parser.parser:ThaiParser"
5 changes: 5 additions & 0 deletions plugins/lute-thai/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Required dependency for base classes.
lute3>=3.4.2

# extra requirements here.
pythainlp==5.0.4
Empty file.
36 changes: 36 additions & 0 deletions plugins/lute-thai/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Common fixtures used by many tests.
"""

import os
import yaml
import pytest


from lute.parse.registry import init_parser_plugins

from lute.models.language import Language


def pytest_sessionstart(session):  # pylint: disable=unused-argument
    """
    Pytest session hook: initialize the parser plugins once, before any
    test is run.
    """
    init_parser_plugins()


def _get_test_language():
    """
    Build the Language used for testing this plugin from the
    definition.yaml file in the directory above this one.
    """
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(tests_dir, "..", "definition.yaml")
    with open(yaml_path, "r", encoding="utf-8") as yaml_file:
        definition = yaml.safe_load(yaml_file)
        return Language.from_dict(definition)


@pytest.fixture(name="thai")
def fixture_thai():
    "Provide the Thai test language, loaded from the plugin's definition file."
    thai_language = _get_test_language()
    return thai_language
86 changes: 86 additions & 0 deletions plugins/lute-thai/tests/test_ThaiParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
ThaiParser tests.
"""


import pytest

# pylint: disable=wrong-import-order
from lute.models.term import Term
from lute.parse.base import ParsedToken

from lute_thai_parser.parser import ThaiParser


def test_token_count(thai):
    """
    Terms built from Thai text report the expected token_count, and their
    lowercased text is unchanged.
    """
    expected_counts = {
        "สวัสดี": 1,
        "ลาก่อน": 1,
        "ฉันรักคุณ": 3,
        "ฉันกำลังเรียนภาษาไทย": 4,
    }
    for text, expected_count in expected_counts.items():
        term = Term(thai, text)
        assert term.token_count == expected_count, text
        assert term.text_lc == term.text, "case"


def assert_tokens_equals(text, lang, expected):
    """
    Assert that parsing text with lang yields the expected parsed tokens.

    expected is a sequence of ParsedToken argument tuples:
    ( original_text, is_word, is_end_of_sentence )

    Tokens are compared through their string representations.
    """
    parser = ThaiParser()
    actual_strings = [str(tok) for tok in parser.get_parsed_tokens(text, lang)]
    expected_strings = [str(ParsedToken(*args)) for args in expected]
    assert actual_strings == expected_strings


def test_end_of_sentence_stored_in_parsed_tokens(thai):
    """
    Sentence-ending tokens ("!" and "ฯ") are flagged EOS=True and are
    not treated as words.
    """
    text = "สวัสดีทุกคน! ฉันเรียนภาษาไทยมา2เดือนแล้วฯ"

    expected_tokens = [
        ("สวัสดี", True),
        ("ทุกคน", True),
        ("!", False, True),
        (" ", False),
        ("ฉัน", True),
        ("เรียน", True),
        ("ภาษาไทย", True),
        ("มา", True),
        ("2", False),
        ("เดือน", True),
        ("แล้ว", True, False),
        ("ฯ", False, True),
    ]
    assert_tokens_equals(text, thai, expected_tokens)


def test_carriage_returns_treated_as_reverse_p_character(thai):
    """
    A newline in the text is emitted as a "¶" token that is a non-word
    and ends the sentence (needed for rendering etc.).
    """
    text = "สวัสดีทุกคน!\nฉันเรียนภาษาไทยมา2เดือนแล้ว"

    expected_tokens = [
        ("สวัสดี", True),
        ("ทุกคน", True),
        ("!", False, True),
        ("¶", False, True),
        ("ฉัน", True),
        ("เรียน", True),
        ("ภาษาไทย", True),
        ("มา", True),
        ("2", False),
        ("เดือน", True),
        ("แล้ว", True, False),
    ]
    assert_tokens_equals(text, thai, expected_tokens)