Skip to content

Commit

Permalink
fixed newline logic for khmerparser
Browse files: browse the repository at this point in the history
  • Loading branch information
jaydom28 committed Nov 9, 2024
1 parent c108dc3 commit 48c31a9
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions plugins/lute-khmer/lute_khmer_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
# markers are used correctly. Lute uses paragraph markers
# for rendering.
text = text.replace("\r\n", "\n")
text = text.replace("\n", "NEWLINE_CHARACTER_FOR_LUTE")
text = text.replace("\n", "\\")

words = khmernltk.word_tokenize(text) # ... get words using parser.
tokens = []
Expand All @@ -60,11 +60,21 @@ def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
re.match(pattern, word) is not None
)

if word == "NEWLINE_CHARACTER_FOR_LUTE":
if word == "\\":
word = "¶"
if word == "¶":
is_word_char = False
is_end_of_sentence = True

if word.startswith("\\"):
num_leading_slashes = len(word) - len(word.lstrip("\\"))
for _ in range(num_leading_slashes):
tokens.append(ParsedToken("¶", False, True))

word = word.lstrip("\\")
is_word_char = True
is_end_of_sentence = False

t = ParsedToken(word, is_word_char, is_end_of_sentence)
tokens.append(t)
return tokens
Expand Down

0 comments on commit 48c31a9

Please sign in to comment.