Fixed newline logic for the Khmer parser: line breaks are now replaced with a backslash placeholder before tokenization

LuteOrg · Nov 9, 2024 · 48c31a9 · 48c31a9
1 parent c108dc3
commit 48c31a9
Showing 1 changed file with 12 additions and 2 deletions.
diff --git a/plugins/lute-khmer/lute_khmer_parser/parser.py b/plugins/lute-khmer/lute_khmer_parser/parser.py
@@ -44,7 +44,7 @@ def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
         # markers are used correctly.  Lute uses paragraph markers
         # for rendering.
         text = text.replace("\r\n", "\n")
-        text = text.replace("\n", "NEWLINE_CHARACTER_FOR_LUTE")
+        text = text.replace("\n", "\\")
 
         words = khmernltk.word_tokenize(text)  # ... get words using parser.
         tokens = []
@@ -60,11 +60,21 @@ def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
                 re.match(pattern, word) is not None
             )
 
-            if word == "NEWLINE_CHARACTER_FOR_LUTE":
+            if word == "\\":
                 word = "¶"
             if word == "¶":
                 is_word_char = False
                 is_end_of_sentence = True
+
+            if word.startswith("\\"):
+                num_leading_slashes = len(word) - len(word.lstrip("\\"))
+                for _ in range(num_leading_slashes):
+                    tokens.append(ParsedToken("¶", False, True))
+
+                word = word.lstrip("\\")
+                is_word_char = True
+                is_end_of_sentence = False
+
             t = ParsedToken(word, is_word_char, is_end_of_sentence)
             tokens.append(t)
         return tokens