diff --git a/src/dbas/firebase_migrations/translate.py b/src/dbas/firebase_migrations/translate.py index b458ac54d..75b932353 100644 --- a/src/dbas/firebase_migrations/translate.py +++ b/src/dbas/firebase_migrations/translate.py @@ -6,6 +6,7 @@ # Download the Punkt tokenizer models to split scentances nltk.download('punkt') from nltk.tokenize import sent_tokenize +import re # Max characters that can be translated at a time MAX_CHARS = 4900 @@ -42,6 +43,8 @@ def translate_text(text, dest_language='en'): if temp_text: try: translated_text += translator.translate(temp_text, dest=dest_language).text + translated_text = re.sub(r"\.([A-Z])", r". \1", translated_text) # Insert the missing space when a period is directly followed by an uppercase letter + translated_text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', translated_text) # Remove whitespace around a period that sits between two digits, e.g. "4. 0 gram" -> "4.0 gram" except Exception as e: print(f"Error translating text: {str(e)}") diff --git a/src/dbas/translate.py b/src/dbas/translate.py index ca5cc6799..04b64ccd4 100644 --- a/src/dbas/translate.py +++ b/src/dbas/translate.py @@ -1,4 +1,3 @@ - # pip install googletrans==4.0.0-rc1 nltk from googletrans import Translator import nltk @@ -28,6 +27,8 @@ def translate_text(text, dest_language='en'): else: # Translate the accumulated text translated_text += translator.translate(temp_text, dest=dest_language).text + " " + translated_text = translated_text.replace('. ', '.').replace('.', '. ') + # Start accumulating sentences again temp_text = sentence