From c0a5b0897525040c92bc15c3aab562b9c2af0e2f Mon Sep 17 00:00:00 2001 From: Marc Date: Thu, 29 Apr 2021 10:34:58 +0200 Subject: [PATCH 1/2] fix taiwanese examples --- dateparser/data/date_translation_data/ja.py | 28 ++++++------ dateparser/data/date_translation_data/yue.py | 28 ++++++------ .../data/date_translation_data/zh-Hant.py | 44 +++++++++---------- dateparser_scripts/write_complete_data.py | 10 +++-- tests/test_languages.py | 1 + 5 files changed, 58 insertions(+), 53 deletions(-) diff --git a/dateparser/data/date_translation_data/ja.py b/dateparser/data/date_translation_data/ja.py index aacba6f4b..538d9a33d 100644 --- a/dateparser/data/date_translation_data/ja.py +++ b/dateparser/data/date_translation_data/ja.py @@ -176,59 +176,59 @@ }, "relative-type-regex": { "\\1 day ago": [ - "(\\d+) 日前", + "(\\d+)\\s?日前", "(\\d+)日前" ], "\\1 hour ago": [ - "(\\d+) 時間前", + "(\\d+)\\s?時間前", "(\\d+)時間前" ], "\\1 minute ago": [ - "(\\d+) 分前", + "(\\d+)\\s?分前", "(\\d+)分前" ], "\\1 month ago": [ - "(\\d+) か月前", + "(\\d+)\\s?か月前", "(\\d+)か月前" ], "\\1 second ago": [ - "(\\d+) 秒前", + "(\\d+)\\s?秒前", "(\\d+)秒前" ], "\\1 week ago": [ - "(\\d+) 週間前", + "(\\d+)\\s?週間前", "(\\d+)週間前" ], "\\1 year ago": [ - "(\\d+) 年前", + "(\\d+)\\s?年前", "(\\d+)年前" ], "in \\1 day": [ - "(\\d+) 日後", + "(\\d+)\\s?日後", "(\\d+)日後" ], "in \\1 hour": [ - "(\\d+) 時間後", + "(\\d+)\\s?時間後", "(\\d+)時間後" ], "in \\1 minute": [ - "(\\d+) 分後", + "(\\d+)\\s?分後", "(\\d+)分後" ], "in \\1 month": [ - "(\\d+) か月後", + "(\\d+)\\s?か月後", "(\\d+)か月後" ], "in \\1 second": [ - "(\\d+) 秒後", + "(\\d+)\\s?秒後", "(\\d+)秒後" ], "in \\1 week": [ - "(\\d+) 週間後", + "(\\d+)\\s?週間後", "(\\d+)週間後" ], "in \\1 year": [ - "(\\d+) 年後", + "(\\d+)\\s?年後", "(\\d+)年後" ] }, diff --git a/dateparser/data/date_translation_data/yue.py b/dateparser/data/date_translation_data/yue.py index 4446be78d..ceb4090a6 100644 --- a/dateparser/data/date_translation_data/yue.py +++ b/dateparser/data/date_translation_data/yue.py @@ -141,46 +141,46 @@ }, 
"relative-type-regex": { "\\1 day ago": [ - "(\\d+) 日前" + "(\\d+)\\s?日前" ], "\\1 hour ago": [ - "(\\d+) 小時前" + "(\\d+)\\s?小時前" ], "\\1 minute ago": [ - "(\\d+) 分鐘前" + "(\\d+)\\s?分鐘前" ], "\\1 month ago": [ - "(\\d+) 個月前" + "(\\d+)\\s?個月前" ], "\\1 second ago": [ - "(\\d+) 秒前" + "(\\d+)\\s?秒前" ], "\\1 week ago": [ - "(\\d+) 個星期前" + "(\\d+)\\s?個星期前" ], "\\1 year ago": [ - "(\\d+) 年前" + "(\\d+)\\s?年前" ], "in \\1 day": [ - "(\\d+) 日後" + "(\\d+)\\s?日後" ], "in \\1 hour": [ - "(\\d+) 小時後" + "(\\d+)\\s?小時後" ], "in \\1 minute": [ - "(\\d+) 分鐘後" + "(\\d+)\\s?分鐘後" ], "in \\1 month": [ - "(\\d+) 個月後" + "(\\d+)\\s?個月後" ], "in \\1 second": [ - "(\\d+) 秒後" + "(\\d+)\\s?秒後" ], "in \\1 week": [ - "(\\d+) 個星期後" + "(\\d+)\\s?個星期後" ], "in \\1 year": [ - "(\\d+) 年後" + "(\\d+)\\s?年後" ] }, "locale_specific": {}, diff --git a/dateparser/data/date_translation_data/zh-Hant.py b/dateparser/data/date_translation_data/zh-Hant.py index 12a0ff3d5..bf0b67e06 100644 --- a/dateparser/data/date_translation_data/zh-Hant.py +++ b/dateparser/data/date_translation_data/zh-Hant.py @@ -141,46 +141,46 @@ }, "relative-type-regex": { "\\1 day ago": [ - "(\\d+) 天前" + "(\\d+)\\s?天前" ], "\\1 hour ago": [ - "(\\d+) 小時前" + "(\\d+)\\s?小時前" ], "\\1 minute ago": [ - "(\\d+) 分鐘前" + "(\\d+)\\s?分鐘前" ], "\\1 month ago": [ - "(\\d+) 個月前" + "(\\d+)\\s?個月前" ], "\\1 second ago": [ - "(\\d+) 秒前" + "(\\d+)\\s?秒前" ], "\\1 week ago": [ - "(\\d+) 週前" + "(\\d+)\\s?週前" ], "\\1 year ago": [ - "(\\d+) 年前" + "(\\d+)\\s?年前" ], "in \\1 day": [ - "(\\d+) 天後" + "(\\d+)\\s?天後" ], "in \\1 hour": [ - "(\\d+) 小時後" + "(\\d+)\\s?小時後" ], "in \\1 minute": [ - "(\\d+) 分鐘後" + "(\\d+)\\s?分鐘後" ], "in \\1 month": [ - "(\\d+) 個月後" + "(\\d+)\\s?個月後" ], "in \\1 second": [ - "(\\d+) 秒後" + "(\\d+)\\s?秒後" ], "in \\1 week": [ - "(\\d+) 週後" + "(\\d+)\\s?週後" ], "in \\1 year": [ - "(\\d+) 年後" + "(\\d+)\\s?年後" ] }, "locale_specific": { @@ -236,7 +236,7 @@ }, "relative-type-regex": { "\\1 day ago": [ - "(\\d+) 日前", + "(\\d+)\\s?日前", "(\\d+)日前" ], "\\1 hour ago": [ 
@@ -252,14 +252,14 @@ "(\\d+)秒前" ], "\\1 week ago": [ - "(\\d+) 星期前", + "(\\d+)\\s?星期前", "(\\d+)週前" ], "\\1 year ago": [ "(\\d+)年前" ], "in \\1 day": [ - "(\\d+) 日後", + "(\\d+)\\s?日後", "(\\d+)日後" ], "in \\1 hour": [ @@ -275,7 +275,7 @@ "(\\d+)秒後" ], "in \\1 week": [ - "(\\d+) 星期後", + "(\\d+)\\s?星期後", "(\\d+)週後" ], "in \\1 year": [ @@ -335,7 +335,7 @@ }, "relative-type-regex": { "\\1 day ago": [ - "(\\d+) 日前", + "(\\d+)\\s?日前", "(\\d+)日前" ], "\\1 hour ago": [ @@ -351,14 +351,14 @@ "(\\d+)秒前" ], "\\1 week ago": [ - "(\\d+) 星期前", + "(\\d+)\\s?星期前", "(\\d+)週前" ], "\\1 year ago": [ "(\\d+)年前" ], "in \\1 day": [ - "(\\d+) 日後", + "(\\d+)\\s?日後", "(\\d+)日後" ], "in \\1 hour": [ @@ -374,7 +374,7 @@ "(\\d+)秒後" ], "in \\1 week": [ - "(\\d+) 星期後", + "(\\d+)\\s?星期後", "(\\d+)週後" ], "in \\1 year": [ diff --git a/dateparser_scripts/write_complete_data.py b/dateparser_scripts/write_complete_data.py index dcb4aa520..303dcf7dd 100644 --- a/dateparser_scripts/write_complete_data.py +++ b/dateparser_scripts/write_complete_data.py @@ -24,23 +24,27 @@ RELATIVE_PATTERN = re.compile(r'\{0\}') -def _modify_relative_data(relative_data): +def _modify_relative_data(relative_data, replace_spaces=False): modified_relative_data = OrderedDict() for key, value in relative_data.items(): for i, string in enumerate(value): string = RELATIVE_PATTERN.sub(r'(\\d+)', string) + if replace_spaces: + string = re.sub(r'\s+', r'\\s?', string) value[i] = string modified_relative_data[key] = value return modified_relative_data def _modify_data(language_data): + replace_spaces = str(language_data.get('no_word_spacing', 'False')) == 'True' + relative_data = language_data.get("relative-type-regex", {}) - relative_data = _modify_relative_data(relative_data) + relative_data = _modify_relative_data(relative_data, replace_spaces=replace_spaces) locale_specific_data = language_data.get("locale_specific", {}) for _, info in locale_specific_data.items(): locale_relative_data = info.get("relative-type-regex", {}) - 
locale_relative_data = _modify_relative_data(locale_relative_data) + locale_relative_data = _modify_relative_data(locale_relative_data, replace_spaces=replace_spaces) def _get_complete_date_translation_data(language): diff --git a/tests/test_languages.py b/tests/test_languages.py index 536d6cafc..bfb325414 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -1638,6 +1638,7 @@ def test_translation(self, shortname, datetime_string, expected_translation): param('yue', "13 個星期後", "in 13 week"), param('yue', "2 小時前", "2 hour ago"), param('yue', "上個月", "1 month ago"), + param('yue', "2分鐘前", "2 minute ago"), # zgh param('zgh', "ⴰⵙⵙⴰ", "0 day ago"), param('zgh', "ⵉⴹⵍⵍⵉ", "1 day ago"), From 42deba301079c8c690601a49d1e55a150adf07ab Mon Sep 17 00:00:00 2001 From: Marc Date: Thu, 29 Apr 2021 10:46:20 +0200 Subject: [PATCH 2/2] fix thai --- dateparser/data/date_translation_data/th.py | 58 ++++++++++--------- .../date_translation_data/th.yaml | 3 + tests/test_languages.py | 7 +++ 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/dateparser/data/date_translation_data/th.py b/dateparser/data/date_translation_data/th.py index e9bdf456c..817069048 100644 --- a/dateparser/data/date_translation_data/th.py +++ b/dateparser/data/date_translation_data/th.py @@ -195,63 +195,64 @@ }, "relative-type-regex": { "\\1 day ago": [ - "(\\d+) วันที่ผ่านมา", - "(\\d+) วันที่แล้ว" + "(\\d+)\\s?วันที่ผ่านมา", + "(\\d+)\\s?วันที่แล้ว" ], "\\1 hour ago": [ - "(\\d+) ชม ที่แล้ว", - "(\\d+) ชั่วโมงที่ผ่านมา" + "(\\d+)\\s?ชม\\s?ที่แล้ว", + "(\\d+)\\s?ชั่วโมงที่ผ่านมา" ], "\\1 minute ago": [ - "(\\d+) นาทีที่ผ่านมา", - "(\\d+) นาทีที่แล้ว" + "(\\d+)\\s?นาทีที่ผ่านมา", + "(\\d+)\\s?นาทีที่แล้ว" ], "\\1 month ago": [ - "(\\d+) เดือนที่ผ่านมา", - "(\\d+) เดือนที่แล้ว" + "(\\d+)\\s?เดือนที่ผ่านมา", + "(\\d+)\\s?เดือนที่แล้ว" ], "\\1 second ago": [ - "(\\d+) วินาทีที่ผ่านมา", - "(\\d+) วินาทีที่แล้ว" + "(\\d+)\\s?วินาทีที่ผ่านมา", + "(\\d+)\\s?วินาทีที่แล้ว" ], "\\1 week ago": [ 
- "(\\d+) สัปดาห์ที่ผ่านมา", - "(\\d+) สัปดาห์ที่แล้ว" + "(\\d+)\\s?สัปดาห์ที่ผ่านมา", + "(\\d+)\\s?สัปดาห์ที่แล้ว" ], "\\1 year ago": [ - "(\\d+) ปีที่แล้ว" + "(\\d+)\\s?ปีที่แล้ว" ], "in \\1 day": [ - "ใน (\\d+) วัน", - "ในอีก (\\d+) วัน" + "ใน\\s?(\\d+)\\s?วัน", + "ในอีก\\s?(\\d+)\\s?วัน" ], "in \\1 hour": [ - "ใน (\\d+) ชม", - "ในอีก (\\d+) ชั่วโมง" + "ใน\\s?(\\d+)\\s?ชม", + "ในอีก\\s?(\\d+)\\s?ชั่วโมง" ], "in \\1 minute": [ - "ใน (\\d+) นาที", - "ในอีก (\\d+) นาที" + "ใน\\s?(\\d+)\\s?นาที", + "ในอีก\\s?(\\d+)\\s?นาที" ], "in \\1 month": [ - "ใน (\\d+) เดือน", - "ในอีก (\\d+) เดือน" + "ใน\\s?(\\d+)\\s?เดือน", + "ในอีก\\s?(\\d+)\\s?เดือน" ], "in \\1 second": [ - "ใน (\\d+) วินาที", - "ในอีก (\\d+) วินาที" + "ใน\\s?(\\d+)\\s?วินาที", + "ในอีก\\s?(\\d+)\\s?วินาที" ], "in \\1 week": [ - "ใน (\\d+) สัปดาห์", - "ในอีก (\\d+) สัปดาห์" + "ใน\\s?(\\d+)\\s?สัปดาห์", + "ในอีก\\s?(\\d+)\\s?สัปดาห์" ], "in \\1 year": [ - "ใน (\\d+) ปี", - "ในอีก (\\d+) ปี" + "ใน\\s?(\\d+)\\s?ปี", + "ในอีก\\s?(\\d+)\\s?ปี" ] }, "locale_specific": {}, "sentence_splitter_group": 5, + "no_word_spacing": "True", "skip": [ "น.", "เมื่อ ", @@ -272,7 +273,8 @@ "ago": [ "แต่ก่อน", "มาแล้ว", - "ก่อน" + "ก่อน", + "ที่ผ่านมา" ], "in": [ "ใน" diff --git a/dateparser_data/supplementary_language_data/date_translation_data/th.yaml b/dateparser_data/supplementary_language_data/date_translation_data/th.yaml index 7047680a7..31a8e8f10 100644 --- a/dateparser_data/supplementary_language_data/date_translation_data/th.yaml +++ b/dateparser_data/supplementary_language_data/date_translation_data/th.yaml @@ -1,5 +1,7 @@ sentence_splitter_group : 5 +no_word_spacing: "True" + skip: ["น.", "เมื่อ ", "เวลา"] monday: @@ -66,6 +68,7 @@ ago: - แต่ก่อน - มาแล้ว - ก่อน + - ที่ผ่านมา in: - ใน diff --git a/tests/test_languages.py b/tests/test_languages.py index bfb325414..233360eaf 100644 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -1584,6 +1584,13 @@ def test_translation(self, shortname, datetime_string, 
expected_translation): # teo param('teo', "moi", "in 1 day"), param('teo', "lolo", "0 day ago"), + # th + param('th', "2 เดือน ที่ผ่านมา", "2 month ago"), + param('th', "2 เดือนที่ผ่านมา", "2 month ago"), + param('th', "3สัปดาห์ที่ผ่านมา", "3 week ago"), + param('th', "20 ปี ที่ผ่านมา", "20 year ago"), + param('th', "6เดือน ที่ผ่านมา", "6 month ago"), + param('th', "3 นาที ที่ผ่านมา", "3 minute ago"), # to param('to', "miniti 'e 5 kuo'osi", "5 minute ago"), param('to', "'i he ta'u 'e 6", "in 6 year"),