-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathutils.py
231 lines (202 loc) · 6.87 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# coding=utf-8
# @author: cer
# this script must run with python3
from __future__ import print_function
from num2words import num2words
import re
# Sentinel string handed back by the normalisers in this module (for example
# norm_year and norm_month) when a token cannot be interpreted.
odd = "odd!~!"
def norm_year(token):
    """Spell out a year token the way it is read aloud.

    The token is stripped of surrounding whitespace and then handled
    according to its length:

    * 1 character  -> "o " followed by the digit's word.
    * 2 characters -> "o <digit>" when the value is below ten, otherwise
      the plain number words.
    * 3 characters -> plain number words when the middle digit is "0",
      otherwise the first digit followed by the last two read as a pair.
    * 4 characters -> "<pair> hundred", "<digit> thousand [<digit>]" or two
      pairs read one after the other (the second prefixed with "o" when it
      is below ten).

    Any other length yields the module-level ``odd`` sentinel.
    """
    digits = token.strip()
    size = len(digits)

    if size == 1:
        return "o " + numstr2word(digits)

    if size == 2:
        if int(digits) < 10:  # e.g. "09"
            return "o " + numstr2word(digits[1])
        return numstr2word(digits)

    if size == 3:
        if digits[1] == '0':  # e.g. "202" or "200"
            return numstr2word(digits)
        # e.g. "360": leading digit, then the remaining pair
        return numstr2word(digits[0]) + " " + numstr2word(digits[1:])

    if size != 4:
        return odd

    # Four characters, e.g. "2015"
    head, tail = digits[:2], digits[2:]

    if tail == "00" and digits[1] != '0':
        # Round centuries whose second digit is non-zero: "<pair> hundred"
        return numstr2word(head) + ' hundred'

    if digits[1] == '0' and digits[2] == '0':
        # "X00Y": "<X> thousand" plus the final digit unless it is zero
        last = digits[3:]
        if last != "0":
            extra = " " + num2words(float(last))
        else:
            extra = ""
        return numstr2word(digits[0]) + ' thousand' + extra

    # General case: read the token as two two-digit groups
    tail_words = numstr2word(tail).replace("-", " ")
    if int(tail) < 10:
        tail_words = "o " + tail_words
    return numstr2word(head).replace("-", " ") + ' ' + tail_words
def test_norm_year():
    """Manual smoke check: print the normalised form of the year "2007"."""
    print(norm_year("2007"))
def norm_month(token):
    """Return the lowercase English month name for a month token.

    Args:
        token: Either a decimal month number ("1".."12", leading zeros
            allowed) or text whose first three letters are a month
            abbreviation in any letter case ("Mar", "SEPTEMBER", ...).

    Returns:
        The full lowercase month name, or the module-level ``odd`` sentinel
        when the number lies outside 1..12 or the three-letter prefix is not
        a known month.
    """
    months = ["january", "february", "march", "april",
              "may", "june", "july", "august",
              "september", "october", "november", "december"]

    # isdecimal() only accepts characters int() can parse; isdigit() would
    # also let through characters such as superscripts, on which int() raises.
    if token.isdecimal():
        number = int(token)
        # Reject 0 explicitly: months[0 - 1] would silently wrap around to
        # "december" through negative indexing.
        if not 1 <= number <= 12:
            return odd
        return months[number - 1]

    by_prefix = {name[:3]: name for name in months}
    prefix = token[:3].lower()
    if prefix not in by_prefix:
        return odd
    return by_prefix[prefix]
def test_norm_month():
    """Manual smoke check: print the normalised form of the token "Mar"."""
    print(norm_month("Mar"))
def norm_day(token):
    """Return the day token spelled out as an ordinal.

    The conversion is delegated to ``numstr2word`` with ``ordinal=True``.
    """
    spoken_day = numstr2word(token, ordinal=True)
    return spoken_day
def norm_date(month_n, month_first=True, year_n="", day_n=""):
    """Assemble the spoken form of a date from its parts.

    Args:
        month_n: Month token, passed to ``norm_month``.
        month_first: When true the result reads "<month> <day> <year>";
            otherwise "the <day> of <month> <year>". Forced to true when no
            day is supplied.
        year_n: Year token for ``norm_year``; "" leaves the year out.
        day_n: Day token for ``norm_day``; "" leaves the day out.

    Returns:
        The assembled phrase. Absent parts contribute nothing to it.
    """
    # Each optional part carries its own leading space so the pieces can be
    # concatenated directly.
    year_part = "" if year_n == "" else " " + norm_year(year_n)

    if day_n != "":
        day_part = " " + norm_day(day_n)
    else:
        day_part = ""
        # "the ... of <month>" makes no sense without a day.
        month_first = True

    month_part = norm_month(month_n)
    if not month_first:
        return "the" + day_part + " of " + month_part + year_part
    return month_part + day_part + year_part
def int2order_string(num):
    """Return the spoken ordinal for a number between 1 and 31.

    Only values up to 31 are supported for now.

    Args:
        num: An int, or anything ``int()`` accepts (e.g. the string "21").

    Returns:
        The ordinal in words, with spaces rather than hyphens between the
        parts (e.g. "twenty first"), matching the hyphen-free output of
        ``numstr2word`` elsewhere in this module.

    Raises:
        IndexError: If the value is outside 1..31. Zero and negative values
            are rejected too; plain list indexing would otherwise wrap them
            around to an unrelated ordinal.
    """
    ordinals = ("first", "second", "third", "fourth", "fifth",
                "sixth", "seventh", "eighth", "ninth", "tenth",
                "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
                "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
                "twenty first", "twenty second", "twenty third", "twenty fourth", "twenty fifth",
                "twenty sixth", "twenty seventh", "twenty eighth", "twenty ninth", "thirtieth",
                "thirty first")
    position = int(num)
    if not 1 <= position <= len(ordinals):
        raise IndexError("ordinal out of supported range 1-%d: %r"
                         % (len(ordinals), num))
    return ordinals[position - 1]
def norm_time(time_, sufix):
    """Spell out a clock time or a duration written with ":" or "." separators.

    Recognised shapes (after stripping whitespace):

    * all digits, no separator -> the number in words;
    * "H:MM" or "H.MM"         -> hour words followed by the minute words;
      minute 0 becomes " o'clock" when ``sufix`` is empty and is dropped
      otherwise, minutes 1-9 are prefixed with "o";
    * "M:SS.mmm"               -> "<M> minute(s) <SS> seconds and <mmm>
      milliseconds";
    * "H:MM:SS" / "H.MM.SS"    -> "<H> hour(s) <MM> minutes and <SS> seconds".

    Args:
        time_: The raw time token.
        sufix: Text accompanying the time. Only whether it is the empty
            string is examined here (presumably an a.m./p.m.-style marker;
            confirm against the caller).

    Returns:
        The spoken form, or the stripped input unchanged when it matches
        none of the shapes above.
    """
    time_ = time_.strip()

    # ":" takes precedence over "." as the field separator.
    if ":" in time_:
        sep = ":"
    elif "." in time_:
        sep = "."
    else:
        return numstr2word(time_) if time_.isdigit() else time_

    arr = [one.strip() for one in time_.split(sep)]

    if len(arr) == 2 and arr[0].isdigit():
        if arr[1].isdigit():
            minutes = int(arr[1])
            if minutes == 0:
                min_s = " o'clock" if sufix == "" else ""
            elif minutes < 10:
                min_s = " o " + numstr2word(arr[1])
            else:
                min_s = " " + numstr2word(arr[1])
            return numstr2word(arr[0]) + min_s

        if "." in arr[1]:
            arr_2 = arr[1].split(".")
            if len(arr_2) == 2 and arr_2[0].isdigit() and arr_2[1].isdigit():
                unit = " minute " if int(arr[0]) == 1 else " minutes "
                return (numstr2word(arr[0]) + unit
                        + numstr2word(arr_2[0]) + " seconds and "
                        + numstr2word(arr_2[1]) + " milliseconds")
            # A malformed seconds part (e.g. "1:.5") deliberately falls
            # through to the unchanged-input return below instead of
            # running off the end of the function and yielding None.
    elif len(arr) == 3 and all(part.isdigit() for part in arr):
        unit = " hour " if int(arr[0]) == 1 else " hours "
        return (numstr2word(arr[0]) + unit
                + numstr2word(arr[1]) + " minutes and "
                + numstr2word(arr[2]) + " seconds")

    return time_
def numstr2word(num, ordinal=False):
    """Convert a numeric string to words with ``num2words``.

    The value goes through ``float`` first, and the resulting text is
    flattened: hyphens become spaces, the joining " and " collapses to one
    space, and commas are removed.

    Args:
        num: Anything ``float()`` accepts.
        ordinal: Forwarded to ``num2words`` to request the ordinal form.
    """
    words = num2words(float(num), ordinal=ordinal)
    # Applied strictly in this order, as a chain of str.replace calls.
    for old, new in (("-", " "), (" and ", " "), (",", "")):
        words = words.replace(old, new)
    return words
def norm_digit(num):
    """Read a digit sequence one element at a time.

    Each element of ``num`` is converted with ``float`` and spelled out by
    ``num2words``; the words are joined with single spaces. An empty
    sequence gives an empty string.
    """
    spoken = []
    for ch in num:
        spoken.append(num2words(float(ch)))
    return " ".join(spoken)
def has_upper(token):
    """Return True when at least one character of ``token`` is uppercase."""
    return any(ch.isupper() for ch in token)
def all_upper(token):
    """Return True when every character of ``token`` is uppercase.

    Characters without case (digits, punctuation) count as not uppercase,
    and an empty token yields True because there is nothing to reject.
    """
    return all(ch.isupper() for ch in token)
def infer_year_month_day(arr, split):
    """Work out which of three date fields is the year, month and day.

    Args:
        arr: A list of three numeric strings, e.g. ["12", "20", "2014"].
        split: The separator the fields were split on (e.g. "-" or "/").

    Returns:
        A tuple ``(year_n, month_n, day_n, month_first)`` where the first
        three items are the original strings and ``month_first`` tells
        whether the date should be read month-before-day.
    """
    if len(arr[0]) == 4:
        # Four-digit year in front, e.g. 2004-12-20.
        if int(arr[1]) > 12:
            # The middle field cannot be a month, so it must be the day.
            return arr[0], arr[2], arr[1], False
        # 1988/11/21 -> the twenty first of november nineteen eighty eight
        # 2005-04-15 -> the fifteenth of april two thousand five
        return arr[0], arr[1], arr[2], split not in ("-", "/")

    if len(arr[2]) != 4 and int(arr[0]) > 31:
        # No four-digit field at either end and the first is too large to be
        # a day: treat it as a short year in front.
        return arr[0], arr[1], arr[2], split != "-"

    # Year is the last field, e.g. 12-20-2004 or a two-digit year at the end.
    if int(arr[1]) > 12:
        return arr[2], arr[0], arr[1], True
    return arr[2], arr[1], arr[0], False
if __name__ == '__main__':
    # Running the module directly executes the month smoke check, which
    # prints its result.
    test_norm_month()
    # test_norm_year()