-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfull_replace.py
109 lines (100 loc) · 4.08 KB
/
full_replace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# coding=utf-8
# @author: cer
# this script must run with python3
from __future__ import print_function
from num2words import num2words
import os
import time
import pandas as pd
import numpy as np
import pickle as pkl
from replace_by_rule import *
# Directory names; OUTPUT_PATH is joined with the file names below.
INPUT_PATH = "input"
OUTPUT_PATH = "output"

# self_classes = ["PLAIN", "PUNCT"]

# File names, resolved relative to OUTPUT_PATH where they are used.
dict_pkl_name = "dict.pkl"
class_pred_name = "class_pred_16_fixed.v2.5.csv"
out_file_name = "res_16.v2.5.csv"
out_debug_name = "res_16.v2.5.debug.csv"

# Class names; the integer in the "class_pred" column is an index into this
# list, so the order must not change.
labels = [
    "PLAIN",
    "PUNCT",
    "DATE",
    "LETTERS",
    "CARDINAL",
    "VERBATIM",
    "DECIMAL",
    "MEASURE",
    "MONEY",
    "ORDINAL",
    "TIME",
    "ELECTRONIC",
    "DIGIT",
    "FRACTION",
    "TELEPHONE",
    "ADDRESS",
]
def replace():
    """Normalize every predicted token and write the result CSV files.

    Reads ``OUTPUT_PATH/class_pred_name``, which must provide the columns
    ``id``, ``before``, ``class_pred`` (an integer index into ``labels``)
    and ``max_prob``. Each ``before`` value is converted to ``str`` and
    passed to the ``replace_*`` rule function that matches its predicted
    class. Two files are then written into ``OUTPUT_PATH``:

    * ``out_file_name``: columns ``id`` and ``after``, without the index.
    * ``out_debug_name``: the same columns plus ``before``, ``class_pred``
      (as the label name) and ``max_prob``; the DataFrame index is written
      as well.

    Raises:
        IndexError: if a ``class_pred`` value is out of range for ``labels``.
    """
    # One rule function per class label, replacing a 16-branch if/elif chain.
    # The replace_* names are not defined in this file; presumably they are
    # provided by the star-import of replace_by_rule. "replace_puct" is
    # spelled exactly as the original call site spelled it.
    handlers = {
        'PLAIN': replace_plain,
        'PUNCT': replace_puct,
        'DATE': replace_date,
        'LETTERS': replace_letters,
        'CARDINAL': replace_cardinal,
        'VERBATIM': replace_verbatim,
        'DECIMAL': replace_decimal,
        'MEASURE': replace_measure,
        'MONEY': replace_money,
        'ORDINAL': replace_ordinal,
        'TIME': replace_time,
        'ELECTRONIC': replace_electronic,
        'DIGIT': replace_digit,
        'FRACTION': replace_fraction,
        'TELEPHONE': replace_telephone,
        'ADDRESS': replace_address,
    }

    print("read class predictions from: ", class_pred_name)
    class_pred_df = pd.read_csv(os.path.join(OUTPUT_PATH, class_pred_name))
    # Explicit copy: columns are added to `result` below, and assigning into
    # a bare slice of class_pred_df triggers pandas' SettingWithCopyWarning.
    result = class_pred_df[["id"]].copy()

    s = time.time()
    # Resolve each numeric prediction to its label once; the names are used
    # both for dispatch and for the debug output.
    pred_labels = [labels[int(c)] for c in class_pred_df["class_pred"]]
    # Iterate the two needed columns directly instead of iterrows(), which is
    # slow and packs each row into a single Series (possibly upcasting values).
    after_s = [
        handlers[label](str(before))
        for before, label in zip(class_pred_df["before"], pred_labels)
    ]
    print("replacing done!")
    print("time cost: {}".format(time.time() - s))
    print("after:", len(after_s))
    print("test file size: {}".format(result.shape[0]))

    result["after"] = after_s
    out_name = os.path.join(OUTPUT_PATH, out_file_name)
    print("save normalization to file : ", out_name)
    result.to_csv(out_name, index=False)

    # Extra columns for the debug file only; they are added after the main
    # output has been written so that file keeps just "id" and "after".
    result["before"] = class_pred_df["before"]
    result["class_pred"] = pred_labels
    result["max_prob"] = class_pred_df["max_prob"]
    debug_name = os.path.join(OUTPUT_PATH, out_debug_name)
    print("save debug to file : ", debug_name)
    # NOTE(review): unlike the main output, the index column is written here;
    # kept as in the original - confirm whether that is intended.
    result.to_csv(debug_name)
if __name__ == '__main__':
    # Script entry point: run the whole normalization pass.
    #
    # Loading the pickled dictionary is disabled; the code is kept here
    # for reference:
    #     print("loading big dict...")
    #     with open(os.path.join(OUTPUT_PATH, dict_pkl_name), "rb") as f:
    #         big_dict = pkl.load(f)
    print("start replacing...")
    replace()