-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_process.py
52 lines (47 loc) · 1.71 KB
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import logging
import numpy as np
from tqdm import trange
class Processor:
    """Convert ``.bio`` sequence-labelling files into compressed ``.npz`` archives.

    Each input file holds one ``token label`` pair per line, with a blank line
    terminating a sentence. The output archive stores two parallel arrays,
    ``words`` and ``labels``, with one entry per sentence.
    """

    def __init__(self, config):
        """Keep a reference to the configuration.

        Args:
            config: Object exposing ``data_dir`` (used as a plain string prefix
                for every input/output path) and ``files`` (iterable of
                dataset names handed to :meth:`preprocess`).
        """
        self.data_dir = config.data_dir
        self.config = config

    def process(self):
        """Preprocess every dataset named in ``config.files``."""
        for file_name in self.config.files:
            self.preprocess(file_name)

    def preprocess(self, mode):
        """Split one ``.bio`` file into per-sentence token and label lists.

        Reads ``<data_dir><mode>.bio`` and writes ``<data_dir><mode>.npz``
        containing the arrays ``words`` and ``labels``. If the output file
        already exists nothing is read or written.

        Example of one parsed sentence::

            words:  ['p', 'l', 'a', 'y', 'C', 'S', 'O', 'L']
            labels: ['O', 'O', 'O', 'O', 'B-game', 'I-game', 'I-game', 'I-game']

        Args:
            mode: Dataset name; converted with ``str()`` and placed between
                ``data_dir`` and the file extension.

        Raises:
            ValueError: If a non-blank line does not consist of exactly two
                whitespace-separated fields.
        """
        # NOTE: paths are built by plain concatenation, so ``data_dir`` must
        # already end with a path separator (or be an intentional prefix).
        input_path = self.data_dir + str(mode) + '.bio'
        output_path = self.data_dir + str(mode) + '.npz'
        if os.path.exists(output_path):
            return

        with open(input_path, 'r', encoding='utf8') as f:
            lines = f.readlines()

        # Feed the parser through trange so the progress bar is preserved.
        word_list, label_list = self._parse_bio_lines(
            lines[idx] for idx in trange(len(lines))
        )

        # save as a binary file
        np.savez_compressed(
            output_path,
            words=self._as_sentence_array(word_list),
            labels=self._as_sentence_array(label_list),
        )
        logging.info("--------{} data process DONE!--------".format(mode))

    @staticmethod
    def _parse_bio_lines(lines):
        """Group ``token label`` lines into sentences.

        A line that is empty after stripping trailing whitespace closes the
        current sentence (so consecutive blank lines yield empty sentences).
        A sentence still open when the input ends is kept as well, so the
        file does not need to finish with a blank line.

        Args:
            lines: Iterable of raw text lines.

        Returns:
            ``(word_list, label_list)``: two parallel lists of per-sentence
            lists.

        Raises:
            ValueError: If a non-blank line does not split into exactly two
                whitespace-separated fields.
        """
        word_list = []
        label_list = []
        words = []
        labels = []
        for line_no, raw in enumerate(lines, start=1):
            line = raw.rstrip()
            if not line:
                word_list.append(words)
                label_list.append(labels)
                words = []
                labels = []
                continue
            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    "line {}: expected 'token label', got {!r}".format(line_no, line)
                )
            words.append(fields[0])
            labels.append(fields[1])
        # Flush the last sentence when the input lacks a trailing blank line.
        if words:
            word_list.append(words)
            label_list.append(labels)
        return word_list, label_list

    @staticmethod
    def _as_sentence_array(sentences):
        """Turn a list of per-sentence lists into an array that can be saved.

        Sentences of differing length cannot be converted implicitly on recent
        NumPy releases (ragged input raises ``ValueError``), so they are
        stored in a 1-D object array holding one Python list per sentence.
        Uniform-length input keeps NumPy's default conversion.
        """
        if len({len(sentence) for sentence in sentences}) <= 1:
            return np.array(sentences)
        arr = np.empty(len(sentences), dtype=object)
        # Assign element by element so NumPy never tries to broadcast.
        for i, sentence in enumerate(sentences):
            arr[i] = sentence
        return arr