-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
87 lines (72 loc) · 3.26 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
import csv
import shutil
import pandas as pd
from time import time
from tqdm import tqdm
from datasets import load_dataset
from sklearn.model_selection import train_test_split
DATA_PATH = './data'
CACHE_BASE_PATH = os.path.join(DATA_PATH,'cache')
CACHE_PATH = os.path.join(CACHE_BASE_PATH,''.join(str(time()).split('.')))
os.makedirs(CACHE_PATH,exist_ok=True)
print("CACHE_PATH: ",CACHE_PATH)
RE_PATTERNS = {'urls':r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)",
'not_alpha_num':r"[^a-zA-Z0-9']",
}
RE_PATTERNS = {k:re.compile(v) for k,v in RE_PATTERNS.items()}
MAX_SEQ_LEN = 512
MIN_SEQ_LEN = 5
def clean_label_df(df):
texts = df['comment_text']
texts = [re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)",'',s) for s in texts]
texts = [re.sub(r"[^a-zA-Z0-9']",' ',s) for s in texts]
texts = [re.sub(r"[ ]+",' ',s) for s in texts]
texts = [str(s)[:MAX_SEQ_LEN] for s in texts]
df['comment_text'] = texts
if 'comment_label' in df:
labels = df['comment_label']
labels = [1.0 if int(l)==1 else 0.0 for l in labels]
labels = [1 if s else 0 for s in labels]
df['comment_label'] = labels
return df
def preprocess_label_data(datadf,split=False,test_size=None):
datadf = datadf.dropna()
datadf = clean_label_df(datadf)
datadf['comment_text'] = [str(s)[:MAX_SEQ_LEN].strip() for s in datadf['comment_text']]
datadf = datadf[[len(l)>MIN_SEQ_LEN for l in datadf['comment_text']]]
if 'comment_label' in datadf:
datadf['comment_label'] = [1 if int(l)==1 else 0 for l in datadf['comment_label']]
datadf = datadf.rename(columns={'comment_id':'comment_id',
'comment_text':'comment_text',
'comment_label':'labels'})
if split:
print(f'before pruning: {len(datadf)}')
datadf = datadf.drop_duplicates(subset=['comment_text'],keep='first')
print(f'after pruning: {len(datadf)}')
traindf, testdf = train_test_split(datadf,test_size=0.25,random_state=0)
# traindf = traindf.reset_index(drop=True)
# testdf = testdf.reset_index(drop=True)
traindf.to_csv('./data/train.csv',index=False)
testdf.to_csv('./data/test.csv',index=False)
dataset = load_dataset('csv', data_files={'train':'./data/train.csv',
'test':'./data/test.csv'},
cache_dir=CACHE_PATH)
else:
# datadf = datadf.reset_index(drop=True)
datadf.to_csv('./data/data.csv',index=False)
dataset = load_dataset('csv',data_files={'eval':'./data/data.csv'},
cache_dir=CACHE_PATH)
return dataset, datadf
def linear_join(srcdf,predf):
'''
srcdf: ["comment_id","comment_text"]
predf: ["comment_id","comment_label"]
'''
predf_lookup = {r['comment_id']:int(r['comment_label']) for _,r in tqdm(predf.iterrows(),total=len(predf))}
srcdf['comment_label'] = [predf_lookup[r['comment_id']] if r['comment_id'] in predf_lookup else 0 for _,r in tqdm(srcdf.iterrows(),total=len(srcdf))]
return srcdf
def clear_cache():
shutil.rmtree(CACHE_PATH)
print(f"CACHE_PATH: {CACHE_PATH} cleared.")