sample_processing_script.py
import argparse
import os

import numpy as np
import pandas as pd


def _parse_args():
    parser = argparse.ArgumentParser()
    # Data, model, and output directories. The defaults match the paths that a
    # SageMaker Processing job mounts inside the container.
    parser.add_argument('--filepath', type=str, default='/opt/ml/processing/input/')
    parser.add_argument('--filename', type=str, default='bank-additional-full.csv')
    parser.add_argument('--outputpath', type=str, default='/opt/ml/processing/output/')
    parser.add_argument('--categorical_features', type=str, default='y, job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome')
    return parser.parse_known_args()


if __name__ == "__main__":
    # Process arguments
    args, _ = _parse_args()

    # Load data
    df = pd.read_csv(os.path.join(args.filepath, args.filename))

    # Replace '.' with '_' in the data values, then strip any trailing '_'
    df = df.replace(regex=r'\.', value='_')
    df = df.replace(regex=r'_$', value='')

    # Add two new indicator columns
    df["no_previous_contact"] = (df["pdays"] == 999).astype(int)
    df["not_working"] = df["job"].isin(["student", "retired", "unemployed"]).astype(int)

    # Drop the 'duration' column and the macroeconomic indicators
    df = df.drop(['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'], axis=1)

    # One-hot encode the categorical features
    df = pd.get_dummies(df)

    # Train, validation, test split: shuffle the rows, then take the first 70%,
    # the next 20%, and the last 10%
    train_data, validation_data, test_data = np.split(
        df.sample(frac=1, random_state=42),
        [int(0.7 * len(df)), int(0.9 * len(df))]
    )

    # Store locally, with the target column ('y_yes') moved to the front for train
    # and validation, and the test target written to a separate file
    pd.concat([train_data['y_yes'], train_data.drop(['y_yes', 'y_no'], axis=1)], axis=1).to_csv(
        os.path.join(args.outputpath, 'train/train.csv'), index=False, header=False)
    pd.concat([validation_data['y_yes'], validation_data.drop(['y_yes', 'y_no'], axis=1)], axis=1).to_csv(
        os.path.join(args.outputpath, 'validation/validation.csv'), index=False, header=False)
    test_data['y_yes'].to_csv(
        os.path.join(args.outputpath, 'test/test_y.csv'), index=False, header=False)
    test_data.drop(['y_yes', 'y_no'], axis=1).to_csv(
        os.path.join(args.outputpath, 'test/test_x.csv'), index=False, header=False)

    print("## Processing complete. Exiting.")