-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcleaning.py
179 lines (144 loc) · 6.38 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 27 17:26:51 2021
@author: a_sylvester
"""
import pandas as pd
import pickle
cars = pd.read_csv('cars.csv', index_col=0)
out = {}#library of output paramters required to clean input to productionized model
out['columns'] = cars.columns#output the required columns we need in our input data
#filling missing data
out['od_mean'] = cars['odometer'].mean()
fill_lib = { #this library specifies how to fill missing values based on column
'condition':'unknown',
'odometer':out['od_mean'],
'paint color':'unknown',
' ':'unknown'
}
cars.fillna(fill_lib, inplace=True) #fill the missing values
cars = cars.replace(' ', 'unknown') #replace blank space with 'unknown'
#### type ####
#combing 'pickup' and 'truck' types
cars['type'] = cars['type'].replace(' pickup', ' truck')
#### price #####
#convert price column to number
cars['price'] = cars['price'].str.replace('$', '').str.replace(',', '').astype('int')
#### latlong ####
#split latlong
cars['latitude'] = cars['latlong'].map(lambda x: x.split(';')[0]).astype('float32')
cars['longitude'] = cars['latlong'].map(lambda x: x.split(';')[1]).astype('float32')
####location#####
#clean up location column - we will just take the first word to clean up data
#make the location column all lower case avoid separating the same place into different categories
cars['location'] = cars['location'].str.lower()
#make the reaplcements below so when we take the first word the town names still make sense
loc_lib = {
'port ':'port-',
'new ':'new-',
'north ':'north-',
'maple ':'maple-',
'campbell ':'campbell-',
'fraser ':'fraser-',
'langley city':'langley-city',
'langley township':'langley-township',
'pitt ':'pitt-',
'white ':'white-',
',':'',
'powell ':'powell-'
}
location_cleaned = cars['location'].map(lambda x: x.lower())
for a, b in loc_lib.items(): #make the replacements in the library above
location_cleaned = location_cleaned.map(lambda x: x.replace(a, b))
#take the first word of the location column only as it is too specific
cars['location_simple'] = location_cleaned.map(lambda x: x.split(' ')[0])
#### year, make, model####
#get year, make, and model from make column
#clean up common variations or makes
make_lib = {
'mercedes-benz':'mercedes','mercedes benz':'mercedes',
'smart car':'smart-car',
'land ':'land-','range ':'range-',
'vw':'volkswagen',
'grand ':'grand-',
'chevy':'chevrolet',
'model ':'model-',
'f150':'f-150', 'f250':'f-250', 'f350':'f-350',
'cx9':'cx-9', 'cx5':'cx-5','cx3':'cx-3',
'mazda3':'3',
'santa fe':'santa-fe'
}
make_cleaned = cars['make'].map(lambda x: x.lower())
for a, b in make_lib.items(): #make the replacements in the library above
make_cleaned = make_cleaned.map(lambda x: x.replace(a, b))
cars['year'] = make_cleaned.map(lambda x: x.split(' ', 2)[0]).astype('int')
cars['age'] = 2021 - cars['year']
cars['make_'] = make_cleaned.map(lambda x: x.split(' ', 2)[1])
def get_third(x):
try:
return(x.split(' ', 2)[2])
except:
return('unknown') #if there is no more text then the model must be missing
cars['model'] = make_cleaned.map(get_third)
##### text body #####
#get a column from the body text
pos_words = ['lady', 'off road', 'winter', 'lift', 'vintage']
neg_words = ['crack', 'torn', 'damage', 'leak', 'missing']
cars['pos_words'] = cars['body text'].map(lambda x: 1 if any(txt in x for txt in pos_words) else 0).astype(bool)
cars['neg_words'] = cars['body text'].map(lambda x: 1 if any(txt in x for txt in neg_words) else 0).astype(bool)
cars['low_text'] = cars['body text'].map(lambda x: 1 if len(x.split()) < 30 else 0).astype(bool) #text has less than 30 words
#convert the odometer reading to an integer to save space
cars['odometer'] = cars['odometer'].astype('int')
#drop extraneous columns
cars = cars.drop(['body text', 'latlong', 'make'], axis=1)
#many of the model values ended up as a year because people put the year in twice. Deal with that here.
years = [str(y) for y in range(1900, 2020)]#check for these years in make column
make = cars[cars.make_.isin(years)]['model'].map(lambda x: x.split(' ')[0]) #get the actual make as the first word of the next column
def get_second(x): #function to get the second word of text when it might be missing
try:
return(x.split(' ', 1)[1])
except:
return('None') #if there is no more text then the model must be missing
cars.loc[make.index, 'make_'] = make.values #replace the years make with the actual make
cars.loc[make.index, 'model'] = cars.loc[make.index, 'model'].map(get_second) #remove the year from the model column
def get_second_3_parts(x): #function to get the second word of text when it might be missing
try:
return(x.split(' ', 2)[1])
except:
return('None') #if there is no more text then the model must be missing
def get_third_3_parts(x): #function to get the second word of text when it might be missing
try:
return(x.split(' ', 2)[2])
except:
return('None') #if there is no more text then the model must be missing
#split the model out into one column for each word
cars['model0'] = cars['model'].map(lambda x: x.split(' ')[0])
cars['model1'] = cars['model'].map(get_second_3_parts)
cars['model2'] = cars['model'].map(get_third_3_parts)
#fill unknown values in some columns by the mode of the model
fill_mode_cols = ['type', 'size', 'drive', 'cylinders'] #for these columns we, will replace 'unknown' with the mode of that make of vehicle
def fill_func(x, m):
try:
fill_val = m.loc[x['model0']].values[0]
if type(fill_val) != str: return(x[col])
elif pd.isnull(x[col]):return(fill_val)
else: return(x[col])
except:
return(x[col])
for col in fill_mode_cols:
modes = cars.groupby('model0')[col].agg([pd.Series.mode])
out['{}'.format(col)] = modes
cars[col] = cars.apply(fill_func, args=(modes,), axis=1)
#filling remaining missing data
fill_lib = { #this library specifies how to fill missing values based on column
'drive':'unknown',
'type':'unknown',
'cylinders':'unknown',
'size':'unknown',
}
cars.fillna(fill_lib, inplace=True) #fill the missing values
#output data
cars.to_csv('cars_cleaned.csv', index=False)
#output library of cleaning parameters so it can be used to clean in the productionized model
pkl = {'clean_data': out}
pickle.dump( pkl, open( 'clean_data' + ".p", "wb" ) )