# encoding=utf-8
'''
feature_engineering.py: feature engineering with sklearn, in three parts:
1. Data preprocessing
2. Feature selection
3. Dimensionality reduction
'''
import pandas as pd
import numpy as np
from numpy import vstack, array, nan
from sklearn.datasets import load_iris
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
if __name__ == '__main__':
    # Load the IRIS dataset
    iris = load_iris()
    features = iris.data
    labels = iris.target
    '''
    1. Data preprocessing
    '''
    # 1.1 Rescaling: bring features of different scales onto a common scale
    # 1.1.1 Standardization: transform (roughly normally distributed) feature values to a
    #       standard normal distribution (applied per column)
    # print(np.mean(features, axis=0))
    # print(np.std(features, axis=0))
    features_new = preprocessing.StandardScaler().fit_transform(features)
    # print(np.mean(features_new, axis=0))
    # print(np.std(features_new, axis=0))
    # 1.1.2 Interval scaling: scale feature values into the [0, 1] interval (applied per column)
    features_new = preprocessing.MinMaxScaler().fit_transform(features)
    # 1.1.3 Normalization: turn each row vector into a unit vector (applied per sample)
    features_new = preprocessing.Normalizer().fit_transform(features)
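    # Quick check (sketch, not part of the original script): with the default L2 norm,
    # every row of the normalized matrix should have unit Euclidean length.
    # print(np.linalg.norm(features_new, axis=1))  # expected: all ones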
    # 1.2 Binarize quantitative features: pick a threshold; values above it become 1,
    #     values at or below it become 0
    features_new = preprocessing.Binarizer(threshold=3).fit_transform(features)
    # 1.3 Encode qualitative (categorical) features (pandas.get_dummies can also be used)
    enc = preprocessing.OneHotEncoder()
    enc.fit([[0, 0, 3],
             [1, 1, 0],
             [0, 2, 1],
             [1, 0, 2]])
    # print(enc.transform([[0, 1, 3]]))
    # print(enc.transform([[0, 1, 3]]).toarray())
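    # Sketch, assuming a recent scikit-learn: the fitted encoder exposes the category
    # values it learned per column, which determine the one-hot slots used above.
    # print(enc.categories_)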
    # 1.4 Missing-value imputation (pandas.fillna can also be used)
    #     Imputer was removed in newer scikit-learn; SimpleImputer from sklearn.impute
    #     is its replacement and always imputes column-wise
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    features_new = imp.fit_transform(vstack((array([nan, nan, nan, nan]), features)))
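    # Sketch: since strategy='mean' fills missing values column-wise, the prepended
    # all-NaN row should now equal the column means of the original data.
    # print(features_new[0])
    # print(np.mean(features, axis=0))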
    # 1.5 Data transformation
    # 1.5.1 Polynomial transformation (applied per row)
    features_new = preprocessing.PolynomialFeatures().fit_transform(features)
    # 1.5.2 Transformation with a custom function, here log1p
    features_new = preprocessing.FunctionTransformer(np.log1p).fit_transform(features)
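    # Sketch: with the default degree=2, PolynomialFeatures expands the 4 iris features
    # into 15 columns (1 bias + 4 linear + 10 degree-2 terms), which the shape confirms.
    # print(preprocessing.PolynomialFeatures().fit_transform(features).shape)  # (150, 15)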
    '''
    2. Feature selection
    '''
    # 2.1 Filter
    # 2.1.1 Variance threshold: keep features whose variance exceeds the threshold
    features_new = feature_selection.VarianceThreshold(threshold=0.3).fit_transform(features)
    # 2.1.2 Chi-squared test: keep the K features most associated with the labels
    features_new = feature_selection.SelectKBest(feature_selection.chi2, k=3).fit_transform(features, labels)
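    # Sketch: to see which columns the chi-squared filter kept, fit the selector
    # separately and read get_support(indices=True); the variable name "selector" is
    # introduced here only for this check.
    selector = feature_selection.SelectKBest(feature_selection.chi2, k=3).fit(features, labels)
    # print(selector.get_support(indices=True))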
    # 2.2 Wrapper
    # 2.2.1 Recursive feature elimination (RFE) with logistic regression as the base model;
    #       n_features_to_select is the number of features to keep
    features_new = feature_selection.RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(features, labels)
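    # Sketch: after fitting, RFE exposes ranking_ (1 marks a selected feature), which can
    # be read from a separately fitted instance.
    # rfe = feature_selection.RFE(estimator=LogisticRegression(), n_features_to_select=2).fit(features, labels)
    # print(rfe.ranking_)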
    # 2.3 Embedded
    # 2.3.1 Penalty-based selection with an L1-penalized logistic regression as the base
    #       model (solver="liblinear" is specified because the default solver does not
    #       support the L1 penalty)
    features_new = feature_selection.SelectFromModel(
        LogisticRegression(penalty="l1", C=0.1, solver="liblinear")).fit_transform(features, labels)
    # 2.3.2 Tree-based selection with a GBDT model as the base model
    features_new = feature_selection.SelectFromModel(GradientBoostingClassifier()).fit_transform(features, labels)
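    # Sketch: SelectFromModel keeps features whose importance exceeds a threshold (by
    # default the mean importance for a tree-based model); fitting the GBDT alone shows
    # the feature_importances_ the selector compares against.
    # print(GradientBoostingClassifier().fit(features, labels).feature_importances_)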
    '''
    3. Dimensionality reduction
    '''
    # 3.1 Principal component analysis (PCA); n_components is the number of dimensions
    #     to keep after reduction
    features_new = PCA(n_components=2).fit_transform(features)
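    # Sketch: explained_variance_ratio_ on a separately fitted PCA shows how much of the
    # total variance the two retained components capture.
    # print(PCA(n_components=2).fit(features).explained_variance_ratio_)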
    # 3.2 Linear discriminant analysis (LDA); n_components is the number of dimensions
    #     to keep after reduction
    features_new = LDA(n_components=2).fit_transform(features, labels)
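    # Note (sketch): for LDA, n_components cannot exceed n_classes - 1 (here 3 - 1 = 2),
    # since the projection is built from the between-class scatter.
    # print(LDA(n_components=2).fit(features, labels).explained_variance_ratio_)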