Skip to content

Commit

Permalink
Cleaned up code
Browse files Browse the repository at this point in the history
  • Loading branch information
vruusmann committed Feb 26, 2023
1 parent 375760a commit 3f7757d
Show file tree
Hide file tree
Showing 16 changed files with 54 additions and 62 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

from category_encoders import BaseNEncoder, BinaryEncoder, CatBoostEncoder, CountEncoder, LeaveOneOutEncoder, OneHotEncoder, OrdinalEncoder, TargetEncoder, WOEEncoder
from mlxtend.preprocessing import DenseTransformer
from pandas import DataFrame
Expand All @@ -11,8 +14,6 @@
from sklearn2pmml.pipeline import PMMLPipeline

import numpy
import os
import sys

sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/"))

Expand Down Expand Up @@ -127,4 +128,4 @@ def build_audit(cat_encoder, cont_encoder, classifier, name, **pmml_options):
build_audit(LeaveOneOutEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "LeaveOneOutEncoderAuditNA")

build_audit(TargetEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "TargetEncoderAuditNA")
build_audit(WOEEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "WOEEncoderAuditNA")
build_audit(WOEEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "WOEEncoderAuditNA")
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
Expand All @@ -12,9 +15,6 @@
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import ExpressionTransformer

import os
import sys

sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/"))

from common import *
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from optbinning import BinningProcess, OptimalBinning, Scorecard
import os
import sys

from optbinning import BinningProcess, Scorecard
from pandas import DataFrame
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import HuberRegressor, LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn2pmml.decoration import Alias
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import ExpressionTransformer

import os
import sys

sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/"))

from common import *
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

from pandas import DataFrame, Int64Dtype, Series
from pycaret.classification import ClassificationExperiment
from pycaret.clustering import ClusteringExperiment
Expand All @@ -6,9 +9,6 @@
from sklearn2pmml.pycaret import _escape

import numpy
import os
import pycaret
import sys

sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/"))

Expand Down Expand Up @@ -113,4 +113,4 @@ def make_regression(df, estimator, name, **setup_params):
for cat_col in cat_cols:
auto_df[cat_col] = auto_df[cat_col].astype(Int64Dtype())

make_regression(auto_df, "lr", "PyCaretAutoNA", feature_selection = True, feature_selection_method = "classic", n_features_to_select = 0.85)
make_regression(auto_df, "lr", "PyCaretAutoNA", feature_selection = True, feature_selection_method = "classic", n_features_to_select = 0.85)
11 changes: 5 additions & 6 deletions pmml-sklearn-extension/src/test/resources/extensions/sklego.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

from mlxtend.preprocessing import DenseTransformer
from pandas import DataFrame
from sklearn.cluster import KMeans
Expand All @@ -10,17 +13,13 @@
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC, OneClassSVM
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn2pmml.decoration import Alias, CategoricalDomain, ContinuousDomain
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import ExpressionTransformer
from sklearn2pmml.util import Reshaper
from sklego.meta import EstimatorTransformer
from sklego.pipeline import DebugPipeline
from sklego.preprocessing import IdentityTransformer

import os
import sys

sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/"))

from common import *
Expand Down Expand Up @@ -166,4 +165,4 @@ def build_estimatortransformer_visit(name):
docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"])
store_csv(docvis, name)

build_estimatortransformer_visit("EstimatorTransformerVisit")
build_estimatortransformer_visit("EstimatorTransformerVisit")
5 changes: 3 additions & 2 deletions pmml-sklearn-extension/src/test/resources/extensions/tpot.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os
import sys

from pandas import DataFrame
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline
Expand All @@ -10,8 +13,6 @@
from tpot.config import classifier_config_dict, regressor_config_dict

import pandas
import os
import sys

sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/"))

Expand Down
5 changes: 3 additions & 2 deletions pmml-sklearn-h2o/src/test/resources/main-h2o.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys

from h2o import H2OFrame
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
Expand All @@ -10,7 +12,6 @@
from sklearn_pandas import DataFrameMapper

import h2o
import sys

sys.path.append("../../../../pmml-sklearn/src/test/resources/")

Expand Down Expand Up @@ -88,4 +89,4 @@ def build_auto(auto_df, regressor, name):
build_auto(auto_df, H2ORandomForestEstimator(distribution = "gaussian", seed = 13), "H2ORandomForestAuto")
build_auto(auto_df, H2OXGBoostEstimator(ntrees = 17, seed = 13), "H2OXGBoostAuto")

h2o.shutdown()
h2o.shutdown()
8 changes: 4 additions & 4 deletions pmml-sklearn-lightgbm/src/test/resources/main-lightgbm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from lightgbm import LGBMClassifier, LGBMRegressor

import sys

from lightgbm import LGBMClassifier, LGBMRegressor

sys.path.append("../../../../pmml-sklearn/src/test/resources/")

from main import *
Expand Down Expand Up @@ -43,7 +43,7 @@ def build_audit_cat(audit_df, classifier, name, with_proba = True, fit_params =
pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13))
store_pkl(pipeline, name)
adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"])
if with_proba == True:
if with_proba:
adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"])
adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1)
store_csv(adjusted, name)
Expand All @@ -66,4 +66,4 @@ def build_audit_cat(audit_df, classifier, name, with_proba = True, fit_params =
auto_df = load_auto("Auto")
auto_X, auto_y = split_csv(auto_df)

build_auto_opt(auto_df, LGBMRegressor(objective = "regression", random_state = 13), "LGBMAuto", fit_params = {"regressor__eval_set" : [(auto_X[auto_test_mask], auto_y[auto_test_mask])], "regressor__eval_metric" : "rmse", "regressor__early_stopping_rounds" : 3})
build_auto_opt(auto_df, LGBMRegressor(objective = "regression", random_state = 13), "LGBMAuto", fit_params = {"regressor__eval_set" : [(auto_X[auto_test_mask], auto_y[auto_test_mask])], "regressor__eval_metric" : "rmse", "regressor__early_stopping_rounds" : 3})
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import sys

from pandas import DataFrame
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.statsmodels import StatsModelsClassifier, StatsModelsRegressor
from statsmodels.api import GLM, Logit, MNLogit, OLS, Poisson, WLS

import statsmodels.genmod.families as families

import sys
from statsmodels.genmod import families

sys.path.append("../../../../pmml-sklearn/src/test/resources/")

Expand Down Expand Up @@ -118,4 +117,4 @@ def build_visit(visit_df, regressor, name):
visit_df = load_visit("Visit")

build_visit(visit_df, StatsModelsRegressor(GLM, family = families.Poisson()), "GLMVisit")
build_visit(visit_df, StatsModelsRegressor(Poisson), "PoissonVisit")
build_visit(visit_df, StatsModelsRegressor(Poisson), "PoissonVisit")
6 changes: 3 additions & 3 deletions pmml-sklearn-xgboost/src/test/resources/main-xgboost.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from xgboost.sklearn import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor

import sys

from xgboost.sklearn import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor

sys.path.append("../../../../pmml-sklearn/src/test/resources/")

from main import *
Expand Down Expand Up @@ -77,4 +77,4 @@ def build_audit_na_direct(audit_na_df, classifier, name):
if "Housing" in datasets:
housing_df = load_housing("Housing")

build_housing(housing_df, GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 5, random_state = 13), SGDRegressor(penalty = "elasticnet", random_state = 13)), "XGBRFLMHousing")
build_housing(housing_df, GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 5, random_state = 13), SGDRegressor(penalty = "elasticnet", random_state = 13)), "XGBRFLMHousing")
11 changes: 0 additions & 11 deletions pmml-sklearn/src/test/resources/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import joblib
import pandas
#import pickle

def load_csv(name):
return pandas.read_csv("csv/" + name + ".csv", na_values = ["N/A", "NA"])
Expand All @@ -25,16 +24,6 @@ def store_mojo(estimator, name):
def store_pkl(obj, name):
joblib.dump(obj, "pkl/" + name + ".pkl", compress = 9)

# Pickle dump
#def store_pkl(obj, name):
# con = open("pkl/" + name, "wb")
# pickle.dump(obj, con, protocol = -1)
# con.close()

def dump(obj):
for attr in dir(obj):
print("obj.%s = %s" % (attr, getattr(obj, attr)))

def load_audit(name, stringify = True):
df = load_csv(name)
print(df.dtypes)
Expand Down
6 changes: 3 additions & 3 deletions pmml-sklearn/src/test/resources/extensions/bspline.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from common import *

from pandas import DataFrame
from scipy.interpolate import make_interp_spline
from scipy.stats import norm
Expand All @@ -9,6 +7,8 @@

import numpy

from common import *

# See https://ndsplines.readthedocs.io/en/latest/auto_examples/1d-interp.html

def gaussian(x):
Expand Down Expand Up @@ -45,4 +45,4 @@ def tanh(x):
store_pkl(pipeline, name)

y = DataFrame(pipeline.predict_transform(X), columns = ["y", "bspline(predict(y))"])
store_csv(y, name)
store_csv(y, name)
5 changes: 3 additions & 2 deletions pmml-sklearn/src/test/resources/extensions/sklearn2pmml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from common import *
import sys

from pandas import DataFrame, Series
from sklearn_pandas import DataFrameMapper
Expand All @@ -19,7 +19,8 @@
from sklearn2pmml.util import Predicate

import numpy
import sys

from common import *

sys.path.append("../")

Expand Down
6 changes: 3 additions & 3 deletions pmml-sklearn/src/test/resources/extensions/temporal.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from common import *

from pandas import DataFrame
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import FeatureUnion, Pipeline
Expand All @@ -9,6 +7,8 @@
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import ExpressionTransformer, DateTimeFormatter, DaysSinceYearTransformer, SecondsSinceYearTransformer

from common import *

df = DataFrame([
["1968-12-21T12:51:00", None, "1968-12-27T15:51:42", True], # Apollo 8
["1969-05-18T16:49:00", None, "1969-05-26T16:52:23", True], # Apollo 10
Expand Down Expand Up @@ -66,4 +66,4 @@ def make_datetime_pipeline():
[([col], make_datetime_pipeline()) for col in ["launch", "return"]]
)

build_apollo(mapper, "DayMonthYearApollo")
build_apollo(mapper, "DayMonthYearApollo")
6 changes: 3 additions & 3 deletions pmml-sklearn/src/test/resources/extensions/text.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from common import *

from sklearn2pmml.feature_extraction.text import Matcher, Splitter

from common import *

stop_words = ["a", "and", "are", "d", "i", "is", "it", "ll", "m", "s", "the", "ve", "we", "you"]

def tokenize(sentiment_df, tokenizer, name):
Expand All @@ -19,4 +19,4 @@ def process(line):
tokenize(sentiment_df, Matcher("(?u)\\b\\w\\w+\\b"), "CountVectorizerSentiment");

tokenize(sentiment_df, Matcher("\\w+"), "MatcherSentiment")
tokenize(sentiment_df, Splitter("\\s+"), "SplitterSentiment")
tokenize(sentiment_df, Splitter("\\s+"), "SplitterSentiment")
8 changes: 5 additions & 3 deletions pmml-sklearn/src/test/resources/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sklearn.ensemble import AdaBoostRegressor, BaggingClassifier, BaggingRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, IsolationForest, RandomForestClassifier, RandomForestRegressor, StackingClassifier, StackingRegressor, VotingClassifier, VotingRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, f_classif, f_regression
from sklearn.feature_selection import f_classif, f_regression
from sklearn.feature_selection import SelectFromModel, SelectKBest, SelectPercentile
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.isotonic import IsotonicRegression
Expand All @@ -24,7 +24,7 @@
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import Binarizer, FunctionTransformer, KBinsDiscretizer, LabelBinarizer, LabelEncoder, MaxAbsScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures, PowerTransformer, RobustScaler, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM, SVC, SVR
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import EstimatorProxy, SelectorProxy
from sklearn2pmml.decoration import Alias, CategoricalDomain, ContinuousDomain, ContinuousDomainEraser, DiscreteDomainEraser, MultiAlias, MultiDomain
from sklearn2pmml.feature_extraction.text import Matcher, Splitter
Expand All @@ -33,7 +33,7 @@
from sklearn2pmml.preprocessing import Aggregator, CastTransformer, ConcatTransformer, CutTransformer, DataFrameConstructor, DaysSinceYearTransformer, ExpressionTransformer, FilterLookupTransformer, LookupTransformer, MatchesTransformer, MultiLookupTransformer, PMMLLabelBinarizer, PMMLLabelEncoder, PowerFunctionTransformer, ReplaceTransformer, SubstringTransformer, StringNormalizer, WordCountTransformer
from sklearn2pmml.util import Slicer
from sklearn_pandas import CategoricalImputer, DataFrameMapper
from xgboost.sklearn import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor
from xgboost.sklearn import XGBClassifier, XGBRegressor

import numpy
import pandas
Expand Down Expand Up @@ -477,6 +477,7 @@ def build_iris_opt(iris_df, classifier, name, fit_params = {}, **pmml_options):
("classifier", classifier)
])
pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params)
pipeline.configure(**pmml_options)
if isinstance(classifier, XGBClassifier):
pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
else:
Expand Down Expand Up @@ -636,6 +637,7 @@ def build_auto_opt(auto_df, regressor, name, fit_params = {}, **pmml_options):
("regressor", regressor)
])
pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params)
pipeline.configure(**pmml_options)
if isinstance(regressor, XGBRegressor):
pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5)
else:
Expand Down

0 comments on commit 3f7757d

Please sign in to comment.