diff --git a/pmml-sklearn-extension/src/test/resources/extensions/category_encoders.py b/pmml-sklearn-extension/src/test/resources/extensions/category_encoders.py index 3805954f0..3405e2ab0 100644 --- a/pmml-sklearn-extension/src/test/resources/extensions/category_encoders.py +++ b/pmml-sklearn-extension/src/test/resources/extensions/category_encoders.py @@ -1,3 +1,6 @@ +import os +import sys + from category_encoders import BaseNEncoder, BinaryEncoder, CatBoostEncoder, CountEncoder, LeaveOneOutEncoder, OneHotEncoder, OrdinalEncoder, TargetEncoder, WOEEncoder from mlxtend.preprocessing import DenseTransformer from pandas import DataFrame @@ -11,8 +14,6 @@ from sklearn2pmml.pipeline import PMMLPipeline import numpy -import os -import sys sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/")) @@ -127,4 +128,4 @@ def build_audit(cat_encoder, cont_encoder, classifier, name, **pmml_options): build_audit(LeaveOneOutEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "LeaveOneOutEncoderAuditNA") build_audit(TargetEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "TargetEncoderAuditNA") - build_audit(WOEEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "WOEEncoderAuditNA") \ No newline at end of file + build_audit(WOEEncoder(handle_missing = "value", handle_unknown = "value"), "passthrough", clone(classifier), "WOEEncoderAuditNA") diff --git a/pmml-sklearn-extension/src/test/resources/extensions/imblearn.py b/pmml-sklearn-extension/src/test/resources/extensions/imblearn.py index 159f6b373..98f2b0669 100644 --- a/pmml-sklearn-extension/src/test/resources/extensions/imblearn.py +++ b/pmml-sklearn-extension/src/test/resources/extensions/imblearn.py @@ -1,3 +1,6 @@ +import os +import sys + from imblearn.combine import SMOTEENN, SMOTETomek from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE @@ -12,9 +15,6 @@ from sklearn2pmml.pipeline import PMMLPipeline from sklearn2pmml.preprocessing import ExpressionTransformer -import os -import sys - sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/")) from common import * diff --git a/pmml-sklearn-extension/src/test/resources/extensions/optbinning.py b/pmml-sklearn-extension/src/test/resources/extensions/optbinning.py index b4e1ab0ec..3193e555b 100644 --- a/pmml-sklearn-extension/src/test/resources/extensions/optbinning.py +++ b/pmml-sklearn-extension/src/test/resources/extensions/optbinning.py @@ -1,16 +1,15 @@ -from optbinning import BinningProcess, OptimalBinning, Scorecard +import os +import sys + +from optbinning import BinningProcess, Scorecard from pandas import DataFrame from sklearn_pandas import DataFrameMapper from sklearn.linear_model import HuberRegressor, LinearRegression, LogisticRegression -from sklearn.preprocessing import OneHotEncoder from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn2pmml.decoration import Alias from sklearn2pmml.pipeline import PMMLPipeline from sklearn2pmml.preprocessing import ExpressionTransformer -import os -import sys - sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/")) from common import * diff --git a/pmml-sklearn-extension/src/test/resources/extensions/pycaret.py b/pmml-sklearn-extension/src/test/resources/extensions/pycaret.py index 9254d4fc9..cd7eb8a4a 100644 --- a/pmml-sklearn-extension/src/test/resources/extensions/pycaret.py +++ b/pmml-sklearn-extension/src/test/resources/extensions/pycaret.py @@ -1,3 +1,6 @@ +import os +import sys + from pandas import DataFrame, Int64Dtype, Series from pycaret.classification import ClassificationExperiment from pycaret.clustering import ClusteringExperiment @@ -6,9 +9,6 @@ from sklearn2pmml.pycaret import _escape import numpy -import os -import pycaret -import sys sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/")) @@ -113,4 +113,4 @@ def make_regression(df, estimator, name, **setup_params): for cat_col in cat_cols: auto_df[cat_col] = auto_df[cat_col].astype(Int64Dtype()) - make_regression(auto_df, "lr", "PyCaretAutoNA", feature_selection = True, feature_selection_method = "classic", n_features_to_select = 0.85) \ No newline at end of file + make_regression(auto_df, "lr", "PyCaretAutoNA", feature_selection = True, feature_selection_method = "classic", n_features_to_select = 0.85) diff --git a/pmml-sklearn-extension/src/test/resources/extensions/sklego.py b/pmml-sklearn-extension/src/test/resources/extensions/sklego.py index 8b4a2e829..05586e62a 100644 --- a/pmml-sklearn-extension/src/test/resources/extensions/sklego.py +++ b/pmml-sklearn-extension/src/test/resources/extensions/sklego.py @@ -1,3 +1,6 @@ +import os +import sys + from mlxtend.preprocessing import DenseTransformer from pandas import DataFrame from sklearn.cluster import KMeans @@ -10,17 +13,13 @@ from sklearn.preprocessing import OneHotEncoder from sklearn.svm import LinearSVC, OneClassSVM from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor -from sklearn2pmml.decoration import Alias, CategoricalDomain, ContinuousDomain +from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain from sklearn2pmml.pipeline import PMMLPipeline -from sklearn2pmml.preprocessing import ExpressionTransformer from sklearn2pmml.util import Reshaper from sklego.meta import EstimatorTransformer from sklego.pipeline import DebugPipeline from sklego.preprocessing import IdentityTransformer -import os -import sys - sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/")) from common import * @@ -166,4 +165,4 @@ def build_estimatortransformer_visit(name): docvis = DataFrame(pipeline.predict(visit_X), columns = ["docvis"]) store_csv(docvis, name) -build_estimatortransformer_visit("EstimatorTransformerVisit") \ No newline at end of file +build_estimatortransformer_visit("EstimatorTransformerVisit") diff --git a/pmml-sklearn-extension/src/test/resources/extensions/tpot.py b/pmml-sklearn-extension/src/test/resources/extensions/tpot.py index e2bd4c02c..8791327fd 100644 --- a/pmml-sklearn-extension/src/test/resources/extensions/tpot.py +++ b/pmml-sklearn-extension/src/test/resources/extensions/tpot.py @@ -1,3 +1,6 @@ +import os +import sys + from pandas import DataFrame from sklearn_pandas import DataFrameMapper from sklearn.pipeline import Pipeline @@ -10,8 +13,6 @@ from tpot.config import classifier_config_dict, regressor_config_dict import pandas -import os -import sys sys.path.append(os.path.abspath("../../../../pmml-sklearn/src/test/resources/")) diff --git a/pmml-sklearn-h2o/src/test/resources/main-h2o.py b/pmml-sklearn-h2o/src/test/resources/main-h2o.py index 5104d0971..ba80012c2 100644 --- a/pmml-sklearn-h2o/src/test/resources/main-h2o.py +++ b/pmml-sklearn-h2o/src/test/resources/main-h2o.py @@ -1,3 +1,5 @@ +import sys + from h2o import H2OFrame from h2o.estimators.gbm import H2OGradientBoostingEstimator from h2o.estimators.glm import H2OGeneralizedLinearEstimator @@ -10,7 +12,6 @@ from sklearn_pandas import DataFrameMapper import h2o -import sys sys.path.append("../../../../pmml-sklearn/src/test/resources/") @@ -88,4 +89,4 @@ def build_auto(auto_df, regressor, name): build_auto(auto_df, H2ORandomForestEstimator(distribution = "gaussian", seed = 13), "H2ORandomForestAuto") build_auto(auto_df, H2OXGBoostEstimator(ntrees = 17, seed = 13), "H2OXGBoostAuto") -h2o.shutdown() \ No newline at end of file +h2o.shutdown() diff --git a/pmml-sklearn-lightgbm/src/test/resources/main-lightgbm.py b/pmml-sklearn-lightgbm/src/test/resources/main-lightgbm.py index bc271ae3d..d80823ad0 100644 --- a/pmml-sklearn-lightgbm/src/test/resources/main-lightgbm.py +++ b/pmml-sklearn-lightgbm/src/test/resources/main-lightgbm.py @@ -1,7 +1,7 @@ -from lightgbm import LGBMClassifier, LGBMRegressor - import sys +from lightgbm import LGBMClassifier, LGBMRegressor + sys.path.append("../../../../pmml-sklearn/src/test/resources/") from main import * @@ -43,7 +43,7 @@ def build_audit_cat(audit_df, classifier, name, with_proba = True, fit_params = pipeline.verify(audit_X.sample(frac = 0.05, random_state = 13)) store_pkl(pipeline, name) adjusted = DataFrame(pipeline.predict(audit_X), columns = ["Adjusted"]) - if with_proba == True: + if with_proba: adjusted_proba = DataFrame(pipeline.predict_proba(audit_X), columns = ["probability(0)", "probability(1)"]) adjusted = pandas.concat((adjusted, adjusted_proba), axis = 1) store_csv(adjusted, name) @@ -66,4 +66,4 @@ def build_audit_cat(audit_df, classifier, name, with_proba = True, fit_params = auto_df = load_auto("Auto") auto_X, auto_y = split_csv(auto_df) - build_auto_opt(auto_df, LGBMRegressor(objective = "regression", random_state = 13), "LGBMAuto", fit_params = {"regressor__eval_set" : [(auto_X[auto_test_mask], auto_y[auto_test_mask])], "regressor__eval_metric" : "rmse", "regressor__early_stopping_rounds" : 3}) \ No newline at end of file + build_auto_opt(auto_df, LGBMRegressor(objective = "regression", random_state = 13), "LGBMAuto", fit_params = {"regressor__eval_set" : [(auto_X[auto_test_mask], auto_y[auto_test_mask])], "regressor__eval_metric" : "rmse", "regressor__early_stopping_rounds" : 3}) diff --git a/pmml-sklearn-statsmodels/src/test/resources/main-statsmodels.py b/pmml-sklearn-statsmodels/src/test/resources/main-statsmodels.py index 123b74f9e..3a94abbb9 100644 --- a/pmml-sklearn-statsmodels/src/test/resources/main-statsmodels.py +++ b/pmml-sklearn-statsmodels/src/test/resources/main-statsmodels.py @@ -1,13 +1,12 @@ +import sys + from pandas import DataFrame from sklearn_pandas import DataFrameMapper from sklearn.preprocessing import OneHotEncoder from sklearn2pmml.pipeline import PMMLPipeline from sklearn2pmml.statsmodels import StatsModelsClassifier, StatsModelsRegressor from statsmodels.api import GLM, Logit, MNLogit, OLS, Poisson, WLS - -import statsmodels.genmod.families as families - -import sys +from statsmodels.genmod import families sys.path.append("../../../../pmml-sklearn/src/test/resources/") @@ -118,4 +117,4 @@ def build_visit(visit_df, regressor, name): visit_df = load_visit("Visit") build_visit(visit_df, StatsModelsRegressor(GLM, family = families.Poisson()), "GLMVisit") -build_visit(visit_df, StatsModelsRegressor(Poisson), "PoissonVisit") \ No newline at end of file +build_visit(visit_df, StatsModelsRegressor(Poisson), "PoissonVisit") diff --git a/pmml-sklearn-xgboost/src/test/resources/main-xgboost.py b/pmml-sklearn-xgboost/src/test/resources/main-xgboost.py index f637e812e..aac57843e 100644 --- a/pmml-sklearn-xgboost/src/test/resources/main-xgboost.py +++ b/pmml-sklearn-xgboost/src/test/resources/main-xgboost.py @@ -1,7 +1,7 @@ -from xgboost.sklearn import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor - import sys +from xgboost.sklearn import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor + sys.path.append("../../../../pmml-sklearn/src/test/resources/") from main import * @@ -77,4 +77,4 @@ def build_audit_na_direct(audit_na_df, classifier, name): if "Housing" in datasets: housing_df = load_housing("Housing") - build_housing(housing_df, GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 5, random_state = 13), SGDRegressor(penalty = "elasticnet", random_state = 13)), "XGBRFLMHousing") \ No newline at end of file + build_housing(housing_df, GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 5, random_state = 13), SGDRegressor(penalty = "elasticnet", random_state = 13)), "XGBRFLMHousing") diff --git a/pmml-sklearn/src/test/resources/common.py b/pmml-sklearn/src/test/resources/common.py index d727c4f84..13485c579 100644 --- a/pmml-sklearn/src/test/resources/common.py +++ b/pmml-sklearn/src/test/resources/common.py @@ -1,6 +1,5 @@ import joblib import pandas -#import pickle def load_csv(name): return pandas.read_csv("csv/" + name + ".csv", na_values = ["N/A", "NA"]) @@ -25,16 +24,6 @@ def store_mojo(estimator, name): def store_pkl(obj, name): joblib.dump(obj, "pkl/" + name + ".pkl", compress = 9) -# Pickle dump -#def store_pkl(obj, name): -# con = open("pkl/" + name, "wb") -# pickle.dump(obj, con, protocol = -1) -# con.close() - -def dump(obj): - for attr in dir(obj): - print("obj.%s = %s" % (attr, getattr(obj, attr))) - def load_audit(name, stringify = True): df = load_csv(name) print(df.dtypes) diff --git a/pmml-sklearn/src/test/resources/extensions/bspline.py b/pmml-sklearn/src/test/resources/extensions/bspline.py index a4d60c045..528f05123 100644 --- a/pmml-sklearn/src/test/resources/extensions/bspline.py +++ b/pmml-sklearn/src/test/resources/extensions/bspline.py @@ -1,5 +1,3 @@ -from common import * - from pandas import DataFrame from scipy.interpolate import make_interp_spline from scipy.stats import norm @@ -9,6 +7,8 @@ import numpy +from common import * + # See https://ndsplines.readthedocs.io/en/latest/auto_examples/1d-interp.html def gaussian(x): @@ -45,4 +45,4 @@ def tanh(x): store_pkl(pipeline, name) y = DataFrame(pipeline.predict_transform(X), columns = ["y", "bspline(predict(y))"]) - store_csv(y, name) \ No newline at end of file + store_csv(y, name) diff --git a/pmml-sklearn/src/test/resources/extensions/sklearn2pmml.py b/pmml-sklearn/src/test/resources/extensions/sklearn2pmml.py index 5e35269ff..9e1a75523 100644 --- a/pmml-sklearn/src/test/resources/extensions/sklearn2pmml.py +++ b/pmml-sklearn/src/test/resources/extensions/sklearn2pmml.py @@ -1,4 +1,4 @@ -from common import * +import sys from pandas import DataFrame, Series from sklearn_pandas import DataFrameMapper @@ -19,7 +19,8 @@ from sklearn2pmml.util import Predicate import numpy -import sys + +from common import * sys.path.append("../") diff --git a/pmml-sklearn/src/test/resources/extensions/temporal.py b/pmml-sklearn/src/test/resources/extensions/temporal.py index 94b01d970..4aa7765d4 100644 --- a/pmml-sklearn/src/test/resources/extensions/temporal.py +++ b/pmml-sklearn/src/test/resources/extensions/temporal.py @@ -1,5 +1,3 @@ -from common import * - from pandas import DataFrame from sklearn_pandas import DataFrameMapper from sklearn.pipeline import FeatureUnion, Pipeline @@ -9,6 +7,8 @@ from sklearn2pmml.pipeline import PMMLPipeline from sklearn2pmml.preprocessing import ExpressionTransformer, DateTimeFormatter, DaysSinceYearTransformer, SecondsSinceYearTransformer +from common import * + df = DataFrame([ ["1968-12-21T12:51:00", None, "1968-12-27T15:51:42", True], # Apollo 8 ["1969-05-18T16:49:00", None, "1969-05-26T16:52:23", True], # Apollo 10 @@ -66,4 +66,4 @@ def make_datetime_pipeline(): [([col], make_datetime_pipeline()) for col in ["launch", "return"]] ) -build_apollo(mapper, "DayMonthYearApollo") \ No newline at end of file +build_apollo(mapper, "DayMonthYearApollo") diff --git a/pmml-sklearn/src/test/resources/extensions/text.py b/pmml-sklearn/src/test/resources/extensions/text.py index ab1107bdc..46db93164 100644 --- a/pmml-sklearn/src/test/resources/extensions/text.py +++ b/pmml-sklearn/src/test/resources/extensions/text.py @@ -1,7 +1,7 @@ -from common import * - from sklearn2pmml.feature_extraction.text import Matcher, Splitter +from common import * + stop_words = ["a", "and", "are", "d", "i", "is", "it", "ll", "m", "s", "the", "ve", "we", "you"] def tokenize(sentiment_df, tokenizer, name): @@ -19,4 +19,4 @@ def process(line): tokenize(sentiment_df, Matcher("(?u)\\b\\w\\w+\\b"), "CountVectorizerSentiment"); tokenize(sentiment_df, Matcher("\\w+"), "MatcherSentiment") -tokenize(sentiment_df, Splitter("\\s+"), "SplitterSentiment") \ No newline at end of file +tokenize(sentiment_df, Splitter("\\s+"), "SplitterSentiment") diff --git a/pmml-sklearn/src/test/resources/main.py b/pmml-sklearn/src/test/resources/main.py index 949f4613e..bff9728b2 100644 --- a/pmml-sklearn/src/test/resources/main.py +++ b/pmml-sklearn/src/test/resources/main.py @@ -9,7 +9,7 @@ from sklearn.ensemble import AdaBoostRegressor, BaggingClassifier, BaggingRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, IsolationForest, RandomForestClassifier, RandomForestRegressor, StackingClassifier, StackingRegressor, VotingClassifier, VotingRegressor from sklearn.feature_extraction import DictVectorizer from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.feature_selection import chi2, f_classif, f_regression +from sklearn.feature_selection import f_classif, f_regression from sklearn.feature_selection import SelectFromModel, SelectKBest, SelectPercentile from sklearn.impute import MissingIndicator, SimpleImputer from sklearn.isotonic import IsotonicRegression @@ -24,7 +24,7 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.preprocessing import Binarizer, FunctionTransformer, KBinsDiscretizer, LabelBinarizer, LabelEncoder, MaxAbsScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, PolynomialFeatures, PowerTransformer, RobustScaler, StandardScaler from sklearn.svm import LinearSVC, LinearSVR, NuSVC, NuSVR, OneClassSVM, SVC, SVR -from sklearn2pmml import make_pmml_pipeline, sklearn2pmml +from sklearn2pmml import make_pmml_pipeline from sklearn2pmml import EstimatorProxy, SelectorProxy from sklearn2pmml.decoration import Alias, CategoricalDomain, ContinuousDomain, ContinuousDomainEraser, DiscreteDomainEraser, MultiAlias, MultiDomain from sklearn2pmml.feature_extraction.text import Matcher, Splitter @@ -33,7 +33,7 @@ from sklearn2pmml.preprocessing import Aggregator, CastTransformer, ConcatTransformer, CutTransformer, DataFrameConstructor, DaysSinceYearTransformer, ExpressionTransformer, FilterLookupTransformer, LookupTransformer, MatchesTransformer, MultiLookupTransformer, PMMLLabelBinarizer, PMMLLabelEncoder, PowerFunctionTransformer, ReplaceTransformer, SubstringTransformer, StringNormalizer, WordCountTransformer from sklearn2pmml.util import Slicer from sklearn_pandas import CategoricalImputer, DataFrameMapper -from xgboost.sklearn import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor +from xgboost.sklearn import XGBClassifier, XGBRegressor import numpy import pandas @@ -477,6 +477,7 @@ def build_iris_opt(iris_df, classifier, name, fit_params = {}, **pmml_options): ("classifier", classifier) ]) pipeline.fit(iris_X[iris_train_mask], iris_y[iris_train_mask], **fit_params) + pipeline.configure(**pmml_options) if isinstance(classifier, XGBClassifier): pipeline.verify(iris_X.sample(frac = 0.10, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5) else: @@ -636,6 +637,7 @@ def build_auto_opt(auto_df, regressor, name, fit_params = {}, **pmml_options): ("regressor", regressor) ]) pipeline.fit(auto_X[auto_train_mask], auto_y[auto_train_mask], **fit_params) + pipeline.configure(**pmml_options) if isinstance(regressor, XGBRegressor): pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), precision = 1e-5, zeroThreshold = 1e-5) else: