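Fix tests: update TF, Torch, and XGBoost estimators (merge_feature_columns default, checkpoint loading, drop dmatrix_params)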

Zhi Lin · Zhi Lin · commit 5363036ef291 · 2024-04-10T10:29:34.000Z
Signed-off-by: Zhi Lin <zhi.lin@intel.com>
diff --git a/python/raydp/spark/ray_cluster.py b/python/raydp/spark/ray_cluster.py
@@ -125,6 +125,7 @@ def _prepare_spark_configs(self):
 
         raydp_agent_path = os.path.abspath(os.path.join(os.path.abspath(__file__),
                                                         "../../jars/raydp-agent*.jar"))
+        print(raydp_agent_path)
         raydp_agent_jar = glob.glob(raydp_agent_path)[0]
         self._configs[SPARK_JAVAAGENT] = raydp_agent_jar
         # for JVM running in ray
diff --git a/python/raydp/tf/estimator.py b/python/raydp/tf/estimator.py
@@ -43,7 +43,7 @@ def __init__(self,
                  metrics: Union[List[keras.metrics.Metric], List[str]] = None,
                  feature_columns: Union[str, List[str]] = None,
                  label_columns: Union[str, List[str]] = None,
-                 merge_feature_columns: bool = True,
+                 merge_feature_columns: bool = False,
                  batch_size: int = 128,
                  drop_last: bool = False,
                  num_epochs: int = 1,
@@ -211,10 +211,6 @@ def fit(self,
             train_ds = train_ds.random_shuffle()
             if evaluate_ds:
                 evaluate_ds = evaluate_ds.random_shuffle()
-        datasets = {"train": train_ds}
-        if evaluate_ds is not None:
-            train_loop_config["evaluate"] = True
-            datasets["evaluate"] = evaluate_ds
         preprocessor = None
         if self._merge_feature_columns:
             if isinstance(self._feature_columns, list) and len(self._feature_columns) > 1:
@@ -224,6 +220,11 @@ def fit(self,
                 preprocessor = Concatenator(output_column_name="features",
                                             exclude=label_cols)
                 train_loop_config["feature_columns"] = "features"
+                train_ds = preprocessor.fit_transform(train_ds)
+        datasets = {"train": train_ds}
+        if evaluate_ds is not None:
+            train_loop_config["evaluate"] = True
+            datasets["evaluate"] = evaluate_ds
         self._trainer = TensorflowTrainer(TFEstimator.train_func,
                                           train_loop_config=train_loop_config,
                                           scaling_config=scaling_config,
diff --git a/python/raydp/torch/estimator.py b/python/raydp/torch/estimator.py
@@ -378,4 +378,4 @@ def fit_on_spark(self,
 
     def get_model(self):
         assert self._trainer is not None, "Must call fit first"
-        return TorchCheckpoint(self._trained_results.checkpoint).get_model()
+        return TorchCheckpoint(self._trained_results.checkpoint.as_directory()).get_model()
diff --git a/python/raydp/xgboost/estimator.py b/python/raydp/xgboost/estimator.py
@@ -31,7 +31,6 @@ class XGBoostEstimator(EstimatorInterface, SparkEstimatorInterface):
     def __init__(self,
                  xgboost_params: Dict,
                  label_column: str,
-                 dmatrix_params: Dict = None,
                  num_workers: int = 1,
                  resources_per_worker: Optional[Dict[str, float]] = None,
                  shuffle: bool = True):
@@ -41,10 +40,6 @@ def __init__(self,
               for a list of possible parameters.
         :param label_column: Name of the label column. A column with this name
               must be present in the training dataset passed to fit() later.
-        :param dmatrix_params: Dict of ``dataset name:dict of kwargs`` passed to respective
-              :class:`xgboost_ray.RayDMatrix` initializations, which in turn are passed
-              to ``xgboost.DMatrix`` objects created on each worker. For example, this can
-              be used to add sample weights with the ``weights`` parameter.
         :param num_workers: the number of workers to do the distributed training.
         :param resources_per_worker: the resources defined in this Dict will be reserved for
               each worker. The ``CPU`` and ``GPU`` keys (case-sensitive) can be defined to
@@ -53,7 +48,6 @@ def __init__(self,
         """
         self._xgboost_params = xgboost_params
         self._label_column = label_column
-        self._dmatrix_params = dmatrix_params
         self._num_workers = num_workers
         self._resources_per_worker = resources_per_worker
         self._shuffle = shuffle
@@ -76,7 +70,6 @@ def fit(self,
                                 datasets=datasets,
                                 label_column=self._label_column,
                                 params=self._xgboost_params,
-                                dmatrix_params=self._dmatrix_params,
                                 run_config=run_config)
         self._results = trainer.fit()