diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 2ab3d79c1d..d49e4240f9 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -41,7 +41,13 @@ def __init__( namespace: str = utils.get_default_target_namespace(), job_kind: str = constants.PYTORCHJOB_KIND, ): - """TrainingClient constructor. + """TrainingClient constructor. Configure logging in your application + as follows to see detailed information from the TrainingClient APIs: + .. code-block:: python + import logging + logging.basicConfig() + log = logging.getLogger("kubeflow.training.api.training_client") + log.setLevel(logging.DEBUG) Args: config_file: Path to the kube-config file. Defaults to ~/.kube/config. @@ -771,8 +777,8 @@ def get_job_logs( follow: bool = False, timeout: int = constants.DEFAULT_TIMEOUT, ) -> Dict[str, str]: - """Print the training logs for the Job. By default it returns logs from - the `master` pod. + """Get the logs for every Training Job pod. By default it returns logs from + the `master` pod. Logs are returned in this format: { "pod-name": "Log data" }. Args: name: Name for the Job. @@ -796,7 +802,7 @@ def get_job_logs( For PaddleJob one of `master` or `worker`. replica_index: Optional, index for the Job replica. container: Pod container to get the logs. - follow: Whether to follow the log stream of the pod. + follow: Whether to follow the log stream of the pod and print logs to StdOut. timeout: Optional, Kubernetes API server timeout in seconds to execute the request. @@ -843,7 +849,7 @@ def get_job_logs( while True: for index, log_queue in enumerate(log_queue_pool): if all(finished): - return + return logs_dict if finished[index]: continue # grouping the every 50 log lines of the same pod @@ -853,7 +859,14 @@ def get_job_logs( if logline is None: finished[index] = True break + + # Print logs to the StdOut print(f"[Pod {pods[index]}]: {logline}") + # Add logs to the results dict. + if pods[index] not in logs_dict: + logs_dict[pods[index]] = logline + else: + logs_dict[pods[index]] += logline except queue.Empty: break elif pods: diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index b3b7ca1d28..2da525331c 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -321,7 +321,7 @@ def get_pytorchjob_template( ), ) - if num_procs_per_worker > 0: + if num_procs_per_worker: pytorchjob.spec.nproc_per_node = num_procs_per_worker if elastic_policy: pytorchjob.spec.elastic_policy = elastic_policy @@ -334,7 +334,7 @@ def get_pytorchjob_template( template=master_pod_template_spec, ) - if num_worker_replicas >= 1: + if num_worker_replicas: pytorchjob.spec.pytorch_replica_specs[ constants.REPLICA_TYPE_WORKER ] = models.KubeflowOrgV1ReplicaSpec(