From 7c8adaa7aa1c504e42048a30aa9505f593dcef92 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 13:43:26 -0700 Subject: [PATCH 01/17] testing current logs --- benchmarks/auto_benchmark.py | 6 +++++ benchmarks/benchmark_config_gpu.yaml | 25 +++++++++---------- .../bert_multi_gpu_better_transformer.yaml | 3 ++- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index c0d132dd2a..6b14f9a04d 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -212,6 +212,12 @@ def run_benchmark(bm_config): # generate stats metrics from ab_report.csv bm_model = model_json_config[0 : -len(".json")] + # bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) + # os.makedirs(bm_model_log_path, exist_ok=True) + + # cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) + # execute(cmd, wait=True) + gen_metrics_json.gen_metric( "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index 7e8969945e..e31a5c4da7 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -3,18 +3,18 @@ # - nightly: "2022.3.16" # - release: "0.5.3" # Nightly build will be installed if "ts_version" is not specifiged -#ts_version: -# branch: &ts_version "master" +ts_version: + branch: &ts_version "ci_logs" # a list of model configure yaml files defined in benchmarks/models_config # or a list of model configure yaml files with full path models: - "bert_multi_gpu.yaml" - "bert_multi_gpu_better_transformer.yaml" - - "bert_multi_gpu_no_better_transformer.yaml" - - "fastrcnn.yaml" - - "mnist.yaml" - - "vgg16.yaml" + # - "bert_multi_gpu_no_better_transformer.yaml" + # - "fastrcnn.yaml" + # - "mnist.yaml" + # - "vgg16.yaml" # - "wf_dog_breed.yaml" # benchmark on "cpu" or "gpu". @@ -28,11 +28,11 @@ hardware: &hardware "gpu" # - keep the values order as the same as the command definition. # - set up the command before enabling `metrics_cmd`. # For example, aws client and AWS credentials need to be setup before trying this example. -metrics_cmd: - - "cmd": "aws cloudwatch put-metric-data" - - "--namespace": ["torchserve_benchmark_nightly_", *hardware] - - "--region": "us-east-2" - - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' +# metrics_cmd: +# - "cmd": "aws cloudwatch put-metric-data" +# - "--namespace": ["torchserve_benchmark_nightly_", *hardware] +# - "--region": "us-east-2" +# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' # load report to remote storage or local different path if "report_cmd" is set. # the command line to load report to remote storage. @@ -48,5 +48,4 @@ metrics_cmd: report_cmd: - "cmd": "aws s3 cp --recursive" - "source": '/tmp/ts_benchmark/' - - "dest": ['s3://torchserve-benchmark/nightly', "today()", *hardware] - + - "dest": ['s3://torchserve-benchmark/nightly', "today()", "test", *hardware] diff --git a/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml b/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml index c476aad5eb..6ec677088c 100644 --- a/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml +++ b/benchmarks/models_config/bert_multi_gpu_better_transformer.yaml @@ -2,7 +2,8 @@ bert_bt: eager_mode: benchmark_engine: "ab" - url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar + # url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-BT.mar + url: https://torchserve.pytorch.org/mar_files/BERTSeqClassification-ERROR.mar workers: - 4 batch_delay: 100 From 9256fb8c28eafa4bd05fa5541392fd22c39df5b1 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 13:44:40 -0700 Subject: [PATCH 02/17] testing current logs --- .github/workflows/benchmark_nightly.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml index b2da7cbb5b..df86056cd2 100644 --- a/.github/workflows/benchmark_nightly.yml +++ b/.github/workflows/benchmark_nightly.yml @@ -2,15 +2,18 @@ name: Benchmark torchserve nightly on: # run every day at 2:15am - schedule: - - cron: '15 02 * * *' + # schedule: + # - cron: '15 02 * * *' + push: + branches: + - "ci_logs" jobs: nightly: strategy: fail-fast: false matrix: - hardware: [cpu, gpu, inf2] + hardware: [gpu] #[cpu, gpu, inf2] runs-on: - self-hosted - ${{ matrix.hardware }} From 627c8ca1294716e1cdbe1d40ffd4172a2395b4c5 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 13:49:34 -0700 Subject: [PATCH 03/17] moving log publish --- benchmarks/auto_benchmark.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 6b14f9a04d..505d8d899e 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -212,11 +212,11 @@ def run_benchmark(bm_config): # generate stats metrics from ab_report.csv bm_model = model_json_config[0 : -len(".json")] - # bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) - # os.makedirs(bm_model_log_path, exist_ok=True) + bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) + os.makedirs(bm_model_log_path, exist_ok=True) - # cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) - # execute(cmd, wait=True) + cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) + execute(cmd, wait=True) gen_metrics_json.gen_metric( "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), @@ -228,8 +228,8 @@ def run_benchmark(bm_config): execute(bm_config["metrics_cmd"], wait=True) # cp benchmark logs to local - bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) - os.makedirs(bm_model_log_path, exist_ok=True) + # bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) + # os.makedirs(bm_model_log_path, exist_ok=True) csv_file = "{}/ab_report.csv".format(BENCHMARK_TMP_PATH) if os.path.exists(csv_file): shutil.move(csv_file, bm_model_log_path) @@ -238,8 +238,8 @@ def run_benchmark(bm_config): ) execute(cmd, wait=True) - cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) - execute(cmd, wait=True) + # cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) + # execute(cmd, wait=True) print("finish benchmark {}".format(bm_model)) # generate final report From 1137bec7c39c535e4000bb6c5f95dc89f367b941 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 13:58:32 -0700 Subject: [PATCH 04/17] moving log publish --- benchmarks/auto_benchmark.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 505d8d899e..c2b9512ed1 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -218,10 +218,15 @@ def run_benchmark(bm_config): cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) execute(cmd, wait=True) - gen_metrics_json.gen_metric( - "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), - "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), - ) + try: + gen_metrics_json.gen_metric( + "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), + "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), + ) + except Exception as e: + print(f"An error occurred: {e}") + if "report_cmd" in bm_config: + execute(bm_config["report_cmd"], wait=True) # load stats metrics to remote metrics storage if "metrics_cmd" in bm_config: From fc337cc762d90c1938b4c8f330a4308aa4975fa3 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 14:38:14 -0700 Subject: [PATCH 05/17] testing publish --- benchmarks/auto_benchmark.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index c2b9512ed1..78e7e7b981 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -11,6 +11,7 @@ MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config" BENCHMARK_TMP_PATH = "/tmp/benchmark" BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark" +BENCHMARK_REPORT_PATH_TEST = "/tmp/ts_benchmark/fail" TS_LOGS_PATH = CWD + "/logs" MODEL_STORE = "/tmp/model_store" WF_STORE = "/tmp/wf_store" @@ -212,18 +213,20 @@ def run_benchmark(bm_config): # generate stats metrics from ab_report.csv bm_model = model_json_config[0 : -len(".json")] - bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) - os.makedirs(bm_model_log_path, exist_ok=True) - - cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) - execute(cmd, wait=True) - try: gen_metrics_json.gen_metric( "{}/ab_report.csv".format(BENCHMARK_TMP_PATH), "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), ) except Exception as e: + bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH_TEST, bm_model) + os.makedirs(bm_model_log_path, exist_ok=True) + + cmd = "tar -cvzf {}/logs.tar.gz {}".format( + bm_model_log_path, TS_LOGS_PATH + ) + execute(cmd, wait=True) + print(f"An error occurred: {e}") if "report_cmd" in bm_config: execute(bm_config["report_cmd"], wait=True) @@ -233,8 +236,8 @@ def run_benchmark(bm_config): execute(bm_config["metrics_cmd"], wait=True) # cp benchmark logs to local - # bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) - # os.makedirs(bm_model_log_path, exist_ok=True) + bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) + os.makedirs(bm_model_log_path, exist_ok=True) csv_file = "{}/ab_report.csv".format(BENCHMARK_TMP_PATH) if os.path.exists(csv_file): shutil.move(csv_file, bm_model_log_path) @@ -243,8 +246,8 @@ def run_benchmark(bm_config): ) execute(cmd, wait=True) - # cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) - # execute(cmd, wait=True) + cmd = "tar -cvzf {}/logs.tar.gz {}".format(bm_model_log_path, TS_LOGS_PATH) + execute(cmd, wait=True) print("finish benchmark {}".format(bm_model)) # generate final report From 6b5d94ca54866e33f78e674b8a8402f1b280238d Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 14:53:58 -0700 Subject: [PATCH 06/17] adding skips --- benchmarks/auto_benchmark.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 78e7e7b981..7a3b44a2f5 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -231,6 +231,8 @@ def run_benchmark(bm_config): if "report_cmd" in bm_config: execute(bm_config["report_cmd"], wait=True) + continue + # load stats metrics to remote metrics storage if "metrics_cmd" in bm_config: execute(bm_config["metrics_cmd"], wait=True) From d46a956448e9f64bb9ed8f02c6914557e62a6584 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 15:55:18 -0700 Subject: [PATCH 07/17] testing success --- benchmarks/benchmark_config_gpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index e31a5c4da7..b98ce958af 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -10,7 +10,7 @@ ts_version: # or a list of model configure yaml files with full path models: - "bert_multi_gpu.yaml" - - "bert_multi_gpu_better_transformer.yaml" + # - "bert_multi_gpu_better_transformer.yaml" # - "bert_multi_gpu_no_better_transformer.yaml" # - "fastrcnn.yaml" # - "mnist.yaml" From 8c3d6ad1c929a5c7a7f7f9d4dea31e6e7cb14b46 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 19:04:02 -0700 Subject: [PATCH 08/17] compile test --- .github/workflows/benchmark_nightly.yml | 10 +++++----- .github/workflows/benchmark_torch_compile_nightly.yml | 5 +++-- benchmarks/auto_benchmark.py | 5 +++++ benchmarks/benchmark_config_gpu.yaml | 2 +- benchmarks/benchmark_config_torch_compile_gpu.yaml | 4 ++-- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml index df86056cd2..589615c4cc 100644 --- a/.github/workflows/benchmark_nightly.yml +++ b/.github/workflows/benchmark_nightly.yml @@ -2,11 +2,11 @@ name: Benchmark torchserve nightly on: # run every day at 2:15am - # schedule: - # - cron: '15 02 * * *' - push: - branches: - - "ci_logs" + schedule: + - cron: '15 02 * * *' + # push: + # branches: + # - "ci_logs" jobs: nightly: diff --git a/.github/workflows/benchmark_torch_compile_nightly.yml b/.github/workflows/benchmark_torch_compile_nightly.yml index 310a7a5d00..e9e6fb450d 100644 --- a/.github/workflows/benchmark_torch_compile_nightly.yml +++ b/.github/workflows/benchmark_torch_compile_nightly.yml @@ -2,8 +2,9 @@ name: Benchmark torch.compile models nightly on: # run every day at 9:15pm - schedule: - - cron: '15 21 * * *' + push: + branches: + - "ci_logs" jobs: nightly: diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 7a3b44a2f5..c2e8a4af18 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -159,7 +159,12 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly): cmd = "python ts_scripts/install_dependencies.py --environment dev" if nightly: cmd += " --nightly_torch" + + print("-------------TESTTTTT------") + print(cmd) + execute(cmd, wait=True) + print("successfully install install_dependencies.py") # install torchserve diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index b98ce958af..e31a5c4da7 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -10,7 +10,7 @@ ts_version: # or a list of model configure yaml files with full path models: - "bert_multi_gpu.yaml" - # - "bert_multi_gpu_better_transformer.yaml" + - "bert_multi_gpu_better_transformer.yaml" # - "bert_multi_gpu_no_better_transformer.yaml" # - "fastrcnn.yaml" # - "mnist.yaml" diff --git a/benchmarks/benchmark_config_torch_compile_gpu.yaml b/benchmarks/benchmark_config_torch_compile_gpu.yaml index 97cf53351f..e4fbd5e2b5 100644 --- a/benchmarks/benchmark_config_torch_compile_gpu.yaml +++ b/benchmarks/benchmark_config_torch_compile_gpu.yaml @@ -3,8 +3,8 @@ # - nightly: "2022.3.16" # - release: "0.5.3" # Nightly build will be installed if "ts_version" is not specifiged -#ts_version: -# branch: &ts_version "master" +ts_version: + branch: &ts_version "ci_logs" # a list of model configure yaml files defined in benchmarks/models_config # or a list of model configure yaml files with full path From 83a18887b49ae165e28654ea3d8d4d990c7ee951 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 19:16:46 -0700 Subject: [PATCH 09/17] adding test --- benchmarks/auto_benchmark.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index c2e8a4af18..a66d470153 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -159,10 +159,6 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly): cmd = "python ts_scripts/install_dependencies.py --environment dev" if nightly: cmd += " --nightly_torch" - - print("-------------TESTTTTT------") - print(cmd) - execute(cmd, wait=True) print("successfully install install_dependencies.py") @@ -338,7 +334,9 @@ def main(): else False ) bm_config = load_benchmark_config(arguments.input, skip_ts_config, skip_upload) + print("-------------TESTTTTT------") benchmark_env_setup(bm_config, skip_ts_config, nightly) + print("-------------TESTTTTT 3------") run_benchmark(bm_config) clean_up_benchmark_env(bm_config) print("benchmark_serving.sh finished successfully.") From 5895e0bef7ec1441fbd80b1d27d104a2812d9452 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 19:35:00 -0700 Subject: [PATCH 10/17] removing nightly --- .github/workflows/benchmark_torch_compile_nightly.yml | 2 +- benchmarks/auto_benchmark.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark_torch_compile_nightly.yml b/.github/workflows/benchmark_torch_compile_nightly.yml index e9e6fb450d..d79cb41fd7 100644 --- a/.github/workflows/benchmark_torch_compile_nightly.yml +++ b/.github/workflows/benchmark_torch_compile_nightly.yml @@ -45,4 +45,4 @@ jobs: chmod +x benchmarks/benchmark_model_dependencies.sh source benchmarks/benchmark_model_dependencies.sh - name: Benchmark gpu nightly - run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_torch_compile_gpu.yaml --skip false --nightly True + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_torch_compile_gpu.yaml --skip false #--nightly True diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index a66d470153..f1281d1c43 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -137,8 +137,8 @@ def install_torchserve(skip_ts_install, hw, ts_version, nightly): return # git checkout branch if it is needed - cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase" - execute(cmd, wait=True) + # cmd = "git checkout master && git reset --hard && git clean -dffx . && git pull --rebase" + # execute(cmd, wait=True) print("successfully reset git") ts_install_cmd = None @@ -334,9 +334,7 @@ def main(): else False ) bm_config = load_benchmark_config(arguments.input, skip_ts_config, skip_upload) - print("-------------TESTTTTT------") benchmark_env_setup(bm_config, skip_ts_config, nightly) - print("-------------TESTTTTT 3------") run_benchmark(bm_config) clean_up_benchmark_env(bm_config) print("benchmark_serving.sh finished successfully.") From 294dcb82d1c7fdffc918d4f39de94bf4f855a918 Mon Sep 17 00:00:00 2001 From: udaij12 Date: Wed, 5 Jun 2024 21:25:29 -0700 Subject: [PATCH 11/17] adding nightly flag back --- .github/workflows/benchmark_torch_compile_nightly.yml | 2 +- ts_scripts/install_dependencies.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark_torch_compile_nightly.yml b/.github/workflows/benchmark_torch_compile_nightly.yml index d79cb41fd7..e9e6fb450d 100644 --- a/.github/workflows/benchmark_torch_compile_nightly.yml +++ b/.github/workflows/benchmark_torch_compile_nightly.yml @@ -45,4 +45,4 @@ jobs: chmod +x benchmarks/benchmark_model_dependencies.sh source benchmarks/benchmark_model_dependencies.sh - name: Benchmark gpu nightly - run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_torch_compile_gpu.yaml --skip false #--nightly True + run: python benchmarks/auto_benchmark.py --input benchmarks/benchmark_config_torch_compile_gpu.yaml --skip false --nightly True diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py index 9e62f3b824..0a28e5d1ef 100644 --- a/ts_scripts/install_dependencies.py +++ b/ts_scripts/install_dependencies.py @@ -141,7 +141,7 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly) f"pip3 install numpy --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/{pt_nightly}" ) os.system( - f"pip3 install --pre torchtext --index-url https://download.pytorch.org/whl/nightly/cpu" + f"pip3 install --pre torchtext --index-url https://download.pytorch.org/whl/nightly/cpu --no-deps" ) else: self.install_torch_packages(cuda_version) From 255ca3f60d7e1eb7e1f37fc7afcc6cebf3434ddf Mon Sep 17 00:00:00 2001 From: udaij12 Date: Thu, 6 Jun 2024 12:01:30 -0700 Subject: [PATCH 12/17] testing torchtext --- ts_scripts/install_dependencies.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ts_scripts/install_dependencies.py b/ts_scripts/install_dependencies.py index 0a28e5d1ef..0928ad45bd 100644 --- a/ts_scripts/install_dependencies.py +++ b/ts_scripts/install_dependencies.py @@ -138,11 +138,11 @@ def install_python_packages(self, cuda_version, requirements_file_path, nightly) if nightly: pt_nightly = "cpu" if not cuda_version else cuda_version os.system( - f"pip3 install numpy --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/{pt_nightly}" - ) - os.system( - f"pip3 install --pre torchtext --index-url https://download.pytorch.org/whl/nightly/cpu --no-deps" + f"pip3 install numpy --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/{pt_nightly}" ) + # os.system( + # f"pip3 install --pre torchtext --index-url https://download.pytorch.org/whl/nightly/cpu --no-deps" + # ) else: self.install_torch_packages(cuda_version) From 10655dceea7d0902cdb0cffdf8cae23c2ca96a6e Mon Sep 17 00:00:00 2001 From: udai Date: Tue, 23 Jul 2024 17:34:11 +0000 Subject: [PATCH 13/17] testing gpu benchmark not torch compile --- .github/workflows/benchmark_nightly.yml | 10 +++++----- .../workflows/benchmark_torch_compile_nightly.yml | 8 +++++--- benchmarks/benchmark_config_torch_compile_gpu.yaml | 14 +++++++------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/workflows/benchmark_nightly.yml b/.github/workflows/benchmark_nightly.yml index 589615c4cc..df86056cd2 100644 --- a/.github/workflows/benchmark_nightly.yml +++ b/.github/workflows/benchmark_nightly.yml @@ -2,11 +2,11 @@ name: Benchmark torchserve nightly on: # run every day at 2:15am - schedule: - - cron: '15 02 * * *' - # push: - # branches: - # - "ci_logs" + # schedule: + # - cron: '15 02 * * *' + push: + branches: + - "ci_logs" jobs: nightly: diff --git a/.github/workflows/benchmark_torch_compile_nightly.yml b/.github/workflows/benchmark_torch_compile_nightly.yml index e9e6fb450d..f11152994c 100644 --- a/.github/workflows/benchmark_torch_compile_nightly.yml +++ b/.github/workflows/benchmark_torch_compile_nightly.yml @@ -2,9 +2,11 @@ name: Benchmark torch.compile models nightly on: # run every day at 9:15pm - push: - branches: - - "ci_logs" + schedule: + - cron: '15 02 * * *' + # push: + # branches: + # - "ci_logs" jobs: nightly: diff --git a/benchmarks/benchmark_config_torch_compile_gpu.yaml b/benchmarks/benchmark_config_torch_compile_gpu.yaml index e4fbd5e2b5..57fc3124ce 100644 --- a/benchmarks/benchmark_config_torch_compile_gpu.yaml +++ b/benchmarks/benchmark_config_torch_compile_gpu.yaml @@ -10,8 +10,8 @@ ts_version: # or a list of model configure yaml files with full path models: - "bert_torch_compile_gpu.yaml" - - "resnet50_torch_compile_gpu.yaml" - - "sam_fast_torch_compile_gpu_best_latency.yaml" + # - "resnet50_torch_compile_gpu.yaml" + # - "sam_fast_torch_compile_gpu_best_latency.yaml" # benchmark on "cpu" or "gpu". # "cpu" is set if "hardware" is not specified @@ -24,11 +24,11 @@ hardware: &hardware "gpu" # - keep the values order as the same as the command definition. # - set up the command before enabling `metrics_cmd`. # For example, aws client and AWS credentials need to be setup before trying this example. -metrics_cmd: - - "cmd": "aws cloudwatch put-metric-data" - - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware] - - "--region": "us-east-2" - - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' +# metrics_cmd: +# - "cmd": "aws cloudwatch put-metric-data" +# - "--namespace": ["torchserve_benchmark_nightly_torch_compile_", *hardware] +# - "--region": "us-east-2" +# - "--metric-data": 'file:///tmp/benchmark/logs/stats_metrics.json' # load report to remote storage or local different path if "report_cmd" is set. # the command line to load report to remote storage. From a346628768a70519faf6152db6e0026c6e13737f Mon Sep 17 00:00:00 2001 From: udai Date: Tue, 23 Jul 2024 17:56:16 +0000 Subject: [PATCH 14/17] testing new report --- benchmarks/auto_benchmark.py | 7 +++---- benchmarks/benchmark_config_gpu.yaml | 4 ++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index f1281d1c43..2ad5ba1c51 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -11,7 +11,6 @@ MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config" BENCHMARK_TMP_PATH = "/tmp/benchmark" BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark" -BENCHMARK_REPORT_PATH_TEST = "/tmp/ts_benchmark/fail" TS_LOGS_PATH = CWD + "/logs" MODEL_STORE = "/tmp/model_store" WF_STORE = "/tmp/wf_store" @@ -220,7 +219,7 @@ def run_benchmark(bm_config): "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), ) except Exception as e: - bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH_TEST, bm_model) + bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) os.makedirs(bm_model_log_path, exist_ok=True) cmd = "tar -cvzf {}/logs.tar.gz {}".format( @@ -229,8 +228,8 @@ def run_benchmark(bm_config): execute(cmd, wait=True) print(f"An error occurred: {e}") - if "report_cmd" in bm_config: - execute(bm_config["report_cmd"], wait=True) + if "error_report_cmd" in bm_config: + execute(bm_config["error_report_cmd"], wait=True) continue diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index 5d8bc35b27..5a84996bfe 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -51,3 +51,7 @@ report_cmd: - "source": '/tmp/ts_benchmark/' - "dest": ['s3://torchserve-benchmark/nightly', "today()", "test", *hardware] +error_report_cmd: + - "cmd": "aws s3 cp --recursive" + - "source": '/tmp/ts_benchmark/' + - "dest": ['s3://torchserve-benchmark/nightly', "today()", "failure", *hardware] From 0ef64cfc8c882eba202619b3f95c9717ebd4e176 Mon Sep 17 00:00:00 2001 From: udai Date: Tue, 23 Jul 2024 18:39:28 +0000 Subject: [PATCH 15/17] testing --- benchmarks/auto_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index 2ad5ba1c51..cb32b5ecdd 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -11,6 +11,7 @@ MODEL_JSON_CONFIG_PATH = CWD + "/model_json_config" BENCHMARK_TMP_PATH = "/tmp/benchmark" BENCHMARK_REPORT_PATH = "/tmp/ts_benchmark" +BENCHMARK_REPORT_PATH_TEST = "/tmp/ts_benchmark/fail" TS_LOGS_PATH = CWD + "/logs" MODEL_STORE = "/tmp/model_store" WF_STORE = "/tmp/wf_store" @@ -219,7 +220,7 @@ def run_benchmark(bm_config): "{}/logs/stats_metrics.json".format(BENCHMARK_TMP_PATH), ) except Exception as e: - bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH, bm_model) + bm_model_log_path = "{}/{}".format(BENCHMARK_REPORT_PATH_TEST, bm_model) os.makedirs(bm_model_log_path, exist_ok=True) cmd = "tar -cvzf {}/logs.tar.gz {}".format( From 36374176043143f011708f63f3062650873d8881 Mon Sep 17 00:00:00 2001 From: udai Date: Tue, 23 Jul 2024 18:59:22 +0000 Subject: [PATCH 16/17] testing --- benchmarks/auto_benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index cb32b5ecdd..f1281d1c43 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -229,8 +229,8 @@ def run_benchmark(bm_config): execute(cmd, wait=True) print(f"An error occurred: {e}") - if "error_report_cmd" in bm_config: - execute(bm_config["error_report_cmd"], wait=True) + if "report_cmd" in bm_config: + execute(bm_config["report_cmd"], wait=True) continue From 9ac5400bb2510ac06f25fdf81a19f28c58083774 Mon Sep 17 00:00:00 2001 From: udai Date: Tue, 23 Jul 2024 20:42:01 +0000 Subject: [PATCH 17/17] testing non continue --- benchmarks/auto_benchmark.py | 2 -- benchmarks/benchmark_config_gpu.yaml | 5 ----- 2 files changed, 7 deletions(-) diff --git a/benchmarks/auto_benchmark.py b/benchmarks/auto_benchmark.py index f1281d1c43..54aa935a08 100644 --- a/benchmarks/auto_benchmark.py +++ b/benchmarks/auto_benchmark.py @@ -232,8 +232,6 @@ def run_benchmark(bm_config): if "report_cmd" in bm_config: execute(bm_config["report_cmd"], wait=True) - continue - # load stats metrics to remote metrics storage if "metrics_cmd" in bm_config: execute(bm_config["metrics_cmd"], wait=True) diff --git a/benchmarks/benchmark_config_gpu.yaml b/benchmarks/benchmark_config_gpu.yaml index 5a84996bfe..8d02417b42 100644 --- a/benchmarks/benchmark_config_gpu.yaml +++ b/benchmarks/benchmark_config_gpu.yaml @@ -50,8 +50,3 @@ report_cmd: - "cmd": "aws s3 cp --recursive" - "source": '/tmp/ts_benchmark/' - "dest": ['s3://torchserve-benchmark/nightly', "today()", "test", *hardware] - -error_report_cmd: - - "cmd": "aws s3 cp --recursive" - - "source": '/tmp/ts_benchmark/' - - "dest": ['s3://torchserve-benchmark/nightly', "today()", "failure", *hardware]