#!/usr/bin/env python3
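"""Upload vLLM benchmark results to S3.

Loads per-file JSON benchmark results, attaches git and runner metadata to
each record, and uploads them to an S3 bucket in JSONEachRow format.

Illustrative invocation (the script name and paths below are assumptions,
not taken from the source):

    python upload_benchmark_results.py --vllm ./vllm --benchmark-results ./results --dry-run
"""
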
import glob
import gzip
import json
import logging
import os
import platform
import socket
import time
from argparse import Action, ArgumentParser, Namespace
from logging import info, warning
from typing import Any, Dict, List, Optional, Tuple

import boto3
import psutil
import torch
from git import Repo

logging.basicConfig(level=logging.INFO)


REPO = "vllm-project/vllm"


class ValidateDir(Action):
    """Argparse action that checks the provided value is an existing directory."""

    def __call__(
        self,
        parser: ArgumentParser,
        namespace: Namespace,
        values: Any,
        option_string: Optional[str] = None,
    ) -> None:
        if os.path.isdir(values):
            setattr(namespace, self.dest, values)
            return

        parser.error(f"{values} is not a valid directory")


def parse_args() -> Namespace:
    parser = ArgumentParser(description="Upload vLLM benchmark results to S3")
    parser.add_argument(
        "--vllm",
        type=str,
        required=True,
        action=ValidateDir,
        help="the directory where the vllm repo is checked out",
    )
    parser.add_argument(
        "--benchmark-results",
        type=str,
        required=True,
        action=ValidateDir,
        help="the directory with the benchmark results",
    )
    parser.add_argument(
        "--s3-bucket",
        type=str,
        required=False,
        default="ossci-benchmarks",
        help="the S3 bucket to upload the benchmark results to",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="print the upload destination without uploading to S3",
    )

    return parser.parse_args()


def get_git_metadata(vllm_dir: str) -> Tuple[str, str]:
    """Return the checked-out branch name and commit SHA of the vllm repo."""
    repo = Repo(vllm_dir)
    return repo.active_branch.name, repo.head.object.hexsha


def get_benchmark_metadata(head_branch: str, head_sha: str) -> Dict[str, Any]:
    """Build the common metadata attached to every benchmark record."""
    timestamp = int(time.time())
    return {
        "timestamp": timestamp,
        "schema_version": "v3",
        "name": "vLLM benchmark",
        "repo": REPO,
        "head_branch": head_branch,
        "head_sha": head_sha,
        "workflow_id": os.getenv("WORKFLOW_ID", timestamp),
        "run_attempt": os.getenv("RUN_ATTEMPT", 1),
        "job_id": os.getenv("JOB_ID", timestamp),
    }


def get_runner_info() -> Dict[str, Any]:
    """Gather CPU and GPU information about the machine running the benchmark."""
    return {
        # TODO (huydhn): Figure out a better way to set the name here without
        # hard coding it to cuda
        "name": "cuda",
        "type": torch.cuda.get_device_name(),
        "cpu_info": platform.processor(),
        "cpu_count": psutil.cpu_count(),
        "avail_mem_in_gb": int(psutil.virtual_memory().total / (1024 * 1024 * 1024)),
        "gpu_info": torch.cuda.get_device_name(),
        "gpu_count": torch.cuda.device_count(),
        "avail_gpu_mem_in_gb": int(
            torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024)
        ),
        "extra_info": {
            "hostname": socket.gethostname(),
        },
    }


def load(benchmark_results: str) -> Dict[str, List]:
    """Load all JSON benchmark result files from the given directory."""
    results = {}

    for file in glob.glob(f"{benchmark_results}/*.json"):
        filename = os.path.basename(file)
        with open(file) as f:
            try:
                r = json.load(f)
            except json.JSONDecodeError as e:
                warning(f"Failed to load {file}: {e}")
                continue

            if not r:
                warning(f"Found no benchmark results in {file}")
                continue

            if not isinstance(r, list) or "benchmark" not in r[0]:
                warning(f"Found no PyTorch benchmark results in {file}")
                continue

            results[filename] = r

    return results


def aggregate(
    metadata: Dict[str, Any], runner: Dict[str, Any], benchmark_results: Dict[str, List]
) -> List[Dict[str, Any]]:
    """Merge the shared metadata and runner info into each benchmark record."""
    aggregated_results = []
    for results in benchmark_results.values():
        for result in results:
            r: Dict[str, Any] = {**metadata, **result}
            r["runners"] = [runner]
            aggregated_results.append(r)
    return aggregated_results


def upload_to_s3(
    s3_bucket: str,
    head_branch: str,
    head_sha: str,
    aggregated_results: List[Dict[str, Any]],
    dry_run: bool = True,
) -> None:
    """Upload the aggregated results to S3 as a gzip-compressed JSONEachRow file."""
    s3_path = f"v3/{REPO}/{head_branch}/{head_sha}/benchmark_results.json"
    info(f"Uploading benchmark results to s3://{s3_bucket}/{s3_path}")
    if not dry_run:
        # Write in JSONEachRow format, one JSON object per line
        data = "\n".join([json.dumps(r) for r in aggregated_results])
        boto3.resource("s3").Object(
            s3_bucket,
            s3_path,
        ).put(
            ACL="public-read",
            Body=gzip.compress(data.encode()),
            ContentEncoding="gzip",
            ContentType="application/json",
        )


def main() -> None:
    args = parse_args()

    head_branch, head_sha = get_git_metadata(args.vllm)
    # Gather some information about the benchmark
    metadata = get_benchmark_metadata(head_branch, head_sha)
    runner = get_runner_info()

    # Extract and aggregate the benchmark results
    aggregated_results = aggregate(metadata, runner, load(args.benchmark_results))
    upload_to_s3(
        args.s3_bucket, head_branch, head_sha, aggregated_results, args.dry_run
    )


if __name__ == "__main__":
    main()