Skip to content

Commit e9ced09

Browse files
committed
Support multiple cache-from sources in pipeline-gen and fix/add tests
Signed-off-by: Junpu Fan <[email protected]>
1 parent 31e793a commit e9ced09

File tree

9 files changed

+276
-89
lines changed

9 files changed

+276
-89
lines changed

buildkite/pipeline_generator/buildkite_step.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from pydantic import BaseModel
22
from typing import Dict, List, Optional, Any, Union
3-
from step import Step
4-
from utils_lib.docker_utils import get_image, get_ecr_cache_registry
5-
from global_config import get_global_config
6-
from plugin.k8s_plugin import get_k8s_plugin
7-
from plugin.docker_plugin import get_docker_plugin
8-
from constants import GPUType, AgentQueue
3+
from .step import Step
4+
from .utils_lib.docker_utils import get_image, resolve_ecr_cache_vars
5+
from .global_config import get_global_config
6+
from .plugin.k8s_plugin import get_k8s_plugin
7+
from .plugin.docker_plugin import get_docker_plugin
8+
from .constants import GPUType, AgentQueue
99

1010

1111
class BuildkiteCommandStep(BaseModel):
@@ -87,7 +87,7 @@ def _get_variables_to_inject() -> Dict[str, str]:
8787
if global_config["name"] != "vllm_ci":
8888
return {}
8989

90-
cache_from_tag, cache_to_tag = get_ecr_cache_registry()
90+
cache_from, cache_from_base_branch, cache_from_main, cache_to = resolve_ecr_cache_vars()
9191
return {
9292
"$REGISTRY": global_config["registries"],
9393
"$REPO": global_config["repositories"]["main"]
@@ -97,8 +97,10 @@ def _get_variables_to_inject() -> Dict[str, str]:
9797
"$BRANCH": global_config["branch"],
9898
"$VLLM_USE_PRECOMPILED": "1" if global_config["use_precompiled"] else "0",
9999
"$VLLM_MERGE_BASE_COMMIT": global_config["merge_base_commit"],
100-
"$CACHE_FROM": cache_from_tag,
101-
"$CACHE_TO": cache_to_tag,
100+
"$CACHE_FROM": cache_from,
101+
"$CACHE_FROM_BASE_BRANCH": cache_from_base_branch,
102+
"$CACHE_FROM_MAIN": cache_from_main,
103+
"$CACHE_TO": cache_to,
102104
}
103105

104106

buildkite/pipeline_generator/global_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
import re
55
import requests
6-
from utils_lib.git_utils import get_merge_base_commit, get_list_file_diff, get_pr_labels
6+
from .utils_lib.git_utils import get_merge_base_commit, get_list_file_diff, get_pr_labels
77

88

99
class GlobalConfig(TypedDict):

buildkite/pipeline_generator/pipeline_generator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
import yaml
33
import subprocess
44
import sys
5-
from step import read_steps_from_job_dir, group_steps
6-
from buildkite_step import convert_group_step_to_buildkite_step
7-
from global_config import init_global_config, get_global_config
5+
from .step import read_steps_from_job_dir, group_steps
6+
from .buildkite_step import convert_group_step_to_buildkite_step
7+
from .global_config import init_global_config, get_global_config
88

99

1010
class PipelineGenerator:

buildkite/pipeline_generator/plugin/docker_plugin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from step import Step
2-
from constants import GPUType
1+
from ..step import Step
2+
from ..constants import GPUType
33
import copy
44

55
docker_plugin_template = {

buildkite/pipeline_generator/plugin/k8s_plugin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import copy
2-
from step import Step
3-
from constants import GPUType
2+
from ..step import Step
3+
from ..constants import GPUType
44

55
HF_HOME = "/root/.cache/huggingface"
66

buildkite/pipeline_generator/step.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pydantic import model_validator
44
from typing_extensions import Self
55
from collections import defaultdict
6-
from global_config import get_global_config
6+
from .global_config import get_global_config
77
import os
88
import yaml
99

@@ -91,3 +91,8 @@ def group_steps(steps: List[Step]) -> Dict[str, List[Step]]:
9191
for group, steps in grouped_steps.items():
9292
sorted_grouped_steps[group] = sorted(steps, key=lambda x: x.label)
9393
return sorted_grouped_steps
94+
95+
96+
def sort_steps(steps: List[Step]) -> List[Step]:
    """Return a new list of steps ordered by group, then by label within a group."""

    def _group_then_label(step):
        # Tuple comparison orders by group first and falls back to label on ties.
        return (step.group, step.label)

    return sorted(steps, key=_group_then_label)

buildkite/pipeline_generator/utils_lib/docker_utils.py

Lines changed: 53 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import os
33
import re
44
from typing import Tuple
5-
from global_config import get_global_config
5+
from ..global_config import get_global_config
66

77

88
def get_image(cpu: bool = False) -> str:
@@ -21,80 +21,63 @@ def get_image(cpu: bool = False) -> str:
2121
return image
2222

2323

24-
def _clean_docker_tag(tag: str) -> str:
25-
# Only allows alphanumeric, dashes and underscores for Docker tags, and replaces others with '-'
26-
return re.sub(r"[^a-zA-Z0-9_.-]", "-", tag or "")
24+
def clean_docker_tag(tag: str) -> str:
    """
    Sanitize a string so it can be used as a Docker image tag.

    Docker tags may only contain a-z, A-Z, 0-9, '_', '.' and '-', must not
    start with '.' or '-', and are limited to 128 characters.

    Args:
        tag: Raw tag candidate (for example a git branch name). ``None`` is
            treated like an empty string.

    Returns:
        The tag with every invalid character replaced by '_', a leading '.'
        or '-' replaced by '_', truncated to 128 characters. Empty input
        yields an empty string.
    """
    # Replace every character outside the allowed set with an underscore.
    cleaned = re.sub(r"[^a-zA-Z0-9._-]", "_", tag or "")
    # A tag may not begin with a period or a dash; swapping that single
    # character for '_' keeps the length (and therefore the truncation) stable.
    if cleaned[:1] in (".", "-"):
        cleaned = "_" + cleaned[1:]
    return cleaned[:128]
2732

2833

29-
def _docker_manifest_exists(image_tag: str) -> bool:
30-
try:
31-
subprocess.run(
32-
["docker", "manifest", "inspect", image_tag],
33-
stdout=subprocess.DEVNULL,
34-
stderr=subprocess.DEVNULL,
35-
check=True,
36-
)
37-
return True
38-
except subprocess.CalledProcessError:
39-
return False
4034

41-
42-
def get_ecr_cache_registry() -> Tuple[str, str]:
35+
def resolve_ecr_cache_vars() -> Tuple[str, str, str, str]:
36+
"""
37+
Resolve ECR cache-from, cache-to using buildkite environment variables:
38+
- BUILDKITE_BRANCH
39+
- BUILDKITE_PULL_REQUEST
40+
- BUILDKITE_PULL_REQUEST_BASE_BRANCH
41+
Return tuple of:
42+
- CACHE_FROM: primary cache source
43+
- CACHE_FROM_BASE_BRANCH: secondary cache source
44+
- CACHE_FROM_MAIN: fallback cache source
45+
- CACHE_TO: cache destination
46+
Note: CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN could be the same.
47+
This is intended behavior to allow BuildKit to merge all possible cache sources
48+
to maximize cache hit potential.
49+
"""
4350
global_config = get_global_config()
4451
branch = global_config["branch"]
52+
pull_request = global_config["pull_request"]
53+
54+
# Define ECR repository URLs for test and main cache
4555
test_cache_ecr = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
46-
postmerge_cache_ecr = (
47-
"936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
48-
)
49-
cache_from_tag, cache_to_tag = None, None
50-
# Authenticate Docker to AWS ECR
51-
login_cmd = ["aws", "ecr", "get-login-password", "--region", "us-east-1"]
52-
try:
53-
proc = subprocess.Popen(login_cmd, stdout=subprocess.PIPE)
54-
subprocess.run(
55-
[
56-
"docker",
57-
"login",
58-
"--username",
59-
"AWS",
60-
"--password-stdin",
61-
"936637512419.dkr.ecr.us-east-1.amazonaws.com",
62-
],
63-
stdin=proc.stdout,
64-
check=True,
65-
)
66-
proc.stdout.close()
67-
proc.wait()
68-
except Exception as e:
69-
raise RuntimeError(f"Failed to authenticate with AWS ECR: {e}")
70-
71-
if global_config["pull_request"]: # PR build
72-
cache_to_tag = f"{test_cache_ecr}:pr-{global_config['pull_request']}"
73-
if _docker_manifest_exists(cache_to_tag): # use PR cache if exists
74-
cache_from_tag = cache_to_tag
75-
elif (
76-
os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH") != "main"
77-
): # use base branch cache if exists
78-
clean_base = _clean_docker_tag(
79-
os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH")
80-
)
81-
if _docker_manifest_exists(f"{test_cache_ecr}:{clean_base}"):
82-
cache_from_tag = f"{test_cache_ecr}:{clean_base}"
83-
else: # fall back to postmerge cache ecr if base branch cache does not exist
84-
cache_from_tag = f"{postmerge_cache_ecr}:latest"
56+
main_cache_ecr = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
57+
58+
if not pull_request or pull_request == "false":
59+
# Not a PR
60+
if branch == "main":
61+
cache = f"{main_cache_ecr}:latest"
8562
else:
86-
cache_from_tag = f"{postmerge_cache_ecr}:latest"
87-
else: # non-PR build
88-
if branch == "main": # postmerge
89-
cache_to_tag = f"{postmerge_cache_ecr}:latest"
90-
cache_from_tag = f"{postmerge_cache_ecr}:latest"
63+
clean_branch = clean_docker_tag(branch)
64+
cache = f"{test_cache_ecr}:{clean_branch}"
65+
66+
cache_to = cache
67+
cache_from = cache
68+
cache_from_base_branch = cache
69+
else:
70+
# PR build
71+
cache_to = f"{test_cache_ecr}:pr-{pull_request}"
72+
cache_from = f"{test_cache_ecr}:pr-{pull_request}"
73+
74+
base_branch = os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH", "main")
75+
if base_branch == "main":
76+
cache_from_base_branch = f"{main_cache_ecr}:latest"
9177
else:
92-
clean_branch = _clean_docker_tag(branch)
93-
cache_to_tag = f"{test_cache_ecr}:{clean_branch}"
94-
if _docker_manifest_exists(f"{test_cache_ecr}:{clean_branch}"):
95-
cache_from_tag = f"{test_cache_ecr}:{clean_branch}"
96-
else:
97-
cache_from_tag = f"{postmerge_cache_ecr}:latest"
98-
if not cache_from_tag or not cache_to_tag:
99-
raise RuntimeError("Failed to get ECR cache tags")
100-
return cache_from_tag, cache_to_tag
78+
clean_base = clean_docker_tag(base_branch)
79+
cache_from_base_branch = f"{test_cache_ecr}:{clean_base}"
80+
81+
cache_from_main = f"{main_cache_ecr}:latest"
82+
83+
return cache_from, cache_from_base_branch, cache_from_main, cache_to

0 commit comments

Comments
 (0)