Skip to content

Commit

Permalink
Merge branch 'master' into improve-pvc-error-message
Browse files Browse the repository at this point in the history
  • Loading branch information
mahdikhashan committed Jan 26, 2025
2 parents 05dbea6 + 40e1e65 commit 274060f
Show file tree
Hide file tree
Showing 127 changed files with 1,616 additions and 2,545 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-darts-cifar10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ jobs:
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
# Comma Delimited
experiments: ["darts-cpu"]
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-enas-cifar10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ jobs:
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
# Comma Delimited
experiments: ["enas-cpu"]
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-pytorch-mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
# Comma Delimited
experiments:
# suggestion-hyperopt
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-simple-pbt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ jobs:
fail-fast: false
matrix:
# Detail: https://hub.docker.com/r/kindest/node
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
# Comma Delimited
experiments: ["simple-pbt"]
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-tf-mnist-with-summaries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,6 @@ jobs:
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
# Comma Delimited
experiments: ["tfjob-mnist-with-summaries"]
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-tune-api.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ jobs:
fail-fast: false
matrix:
# Detail: https://hub.docker.com/r/kindest/node
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-ui-random-search-postgres.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ jobs:
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
kubernetes-version: ["v1.29.2", "v1.30.7", "v1.31.3"]
2 changes: 1 addition & 1 deletion .github/workflows/template-publish-image/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,5 +58,5 @@ runs:
push: ${{ inputs.push }}
tags: ${{ steps.meta.outputs.tags }}
cache-from: type=gha
cache-to: type=gha,mode=max
cache-to: type=gha,mode=max,ignore-error=true
platforms: ${{ inputs.platforms }}
2 changes: 1 addition & 1 deletion .github/workflows/template-setup-e2e-test/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ runs:
cni: flannel
driver: none
kubernetes-version: ${{ inputs.kubernetes-version }}
minikube-version: 1.31.1
minikube-version: 1.34.0
start-args: --wait-timeout=120s

- name: Setup Docker Buildx
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test-go.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
fail-fast: false
matrix:
# Detail: `setup-envtest list`
kubernetes-version: ["1.27.1", "1.28.3", "1.29.3"]
kubernetes-version: ["1.29.3", "1.30.0", "1.31.0"]

# notifies that all test jobs are finished.
finish:
Expand Down
7 changes: 4 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,10 @@ Below is a list of command-line flags accepted by Katib controller:

Below is a list of command-line flags accepted by Katib DB Manager:

| Name | Type | Default | Description |
| --------------- | ------------- | ------- | ------------------------------------------------------- |
| connect-timeout | time.Duration | 60s | Timeout before calling error during database connection |
| Name | Type | Default | Description |
| --------------- | ------------- | -------------| ------------------------------------------------------------------- |
| connect-timeout | time.Duration | 60s | Timeout before calling error during database connection |
| listen-address | string | 0.0.0.0:6789 | The network interface or IP address to receive incoming connections |

## Katib admission webhooks

Expand Down
12 changes: 8 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ HAS_MOCKGEN := $(shell command -v mockgen;)
COMMIT := v1beta1-$(shell git rev-parse --short=7 HEAD)
KATIB_REGISTRY := docker.io/kubeflowkatib
CPU_ARCH ?= linux/amd64,linux/arm64
ENVTEST_K8S_VERSION ?= 1.29
ENVTEST_K8S_VERSION ?= 1.31
MOCKGEN_VERSION ?= $(shell grep 'go.uber.org/mock' go.mod | cut -d ' ' -f 2)
GO_VERSION=$(shell grep '^go' go.mod | cut -d ' ' -f 2)
GOPATH ?= $(shell go env GOPATH)
Expand All @@ -21,7 +21,7 @@ test: envtest

envtest:
ifndef HAS_SETUP_ENVTEST
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@bf15e44028f908c790721fc8fe67c7bf2d06a611 #v0.17.3
go install sigs.k8s.io/controller-runtime/tools/setup-envtest@release-0.19
$(info "setup-envtest has been installed")
endif
$(info "setup-envtest has already installed")
Expand Down Expand Up @@ -79,18 +79,22 @@ endif
sync-go-mod:
go mod tidy -go $(GO_VERSION)

.PHONY: go-mod-download
go-mod-download:
go mod download

CONTROLLER_GEN = $(shell pwd)/bin/controller-gen
.PHONY: controller-gen
controller-gen:
@GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.14.0
@GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.16.5

# Run this if you update any existing controller APIs.
# 1. Generate deepcopy, clientset, listers, informers for the APIs (hack/update-codegen.sh)
# 2. Generate open-api for the APIs (hack/update-openapigen)
# 3. Generate Python SDK for Katib (hack/gen-python-sdk/gen-sdk.sh)
# 4. Generate gRPC manager APIs (pkg/apis/manager/v1beta1/build.sh and pkg/apis/manager/health/build.sh)
# 5. Generate Go mock code
generate: controller-gen
generate: go-mod-download controller-gen
ifndef HAS_MOCKGEN
go install go.uber.org/mock/mockgen@$(MOCKGEN_VERSION)
$(info "mockgen has been installed")
Expand Down
10 changes: 6 additions & 4 deletions cmd/db-manager/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ import (
api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
db "github.com/kubeflow/katib/pkg/db/v1beta1"
"github.com/kubeflow/katib/pkg/db/v1beta1/common"
"k8s.io/klog"
"k8s.io/klog/v2"

"google.golang.org/grpc"
"google.golang.org/grpc/reflection"
)

const (
port = "0.0.0.0:6789"
defaultListenAddress = "0.0.0.0:6789"
defaultConnectTimeout = time.Second * 60
)

Expand Down Expand Up @@ -90,7 +90,9 @@ func (s *server) Check(ctx context.Context, in *health_pb.HealthCheckRequest) (*

func main() {
var connectTimeout time.Duration
var listenAddress string
flag.DurationVar(&connectTimeout, "connect-timeout", defaultConnectTimeout, "Timeout before calling error during database connection. (e.g. 120s)")
flag.StringVar(&listenAddress, "listen-address", defaultListenAddress, "The network interface or IP address to receive incoming connections. (e.g. 0.0.0.0:6789)")
flag.Parse()

var err error
Expand All @@ -104,13 +106,13 @@ func main() {
klog.Fatalf("Failed to open db connection: %v", err)
}
dbIf.DBInit()
listener, err := net.Listen("tcp", port)
listener, err := net.Listen("tcp", listenAddress)
if err != nil {
klog.Fatalf("Failed to listen: %v", err)
}

size := 1<<31 - 1
klog.Infof("Start Katib manager: %s", port)
klog.Infof("Start Katib manager: %s", listenAddress)
s := grpc.NewServer(grpc.MaxRecvMsgSize(size), grpc.MaxSendMsgSize(size))
api_pb.RegisterDBManagerServer(s, &server{})
health_pb.RegisterHealthServer(s, &server{})
Expand Down
2 changes: 1 addition & 1 deletion cmd/earlystopping/medianstop/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
protobuf>=4.21.12,<5
googleapis-common-protos==1.6.0
kubernetes==22.6.0
Expand Down
6 changes: 3 additions & 3 deletions cmd/metricscollector/v1beta1/file-metricscollector/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ import (
psutil "github.com/shirou/gopsutil/v3/process"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/klog"
"k8s.io/klog/v2"

commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1"
api "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
Expand Down Expand Up @@ -311,7 +311,7 @@ func watchMetricsFile(mFile string, stopRules stopRulesFlag, filters []string, f
}

// Create connection and client for Early Stopping service.
conn, err := grpc.Dial(*earlyStopServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.NewClient(*earlyStopServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
if err != nil {
klog.Fatalf("Could not connect to Early Stopping service, error: %v", err)
}
Expand Down Expand Up @@ -433,7 +433,7 @@ func main() {

func reportMetrics(filters []string, fileFormat commonv1beta1.FileFormat) {

conn, err := grpc.Dial(*dbManagerServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.NewClient(*dbManagerServiceAddr, grpc.WithTransportCredentials(insecure.NewCredentials()))
if err != nil {
klog.Fatalf("Could not connect to DB manager service, error: %v", err)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
psutil==5.9.4
rfc3339>=6.2
grpcio>=1.41.1
grpcio>=1.64.1
googleapis-common-protos==1.6.0
tensorflow==2.16.1
protobuf>=4.21.12,<5
2 changes: 1 addition & 1 deletion cmd/suggestion/goptuna/v1beta1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
api_v1_beta1 "github.com/kubeflow/katib/pkg/apis/manager/v1beta1"
suggestion "github.com/kubeflow/katib/pkg/suggestion/v1beta1/goptuna"
"google.golang.org/grpc"
"k8s.io/klog"
"k8s.io/klog/v2"
)

const (
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/hyperband/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
cloudpickle==0.5.6
numpy>=1.25.2
scikit-learn>=0.24.0
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/hyperopt/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
cloudpickle==0.5.6
numpy>=1.25.2
scikit-learn>=0.24.0
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/nas/darts/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
protobuf>=4.21.12,<5
googleapis-common-protos==1.6.0
cython>=0.29.24
2 changes: 1 addition & 1 deletion cmd/suggestion/nas/enas/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
googleapis-common-protos==1.6.0
cython>=0.29.24
tensorflow==2.16.1
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/optuna/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
protobuf>=4.21.12,<5
googleapis-common-protos==1.53.0
optuna==3.3.0
2 changes: 1 addition & 1 deletion cmd/suggestion/pbt/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
protobuf>=4.21.12,<5
googleapis-common-protos==1.53.0
numpy==1.25.2
2 changes: 1 addition & 1 deletion cmd/suggestion/skopt/v1beta1/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
grpcio>=1.41.1
grpcio>=1.64.1
cloudpickle==0.5.6
# This is a workaround to avoid the following error.
# AttributeError: module 'numpy' has no attribute 'int'
Expand Down
2 changes: 1 addition & 1 deletion docs/proposals/2339-hpo-for-llm-fine-tuning/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ Our goal is to develop a high-level API for tuning hyperparameters of LLMs that

## Design for API

![Design for API](images/design_api.jpg)
![Design for API](hp-optimization-api-design.jpg)

```python
import kubeflow.katib as katib
Expand Down
File renamed without changes
2 changes: 1 addition & 1 deletion examples/v1beta1/kind-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Install the following tools to run the example:

- [Docker](https://docs.docker.com/get-docker) >= 20.10
- [Kind](https://kind.sigs.k8s.io/docs/user/quick-start/#installation) >= 0.13
- [`kubectl`](https://kubernetes.io/docs/tasks/tools/#kubectl) >= 1.27
- [`kubectl`](https://kubernetes.io/docs/tasks/tools/#kubectl) >= 1.29

## Installation

Expand Down
Loading

0 comments on commit 274060f

Please sign in to comment.