diff --git a/go.mod b/go.mod index c7d1f1aa0e..6b6c74558b 100644 --- a/go.mod +++ b/go.mod @@ -1,26 +1,37 @@ module github.com/kubeflow/trainer -go 1.23 +go 1.23.0 + +toolchain go1.23.1 require ( github.com/go-logr/logr v1.4.2 github.com/google/go-cmp v0.6.0 - github.com/onsi/ginkgo/v2 v2.20.1 - github.com/onsi/gomega v1.35.1 + github.com/onsi/ginkgo/v2 v2.22.2 + github.com/onsi/gomega v1.36.2 github.com/open-policy-agent/cert-controller v0.12.0 go.uber.org/zap v1.27.0 - k8s.io/api v0.31.3 - k8s.io/apimachinery v0.31.3 - k8s.io/client-go v0.31.3 - k8s.io/code-generator v0.31.3 + k8s.io/api v0.32.1 + k8s.io/apimachinery v0.32.1 + k8s.io/client-go v0.32.1 + k8s.io/code-generator v0.32.1 k8s.io/klog/v2 v2.130.1 - k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f - k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 - sigs.k8s.io/controller-runtime v0.19.1 - sigs.k8s.io/jobset v0.5.2 + k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f + k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 + sigs.k8s.io/controller-runtime v0.20.1 + sigs.k8s.io/jobset v0.8.0-devel.0.20250212132206-c69f95cd53b4 sigs.k8s.io/kueue v0.6.3 sigs.k8s.io/scheduler-plugins v0.30.6 - sigs.k8s.io/structured-merge-diff/v4 v4.4.1 + sigs.k8s.io/structured-merge-diff/v4 v4.5.0 +) + +replace ( + k8s.io/api => k8s.io/api v0.31.3 + k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.31.3 + k8s.io/apimachinery => k8s.io/apimachinery v0.31.3 + k8s.io/client-go => k8s.io/client-go v0.31.3 + k8s.io/code-generator => k8s.io/code-generator v0.31.3 + sigs.k8s.io/controller-runtime => sigs.k8s.io/controller-runtime v0.19.1 ) require ( @@ -42,7 +53,7 @@ require ( github.com/golang/protobuf v1.5.4 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 // indirect + github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect github.com/google/uuid v1.6.0 // indirect github.com/imdario/mergo v0.3.16 // indirect github.com/josharian/intern v1.0.0 // indirect @@ -53,7 +64,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.20.2 // indirect + github.com/prometheus/client_golang v1.20.5 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect @@ -62,23 +73,23 @@ require ( go.uber.org/atomic v1.11.0 // indirect go.uber.org/multierr v1.11.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/mod v0.20.0 // indirect + golang.org/x/mod v0.22.0 // indirect golang.org/x/net v0.33.0 // indirect - golang.org/x/oauth2 v0.21.0 // indirect + golang.org/x/oauth2 v0.23.0 // indirect golang.org/x/sync v0.10.0 // indirect golang.org/x/sys v0.28.0 // indirect golang.org/x/term v0.27.0 // indirect golang.org/x/text v0.21.0 // indirect - golang.org/x/time v0.6.0 // indirect - golang.org/x/tools v0.24.0 // indirect + golang.org/x/time v0.7.0 // indirect + golang.org/x/tools v0.28.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/protobuf v1.35.1 // indirect + google.golang.org/protobuf v1.36.1 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.31.2 // indirect - k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 // indirect - sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect + k8s.io/apiextensions-apiserver v0.32.0 // indirect + k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect + sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/yaml v1.4.0 // indirect ) diff --git a/go.sum b/go.sum index 65d94f209d..dd330c8f51 100644 --- a/go.sum +++ b/go.sum @@ -42,8 +42,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 h1:5iH8iuqE5apketRbSFBy+X1V0o+l+8NF1avt4HWl7cA= -github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= +github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= @@ -71,10 +71,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.20.1 h1:YlVIbqct+ZmnEph770q9Q7NVAz4wwIiVNahee6JyUzo= -github.com/onsi/ginkgo/v2 v2.20.1/go.mod h1:lG9ey2Z29hR41WMVthyJBGUBcBhGOtoPF2VFMvBXFCI= -github.com/onsi/gomega v1.35.1 h1:Cwbd75ZBPxFSuZ6T+rN/WCb/gOc6YgFBXLlZLhC7Ds4= -github.com/onsi/gomega v1.35.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog= +github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU= +github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk= +github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= +github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= github.com/open-policy-agent/cert-controller v0.12.0 h1:RKXlBafMcCh+++I1geJetXo77tAjyb4542DQc/+aZIw= github.com/open-policy-agent/cert-controller v0.12.0/go.mod h1:N5bCFXdAXMYx0PdS6ZQ9lrDQQMz+F6deoChym6VleXw= github.com/open-policy-agent/frameworks/constraint v0.0.0-20241101234656-e78c8abd754a h1:gQtOJ50XFyL2Xh3lDD9zP4KQ2PY4mZKQ9hDcWc81Sp8= @@ -84,8 +84,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.2 h1:5ctymQzZlyOON1666svgwn3s6IKWgfbjsejTMiXIyjg= -github.com/prometheus/client_golang v1.20.2/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= +github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= @@ -98,8 +98,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -119,16 +119,16 @@ golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= -golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= +golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= +golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -145,22 +145,22 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= -golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= +golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= -golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= +golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8= +golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= -google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk= +google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -168,42 +168,41 @@ gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSP gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= -gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8= k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE= -k8s.io/apiextensions-apiserver v0.31.2 h1:W8EwUb8+WXBLu56ser5IudT2cOho0gAKeTOnywBLxd0= -k8s.io/apiextensions-apiserver v0.31.2/go.mod h1:i+Geh+nGCJEGiCGR3MlBDkS7koHIIKWVfWeRFiOsUcM= +k8s.io/apiextensions-apiserver v0.31.3 h1:+GFGj2qFiU7rGCsA5o+p/rul1OQIq6oYpQw4+u+nciE= +k8s.io/apiextensions-apiserver v0.31.3/go.mod h1:2DSpFhUZZJmn/cr/RweH1cEVVbzFw9YBu4T+U3mf1e4= k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4= k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo= k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4= k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs= k8s.io/code-generator v0.31.3 h1:Pj0fYOBms+ZrsulLi4DMsCEx1jG8fWKRLy44onHsLBI= k8s.io/code-generator v0.31.3/go.mod h1:/umCIlT84g1+Yu5ZXtP1KGSRTnGiIzzX5AzUAxsNlts= -k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70 h1:NGrVE502P0s0/1hudf8zjgwki1X/TByhmAoILTarmzo= -k8s.io/gengo/v2 v2.0.0-20240228010128-51d4e06bde70/go.mod h1:VH3AT8AaQOqiGjMF9p0/IM1Dj+82ZwjfxUP1IxaHE+8= +k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 h1:si3PfKm8dDYxgfbeA6orqrtLkvvIeH8UqffFJDl0bz4= +k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-aggregator v0.31.2 h1:Uw1zUP2D/4wiSjKWVVzSOcCGLuW/+IdRwjjC0FJooYU= k8s.io/kube-aggregator v0.31.2/go.mod h1:41/VIXH+/Qcg9ERNAY6bRF/WQR6xL1wFgYagdHac1X4= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f h1:0LQagt0gDpKqvIkAMPaRGcXawNMouPECM1+F9BVxEaM= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f/go.mod h1:S9tOR0FxgyusSNR+MboCuiDpVWkAifZvaYI1Q2ubgro= -k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A= -k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= +k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= +k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/controller-runtime v0.19.1 h1:Son+Q40+Be3QWb+niBXAg2vFiYWolDjjRfO8hn/cxOk= sigs.k8s.io/controller-runtime v0.19.1/go.mod h1:iRmWllt8IlaLjvTTDLhRBXIEtkCK6hwVBJJsYS9Ajf4= -sigs.k8s.io/jobset v0.5.2 h1:276q5Pi/ErLYj+GQ0ydEXR6tx3LwBhEzHLQv+k8bYF4= -sigs.k8s.io/jobset v0.5.2/go.mod h1:Vg99rj/6OoGvy1uvywGEHOcVLCWWJYkJtisKqdWzcFw= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= -sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/jobset v0.8.0-devel.0.20250212132206-c69f95cd53b4 h1:f4fx7+T4Bp6v+nFs5bCPq/py+Xt6DYEHbWhF/CRkAUQ= +sigs.k8s.io/jobset v0.8.0-devel.0.20250212132206-c69f95cd53b4/go.mod h1:egRLNm7qi4s1cj+sPvleUagDF5icYb7UH4FwGlni6+Q= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= +sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/kueue v0.6.3 h1:PmccdKPDFQIaAboyuSG6M0w6hXtxVA51RV+DjCUtBtQ= sigs.k8s.io/kueue v0.6.3/go.mod h1:rliYfK/K7pJ7CT4ReV1szzciNkAo3sBn5Bmr5Sn6uCY= sigs.k8s.io/scheduler-plugins v0.30.6 h1:P4pViMVoyVNHWmkG96UtJ4LvxkUIeenIUKLZd09vDyw= sigs.k8s.io/scheduler-plugins v0.30.6/go.mod h1:EDYYqHmpHR//VYKAeud1TTQbTFSvpdGFeyEg9ejOmnI= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= -sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= +sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk= +sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml b/manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml index 3aaa6efdc3..d37596c2d4 100644 --- a/manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml +++ b/manifests/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml @@ -637,6 +637,31 @@ spec: description: Specification of the desired JobSet which will be created from TrainJob. properties: + coordinator: + description: |- + Coordinator can be used to assign a specific pod as the coordinator for + the JobSet. If defined, an annotation will be added to all Jobs and pods with + coordinator pod, which contains the stable network endpoint where the + coordinator pod can be reached. + jobset.sigs.k8s.io/coordinator=. + properties: + jobIndex: + description: |- + JobIndex is the index of Job which contains the coordinator pod + (i.e., for a ReplicatedJob with N replicas, there are Job indexes 0 to N-1). + type: integer + podIndex: + description: PodIndex is the Job completion index of the + coordinator pod. + type: integer + replicatedJob: + description: |- + ReplicatedJob is the name of the ReplicatedJob which contains + the coordinator pod. + type: string + required: + - replicatedJob + type: object failurePolicy: description: |- FailurePolicy, if set, configures when to declare the JobSet as @@ -650,13 +675,79 @@ spec: A restart is achieved by recreating all active child jobs. format: int32 type: integer + restartStrategy: + default: Recreate + description: |- + RestartStrategy defines the strategy to use when restarting the JobSet. + Defaults to Recreate. + enum: + - Recreate + - BlockingRecreate + type: string + rules: + description: |- + List of failure policy rules for this JobSet. + For a given Job failure, the rules will be evaluated in order, + and only the first matching rule will be executed. + If no matching rule is found, the RestartJobSet action is applied. + items: + description: |- + FailurePolicyRule defines a FailurePolicyAction to be executed if a child job + fails due to a reason listed in OnJobFailureReasons. + properties: + action: + description: The action to take if the rule is matched. + enum: + - FailJobSet + - RestartJobSet + - RestartJobSetAndIgnoreMaxRestarts + type: string + name: + description: |- + The name of the failure policy rule. + The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. + The name must match the regular expression "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$". + type: string + onJobFailureReasons: + description: |- + The requirement on the job failure reasons. The requirement + is satisfied if at least one reason matches the list. + The rules are evaluated in order, and the first matching + rule is executed. + An empty list applies the rule to any job failure reason. + items: + type: string + type: array + targetReplicatedJobs: + description: |- + TargetReplicatedJobs are the names of the replicated jobs the operator applies to. + An empty list will apply to all replicatedJobs. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - action + - name + type: object + type: array type: object x-kubernetes-validations: - message: Value is immutable rule: self == oldSelf managedBy: - description: ManagedBy is used to indicate the controller - or entity that manages a JobSet + description: |- + ManagedBy is used to indicate the controller or entity that manages a JobSet. + The built-in JobSet controller reconciles JobSets which don't have this + field at all or the field value is the reserved string + `jobset.sigs.k8s.io/jobset-controller`, but skips reconciling JobSets + with a custom value for this field. + + The value must be a valid domain-prefixed path (e.g. acme.io/foo) - + all characters before the first "/" must be a valid subdomain as defined + by RFC 1123. All characters trailing the first "/" must be valid HTTP Path + characters as defined by RFC 3986. The value cannot exceed 63 characters. + The field is immutable. type: string network: description: Network defines the networking options for the @@ -668,6 +759,11 @@ spec: Pods will be reachable using the fully qualified pod hostname: ---. type: boolean + publishNotReadyAddresses: + description: |- + Indicates if DNS records of pods should be published before the pods are ready. + Defaults to True. + type: boolean subdomain: description: |- Subdomain is an explicit choice for a network subdomain name @@ -683,6 +779,44 @@ spec: form the set. items: properties: + dependsOn: + description: |- + DependsOn is an optional list that specifies the preceding ReplicatedJobs upon which + the current ReplicatedJob depends. If specified, the ReplicatedJob will be created + only after the referenced ReplicatedJobs reach their desired state. + The Order of ReplicatedJobs is defined by their enumeration in the slice. + Note, that the first ReplicatedJob in the slice cannot use the DependsOn API. + Currently, only a single item is supported in the DependsOn list. + If JobSet is suspended the all active ReplicatedJobs will be suspended. When JobSet is + resumed the Job sequence starts again. + This API is mutually exclusive with the StartupPolicy API. + items: + description: DependsOn defines the dependency on the + previous ReplicatedJob status. + properties: + name: + description: Name of the previous ReplicatedJob. + type: string + status: + description: Status defines the condition for + the ReplicatedJob. Only Ready or Complete status + can be set. + enum: + - Ready + - Complete + type: string + required: + - name + - status + type: object + maxItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf name: description: |- Name is the name of the entry and will be used as a suffix @@ -9686,8 +9820,9 @@ spec: - name x-kubernetes-list-type: map startupPolicy: - description: StartupPolicy, if set, configures in what order - jobs must be started + description: |- + StartupPolicy, if set, configures in what order jobs must be started + Deprecated: StartupPolicy is deprecated, please use the DependsOn API. properties: startupPolicyOrder: description: |- @@ -9751,6 +9886,12 @@ spec: minimum: 0 type: integer type: object + x-kubernetes-validations: + - message: StartupPolicy and DependsOn APIs are mutually exclusive + rule: '!(has(self.startupPolicy) && self.startupPolicy.startupPolicyOrder + == ''InOrder'' && self.replicatedJobs.exists(x, has(x.dependsOn)))' + - message: DependsOn can't be set for the first ReplicatedJob + rule: '!(has(self.replicatedJobs[0].dependsOn))' type: object required: - template diff --git a/manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml b/manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml index 30d6a445c4..caad84524f 100644 --- a/manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml +++ b/manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml @@ -637,6 +637,31 @@ spec: description: Specification of the desired JobSet which will be created from TrainJob. properties: + coordinator: + description: |- + Coordinator can be used to assign a specific pod as the coordinator for + the JobSet. If defined, an annotation will be added to all Jobs and pods with + coordinator pod, which contains the stable network endpoint where the + coordinator pod can be reached. + jobset.sigs.k8s.io/coordinator=. + properties: + jobIndex: + description: |- + JobIndex is the index of Job which contains the coordinator pod + (i.e., for a ReplicatedJob with N replicas, there are Job indexes 0 to N-1). + type: integer + podIndex: + description: PodIndex is the Job completion index of the + coordinator pod. + type: integer + replicatedJob: + description: |- + ReplicatedJob is the name of the ReplicatedJob which contains + the coordinator pod. + type: string + required: + - replicatedJob + type: object failurePolicy: description: |- FailurePolicy, if set, configures when to declare the JobSet as @@ -650,13 +675,79 @@ spec: A restart is achieved by recreating all active child jobs. format: int32 type: integer + restartStrategy: + default: Recreate + description: |- + RestartStrategy defines the strategy to use when restarting the JobSet. + Defaults to Recreate. + enum: + - Recreate + - BlockingRecreate + type: string + rules: + description: |- + List of failure policy rules for this JobSet. + For a given Job failure, the rules will be evaluated in order, + and only the first matching rule will be executed. + If no matching rule is found, the RestartJobSet action is applied. + items: + description: |- + FailurePolicyRule defines a FailurePolicyAction to be executed if a child job + fails due to a reason listed in OnJobFailureReasons. + properties: + action: + description: The action to take if the rule is matched. + enum: + - FailJobSet + - RestartJobSet + - RestartJobSetAndIgnoreMaxRestarts + type: string + name: + description: |- + The name of the failure policy rule. + The name is defaulted to 'failurePolicyRuleN' where N is the index of the failure policy rule. + The name must match the regular expression "^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$". + type: string + onJobFailureReasons: + description: |- + The requirement on the job failure reasons. The requirement + is satisfied if at least one reason matches the list. + The rules are evaluated in order, and the first matching + rule is executed. + An empty list applies the rule to any job failure reason. + items: + type: string + type: array + targetReplicatedJobs: + description: |- + TargetReplicatedJobs are the names of the replicated jobs the operator applies to. + An empty list will apply to all replicatedJobs. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - action + - name + type: object + type: array type: object x-kubernetes-validations: - message: Value is immutable rule: self == oldSelf managedBy: - description: ManagedBy is used to indicate the controller - or entity that manages a JobSet + description: |- + ManagedBy is used to indicate the controller or entity that manages a JobSet. + The built-in JobSet controller reconciles JobSets which don't have this + field at all or the field value is the reserved string + `jobset.sigs.k8s.io/jobset-controller`, but skips reconciling JobSets + with a custom value for this field. + + The value must be a valid domain-prefixed path (e.g. acme.io/foo) - + all characters before the first "/" must be a valid subdomain as defined + by RFC 1123. All characters trailing the first "/" must be valid HTTP Path + characters as defined by RFC 3986. The value cannot exceed 63 characters. + The field is immutable. type: string network: description: Network defines the networking options for the @@ -668,6 +759,11 @@ spec: Pods will be reachable using the fully qualified pod hostname: ---. type: boolean + publishNotReadyAddresses: + description: |- + Indicates if DNS records of pods should be published before the pods are ready. + Defaults to True. + type: boolean subdomain: description: |- Subdomain is an explicit choice for a network subdomain name @@ -683,6 +779,44 @@ spec: form the set. items: properties: + dependsOn: + description: |- + DependsOn is an optional list that specifies the preceding ReplicatedJobs upon which + the current ReplicatedJob depends. If specified, the ReplicatedJob will be created + only after the referenced ReplicatedJobs reach their desired state. + The Order of ReplicatedJobs is defined by their enumeration in the slice. + Note, that the first ReplicatedJob in the slice cannot use the DependsOn API. + Currently, only a single item is supported in the DependsOn list. + If JobSet is suspended the all active ReplicatedJobs will be suspended. When JobSet is + resumed the Job sequence starts again. + This API is mutually exclusive with the StartupPolicy API. + items: + description: DependsOn defines the dependency on the + previous ReplicatedJob status. + properties: + name: + description: Name of the previous ReplicatedJob. + type: string + status: + description: Status defines the condition for + the ReplicatedJob. Only Ready or Complete status + can be set. + enum: + - Ready + - Complete + type: string + required: + - name + - status + type: object + maxItems: 1 + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + x-kubernetes-validations: + - message: Value is immutable + rule: self == oldSelf name: description: |- Name is the name of the entry and will be used as a suffix @@ -9686,8 +9820,9 @@ spec: - name x-kubernetes-list-type: map startupPolicy: - description: StartupPolicy, if set, configures in what order - jobs must be started + description: |- + StartupPolicy, if set, configures in what order jobs must be started + Deprecated: StartupPolicy is deprecated, please use the DependsOn API. properties: startupPolicyOrder: description: |- @@ -9751,6 +9886,12 @@ spec: minimum: 0 type: integer type: object + x-kubernetes-validations: + - message: StartupPolicy and DependsOn APIs are mutually exclusive + rule: '!(has(self.startupPolicy) && self.startupPolicy.startupPolicyOrder + == ''InOrder'' && self.replicatedJobs.exists(x, has(x.dependsOn)))' + - message: DependsOn can't be set for the first ReplicatedJob + rule: '!(has(self.replicatedJobs[0].dependsOn))' type: object required: - template