Skip to content

Commit 277d866

Browse files
author
Kubernetes Submit Queue
authored
Merge pull request kubernetes#50984 from timothysc/checkpoint
Automatic merge from submit-queue (batch tested with PRs 55812, 55752, 55447, 55848, 50984). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Initial basic bootstrap-checkpoint support **What this PR does / why we need it**: Adds initial support for Pod checkpointing to allow for controlled recovery of the control plane during self host failure conditions. fixes kubernetes#49236 xref kubernetes/enhancements#378 **Special notes for your reviewer**: Proposal is here: https://docs.google.com/document/d/1hhrCa_nv0Sg4O_zJYOnelE8a5ClieyewEsQM6c7-5-o/edit?ts=5988fba8# 1. Controlled tests work, but I have not tested the self hosted api-server recovery, that requires validation and logs. /cc @luxas 2. In adding hooks for checkpoint manager much of the tests around basicpodmanager appears to be stub'd. This has become an anti-pattern in the code and should be avoided. 3. I need a node-e2e to ensure consistency of behavior. **Release note**: ``` Add basic bootstrap checkpointing support to the kubelet for control plane recovery ``` /cc @kubernetes/sig-cluster-lifecycle-misc @kubernetes/sig-node-pr-reviews
2 parents 8a5cf78 + 763122a commit 277d866

File tree

25 files changed

+918
-101
lines changed

25 files changed

+918
-101
lines changed

Godeps/Godeps.json

+76-71
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Godeps/LICENSES

+34
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/kubelet/app/options/options.go

+4
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ type KubeletFlags struct {
157157
ExitOnLockContention bool
158158
// seccompProfileRoot is the directory path for seccomp profiles.
159159
SeccompProfileRoot string
160+
// bootstrapCheckpointPath is the path to the directory containing pod checkpoints to
161+
// run on restore
162+
BootstrapCheckpointPath string
160163

161164
// DEPRECATED FLAGS
162165
// minimumGCAge is the minimum age for a finished container before it is
@@ -343,6 +346,7 @@ func (f *KubeletFlags) AddFlags(fs *pflag.FlagSet) {
343346
fs.StringVar(&f.LockFilePath, "lock-file", f.LockFilePath, "<Warning: Alpha feature> The path to file for kubelet to use as a lock file.")
344347
fs.BoolVar(&f.ExitOnLockContention, "exit-on-lock-contention", f.ExitOnLockContention, "Whether kubelet should exit upon lock-file contention.")
345348
fs.StringVar(&f.SeccompProfileRoot, "seccomp-profile-root", f.SeccompProfileRoot, "<Warning: Alpha feature> Directory path for seccomp profiles.")
349+
fs.StringVar(&f.BootstrapCheckpointPath, "bootstrap-checkpoint-path", f.BootstrapCheckpointPath, "<Warning: Alpha feature> Path to to the directory where the checkpoints are stored")
346350

347351
// DEPRECATED FLAGS
348352
fs.DurationVar(&f.MinimumGCAge.Duration, "minimum-container-ttl-duration", f.MinimumGCAge.Duration, "Minimum age for a finished container before it is garbage collected. Examples: '300ms', '10s' or '2h45m'")

cmd/kubelet/app/server.go

+6-3
Original file line numberDiff line numberDiff line change
@@ -729,7 +729,8 @@ func RunKubelet(kubeFlags *options.KubeletFlags, kubeCfg *kubeletconfiginternal.
729729
kubeFlags.NonMasqueradeCIDR,
730730
kubeFlags.KeepTerminatedPodVolumes,
731731
kubeFlags.NodeLabels,
732-
kubeFlags.SeccompProfileRoot)
732+
kubeFlags.SeccompProfileRoot,
733+
kubeFlags.BootstrapCheckpointPath)
733734
if err != nil {
734735
return fmt.Errorf("failed to create kubelet: %v", err)
735736
}
@@ -802,7 +803,8 @@ func CreateAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
802803
nonMasqueradeCIDR string,
803804
keepTerminatedPodVolumes bool,
804805
nodeLabels map[string]string,
805-
seccompProfileRoot string) (k kubelet.Bootstrap, err error) {
806+
seccompProfileRoot string,
807+
bootstrapCheckpointPath string) (k kubelet.Bootstrap, err error) {
806808
// TODO: block until all sources have delivered at least one update to the channel, or break the sync loop
807809
// up into "per source" synchronizations
808810

@@ -835,7 +837,8 @@ func CreateAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
835837
nonMasqueradeCIDR,
836838
keepTerminatedPodVolumes,
837839
nodeLabels,
838-
seccompProfileRoot)
840+
seccompProfileRoot,
841+
bootstrapCheckpointPath)
839842
if err != nil {
840843
return nil, err
841844
}

pkg/apis/core/annotation_key_constants.go

+4
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ const (
6868
// This annotation can be attached to node.
6969
ObjectTTLAnnotationKey string = "node.alpha.kubernetes.io/ttl"
7070

71+
// BootstrapCheckpointAnnotationKey represents a Resource (Pod) that should be checkpointed by
72+
// the kubelet prior to running
73+
BootstrapCheckpointAnnotationKey string = "node.kubernetes.io/bootstrap-checkpoint"
74+
7175
// annotation key prefix used to identify non-convertible json paths.
7276
NonConvertibleAnnotationPrefix = "non-convertible.kubernetes.io"
7377

pkg/kubelet/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ filegroup(
254254
"//pkg/kubelet/apis:all-srcs",
255255
"//pkg/kubelet/cadvisor:all-srcs",
256256
"//pkg/kubelet/certificate:all-srcs",
257+
"//pkg/kubelet/checkpoint:all-srcs",
257258
"//pkg/kubelet/client:all-srcs",
258259
"//pkg/kubelet/cm:all-srcs",
259260
"//pkg/kubelet/config:all-srcs",

pkg/kubelet/checkpoint/BUILD

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
2+
3+
go_library(
4+
name = "go_default_library",
5+
srcs = ["checkpoint.go"],
6+
importpath = "k8s.io/kubernetes/pkg/kubelet/checkpoint",
7+
visibility = ["//visibility:public"],
8+
deps = [
9+
"//pkg/apis/core:go_default_library",
10+
"//pkg/volume/util:go_default_library",
11+
"//vendor/github.com/dchest/safefile:go_default_library",
12+
"//vendor/github.com/ghodss/yaml:go_default_library",
13+
"//vendor/github.com/golang/glog:go_default_library",
14+
"//vendor/k8s.io/api/core/v1:go_default_library",
15+
],
16+
)
17+
18+
go_test(
19+
name = "go_default_test",
20+
srcs = ["checkpoint_test.go"],
21+
importpath = "k8s.io/kubernetes/pkg/kubelet/checkpoint",
22+
library = ":go_default_library",
23+
deps = [
24+
"//pkg/apis/core:go_default_library",
25+
"//vendor/k8s.io/api/core/v1:go_default_library",
26+
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
27+
],
28+
)
29+
30+
filegroup(
31+
name = "package-srcs",
32+
srcs = glob(["**"]),
33+
tags = ["automanaged"],
34+
visibility = ["//visibility:private"],
35+
)
36+
37+
filegroup(
38+
name = "all-srcs",
39+
srcs = [":package-srcs"],
40+
tags = ["automanaged"],
41+
visibility = ["//visibility:public"],
42+
)

pkg/kubelet/checkpoint/checkpoint.go

+151
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/*
2+
Copyright 2017 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package checkpoint
18+
19+
import (
20+
"fmt"
21+
"io/ioutil"
22+
"os"
23+
"path/filepath"
24+
"strings"
25+
"sync"
26+
27+
"github.com/dchest/safefile"
28+
"github.com/ghodss/yaml"
29+
"github.com/golang/glog"
30+
31+
"k8s.io/api/core/v1"
32+
"k8s.io/kubernetes/pkg/apis/core"
33+
"k8s.io/kubernetes/pkg/volume/util"
34+
)
35+
36+
const (
37+
// Delimiter used on checkpoints written to disk
38+
delimiter = "_"
39+
podPrefix = "Pod"
40+
)
41+
42+
// Manager is the interface used to manage checkpoints
43+
// which involves writing resources to disk to recover
44+
// during restart or failure scenarios.
45+
// https://github.com/kubernetes/community/pull/1241/files
46+
type Manager interface {
47+
// LoadPods will load checkpointed Pods from disk
48+
LoadPods() ([]*v1.Pod, error)
49+
50+
// WritePod will serialize a Pod to disk
51+
WritePod(pod *v1.Pod) error
52+
53+
// Deletes the checkpoint of the given pod from disk
54+
DeletePod(pod *v1.Pod) error
55+
}
56+
57+
var instance Manager
58+
var mutex = &sync.Mutex{}
59+
60+
// fileCheckPointManager - is a checkpointer that writes contents to disk
61+
// The type information of the resource objects are encoded in the name
62+
type fileCheckPointManager struct {
63+
path string
64+
}
65+
66+
// NewCheckpointManager will create a Manager that points to the following path
67+
func NewCheckpointManager(path string) Manager {
68+
// NOTE: This is a precaution; current implementation should not run
69+
// multiple checkpoint managers.
70+
mutex.Lock()
71+
defer mutex.Unlock()
72+
instance = &fileCheckPointManager{path: path}
73+
return instance
74+
}
75+
76+
// GetInstance will return the current Manager, there should be only one.
77+
func GetInstance() Manager {
78+
mutex.Lock()
79+
defer mutex.Unlock()
80+
return instance
81+
}
82+
83+
// loadPod will load Pod Checkpoint yaml file.
84+
func (fcp *fileCheckPointManager) loadPod(file string) (*v1.Pod, error) {
85+
return util.LoadPodFromFile(file)
86+
}
87+
88+
// checkAnnotations will validate the checkpoint annotations exist on the Pod
89+
func (fcp *fileCheckPointManager) checkAnnotations(pod *v1.Pod) bool {
90+
if podAnnotations := pod.GetAnnotations(); podAnnotations != nil {
91+
if podAnnotations[core.BootstrapCheckpointAnnotationKey] == "true" {
92+
return true
93+
}
94+
}
95+
return false
96+
}
97+
98+
// getPodPath returns the full qualified path for the pod checkpoint
99+
func (fcp *fileCheckPointManager) getPodPath(pod *v1.Pod) string {
100+
return fmt.Sprintf("%v/Pod%v%v.yaml", fcp.path, delimiter, pod.GetUID())
101+
}
102+
103+
// LoadPods Loads All Checkpoints from disk
104+
func (fcp *fileCheckPointManager) LoadPods() ([]*v1.Pod, error) {
105+
checkpoints := make([]*v1.Pod, 0)
106+
files, err := ioutil.ReadDir(fcp.path)
107+
if err != nil {
108+
return nil, err
109+
}
110+
for _, f := range files {
111+
// get just the filename
112+
_, fname := filepath.Split(f.Name())
113+
// Get just the Resource from "Resource_Name"
114+
fnfields := strings.Split(fname, delimiter)
115+
switch fnfields[0] {
116+
case podPrefix:
117+
pod, err := fcp.loadPod(fmt.Sprintf("%s/%s", fcp.path, f.Name()))
118+
if err != nil {
119+
return nil, err
120+
}
121+
checkpoints = append(checkpoints, pod)
122+
default:
123+
glog.Warningf("Unsupported checkpoint file detected %v", f)
124+
}
125+
}
126+
return checkpoints, nil
127+
}
128+
129+
// Writes a checkpoint to a file on disk if annotation is present
130+
func (fcp *fileCheckPointManager) WritePod(pod *v1.Pod) error {
131+
var err error
132+
if fcp.checkAnnotations(pod) {
133+
if blob, err := yaml.Marshal(pod); err == nil {
134+
err = safefile.WriteFile(fcp.getPodPath(pod), blob, 0644)
135+
}
136+
} else {
137+
// This is to handle an edge where a pod update could remove
138+
// an annotation and the checkpoint should then be removed.
139+
err = fcp.DeletePod(pod)
140+
}
141+
return err
142+
}
143+
144+
// Deletes a checkpoint from disk if present
145+
func (fcp *fileCheckPointManager) DeletePod(pod *v1.Pod) error {
146+
podPath := fcp.getPodPath(pod)
147+
if err := os.Remove(podPath); !os.IsNotExist(err) {
148+
return err
149+
}
150+
return nil
151+
}

0 commit comments

Comments
 (0)