Skip to content

Commit 806ef42

Browse files
authored
Add interlocks to ensure operations are not interrupted (#150)
* Add interlocks to ensure system-agent does not get restarted when it is applying a plan and does not start applying a plan when a restart is pending
* Remove s390x from drone file
* Don't always set CROSS to true when building

Signed-off-by: Chris Kim <[email protected]>
1 parent 57830e0 commit 806ef42

File tree

6 files changed

+183
-98
lines changed

6 files changed

+183
-98
lines changed

.drone.yml

Lines changed: 94 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -160,96 +160,96 @@ volumes:
160160
host:
161161
path: /var/run/docker.sock
162162

163-
---
164-
kind: pipeline
165-
name: s390x
166-
167-
platform:
168-
os: linux
169-
arch: amd64
170-
171-
# Hack needed for s390x: https://gist.github.com/colstrom/c2f359f72658aaabb44150ac20b16d7c#gistcomment-3858388
172-
node:
173-
arch: s390x
174-
175-
steps:
176-
- name: build
177-
image: rancher/dapper:v0.6.0
178-
commands:
179-
- dapper ci
180-
volumes:
181-
- name: docker
182-
path: /var/run/docker.sock
183-
184-
- name: github_binary_release
185-
image: rancher/drone-images:github-release-s390x
186-
settings:
187-
api_key:
188-
from_secret: github_token
189-
prerelease: true
190-
checksum:
191-
- sha256
192-
checksum_file: CHECKSUMsum-s390x.txt
193-
checksum_flatten: true
194-
files:
195-
- "dist/artifacts/*"
196-
when:
197-
instance:
198-
- drone-publish.rancher.io
199-
ref:
200-
- refs/head/master
201-
- refs/tags/*
202-
event:
203-
- tag
204-
205-
- name: docker-publish
206-
image: rancher/drone-images:docker-s390x
207-
volumes:
208-
- name: docker
209-
path: /var/run/docker.sock
210-
settings:
211-
dockerfile: package/Dockerfile
212-
password:
213-
from_secret: docker_password
214-
repo: "rancher/system-agent"
215-
tag: "${DRONE_TAG}-s390x"
216-
username:
217-
from_secret: docker_username
218-
when:
219-
instance:
220-
- drone-publish.rancher.io
221-
ref:
222-
- refs/head/master
223-
- refs/tags/*
224-
event:
225-
- tag
226-
227-
- name: docker-publish-suc
228-
image: rancher/drone-images:docker-s390x
229-
volumes:
230-
- name: docker
231-
path: /var/run/docker.sock
232-
settings:
233-
dockerfile: package/Dockerfile.suc
234-
password:
235-
from_secret: docker_password
236-
repo: "rancher/system-agent"
237-
tag: "${DRONE_TAG}-suc-s390x"
238-
username:
239-
from_secret: docker_username
240-
when:
241-
instance:
242-
- drone-publish.rancher.io
243-
ref:
244-
- refs/head/master
245-
- refs/tags/*
246-
event:
247-
- tag
248-
249-
volumes:
250-
- name: docker
251-
host:
252-
path: /var/run/docker.sock
163+
#---
164+
#kind: pipeline
165+
#name: s390x
166+
#
167+
#platform:
168+
# os: linux
169+
# arch: amd64
170+
#
171+
## Hack needed for s390x: https://gist.github.com/colstrom/c2f359f72658aaabb44150ac20b16d7c#gistcomment-3858388
172+
#node:
173+
# arch: s390x
174+
#
175+
#steps:
176+
#- name: build
177+
# image: rancher/dapper:v0.6.0
178+
# commands:
179+
# - dapper ci
180+
# volumes:
181+
# - name: docker
182+
# path: /var/run/docker.sock
183+
#
184+
#- name: github_binary_release
185+
# image: rancher/drone-images:github-release-s390x
186+
# settings:
187+
# api_key:
188+
# from_secret: github_token
189+
# prerelease: true
190+
# checksum:
191+
# - sha256
192+
# checksum_file: CHECKSUMsum-s390x.txt
193+
# checksum_flatten: true
194+
# files:
195+
# - "dist/artifacts/*"
196+
# when:
197+
# instance:
198+
# - drone-publish.rancher.io
199+
# ref:
200+
# - refs/head/master
201+
# - refs/tags/*
202+
# event:
203+
# - tag
204+
#
205+
#- name: docker-publish
206+
# image: rancher/drone-images:docker-s390x
207+
# volumes:
208+
# - name: docker
209+
# path: /var/run/docker.sock
210+
# settings:
211+
# dockerfile: package/Dockerfile
212+
# password:
213+
# from_secret: docker_password
214+
# repo: "rancher/system-agent"
215+
# tag: "${DRONE_TAG}-s390x"
216+
# username:
217+
# from_secret: docker_username
218+
# when:
219+
# instance:
220+
# - drone-publish.rancher.io
221+
# ref:
222+
# - refs/head/master
223+
# - refs/tags/*
224+
# event:
225+
# - tag
226+
#
227+
#- name: docker-publish-suc
228+
# image: rancher/drone-images:docker-s390x
229+
# volumes:
230+
# - name: docker
231+
# path: /var/run/docker.sock
232+
# settings:
233+
# dockerfile: package/Dockerfile.suc
234+
# password:
235+
# from_secret: docker_password
236+
# repo: "rancher/system-agent"
237+
# tag: "${DRONE_TAG}-suc-s390x"
238+
# username:
239+
# from_secret: docker_username
240+
# when:
241+
# instance:
242+
# - drone-publish.rancher.io
243+
# ref:
244+
# - refs/head/master
245+
# - refs/tags/*
246+
# event:
247+
# - tag
248+
#
249+
#volumes:
250+
#- name: docker
251+
# host:
252+
# path: /var/run/docker.sock
253253

254254
---
255255
kind: pipeline
@@ -270,7 +270,7 @@ steps:
270270
platforms:
271271
- linux/amd64
272272
- linux/arm64
273-
- linux/s390x
273+
# - linux/s390x
274274
target: "rancher/system-agent:${DRONE_TAG}"
275275
template: "rancher/system-agent:${DRONE_TAG}-ARCH"
276276
when:
@@ -285,7 +285,7 @@ steps:
285285
depends_on:
286286
- amd64
287287
- arm64
288-
- s390x
288+
#- s390x
289289

290290
---
291291
kind: pipeline
@@ -306,7 +306,7 @@ steps:
306306
platforms:
307307
- linux/amd64
308308
- linux/arm64
309-
- linux/s390x
309+
# - linux/s390x
310310
target: "rancher/system-agent:${DRONE_TAG}-suc"
311311
template: "rancher/system-agent:${DRONE_TAG}-suc-ARCH"
312312
when:
@@ -321,4 +321,4 @@ steps:
321321
depends_on:
322322
- amd64
323323
- arm64
324-
- s390x
324+
#- s390x

install.sh

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ fi
4747
FALLBACK=v0.2.9
4848
CACERTS_PATH=cacerts
4949
RETRYCOUNT=4500
50+
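APPLYINATOR_ACTIVE_WAIT_COUNT=60 # Upper bound on the number of 5-second checks (roughly 5 minutes in total) to wait for an active-plan interlock file to clear. If the system-agent is unhealthy but had created an interlock file to indicate it was actively applying a plan, the interlock is ignored once this limit is reached.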
5051

5152
# info logs the given argument at info log level.
5253
info() {
@@ -426,11 +427,13 @@ setup_env() {
426427
}
427428

428429
ensure_directories() {
429-
mkdir -p ${CATTLE_AGENT_VAR_DIR}
430+
mkdir -p ${CATTLE_AGENT_VAR_DIR}/interlock
430431
mkdir -p ${CATTLE_AGENT_CONFIG_DIR}
431432
chmod 700 ${CATTLE_AGENT_VAR_DIR}
433+
chmod 700 ${CATTLE_AGENT_VAR_DIR}/interlock
432434
chmod 700 ${CATTLE_AGENT_CONFIG_DIR}
433435
chown root:root ${CATTLE_AGENT_VAR_DIR}
436+
chown root:root ${CATTLE_AGENT_VAR_DIR}/interlock
434437
chown root:root ${CATTLE_AGENT_CONFIG_DIR}
435438
}
436439

@@ -783,6 +786,7 @@ appliedPlanDirectory: ${CATTLE_AGENT_VAR_DIR}/applied
783786
remoteEnabled: ${CATTLE_REMOTE_ENABLED}
784787
localEnabled: ${CATTLE_LOCAL_ENABLED}
785788
localPlanDirectory: ${CATTLE_AGENT_VAR_DIR}/plans
789+
interlockDirectory: ${CATTLE_AGENT_VAR_DIR}/interlock
786790
preserveWorkDirectory: ${CATTLE_PRESERVE_WORKDIR}
787791
EOF
788792
umask "${UMASK}"
@@ -796,6 +800,9 @@ generate_cattle_identifier() {
796800
info "Generating Cattle ID"
797801
if [ -f "${CATTLE_AGENT_CONFIG_DIR}/cattle-id" ]; then
798802
CATTLE_ID=$(cat ${CATTLE_AGENT_CONFIG_DIR}/cattle-id);
803+
if [ -z "${CATTLE_ID}" ]; then
804+
fatal "Cattle ID was empty, aborting installation"
805+
fi
799806
info "Cattle ID was already detected as ${CATTLE_ID}. Not generating a new one."
800807
return
801808
fi
@@ -805,6 +812,9 @@ generate_cattle_identifier() {
805812
umask 0177
806813
echo "${CATTLE_ID}" > ${CATTLE_AGENT_CONFIG_DIR}/cattle-id
807814
umask "${UMASK}"
815+
if [ ! -s ${CATTLE_AGENT_CONFIG_DIR}/cattle-id ]; then
816+
fatal "Cattle ID could not be persisted. Aborting installation"
817+
fi
808818
return
809819
fi
810820
info "Not generating Cattle ID"
@@ -832,6 +842,19 @@ create_env_file() {
832842
done
833843
}
834844

845+
ensure_applyinator_not_active() {
846+
i=1
847+
while [ "${i}" -ne "${APPLYINATOR_ACTIVE_WAIT_COUNT}" ]; do
848+
if [ -f "${CATTLE_AGENT_VAR_DIR}/interlock/applyinator-active" ]; then
849+
i=$((i + 1))
850+
info "Active plan reconciliation detected. Sleeping for 5 seconds and retrying check"
851+
sleep 5
852+
continue
853+
fi
854+
break
855+
done
856+
}
857+
835858
do_install() {
836859
if [ $(id -u) != 0 ]; then
837860
fatal "This script must be run as root."
@@ -843,6 +866,9 @@ do_install() {
843866
ensure_directories
844867
verify_downloader curl || fatal "can not find curl for downloading files"
845868

869+
touch ${CATTLE_AGENT_VAR_DIR}/interlock/restart-pending
870+
ensure_applyinator_not_active
871+
846872
if [ -n "${CATTLE_CA_CHECKSUM}" ]; then
847873
validate_ca_required
848874
fi
@@ -865,6 +891,7 @@ do_install() {
865891
systemctl enable rancher-system-agent
866892
info "Starting/restarting rancher-system-agent.service"
867893
systemctl restart rancher-system-agent
894+
rm -f ${CATTLE_AGENT_VAR_DIR}/interlock/restart-pending
868895
}
869896

870897
do_install "$@"

main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func run(c *cli.Context) error {
8181
logrus.Infof("Using directory %s for work", cf.WorkDir)
8282

8383
imageUtil := image.NewUtility(cf.ImagesDir, cf.ImageCredentialProviderConfig, cf.ImageCredentialProviderBinDir, cf.AgentRegistriesFile)
84-
applyinator := applyinator.NewApplyinator(cf.WorkDir, cf.PreserveWorkDir, cf.AppliedPlanDir, imageUtil)
84+
applyinator := applyinator.NewApplyinator(cf.WorkDir, cf.PreserveWorkDir, cf.AppliedPlanDir, cf.InterlockDir, imageUtil)
8585

8686
if cf.RemoteEnabled {
8787
logrus.Infof("Starting remote watch of plans")

0 commit comments

Comments
 (0)