
Commit a8c47a4

Fix some bugs; make compatible with appcenter.
1 parent 4dba826 commit a8c47a4

File tree (2 files changed: +181 −49)

  image/postgresql/docker-entrypoint.sh
  platforms/kubernetes/postgres-operator/postgres/handle.py

image/postgresql/docker-entrypoint.sh

+2 −2
@@ -201,8 +201,8 @@ main() {
     done
 
     if [ -s "$PGDATA/postgresql-auto-failover-standby.conf" ]; then
-        echo "delay start(60 seconds) on slave node"
-        sleep 60
+        echo "delay start(10 seconds) on slave node"
+        sleep 10
     fi
 
     # delete old pid

platforms/kubernetes/postgres-operator/postgres/handle.py

+179 −47
@@ -330,7 +330,7 @@ def get_primary_host(
     return output.strip()
 
 
-def waiting_cluster_correct_status(
+def waiting_cluster_final_status(
     meta: kopf.Meta,
     spec: kopf.Spec,
     patch: kopf.Patch,
@@ -341,7 +341,80 @@ def waiting_cluster_correct_status(
         return
 
     # waiting for restart
-    time.sleep(5)
+    auto_failover_conns = connections(spec, meta, patch,
+                                      get_field(AUTOFAILOVER), False, None,
+                                      logger, None, status, False)
+    for conn in auto_failover_conns.get_conns():
+        not_correct_cmd = [
+            "pgtools", "-w", "0", "-Q", "pg_auto_failover", "-q",
+            '''" select count(*) from pgautofailover.node where reportedstate <> 'primary' and reportedstate <> 'secondary' and reportedstate <> 'single' "'''
+        ]
+        primary_cmd = [
+            "pgtools", "-w", "0", "-Q", "pg_auto_failover", "-q",
+            '''" select count(*) from pgautofailover.node where reportedstate = 'primary' or reportedstate = 'single' "'''
+        ]
+        nodes_cmd = [
+            "pgtools", "-w", "0", "-Q", "pg_auto_failover", "-q",
+            '''" select count(*) from pgautofailover.node "'''
+        ]
+
+        i = 0
+        maxtry = 60
+        while True:
+            logger.info(
+                f"waiting auto_failover cluster final status, {i} times. ")
+            i += 1
+            time.sleep(1)
+            if i >= maxtry:
+                logger.warning(
+                    f"cluster maybe maybe not right. skip waitting.")
+                break
+            output = exec_command(conn, primary_cmd, logger, interrupt=False)
+            if output != '1':
+                logger.warning(
+                    f"not find primary node in autofailover, output is {output}"
+                )
+                continue
+            output = exec_command(conn,
+                                  not_correct_cmd,
+                                  logger,
+                                  interrupt=False)
+            if output != '0':
+                logger.warning(
+                    f"there are {output} nodes is not primary/secondary/single"
+                )
+                continue
+
+            if conn.get_machine() == None:
+                total_nodes = int(
+                    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) + int(
+                        spec[POSTGRESQL][READONLYINSTANCE][REPLICAS])
+            else:
+                total_nodes = len(
+                    spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(MACHINES)
+                ) + len(
+                    spec.get(POSTGRESQL).get(READONLYINSTANCE).get(MACHINES))
+            output = exec_command(conn, nodes_cmd, logger, interrupt=False)
+            if output != str(total_nodes):
+                logger.warning(
+                    f"there are {output} nodes in autofailover, expect {total_nodes} nodes"
+                )
+                continue
+
+            break
+    auto_failover_conns.free_conns()
+
+
+def waiting_cluster_correct_status(
+    meta: kopf.Meta,
+    spec: kopf.Spec,
+    patch: kopf.Patch,
+    status: kopf.Status,
+    logger: logging.Logger,
+) -> None:
+    if spec[ACTION] == ACTION_STOP:
+        return
+
     auto_failover_conns = connections(spec, meta, patch,
                                       get_field(AUTOFAILOVER), False, None,
                                       logger, None, status, False)
@@ -370,15 +443,6 @@ def waiting_cluster_correct_status(
                 logger.warning(
                     f"cluster maybe maybe not right. skip waitting.")
                 break
-            #total_nodes = int(
-            #    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) + int(
-            #        spec[POSTGRESQL][READONLYINSTANCE][REPLICAS]) # TODO machinemode len(machines)
-            #output = exec_command(conn, nodes_cmd, logger, interrupt=False)
-            #if output != str(total_nodes):
-            #    logger.warning(
-            #        f"there are {output} nodes in autofailover, expect {total_nodes} nodes"
-            #    )
-            #    continue
             output = exec_command(conn, primary_cmd, logger, interrupt=False)
             if output != '1':
                 logger.warning(
@@ -395,12 +459,28 @@ def waiting_cluster_correct_status(
                 )
                 continue
 
+            if conn.get_machine() == None:
+                total_nodes = int(
+                    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) + int(
+                        spec[POSTGRESQL][READONLYINSTANCE][REPLICAS])
+            else:
+                total_nodes = len(
+                    spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(MACHINES)
+                ) + len(
+                    spec.get(POSTGRESQL).get(READONLYINSTANCE).get(MACHINES))
+            output = exec_command(conn, nodes_cmd, logger, interrupt=False)
+            if output != str(total_nodes):
+                logger.warning(
+                    f"there are {output} nodes in autofailover, expect {total_nodes} nodes"
+                )
+                continue
+
             break
     auto_failover_conns.free_conns()
 
 
 def waiting_postgresql_ready(conns: InstanceConnections,
-                             logger: logging.Logger):
+                             logger: logging.Logger) -> bool:
     for conn in conns.get_conns():
         i = 0
         maxtry = 300
@@ -416,11 +496,10 @@ def waiting_postgresql_ready(conns: InstanceConnections,
                     f"postgresql is not ready. try {i} times. {output}")
                 if i >= maxtry:
                     logger.warning(f"postgresql is not ready. skip waitting.")
-                    break
+                    return False
             else:
                 break
-    # wait service refresh endpoint
-    time.sleep(10)
+    return True
 
 
 def waiting_instance_ready(conns: InstanceConnections, logger: logging.Logger):
@@ -1210,7 +1289,8 @@ def exec_command(conn: InstanceConnection,
     if conn.get_machine() != None:
         return docker_exec_command(conn.get_machine().get_role(),
                                    conn.get_machine().get_ssh(), cmd, logger,
-                                   interrupt, user)
+                                   interrupt, user,
+                                   conn.get_machine().get_host())
 
 
 def pod_exec_command(name: str,
@@ -1248,7 +1328,8 @@ def docker_exec_command(role: str,
                         cmd: [str],
                         logger: logging.Logger,
                         interrupt: bool = True,
-                        user: str = "root") -> str:
+                        user: str = "root",
+                        host: str = None) -> str:
     if role == AUTOFAILOVER:
         machine_data_path = operator_config.DATA_PATH_AUTOFAILOVER
     if role == POSTGRESQL:
@@ -1257,12 +1338,13 @@ def docker_exec_command(role: str,
         workdir = os.path.join(machine_data_path, DOCKER_COMPOSE_DIR)
         #cmd = "cd " + workdir + "; docker-compose exec " + role + " " + " ".join(cmd)
         cmd = "docker exec " + role + " " + " ".join(['gosu', user] + cmd)
-        logger.info(f"docker exec command {cmd}")
+        logger.info(f"docker exec command {cmd} on host {host}")
         ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(cmd, get_pty=True)
     except Exception as e:
         if interrupt:
             raise kopf.PermanentError(f"can't run command: {cmd} , {e}")
         else:
+            logger.error(f"can't run command: {cmd} , {e}")
             return FAILED
 
     # see pod_exec_command, don't check ret_code
@@ -1893,7 +1975,7 @@ def create_services(
             read_vip = service[VIP]
         elif service[SELECTOR] == SERVICE_STANDBY_READONLY:
             machines = spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(
-                MACHINES)
+                MACHINES).copy()
             machines += spec.get(POSTGRESQL).get(READONLYINSTANCE).get(
                 MACHINES)
             read_vip = service[VIP]
@@ -1926,7 +2008,10 @@ def create_services(
         conns = connections(spec, meta, patch,
                             get_field(POSTGRESQL, READWRITEINSTANCE), False,
                             None, logger, None, status, False)
-        for conn in conns.get_conns():
+        readonly_conns = connections(spec, meta, patch,
+                                     get_field(POSTGRESQL, READONLYINSTANCE),
+                                     False, None, logger, None, status, False)
+        for conn in (conns.get_conns() + readonly_conns.get_conns()):
             machine_sftp_put(conn.get_machine().get_sftp(), lvs_conf,
                              KEEPALIVED_CONF)
             machine_exec_command(
@@ -1935,6 +2020,7 @@ def create_services(
             machine_exec_command(conn.get_machine().get_ssh(),
                                  START_KEEPALIVED)
         conns.free_conns()
+        readonly_conns.free_conns()
 
 
 def check_param(spec: kopf.Spec,
@@ -2005,7 +2091,9 @@ async def create_postgresql_cluster(
     #conns = connections(spec, meta, patch,
     #                    get_field(POSTGRESQL, READWRITEINSTANCE), False, None,
     #                    logger, None, status, False)
-    #create_users(meta, spec, patch, status, logger, conns)
+    #if conns.get_conns()[0].get_machine() != None:
+    #    waiting_postgresql_ready(conns, logger)
+    #    waiting_cluster_final_status(meta, spec, patch, status, logger)
     #conns.free_conns()
 
     # create postgresql & readonly node
@@ -2033,7 +2121,7 @@ async def create_cluster(
         await create_postgresql_cluster(meta, spec, patch, status, logger)
 
         logger.info("waiting for create_cluster success")
-        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+        waiting_cluster_final_status(meta, spec, patch, status, logger)
 
         # wait a few seconds to prevent the pod not running
         time.sleep(5)
@@ -2217,18 +2305,25 @@ async def correct_keepalived(
     status: kopf.Status,
     logger: logging.Logger,
 ) -> None:
-    readwrite_conns = connections(spec, meta, patch,
-                                  get_field(POSTGRESQL, READWRITEINSTANCE),
-                                  False, None, logger, None, status, False)
-    for conn in readwrite_conns.get_conns():
+    conns = connections(spec, meta, patch,
+                        get_field(POSTGRESQL, READWRITEINSTANCE), False, None,
+                        logger, None, status, False)
+    readonly_conns = connections(spec, meta, patch,
+                                 get_field(POSTGRESQL, READONLYINSTANCE),
+                                 False, None, logger, None, status, False)
+    for conn in (conns.get_conns() + readonly_conns.get_conns()):
+        if conn.get_machine() == None:
+            break
+
         output = machine_exec_command(conn.get_machine().get_ssh(),
                                       STATUS_KEEPALIVED,
                                       interrupt=False)
         if output.find("Active: active (running)") == -1:
             delete_services(meta, spec, patch, status, logger)
             create_services(meta, spec, patch, status, logger)
             break
-    readwrite_conns.free_conns()
+    conns.free_conns()
+    readonly_conns.free_conns()
 
 
 async def correct_postgresql_role(
@@ -2591,11 +2686,15 @@ def delete_services(
     conns = connections(spec, meta, patch,
                         get_field(POSTGRESQL, READWRITEINSTANCE), False,
                         None, logger, None, status, False)
-    for conn in conns.get_conns():
+    readonly_conns = connections(spec, meta, patch,
+                                 get_field(POSTGRESQL, READONLYINSTANCE),
+                                 False, None, logger, None, status, False)
+    for conn in (conns.get_conns() + readonly_conns.get_conns()):
+        machine_exec_command(conn.get_machine().get_ssh(), STOP_KEEPALIVED)
         machine_exec_command(conn.get_machine().get_ssh(),
                              "rm -rf " + KEEPALIVED_CONF)
-        machine_exec_command(conn.get_machine().get_ssh(), STOP_KEEPALIVED)
     conns.free_conns()
+    readonly_conns.free_conns()
 
 
 def update_service(
@@ -2677,26 +2776,44 @@ def update_configs(
 
     waiting_postgresql_ready(readwrite_conns, logger)
     waiting_postgresql_ready(readonly_conns, logger)
-    logger.info("update configs(" + str(cmd) + ")")
+    primary_conn = None
     for conn in conns:
-        if get_primary_host(
-                meta, spec, patch, status,
-                logger) == get_connhost(conn) and int(
-                    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) > 1:
-            autofailover_switchover(meta, spec, patch, status, logger)
-
-            waiting_postgresql_ready(readwrite_conns, logger)
-            waiting_postgresql_ready(readonly_conns, logger)
+        if get_primary_host(meta, spec, patch, status,
+                            logger) == get_connhost(conn):
+            primary_conn = conn
+            continue
 
+        #if conn.get_machine() != None:
+        #    replicas = len(spec[POSTGRESQL][READWRITEINSTANCE][MACHINES])
+        #else:
+        #    replicas = int(spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS])
+        #if replicas > 1 and get_primary_host( meta, spec, patch, status, logger) == get_connhost(conn):
+        #    autofailover_switchover(meta, spec, patch, status, logger)
+        #    if port_change == True or restart_postgresql == True:
+        #        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+        #    else:
+        #        waiting_cluster_final_status(meta, spec, patch, status, logger)
+
+        logger.info("update configs (" + str(cmd) +
+                    ") on %s " % get_connhost(conn))
         output = exec_command(conn, cmd, logger, interrupt=False)
         if output.find(SUCCESS) == -1:
             logger.error(f"update configs {cmd} failed. {output}")
 
     #if port_change == True or restart_postgresql == True:
-    if port_change == True:
-        logger.info(f"wait readwrite instance update finish")
-        waiting_cluster_correct_status(meta, spec, patch, status,
-                                       logger)
+    #    time.sleep(10)
+    #if port_change == True:
+    #    waiting_cluster_correct_status(meta, spec, patch, status,
+    #                                   logger)
+    if port_change == True or restart_postgresql == True:
+        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+        time.sleep(6)
+
+    logger.info("update configs (" + str(cmd) +
+                ") on %s " % get_connhost(primary_conn))
+    output = exec_command(primary_conn, cmd, logger, interrupt=False)
+    if output.find(SUCCESS) == -1:
+        logger.error(f"update configs {cmd} failed. {output}")
 
     if port_change == True:
         delete_services(meta, spec, patch, status, logger)
@@ -2864,24 +2981,39 @@ async def update_cluster(
             NEW = diff[3]
 
             logger.info(diff)
-            update_replicas(meta, spec, patch, status, logger, AC, FIELD, OLD,
-                            NEW)
+
             update_action(meta, spec, patch, status, logger, AC, FIELD, OLD,
                           NEW)
             update_service(meta, spec, patch, status, logger, AC, FIELD, OLD,
                            NEW)
+
+        for diff in diffs:
+            AC = diff[0]
+            FIELD = diff[1]
+            OLD = diff[2]
+            NEW = diff[3]
+
+            update_replicas(meta, spec, patch, status, logger, AC, FIELD, OLD,
+                            NEW)
+            update_podspec_volume(meta, spec, patch, status, logger, AC, FIELD,
+                                  OLD, NEW)
+
+        for diff in diffs:
+            AC = diff[0]
+            FIELD = diff[1]
+            OLD = diff[2]
+            NEW = diff[3]
+
             update_hbas(meta, spec, patch, status, logger, AC, FIELD, OLD, NEW)
             update_users(meta, spec, patch, status, logger, AC, FIELD, OLD,
                          NEW)
             update_streaming(meta, spec, patch, status, logger, AC, FIELD, OLD,
                              NEW)
-            update_podspec_volume(meta, spec, patch, status, logger, AC, FIELD,
-                                  OLD, NEW)
             update_configs(meta, spec, patch, status, logger, AC, FIELD, OLD,
                            NEW)
 
         logger.info("waiting for update_cluster success")
-        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+        waiting_cluster_final_status(meta, spec, patch, status, logger)
 
         # wait a few seconds to prevent the pod not running
         time.sleep(5)
