@@ -330,7 +330,7 @@ def get_primary_host(
     return output.strip()


-def waiting_cluster_correct_status(
+def waiting_cluster_final_status(
     meta: kopf.Meta,
     spec: kopf.Spec,
     patch: kopf.Patch,
@@ -341,7 +341,80 @@ def waiting_cluster_correct_status(
         return

     # waiting for restart
-    time.sleep(5)
+    auto_failover_conns = connections(spec, meta, patch,
+                                      get_field(AUTOFAILOVER), False, None,
+                                      logger, None, status, False)
+    for conn in auto_failover_conns.get_conns():
+        not_correct_cmd = [
+            "pgtools", "-w", "0", "-Q", "pg_auto_failover", "-q",
+            '''" select count(*) from pgautofailover.node where reportedstate <> 'primary' and reportedstate <> 'secondary' and reportedstate <> 'single'  "'''
+        ]
+        primary_cmd = [
+            "pgtools", "-w", "0", "-Q", "pg_auto_failover", "-q",
+            '''" select count(*) from pgautofailover.node where reportedstate = 'primary' or reportedstate = 'single'  "'''
+        ]
+        nodes_cmd = [
+            "pgtools", "-w", "0", "-Q", "pg_auto_failover", "-q",
+            '''" select count(*) from pgautofailover.node  "'''
+        ]
+
+        i = 0
+        maxtry = 60
+        while True:
+            logger.info(
+                f"waiting auto_failover cluster final status, {i} times")
+            i += 1
+            time.sleep(1)
+            if i >= maxtry:
+                logger.warning(
+                    f"cluster maybe maybe not right. skip waitting.")
+                break
+            output = exec_command(conn, primary_cmd, logger, interrupt=False)
+            if output != '1':
+                logger.warning(
+                    f"not find primary node in autofailover, output is {output}"
+                )
+                continue
+            output = exec_command(conn,
+                                  not_correct_cmd,
+                                  logger,
+                                  interrupt=False)
+            if output != '0':
+                logger.warning(
+                    f"there are {output} nodes not in correct status"
+                )
+                continue
+
+            if conn.get_machine() == None:
+                total_nodes = int(
+                    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) + int(
+                        spec[POSTGRESQL][READONLYINSTANCE][REPLICAS])
+            else:
+                total_nodes = len(
+                    spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(MACHINES)
+                ) + len(
+                    spec.get(POSTGRESQL).get(READONLYINSTANCE).get(MACHINES))
+            output = exec_command(conn, nodes_cmd, logger, interrupt=False)
+            if output != str(total_nodes):
+                logger.warning(
+                    f"there are {output} nodes in autofailover, expect {total_nodes} nodes"
+                )
+                continue
+
+            break
+    auto_failover_conns.free_conns()
+
+
+def waiting_cluster_correct_status(
+    meta: kopf.Meta,
+    spec: kopf.Spec,
+    patch: kopf.Patch,
+    status: kopf.Status,
+    logger: logging.Logger,
+) -> None:
+    if spec[ACTION] == ACTION_STOP:
+        return
+
     auto_failover_conns = connections(spec, meta, patch,
                                       get_field(AUTOFAILOVER), False, None,
                                       logger, None, status, False)
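Note on the new `waiting_cluster_final_status`: it polls the pg_auto_failover monitor until exactly one node reports `primary` (or `single`), no node is left in a transitional state, and the registered node count matches the spec (REPLICAS in k8s mode, MACHINES in machine mode), giving up after 60 tries. A condensed sketch of that wait loop, where `run_monitor_query(sql)` is a hypothetical stand-in for `exec_command(conn, ["pgtools", ...], logger, interrupt=False)`:

import time

def wait_for_stable_cluster(run_monitor_query, expected_nodes, maxtry=60):
    # Hypothetical helper: run_monitor_query(sql) -> str returns COUNT(*)
    # from the pg_auto_failover monitor as text, like the pgtools calls above.
    for i in range(maxtry):
        time.sleep(1)
        # exactly one node should report 'primary' ('single' for a one-node cluster)
        if run_monitor_query(
                "select count(*) from pgautofailover.node "
                "where reportedstate = 'primary' or reportedstate = 'single'") != '1':
            continue
        # no node may be stuck in a transitional state (catchingup, wait_primary, ...)
        if run_monitor_query(
                "select count(*) from pgautofailover.node "
                "where reportedstate not in ('primary', 'secondary', 'single')") != '0':
            continue
        # every instance declared in the spec must have registered with the monitor
        if run_monitor_query(
                "select count(*) from pgautofailover.node") != str(expected_nodes):
            continue
        return True   # cluster settled
    return False      # give up, as the operator does after maxtry attempts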
@@ -370,15 +443,6 @@ def waiting_cluster_correct_status(
                 logger.warning(
                     f"cluster maybe maybe not right. skip waitting.")
                 break
-            #total_nodes = int(
-            #    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) + int(
-            #        spec[POSTGRESQL][READONLYINSTANCE][REPLICAS]) # TODO machinemode len(machines)
-            #output = exec_command(conn, nodes_cmd, logger, interrupt=False)
-            #if output != str(total_nodes):
-            #    logger.warning(
-            #        f"there are {output} nodes in autofailover, expect {total_nodes} nodes"
-            #    )
-            #    continue
             output = exec_command(conn, primary_cmd, logger, interrupt=False)
             if output != '1':
                 logger.warning(
@@ -395,12 +459,28 @@ def waiting_cluster_correct_status(
                 )
                 continue

+            if conn.get_machine() == None:
+                total_nodes = int(
+                    spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) + int(
+                        spec[POSTGRESQL][READONLYINSTANCE][REPLICAS])
+            else:
+                total_nodes = len(
+                    spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(MACHINES)
+                ) + len(
+                    spec.get(POSTGRESQL).get(READONLYINSTANCE).get(MACHINES))
+            output = exec_command(conn, nodes_cmd, logger, interrupt=False)
+            if output != str(total_nodes):
+                logger.warning(
+                    f"there are {output} nodes in autofailover, expect {total_nodes} nodes"
+                )
+                continue
+
             break
     auto_failover_conns.free_conns()


 def waiting_postgresql_ready(conns: InstanceConnections,
-                             logger: logging.Logger):
+                             logger: logging.Logger) -> bool:
     for conn in conns.get_conns():
         i = 0
         maxtry = 300
@@ -416,11 +496,10 @@ def waiting_postgresql_ready(conns: InstanceConnections,
                     f"postgresql is not ready. try {i} times. {output}")
                 if i >= maxtry:
                     logger.warning(f"postgresql is not ready. skip waitting.")
-                    break
+                    return False
             else:
                 break
-    # wait service refresh endpoint
-    time.sleep(10)
+    return True


 def waiting_instance_ready(conns: InstanceConnections, logger: logging.Logger):
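`waiting_postgresql_ready` now reports success as a bool instead of ending with a fixed `time.sleep(10)` for endpoint refresh. A caller that needs a fail-safe could react to the return value; for example (hypothetical call site, `conns` and `logger` as in the surrounding operator code):

# Hypothetical call site: retry the whole handler if the instances never came up.
if not waiting_postgresql_ready(conns, logger):
    raise kopf.TemporaryError("postgresql is not ready", delay=10)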
@@ -1210,7 +1289,8 @@ def exec_command(conn: InstanceConnection,
     if conn.get_machine() != None:
         return docker_exec_command(conn.get_machine().get_role(),
                                    conn.get_machine().get_ssh(), cmd, logger,
-                                   interrupt, user)
+                                   interrupt, user,
+                                   conn.get_machine().get_host())


 def pod_exec_command(name: str,
@@ -1248,7 +1328,8 @@ def docker_exec_command(role: str,
                         cmd: [str],
                         logger: logging.Logger,
                         interrupt: bool = True,
-                        user: str = "root") -> str:
+                        user: str = "root",
+                        host: str = None) -> str:
     if role == AUTOFAILOVER:
         machine_data_path = operator_config.DATA_PATH_AUTOFAILOVER
     if role == POSTGRESQL:
@@ -1257,12 +1338,13 @@ def docker_exec_command(role: str,
         workdir = os.path.join(machine_data_path, DOCKER_COMPOSE_DIR)
         #cmd = "cd " + workdir + "; docker-compose exec " + role + " " + " ".join(cmd)
         cmd = "docker exec " + role + " " + " ".join(['gosu', user] + cmd)
-        logger.info(f"docker exec command {cmd}")
+        logger.info(f"docker exec command {cmd} on host {host}")
         ssh_stdin, ssh_stdout, ssh_stderr = ssh.exec_command(cmd, get_pty=True)
     except Exception as e:
         if interrupt:
             raise kopf.PermanentError(f"can't run command: {cmd}, {e}")
         else:
+            logger.error(f"can't run command: {cmd}, {e}")
             return FAILED

     # see pod_exec_command, don't check ret_code
@@ -1893,7 +1975,7 @@ def create_services(
                 read_vip = service[VIP]
             elif service[SELECTOR] == SERVICE_STANDBY_READONLY:
                 machines = spec.get(POSTGRESQL).get(READWRITEINSTANCE).get(
-                    MACHINES)
+                    MACHINES).copy()
                 machines += spec.get(POSTGRESQL).get(READONLYINSTANCE).get(
                     MACHINES)
                 read_vip = service[VIP]
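The added `.copy()` matters because `spec.get(...).get(MACHINES)` hands back a reference to the list stored inside the spec, and the following `+=` extends a list in place. A minimal illustration with plain dicts (keys simplified, not the operator's constants):

# Illustrative only.
spec = {"readwrite": {"machines": ["rw-1"]}, "readonly": {"machines": ["ro-1"]}}

machines = spec["readwrite"]["machines"]         # without .copy(): aliases the spec's list
machines += spec["readonly"]["machines"]         # += extends the list in place ...
assert spec["readwrite"]["machines"] == ["rw-1", "ro-1"]   # ... so the spec itself is mutated

spec = {"readwrite": {"machines": ["rw-1"]}, "readonly": {"machines": ["ro-1"]}}
machines = spec["readwrite"]["machines"].copy()  # with .copy(): work on a private list
machines += spec["readonly"]["machines"]
assert spec["readwrite"]["machines"] == ["rw-1"]           # spec left untouched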
@@ -1926,7 +2008,10 @@ def create_services(
         conns = connections(spec, meta, patch,
                             get_field(POSTGRESQL, READWRITEINSTANCE), False,
                             None, logger, None, status, False)
-        for conn in conns.get_conns():
+        readonly_conns = connections(spec, meta, patch,
+                                     get_field(POSTGRESQL, READONLYINSTANCE),
+                                     False, None, logger, None, status, False)
+        for conn in (conns.get_conns() + readonly_conns.get_conns()):
             machine_sftp_put(conn.get_machine().get_sftp(), lvs_conf,
                              KEEPALIVED_CONF)
             machine_exec_command(
@@ -1935,6 +2020,7 @@ def create_services(
             machine_exec_command(conn.get_machine().get_ssh(),
                                  START_KEEPALIVED)
         conns.free_conns()
+        readonly_conns.free_conns()


 def check_param(spec: kopf.Spec,
@@ -2005,7 +2091,9 @@ async def create_postgresql_cluster(
     #conns = connections(spec, meta, patch,
     #                    get_field(POSTGRESQL, READWRITEINSTANCE), False, None,
     #                    logger, None, status, False)
-    #create_users(meta, spec, patch, status, logger, conns)
+    #if conns.get_conns()[0].get_machine() != None:
+    #    waiting_postgresql_ready(conns, logger)
+    #    waiting_cluster_final_status(meta, spec, patch, status, logger)
     #conns.free_conns()

     # create postgresql & readonly node
@@ -2033,7 +2121,7 @@ async def create_cluster(
         await create_postgresql_cluster(meta, spec, patch, status, logger)

         logger.info("waiting for create_cluster success")
-        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+        waiting_cluster_final_status(meta, spec, patch, status, logger)

         # wait a few seconds to prevent the pod not running
         time.sleep(5)
@@ -2217,18 +2305,25 @@ async def correct_keepalived(
     status: kopf.Status,
     logger: logging.Logger,
 ) -> None:
-    readwrite_conns = connections(spec, meta, patch,
-                                  get_field(POSTGRESQL, READWRITEINSTANCE),
-                                  False, None, logger, None, status, False)
-    for conn in readwrite_conns.get_conns():
+    conns = connections(spec, meta, patch,
+                        get_field(POSTGRESQL, READWRITEINSTANCE), False, None,
+                        logger, None, status, False)
+    readonly_conns = connections(spec, meta, patch,
+                                 get_field(POSTGRESQL, READONLYINSTANCE),
+                                 False, None, logger, None, status, False)
+    for conn in (conns.get_conns() + readonly_conns.get_conns()):
+        if conn.get_machine() == None:
+            break
+
         output = machine_exec_command(conn.get_machine().get_ssh(),
                                       STATUS_KEEPALIVED,
                                       interrupt=False)
         if output.find("Active: active (running)") == -1:
             delete_services(meta, spec, patch, status, logger)
             create_services(meta, spec, patch, status, logger)
             break
-    readwrite_conns.free_conns()
+    conns.free_conns()
+    readonly_conns.free_conns()


 async def correct_postgresql_role(
@@ -2591,11 +2686,15 @@ def delete_services(
         conns = connections(spec, meta, patch,
                             get_field(POSTGRESQL, READWRITEINSTANCE), False,
                             None, logger, None, status, False)
-        for conn in conns.get_conns():
+        readonly_conns = connections(spec, meta, patch,
+                                     get_field(POSTGRESQL, READONLYINSTANCE),
+                                     False, None, logger, None, status, False)
+        for conn in (conns.get_conns() + readonly_conns.get_conns()):
+            machine_exec_command(conn.get_machine().get_ssh(), STOP_KEEPALIVED)
             machine_exec_command(conn.get_machine().get_ssh(),
                                  "rm -rf " + KEEPALIVED_CONF)
-            machine_exec_command(conn.get_machine().get_ssh(), STOP_KEEPALIVED)
         conns.free_conns()
+        readonly_conns.free_conns()


 def update_service(
@@ -2677,26 +2776,44 @@ def update_configs(

         waiting_postgresql_ready(readwrite_conns, logger)
         waiting_postgresql_ready(readonly_conns, logger)
-        logger.info("update configs(" + str(cmd) + ")")
+        primary_conn = None
         for conn in conns:
-            if get_primary_host(
-                    meta, spec, patch, status,
-                    logger) == get_connhost(conn) and int(
-                        spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS]) > 1:
-                autofailover_switchover(meta, spec, patch, status, logger)
-
-            waiting_postgresql_ready(readwrite_conns, logger)
-            waiting_postgresql_ready(readonly_conns, logger)
+            if get_primary_host(meta, spec, patch, status,
+                                logger) == get_connhost(conn):
+                primary_conn = conn
+                continue

+            #if conn.get_machine() != None:
+            #    replicas = len(spec[POSTGRESQL][READWRITEINSTANCE][MACHINES])
+            #else:
+            #    replicas = int(spec[POSTGRESQL][READWRITEINSTANCE][REPLICAS])
+            #if replicas > 1 and get_primary_host( meta, spec, patch, status, logger) == get_connhost(conn):
+            #    autofailover_switchover(meta, spec, patch, status, logger)
+            #    if port_change == True or restart_postgresql == True:
+            #        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+            #    else:
+            #        waiting_cluster_final_status(meta, spec, patch, status, logger)
+
+            logger.info("update configs (" + str(cmd) +
+                        ") on %s " % get_connhost(conn))
             output = exec_command(conn, cmd, logger, interrupt=False)
             if output.find(SUCCESS) == -1:
                 logger.error(f"update configs {cmd} failed. {output}")

             #if port_change == True or restart_postgresql == True:
-            if port_change == True:
-                logger.info(f"wait readwrite instance update finish")
-                waiting_cluster_correct_status(meta, spec, patch, status,
-                                               logger)
+            #    time.sleep(10)
+            #if port_change == True:
+            #    waiting_cluster_correct_status(meta, spec, patch, status,
+            #                                   logger)
+        if port_change == True or restart_postgresql == True:
+            waiting_cluster_correct_status(meta, spec, patch, status, logger)
+            time.sleep(6)
+
+        logger.info("update configs (" + str(cmd) +
+                    ") on %s " % get_connhost(primary_conn))
+        output = exec_command(primary_conn, cmd, logger, interrupt=False)
+        if output.find(SUCCESS) == -1:
+            logger.error(f"update configs {cmd} failed. {output}")

         if port_change == True:
             delete_services(meta, spec, patch, status, logger)
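The reworked `update_configs` loop now skips the primary while pushing the new settings to the other instances, waits for pg_auto_failover to settle when a port change or restart is involved, and only then reconfigures the primary. A hedged condensation of that ordering (all names below are stand-ins, not the operator's API):

def rolling_config_update(instances, primary_host, apply_config, wait_until_stable,
                          needs_restart=False):
    # instances: iterable of (host, conn) pairs; apply_config(conn) pushes the new
    # settings; wait_until_stable() blocks until pg_auto_failover reports a settled
    # cluster. All names here are hypothetical stand-ins for the operator's helpers.
    primary = None
    for host, conn in instances:
        if host == primary_host:
            primary = conn               # remember the primary, update it last
            continue
        apply_config(conn)               # replicas / standbys first
    if needs_restart:
        wait_until_stable()              # give the failover machinery time to settle
    if primary is not None:
        apply_config(primary)            # finally reconfigure the primary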
@@ -2864,24 +2981,39 @@ async def update_cluster(
             NEW = diff[3]

             logger.info(diff)
-            update_replicas(meta, spec, patch, status, logger, AC, FIELD, OLD,
-                            NEW)
+
             update_action(meta, spec, patch, status, logger, AC, FIELD, OLD,
                           NEW)
             update_service(meta, spec, patch, status, logger, AC, FIELD, OLD,
                            NEW)
+
+        for diff in diffs:
+            AC = diff[0]
+            FIELD = diff[1]
+            OLD = diff[2]
+            NEW = diff[3]
+
+            update_replicas(meta, spec, patch, status, logger, AC, FIELD, OLD,
+                            NEW)
+            update_podspec_volume(meta, spec, patch, status, logger, AC, FIELD,
+                                  OLD, NEW)
+
+        for diff in diffs:
+            AC = diff[0]
+            FIELD = diff[1]
+            OLD = diff[2]
+            NEW = diff[3]
+
             update_hbas(meta, spec, patch, status, logger, AC, FIELD, OLD, NEW)
             update_users(meta, spec, patch, status, logger, AC, FIELD, OLD,
                          NEW)
             update_streaming(meta, spec, patch, status, logger, AC, FIELD, OLD,
                              NEW)
-            update_podspec_volume(meta, spec, patch, status, logger, AC, FIELD,
-                                  OLD, NEW)
             update_configs(meta, spec, patch, status, logger, AC, FIELD, OLD,
                            NEW)

         logger.info("waiting for update_cluster success")
-        waiting_cluster_correct_status(meta, spec, patch, status, logger)
+        waiting_cluster_final_status(meta, spec, patch, status, logger)

         # wait a few seconds to prevent the pod not running
         time.sleep(5)
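`update_cluster` now walks the kopf diff list in three ordered passes: action and service changes first, then replica and pod/volume changes, then the in-database updates (HBA, users, streaming, configs), followed by the final-status wait. Roughly, under the assumption that the handler names are the operator's own functions and the PASSES table is only illustrative:

PASSES = [
    (update_action, update_service),                # 1: start/stop and service wiring
    (update_replicas, update_podspec_volume),       # 2: topology and pod/volume changes
    (update_hbas, update_users, update_streaming, update_configs),  # 3: in-database settings
]

def apply_diffs(meta, spec, patch, status, logger, diffs):
    for handlers in PASSES:
        for ac, field, old, new in diffs:   # kopf diff tuples: (action, field, old, new)
            for handler in handlers:
                handler(meta, spec, patch, status, logger, ac, field, old, new)
    waiting_cluster_final_status(meta, spec, patch, status, logger)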