Skip to content

Commit c62351b

Browse files
authored
DPE-3389 support rollback incompat data dir (#385)
* rollback from incompatible data_dir * port test * remove vm fixture * missing test parameter * no need to patch for oci * fixes * remove dup error and properly translating exception * always replace layer and fixed call ordering * complete test and complete upgrade process * fix/add test * big timeout for recovery * continuous writes are tested on other tests * mitigate false-positive failure * more robust test * missing retry action * fix list to exclude unit by name
1 parent 96a10f3 commit c62351b

11 files changed

+421
-104
lines changed

src/charm.py

Lines changed: 50 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@
1818
BYTES_1MB,
1919
MySQLAddInstanceToClusterError,
2020
MySQLCharmBase,
21-
MySQLConfigureInstanceError,
22-
MySQLConfigureMySQLUsersError,
2321
MySQLCreateClusterError,
2422
MySQLGetClusterPrimaryAddressError,
2523
MySQLGetMemberStateError,
@@ -72,11 +70,7 @@
7270
)
7371
from k8s_helpers import KubernetesHelpers
7472
from log_rotate_manager import LogRotateManager
75-
from mysql_k8s_helpers import (
76-
MySQL,
77-
MySQLCreateCustomConfigFileError,
78-
MySQLInitialiseMySQLDError,
79-
)
73+
from mysql_k8s_helpers import MySQL
8074
from relations.mysql import MySQLRelation
8175
from relations.mysql_provider import MySQLProvider
8276
from relations.mysql_root import MySQLRootRelation
@@ -276,7 +270,7 @@ def _is_unit_waiting_to_join_cluster(self) -> bool:
276270
and not self.unit_peer_data.get("unit-initialized")
277271
)
278272

279-
def _join_unit_to_cluster(self) -> None:
273+
def join_unit_to_cluster(self) -> None:
280274
"""Join the unit to the cluster.
281275
282276
Try to join the unit from the primary unit.
@@ -485,45 +479,45 @@ def _open_ports(self) -> None:
485479
except ops.ModelError:
486480
logger.exception("failed to open port")
487481

488-
def _configure_instance(self, container) -> bool:
489-
"""Configure the instance for use in Group Replication."""
490-
try:
491-
# Run mysqld for the first time to
492-
# bootstrap the data directory and users
493-
logger.debug("Initializing instance")
494-
self._mysql.fix_data_dir(container)
495-
self._mysql.initialise_mysqld()
496-
497-
# Add the pebble layer
498-
logger.debug("Adding pebble layer")
499-
container.add_layer(MYSQLD_SAFE_SERVICE, self._pebble_layer, combine=False)
500-
container.restart(MYSQLD_SAFE_SERVICE)
501-
502-
logger.debug("Waiting for instance to be ready")
503-
self._mysql.wait_until_mysql_connection(check_port=False)
504-
505-
logger.info("Configuring instance")
506-
# Configure all base users and revoke privileges from the root users
507-
self._mysql.configure_mysql_users()
508-
# Configure instance as a cluster node
509-
self._mysql.configure_instance()
482+
def _write_mysqld_configuration(self):
483+
"""Write the mysqld configuration to the file."""
484+
memory_limit_bytes = (self.config.profile_limit_memory or 0) * BYTES_1MB
485+
new_config_content, _ = self._mysql.render_mysqld_configuration(
486+
profile=self.config.profile,
487+
memory_limit=memory_limit_bytes,
488+
)
489+
self._mysql.write_content_to_file(path=MYSQLD_CONFIG_FILE, content=new_config_content)
510490

511-
if self.has_cos_relation:
512-
if container.get_services(MYSQLD_EXPORTER_SERVICE)[
513-
MYSQLD_EXPORTER_SERVICE
514-
].is_running():
515-
# Restart exporter service after configuration
516-
container.restart(MYSQLD_EXPORTER_SERVICE)
517-
else:
518-
container.start(MYSQLD_EXPORTER_SERVICE)
519-
except (
520-
MySQLConfigureInstanceError,
521-
MySQLConfigureMySQLUsersError,
522-
MySQLInitialiseMySQLDError,
523-
MySQLCreateCustomConfigFileError,
524-
) as e:
525-
logger.debug("Unable to configure instance: {}".format(e))
526-
return False
491+
def _configure_instance(self, container) -> None:
492+
"""Configure the instance for use in Group Replication."""
493+
# Run mysqld for the first time to
494+
# bootstrap the data directory and users
495+
logger.debug("Initializing instance")
496+
self._mysql.fix_data_dir(container)
497+
self._mysql.initialise_mysqld()
498+
499+
# Add the pebble layer
500+
logger.debug("Adding pebble layer")
501+
container.add_layer(MYSQLD_SAFE_SERVICE, self._pebble_layer, combine=True)
502+
container.restart(MYSQLD_SAFE_SERVICE)
503+
504+
logger.debug("Waiting for instance to be ready")
505+
self._mysql.wait_until_mysql_connection(check_port=False)
506+
507+
logger.info("Configuring instance")
508+
# Configure all base users and revoke privileges from the root users
509+
self._mysql.configure_mysql_users()
510+
# Configure instance as a cluster node
511+
self._mysql.configure_instance()
512+
513+
if self.has_cos_relation:
514+
if container.get_services(MYSQLD_EXPORTER_SERVICE)[
515+
MYSQLD_EXPORTER_SERVICE
516+
].is_running():
517+
# Restart exporter service after configuration
518+
container.restart(MYSQLD_EXPORTER_SERVICE)
519+
else:
520+
container.start(MYSQLD_EXPORTER_SERVICE)
527521

528522
self._open_ports()
529523

@@ -535,8 +529,6 @@ def _configure_instance(self, container) -> bool:
535529
# Do not block the charm if the version cannot be retrieved
536530
pass
537531

538-
return True
539-
540532
def _mysql_pebble_ready_checks(self, event) -> bool:
541533
"""Executes some checks to see if it is safe to execute the pebble ready handler."""
542534
if not self._is_peer_data_set:
@@ -560,17 +552,13 @@ def _on_mysql_pebble_ready(self, event) -> None:
560552
event.defer()
561553
return
562554

555+
if not self.upgrade.idle:
556+
# when upgrading pebble ready is
557+
# task delegated to upgrade code
558+
return
559+
563560
container = event.workload
564-
try:
565-
memory_limit_bytes = (self.config.profile_limit_memory or 0) * BYTES_1MB
566-
new_config_content, _ = self._mysql.render_mysqld_configuration(
567-
profile=self.config.profile,
568-
memory_limit=memory_limit_bytes,
569-
)
570-
self._mysql.write_content_to_file(path=MYSQLD_CONFIG_FILE, content=new_config_content)
571-
except MySQLCreateCustomConfigFileError:
572-
logger.exception("Unable to write custom config file")
573-
raise
561+
self._write_mysqld_configuration()
574562

575563
logger.info("Setting up the logrotate configurations")
576564
self._mysql.setup_logrotate_config()
@@ -585,15 +573,13 @@ def _on_mysql_pebble_ready(self, event) -> None:
585573
self.unit.status = MaintenanceStatus("Initialising mysqld")
586574

587575
# First run setup
588-
if not self._configure_instance(container):
589-
raise
576+
self._configure_instance(container)
590577

591578
if not self.unit.is_leader():
592579
# Non-leader units should wait for leader to add them to the cluster
593580
self.unit.status = WaitingStatus("Waiting for instance to join the cluster")
594581
self.unit_peer_data.update({"member-role": "secondary", "member-state": "waiting"})
595-
596-
self._join_unit_to_cluster()
582+
self.join_unit_to_cluster()
597583
return
598584

599585
try:
@@ -607,7 +593,6 @@ def _on_mysql_pebble_ready(self, event) -> None:
607593
self.app_peer_data["units-added-to-cluster"] = "1"
608594

609595
state, role = self._mysql.get_member_state()
610-
611596
self.unit_peer_data.update(
612597
{"member-state": state, "member-role": role, "unit-initialized": "True"}
613598
)
@@ -702,7 +687,7 @@ def _on_update_status(self, _: Optional[UpdateStatusEvent]) -> None:
702687
if not self.unit.is_leader() and self._is_unit_waiting_to_join_cluster():
703688
# join cluster test takes precedence over blocked test
704689
# due to matching criteria
705-
self._join_unit_to_cluster()
690+
self.join_unit_to_cluster()
706691
return
707692

708693
if self._is_cluster_blocked():
@@ -748,7 +733,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent) -> None:
748733
return
749734

750735
if self._is_unit_waiting_to_join_cluster():
751-
self._join_unit_to_cluster()
736+
self.join_unit_to_cluster()
752737

753738
def _on_database_storage_detaching(self, _) -> None:
754739
"""Handle the database storage detaching event."""

src/mysql_k8s_helpers.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
MySQLConfigureMySQLUsersError,
1717
MySQLExecError,
1818
MySQLGetClusterEndpointsError,
19+
MySQLServiceNotRunningError,
1920
MySQLStartMySQLDError,
2021
MySQLStopMySQLDError,
2122
)
@@ -61,10 +62,6 @@ class MySQLInitialiseMySQLDError(Error):
6162
"""Exception raised when there is an issue initialising an instance."""
6263

6364

64-
class MySQLServiceNotRunningError(Error):
65-
"""Exception raised when the MySQL service is not running."""
66-
67-
6865
class MySQLCreateCustomConfigFileError(Error):
6966
"""Exception raised when there is an issue creating custom config file."""
7067

@@ -229,10 +226,13 @@ def wait_until_mysql_connection(self, check_port: bool = True) -> None:
229226
Retry every 5 seconds for 30 seconds if there is an issue obtaining a connection.
230227
"""
231228
if not self.container.exists(MYSQLD_SOCK_FILE):
232-
raise MySQLServiceNotRunningError()
229+
raise MySQLServiceNotRunningError
233230

234-
if check_port and not self.check_mysqlsh_connection():
235-
raise MySQLServiceNotRunningError("Connection with mysqlsh not possible")
231+
try:
232+
if check_port and not self.check_mysqlsh_connection():
233+
raise MySQLServiceNotRunningError("Connection with mysqlsh not possible")
234+
except MySQLClientError:
235+
raise MySQLServiceNotRunningError
236236

237237
logger.debug("MySQL connection possible")
238238

@@ -852,3 +852,7 @@ def set_cluster_primary(self, new_primary_address: str) -> None:
852852
"""Set the cluster primary and update pod labels."""
853853
super().set_cluster_primary(new_primary_address)
854854
self.update_endpoints()
855+
856+
def fetch_error_log(self) -> Optional[str]:
857+
"""Fetch the MySQL error log."""
858+
return self.read_file_content("/var/log/mysql/error.log")

src/relations/mysql_provider.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,12 @@ def _on_update_status(self, _) -> None:
216216
if self.charm._is_cluster_blocked():
217217
return
218218

219+
if self.charm.upgrade.state == "failed":
220+
# skip updating endpoints if upgrade failed
221+
# unit pod still will be labeled from another unit
222+
logger.debug("Skip labelling pods on failed upgrade")
223+
return
224+
219225
container = self.charm.unit.get_container(CONTAINER_NAME)
220226
if (
221227
not container.can_connect()

src/rotate_mysql_logs.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,12 @@ def __init__(self, charm: "MySQLOperatorCharm"):
4343

4444
def _rotate_mysql_logs(self, _) -> None:
4545
"""Rotate the mysql logs."""
46-
if self.charm.peers is None or self.charm.unit_peer_data.get("unit-initialized") != "True":
46+
if (
47+
self.charm.peers is None
48+
or self.charm.unit_peer_data.get("unit-initialized") != "True"
49+
or not self.charm.upgrade.idle
50+
):
51+
# skip when not initialized or during an upgrade
4752
return
4853

4954
try:

src/upgrade.py

Lines changed: 62 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,24 @@
1515
)
1616
from charms.mysql.v0.mysql import (
1717
MySQLGetMySQLVersionError,
18+
MySQLRebootFromCompleteOutageError,
1819
MySQLRescanClusterError,
20+
MySQLServerNotUpgradableError,
21+
MySQLServiceNotRunningError,
1922
MySQLSetClusterPrimaryError,
2023
MySQLSetVariableError,
2124
)
22-
from ops import JujuVersion
25+
from ops import Container, JujuVersion
2326
from ops.model import BlockedStatus, MaintenanceStatus, RelationDataContent
27+
from ops.pebble import ChangeError
2428
from pydantic import BaseModel
2529
from tenacity import RetryError, Retrying
2630
from tenacity.stop import stop_after_attempt
2731
from tenacity.wait import wait_fixed
2832
from typing_extensions import override
2933

3034
import k8s_helpers
35+
from constants import MYSQLD_SAFE_SERVICE
3136

3237
if TYPE_CHECKING:
3338
from charm import MySQLOperatorCharm
@@ -218,27 +223,45 @@ def _on_pebble_ready(self, event) -> None:
218223
if self.state not in ["upgrading", "recovery"]:
219224
return
220225

226+
container = event.workload
227+
self.charm._write_mysqld_configuration()
228+
229+
logger.info("Setting up the logrotate configurations")
230+
self.charm._mysql.setup_logrotate_config()
231+
221232
try:
222-
self.charm.unit.set_workload_version(self.charm._mysql.get_mysql_version() or "unset")
223-
except MySQLGetMySQLVersionError:
224-
# don't fail on this, just log it
225-
logger.warning("Failed to get MySQL version")
226-
try:
227-
failure_message = "unknown error"
233+
self.charm._reconcile_pebble_layer(container)
228234
self._check_server_upgradeability()
229235
self.charm.unit.status = MaintenanceStatus("recovering unit after upgrade")
230236
if self.charm.app.planned_units() > 1:
231237
self._recover_multi_unit_cluster()
232238
else:
233239
self._recover_single_unit_cluster()
234240
self._complete_upgrade()
235-
except Exception:
236-
failure_message = "Unit failed to rejoin the cluster after upgrade"
237-
logger.error(failure_message)
241+
except MySQLRebootFromCompleteOutageError:
242+
logger.error("Failed to reboot single unit from outage after upgrade")
238243
self.set_unit_failed()
239244
self.charm.unit.status = BlockedStatus(
240245
"upgrade failed. Check logs for rollback instruction"
241246
)
247+
except (
248+
RetryError,
249+
MySQLServerNotUpgradableError,
250+
MySQLServiceNotRunningError,
251+
ChangeError,
252+
):
253+
# Failed to recover unit
254+
if (
255+
not self._check_server_unsupported_downgrade()
256+
or self.charm.app.planned_units() == 1
257+
):
258+
# don't try to recover single unit cluster or errors other than downgrade
259+
logger.error("Unit failed to rejoin the cluster after upgrade")
260+
self.set_unit_failed()
261+
return
262+
logger.warning("Downgrade is incompatible. Resetting workload")
263+
self._reset_on_unsupported_downgrade(container)
264+
self._complete_upgrade()
242265

243266
def _recover_multi_unit_cluster(self) -> None:
244267
logger.debug("Recovering unit")
@@ -265,6 +288,11 @@ def _recover_single_unit_cluster(self) -> None:
265288
def _complete_upgrade(self):
266289
# complete upgrade for the unit
267290
logger.debug("Upgraded unit is healthy. Set upgrade state to `completed`")
291+
try:
292+
self.charm.unit.set_workload_version(self.charm._mysql.get_mysql_version() or "unset")
293+
except MySQLGetMySQLVersionError:
294+
# don't fail on this, just log it
295+
logger.warning("Failed to get MySQL version")
268296
self.set_unit_completed()
269297
if self.charm.unit_label == f"{self.charm.app.name}/1":
270298
# penultimate unit, reset the primary for faster switchover
@@ -301,3 +329,27 @@ def _check_server_upgradeability(self) -> None:
301329
instance = self.charm._get_unit_fqdn(f"{self.charm.app.name}/0")
302330
self.charm._mysql.verify_server_upgradable(instance=instance)
303331
logger.debug("MySQL server is upgradeable")
332+
333+
def _check_server_unsupported_downgrade(self) -> bool:
334+
"""Check error log for unsupported downgrade.
335+
336+
https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html
337+
"""
338+
if log_content := self.charm._mysql.fetch_error_log():
339+
return "MY-013171" in log_content
340+
341+
return False
342+
343+
def _reset_on_unsupported_downgrade(self, container: Container) -> None:
344+
"""Reset the cluster on unsupported downgrade."""
345+
container.stop(MYSQLD_SAFE_SERVICE)
346+
self.charm._mysql.reset_data_dir()
347+
self.charm._write_mysqld_configuration()
348+
self.charm._configure_instance(container)
349+
# reset flags
350+
self.charm.unit_peer_data.update({"member-role": "secondary", "member-state": "waiting"})
351+
# rescan is needed to remove the instance old incarnation from the cluster
352+
leader = self.charm._get_primary_from_online_peer()
353+
self.charm._mysql.rescan_cluster(from_instance=leader, remove_instances=True)
354+
# rejoin after
355+
self.charm.join_unit_to_cluster()

0 commit comments

Comments
 (0)