-
Notifications
You must be signed in to change notification settings - Fork 286
Watcher skip on producer failures #2430
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 62 commits
1395bf4
95b06cd
ff3a1ba
4f59350
f0d097b
0802ef8
b328a7c
ef55c48
ad3d307
75b33bd
01e310a
9267fa5
b546146
454b071
d471db4
b696707
273464b
4fcb25b
e4be216
fc3e923
405c15e
c5c8682
ee0f516
6ec11a2
e85b0f0
c3a6bed
cb77108
06ce6b2
c783cb3
40d7dfc
6fbf25c
66161f9
001d69e
6765bfa
41e6d80
01cfee3
170505b
106acb8
43de64a
6574f65
f2e773d
ea7dc72
5a52371
31a7a9d
d0d375f
975e2a1
255b08c
92e9e3d
bccbb33
2f90883
2134a6d
466d472
6d70d11
543a615
f1c4711
30856a0
7fa2b29
2306178
e5540c9
fe3404d
93c7ab4
758d53f
df54719
a099ea0
f84cae6
9914d9c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,12 +6,15 @@ | |
| from typing import Any | ||
|
|
||
| try: # Airflow 3 | ||
| from airflow.providers.standard.operators.empty import EmptyOperator | ||
| from airflow.sdk.bases.operator import BaseOperator | ||
| except ImportError: # Airflow 2 | ||
| from airflow.models import BaseOperator | ||
| from airflow.operators.empty import EmptyOperator # type: ignore[no-redef] | ||
|
|
||
| from airflow.models.base import ID_LEN as AIRFLOW_MAX_ID_LENGTH | ||
| from airflow.models.dag import DAG | ||
| from airflow.utils.trigger_rule import TriggerRule | ||
|
|
||
| try: | ||
| # Airflow 3.1 onwards | ||
|
|
@@ -679,7 +682,7 @@ def _add_watcher_producer_task( | |
| render_config: RenderConfig | None = None, | ||
| execution_mode: ExecutionMode = ExecutionMode.WATCHER, | ||
| tests_per_model: dict[str, list[str]] | None = None, | ||
| ) -> BaseOperator: | ||
| ) -> tuple[BaseOperator, EmptyOperator]: | ||
| """ | ||
| Create the producer task for the watcher execution mode and add it to the tasks_map. | ||
| The producer task is the task that will be used to produce the events for the watcher execution mode. | ||
|
|
@@ -711,12 +714,22 @@ def _add_watcher_producer_task( | |
| ) | ||
| producer_airflow_task = create_airflow_task(producer_task_metadata, dag, task_group=task_group) | ||
| tasks_map[PRODUCER_WATCHER_TASK_ID] = producer_airflow_task | ||
| return producer_airflow_task | ||
|
|
||
| producer_task_gate = EmptyOperator( # type: ignore[no-untyped-call] | ||
| task_id=f"{PRODUCER_WATCHER_TASK_ID}_gate", | ||
| dag=dag, | ||
| task_group=task_group, | ||
| trigger_rule=TriggerRule.NONE_FAILED, | ||
| depends_on_past=producer_airflow_task.depends_on_past, | ||
| ) | ||
| producer_airflow_task >> producer_task_gate | ||
| return producer_airflow_task, producer_task_gate | ||
|
|
||
|
|
||
| def _add_watcher_dependencies( | ||
| dag: DAG, | ||
| producer_airflow_task: BaseOperator, | ||
| producer_gate: BaseOperator, | ||
| task_args: dict[str, Any], | ||
| tasks_map: dict[str, Any], | ||
| nodes: dict[str, DbtNode] | None = None, | ||
|
|
@@ -728,7 +741,7 @@ def _add_watcher_dependencies( | |
| """ | ||
| for node_id, task_or_taskgroup in tasks_map.items(): | ||
| # We do not want to set a dependency between the producer task and itself | ||
| if node_id == PRODUCER_WATCHER_TASK_ID: | ||
| if node_id == PRODUCER_WATCHER_TASK_ID or node_id == f"{PRODUCER_WATCHER_TASK_ID}_gate": | ||
| continue | ||
|
|
||
| node_tasks = ( | ||
|
|
@@ -758,6 +771,10 @@ def _add_watcher_dependencies( | |
| for task in always_run_tasks: | ||
| task.trigger_rule = task_args.get("trigger_rule", "always") # type: ignore[attr-defined] | ||
|
johnhoran marked this conversation as resolved.
|
||
|
|
||
| # If depends_on_past isn't true then gating all the tasks isn't really needed. | ||
| if producer_airflow_task.wait_for_downstream and not task_or_taskgroup.downstream_task_ids: | ||
| task_or_taskgroup >> producer_gate | ||
|
Comment on lines
+774
to
+776
|
||
|
|
||
|
johnhoran marked this conversation as resolved.
|
||
|
|
||
| def should_create_detached_nodes(render_config: RenderConfig) -> bool: | ||
| """ | ||
|
|
@@ -908,7 +925,7 @@ def build_airflow_graph( # noqa: C901 TODO: https://github.com/astronomer/astro | |
| task_groups: dict[str, TaskGroup] = {} | ||
| task_or_group: TaskGroup | BaseOperator | None | ||
| parent_task_group = task_group | ||
| producer_task: BaseOperator | None = None | ||
| producer_tasks: tuple[BaseOperator, EmptyOperator] | None = None | ||
|
|
||
| # Identify test nodes that should be run detached from the associated dbt resource nodes because they | ||
| # have multiple parents | ||
|
|
@@ -926,7 +943,7 @@ def build_airflow_graph( # noqa: C901 TODO: https://github.com/astronomer/astro | |
| # We are intentionally creating the producer task ahead of the consumer tasks | ||
| # Airflow priority weight is not being respected in multiple versions of the library, including 3.1 | ||
| # To instantiate the producer before helps having it before on the DAG topological order and scheduling this task before the consumer tasks | ||
| producer_task = _add_watcher_producer_task( | ||
| producer_tasks = _add_watcher_producer_task( | ||
| dag=dag, | ||
| task_args={**task_args, **setup_operator_args}, | ||
| tasks_map=tasks_map, | ||
|
|
@@ -988,6 +1005,9 @@ def build_airflow_graph( # noqa: C901 TODO: https://github.com/astronomer/astro | |
| leaves_ids = calculate_leaves(tasks_ids=list(tasks_map.keys()), nodes=nodes) | ||
| for leaf_node_id in leaves_ids: | ||
| tasks_map[leaf_node_id] >> test_task | ||
| if producer_tasks and producer_tasks[0].depends_on_past: | ||
| test_task >> producer_tasks[1] | ||
| test_task.wait_for_downstream = True | ||
| elif render_config.test_behavior in (TestBehavior.BUILD, TestBehavior.AFTER_EACH): | ||
| # Handle detached test nodes | ||
| for node_id, node in detached_nodes.items(): | ||
|
|
@@ -1012,10 +1032,11 @@ def build_airflow_graph( # noqa: C901 TODO: https://github.com/astronomer/astro | |
|
|
||
| create_airflow_task_dependencies(nodes, tasks_map) | ||
|
|
||
| if producer_task: | ||
| if producer_tasks: | ||
| _add_watcher_dependencies( | ||
| dag=dag, | ||
| producer_airflow_task=producer_task, | ||
| producer_airflow_task=producer_tasks[0], | ||
| producer_gate=producer_tasks[1], | ||
| task_args=task_args, | ||
| tasks_map=tasks_map, | ||
| nodes=nodes, | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -288,6 +288,9 @@ def __init__( | |||||
| self.deferrable = deferrable | ||||||
| self.model_unique_id = extra_context.get("dbt_node_config", {}).get("unique_id") | ||||||
|
|
||||||
| if self.depends_on_past: | ||||||
| self.wait_for_downstream = True | ||||||
|
|
||||||
| @staticmethod | ||||||
| def _filter_flags(flags: list[str]) -> list[str]: | ||||||
| """Filters out dbt flags that are incompatible with retry (e.g., --select, --exclude).""" | ||||||
|
|
@@ -502,11 +505,10 @@ def poke(self, context: Context) -> bool: | |||||
| _log_dbt_event(dbt_events) | ||||||
|
|
||||||
| if status is None: | ||||||
|
|
||||||
| if producer_task_state == "failed": | ||||||
| if producer_task_state == "failed" or producer_task_state == "skipped": | ||||||
| if self.poke_retry_number > 0: | ||||||
| raise AirflowException( | ||||||
| f"The dbt build command failed in producer task. Please check the log of task {self.producer_task_id} for details." | ||||||
| f"The dbt build command {producer_task_state} in the producer task. Please check the log of task {self.producer_task_id} for details." | ||||||
|
johnhoran marked this conversation as resolved.
johnhoran marked this conversation as resolved.
|
||||||
| f"The dbt build command {producer_task_state} in the producer task. Please check the log of task {self.producer_task_id} for details." | |
| f"The dbt build command was {producer_task_state} in the producer task. Please check the log of task {self.producer_task_id} for details." |
Copilot
AI
Mar 5, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This logic now treats a skipped producer as a failure signal. There should be test coverage for the producer_task_state == "skipped" branch (including the two paths for poke_retry_number > 0 vs == 0) to ensure sensors reliably fall back to non-watcher execution rather than looping until timeout.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,6 @@ | |
|
|
||
|
|
||
| class WatcherTrigger(BaseTrigger): | ||
|
|
||
| def __init__( | ||
| self, | ||
| model_unique_id: str, | ||
|
|
@@ -213,10 +212,11 @@ async def run(self) -> AsyncIterator[TriggerEvent]: | |
| event_data["compiled_sql"] = compiled_sql | ||
| yield TriggerEvent(event_data) # type: ignore[no-untyped-call] | ||
| return | ||
| elif producer_task_state == "failed": | ||
| elif producer_task_state == "failed" or producer_task_state == "skipped": | ||
| logger.error( | ||
| "Watcher producer task '%s' failed before delivering results for node '%s'", | ||
| "Watcher producer task '%s' %s before delivering results for node '%s'", | ||
| self.producer_task_id, | ||
| producer_task_state, | ||
| self.model_unique_id, | ||
| ) | ||
|
johnhoran marked this conversation as resolved.
Comment on lines
240
to
245
|
||
| yield TriggerEvent({"status": EventStatus.FAILED, "reason": "producer_failed"}) # type: ignore[no-untyped-call] | ||
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,7 @@ | |
| from datetime import timedelta | ||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| from airflow.exceptions import AirflowException | ||
| from airflow.exceptions import AirflowException, AirflowSkipException | ||
|
|
||
| from cosmos.config import ProfileConfig | ||
| from cosmos.operators._watcher import _parse_compressed_xcom, safe_xcom_push | ||
|
|
@@ -120,6 +120,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: | |
| if self.invocation_mode == InvocationMode.SUBPROCESS: | ||
| self.log_format = "json" | ||
|
|
||
| if self.depends_on_past: | ||
| self.wait_for_downstream = True | ||
|
|
||
|
Comment on lines
+109
to
+111
|
||
| @staticmethod | ||
| def _serialize_event(event_message: EventMsg) -> dict[str, Any]: | ||
| """Convert structured dbt EventMsg to plain dict.""" | ||
|
|
@@ -187,12 +190,10 @@ def execute(self, context: Context, **kwargs: Any) -> Any: | |
| try_number = getattr(task_instance, "try_number", 1) | ||
|
|
||
| if try_number > 1: | ||
| self.log.info( | ||
| "Dbt WATCHER producer task does not support Airflow retries. " | ||
| "Detected attempt #%s; skipping execution to avoid running a second dbt build.", | ||
| try_number, | ||
| raise AirflowSkipException( | ||
| "DbtProducerWatcherOperator does not support Airflow retries. " | ||
| f"Detected attempt #{try_number}; skipping execution to avoid running a second dbt build." | ||
| ) | ||
|
johnhoran marked this conversation as resolved.
|
||
| return None | ||
|
|
||
| self.log.info( | ||
| "Dbt WATCHER producer task forces Airflow retries to 0 so the dbt build only runs once; " | ||
|
|
@@ -238,9 +239,10 @@ def _callback(event_message: EventMsg) -> None: | |
| safe_xcom_push(task_instance=context["ti"], key="task_status", value="completed") | ||
| return return_value | ||
|
|
||
| except Exception: | ||
| except Exception as e: | ||
| safe_xcom_push(task_instance=context["ti"], key="task_status", value="completed") | ||
| raise | ||
| self.log.exception("DbtProducerWatcherOperator execution failed") | ||
| raise AirflowSkipException("Skipping execution due to task failure") from e | ||
|
johnhoran marked this conversation as resolved.
johnhoran marked this conversation as resolved.
Comment on lines
+223
to
+226
|
||
|
|
||
|
|
||
| class DbtConsumerWatcherSensor(BaseConsumerSensor, DbtRunLocalOperator): # type: ignore[misc] | ||
|
|
@@ -352,6 +354,8 @@ class DbtTestWatcherOperator(EmptyOperator): | |
| """ | ||
|
|
||
| def __init__(self, *args: Any, **kwargs: Any): | ||
| default_args = kwargs.get("default_args", {}) | ||
| desired_keys = ("dag", "task_group", "task_id") | ||
| new_kwargs = {key: value for key, value in kwargs.items() if key in desired_keys} | ||
| super().__init__(**new_kwargs) # type: ignore[no-untyped-call] | ||
| depends_on_past = kwargs.get("depends_on_past", False) or default_args.get("depends_on_past", False) | ||
| super().__init__(depends_on_past=depends_on_past, wait_for_downstream=depends_on_past, **new_kwargs) # type: ignore[no-untyped-call] | ||
|
johnhoran marked this conversation as resolved.
Outdated
|
||
Uh oh!
There was an error while loading. Please reload this page.