You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
It seems DuckDB makes it hard to rename a table that has a primary key / index on it:
sqlalchemy.exc.DBAPIError: (duckdb.duckdb.Error) Cannot alter entry "task_2_out__copy" because there are entries that depend on it.
[SQL: ALTER TABLE stage_1__even.task_2_out__copy RENAME TO task_2_out]
from __future__ import annotations

import logging
import tempfile

import pandas as pd
import sqlalchemy as sa

from pydiverse.pipedag import Flow, Stage, Table, materialize, input_stage_versions
from pydiverse.pipedag.context import StageLockContext
from pydiverse.pipedag.core.config import create_basic_pipedag_config
from pydiverse.pipedag.util.structlog import setup_logging


@input_stage_versions(input_type=pd.DataFrame)
def validate_stage1(
    tbls: dict[str, pd.DataFrame], other_tbls: dict[str, pd.DataFrame]
):
    """Validate the stage's tables and report columns missing versus the other version.

    Args:
        tbls: Mapping of table name to DataFrame for one version of the stage.
        other_tbls: Mapping of table name to DataFrame for the other version
            of the stage (presumably the previously materialized one --
            confirm against the ``input_stage_versions`` documentation).

    Returns:
        A ``Table`` named ``column_diffs`` with columns ``table``, ``column``
        and ``value``. It holds one row per column that exists in
        ``other_tbls[name]`` but not in ``tbls[name]``, for every table name
        present in both mappings; ``value`` is always ``"missing"``.

    Raises:
        AssertionError: If the expected example tables or values are absent.
    """
    # Any tests can be done on tables of both versions. They can either fail and prevent
    # the schema swap, print some information about differences, or produce a table
    # with results of validation.
    assert tbls["task_1_out"]["x"][0] == 1
    assert tbls["task_2_out"]["a"].sum() == 3
    # Exactly one table besides the three known ones is expected, and its
    # (lower-cased) name must start with "dfb_".
    extra_tables = {t.lower() for t in tbls} - {"task_1_out", "task_2_out", "dfa"}
    assert len(extra_tables) == 1
    assert next(iter(extra_tables)).startswith("dfb_")

    logger = logging.getLogger(f"{__name__}-validate_stage1")
    logger.info("Additional tables: %s", set(tbls.keys()) - set(other_tbls.keys()))
    logger.info(
        "Missing tables: %s",
        set(other_tbls.keys()) - set(tbls.keys()) - {"column_diffs"},
    )

    # Producing a table with differences of matching table names can be done a lot more
    # elaborate in a library. Here is just an idea to get started:
    def get_missing_columns(tbl: pd.DataFrame, other_tbl: pd.DataFrame):
        """Return the column names present in ``other_tbl`` but not in ``tbl``."""
        return set(other_tbl.columns) - set(tbl.columns)

    missing_columns = {
        tbl: get_missing_columns(tbls[tbl], other_tbls[tbl])
        for tbl in set(tbls.keys()) & set(other_tbls.keys())
    }
    # Seed with an empty frame so the result has the expected columns even when
    # no table names are shared between the two versions.
    col_diff_dfs = [pd.DataFrame(dict(table=[], column=[], value=[]))]
    for tbl, columns in missing_columns.items():
        col_diff_df = pd.DataFrame(
            dict(
                table=tbl,
                column=list(columns),
                value="missing",
            )
        )
        col_diff_dfs.append(col_diff_df)
    col_diff_df = pd.concat(col_diff_dfs, ignore_index=True, axis="rows")
    return Table(col_diff_df, name="column_diffs")
@materialize(lazy=True)
def lazy_task_1():
    """Return table ``task_1_out`` defined by a literal SELECT with x=1 and y=2."""
    return Table(
        sa.select(
            sa.literal(1).label("x"),
            sa.literal(2).label("y"),
        ),
        name="task_1_out",
    )
@materialize(lazy=True, input_type=sa.Table)
def lazy_task_2(input1: sa.sql.expression.Alias, input2: sa.sql.expression.Alias):
    """Left-join ``input1`` to ``input2`` on ``x`` and return table ``task_2_out``.

    Args:
        input1: Table expression exposing a column ``x``.
        input2: Table expression exposing columns ``x`` and ``a``.

    Returns:
        A ``Table`` named ``task_2_out`` with columns ``x5`` (``input1.x * 5``)
        and ``a`` (from ``input2``), declared with primary key ``a``.
    """
    query = sa.select(
        (input1.c.x * 5).label("x5"),
        input2.c.a,
    ).select_from(input1.outerjoin(input2, input2.c.x == input1.c.x))
    # NOTE(review): the primary key is kept on purpose -- it appears to be what
    # triggers the reported DuckDB "Cannot alter entry" error when renaming.
    return Table(query, name="task_2_out", primary_key=["a"])
@materialize(nout=2, version="1.0.0")
def eager_inputs():
    """Return two small in-memory example tables.

    Returns:
        A pair of ``Table`` objects: one wrapping ``dfA`` (columns ``a``, ``b``)
        named ``"dfA"``, and one wrapping ``dfB`` (columns ``a``, ``x``) named
        ``"dfB_%%"`` (presumably ``%%`` is expanded to a unique suffix by
        pipedag -- confirm in the library documentation).
    """
    dfA = pd.DataFrame(
        {
            "a": [0, 1, 2, 4],
            "b": [9, 8, 7, 6],
        }
    )
    dfB = pd.DataFrame(
        {
            "a": [2, 1, 0, 1],
            "x": [1, 1, 2, 2],
        }
    )
    return Table(dfA, "dfA"), Table(dfB, "dfB_%%")
def main():
    """Build the example flow on a temporary DuckDB file and run it twice.

    The second run happens inside a ``StageLockContext`` and logs the
    ``column_diffs`` table produced by ``validate_stage1``.

    Raises:
        AssertionError: If either flow run is not successful.
    """
    logger = logging.getLogger(__name__)
    with tempfile.TemporaryDirectory() as temp_dir:
        cfg = create_basic_pipedag_config(
            f"duckdb:///{temp_dir}/db.duckdb",
            disable_stage_locking=True,  # This is special for duckdb
            # Attention: If uncommented, stage and task names might be sent to the
            # following URL. You can self-host kroki if you like:
            # https://docs.kroki.io/kroki/setup/install/
            # kroki_url="https://kroki.io",
        ).get("default")
        with cfg:
            with Flow() as f:
                with Stage("stage_1"):
                    lazy_1 = lazy_task_1()
                    a, b = eager_inputs()
                    lazy_2 = lazy_task_2(lazy_1, b)
                    col_diff = validate_stage1()
                    _ = a, lazy_2  # unused terminal output tables

            # Run Flow a bit different first
            result = f.run()
            assert result.successful

            # Run Flow and print diff result
            with StageLockContext():
                result = f.run()
                assert result.successful
                logger.info(
                    "Column differences: %s",
                    result.get(col_diff, as_type=pd.DataFrame),
                )
# Script entry point: configure logging, then run the example flow.
if __name__ == "__main__":
    setup_logging()  # you can setup the logging and/or structlog libraries as you wish
    main()
The text was updated successfully, but these errors were encountered:
It seems duckdb makes it hard to rename a table with primary key / index on it:
The text was updated successfully, but these errors were encountered: