Fix archiver bug ignoring deletions when comparing two files with no missing columns #1522

Status: Merged (3 commits, Mar 2, 2022)
24 changes: 12 additions & 12 deletions _delphi_utils_python/delphi_utils/archive.py
@@ -49,6 +49,10 @@
 Files = List[str]
 FileDiffMap = Dict[str, Optional[str]]
 
+EXPORT_CSV_DTYPES = {
+    "geo_id": str, "val": float, "se": float, "sample_size": float,
+    "missing_val": "Int64", "missing_se": "Int64", "missing_sample_size": "Int64"
+}
 
 def diff_export_csv(
     before_csv: str,
@@ -75,15 +79,10 @@ def diff_export_csv(
     changed_df is the pd.DataFrame of common rows from after_csv with changed values.
     added_df is the pd.DataFrame of added rows from after_csv.
     """
-    export_csv_dtypes = {
-        "geo_id": str, "val": float, "se": float, "sample_size": float,
-        "missing_val": int, "missing_se": int, "missing_sample_size": int
-    }
-
-    before_df = pd.read_csv(before_csv, dtype=export_csv_dtypes)
+    before_df = pd.read_csv(before_csv, dtype=EXPORT_CSV_DTYPES)
     before_df.set_index("geo_id", inplace=True)
     before_df = before_df.round({"val": 7, "se": 7})
-    after_df = pd.read_csv(after_csv, dtype=export_csv_dtypes)
+    after_df = pd.read_csv(after_csv, dtype=EXPORT_CSV_DTYPES)
     after_df.set_index("geo_id", inplace=True)
     after_df = after_df.round({"val": 7, "se": 7})
     deleted_idx = before_df.index.difference(after_df.index)
@@ -93,20 +92,21 @@
     before_df_cmn = before_df.reindex(common_idx)
     after_df_cmn = after_df.reindex(common_idx)
 
-    # If CSVs have different columns (no missingness), mark all values as new
-    if ("missing_val" in before_df_cmn.columns) ^ ("missing_val" in after_df_cmn.columns):
+    # If new CSV has missingness columns, but old doesn't, mark all values as new
+    if ("missing_val" not in before_df_cmn.columns) & ("missing_val" in after_df_cmn.columns):
         same_mask = after_df_cmn.copy()
         same_mask.loc[:] = False
     else:
         # Exact comparisons, treating NA == NA as True
         same_mask = before_df_cmn == after_df_cmn
         same_mask |= pd.isna(before_df_cmn) & pd.isna(after_df_cmn)
 
-    # Code deleted entries as nans with the deleted missing code
+    # Any deleted entries become rows with nans and the deleted missing code
     deleted_df = before_df.loc[deleted_idx, :].copy()
     deleted_df[["val", "se", "sample_size"]] = np.nan
-    if "missing_val" in after_df_cmn.columns:
-        deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
+    # If the new file doesn't have missing columns, then when the deleted, changed, and added
+    # rows are concatenated (in diff_exports), they will default to NA
+    deleted_df[["missing_val", "missing_se", "missing_sample_size"]] = Nans.DELETED
 
     return (
         deleted_df,
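
A side note on the dtype change above: the old local dict mapped the missing_* columns to plain int, while the new module-level EXPORT_CSV_DTYPES uses pandas' nullable "Int64" extension dtype. The sketch below is not part of the PR (the sample CSV is made up) and only illustrates why that matters when a missingness cell is empty:

```python
import io
import pandas as pd

csv_text = "geo_id,val,se,sample_size,missing_val\n01000,1.5,0.1,100,\n"

# Plain ``int`` cannot represent the empty missing_val cell, so read_csv raises.
try:
    pd.read_csv(io.StringIO(csv_text), dtype={"missing_val": int})
except (ValueError, TypeError) as err:
    print("int dtype failed:", err)

# The nullable "Int64" extension dtype stores pd.NA and keeps the column integer-typed.
df = pd.read_csv(io.StringIO(csv_text), dtype={"missing_val": "Int64"})
print(df["missing_val"].dtype)    # Int64
print(df["missing_val"].iloc[0])  # <NA>
```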
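
For context on the behavior being fixed, here is a minimal usage sketch. The file names and values are invented, and the tuple order (deleted, changed, added) is assumed from the docstring and the return statement shown above. With this patch, a row that disappears between two exports is reported in deleted_df even when neither CSV carries missing_* columns:

```python
import pandas as pd
from delphi_utils.archive import diff_export_csv

# Two exports without missing_* columns; geo_id "01001" is dropped in the newer one.
before = pd.DataFrame({
    "geo_id": ["01000", "01001"],
    "val": [1.0, 2.0],
    "se": [0.1, 0.2],
    "sample_size": [100.0, 200.0],
})
before.to_csv("before.csv", index=False)
before.iloc[[0]].to_csv("after.csv", index=False)

deleted_df, changed_df, added_df = diff_export_csv("before.csv", "after.csv")
# deleted_df contains the "01001" row with val/se/sample_size set to NaN
# and missing_val/missing_se/missing_sample_size coded as Nans.DELETED.
print(deleted_df)
```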