Skip to content

Commit

Permalink
Fix data filter bug
Browse files — browse the repository at this point in the history
  • Loading branch information
martimpassos committed Apr 12, 2024
1 parent 841bd54 commit 98253bb
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 10 deletions.
12 changes: 5 additions & 7 deletions imaginerio-etl/scripts/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,22 @@


def main():
# Compare data, overwrite current data file if there are changes and return those changes
metadata = get_metadata_changes(CURRENT_JSTOR, NEW_JSTOR)
# Filter changes in published items
metadata = metadata.loc[metadata["Status"] == "In imagineRio"]
# Compare data, overwrite current data file if there are changes
all_data, changed_data = get_metadata_changes(CURRENT_JSTOR, NEW_JSTOR)

# Update viewcones if any
if any(file for file in os.listdir(KMLS_IN) if file != ".gitkeep"):
viewcones_info = viewcones.update()
viewcones_info = viewcones.update(all_data)
else:
logger.info("No KMLs to process, skipping")
viewcones_info = None

# Update manifests if published items data has changed
if metadata.empty:
if not changed_data:
logger.info("No metadata changes detected, exiting")
manifest_info = None
else:
manifest_info = iiif.update_manifests(metadata)
manifest_info = iiif.update(changed_data)

if viewcones_info or manifest_info:
summary = summarize(viewcones_info, manifest_info)
Expand Down
8 changes: 5 additions & 3 deletions imaginerio-etl/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,18 @@ def get_metadata_changes(current_file, download_dir):
new_file = os.path.join(download_dir, os.listdir(download_dir)[0])
new_data = load_xls(new_file, "SSID")
filtered_new_data = new_data.drop(columns=["Notes"]).loc[
filtered_new_data["Status"] == "In imagineRio"
new_data["Status"] == "In imagineRio"
]

# Compare files and get changed rows
comparison = filtered_new_data.compare(current_data, keep_shape=True)
changes = comparison.notna().any(axis=1)
changed_data = filtered_new_data[changes]
changed_data = (
filtered_new_data[changes] if not filtered_new_data[changes].empty else None
)

# Replace current with new filtered data
if not changed_data.empty:
if changed_data:
new_file = filtered_new_data.to_excel(current_file, engine="openpyxl")

return new_data, changed_data
Expand Down

0 comments on commit 98253bb

Please sign in to comment.