diff --git a/offsets_db_data/apx.py b/offsets_db_data/apx.py index 1fe6a78..ad3eea8 100644 --- a/offsets_db_data/apx.py +++ b/offsets_db_data/apx.py @@ -89,7 +89,9 @@ def process_apx_credits( if download_type == 'issuances': data = data.aggregate_issuance_transactions() - data = data.validate(schema=credit_without_id_schema) + data = data.add_missing_columns(schema=credit_without_id_schema).validate( + schema=credit_without_id_schema + ) if arb is not None and not arb.empty: data = data.merge_with_arb(arb=arb) return data diff --git a/offsets_db_data/arb.py b/offsets_db_data/arb.py index acf2a5a..d60b257 100644 --- a/offsets_db_data/arb.py +++ b/offsets_db_data/arb.py @@ -132,8 +132,10 @@ def process_arb(df: pd.DataFrame) -> pd.DataFrame: data['registry'] = data.project_id.apply(_get_registry) data['vintage'] = data['vintage'].astype(int) - data = data.convert_to_datetime(columns=['transaction_date']).validate( - schema=credit_without_id_schema + data = ( + data.add_missing_columns(schema=credit_without_id_schema) + .convert_to_datetime(columns=['transaction_date']) + .validate(schema=credit_without_id_schema) ) return data diff --git a/offsets_db_data/common.py b/offsets_db_data/common.py index 2334bc2..01e03b6 100644 --- a/offsets_db_data/common.py +++ b/offsets_db_data/common.py @@ -141,7 +141,7 @@ def add_missing_columns(df: pd.DataFrame, *, schema: pa.DataFrameSchema) -> pd.D for column, value in schema.columns.items(): dtype = value.dtype.type if column not in df.columns: - default_value = default_values.get(dtype, None) + default_value = default_values.get(dtype) df[column] = pd.Series([default_value] * len(df), index=df.index, dtype=dtype) return df diff --git a/offsets_db_data/configs/credits-raw-columns-mapping.json b/offsets_db_data/configs/credits-raw-columns-mapping.json index 5476e25..1b1c98c 100644 --- a/offsets_db_data/configs/credits-raw-columns-mapping.json +++ b/offsets_db_data/configs/credits-raw-columns-mapping.json @@ -1,82 +1,130 @@ { "american-carbon-registry": { "cancellations": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "Project ID", "quantity": "Quantity of Credits", + "reason": null, "transaction_date": "Status Effective (GMT)", "vintage": "Vintage" }, "issuances": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "Project ID", "quantity": "Total Credits Issued", + "reason": null, "transaction_date": "Date Issued (GMT)", "vintage": "Vintage" }, "retirements": { + "account": "Account Holder", + "beneficiary": null, + "note": "Purpose of Retirement", "project_id": "Project ID", "quantity": "Quantity of Credits", + "reason": "Retirement Reason", "transaction_date": "Status Effective (GMT)", "vintage": "Vintage" } }, "art-trees": { "cancellations": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "Program ID", "quantity": "Quantity of Credits", + "reason": null, "transaction_date": "Status Effective", "vintage": "Vintage" }, "issuances": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "Program ID", "quantity": "Credits Verified", + "reason": null, "transaction_date": "Date Approved", "vintage": "Vintage" }, "retirements": { + "account": "Account Holder", + "beneficiary": null, + "note": "Retirement Reason Details", "project_id": "Program ID", "quantity": "Quantity of Credits", + "reason": "Retirement Reason", "transaction_date": "Status Effective", "vintage": "Vintage" } }, "climate-action-reserve": { "cancellations": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "Project ID", "quantity": "Quantity of Offset Credits", + "reason": null, "transaction_date": "Status Effective", "vintage": "Vintage" }, "issuances": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "Project ID", "quantity": "Total Offset Credits Issued", + "reason": null, "transaction_date": "Date Issued", "vintage": "Vintage" }, "retirements": { + "account": "Account Holder", + "beneficiary": null, + "note": "Retirement Reason Details", "project_id": "Project ID", "quantity": "Quantity of Offset Credits", + "reason": "Retirement Reason", "transaction_date": "Status Effective", "vintage": "Vintage" } }, "gold-standard": { "issuances": { + "account": null, + "beneficiary": null, + "note": null, "project_id": "GSID", "quantity": "Quantity", + "reason": null, "transaction_date": "Issuance Date", "vintage": "Vintage" }, "retirements": { + "account": null, + "beneficiary": "Using Entity", + "note": "Note", "project_id": "GSID", "quantity": "Quantity", + "reason": null, "transaction_date": "Retirement Date", "vintage": "Vintage" } }, "verra": { "transactions": { + "account": null, + "beneficiary": "Retirement Beneficiary", + "note": "Retirement Details", "project_id": null, "quantity": null, + "reason": "Retirement Reason", "transaction_date": null, "vintage": null } diff --git a/offsets_db_data/gld.py b/offsets_db_data/gld.py index 7bfdeaf..ef6faf8 100644 --- a/offsets_db_data/gld.py +++ b/offsets_db_data/gld.py @@ -126,7 +126,9 @@ def process_gld_credits( if download_type == 'issuances': data = data.aggregate_issuance_transactions() - data = data.validate(schema=credit_without_id_schema) + data = data.add_missing_columns(schema=credit_without_id_schema).validate( + schema=credit_without_id_schema + ) if arb is not None and not arb.empty: data = data.merge_with_arb(arb=arb) @@ -136,6 +138,7 @@ def process_gld_credits( pd.DataFrame(columns=credit_without_id_schema.columns.keys()) .add_missing_columns(schema=credit_without_id_schema) .convert_to_datetime(columns=['transaction_date'], format='%Y-%m-%d') + .add_missing_columns(schema=credit_without_id_schema) .validate(schema=credit_without_id_schema) ) diff --git a/offsets_db_data/models.py b/offsets_db_data/models.py index 470288b..5874847 100644 --- a/offsets_db_data/models.py +++ b/offsets_db_data/models.py @@ -48,6 +48,10 @@ 'vintage': pa.Column(pa.Int, nullable=True, coerce=True), 'transaction_date': pa.Column(pd.DatetimeTZDtype(tz='UTC'), nullable=True), 'transaction_type': pa.Column(pa.String, nullable=True), + 'account': pa.Column(pa.String, nullable=True), + 'reason': pa.Column(pa.String, nullable=True), + 'note': pa.Column(pa.String, nullable=True), + 'beneficiary': pa.Column(pa.String, nullable=True), } ) diff --git a/offsets_db_data/vcs.py b/offsets_db_data/vcs.py index c6a0975..24bc75d 100644 --- a/offsets_db_data/vcs.py +++ b/offsets_db_data/vcs.py @@ -225,6 +225,7 @@ def process_vcs_credits( data = ( pd.concat([issuances, retirements]) .reset_index(drop=True) + .add_missing_columns(schema=credit_without_id_schema) .validate(schema=credit_without_id_schema) ) diff --git a/tests/test_integration.py b/tests/test_integration.py index 2516027..cabd01f 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -10,7 +10,7 @@ @pytest.fixture def date() -> str: - return '2024-05-03' + return '2024-08-27' @pytest.fixture @@ -21,7 +21,7 @@ def bucket() -> str: @pytest.fixture def arb() -> pd.DataFrame: data = pd.read_excel( - 's3://carbonplan-offsets-db/raw/2024-05-03/arb/nc-arboc_issuance.xlsx', sheet_name=3 + 's3://carbonplan-offsets-db/raw/2024-08-27/arb/nc-arboc_issuance.xlsx', sheet_name=3 ) return data.process_arb() @@ -103,10 +103,10 @@ def test_gld( pd.concat( [ pd.read_csv( - 's3://carbonplan-offsets-db/raw/2024-05-03/gold-standard/issuances.csv.gz' + 's3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/issuances.csv.gz' ).process_gld_credits(download_type='issuances'), pd.read_csv( - 's3://carbonplan-offsets-db/raw/2024-05-03/gold-standard/retirements.csv.gz' + 's3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/retirements.csv.gz' ).process_gld_credits(download_type='retirements'), ] ), @@ -116,7 +116,7 @@ def test_gld( 'projects', [ pd.DataFrame(), - pd.read_csv('s3://carbonplan-offsets-db/raw/2024-05-03/gold-standard/projects.csv.gz'), + pd.read_csv('s3://carbonplan-offsets-db/raw/2024-08-27/gold-standard/projects.csv.gz'), ], ) def test_gld_empty(df_credits, projects):