-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_transactions.py
More file actions
40 lines (32 loc) · 1.84 KB
/
clean_transactions.py
File metadata and controls
40 lines (32 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pandas as pd
import numpy as np
def clean_transactions(df):
# --- 1. Convert types first ---
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')
df['Price Per Unit'] = pd.to_numeric(df['Price Per Unit'], errors='coerce')
df['Total Spent'] = pd.to_numeric(df['Total Spent'], errors='coerce')
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'], errors='coerce') # keep NaT if invalid
# --- 2. Clean string columns ---
df['Item'] = df['Item'].astype(str).str.strip().str.title()
df['Payment Method'] = df['Payment Method'].astype(str).str.strip().str.title()
df['Location'] = df['Location'].astype(str).str.strip().str.title()
# --- 3. Replace invalid strings ---
df['Item'] = df['Item'].replace(['Nan', 'None', '', 'Unknown', 'Error'], 'Unknown Item')
df['Payment Method'] = df['Payment Method'].replace(['Nan', 'None', '', 'Unknown', 'Error'], 'Other')
df['Location'] = df['Location'].replace(['Nan', 'None', '', 'Unknown', 'Error'], 'Other')
# --- 4. Smart Total Spent ---
mask_valid = (~df['Quantity'].isna()) & (~df['Price Per Unit'].isna())
df.loc[mask_valid, 'Total Spent'] = df.loc[mask_valid, 'Quantity'] * df.loc[mask_valid, 'Price Per Unit']
# --- 5. Replace missing numeric values with median ---
numeric_cols = ['Quantity', 'Price Per Unit', 'Total Spent']
for col in numeric_cols:
# Compute median only from positive numbers
median_value = df.loc[df[col] > 0, col].median(skipna=True)
median_value = round(median_value, 2) # match notebook rounding
df[col] = df[col].fillna(median_value)
df[col] = df[col].round(2) # round all numeric columns
# --- 6. Transaction Date ---
# Missing dates stay as NaT/pd.NA (no replacement)
# --- 7. Reset index ---
df.reset_index(drop=True, inplace=True)
return df