Transaction-Data-Cleaning/clean_transactions.py at main · rajab-bett-analytics/Transaction-Data-Cleaning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pandas as pd
import numpy as np

def clean_transactions(df):
    # --- 1. Convert types first ---
    df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')
    df['Price Per Unit'] = pd.to_numeric(df['Price Per Unit'], errors='coerce')
    df['Total Spent'] = pd.to_numeric(df['Total Spent'], errors='coerce')
    df['Transaction Date'] = pd.to_datetime(df['Transaction Date'], errors='coerce')  # keep NaT if invalid

    # --- 2. Clean string columns ---
    df['Item'] = df['Item'].astype(str).str.strip().str.title()
    df['Payment Method'] = df['Payment Method'].astype(str).str.strip().str.title()
    df['Location'] = df['Location'].astype(str).str.strip().str.title()

    # --- 3. Replace invalid strings ---
    df['Item'] = df['Item'].replace(['Nan', 'None', '', 'Unknown', 'Error'], 'Unknown Item')
    df['Payment Method'] = df['Payment Method'].replace(['Nan', 'None', '', 'Unknown', 'Error'], 'Other')
    df['Location'] = df['Location'].replace(['Nan', 'None', '', 'Unknown', 'Error'], 'Other')

    # --- 4. Smart Total Spent ---
    mask_valid = (~df['Quantity'].isna()) & (~df['Price Per Unit'].isna())
    df.loc[mask_valid, 'Total Spent'] = df.loc[mask_valid, 'Quantity'] * df.loc[mask_valid, 'Price Per Unit']

    # --- 5. Replace missing numeric values with median ---
    numeric_cols = ['Quantity', 'Price Per Unit', 'Total Spent']
    for col in numeric_cols:
        # Compute median only from positive numbers
        median_value = df.loc[df[col] > 0, col].median(skipna=True)
        median_value = round(median_value, 2)  # match notebook rounding
        df[col] = df[col].fillna(median_value)
        df[col] = df[col].round(2)  # round all numeric columns

    # --- 6. Transaction Date ---
    # Missing dates stay as NaT/pd.NA (no replacement)

    # --- 7. Reset index ---
    df.reset_index(drop=True, inplace=True)

    return df