
Commit 6d3a178

fixed week1
1 parent fd41ed9 commit 6d3a178

File tree

3 files changed: +26 -67 lines changed


notebooks/week1/00.dataexploration.ipynb

Lines changed: 4 additions & 4 deletions
@@ -390,11 +390,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Merge the aggregated actigraphy features with the train data\n",
-    "combined_df = pd.merge(train_df, aggregated_actigraphy_df, on=\"id\", how=\"left\")\n",
+    "# # Merge the aggregated actigraphy features with the train data\n",
+    "# combined_df = pd.merge(train_df, aggregated_actigraphy_df, on=\"id\", how=\"left\")\n",
     "\n",
-    "# Inspect the combined DataFrame\n",
-    "print(combined_df.head())"
+    "# # Inspect the combined DataFrame\n",
+    "# print(combined_df.head())"
     ]
   },
   {
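Note: the notebook cell disabled above performed a left merge of per-participant actigraphy aggregates into the tabular training data. A minimal, self-contained sketch of that pattern (the two DataFrames below are toy stand-ins, not the project's data):

    import pandas as pd

    # Toy stand-ins for the notebook's train_df and aggregated_actigraphy_df.
    train_df = pd.DataFrame({"id": ["00115b9f", "0000aaaa"], "age": [10, 12]})
    aggregated_actigraphy_df = pd.DataFrame({"id": ["00115b9f"], "feature_mean": [0.03]})

    # A left merge keeps every training row and attaches actigraphy features where the id matches.
    combined_df = pd.merge(train_df, aggregated_actigraphy_df, on="id", how="left")
    print(combined_df.head())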

notebooks/week1/01.dataExploraton.py

Lines changed: 16 additions & 57 deletions
@@ -14,6 +14,7 @@
 from concurrent.futures import ThreadPoolExecutor
 
 import pandas as pd
+from pyspark.dbutils import DBUtils
 from pyspark.sql import SparkSession
 
 from childHealth.config import ProjectConfig
@@ -22,60 +23,51 @@
 
 # Initialize Spark session
 spark = SparkSession.builder.getOrCreate()
+dbutils = DBUtils(spark)
 
 # Define original paths
 dirname_train_ts = "/Volumes/mlops_students/javedhassi/data/series_train.parquet"
 dirname_test_ts = "/Volumes/mlops_students/javedhassi/data/series_test.parquet"
 
-# COMMAND ----------
-
 # Load project configuration from YAML file
 config = ProjectConfig.from_yaml(config_path="../../project_config.yml")
 num_features = config.num_features
 cat_features = config.cat_features
 
 
-# COMMAND ----------
 def process_file(filename, dirname):
     filepath = os.path.join(dirname, filename, "part-0.parquet")
-    df = spark.read.parquet(filepath)
-    df = df.drop("step")
-    # Ensure 'id' column is included
+    df = spark.read.parquet(filepath).drop("step")
     if "id" not in df.columns:
-        df = df.withColumn("id", df["relative_date_PCIAT"])  # Use an existing column or create a new one
+        df = df.withColumn("id", df["relative_date_PCIAT"])
     return df.toPandas(), filename.split("=")[1]
 
 
 def load_time_series(dirname) -> pd.DataFrame:
-    # List all subdirectories in the specified path
     directories = [file.path for file in dbutils.fs.ls(dirname) if file.path.endswith("/")]
-
     results = []
     with ThreadPoolExecutor() as executor:
         futures = {executor.submit(process_file, path.split("/")[-2], dirname): path for path in directories}
         for i, future in enumerate(futures):
             result = future.result()
             results.append(result)
             print(f"Processed {i + 1}/{len(directories)} files")
-
-    # Separate stats and identifiers
     stats, indexes = zip(*results, strict=False) if results else ([], [])
-
-    # Create DataFrame with statistics and identifiers
     combined_df = pd.concat([df for df in stats], ignore_index=True)
     combined_df["id"] = indexes
-
     return combined_df
 
 
-# COMMAND ----------
+def update(df):
+    for c in cat_features:
+        df[c] = df[c].fillna("Missing").astype("category")
+    return df
+
 
 # Load time series data
 train_ts = load_time_series(dirname_train_ts)
 test_ts = load_time_series(dirname_test_ts)
 
-# COMMAND ----------
-
 # Load train and test CSV files with Spark
 train = spark.read.csv("/Volumes/mlops_students/javedhassi/data/childHealth.csv", header=True, inferSchema=True)
 test = spark.read.csv("/Volumes/mlops_students/javedhassi/data/test.csv", header=True, inferSchema=True)
@@ -85,69 +77,36 @@ def load_time_series(dirname) -> pd.DataFrame:
 test_pd = test.toPandas()
 
 # Ensure 'id' column exists in both DataFrames
-if "id" not in train_pd.columns:
-    train_pd["id"] = train_pd.index
-if "id" not in test_pd.columns:
-    test_pd["id"] = test_pd.index
-
-# COMMAND ----------
+train_pd["id"] = train_pd.get("id", train_pd.index)
+test_pd["id"] = test_pd.get("id", test_pd.index)
 
 # Merge the data
 train_merged = pd.merge(train_pd, train_ts, how="left", on="id")
 test_merged = pd.merge(test_pd, test_ts, how="left", on="id")
 
-# Check the result
-print(train_merged.head())
-print(test_merged.head())
-
-# COMMAND ----------
-
 # Update the list of numerical features to include time series columns
 time_series_cols = train_ts.columns.tolist()
-time_series_cols.remove("id")  # Temporarily remove 'id' column from the list of time series columns
+time_series_cols.remove("id")
 num_features += time_series_cols
 
-# COMMAND ----------
-
-
-def update(df):
-    for c in cat_features:
-        df[c] = df[c].fillna("Missing")
-        df[c] = df[c].astype("category")
-    return df
-
-
-# COMMAND ----------
-
 # Update the train and test DataFrames
 train_merged = update(train_merged)
 test_merged = update(test_merged)
 
-# COMMAND ----------
-
-# # Include 'id' column back in the numerical features if needed
-# num_features.append('id')
-
 # Check the updated DataFrames
 print(train_merged.head())
 print(test_merged.head())
 
-
-# COMMAND ----------
-# Read the Parquet file
+# Read and show the Parquet file
 df = spark.read.parquet(
     "/Volumes/mlops_students/javedhassi/data/series_train.parquet/id=00115b9f/part-0.parquet",
     header=True,
     inferSchema=True,
 )
-
-# Show the DataFrame
 df.show()
 
-# COMMAND ----------
-
+# Convert to Pandas DataFrame
 df_pandas = df.toPandas()
-# COMMAND ----------
-train = spark.read.csv("/Volumes/mlops_students/javedhassi/data/childHealth.csv", header=True, inferSchema=True)
+
+# Filter and show specific train data
 train.filter(train.id == "00115b9f").show()
-# COMMAND ----------
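Two details of the rewritten script are worth noting. First, `dbutils` is no longer assumed to exist implicitly: it is constructed from the active Spark session via `DBUtils(spark)`, which is the usual way to make `dbutils.fs.ls` available when the file runs as a plain Python module rather than inside a Databricks notebook cell. Second, the `update()` helper now chains `fillna` and `astype` in one statement; the behaviour matches the two-step version it replaces. A minimal standalone sketch of that helper, using a toy DataFrame and a stand-in `cat_features` list rather than the project's config:

    import pandas as pd

    cat_features = ["sex"]  # stand-in for config.cat_features

    def update(df):
        # Fill missing categorical values with a sentinel, then convert to pandas' category dtype.
        for c in cat_features:
            df[c] = df[c].fillna("Missing").astype("category")
        return df

    df = update(pd.DataFrame({"sex": ["M", None, "F"]}))
    print(df["sex"].cat.categories)  # Index(['F', 'M', 'Missing'], dtype='object')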

src/childHealth/config.py

Lines changed: 6 additions & 6 deletions
@@ -7,8 +7,8 @@
 class ProjectConfig(BaseModel):
     catalog_name: str
     schema_name: str
-    random_forest_parameters: Dict[str, Any]  # Dictionary to hold Random Forest parameters
-    lgb_parameters: Dict[str, Any]  # Dictionary to hold LightGBM parameters
+    random_forest_parameters: Dict[str, Any]
+    lgb_parameters: Dict[str, Any]
     num_features: List[str]
     cat_features: List[str]
     target: str
@@ -20,12 +20,12 @@ def from_yaml(cls, config_path: str) -> "ProjectConfig":
             with open(config_path, "r") as f:
                 config_dict = yaml.safe_load(f)
             return cls(**config_dict)
-        except FileNotFoundError:
-            raise FileNotFoundError(f"Configuration file not found: {config_path}")
+        except FileNotFoundError as e:
+            raise FileNotFoundError(f"Configuration file not found: {config_path}") from e
         except yaml.YAMLError as e:
-            raise ValueError(f"Error parsing YAML file: {e}")
+            raise ValueError(f"Error parsing YAML file: {e}") from e
         except ValidationError as e:
-            raise ValueError(f"Validation error: {e}")
+            raise ValueError(f"Validation error: {e}") from e
 
 
 # Example usage:
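The only functional change here is exception chaining: each re-raise now uses `raise ... from e`, so the original exception is kept as `__cause__` and appears in the traceback alongside the config-loading message (this is also the pattern linters such as Ruff's B904 rule ask for). A generic sketch of the idea, not tied to `ProjectConfig` (the `load_config_dict` helper is illustrative):

    import yaml

    def load_config_dict(path: str) -> dict:
        try:
            with open(path, "r") as f:
                return yaml.safe_load(f)
        except FileNotFoundError as e:
            # `from e` attaches the original exception as __cause__ instead of discarding it.
            raise FileNotFoundError(f"Configuration file not found: {path}") from e
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file: {e}") from e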
