Commit

fixed week1
javedhassans committed Nov 19, 2024
1 parent fd41ed9 commit 6d3a178
Showing 3 changed files with 26 additions and 67 deletions.
8 changes: 4 additions & 4 deletions notebooks/week1/00.dataexploration.ipynb
@@ -390,11 +390,11 @@
"metadata": {},
"outputs": [],
"source": [
"# Merge the aggregated actigraphy features with the train data\n",
"combined_df = pd.merge(train_df, aggregated_actigraphy_df, on=\"id\", how=\"left\")\n",
"# # Merge the aggregated actigraphy features with the train data\n",
"# combined_df = pd.merge(train_df, aggregated_actigraphy_df, on=\"id\", how=\"left\")\n",
"\n",
"# Inspect the combined DataFrame\n",
"print(combined_df.head())"
"# # Inspect the combined DataFrame\n",
"# print(combined_df.head())"
]
},
{
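For reference, the merge that this notebook cell comments out follows the usual pandas left-join pattern. A minimal sketch with toy data (the real train_df and aggregated_actigraphy_df are built in earlier cells not shown in this diff; the column names below are assumptions):

import pandas as pd

# Hypothetical stand-ins for the notebook's train_df and aggregated_actigraphy_df.
train_df = pd.DataFrame({"id": ["00115b9f", "0016bb22"], "age": [10, 12]})
aggregated_actigraphy_df = pd.DataFrame({"id": ["00115b9f"], "enmo_mean": [0.031]})

# A left merge keeps every training row; ids without actigraphy get NaN features.
combined_df = pd.merge(train_df, aggregated_actigraphy_df, on="id", how="left")
print(combined_df.head())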
73 changes: 16 additions & 57 deletions notebooks/week1/01.dataExploraton.py
@@ -14,6 +14,7 @@
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession

from childHealth.config import ProjectConfig
@@ -22,60 +23,51 @@

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)

# Define original paths
dirname_train_ts = "/Volumes/mlops_students/javedhassi/data/series_train.parquet"
dirname_test_ts = "/Volumes/mlops_students/javedhassi/data/series_test.parquet"

# COMMAND ----------

# Load project configuration from YAML file
config = ProjectConfig.from_yaml(config_path="../../project_config.yml")
num_features = config.num_features
cat_features = config.cat_features


# COMMAND ----------
def process_file(filename, dirname):
    filepath = os.path.join(dirname, filename, "part-0.parquet")
    df = spark.read.parquet(filepath)
    df = df.drop("step")
    # Ensure 'id' column is included
    df = spark.read.parquet(filepath).drop("step")
    if "id" not in df.columns:
        df = df.withColumn("id", df["relative_date_PCIAT"])  # Use an existing column or create a new one
        df = df.withColumn("id", df["relative_date_PCIAT"])
    return df.toPandas(), filename.split("=")[1]


def load_time_series(dirname) -> pd.DataFrame:
    # List all subdirectories in the specified path
    directories = [file.path for file in dbutils.fs.ls(dirname) if file.path.endswith("/")]

    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_file, path.split("/")[-2], dirname): path for path in directories}
        for i, future in enumerate(futures):
            result = future.result()
            results.append(result)
            print(f"Processed {i + 1}/{len(directories)} files")

    # Separate stats and identifiers
    stats, indexes = zip(*results, strict=False) if results else ([], [])

    # Create DataFrame with statistics and identifiers
    combined_df = pd.concat([df for df in stats], ignore_index=True)
    combined_df["id"] = indexes

    return combined_df


# COMMAND ----------
def update(df):
    for c in cat_features:
        df[c] = df[c].fillna("Missing").astype("category")
    return df


# Load time series data
train_ts = load_time_series(dirname_train_ts)
test_ts = load_time_series(dirname_test_ts)

# COMMAND ----------

# Load train and test CSV files with Spark
train = spark.read.csv("/Volumes/mlops_students/javedhassi/data/childHealth.csv", header=True, inferSchema=True)
test = spark.read.csv("/Volumes/mlops_students/javedhassi/data/test.csv", header=True, inferSchema=True)
@@ -85,69 +77,36 @@ def load_time_series(dirname) -> pd.DataFrame:
test_pd = test.toPandas()

# Ensure 'id' column exists in both DataFrames
if "id" not in train_pd.columns:
train_pd["id"] = train_pd.index
if "id" not in test_pd.columns:
test_pd["id"] = test_pd.index

# COMMAND ----------
train_pd["id"] = train_pd.get("id", train_pd.index)
test_pd["id"] = test_pd.get("id", test_pd.index)

# Merge the data
train_merged = pd.merge(train_pd, train_ts, how="left", on="id")
test_merged = pd.merge(test_pd, test_ts, how="left", on="id")

# Check the result
print(train_merged.head())
print(test_merged.head())

# COMMAND ----------

# Update the list of numerical features to include time series columns
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id") # Temporarily remove 'id' column from the list of time series columns
time_series_cols.remove("id")
num_features += time_series_cols

# COMMAND ----------


def update(df):
    for c in cat_features:
        df[c] = df[c].fillna("Missing")
        df[c] = df[c].astype("category")
    return df


# COMMAND ----------

# Update the train and test DataFrames
train_merged = update(train_merged)
test_merged = update(test_merged)

# COMMAND ----------

# # Include 'id' column back in the numerical features if needed
# num_features.append('id')

# Check the updated DataFrames
print(train_merged.head())
print(test_merged.head())


# COMMAND ----------
# Read the Parquet file
# Read and show the Parquet file
df = spark.read.parquet(
    "/Volumes/mlops_students/javedhassi/data/series_train.parquet/id=00115b9f/part-0.parquet",
    header=True,
    inferSchema=True,
)

# Show the DataFrame
df.show()

# COMMAND ----------

# Convert to Pandas DataFrame
df_pandas = df.toPandas()
# COMMAND ----------
train = spark.read.csv("/Volumes/mlops_students/javedhassi/data/childHealth.csv", header=True, inferSchema=True)

# Filter and show specific train data
train.filter(train.id == "00115b9f").show()
# COMMAND ----------
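Outside Databricks, the same per-id parquet layout can be loaded with plain pandas and a thread pool. A minimal local-filesystem sketch of the pattern used by process_file/load_time_series above (the directory path, the "id=<hash>" folder naming, and the helper names are assumptions; unlike the script, it tags each frame with its id before concatenating):

import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def process_file_local(subdir: str, dirname: str) -> tuple[pd.DataFrame, str]:
    # Each subdirectory is assumed to be named like "id=00115b9f" and to hold one part file.
    filepath = os.path.join(dirname, subdir, "part-0.parquet")
    df = pd.read_parquet(filepath).drop(columns=["step"], errors="ignore")
    return df, subdir.split("=")[1]


def load_time_series_local(dirname: str) -> pd.DataFrame:
    subdirs = [d for d in os.listdir(dirname) if d.startswith("id=")]
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda d: process_file_local(d, dirname), subdirs))
    frames = []
    for df, series_id in results:
        df["id"] = series_id  # tag every row of this series with its identifier
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)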
12 changes: 6 additions & 6 deletions src/childHealth/config.py
@@ -7,8 +7,8 @@
class ProjectConfig(BaseModel):
    catalog_name: str
    schema_name: str
    random_forest_parameters: Dict[str, Any]  # Dictionary to hold Random Forest parameters
    lgb_parameters: Dict[str, Any]  # Dictionary to hold LightGBM parameters
    random_forest_parameters: Dict[str, Any]
    lgb_parameters: Dict[str, Any]
    num_features: List[str]
    cat_features: List[str]
    target: str
@@ -20,12 +20,12 @@ def from_yaml(cls, config_path: str) -> "ProjectConfig":
            with open(config_path, "r") as f:
                config_dict = yaml.safe_load(f)
            return cls(**config_dict)
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found: {config_path}")
        except FileNotFoundError as e:
            raise FileNotFoundError(f"Configuration file not found: {config_path}") from e
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file: {e}")
            raise ValueError(f"Error parsing YAML file: {e}") from e
        except ValidationError as e:
            raise ValueError(f"Validation error: {e}")
            raise ValueError(f"Validation error: {e}") from e


# Example usage:
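The config.py change switches to explicit exception chaining (raise ... from e), so callers can still see the underlying YAML or pydantic error. A minimal usage sketch (the call site is hypothetical; the config path mirrors the one used in 01.dataExploraton.py):

from childHealth.config import ProjectConfig

try:
    config = ProjectConfig.from_yaml(config_path="../../project_config.yml")
    print(config.num_features, config.cat_features, config.target)
except (FileNotFoundError, ValueError) as err:
    # Because from_yaml re-raises with "from e", the original YAML or
    # validation error is preserved on __cause__ for debugging.
    print(f"Could not load config: {err!r}; caused by {err.__cause__!r}")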
