import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession

from childHealth.config import ProjectConfig

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()
dbutils = DBUtils(spark)

# Define original paths
dirname_train_ts = "/Volumes/mlops_students/javedhassi/data/series_train.parquet"
dirname_test_ts = "/Volumes/mlops_students/javedhassi/data/series_test.parquet"

# Load project configuration from YAML file
config = ProjectConfig.from_yaml(config_path="../../project_config.yml")
num_features = config.num_features
cat_features = config.cat_features
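
# Hypothetical sketch of the project_config.yml shape (illustrative entries only; the real
# file defines the project's own feature lists consumed as num_features / cat_features):
# num_features:
#   - <numerical column name>
# cat_features:
#   - <categorical column name>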


def process_file(filename, dirname):
    # Read a single Hive-style partition (".../id=<participant>/part-0.parquet") and drop the 'step' column
    filepath = os.path.join(dirname, filename, "part-0.parquet")
    df = spark.read.parquet(filepath).drop("step")
    # Ensure an 'id' column is present; fall back to an existing column if it is missing
    if "id" not in df.columns:
        df = df.withColumn("id", df["relative_date_PCIAT"])
    # Return the pandas frame together with the identifier parsed from the "id=<value>" directory name
    return df.toPandas(), filename.split("=")[1]
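
# Hypothetical sanity check (not part of the original pipeline): processing the sample
# partition inspected at the end of this script should return its pandas frame plus the
# participant id parsed from the folder name.
# sample_df, sample_id = process_file("id=00115b9f", dirname_train_ts)
# print(sample_id)  # expected: "00115b9f"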


def load_time_series(dirname) -> pd.DataFrame:
    # List all partition directories in the specified volume path
    directories = [file.path for file in dbutils.fs.ls(dirname) if file.path.endswith("/")]
    results = []
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_file, path.split("/")[-2], dirname): path for path in directories}
        for i, future in enumerate(futures):
            result = future.result()
            results.append(result)
            print(f"Processed {i + 1}/{len(directories)} files")
    # Separate the per-file frames from their identifiers
    stats, indexes = zip(*results, strict=False) if results else ([], [])
    # Combine the frames and attach the identifier parsed from each partition name
    combined_df = pd.concat([df for df in stats], ignore_index=True)
    combined_df["id"] = indexes
    return combined_df
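
# Note: iterating over the `futures` dict yields futures in submission order, and .result()
# blocks on each one, so the progress log follows submission order. A possible variant
# (an assumption about intent, not the original code) that logs in completion order instead:
# from concurrent.futures import as_completed
# for i, future in enumerate(as_completed(futures)):
#     results.append(future.result())
#     print(f"Processed {i + 1}/{len(directories)} files")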


def update(df):
    # Fill missing categorical values with a "Missing" label and cast to pandas 'category' dtype
    for c in cat_features:
        df[c] = df[c].fillna("Missing").astype("category")
    return df
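
# What update() does to each categorical column, shown in isolation (hypothetical values):
# s = pd.Series(["male", None, "female"]).fillna("Missing").astype("category")
# s.cat.categories  # -> Index(['Missing', 'female', 'male'], dtype='object')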


# Load time series data
train_ts = load_time_series(dirname_train_ts)
test_ts = load_time_series(dirname_test_ts)

# Load train and test CSV files with Spark
train = spark.read.csv("/Volumes/mlops_students/javedhassi/data/childHealth.csv", header=True, inferSchema=True)
test = spark.read.csv("/Volumes/mlops_students/javedhassi/data/test.csv", header=True, inferSchema=True)

# Convert Spark DataFrames to pandas
train_pd = train.toPandas()
test_pd = test.toPandas()

# Ensure 'id' column exists in both DataFrames
train_pd["id"] = train_pd.get("id", train_pd.index)
test_pd["id"] = test_pd.get("id", test_pd.index)

# Merge the data
train_merged = pd.merge(train_pd, train_ts, how="left", on="id")
test_merged = pd.merge(test_pd, test_ts, how="left", on="id")
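
# Note: with how="left", rows whose 'id' has no matching time series keep NaN in the merged
# columns; update() below fills only the categorical ones with the "Missing" label.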

# Update the list of numerical features to include time series columns
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")  # keep 'id' out of the numerical features; it is only a join key
num_features += time_series_cols

# Update the train and test DataFrames
train_merged = update(train_merged)
test_merged = update(test_merged)

# Check the updated DataFrames
print(train_merged.head())
print(test_merged.head())

# Read and show one Parquet partition (Parquet files carry their own schema)
df = spark.read.parquet(
    "/Volumes/mlops_students/javedhassi/data/series_train.parquet/id=00115b9f/part-0.parquet"
)
df.show()

# Convert to Pandas DataFrame
df_pandas = df.toPandas()

# Filter and show specific train data
train.filter(train.id == "00115b9f").show()