4
4
5
5
# Load the datasets (assuming they are in the same directory as the script)
6
6
OS_PATH = os .path .dirname (os .path .realpath ('__file__' ))
7
- weather_sensors_df = os .path .join (OS_PATH , 'data/pems-bay/sensors/pems_bay_sensors_weather.csv' )
8
- traffic_sensors_df = os .path .join (OS_PATH , 'data/pems-bay/sensors/pems_bay_sensors_traffic.csv' )
9
7
8
+ weather_sensors_df = pd .read_csv (os .path .join (OS_PATH , 'data/metr-la/sensors/metr_la_sensors_weather.csv' ))
9
+ traffic_sensors_df = pd .read_csv (os .path .join (OS_PATH , 'data/metr-la/sensors/metr_la_sensors_traffic.csv' ))
10
10
11
- traffic_speed_df = os .path .join (OS_PATH , 'data/pems-bay/traffic/speed.csv' )
12
- air_temp_df = os .path .join (OS_PATH , 'data/pems-bay/weather/air_temp_set_1_fahrenheit.csv' )
13
-
11
+ traffic_speed_df = pd .read_csv (os .path .join (OS_PATH , 'data/metr-la/traffic/speed.csv' ))
12
+ air_temp_df = pd .read_csv (os .path .join (OS_PATH , 'data/metr-la/weather/air_temp_set_1_fahrenheit.csv' ))
14
13
15
14
# Haversine formula to calculate the distance between two geographical points
16
15
def haversine (lat1 , lon1 , lat2 , lon2 ):
@@ -35,32 +34,51 @@ def find_nearest_weather_sensor(traffic_lat, traffic_lon, weather_df):
35
34
# Dictionary mapping of traffic sensor to its nearest weather sensor
36
35
sensor_to_weather_mapping = dict (zip (traffic_sensors_df ['detid' ], traffic_sensors_df ['nearest_weather_sensor' ]))
37
36
38
- # Merge weather data with traffic data in chunks
39
- chunk_size = 5000
37
+ # Function to merge data based on timestamps
38
+ def merge_data_on_timestamps (traffic_speed_df , air_temp_df , sensor_to_weather_mapping ):
39
+ # Initialize merged dataframe with DATETIMESTAMP column
40
+ merged_df = pd .DataFrame ()
41
+ merged_df ["DATETIMESTAMP" ] = traffic_speed_df ["DATETIMESTAMP" ]
42
+
43
+ # Iterate through each sensor in the traffic_speed_df
44
+ for sensor in traffic_speed_df .columns [1 :]:
45
+ # Copy the speed data
46
+ merged_df [sensor ] = traffic_speed_df [sensor ]
47
+
48
+ # Find corresponding weather sensor
49
+ weather_sensor = sensor_to_weather_mapping [int (sensor )]
50
+
51
+ # Find corresponding temperature data
52
+ if f"{ weather_sensor } " in air_temp_df .columns :
53
+ merged_df [f"{ sensor } _temp" ] = air_temp_df [f"{ weather_sensor } " ].reindex_like (traffic_speed_df )
54
+
55
+ return merged_df
40
56
41
- def merge_weather_with_traffic ( traffic_data , weather_data , sensor_to_weather_mapping ):
42
- merged_data = traffic_data [[ 'DATETIMESTAMP' ]]. copy ()
43
- for sensor in traffic_data . columns :
44
- if sensor != "DATETIMESTAMP" :
45
- nearest_weather_sensor = sensor_to_weather_mapping . get ( sensor , None )
46
- if nearest_weather_sensor :
47
- weather_chunk = weather_data [[ 'Date_Time' , nearest_weather_sensor ]]. rename ( columns = { nearest_weather_sensor : sensor })
48
- merged_data = merged_data . merge ( weather_chunk , left_on = "DATETIMESTAMP" , right_on = "Date_Time" , how = "left" ). drop ( columns = "Date_Time" )
49
- return merged_data
57
+ # Reorder columns function
58
+ def reorder_columns ( merged_df ):
59
+ # Create a list for the reordered columns
60
+ speed_cols = [ col for col in merged_df . columns if '_temp' not in col ]
61
+ temp_cols = [ col for col in merged_df . columns if '_temp' in col ]
62
+ # Reorder the columns
63
+ final_order = speed_cols + temp_cols
64
+ merged_df = merged_df [ final_order ]
65
+ return merged_df
50
66
51
- # Example of merging air temperature data with traffic data in chunks
52
- merged_chunks = []
53
- for start_row in range (0 , traffic_speed_df .shape [0 ], chunk_size ):
54
- end_row = start_row + chunk_size
55
- traffic_chunk = traffic_speed_df .iloc [start_row :end_row ]
56
- merged_chunk = merge_weather_with_traffic (traffic_chunk , air_temp_df , sensor_to_weather_mapping )
57
- merged_chunks .append (merged_chunk )
67
+ # Rename speed columns with "_speed" suffix
68
+ def rename_speed_columns (merged_df ):
69
+ speed_cols = [col for col in merged_df .columns if '_temp' not in col and col != "DATETIMESTAMP" ]
70
+ speed_columns_renamed = {col : f"{ col } _speed" for col in speed_cols }
71
+ merged_df .rename (columns = speed_columns_renamed , inplace = True )
72
+ return merged_df
58
73
74
+ # Merge, reorder, rename and save the dataframe
75
+ merged_df = merge_data_on_timestamps (traffic_speed_df , air_temp_df , sensor_to_weather_mapping )
76
+ merged_df = reorder_columns (merged_df )
77
+ merged_df = rename_speed_columns (merged_df )
59
78
60
- final_merged_df = pd .concat (merged_chunks , axis = 0 )
61
79
62
- # Define output path and save the final merged dataframe
63
- output_folder = os .path .join (OS_PATH , 'output' )
80
+ # Save the merged dataframe to a CSV file
81
+ output_folder = os .path .join (OS_PATH , 'output/metr-la ' )
64
82
os .makedirs (output_folder , exist_ok = True ) # Create output folder if it doesn't exist
65
- output_file_path = os .path .join (output_folder , 'merged_traffic_weather_data .csv' )
66
- final_merged_df .to_csv (output_file_path , index = False )
83
+ merged_file_path = os .path .join (output_folder , 'merged_speed_traffic_and_air_temperature_data .csv' )
84
+ merged_df .to_csv (merged_file_path , index = False )
0 commit comments