1
+ # Aggregate updated TIMS data through 2021
2
+ ## Based on .csv data received from Henry
3
+
4
import os

# 10**12 bytes, i.e. 1 TB in decimal units.
# NOTE(review): presumably this is the calitp BigQuery scan limit and must be
# set before the project imports below read it -- confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(10**12)

import pandas as pd
import geopandas as gpd
from siuba import *

import shared_utils

from gcsfs import GCSFileSystem
from calitp_data_analysis import utils
from calitp_data_analysis import get_fs

# Filesystem handle provided by the calitp helper.
fs = get_fs()

# Project folder in GCS; the trailing slash lets callers append names directly.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/safety_projects/"

# Separate gcsfs handle, used by tims_combine() to list and open the CSV files.
gcs = GCSFileSystem()
22
+
23
def tims_combine():
    """Combine the per-county TIMS CSV extracts in GCS into one GeoDataFrame.

    Lists every ``.csv`` object under ``{GCS_FILE_PATH}TIMS_by_county`` with the
    module-level ``gcs`` filesystem, reads each file with pandas and
    concatenates them under a fresh index. Object-dtype columns are cast to
    strings (missing values stay missing) and point geometry is built from the
    ``POINT_X`` / ``POINT_Y`` columns.

    Returns:
        geopandas.GeoDataFrame: every CSV row plus a ``geometry`` column.

    Raises:
        ValueError: if the folder contains no ``.csv`` files.
    """
    folder = f"{GCS_FILE_PATH}TIMS_by_county"
    csv_paths = [path for path in gcs.ls(folder) if path.endswith(".csv")]
    if not csv_paths:
        # pd.concat([]) would raise an opaque "No objects to concatenate"
        # ValueError; raise the same exception type with the folder named.
        raise ValueError(f"No .csv files found under {folder}")

    # Read each county file into its own DataFrame.
    frames = []
    for path in csv_paths:
        with gcs.open(path, "rb") as handle:
            frames.append(pd.read_csv(handle))

    combined_df = pd.concat(frames, ignore_index=True)

    # Cast object columns to str so each column holds a single value type.
    # A plain astype(str) would also turn missing values into the literal
    # string "nan", so the original nulls are restored afterwards.
    object_columns = combined_df.select_dtypes(include=["object"]).columns
    if len(object_columns) > 0:
        raw_values = combined_df[object_columns]
        combined_df[object_columns] = raw_values.astype(str).where(raw_values.notna())

    # Build point geometry from the POINT_X (x) and POINT_Y (y) columns.
    # NOTE(review): no CRS is assigned here; presumably these are WGS84
    # longitude/latitude -- confirm and set the CRS before any spatial work.
    gdf = gpd.GeoDataFrame(
        combined_df,
        geometry=gpd.points_from_xy(combined_df["POINT_X"], combined_df["POINT_Y"]),
    )

    # NOTE(review): an earlier version dropped the JURIS column to work around
    # a conversion error; presumably the string cast above makes that
    # unnecessary -- confirm.
    return gdf
50
+
51
if __name__ == "__main__":
    # Build the combined TIMS point layer from the per-county CSV files.
    tims_gdf = tims_combine()

    # Print the column / dtype summary to the terminal as a quick check.
    tims_gdf.info()

    # Write the result to the project folder in GCS as a GeoParquet file.
    export_name = "TIMS_Data_to_2021"
    utils.geoparquet_gcs_export(tims_gdf, GCS_FILE_PATH, export_name)
0 commit comments