Skip to content

Commit 2987b4b

Browse files
script to read in and concatenate Henry's TIMS csvs
1 parent a26d2d7 commit 2987b4b

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed

safety_projects/update_tims_2021.py

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Aggregate updated TIMS data through 2021
2+
## Based on .csv data received from Henry
3+
4+
import os
5+
# 10**12 bytes, i.e. 1 TB (decimal). Presumably a BigQuery scan limit read by
# the calitp tooling — TODO confirm.
# NOTE(review): this is assigned before the calitp_data_analysis imports below;
# keep that order in case the value is read at import time.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000)
6+
7+
import pandas as pd
8+
import geopandas as gpd
9+
from siuba import *
10+
11+
import shared_utils
12+
13+
from gcsfs import GCSFileSystem
14+
from calitp_data_analysis import utils
15+
from calitp_data_analysis import get_fs
16+
# Filesystem handle obtained from calitp_data_analysis.get_fs().
# NOTE(review): `fs` is not referenced anywhere else in the visible code; all
# GCS access goes through `gcs` — confirm whether both handles are needed.
fs = get_fs()
17+
18+
# Base GCS folder for this analysis. Input CSVs are read from its
# "TIMS_by_county" subfolder and the combined output is exported here.
# The trailing slash matters: tims_combine() appends the subfolder name directly.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/safety_projects/"
19+
20+
# Initialize GCSFileSystem
21+
# Module-level GCS client; tims_combine() uses it to list and open the CSVs.
gcs = GCSFileSystem()
22+
23+
def tims_combine():
    """Combine all county-level TIMS CSVs on GCS into a single GeoDataFrame.

    Lists the ``TIMS_by_county`` folder under ``GCS_FILE_PATH``, reads every
    ``.csv`` file with pandas, concatenates the rows, casts object columns to
    ``str``, and builds point geometry from the ``POINT_X`` / ``POINT_Y``
    columns.

    Returns:
        geopandas.GeoDataFrame: every row from every CSV plus a ``geometry``
        column. No CRS is assigned.

    Raises:
        ValueError: if the folder contains no ``.csv`` files.
    """
    tims_folder = f'{GCS_FILE_PATH}TIMS_by_county'

    # Sort so the row order of the combined frame is reproducible between runs.
    csv_files = sorted(
        path for path in gcs.ls(tims_folder) if path.endswith('.csv')
    )
    if not csv_files:
        # pd.concat would otherwise fail with a generic
        # "No objects to concatenate" message that hides the real cause.
        raise ValueError(f"No CSV files found in {tims_folder}")

    # Read each CSV into its own DataFrame.
    dfs = []
    for path in csv_files:
        with gcs.open(path, 'rb') as f:
            dfs.append(pd.read_csv(f))

    # Stack all counties into one frame with a fresh 0..n-1 index.
    combined_df = pd.concat(dfs, ignore_index=True)

    # Cast object columns to str so each column holds a single type.
    # The original code had a commented-out step that dropped the JURIS column
    # because of a conversion error; presumably this cast is the replacement
    # workaround — TODO confirm.
    # NOTE(review): depending on the pandas version, astype(str) can turn
    # missing values into the literal string "nan" — confirm that is
    # acceptable downstream.
    object_columns = combined_df.select_dtypes(include=['object']).columns
    combined_df[object_columns] = combined_df[object_columns].astype(str)

    # Build point geometry with POINT_X as x and POINT_Y as y.
    # NOTE(review): no CRS is set on the result. If these columns are
    # longitude/latitude, pass crs="EPSG:4326" here — confirm with the data.
    gdf = gpd.GeoDataFrame(
        combined_df,
        geometry=gpd.points_from_xy(combined_df['POINT_X'], combined_df['POINT_Y']),
    )

    return gdf
50+
51+
if __name__ == "__main__":
    # Build the combined TIMS GeoDataFrame from the county CSVs on GCS.
    combined_tims = tims_combine()

    # Print the column/dtype summary to the terminal as a quick sanity check.
    combined_tims.info()

    # Export the GeoDataFrame to the analysis folder on GCS as GeoParquet.
    utils.geoparquet_gcs_export(combined_tims, GCS_FILE_PATH, "TIMS_Data_to_2021")

0 commit comments

Comments
 (0)