@@ -136,6 +136,7 @@ def test_athena_ctas(path, path2, path3, glue_table, glue_table2, glue_database,
     assert len(wr.s3.list_objects(path=path3)) == 0
 
 
+@pytest.mark.modin_index
 def test_athena_read_sql_ctas_bucketing(path, path2, glue_table, glue_table2, glue_database, glue_ctas_database):
     df = pd.DataFrame({"c0": [0, 1], "c1": ["foo", "bar"]})
     wr.s3.to_parquet(
@@ -155,12 +156,14 @@ def test_athena_read_sql_ctas_bucketing(path, path2, glue_table, glue_table2, gl
             bucketing_info=(["c0"], 1),
         ),
         s3_output=path2,
+        pyarrow_additional_kwargs={"ignore_metadata": True},
     )
     df_no_ctas = wr.athena.read_sql_query(
         sql=f"SELECT * FROM {glue_table}",
         ctas_approach=False,
         database=glue_database,
         s3_output=path2,
+        pyarrow_additional_kwargs={"ignore_metadata": True},
     )
 
     assert df_ctas.equals(df_no_ctas)
@@ -855,6 +858,7 @@ def test_bucketing_catalog_parquet_table(path, glue_database, glue_table):
     assert table["StorageDescriptor"]["BucketColumns"] == bucket_cols
 
 
+@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2], [False, True, False], ["b", "c", "d"]])
 @pytest.mark.parametrize(
     "dtype",
@@ -907,12 +911,12 @@ def test_bucketing_parquet_dataset(path, glue_database, glue_table, bucketing_da
     if isinstance(bucketing_data[0], str):
         dtype = pd.StringDtype()
 
-    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
+    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(first_bucket_df) == 2
     assert pandas_equals(pd.Series([bucketing_data[0], bucketing_data[2]], dtype=dtype), first_bucket_df["c0"])
     assert pandas_equals(pd.Series(["foo", "baz"], dtype=pd.StringDtype()), first_bucket_df["c1"])
 
-    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
+    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(second_bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[1]], dtype=dtype), second_bucket_df["c0"])
     assert pandas_equals(pd.Series(["bar"], dtype=pd.StringDtype()), second_bucket_df["c1"])
@@ -943,6 +947,7 @@ def test_bucketing_catalog_csv_table(path, glue_database, glue_table):
     assert table["StorageDescriptor"]["BucketColumns"] == bucket_cols
 
 
+@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2], [False, True, False], ["b", "c", "d"]])
 @pytest.mark.parametrize(
     "dtype",
@@ -988,12 +993,12 @@ def test_bucketing_csv_dataset(path, glue_database, glue_table, bucketing_data,
     assert r["paths"][0].endswith("bucket-00000.csv")
     assert r["paths"][1].endswith("bucket-00001.csv")
 
-    first_bucket_df = wr.s3.read_csv(path=[r["paths"][0]], header=None, names=["c0", "c1"])
+    first_bucket_df = wr.s3.read_csv(path=[r["paths"][0]], header=None, names=["c0", "c1"]).reset_index(drop=True)
     assert len(first_bucket_df) == 2
     assert pandas_equals(pd.Series([bucketing_data[0], bucketing_data[2]]), first_bucket_df["c0"])
     assert pandas_equals(pd.Series(["foo", "baz"]), first_bucket_df["c1"])
 
-    second_bucket_df = wr.s3.read_csv(path=[r["paths"][1]], header=None, names=["c0", "c1"])
+    second_bucket_df = wr.s3.read_csv(path=[r["paths"][1]], header=None, names=["c0", "c1"]).reset_index(drop=True)
     assert len(second_bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[1]]), second_bucket_df["c0"])
     assert pandas_equals(pd.Series(["bar"]), second_bucket_df["c1"])
@@ -1008,6 +1013,7 @@ def test_bucketing_csv_dataset(path, glue_database, glue_table, bucketing_data,
     assert all(x in bucketing_data for x in loaded_df["c0"].to_list())
 
 
+@pytest.mark.modin_index
 @pytest.mark.parametrize("bucketing_data", [[0, 1, 2, 3], [False, True, False, True], ["b", "c", "d", "e"]])
 def test_combined_bucketing_partitioning_parquet_dataset(path, glue_database, glue_table, bucketing_data):
     nb_of_buckets = 2
@@ -1045,22 +1051,22 @@ def test_combined_bucketing_partitioning_parquet_dataset(path, glue_database, gl
     if isinstance(bucketing_data[0], str):
         dtype = pd.StringDtype()
 
-    bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][0]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[0]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["foo"], dtype=pd.StringDtype()), bucket_df["c1"])
 
-    bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][1]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[1]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["bar"], dtype=pd.StringDtype()), bucket_df["c1"])
 
-    bucket_df = wr.s3.read_parquet(path=[r["paths"][2]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][2]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[2]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["baz"], dtype=pd.StringDtype()), bucket_df["c1"])
 
-    bucket_df = wr.s3.read_parquet(path=[r["paths"][3]])
+    bucket_df = wr.s3.read_parquet(path=[r["paths"][3]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(bucket_df) == 1
     assert pandas_equals(pd.Series([bucketing_data[3]], dtype=dtype), bucket_df["c0"])
     assert pandas_equals(pd.Series(["boo"], dtype=pd.StringDtype()), bucket_df["c1"])
@@ -1135,6 +1141,7 @@ def test_combined_bucketing_partitioning_csv_dataset(path, glue_database, glue_t
     assert all(x in bucketing_data for x in loaded_df["c0"].to_list())
 
 
+@pytest.mark.modin_index
 def test_multiple_bucketing_columns_parquet_dataset(path, glue_database, glue_table):
     nb_of_buckets = 2
     df = pd.DataFrame({"c0": [0, 1, 2, 3], "c1": [4, 6, 5, 7], "c2": ["foo", "bar", "baz", "boo"]})
@@ -1152,13 +1159,13 @@ def test_multiple_bucketing_columns_parquet_dataset(path, glue_database, glue_ta
     assert r["paths"][0].endswith("bucket-00000.snappy.parquet")
     assert r["paths"][1].endswith("bucket-00001.snappy.parquet")
 
-    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
+    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(first_bucket_df) == 2
     assert pandas_equals(pd.Series([0, 3], dtype=pd.Int64Dtype()), first_bucket_df["c0"])
     assert pandas_equals(pd.Series([4, 7], dtype=pd.Int64Dtype()), first_bucket_df["c1"])
     assert pandas_equals(pd.Series(["foo", "boo"], dtype=pd.StringDtype()), first_bucket_df["c2"])
 
-    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
+    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]], pyarrow_additional_kwargs={"ignore_metadata": True})
     assert len(second_bucket_df) == 2
     assert pandas_equals(pd.Series([1, 2], dtype=pd.Int64Dtype()), second_bucket_df["c0"])
     assert pandas_equals(pd.Series([6, 5], dtype=pd.Int64Dtype()), second_bucket_df["c1"])