- import time
- from datetime import datetime
from types import MethodType
from typing import List, Optional, Set, Union, no_type_check

import pandas as pd
- import pyarrow
from pyspark import SparkContext
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.column import Column, _to_java_column
- from pyspark.sql.functions import col, from_json, udf
+ from pyspark.sql.functions import col, from_json
from pyspark.sql.streaming import StreamingQuery
- from pyspark.sql.types import BinaryType

from feast import FeatureView
from feast.data_format import AvroFormat, ConfluentAvroFormat, JsonFormat, StreamFormat
)
from feast.infra.provider import get_provider
from feast.stream_feature_view import StreamFeatureView
- from feast.utils import _convert_arrow_to_proto, _run_pyarrow_field_mapping


class SparkProcessorConfig(ProcessorConfig):
@@ -142,10 +137,7 @@ def ingest_stream_feature_view(
        self._create_infra_if_necessary()
        ingested_stream_df = self._ingest_stream_data()
        transformed_df = self._construct_transformation_plan(ingested_stream_df)
-         if self.fs.config.provider == "expedia":
-             online_store_query = self._write_stream_data_expedia(transformed_df, to)
-         else:
-             online_store_query = self._write_stream_data(transformed_df, to)
+         online_store_query = self._write_stream_data(transformed_df, to)
        return online_store_query

    # In the line 116 of __init__(), the "data_source" is assigned a stream_source (and has to be KafkaSource as in line 80).
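
# Illustrative sketch only, not part of this diff: the comment above assumes the processor's
# stream source is a Feast KafkaSource. A minimal definition might look like the snippet
# below; the topic, bootstrap servers, and schema are made-up values, and the keyword
# arguments are assumed from Feast's public KafkaSource / JsonFormat API.
from feast import KafkaSource
from feast.data_format import JsonFormat

driver_stats_stream = KafkaSource(
    name="driver_stats_stream",
    kafka_bootstrap_servers="localhost:9092",
    topic="drivers",
    timestamp_field="event_timestamp",
    message_format=JsonFormat(
        schema_json="driver_id integer, conv_rate double, event_timestamp timestamp"
    ),
)
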
@@ -247,208 +239,17 @@ def _construct_transformation_plan(self, df: StreamTable) -> StreamTable:

                if len(drop_list) > 0:
                    print(
-                         f"INFO!!! Dropping extra columns in the DataFrame: {drop_list}. Avoid unnecessary columns in the dataframe."
+                         f"INFO: Dropping extra columns in the DataFrame: {drop_list}. Avoid unnecessary columns in the dataframe."
                    )
                return df.drop(*drop_list)
            else:
                raise Exception(f"Stream source is not defined for {self.sfv.name}")
        elif isinstance(self.sfv, StreamFeatureView):
            return self.sfv.udf.__call__(df) if self.sfv.udf else df

-     def _write_stream_data_expedia(self, df: StreamTable, to: PushMode):
-         """
-         Ensures materialization logic in sync with stream ingestion.
-         Support only write to online store. No support for preprocess_fn also.
-         In Spark 3.2.2, toPandas() is throwing error when the dataframe has Boolean columns.
-         To fix this error, we need spark 3.4.0 or numpy < 1.20.0 but feast needs numpy >= 1.22.
-         Switching to use mapInPandas to solve the problem for boolean columns and
-         toPandas() also load all data into driver's memory.
-         Error Message:
-         AttributeError: module 'numpy' has no attribute 'bool'.
-         `np.bool` was a deprecated alias for the builtin `bool`.
-         To avoid this error in existing code, use `bool` by itself.
-         Doing this will not modify any behavior and is safe.
-         If you specifically wanted the numpy scalar type, use `np.bool_` here.
-         """
- 
-         # TODO: Support writing to offline store and preprocess_fn. Remove _write_stream_data method
- 
-         # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema.
-         def batch_write_pandas_df(
-             iterator, spark_serialized_artifacts, join_keys, batch_id
-         ):
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(f"{current_datetime_with_ms} Started processing batch {batch_id}")
-             for pdf in iterator:
-                 (
-                     feature_view,
-                     online_store,
-                     repo_config,
-                 ) = spark_serialized_artifacts.unserialize()
- 
-                 if isinstance(feature_view, StreamFeatureView):
-                     ts_field = feature_view.timestamp_field
-                 else:
-                     ts_field = feature_view.stream_source.timestamp_field
- 
-                 # Extract the latest feature values for each unique entity row (i.e. the join keys).
-                 pdf = (
-                     pdf.sort_values(by=[*join_keys, ts_field], ascending=False)
-                     .groupby(join_keys)
-                     .nth(0)
-                 )
- 
-                 table = pyarrow.Table.from_pandas(pdf)
-                 current_datetime_with_ms = datetime.now().strftime(
-                     "%Y-%m-%d %H:%M:%S.%f"
-                 )[:-3]
-                 print(
-                     f"{current_datetime_with_ms} Started processing _run_pyarrow_field_mapping {batch_id}"
-                 )
-                 if feature_view.batch_source.field_mapping is not None:
-                     table = _run_pyarrow_field_mapping(
-                         table, feature_view.batch_source.field_mapping
-                     )
- 
-                 join_key_to_value_type = {
-                     entity.name: entity.dtype.to_value_type()
-                     for entity in feature_view.entity_columns
-                 }
-                 current_datetime_with_ms = datetime.now().strftime(
-                     "%Y-%m-%d %H:%M:%S.%f"
-                 )[:-3]
-                 print(
-                     f"{current_datetime_with_ms} Started processing _convert_arrow_to_proto {batch_id}"
-                 )
-                 rows_to_write = _convert_arrow_to_proto(
-                     table, feature_view, join_key_to_value_type
-                 )
-                 online_store.online_write_batch(
-                     repo_config,
-                     feature_view,
-                     rows_to_write,
-                     lambda x: None,
-                 )
-                 # data_list = online_store.online_write_batch_connector(
-                 #     repo_config,
-                 #     feature_view,
-                 #     rows_to_write,
-                 #     lambda x: None,
-                 # )
- 
-                 # keyspace = repo_config.online_store.keyspace
- 
-                 # fqtable = CassandraOnlineStore._fq_table_name(
-                 #     keyspace, repo_config.project, feature_view
-                 # )
- 
-                 # schema = StructType(
-                 #     [
-                 #         StructField("feature_name", StringType(), False),
-                 #         StructField("value", BinaryType(), True),
-                 #         StructField("entity_key", StringType(), False),
-                 #         StructField("event_ts", TimestampType(), True),
-                 #     ]
-                 # )
- 
-                 # df = self.spark.createDataFrame(
-                 #     data_list,
-                 #     schema=schema,
-                 # )
- 
-                 # df.write.format("org.apache.spark.sql.cassandra").mode(
-                 #     "append"
-                 # ).options(table=fqtable, keyspace=keyspace).save()
- 
-                 yield pd.DataFrame([pd.Series(range(1, 2))])  # dummy result
- 
-         def batch_write(
-             sdf: DataFrame,
-             batch_id: int,
-             spark_serialized_artifacts,
-             join_keys,
-             feature_view,
-         ):
-             start_time = time.time()
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(f"{current_datetime_with_ms} Started batch write..")
-             sdf.mapInPandas(
-                 lambda x: batch_write_pandas_df(
-                     x, spark_serialized_artifacts, join_keys, batch_id
-                 ),
-                 "status int",
-             ).count()  # dummy action to force evaluation
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(
-                 f"{current_datetime_with_ms} Time taken to write batch {batch_id} is: {(time.time() - start_time) * 1000:.2f} ms"
-             )
- 
-         def batch_write_with_connector(
-             sdf: DataFrame,
-             batch_id: int,
-         ):
-             start_time = time.time()
-             convert_to_blob = udf(lambda s: s.encode("utf-8"), BinaryType())
-             sdf = sdf.withColumn("value", convert_to_blob(col("feature_value"))).drop(
-                 "event_header",
-                 "feature_value",
-             )
-             sdf.write.format("org.apache.spark.sql.cassandra").mode("append").options(
-                 table="mlpfs_scylladb_perf_test_cc_stream_fv", keyspace="feast"
-             ).save()
-             print(
-                 f"Time taken to write batch {batch_id} is: {(time.time() - start_time) * 1000:.2f} ms"
-             )
- 
-         query = None
-         if self.sfv.name != "cc_stream_fv":
-             query = (
-                 df.writeStream.outputMode("update")
-                 .option("checkpointLocation", self.checkpoint_location)
-                 .trigger(processingTime=self.processing_time)
-                 .foreachBatch(
-                     lambda df, batch_id: batch_write(
-                         df,
-                         batch_id,
-                         self.spark_serialized_artifacts,
-                         self.join_keys,
-                         self.sfv,
-                     )
-                 )
-                 .start()
-             )
-         else:
-             query = (
-                 df.writeStream.outputMode("update")
-                 .option("checkpointLocation", self.checkpoint_location)
-                 .trigger(processingTime=self.processing_time)
-                 .foreachBatch(
-                     lambda df, batch_id: batch_write_with_connector(
-                         df,
-                         batch_id,
-                     )
-                 )
-                 .start()
-             )
- 
-         query.awaitTermination(timeout=self.query_timeout)
-         return query
- 
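
# Illustrative sketch only, not part of this diff: the docstring of the removed method above
# explains why it used mapInPandas instead of toPandas(). The pattern is sketched below on
# made-up data; each executor receives an iterator of pandas DataFrames (one per Arrow batch),
# so nothing has to be collected onto the driver the way toPandas() requires.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sdf = spark.createDataFrame([(1, True), (2, False)], "id int, flag boolean")

def write_partition(batches):
    for pdf in batches:  # pdf is a pandas DataFrame for one Arrow batch
        # a real pipeline would push pdf to the online store here
        yield pd.DataFrame({"status": [len(pdf)]})  # dummy result, one row per batch

sdf.mapInPandas(write_partition, "status long").count()  # count() forces evaluation
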
    def _write_stream_data(self, df: StreamTable, to: PushMode) -> StreamingQuery:
        # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema.
        def batch_write(row: DataFrame, batch_id: int):
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(
-                 f"{current_datetime_with_ms} Started batch write for batch_id: {batch_id}"
-             )
            rows: pd.DataFrame = row.toPandas()

            # Extract the latest feature values for each unique entity row (i.e. the join keys).
@@ -463,7 +264,6 @@ def batch_write(row: DataFrame, batch_id: int):
                .nth(0)
            )
            # Created column is not used anywhere in the code, but it is added to the dataframe.
-             # Expedia provider drops the unused columns from dataframe
            # Commenting this out as it is not used anywhere in the code
            # rows["created"] = pd.to_datetime("now", utc=True)

@@ -477,19 +277,7 @@ def batch_write(row: DataFrame, batch_id: int):
            # Finally persist the data to the online store and/or offline store.
            if rows.size > 0:
                if to == PushMode.ONLINE or to == PushMode.ONLINE_AND_OFFLINE:
-                     current_datetime_with_ms = datetime.now().strftime(
-                         "%Y-%m-%d %H:%M:%S.%f"
-                     )[:-3]
-                     print(
-                         f"{current_datetime_with_ms} Started write_to_online_store for batch_id: {batch_id}"
-                     )
                    self.fs.write_to_online_store(self.sfv.name, rows)
-                     current_datetime_with_ms = datetime.now().strftime(
-                         "%Y-%m-%d %H:%M:%S.%f"
-                     )[:-3]
-                     print(
-                         f"{current_datetime_with_ms} Completed write_to_online_store for batch_id: {batch_id}"
-                     )
                if to == PushMode.OFFLINE or to == PushMode.ONLINE_AND_OFFLINE:
                    self.fs.write_to_offline_store(self.sfv.name, rows)

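
# Illustrative sketch only, not part of this diff: batch_write above keeps the latest feature
# values per entity by sorting on the join keys plus the timestamp field and taking the first
# row of each group. The same pattern, on made-up data and with plain pandas:
import pandas as pd

rows = pd.DataFrame(
    {
        "driver_id": [1, 1, 2],
        "event_timestamp": pd.to_datetime(
            ["2024-01-01 00:00", "2024-01-01 00:05", "2024-01-01 00:01"]
        ),
        "conv_rate": [0.1, 0.2, 0.9],
    }
)
join_keys = ["driver_id"]
latest = (
    rows.sort_values(by=[*join_keys, "event_timestamp"], ascending=False)
    .groupby(join_keys)
    .nth(0)
)
print(latest)  # one row per driver_id, carrying its newest conv_rate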