- import time
- from datetime import datetime
from types import MethodType
from typing import List, Optional, Set, Union, no_type_check

import pandas as pd
- import pyarrow
from pyspark import SparkContext
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.avro.functions import from_avro
from pyspark.sql.column import Column, _to_java_column
- from pyspark.sql.functions import col, from_json, udf
+ from pyspark.sql.functions import col, from_json
from pyspark.sql.streaming import StreamingQuery
- from pyspark.sql.types import BinaryType

from feast import FeatureView
from feast.data_format import AvroFormat, ConfluentAvroFormat, JsonFormat, StreamFormat
)
from feast.infra.provider import get_provider
from feast.stream_feature_view import StreamFeatureView
- from feast.utils import _convert_arrow_to_proto, _run_pyarrow_field_mapping


class SparkProcessorConfig(ProcessorConfig):
@@ -142,10 +137,7 @@ def ingest_stream_feature_view(
        self._create_infra_if_necessary()
        ingested_stream_df = self._ingest_stream_data()
        transformed_df = self._construct_transformation_plan(ingested_stream_df)
-         if self.fs.config.provider == "expedia":
-             online_store_query = self._write_stream_data_expedia(transformed_df, to)
-         else:
-             online_store_query = self._write_stream_data(transformed_df, to)
+         online_store_query = self._write_stream_data(transformed_df, to)
        return online_store_query

    # In the line 116 of __init__(), the "data_source" is assigned a stream_source (and has to be KafkaSource as in line 80).
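
# Illustrative sketch only, not part of this diff: the comment above assumes the processor's
# stream source is a Feast KafkaSource. A minimal definition might look like the snippet
# below; the topic, bootstrap servers, and schema are made-up values, and the keyword
# arguments are assumed from Feast's public KafkaSource / JsonFormat API.
from feast import KafkaSource
from feast.data_format import JsonFormat

driver_stats_stream = KafkaSource(
    name="driver_stats_stream",
    kafka_bootstrap_servers="localhost:9092",
    topic="drivers",
    timestamp_field="event_timestamp",
    message_format=JsonFormat(
        schema_json="driver_id integer, conv_rate double, event_timestamp timestamp"
    ),
)
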
@@ -247,208 +239,17 @@ def _construct_transformation_plan(self, df: StreamTable) -> StreamTable:

                if len(drop_list) > 0:
                    print(
-                         f"INFO!!! Dropping extra columns in the DataFrame: {drop_list}. Avoid unnecessary columns in the dataframe."
+                         f"INFO: Dropping extra columns in the DataFrame: {drop_list}. Avoid unnecessary columns in the dataframe."
                    )
                return df.drop(*drop_list)
            else:
                raise Exception(f"Stream source is not defined for {self.sfv.name}")
        elif isinstance(self.sfv, StreamFeatureView):
            return self.sfv.udf.__call__(df) if self.sfv.udf else df

-     def _write_stream_data_expedia(self, df: StreamTable, to: PushMode):
-         """
-         Ensures materialization logic in sync with stream ingestion.
-         Support only write to online store. No support for preprocess_fn also.
-         In Spark 3.2.2, toPandas() is throwing error when the dataframe has Boolean columns.
-         To fix this error, we need spark 3.4.0 or numpy < 1.20.0 but feast needs numpy >= 1.22.
-         Switching to use mapInPandas to solve the problem for boolean columns and
-         toPandas() also load all data into driver's memory.
-         Error Message:
-         AttributeError: module 'numpy' has no attribute 'bool'.
-         `np.bool` was a deprecated alias for the builtin `bool`.
-         To avoid this error in existing code, use `bool` by itself.
-         Doing this will not modify any behavior and is safe.
-         If you specifically wanted the numpy scalar type, use `np.bool_` here.
-         """
- 
-         # TODO: Support writing to offline store and preprocess_fn. Remove _write_stream_data method
- 
-         # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema.
-         def batch_write_pandas_df(
-             iterator, spark_serialized_artifacts, join_keys, batch_id
-         ):
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(f"{current_datetime_with_ms} Started processing batch {batch_id}")
-             for pdf in iterator:
-                 (
-                     feature_view,
-                     online_store,
-                     repo_config,
-                 ) = spark_serialized_artifacts.unserialize()
- 
-                 if isinstance(feature_view, StreamFeatureView):
-                     ts_field = feature_view.timestamp_field
-                 else:
-                     ts_field = feature_view.stream_source.timestamp_field
- 
-                 # Extract the latest feature values for each unique entity row (i.e. the join keys).
-                 pdf = (
-                     pdf.sort_values(by=[*join_keys, ts_field], ascending=False)
-                     .groupby(join_keys)
-                     .nth(0)
-                 )
- 
-                 table = pyarrow.Table.from_pandas(pdf)
-                 current_datetime_with_ms = datetime.now().strftime(
-                     "%Y-%m-%d %H:%M:%S.%f"
-                 )[:-3]
-                 print(
-                     f"{current_datetime_with_ms} Started processing _run_pyarrow_field_mapping {batch_id}"
-                 )
-                 if feature_view.batch_source.field_mapping is not None:
-                     table = _run_pyarrow_field_mapping(
-                         table, feature_view.batch_source.field_mapping
-                     )
- 
-                 join_key_to_value_type = {
-                     entity.name: entity.dtype.to_value_type()
-                     for entity in feature_view.entity_columns
-                 }
-                 current_datetime_with_ms = datetime.now().strftime(
-                     "%Y-%m-%d %H:%M:%S.%f"
-                 )[:-3]
-                 print(
-                     f"{current_datetime_with_ms} Started processing _convert_arrow_to_proto {batch_id}"
-                 )
-                 rows_to_write = _convert_arrow_to_proto(
-                     table, feature_view, join_key_to_value_type
-                 )
-                 online_store.online_write_batch(
-                     repo_config,
-                     feature_view,
-                     rows_to_write,
-                     lambda x: None,
-                 )
-                 # data_list = online_store.online_write_batch_connector(
-                 #     repo_config,
-                 #     feature_view,
-                 #     rows_to_write,
-                 #     lambda x: None,
-                 # )
- 
-                 # keyspace = repo_config.online_store.keyspace
- 
-                 # fqtable = CassandraOnlineStore._fq_table_name(
-                 #     keyspace, repo_config.project, feature_view
-                 # )
- 
-                 # schema = StructType(
-                 #     [
-                 #         StructField("feature_name", StringType(), False),
-                 #         StructField("value", BinaryType(), True),
-                 #         StructField("entity_key", StringType(), False),
-                 #         StructField("event_ts", TimestampType(), True),
-                 #     ]
-                 # )
- 
-                 # df = self.spark.createDataFrame(
-                 #     data_list,
-                 #     schema=schema,
-                 # )
- 
-                 # df.write.format("org.apache.spark.sql.cassandra").mode(
-                 #     "append"
-                 # ).options(table=fqtable, keyspace=keyspace).save()
- 
-                 yield pd.DataFrame([pd.Series(range(1, 2))])  # dummy result
- 
-         def batch_write(
-             sdf: DataFrame,
-             batch_id: int,
-             spark_serialized_artifacts,
-             join_keys,
-             feature_view,
-         ):
-             start_time = time.time()
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(f"{current_datetime_with_ms} Started batch write..")
-             sdf.mapInPandas(
-                 lambda x: batch_write_pandas_df(
-                     x, spark_serialized_artifacts, join_keys, batch_id
-                 ),
-                 "status int",
-             ).count()  # dummy action to force evaluation
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(
-                 f"{current_datetime_with_ms} Time taken to write batch {batch_id} is: {(time.time() - start_time) * 1000:.2f} ms"
-             )
- 
-         def batch_write_with_connector(
-             sdf: DataFrame,
-             batch_id: int,
-         ):
-             start_time = time.time()
-             convert_to_blob = udf(lambda s: s.encode("utf-8"), BinaryType())
-             sdf = sdf.withColumn("value", convert_to_blob(col("feature_value"))).drop(
-                 "event_header",
-                 "feature_value",
-             )
-             sdf.write.format("org.apache.spark.sql.cassandra").mode("append").options(
-                 table="mlpfs_scylladb_perf_test_cc_stream_fv", keyspace="feast"
-             ).save()
-             print(
-                 f"Time taken to write batch {batch_id} is: {(time.time() - start_time) * 1000:.2f} ms"
-             )
- 
-         query = None
-         if self.sfv.name != "cc_stream_fv":
-             query = (
-                 df.writeStream.outputMode("update")
-                 .option("checkpointLocation", self.checkpoint_location)
-                 .trigger(processingTime=self.processing_time)
-                 .foreachBatch(
-                     lambda df, batch_id: batch_write(
-                         df,
-                         batch_id,
-                         self.spark_serialized_artifacts,
-                         self.join_keys,
-                         self.sfv,
-                     )
-                 )
-                 .start()
-             )
-         else:
-             query = (
-                 df.writeStream.outputMode("update")
-                 .option("checkpointLocation", self.checkpoint_location)
-                 .trigger(processingTime=self.processing_time)
-                 .foreachBatch(
-                     lambda df, batch_id: batch_write_with_connector(
-                         df,
-                         batch_id,
-                     )
-                 )
-                 .start()
-             )
- 
-         query.awaitTermination(timeout=self.query_timeout)
-         return query
- 
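
# Illustrative sketch only, not part of this diff: the docstring of the removed method above
# explains why it used mapInPandas instead of toPandas(). The pattern is sketched below on
# made-up data; each executor receives an iterator of pandas DataFrames (one per Arrow batch),
# so nothing has to be collected onto the driver the way toPandas() requires.
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sdf = spark.createDataFrame([(1, True), (2, False)], "id int, flag boolean")

def write_partition(batches):
    for pdf in batches:  # pdf is a pandas DataFrame for one Arrow batch
        # a real pipeline would push pdf to the online store here
        yield pd.DataFrame({"status": [len(pdf)]})  # dummy result, one row per batch

sdf.mapInPandas(write_partition, "status long").count()  # count() forces evaluation
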
    def _write_stream_data(self, df: StreamTable, to: PushMode) -> StreamingQuery:
        # Validation occurs at the fs.write_to_online_store() phase against the stream feature view schema.
        def batch_write(row: DataFrame, batch_id: int):
-             current_datetime_with_ms = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[
-                 :-3
-             ]
-             print(
-                 f"{current_datetime_with_ms} Started batch write for batch_id: {batch_id}"
-             )
            rows: pd.DataFrame = row.toPandas()

            # Extract the latest feature values for each unique entity row (i.e. the join keys).
@@ -463,7 +264,6 @@ def batch_write(row: DataFrame, batch_id: int):
                .nth(0)
            )
            # Created column is not used anywhere in the code, but it is added to the dataframe.
-             # Expedia provider drops the unused columns from dataframe
            # Commenting this out as it is not used anywhere in the code
            # rows["created"] = pd.to_datetime("now", utc=True)

@@ -477,19 +277,7 @@ def batch_write(row: DataFrame, batch_id: int):
            # Finally persist the data to the online store and/or offline store.
            if rows.size > 0:
                if to == PushMode.ONLINE or to == PushMode.ONLINE_AND_OFFLINE:
-                     current_datetime_with_ms = datetime.now().strftime(
-                         "%Y-%m-%d %H:%M:%S.%f"
-                     )[:-3]
-                     print(
-                         f"{current_datetime_with_ms} Started write_to_online_store for batch_id: {batch_id}"
-                     )
                    self.fs.write_to_online_store(self.sfv.name, rows)
-                     current_datetime_with_ms = datetime.now().strftime(
-                         "%Y-%m-%d %H:%M:%S.%f"
-                     )[:-3]
-                     print(
-                         f"{current_datetime_with_ms} Completed write_to_online_store for batch_id: {batch_id}"
-                     )
                if to == PushMode.OFFLINE or to == PushMode.ONLINE_AND_OFFLINE:
                    self.fs.write_to_offline_store(self.sfv.name, rows)

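
# Illustrative sketch only, not part of this diff: batch_write above keeps the latest feature
# values per entity by sorting on the join keys plus the timestamp field and taking the first
# row of each group. The same pattern, on made-up data and with plain pandas:
import pandas as pd

rows = pd.DataFrame(
    {
        "driver_id": [1, 1, 2],
        "event_timestamp": pd.to_datetime(
            ["2024-01-01 00:00", "2024-01-01 00:05", "2024-01-01 00:01"]
        ),
        "conv_rate": [0.1, 0.2, 0.9],
    }
)
join_keys = ["driver_id"]
latest = (
    rows.sort_values(by=[*join_keys, "event_timestamp"], ascending=False)
    .groupby(join_keys)
    .nth(0)
)
print(latest)  # one row per driver_id, carrying its newest conv_rate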