Add 'end_user', 'user' and 'requested_model' on more prometheus metrics (#7399)

* fix(prometheus.py): support streaming end user litellm_proxy_total_requests_metric tracking

* fix(prometheus.py): add 'requested_model' and 'end_user_id' to 'litellm_request_total_latency_metric_bucket'

enables latency tracking by end user + requested model (see the label sketch after this commit message)

* fix(prometheus.py): add end user, user and requested model metrics to 'litellm_llm_api_latency_metric'

* test: update prometheus unit tests

* test(test_prometheus.py): update tests

* test(test_prometheus.py): fix test

* test: reorder test
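
Taken together, the changes below widen the label set on the proxy's latency histograms and switch the `.labels()` calls from positional to keyword arguments. Here is a minimal sketch of that pattern with the `prometheus_client` library; the label names come from the diff further down, while the buckets and every label value shown are illustrative placeholders, not values produced by LiteLLM.

```python
from prometheus_client import Histogram

# Label set used by the latency metrics in this commit
# (see litellm/integrations/prometheus.py below).
LABELS = [
    "end_user",
    "hashed_api_key",
    "api_key_alias",
    "requested_model",
    "team",
    "team_alias",
    "user",
    "model",
]

# Illustrative histogram; the real metric uses LATENCY_BUCKETS from the repo.
request_latency = Histogram(
    "litellm_request_total_latency_metric",
    "Total latency (seconds) for a request to LiteLLM",
    labelnames=LABELS,
    buckets=(0.005, 0.05, 0.5, 5.0, float("inf")),
)

# Keyword-style labels (mirroring the new `.labels(**{...})` calls) stay correct
# even if the order of `labelnames` changes later.
request_latency.labels(
    end_user="end-user-1",        # hypothetical values, for illustration only
    hashed_api_key="abc123",
    api_key_alias="prod-key",
    requested_model="gpt-4o",     # model group the client requested
    team="team-1",
    team_alias="core",
    user="user-1",
    model="gpt-4o-2024-08-06",    # underlying model litellm actually called
).observe(1.25)
```

Worth noting: each distinct combination of label values becomes its own Prometheus time series, so adding `end_user` and `user` raises cardinality; that is the trade-off behind the per-end-user latency breakdown this commit enables.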
krrishdholakia authored Dec 24, 2024
1 parent bd4ab14 commit 78fe124
Showing 8 changed files with 114 additions and 31 deletions.
82 changes: 61 additions & 21 deletions litellm/integrations/prometheus.py
@@ -69,11 +69,14 @@ def __init__(
"litellm_request_total_latency_metric",
"Total latency (seconds) for a request to LiteLLM",
labelnames=[
"model",
"hashed_api_key",
"api_key_alias",
"team",
"team_alias",
UserAPIKeyLabelNames.END_USER.value,
UserAPIKeyLabelNames.API_KEY_HASH.value,
UserAPIKeyLabelNames.API_KEY_ALIAS.value,
REQUESTED_MODEL,
UserAPIKeyLabelNames.TEAM.value,
UserAPIKeyLabelNames.TEAM_ALIAS.value,
UserAPIKeyLabelNames.USER.value,
UserAPIKeyLabelNames.LITELLM_MODEL.value,
],
buckets=LATENCY_BUCKETS,
)
@@ -82,11 +85,14 @@ def __init__(
"litellm_llm_api_latency_metric",
"Total latency (seconds) for a models LLM API call",
labelnames=[
"model",
"hashed_api_key",
"api_key_alias",
"team",
"team_alias",
UserAPIKeyLabelNames.LITELLM_MODEL.value,
UserAPIKeyLabelNames.API_KEY_HASH.value,
UserAPIKeyLabelNames.API_KEY_ALIAS.value,
UserAPIKeyLabelNames.TEAM.value,
UserAPIKeyLabelNames.TEAM_ALIAS.value,
UserAPIKeyLabelNames.REQUESTED_MODEL.value,
UserAPIKeyLabelNames.END_USER.value,
UserAPIKeyLabelNames.USER.value,
],
buckets=LATENCY_BUCKETS,
)
@@ -447,7 +453,20 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
self.set_llm_deployment_success_metrics(
kwargs, start_time, end_time, output_tokens
)
pass

if (
standard_logging_payload["stream"] is True
): # log successful streaming requests from logging event hook.
self.litellm_proxy_total_requests_metric.labels(
end_user=end_user_id,
hashed_api_key=user_api_key,
api_key_alias=user_api_key_alias,
requested_model=model,
team=user_api_team,
team_alias=user_api_team_alias,
user=user_id,
status_code="200",
).inc()

def _increment_token_metrics(
self,
Expand Down Expand Up @@ -631,23 +650,44 @@ def _set_latency_metrics(
api_call_total_time: timedelta = end_time - api_call_start_time
api_call_total_time_seconds = api_call_total_time.total_seconds()
self.litellm_llm_api_latency_metric.labels(
model,
user_api_key,
user_api_key_alias,
user_api_team,
user_api_team_alias,
**{
UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
UserAPIKeyLabelNames.TEAM.value: user_api_team,
UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
UserAPIKeyLabelNames.USER.value: standard_logging_payload[
"metadata"
]["user_api_key_user_id"],
UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
"metadata"
]["user_api_key_end_user_id"],
UserAPIKeyLabelNames.REQUESTED_MODEL.value: standard_logging_payload[
"model_group"
],
}
).observe(api_call_total_time_seconds)

# total request latency
if start_time is not None and isinstance(start_time, datetime):
total_time: timedelta = end_time - start_time
total_time_seconds = total_time.total_seconds()

self.litellm_request_total_latency_metric.labels(
model,
user_api_key,
user_api_key_alias,
user_api_team,
user_api_team_alias,
**{
UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
"metadata"
]["user_api_key_end_user_id"],
UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
REQUESTED_MODEL: standard_logging_payload["model_group"],
UserAPIKeyLabelNames.TEAM.value: user_api_team,
UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
UserAPIKeyLabelNames.USER.value: standard_logging_payload[
"metadata"
]["user_api_key_user_id"],
UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
}
).observe(total_time_seconds)

async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
8 changes: 8 additions & 0 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -2961,11 +2961,19 @@ def get_standard_logging_object_payload(
kwargs=kwargs,
)

stream: Optional[bool] = None
if (
kwargs.get("complete_streaming_response") is not None
or kwargs.get("async_complete_streaming_response") is not None
):
stream = True

payload: StandardLoggingPayload = StandardLoggingPayload(
id=str(id),
trace_id=kwargs.get("litellm_trace_id"), # type: ignore
call_type=call_type or "",
cache_hit=cache_hit,
stream=stream,
status=status,
saved_cache_cost=saved_cache_cost,
startTime=start_time_float,
14 changes: 9 additions & 5 deletions litellm/proxy/_new_secret_config.yaml
@@ -1,8 +1,12 @@
model_list:
- model_name: whisper
- model_name: openai/*
litellm_params:
model: whisper-1
model: openai/*
api_key: os.environ/OPENAI_API_KEY
model_info:
mode: audio_transcription

- model_name: fake-openai-endpoint
litellm_params:
model: openai/gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY

litellm_settings:
callbacks: ["prometheus"]
13 changes: 13 additions & 0 deletions litellm/types/integrations/prometheus.py
@@ -1,3 +1,5 @@
from enum import Enum

REQUESTED_MODEL = "requested_model"
EXCEPTION_STATUS = "exception_status"
EXCEPTION_CLASS = "exception_class"
@@ -41,3 +43,14 @@
300.0,
float("inf"),
)


class UserAPIKeyLabelNames(Enum):
END_USER = "end_user"
USER = "user"
API_KEY_HASH = "hashed_api_key"
API_KEY_ALIAS = "api_key_alias"
TEAM = "team"
TEAM_ALIAS = "team_alias"
REQUESTED_MODEL = REQUESTED_MODEL
LITELLM_MODEL = "model"
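
The enum above centralizes the label strings; `.value` is what hands the plain string to `prometheus_client`, which expects label names as ordinary strings. A small, self-contained sketch of how the enum resolves (the list comprehension at the end is only an illustration of one way to build a `labelnames` list; the actual code in `prometheus.py` lists the members explicitly):

```python
from enum import Enum

REQUESTED_MODEL = "requested_model"


class UserAPIKeyLabelNames(Enum):
    END_USER = "end_user"
    USER = "user"
    API_KEY_HASH = "hashed_api_key"
    API_KEY_ALIAS = "api_key_alias"
    TEAM = "team"
    TEAM_ALIAS = "team_alias"
    REQUESTED_MODEL = REQUESTED_MODEL
    LITELLM_MODEL = "model"


# .value yields the raw label string; the member itself is an Enum object.
assert UserAPIKeyLabelNames.REQUESTED_MODEL.value == "requested_model"
assert UserAPIKeyLabelNames.LITELLM_MODEL.value == "model"

# One way to collect every label name without repeating the literals.
labelnames = [member.value for member in UserAPIKeyLabelNames]
print(labelnames)
# ['end_user', 'user', 'hashed_api_key', 'api_key_alias',
#  'team', 'team_alias', 'requested_model', 'model']
```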
1 change: 1 addition & 0 deletions litellm/types/utils.py
@@ -1506,6 +1506,7 @@ class StandardLoggingPayload(TypedDict):
id: str
trace_id: str # Trace multiple LLM calls belonging to same overall request (e.g. fallbacks/retries)
call_type: str
stream: Optional[bool]
response_cost: float
response_cost_failure_debug_info: Optional[
StandardLoggingModelCostFailureDebugInformation
2 changes: 1 addition & 1 deletion tests/local_testing/test_amazing_vertex_completion.py
@@ -274,7 +274,7 @@ def test_vertex_ai_anthropic_streaming():
# )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_vertex_ai_anthropic_async():
async def test_aavertex_ai_anthropic_async():
# load_vertex_ai_credentials()
try:

21 changes: 19 additions & 2 deletions tests/logging_callback_tests/test_prometheus_unit_tests.py
@@ -46,6 +46,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
return StandardLoggingPayload(
id="test_id",
call_type="completion",
stream=False,
response_cost=0.1,
response_cost_failure_debug_info=None,
status="success",
@@ -72,6 +73,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
spend_logs_metadata=None,
requester_ip_address="127.0.0.1",
requester_metadata=None,
user_api_key_end_user_id="test_end_user",
),
cache_hit=False,
cache_key=None,
@@ -110,6 +112,7 @@ async def test_async_log_success_event(prometheus_logger):
"user_api_key": "test_key",
"user_api_key_user_id": "test_user",
"user_api_key_team_id": "test_team",
"user_api_key_end_user_id": "test_end_user",
}
},
"start_time": datetime.now(),
@@ -299,15 +302,29 @@ def test_set_latency_metrics(prometheus_logger):

# end_time - api_call_start_time
prometheus_logger.litellm_llm_api_latency_metric.labels.assert_called_once_with(
"gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
model="gpt-3.5-turbo",
hashed_api_key="key1",
api_key_alias="alias1",
team="team1",
team_alias="team_alias1",
user="test_user",
end_user="test_end_user",
requested_model="openai-gpt",
)
prometheus_logger.litellm_llm_api_latency_metric.labels().observe.assert_called_once_with(
1.5
)

# total latency for the request
prometheus_logger.litellm_request_total_latency_metric.labels.assert_called_once_with(
"gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
end_user="test_end_user",
hashed_api_key="key1",
api_key_alias="alias1",
requested_model="openai-gpt",
team="team1",
team_alias="team_alias1",
user="test_user",
model="gpt-3.5-turbo",
)
prometheus_logger.litellm_request_total_latency_metric.labels().observe.assert_called_once_with(
2.0
4 changes: 2 additions & 2 deletions tests/otel_tests/test_prometheus.py
@@ -145,12 +145,12 @@ async def test_proxy_success_metrics():

# Check if the success metric is present and correct
assert (
'litellm_request_total_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
'litellm_request_total_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
in metrics
)

assert (
'litellm_llm_api_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
'litellm_llm_api_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
in metrics
)
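
The updated assertions now expect the new `end_user`, `requested_model`, and `user` labels in the scraped histogram series. A hedged sketch of running the same check by hand against a locally running proxy (the URL, port, and exact label values are assumptions; the real e2e harness may drive the proxy differently):

```python
import requests  # assumes the proxy from _new_secret_config.yaml is running locally

# 4000 is assumed as the default LiteLLM proxy port; adjust if configured otherwise.
metrics = requests.get("http://localhost:4000/metrics", timeout=10).text

# After at least one request has gone through the proxy, the latency histograms
# should carry the labels added in this commit.
for label in ('end_user="', 'requested_model="', ',user="'):
    assert label in metrics, f"expected label {label!r} in scraped metrics"

# Example of a series this might match (values depend on the request that was made):
# litellm_request_total_latency_metric_bucket{api_key_alias="None",end_user="None",
#   hashed_api_key="...",le="0.005",model="fake",requested_model="fake-openai-endpoint",
#   team="None",team_alias="None",user="default_user_id"}
```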

