diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index 569d9daaf06a..5d38afd3900e 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -69,11 +69,14 @@ def __init__(
             "litellm_request_total_latency_metric",
             "Total latency (seconds) for a request to LiteLLM",
             labelnames=[
-                "model",
-                "hashed_api_key",
-                "api_key_alias",
-                "team",
-                "team_alias",
+                UserAPIKeyLabelNames.END_USER.value,
+                UserAPIKeyLabelNames.API_KEY_HASH.value,
+                UserAPIKeyLabelNames.API_KEY_ALIAS.value,
+                REQUESTED_MODEL,
+                UserAPIKeyLabelNames.TEAM.value,
+                UserAPIKeyLabelNames.TEAM_ALIAS.value,
+                UserAPIKeyLabelNames.USER.value,
+                UserAPIKeyLabelNames.LITELLM_MODEL.value,
             ],
             buckets=LATENCY_BUCKETS,
         )
@@ -82,11 +85,14 @@ def __init__(
             "litellm_llm_api_latency_metric",
             "Total latency (seconds) for a models LLM API call",
             labelnames=[
-                "model",
-                "hashed_api_key",
-                "api_key_alias",
-                "team",
-                "team_alias",
+                UserAPIKeyLabelNames.LITELLM_MODEL.value,
+                UserAPIKeyLabelNames.API_KEY_HASH.value,
+                UserAPIKeyLabelNames.API_KEY_ALIAS.value,
+                UserAPIKeyLabelNames.TEAM.value,
+                UserAPIKeyLabelNames.TEAM_ALIAS.value,
+                UserAPIKeyLabelNames.REQUESTED_MODEL.value,
+                UserAPIKeyLabelNames.END_USER.value,
+                UserAPIKeyLabelNames.USER.value,
             ],
             buckets=LATENCY_BUCKETS,
         )
@@ -447,7 +453,20 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti
         self.set_llm_deployment_success_metrics(
             kwargs, start_time, end_time, output_tokens
         )
-        pass
+
+        if (
+            standard_logging_payload["stream"] is True
+        ):  # log successful streaming requests from logging event hook.
+            self.litellm_proxy_total_requests_metric.labels(
+                end_user=end_user_id,
+                hashed_api_key=user_api_key,
+                api_key_alias=user_api_key_alias,
+                requested_model=model,
+                team=user_api_team,
+                team_alias=user_api_team_alias,
+                user=user_id,
+                status_code="200",
+            ).inc()
 
     def _increment_token_metrics(
         self,
@@ -631,23 +650,44 @@ def _set_latency_metrics(
             api_call_total_time: timedelta = end_time - api_call_start_time
             api_call_total_time_seconds = api_call_total_time.total_seconds()
             self.litellm_llm_api_latency_metric.labels(
-                model,
-                user_api_key,
-                user_api_key_alias,
-                user_api_team,
-                user_api_team_alias,
+                **{
+                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                    UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
+                    UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
+                    UserAPIKeyLabelNames.TEAM.value: user_api_team,
+                    UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
+                    UserAPIKeyLabelNames.USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_user_id"],
+                    UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_end_user_id"],
+                    UserAPIKeyLabelNames.REQUESTED_MODEL.value: standard_logging_payload[
+                        "model_group"
+                    ],
+                }
             ).observe(api_call_total_time_seconds)
 
         # total request latency
         if start_time is not None and isinstance(start_time, datetime):
             total_time: timedelta = end_time - start_time
             total_time_seconds = total_time.total_seconds()
+
             self.litellm_request_total_latency_metric.labels(
-                model,
-                user_api_key,
-                user_api_key_alias,
-                user_api_team,
-                user_api_team_alias,
+                **{
+                    UserAPIKeyLabelNames.END_USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_end_user_id"],
+                    UserAPIKeyLabelNames.API_KEY_HASH.value: user_api_key,
+                    UserAPIKeyLabelNames.API_KEY_ALIAS.value: user_api_key_alias,
+                    REQUESTED_MODEL: standard_logging_payload["model_group"],
+                    UserAPIKeyLabelNames.TEAM.value: user_api_team,
+                    UserAPIKeyLabelNames.TEAM_ALIAS.value: user_api_team_alias,
+                    UserAPIKeyLabelNames.USER.value: standard_logging_payload[
+                        "metadata"
+                    ]["user_api_key_user_id"],
+                    UserAPIKeyLabelNames.LITELLM_MODEL.value: model,
+                }
             ).observe(total_time_seconds)
 
     async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index f460cf757ef0..dc558e427a25 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -2961,11 +2961,19 @@ def get_standard_logging_object_payload(
             kwargs=kwargs,
         )
 
+        stream: Optional[bool] = None
+        if (
+            kwargs.get("complete_streaming_response") is not None
+            or kwargs.get("async_complete_streaming_response") is not None
+        ):
+            stream = True
+
         payload: StandardLoggingPayload = StandardLoggingPayload(
             id=str(id),
             trace_id=kwargs.get("litellm_trace_id"),  # type: ignore
             call_type=call_type or "",
             cache_hit=cache_hit,
+            stream=stream,
             status=status,
             saved_cache_cost=saved_cache_cost,
             startTime=start_time_float,
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index 74739c8e341f..b454c37ad604 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -1,8 +1,12 @@
 model_list:
-  - model_name: whisper
+  - model_name: openai/*
     litellm_params:
-      model: whisper-1
+      model: openai/*
       api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      mode: audio_transcription
-      
\ No newline at end of file
+  - model_name: fake-openai-endpoint
+    litellm_params:
+      model: openai/gpt-3.5-turbo
+      api_key: os.environ/OPENAI_API_KEY
+
+litellm_settings:
+  callbacks: ["prometheus"]
\ No newline at end of file
diff --git a/litellm/types/integrations/prometheus.py b/litellm/types/integrations/prometheus.py
index c5d6fc7ab64c..22da0425e4f9 100644
--- a/litellm/types/integrations/prometheus.py
+++ b/litellm/types/integrations/prometheus.py
@@ -1,3 +1,5 @@
+from enum import Enum
+
 REQUESTED_MODEL = "requested_model"
 EXCEPTION_STATUS = "exception_status"
 EXCEPTION_CLASS = "exception_class"
@@ -41,3 +43,14 @@
     300.0,
     float("inf"),
 )
+
+
+class UserAPIKeyLabelNames(Enum):
+    END_USER = "end_user"
+    USER = "user"
+    API_KEY_HASH = "hashed_api_key"
+    API_KEY_ALIAS = "api_key_alias"
+    TEAM = "team"
+    TEAM_ALIAS = "team_alias"
+    REQUESTED_MODEL = REQUESTED_MODEL
+    LITELLM_MODEL = "model"
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 8176d9a50c1a..f0ac2fcab2cd 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -1506,6 +1506,7 @@ class StandardLoggingPayload(TypedDict):
     id: str
     trace_id: str  # Trace multiple LLM calls belonging to same overall request (e.g. fallbacks/retries)
     call_type: str
+    stream: Optional[bool]
     response_cost: float
     response_cost_failure_debug_info: Optional[
         StandardLoggingModelCostFailureDebugInformation
diff --git a/tests/local_testing/test_amazing_vertex_completion.py b/tests/local_testing/test_amazing_vertex_completion.py
index 9fa95437db56..ca26cf468410 100644
--- a/tests/local_testing/test_amazing_vertex_completion.py
+++ b/tests/local_testing/test_amazing_vertex_completion.py
@@ -274,7 +274,7 @@ def test_vertex_ai_anthropic_streaming():
 # )
 @pytest.mark.asyncio
 @pytest.mark.flaky(retries=3, delay=1)
-async def test_vertex_ai_anthropic_async():
+async def test_aavertex_ai_anthropic_async():
     # load_vertex_ai_credentials()
 
     try:
diff --git a/tests/logging_callback_tests/test_prometheus_unit_tests.py b/tests/logging_callback_tests/test_prometheus_unit_tests.py
index 19c183d7383a..8caf5d079f13 100644
--- a/tests/logging_callback_tests/test_prometheus_unit_tests.py
+++ b/tests/logging_callback_tests/test_prometheus_unit_tests.py
@@ -46,6 +46,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
     return StandardLoggingPayload(
         id="test_id",
         call_type="completion",
+        stream=False,
         response_cost=0.1,
         response_cost_failure_debug_info=None,
         status="success",
@@ -72,6 +73,7 @@ def create_standard_logging_payload() -> StandardLoggingPayload:
             spend_logs_metadata=None,
             requester_ip_address="127.0.0.1",
             requester_metadata=None,
+            user_api_key_end_user_id="test_end_user",
         ),
         cache_hit=False,
         cache_key=None,
@@ -110,6 +112,7 @@ async def test_async_log_success_event(prometheus_logger):
                 "user_api_key": "test_key",
                 "user_api_key_user_id": "test_user",
                 "user_api_key_team_id": "test_team",
+                "user_api_key_end_user_id": "test_end_user",
             }
         },
         "start_time": datetime.now(),
@@ -299,7 +302,14 @@ def test_set_latency_metrics(prometheus_logger):
 
     # end_time - api_call_start_time
     prometheus_logger.litellm_llm_api_latency_metric.labels.assert_called_once_with(
-        "gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
+        model="gpt-3.5-turbo",
+        hashed_api_key="key1",
+        api_key_alias="alias1",
+        team="team1",
+        team_alias="team_alias1",
+        user="test_user",
+        end_user="test_end_user",
+        requested_model="openai-gpt",
     )
     prometheus_logger.litellm_llm_api_latency_metric.labels().observe.assert_called_once_with(
         1.5
@@ -307,7 +317,14 @@ def test_set_latency_metrics(prometheus_logger):
 
     # total latency for the request
    prometheus_logger.litellm_request_total_latency_metric.labels.assert_called_once_with(
-        "gpt-3.5-turbo", "key1", "alias1", "team1", "team_alias1"
+        end_user="test_end_user",
+        hashed_api_key="key1",
+        api_key_alias="alias1",
+        requested_model="openai-gpt",
+        team="team1",
+        team_alias="team_alias1",
+        user="test_user",
+        model="gpt-3.5-turbo",
     )
     prometheus_logger.litellm_request_total_latency_metric.labels().observe.assert_called_once_with(
         2.0
diff --git a/tests/otel_tests/test_prometheus.py b/tests/otel_tests/test_prometheus.py
index 3c52781ce8e8..9bb9ae8d4cca 100644
--- a/tests/otel_tests/test_prometheus.py
+++ b/tests/otel_tests/test_prometheus.py
@@ -145,12 +145,12 @@ async def test_proxy_success_metrics():
 
     # Check if the success metric is present and correct
     assert (
-        'litellm_request_total_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
+        'litellm_request_total_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
         in metrics
     )
 
     assert (
-        'litellm_llm_api_latency_metric_bucket{api_key_alias="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",team="None",team_alias="None"}'
+        'litellm_llm_api_latency_metric_bucket{api_key_alias="None",end_user="None",hashed_api_key="88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",le="0.005",model="fake",requested_model="fake-openai-endpoint",team="None",team_alias="None",user="default_user_id"}'
         in metrics
     )
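
Reviewer note (illustration only, not part of the patch): the prometheus.py hunks above move both latency histograms from positional labels to keyword-style labels keyed by the new UserAPIKeyLabelNames enum. The standalone sketch below shows that pattern against prometheus_client, with a made-up metric name and label values; keyword labels passed via **{...} must match the declared labelnames exactly, and a mismatched key raises ValueError instead of silently binding a value to the wrong label, which is the safety the enum-keyed dict buys over positional arguments.

# Standalone sketch of the keyword-label pattern (hypothetical metric name and values).
from enum import Enum

from prometheus_client import Histogram


class UserAPIKeyLabelNames(Enum):
    END_USER = "end_user"
    USER = "user"
    API_KEY_HASH = "hashed_api_key"
    API_KEY_ALIAS = "api_key_alias"
    TEAM = "team"
    TEAM_ALIAS = "team_alias"
    REQUESTED_MODEL = "requested_model"
    LITELLM_MODEL = "model"


# Declare the histogram with the enum-derived label names, mirroring the diff above.
example_latency = Histogram(
    "example_request_total_latency_seconds",  # hypothetical name, not LiteLLM's
    "Total latency (seconds) for an example request",
    labelnames=[label.value for label in UserAPIKeyLabelNames],
)

# Keyword form: ordering no longer matters, and prometheus_client raises
# ValueError if a key does not match one of the declared label names.
example_latency.labels(
    **{
        UserAPIKeyLabelNames.END_USER.value: "end-user-1",
        UserAPIKeyLabelNames.USER.value: "user-1",
        UserAPIKeyLabelNames.API_KEY_HASH.value: "hashed-key",
        UserAPIKeyLabelNames.API_KEY_ALIAS.value: "key-alias",
        UserAPIKeyLabelNames.TEAM.value: "team-1",
        UserAPIKeyLabelNames.TEAM_ALIAS.value: "team-alias-1",
        UserAPIKeyLabelNames.REQUESTED_MODEL.value: "fake-openai-endpoint",
        UserAPIKeyLabelNames.LITELLM_MODEL.value: "gpt-3.5-turbo",
    }
).observe(1.5)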
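
Reviewer note (illustration only, not part of the patch): the litellm_logging.py hunk infers the new stream field from whether a complete streaming response is present in kwargs. A minimal standalone version of that check follows; the helper name is made up for the example, while the condition mirrors the diff.

# Hypothetical helper mirroring the stream detection added to
# get_standard_logging_object_payload(); returns None when the flag is unknown.
from typing import Any, Dict, Optional


def infer_stream_flag(kwargs: Dict[str, Any]) -> Optional[bool]:
    if (
        kwargs.get("complete_streaming_response") is not None
        or kwargs.get("async_complete_streaming_response") is not None
    ):
        return True
    return None


# Streaming requests carry a complete (sync or async) streaming response object.
assert infer_stream_flag({"complete_streaming_response": object()}) is True
assert infer_stream_flag({"async_complete_streaming_response": object()}) is True
assert infer_stream_flag({}) is None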