Commit 94667e1

Merge pull request #8386 from minwhoo/triton-completions-streaming-fix
Fix triton streaming completions bug
2 parents: 398020a + c62be18

2 files changed (+58, -12 lines)
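The change routes stream=True requests on a /generate api_base to /generate_stream and decodes the server-sent events incrementally. A minimal usage sketch of the call path this fix enables — the model name, prompt, and server address below are illustrative assumptions, not values from this PR:

import litellm

# Hypothetical Triton server and model name, for illustration only.
response = litellm.completion(
    model="triton/ensemble",
    messages=[{"role": "user", "content": "Who are you?"}],
    api_base="http://localhost:8000/generate",  # rewritten to /generate_stream when streaming
    max_tokens=10,
    stream=True,
)

for chunk in response:
    delta = chunk.choices[0].delta.content
    if delta is not None:  # the final chunk's delta is None (per the test below)
        print(delta, end="")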

litellm/llms/triton/completion/transformation.py

Lines changed: 26 additions & 2 deletions
@@ -3,7 +3,7 @@
 """
 
 import json
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional, Union
 
 from httpx import Headers, Response
 
@@ -67,6 +67,18 @@ def map_openai_params(
                 optional_params[param] = value
         return optional_params
 
+    def get_complete_url(
+        self,
+        api_base: str,
+        model: str,
+        optional_params: dict,
+        stream: Optional[bool] = None,
+    ) -> str:
+        llm_type = self._get_triton_llm_type(api_base)
+        if llm_type == "generate" and stream:
+            return api_base + "_stream"
+        return api_base
+
     def transform_response(
         self,
         model: str,
@@ -149,6 +161,18 @@ def _get_triton_llm_type(self, api_base: str) -> Literal["generate", "infer"]:
         else:
             raise ValueError(f"Invalid Triton API base: {api_base}")
 
+    def get_model_response_iterator(
+        self,
+        streaming_response: Union[Iterator[str], AsyncIterator[str], ModelResponse],
+        sync_stream: bool,
+        json_mode: Optional[bool] = False,
+    ) -> Any:
+        return TritonResponseIterator(
+            streaming_response=streaming_response,
+            sync_stream=sync_stream,
+            json_mode=json_mode,
+        )
+
 
 class TritonGenerateConfig(TritonConfig):
     """
@@ -204,7 +228,7 @@ def transform_response(
         return model_response
 
 
-class TritonInferConfig(TritonGenerateConfig):
+class TritonInferConfig(TritonConfig):
     """
     Transformations for triton /infer endpoint (this is an infer model with a custom model on triton)
     """

tests/llm_translation/test_triton.py

Lines changed: 32 additions & 10 deletions
@@ -49,16 +49,26 @@ def test_split_embedding_by_shape_fails_with_shape_value_error():
     )
 
 
-def test_completion_triton_generate_api():
+@pytest.mark.parametrize("stream", [True, False])
+def test_completion_triton_generate_api(stream):
     try:
        mock_response = MagicMock()
-
-        def return_val():
-            return {
-                "text_output": "I am an AI assistant",
-            }
-
-        mock_response.json = return_val
+        if stream:
+            def mock_iter_lines():
+                mock_output = ''.join([
+                    'data: {"model_name":"ensemble","model_version":"1","sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":"' + t + '"}\n\n'
+                    for t in ["I", " am", " an", " AI", " assistant"]
+                ])
+                for out in mock_output.split('\n'):
+                    yield out
+            mock_response.iter_lines = mock_iter_lines
+        else:
+            def return_val():
+                return {
+                    "text_output": "I am an AI assistant",
+                }
+
+            mock_response.json = return_val
        mock_response.status_code = 200
 
        with patch(
@@ -71,6 +81,7 @@ def return_val():
                max_tokens=10,
                timeout=5,
                api_base="http://localhost:8000/generate",
+                stream=stream,
            )
 
            # Verify the call was made
@@ -81,7 +92,10 @@ def return_val():
            call_kwargs = mock_post.call_args.kwargs  # Access kwargs directly
 
            # Verify URL
-            assert call_kwargs["url"] == "http://localhost:8000/generate"
+            if stream:
+                assert call_kwargs["url"] == "http://localhost:8000/generate_stream"
+            else:
+                assert call_kwargs["url"] == "http://localhost:8000/generate"
 
            # Parse the request data from the JSON string
            request_data = json.loads(call_kwargs["data"])
@@ -91,7 +105,15 @@ def return_val():
            assert request_data["parameters"]["max_tokens"] == 10
 
            # Verify response
-            assert response.choices[0].message.content == "I am an AI assistant"
+            if stream:
+                tokens = ["I", " am", " an", " AI", " assistant", None]
+                idx = 0
+                for chunk in response:
+                    assert chunk.choices[0].delta.content == tokens[idx]
+                    idx += 1
+                assert idx == len(tokens)
+            else:
+                assert response.choices[0].message.content == "I am an AI assistant"
 
    except Exception as e:
        print("exception", e)
