Skip to content

Commit ce3b30f

Browse files
authored
feat: add support for chat sessions (#167)
# What does this PR do? today, to chat with a model, a user has to run one command per completion. add --session which enables a user to have an interactive chat session with the inference model. --session can be passed with or without --message. If no --message is passed, the user is prompted to give the first message. This is useful as it also enables context to be saved between each completion unlike today. ``` llama-stack-client inference chat-completion --session >>> hi whats up! Assistant> Not much! How's your day going so far? Is there something I can help you with or would you like to chat? >>> what color is the sky? Assistant> The color of the sky can vary depending on the time of day and atmospheric conditions. Here are some common colors you might see: * During the daytime, when the sun is overhead, the sky typically appears blue. * At sunrise and sunset, the sky can take on hues of red, orange, pink, and purple due to the scattering of light by atmospheric particles. * On a clear day with no clouds, the sky can appear a bright blue, often referred to as "cerulean." * In areas with high levels of pollution or dust, the sky can appear more hazy or grayish. * At night, the sky can be dark and black, although some stars and moonlight can make it visible. So, what's your favorite color of the sky? >>> ``` ## Test Plan tested locally with and without --message Signed-off-by: Charlie Doern <[email protected]>
1 parent 04bfdbe commit ce3b30f

File tree

1 file changed

+51
-13
lines changed

1 file changed

+51
-13
lines changed

src/llama_stack_client/lib/cli/inference/inference.py

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
# This source code is licensed under the terms described in the LICENSE file in
55
# the root directory of this source tree.
66

7-
from typing import Optional
7+
from typing import Optional, List, Dict
8+
import traceback
89

910
import click
1011
from rich.console import Console
@@ -19,30 +20,67 @@ def inference():
1920

2021

2122
@click.command("chat-completion")
@click.option("--message", help="Message")
@click.option("--stream", is_flag=True, help="Streaming", default=False)
@click.option("--session", is_flag=True, help="Start a Chat Session", default=False)
@click.option("--model-id", required=False, help="Model ID")
@click.pass_context
@handle_client_errors("inference chat-completion")
def chat_completion(ctx, message: str, stream: bool, session: bool, model_id: Optional[str]):
    """Run a chat completion against the distribution's inference endpoint.

    With --message, sends a single user message (optionally streamed).
    With --session, drops into an interactive REPL afterwards; the running
    message list is carried into the session so context is preserved.
    At least one of --message / --session is required.
    """
    if not message and not session:
        click.secho(
            "you must specify either --message or --session",
            fg="red",
        )
        raise click.exceptions.Exit(1)
    client = ctx.obj["client"]
    console = Console()

    if not model_id:
        available_models = [model.identifier for model in client.models.list() if model.model_type == "llm"]
        # Fail with a readable error instead of an IndexError when the
        # distribution serves no LLMs.
        if not available_models:
            click.secho("no LLM models available; pass --model-id explicitly", fg="red")
            raise click.exceptions.Exit(1)
        model_id = available_models[0]

    messages = []
    if message:
        messages.append({"role": "user", "content": message})
        # Only issue the one-shot completion when a message was actually
        # provided; with --session alone we must not call the API with an
        # empty messages list (the user is prompted inside the session).
        response = client.inference.chat_completion(
            model_id=model_id,
            messages=messages,
            stream=stream,
        )
        if not stream:
            console.print(response)
        else:
            for event in EventLogger().log(response):
                event.print()
    if session:
        chat_session(client=client, model_id=model_id, messages=messages, console=console)
59+
60+
61+
def chat_session(client, model_id: Optional[str], messages: List[Dict[str, str]], console: Console):
    """Run an interactive chat session with the served model.

    Reads user turns from stdin (prompt ``>>> ``) in a loop, appends each to
    *messages*, and streams the model's reply. Exits on ``\\q`` / ``quit``,
    Ctrl-C, or Ctrl-D. API errors print a traceback and end the session.

    NOTE(review): assistant replies are never appended to *messages*, so the
    accumulated context contains only user turns — confirm whether the
    streamed reply should be collected and added as an "assistant" message.
    """
    while True:
        try:
            message = input(">>> ")
            if message in ("\\q", "quit"):
                console.print("Exiting")
                break
            messages.append({"role": "user", "content": message})
            response = client.inference.chat_completion(
                model_id=model_id,
                messages=messages,
                stream=True,
            )
            for event in EventLogger().log(response):
                event.print()
        # EOFError (Ctrl-D at the prompt) must be handled before the generic
        # Exception branch, which would otherwise dump a traceback at the
        # user; KeyboardInterrupt is a BaseException and needs its own clause.
        except (KeyboardInterrupt, EOFError):
            console.print("\nDetected user interrupt, exiting")
            break
        except Exception as exc:
            traceback.print_exc()
            console.print(f"Error in chat session {exc}")
            break
4684

4785

4886
# Register subcommands

0 commit comments

Comments
 (0)