Commit cd39c41

feat(errors): add LLMServiceError for generic LLM failures and enhance error classification
1 parent 52ea1d8 commit cd39c41

7 files changed: +118 -95 lines changed

backend/application/chat/utilities/error_utils.py

Lines changed: 7 additions & 5 deletions
```diff
@@ -8,7 +8,7 @@
 import logging
 from typing import Any, Dict, List, Optional, Callable, Awaitable, Tuple

-from domain.errors import ValidationError, RateLimitError, LLMTimeoutError, LLMAuthenticationError
+from domain.errors import ValidationError, RateLimitError, LLMTimeoutError, LLMAuthenticationError, LLMServiceError
 from domain.messages.models import MessageType

 logger = logging.getLogger(__name__)
@@ -65,7 +65,9 @@ def classify_llm_error(error: Exception) -> Tuple[type, str, str]:
     Classify LLM errors and return appropriate error type, user message, and log message.

     Returns:
-        Tuple of (error_class, user_message, log_message)
+        Tuple of (error_class, user_message, log_message).
+
+        NOTE: user_message MUST NOT contain raw exception details or sensitive data.
     """
     error_str = str(error)
     error_type_name = type(error).__name__
@@ -88,10 +90,10 @@ def classify_llm_error(error: Exception) -> Tuple[type, str, str]:
         log_msg = f"Authentication error: {error_str}"
         return (LLMAuthenticationError, user_msg, log_msg)

-    # Generic LLM error
-    user_msg = f"The AI service encountered an error. Please try again or contact support if the issue persists."
+    # Generic LLM service error (non-validation)
+    user_msg = "The AI service encountered an error. Please try again or contact support if the issue persists."
     log_msg = f"LLM error: {error_str}"
-    return (ValidationError, user_msg, log_msg)
+    return (LLMServiceError, user_msg, log_msg)


 async def safe_call_llm_with_tools(
```
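The classified tuple is consumed by safe_call_llm_with_tools(), whose signature begins at the end of this hunk. Its body is not part of the diff, but the intended pattern (per the flow diagram further down) is roughly the sketch below; the wrapper's parameters and the call_with_tools() arguments here are assumptions, not the real signature.

```python
# Minimal sketch of the classify-log-raise pattern; not the real wrapper,
# whose signature and parameters differ.
import logging

from application.chat.utilities.error_utils import classify_llm_error

logger = logging.getLogger(__name__)


async def call_llm_safely(llm_caller, model, messages, tools):
    """Call the LLM and convert any failure into a sanitized domain error."""
    try:
        return await llm_caller.call_with_tools(model, messages, tools)
    except Exception as exc:
        error_class, user_msg, log_msg = classify_llm_error(exc)
        logger.error(log_msg)        # full details stay in server logs
        raise error_class(user_msg)  # only the user-safe message propagates
```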

backend/domain/errors.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -46,6 +46,11 @@ class LLMError(DomainError):
     pass


+class LLMServiceError(LLMError):
+    """Generic LLM service failure that is not a validation issue."""
+    pass
+
+
 class ToolError(DomainError):
     """Tool execution error."""
     pass
```
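Because the new class derives from LLMError, which the hunk header shows derives from DomainError, existing handlers that catch these broader bases still see generic LLM failures, while handlers written specifically for ValidationError no longer do. A quick illustration:

```python
from domain.errors import DomainError, LLMError, LLMServiceError, ValidationError

err = LLMServiceError("The AI service encountered an error.")

# Generic LLM failures are still caught by the broader LLM/domain handlers...
assert isinstance(err, LLMError)
assert isinstance(err, DomainError)

# ...but they are no longer ValidationError instances, so validation-specific
# handling is not triggered for them.
assert not isinstance(err, ValidationError)
```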

backend/tests/test_error_classification.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 """Tests for error classification and user-friendly error messages."""

 from application.chat.utilities.error_utils import classify_llm_error
-from domain.errors import RateLimitError, LLMTimeoutError, LLMAuthenticationError, ValidationError
+from domain.errors import RateLimitError, LLMTimeoutError, LLMAuthenticationError, LLMServiceError


 class TestErrorClassification:
@@ -77,7 +77,7 @@ def test_classify_generic_llm_error(self):

         error_class, user_msg, log_msg = classify_llm_error(error)

-        assert error_class == ValidationError
+        assert error_class == LLMServiceError
         assert "error" in user_msg.lower()
         assert "try again" in user_msg.lower() or "contact support" in user_msg.lower()

```
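Put together, the updated generic-error test takes roughly the following shape; the construction of `error` sits above the hunk shown here, so the plain Exception below is an illustrative stand-in.

```python
from application.chat.utilities.error_utils import classify_llm_error
from domain.errors import LLMServiceError


class TestErrorClassification:
    def test_classify_generic_llm_error(self):
        # Stand-in for the error object built earlier in the real test (not shown in the hunk).
        error = Exception("boom: upstream 500 from provider")

        error_class, user_msg, log_msg = classify_llm_error(error)

        assert error_class == LLMServiceError
        assert "error" in user_msg.lower()
        assert "try again" in user_msg.lower() or "contact support" in user_msg.lower()
        # Per the docstring note added above, raw exception text must not leak to the user.
        assert "boom" not in user_msg.lower()
```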
docs/developer/README.md

Lines changed: 2 additions & 6 deletions
```diff
@@ -5,13 +5,9 @@ This guide provides technical details for developers contributing to the Atlas U
 ## Topics

 ### Getting Started
-- [Architecture Overview](architecture.md) - System architecture and design patterns
-- [Development Conventions](conventions.md) - Coding standards and best practices

 ### Building MCP Servers
-- [Creating MCP Servers](creating-mcp-servers.md) - How to build tool servers
-- [Working with Files](working-with-files.md) - File access patterns for tools
-- [Progress Updates](progress-updates.md) - Sending intermediate results to users

 ### Frontend Development
-- [Custom Canvas Renderers](canvas-renderers.md) - Adding support for new file types
+- [Error Handling Improvements](error_handling_improvements.md) - LLM error classification and surfacing
+- [Error Flow Diagram](error_flow_diagram.md) - End-to-end error flow diagram
```
docs/developer/error_flow_diagram.md

Lines changed: 76 additions & 73 deletions

The diff wraps the entire file in a fenced code block (an opening "```markdown" line at the top and a closing "```" at the end) and re-indents the ASCII flow diagram; the flow it documents is otherwise unchanged:

```
# Error Flow Diagram

## Complete Error Handling Flow

USER SENDS MESSAGE
        │
        ▼
WebSocket Handler (main.py)
handle_chat() async function
        │
        ▼
ChatService.handle_chat_message()
(service.py)
        │
        ▼
ChatOrchestrator.execute()
(orchestrator.py)
        │
        ▼
ToolsModeRunner.run()
(modes/tools.py)
        │
        ▼
error_utils.safe_call_llm_with_tools()
(utilities/error_utils.py)
        │
        ▼
LLMCaller.call_with_tools()
(modules/llm/litellm_caller.py)
        │
        ▼
LiteLLM Library
(calls Cerebras/OpenAI/etc.)
        │
   ┌─ SUCCESS (200 OK)
   │
   └─ ERROR (Rate Limit)
        Exception: RateLimitError
        "We're experiencing high traffic right now!"
            │
            ▼
        error_utils.classify_llm_error(exception)
        Returns:
          - error_class: RateLimitError
          - user_msg: "The AI service is experiencing high traffic..."
          - log_msg: Full details
            │
            ▼
        Raise RateLimitError(user_msg)
        │
        ▼  (both paths)
Back to WebSocket Handler (main.py)
Exception Catching
        │
   ┌─ except RateLimitError:
   │      Send to user:
   │      {
   │        type: "error",
   │        message: user-friendly msg,
   │        error_type: "rate_limit"
   │      }
   │
   └─ except LLMTimeoutError / LLMAuth...Error / ValidationError / etc.:
          Send appropriate message to user
        │
        ▼
WebSocket Message Sent
{
  ...
  "error_type": "rate_limit"
}
        │
        ▼
Frontend (websocketHandlers.js)
  ...
    timestamp: new Date().toISOString()
  })
        │
        ▼
UI DISPLAYS ERROR
```

...

4. **Error Type Field**: The `error_type` field allows the frontend to potentially handle different error types differently in the future (e.g., automatic retry for timeouts).

5. **No Sensitive Data Exposure**: API keys, stack traces, and other sensitive information are never sent to the frontend.
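To make the handler-side catch concrete, here is a minimal sketch of the exception-catching step described above. It is not the actual main.py code: the Starlette/FastAPI-style send_json() call, the send_error() helper, and every error_type string except "rate_limit" are assumptions.

```python
# Sketch of the exception-catching step in the WebSocket handler.
# Assumptions: a Starlette/FastAPI-style WebSocket with send_json(),
# a hypothetical send_error() helper, and illustrative error_type strings
# (only "rate_limit" is confirmed by the diagram).
from domain.errors import (
    LLMAuthenticationError,
    LLMServiceError,
    LLMTimeoutError,
    RateLimitError,
    ValidationError,
)


async def send_error(websocket, message: str, error_type: str) -> None:
    """Send a user-facing error payload over the WebSocket."""
    await websocket.send_json({
        "type": "error",
        "message": message,      # sanitized user_msg only, never raw details
        "error_type": error_type,
    })


async def handle_chat(websocket, chat_service, payload) -> None:
    try:
        await chat_service.handle_chat_message(payload)
    except RateLimitError as exc:
        await send_error(websocket, str(exc), "rate_limit")
    except (LLMTimeoutError, LLMAuthenticationError, LLMServiceError, ValidationError) as exc:
        await send_error(websocket, str(exc), "llm_error")
```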
