From 4582939c9cd0bc49402f99ead43892ad0c6a237e Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 2 Nov 2025 05:20:03 +0000 Subject: [PATCH 1/6] added the progress demo back in --- config/overrides/mcp.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/config/overrides/mcp.json b/config/overrides/mcp.json index 2aca381..26a0c6b 100644 --- a/config/overrides/mcp.json +++ b/config/overrides/mcp.json @@ -53,8 +53,15 @@ "description": "Specialized system prompts for AI behavior modification", "author": "Chat UI Team", "short_description": "AI behavior prompts", - "help_email": "support@chatui.example.com", + "help_email": "support@chatui.example.com", "compliance_level": "Public" + }, + "progress_demo": { + "transport": "stdio", + "command": ["python", "main.py"], + "cwd": "backend/mcp/progress_demo", + "groups": ["mcp_basic"], + "compliance_level": "Public" } } From 542dd487cbfaa9ef968f0ac18b7ed8540e245abc Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 2 Nov 2025 05:34:11 +0000 Subject: [PATCH 2/6] chore: add .ruff_cache to .gitignore Add .ruff_cache to .gitignore to exclude Ruff linter cache files from version control, preventing unnecessary commits of generated cache data. --- .gitignore | 2 +- GEMINI.md | 65 +++++++++++++++++++++++++++++++++++++++ docs/config-migration.md | 66 ---------------------------------------- 3 files changed, 66 insertions(+), 67 deletions(-) create mode 100644 GEMINI.md delete mode 100644 docs/config-migration.md diff --git a/.gitignore b/.gitignore index f18bc00..6d5990d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.pptx *.jsonl - +.ruff_cache # Environment variables .env .claude diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 0000000..45380c4 --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,65 @@ +# GEMINI.md + +This file provides guidance to the Gemini AI agent when working with code in this repository. + +## Project Overview + +Atlas UI 3 is a full-stack LLM chat interface with Model Context Protocol (MCP) integration, supporting multiple LLM providers (OpenAI, Anthropic Claude, Google Gemini), RAG, and agentic capabilities. + +**Tech Stack:** +- Backend: FastAPI + WebSockets, LiteLLM, FastMCP +- Frontend: React 19 + Vite 7 + Tailwind CSS +- Python Package Manager: **uv** (NOT pip!) +- Configuration: Pydantic with YAML/JSON configs + +## Building and Running + +### Quick Start (Recommended) +```bash +bash agent_start.sh +``` +This script handles: killing old processes, clearing logs, building frontend, and starting the backend. + +### Manual Development Workflow + +**Frontend Build (CRITICAL):** +```bash +cd frontend +npm install +npm run build # Use build, NOT npm run dev (WebSocket issues) +``` + +**Backend Start:** +```bash +cd backend +python main.py # NEVER use uvicorn --reload (causes problems) +``` + +### Testing + +**Run all tests:** +```bash +./test/run_tests.sh all +``` + +**Individual test suites:** +```bash +./test/run_tests.sh backend +./test/run_tests.sh frontend +./test/run_tests.sh e2e +``` + +## Development Conventions + +- **Python Package Manager**: **ALWAYS use `uv`**, never pip or conda. +- **Frontend Development**: **NEVER use `npm run dev`**, it has WebSocket connection problems. Always use `npm run build`. +- **Backend Development**: **NEVER use `uvicorn --reload`**, it causes problems. +- **File Naming**: Do not use generic names like `utils.py` or `helpers.py`. Use descriptive names that reflect the file's purpose. +- **No Emojis**: No emojis should ever be added in this repo. 
+- **Linting**: Run `ruff check backend/` for Python and `npm run lint` for the frontend before committing. + + +Also read. +/workspaces/atlas-ui-3/.github/copilot-instructions.md + +and CLAUDE.md \ No newline at end of file diff --git a/docs/config-migration.md b/docs/config-migration.md deleted file mode 100644 index 8a6f456..0000000 --- a/docs/config-migration.md +++ /dev/null @@ -1,66 +0,0 @@ -# Configuration & Runtime Directory Migration - -This project migrated to a cleaner separation of code, configuration, and runtime artifacts. - -## New Layout - -``` -config/ - defaults/ # Template / version-controlled baseline configs - overrides/ # Editable overrides (mounted volume / env APP_CONFIG_OVERRIDES) -runtime/ - logs/ # Application logs (JSONL) - feedback/ # User feedback JSON files - uploads/ # Future file uploads -``` - -Legacy directories (still supported for backward compatibility): - -``` -backend/configfiles -> config/defaults -backend/configfilesadmin -> config/overrides -backend/logs -> runtime/logs -feedback (root) -> runtime/feedback -``` - -## Environment Variables - -You can customize locations: - -- `APP_CONFIG_OVERRIDES` (default: `config/overrides`) -- `APP_CONFIG_DEFAULTS` (default: `config/defaults`) -- `RUNTIME_FEEDBACK_DIR` (default: `runtime/feedback`) - -## Code Changes - -- `ConfigManager._search_paths` now searches new directories first, then legacy paths. -- Admin routes seed `config/overrides` from `config/defaults` or legacy dirs if empty. -- Feedback routes use `runtime/feedback` (override with `RUNTIME_FEEDBACK_DIR`). -- MCP tool manager chooses `config/overrides/mcp.json` with legacy fallback. - -## Migration Steps (Already Applied) - -1. Created `config/defaults` and copied existing `backend/configfiles` contents. -2. Created `config/overrides` and copied existing `backend/configfilesadmin` contents. -3. Added new runtime directories: `runtime/logs`, `runtime/feedback`. -4. Updated `.gitignore` to exclude runtime artifacts. -5. Added backward-compatible search paths so no immediate breakage. - -## Next Clean-Up (Optional) - -- Remove legacy `backend/configfiles*` once confident no tooling relies on them. -- Update any deployment manifests / Docker volumes to mount `config/overrides` & `runtime`. -- Document environment variables in main README. - -## Rollback - -If needed, you can restore previous behavior by setting: - -``` -APP_CONFIG_OVERRIDES=backend/configfilesadmin -APP_CONFIG_DEFAULTS=backend/configfiles -RUNTIME_FEEDBACK_DIR=feedback -``` - ---- -Generated on migration date. From 8acc3593620416e6f2a0ece6ea5470a100a5ea81 Mon Sep 17 00:00:00 2001 From: Anthony Date: Sun, 2 Nov 2025 18:18:39 +0000 Subject: [PATCH 3/6] feat: add mock S3 option for simplified development Introduce USE_MOCK_S3 flag in .env to toggle between in-process mock S3 (default) and MinIO via Docker. This simplifies the development workflow by eliminating the need for Docker in local setups, while maintaining production compatibility. Updated documentation and startup script accordingly. 
--- .env.example | 26 +- CLAUDE.md | 37 +- agent_start.sh | 26 +- .../chat/preprocessors/message_builder.py | 13 +- backend/infrastructure/app_factory.py | 8 +- backend/modules/config/manager.py | 1 + .../modules/file_storage/mock_s3_client.py | 380 ++++++++++++++++++ mocks/s3-mock/README.md | 177 ++++++++ mocks/s3-mock/main.py | 214 ++++++++++ mocks/s3-mock/s3_xml.py | 78 ++++ mocks/s3-mock/smoke_test.py | 278 +++++++++++++ mocks/s3-mock/storage.py | 135 +++++++ 12 files changed, 1342 insertions(+), 31 deletions(-) create mode 100644 backend/modules/file_storage/mock_s3_client.py create mode 100644 mocks/s3-mock/README.md create mode 100644 mocks/s3-mock/main.py create mode 100644 mocks/s3-mock/s3_xml.py create mode 100644 mocks/s3-mock/smoke_test.py create mode 100644 mocks/s3-mock/storage.py diff --git a/.env.example b/.env.example index 0e60bb9..8ba9f6c 100644 --- a/.env.example +++ b/.env.example @@ -78,22 +78,28 @@ FEATURE_AGENT_MODE_AVAILABLE=true AGENT_LOOP_STRATEGY=think-act # (Adjust above to stage rollouts. For a bare-bones chat set them all to false.) -APP_LOG_DIR=/workspaces/atlas-ui-3-11/logs +APP_LOG_DIR=/workspaces/atlas-ui-3/logs CAPABILITY_TOKEN_SECRET=blablah ############################################# # S3/MinIO Storage Configuration ############################################# -# MinIO endpoint (use localhost for local dev, minio for docker-compose) -S3_ENDPOINT=http://localhost:9000 -S3_BUCKET_NAME=atlas-files -S3_ACCESS_KEY=minioadmin -S3_SECRET_KEY=minioadmin -S3_REGION=us-east-1 -S3_TIMEOUT=30 - -S3_USE_SSL=false +# Choose ONE option below (comment out the other) + +# --- Option 1: Mock S3 (Default - No Docker required) --- +USE_MOCK_S3=true + +# --- Option 2: MinIO (Requires Docker) --- +# Uncomment below and set USE_MOCK_S3=false to use MinIO +# USE_MOCK_S3=false +# S3_ENDPOINT=http://localhost:9000 +# S3_BUCKET_NAME=atlas-files # Must match bucket created in docker-compose.yml +# S3_ACCESS_KEY=minioadmin +# S3_SECRET_KEY=minioadmin +# S3_REGION=us-east-1 +# S3_TIMEOUT=30 +# S3_USE_SSL=false SECURITY_CSP_VALUE="default-src 'self'; img-src 'self' data: blob:; script-src 'self'; style-src 'self' 'unsafe-inline'; connect-src 'self'; frame-src 'self' blob: data:; frame-ancestors 'self'" diff --git a/CLAUDE.md b/CLAUDE.md index fa64b0c..3608881 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -61,12 +61,16 @@ mkdir -p logs ```bash bash agent_start.sh ``` -This script handles: killing old processes, clearing logs, building frontend, starting mock S3, and starting backend. +This script handles: killing old processes, clearing logs, building frontend, starting S3 storage (MinIO or Mock based on `USE_MOCK_S3` in `.env`), and starting backend. **Options:** - `bash agent_start.sh -f` - Only rebuild frontend - `bash agent_start.sh -b` - Only restart backend +**Note:** The script automatically reads `USE_MOCK_S3` from `.env`: +- If `true`: Uses in-process Mock S3 (no Docker) +- If `false`: Starts MinIO via docker-compose + ### Manual Development Workflow **Frontend Build (CRITICAL):** @@ -82,11 +86,24 @@ cd backend python main.py # NEVER use uvicorn --reload (causes problems) ``` -**Mock S3 (Optional):** -```bash -cd mocks/s3-mock -python main.py # Runs on http://127.0.0.1:8003 -``` +**S3 Storage (Mock vs MinIO):** + +The project supports two S3 storage backends: + +1. 
**Mock S3 (Default, Recommended for Development)** + - Set `USE_MOCK_S3=true` in `.env` + - Uses in-process FastAPI TestClient (no Docker required) + - Files stored in `minio-data/chatui/` on disk + - No external server needed - integrated directly into backend + - Faster startup, simpler development workflow + +2. **MinIO (Production-like)** + - Set `USE_MOCK_S3=false` in `.env` + - Requires Docker: `docker-compose up -d minio minio-init` + - Full S3 compatibility with all features + - Use for testing production-like scenarios + +The mock automatically activates when the backend starts if `USE_MOCK_S3=true`. ### Testing @@ -246,9 +263,11 @@ Three agent loop strategies implement different reasoning patterns: - **Act** (`backend/application/chat/agent/act_loop.py`): Pure action loop without explicit reasoning steps, fastest with minimal overhead. LLM calls tools directly and signals completion via the "finished" tool ### File Storage -S3-compatible storage via `backend/modules/file_storage/s3_client.py`: -- Production: Real S3 or S3-compatible service -- Development: Mock S3 (`mocks/s3-mock/`) +S3-compatible storage via `backend/modules/file_storage/`: +- Production/MinIO: `s3_client.py` - boto3-based client for real S3/MinIO +- Development: `mock_s3_client.py` - TestClient-based in-process mock (no Docker) +- Controlled by `USE_MOCK_S3` env var (default: true) +- Both implementations share the same interface ### Security Middleware Stack ``` diff --git a/agent_start.sh b/agent_start.sh index bcaf53c..8d6cf67 100755 --- a/agent_start.sh +++ b/agent_start.sh @@ -20,14 +20,26 @@ done # Configuration USE_NEW_FRONTEND=${USE_NEW_FRONTEND:-true} -# Check if MinIO is running -if ! docker ps | grep -q atlas-minio; then - echo "⚠️ MinIO is not running. Starting MinIO with docker-compose..." - docker-compose up -d minio minio-init - echo "✅ MinIO started successfully" - sleep 3 +# Read USE_MOCK_S3 from .env file +if [ -f .env ]; then + USE_MOCK_S3=$(grep -E "^USE_MOCK_S3=" .env | cut -d '=' -f2) else - echo "✅ MinIO is already running" + USE_MOCK_S3="true" # Default to mock if no .env +fi + +# Only start MinIO if not using mock S3 +if [ "$USE_MOCK_S3" = "true" ]; then + echo "✅ Using Mock S3 (no Docker required)" +else + # Check if MinIO is running + if ! docker ps | grep -q atlas-minio; then + echo "⚠️ MinIO is not running. Starting MinIO with docker-compose..." + docker-compose up -d minio minio-init + echo "✅ MinIO started successfully" + sleep 3 + else + echo "✅ MinIO is already running" + fi fi # Kill any running uvicorn processes (skip if only rebuilding frontend) diff --git a/backend/application/chat/preprocessors/message_builder.py b/backend/application/chat/preprocessors/message_builder.py index a2f306e..bc0f95e 100644 --- a/backend/application/chat/preprocessors/message_builder.py +++ b/backend/application/chat/preprocessors/message_builder.py @@ -41,22 +41,27 @@ async def build_messages( ) -> List[Dict[str, Any]]: """ Build messages array from session history and context. 
- + Args: session: Current chat session include_files_manifest: Whether to append files manifest - + Returns: List of messages ready for LLM call """ # Get conversation history from session messages = session.history.get_messages_for_llm() - + # Optionally add files manifest if include_files_manifest: session_context = build_session_context(session) + files_in_context = session_context.get("files", {}) + logger.info(f"DEBUG: Session has {len(files_in_context)} files: {list(files_in_context.keys())}") files_manifest = file_utils.build_files_manifest(session_context) if files_manifest: + logger.info(f"DEBUG: Adding files manifest to messages: {files_manifest['content'][:100]}") messages.append(files_manifest) - + else: + logger.warning("DEBUG: No files manifest generated despite include_files_manifest=True") + return messages diff --git a/backend/infrastructure/app_factory.py b/backend/infrastructure/app_factory.py index a56d1da..46bd386 100644 --- a/backend/infrastructure/app_factory.py +++ b/backend/infrastructure/app_factory.py @@ -7,6 +7,7 @@ from interfaces.transport import ChatConnectionProtocol from modules.config import ConfigManager from modules.file_storage import S3StorageClient, FileManager +from modules.file_storage.mock_s3_client import MockS3StorageClient from modules.llm.litellm_caller import LiteLLMCaller from modules.mcp_tools import MCPToolManager from modules.rag import RAGClient @@ -37,7 +38,12 @@ def __init__(self) -> None: ) # File storage & manager - self.file_storage = S3StorageClient() + if self.config_manager.app_settings.use_mock_s3: + logger.info("Using MockS3StorageClient (in-process, no Docker required)") + self.file_storage = MockS3StorageClient() + else: + logger.info("Using S3StorageClient (MinIO/AWS S3)") + self.file_storage = S3StorageClient() self.file_manager = FileManager(self.file_storage) logger.info("AppFactory initialized") diff --git a/backend/modules/config/manager.py b/backend/modules/config/manager.py index 9b5698a..f21810a 100644 --- a/backend/modules/config/manager.py +++ b/backend/modules/config/manager.py @@ -127,6 +127,7 @@ def agent_mode_available(self) -> bool: test_user: str = "test@test.com" # Test user for development # S3/MinIO storage settings + use_mock_s3: bool = False # Use in-process S3 mock (no Docker required) s3_endpoint: str = "http://localhost:9000" s3_bucket_name: str = "atlas-files" s3_access_key: str = "minioadmin" diff --git a/backend/modules/file_storage/mock_s3_client.py b/backend/modules/file_storage/mock_s3_client.py new file mode 100644 index 0000000..35e6867 --- /dev/null +++ b/backend/modules/file_storage/mock_s3_client.py @@ -0,0 +1,380 @@ +""" +Mock S3 Storage Client using FastAPI TestClient. + +This client provides the same interface as S3StorageClient but uses the +S3 mock server via TestClient, eliminating the need for Docker/MinIO in development. 
+""" + +import base64 +import hashlib +import logging +import time +import uuid +from typing import Dict, List, Optional, Any + +from core.utils import sanitize_for_logging + + +logger = logging.getLogger(__name__) + + +class MockS3StorageClient: + """Mock S3 client using FastAPI TestClient for in-process testing.""" + + def __init__( + self, + s3_bucket_name: str = None, + ): + """Initialize the mock S3 client.""" + from modules.config import config_manager + + self.bucket_name = s3_bucket_name or config_manager.app_settings.s3_bucket_name + self.endpoint_url = "in-process-mock" # For health check compatibility + self.region = "us-east-1" # For health check compatibility + self._client = None # Lazy initialization + + logger.info(f"MockS3StorageClient initialized with bucket: {self.bucket_name}") + + @property + def client(self): + """Lazy-load the TestClient to avoid circular imports.""" + if self._client is None: + from fastapi.testclient import TestClient + import sys + import importlib.util + from pathlib import Path + + # Get the S3 mock path + mock_path = Path(__file__).parent.parent.parent.parent / "mocks" / "s3-mock" + main_py_path = mock_path / "main.py" + + # Add mock directory to sys.path for relative imports (storage, s3_xml) + mock_path_str = str(mock_path) + if mock_path_str not in sys.path: + sys.path.insert(0, mock_path_str) + + # Import the main.py module explicitly with a unique name + spec = importlib.util.spec_from_file_location("s3_mock_app", main_py_path) + s3_mock_module = importlib.util.module_from_spec(spec) + + # Add to sys.modules so relative imports work + sys.modules['s3_mock_app'] = s3_mock_module + spec.loader.exec_module(s3_mock_module) + + self._client = TestClient(s3_mock_module.get_app()) + logger.info("TestClient for S3 mock initialized") + + return self._client + + def _generate_s3_key(self, user_email: str, filename: str, source_type: str = "user") -> str: + """Generate an S3-style key with user isolation.""" + timestamp = int(time.time()) + unique_id = str(uuid.uuid4())[:8] + safe_filename = filename.replace(" ", "_").replace("/", "_") + + if source_type == "tool": + return f"users/{user_email}/generated/{timestamp}_{unique_id}_{safe_filename}" + else: + return f"users/{user_email}/uploads/{timestamp}_{unique_id}_{safe_filename}" + + def _calculate_etag(self, content_bytes: bytes) -> str: + """Calculate ETag for file content.""" + return hashlib.md5(content_bytes).hexdigest() + + async def upload_file( + self, + user_email: str, + filename: str, + content_base64: str, + content_type: str = "application/octet-stream", + tags: Optional[Dict[str, str]] = None, + source_type: str = "user" + ) -> Dict[str, Any]: + """ + Upload a file to mock S3 storage. 
+ + Args: + user_email: Email of the user uploading the file + filename: Original filename + content_base64: Base64 encoded file content + content_type: MIME type of the file + tags: Additional metadata tags + source_type: Type of file ("user" or "tool") + + Returns: + Dictionary containing file metadata including the S3 key + """ + try: + # Decode base64 content + content_bytes = base64.b64decode(content_base64) + + # Generate S3 key + s3_key = self._generate_s3_key(user_email, filename, source_type) + + # Prepare tags + file_tags = tags or {} + file_tags["source"] = source_type + file_tags["user_email"] = user_email + file_tags["original_filename"] = filename + + # Convert tags to query param format + tag_param = "&".join([f"{k}={v}" for k, v in file_tags.items()]) + + # Upload via TestClient + headers = { + "Content-Type": content_type, + "x-amz-meta-user_email": user_email, + "x-amz-meta-original_filename": filename, + "x-amz-meta-source_type": source_type + } + + response = self.client.put( + f"/{self.bucket_name}/{s3_key}", + content=content_bytes, + headers=headers, + params={"tagging": tag_param} + ) + + if response.status_code != 200: + raise Exception(f"Upload failed: {response.text}") + + etag = response.headers.get("ETag", "").strip('"') + + result = { + "key": s3_key, + "filename": filename, + "size": len(content_bytes), + "content_type": content_type, + "last_modified": None, # Mock doesn't need exact timestamp + "etag": etag, + "tags": file_tags, + "user_email": user_email + } + + logger.info(f"File uploaded successfully: {sanitize_for_logging(s3_key)} for user {sanitize_for_logging(user_email)}") + return result + + except Exception as e: + logger.error(f"Error uploading file to mock S3: {str(e)}") + raise + + async def get_file(self, user_email: str, file_key: str) -> Dict[str, Any]: + """ + Get a file from mock S3 storage. 
+ + Args: + user_email: Email of the user requesting the file + file_key: S3 key of the file to retrieve + + Returns: + Dictionary containing file data and metadata + """ + try: + # Verify user has access to this file + if not file_key.startswith(f"users/{user_email}/"): + logger.warning(f"Access denied: {sanitize_for_logging(user_email)} attempted to access {sanitize_for_logging(file_key)}") + raise Exception("Access denied to file") + + # Get object via TestClient + response = self.client.get(f"/{self.bucket_name}/{file_key}") + + if response.status_code == 404: + logger.warning(f"File not found: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return None + + if response.status_code != 200: + raise Exception(f"Get failed: {response.text}") + + # Read file content + content_bytes = response.content + content_base64 = base64.b64encode(content_bytes).decode() + + # Get tags + tags_response = self.client.get(f"/{self.bucket_name}/{file_key}", params={"tagging": ""}) + tags = {} + if tags_response.status_code == 200: + # Parse XML tags (simplified - just extract from response) + import xml.etree.ElementTree as ET + try: + root = ET.fromstring(tags_response.text) + for tag_elem in root.findall(".//Tag"): + key_elem = tag_elem.find("Key") + value_elem = tag_elem.find("Value") + if key_elem is not None and value_elem is not None: + tags[key_elem.text] = value_elem.text + except: + pass + + # Extract filename from metadata headers + filename = response.headers.get("x-amz-meta-original_filename", file_key.split('/')[-1]) + + result = { + "key": file_key, + "filename": filename, + "content_base64": content_base64, + "content_type": response.headers.get("Content-Type", "application/octet-stream"), + "size": len(content_bytes), + "last_modified": None, + "etag": response.headers.get("ETag", "").strip('"'), + "tags": tags + } + + logger.info(f"File retrieved successfully: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return result + + except Exception as e: + logger.error(f"Error getting file from mock S3: {str(e)}") + raise + + async def list_files( + self, + user_email: str, + file_type: Optional[str] = None, + limit: int = 100 + ) -> List[Dict[str, Any]]: + """ + List files for a user. 
+ + Args: + user_email: Email of the user + file_type: Optional filter by file type ("user" or "tool") + limit: Maximum number of files to return + + Returns: + List of file metadata dictionaries + """ + try: + # Determine prefix + prefix = f"users/{user_email}/" + if file_type == "tool": + prefix = f"users/{user_email}/generated/" + elif file_type == "user": + prefix = f"users/{user_email}/uploads/" + + # List via TestClient + response = self.client.get( + f"/{self.bucket_name}", + params={"list-type": "2", "prefix": prefix, "max-keys": str(limit)} + ) + + if response.status_code != 200: + raise Exception(f"List failed: {response.text}") + + # Parse XML response + import xml.etree.ElementTree as ET + root = ET.fromstring(response.text) + ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'} + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + files = [] + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + size_elem = content.find("s3:Size", ns) or content.find("Size") + etag_elem = content.find("s3:ETag", ns) or content.find("ETag") + + if key_elem is None: + continue + + key = key_elem.text + size = int(size_elem.text) if size_elem is not None else 0 + etag = etag_elem.text.strip('"') if etag_elem is not None else "" + + # Get metadata via HEAD + head_response = self.client.head(f"/{self.bucket_name}/{key}") + filename = head_response.headers.get("x-amz-meta-original_filename", key.split('/')[-1]) + content_type = head_response.headers.get("Content-Type", "application/octet-stream") + + files.append({ + "key": key, + "filename": filename, + "size": size, + "content_type": content_type, + "last_modified": None, + "etag": etag, + "tags": {}, + "user_email": user_email + }) + + logger.info(f"Listed {len(files)} files for user {sanitize_for_logging(user_email)}") + return files + + except Exception as e: + logger.error(f"Error listing files from mock S3: {str(e)}") + raise + + async def delete_file(self, user_email: str, file_key: str) -> bool: + """ + Delete a file from mock S3 storage. + + Args: + user_email: Email of the user deleting the file + file_key: S3 key of the file to delete + + Returns: + True if deletion was successful + """ + try: + # Verify user has access to this file + if not file_key.startswith(f"users/{user_email}/"): + logger.warning(f"Access denied for deletion: {sanitize_for_logging(user_email)} attempted to delete {sanitize_for_logging(file_key)}") + raise Exception("Access denied to delete file") + + # Delete via TestClient + response = self.client.delete(f"/{self.bucket_name}/{file_key}") + + if response.status_code == 404: + logger.warning(f"File not found for deletion: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return False + + if response.status_code != 204: + raise Exception(f"Delete failed: {response.text}") + + logger.info(f"File deleted successfully: {sanitize_for_logging(file_key)} for user {sanitize_for_logging(user_email)}") + return True + + except Exception as e: + logger.error(f"Error deleting file from mock S3: {str(e)}") + raise + + async def get_user_stats(self, user_email: str) -> Dict[str, Any]: + """ + Get file statistics for a user. 
+ + Args: + user_email: Email of the user + + Returns: + Dictionary containing file statistics + """ + try: + # List all user files + files = await self.list_files(user_email, limit=1000) + + total_size = 0 + upload_count = 0 + generated_count = 0 + + for file_data in files: + total_size += file_data['size'] + + # Determine type from key path + if "/generated/" in file_data['key']: + generated_count += 1 + else: + upload_count += 1 + + result = { + "total_files": len(files), + "total_size": total_size, + "upload_count": upload_count, + "generated_count": generated_count + } + + logger.info(f"Got file stats for user {sanitize_for_logging(user_email)}: {result}") + return result + + except Exception as e: + logger.error(f"Error getting user stats from mock S3: {str(e)}") + raise diff --git a/mocks/s3-mock/README.md b/mocks/s3-mock/README.md new file mode 100644 index 0000000..01af194 --- /dev/null +++ b/mocks/s3-mock/README.md @@ -0,0 +1,177 @@ +# S3 Mock Server + +A minimal FastAPI server that mimics S3's REST API for development and testing purposes. Compatible with botocore/boto3 expectations. + +## Features + +- **Path-style addressing**: `http://localhost:9001/{bucket}/{key}` +- **S3v4 signed requests**: Accepts but doesn't validate signatures +- **Persistent storage**: Data survives server restarts +- **XML responses**: Compatible with botocore parsing +- **Minimal S3 operations**: PUT, GET, HEAD, DELETE, ListObjectsV2, Tagging + +## Supported Operations + +### Object Operations +- `PUT /{bucket}/{key}` - Upload object with optional metadata and tagging +- `GET /{bucket}/{key}` - Download object +- `HEAD /{bucket}/{key}` - Get object metadata +- `DELETE /{bucket}/{key}` - Delete object + +### Listing Operations +- `GET /{bucket}?list-type=2&prefix=...` - List objects V2 + +### Tagging Operations +- `GET /{bucket}/{key}?tagging` - Get object tags +- `PUT /{bucket}/{key}?tagging` - Set object tags + +## Storage + +- **Root directory**: `minio-data/chatui/` (configurable via `MOCK_S3_ROOT`) +- **Bucket structure**: `{root}/{bucket}/` +- **Object files**: `{bucket}/{key}` (data) +- **Metadata files**: `{bucket}/{key}.meta.json` (content-type, etag, metadata, tags) +- **Tag files**: Tags stored in metadata JSON + +## Running the Server + +### Prerequisites +```bash +pip install fastapi uvicorn +``` + +### Start the server +```bash +cd mocks/s3-mock +python main.py +``` + +Server runs on `http://localhost:9001` by default. Configure port with `PORT` environment variable. 
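### Quick sanity check (optional)

With the server running, a minimal boto3 round trip can confirm it responds. This is a sketch, not part of the test suite: it assumes boto3 is installed locally and reuses the endpoint, path-style addressing, bucket name, and placeholder credentials from the configuration below (any credentials work because signatures are not validated).

```python
import boto3
from botocore.config import Config

# Development-only sanity check against the local mock; path-style addressing is required.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9001",
    aws_access_key_id="any_value",
    aws_secret_access_key="any_value",
    region_name="us-east-1",
    config=Config(s3={"addressing_style": "path"}),
)

# Round-trip a small object, then clean it up.
s3.put_object(Bucket="atlas-files", Key="hello.txt", Body=b"hello", ContentType="text/plain")
print(s3.get_object(Bucket="atlas-files", Key="hello.txt")["Body"].read())
s3.delete_object(Bucket="atlas-files", Key="hello.txt")
```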
+ +## Backend Configuration + +Configure your backend to use the mock server: + +```bash +# Environment variables +S3_ENDPOINT_URL=http://localhost:9001 +S3_REGION=us-east-1 +S3_BUCKET=atlas-files +S3_USE_SSL=false +S3_ADDRESSING_STYLE=path +S3_ACCESS_KEY_ID=any_value +S3_SECRET_ACCESS_KEY=any_value +``` + +## Testing + +Use the included smoke test script to verify functionality: + +```bash +cd mocks/s3-mock +python smoke_test.py +``` + +The test performs: +- PUT object with metadata and tags +- GET object and verify content/metadata/ETag +- HEAD object and verify headers +- List objects with prefix filtering +- Get/set object tagging +- DELETE object and verify cleanup + +## API Details + +### PUT Object +``` +PUT /{bucket}/{key} +Content-Type: +x-amz-meta-*: +?tagging=key1%3Dvalue1%26key2%3Dvalue2 + +Body: + +Response: 200 OK +ETag: "" +``` + +### GET Object +``` +GET /{bucket}/{key} + +Response: 200 OK +Content-Type: +ETag: "" +x-amz-meta-*: +Body: +``` + +### List Objects V2 +``` +GET /{bucket}?list-type=2&prefix=&max-keys= + +Response: 200 OK +Content-Type: application/xml + + + bucket + prefix + n + 1000 + false + + object-key + 2025-11-02T10:00:00.000Z + "md5hex" + 123 + STANDARD + + +``` + +### Tagging +``` +GET /{bucket}/{key}?tagging + +Response: 200 OK +Content-Type: application/xml + + + + + key1 + value1 + + + +``` + +## Error Responses + +Returns S3-compatible XML error responses: + +```xml + + NoSuchKey + The specified key does not exist. + /{bucket}/{key} + +``` + +## Limitations + +- No multipart upload support +- No versioning, ACLs, or presigned URLs +- No signature verification (accepts all auth headers) +- Virtual-hosted-style addressing not supported +- Single PUT operations only (no chunked uploads) +- Basic prefix filtering for listing (no delimiter support) + +## Development + +The mock server is structured as: + +- `main.py` - FastAPI application and routes +- `storage.py` - Filesystem storage operations +- `s3_xml.py` - XML generation and parsing +- `README.md` - This documentation diff --git a/mocks/s3-mock/main.py b/mocks/s3-mock/main.py new file mode 100644 index 0000000..52bafde --- /dev/null +++ b/mocks/s3-mock/main.py @@ -0,0 +1,214 @@ +import os +import urllib.parse +from pathlib import Path +from typing import Dict, Optional + +from fastapi import FastAPI, HTTPException, Request, Response +from fastapi.responses import StreamingResponse +import uvicorn + +from storage import ( + ensure_bucket, load_object, save_object, delete_object, + list_objects, get_tags, set_tags, load_meta +) +from s3_xml import ( + create_list_objects_xml, create_tagging_xml, parse_tagging_xml, create_error_xml +) + +app = FastAPI(title="S3 Mock Server") + + +@app.middleware("http") +async def log_requests(request: Request, call_next): + """Middleware to log when the S3 mock is hit.""" + print(f"S3 Mock hit: {request.method} {request.url.path}") + response = await call_next(request) + return response + + +def get_app(): + """Get the FastAPI app instance for testing.""" + return app + +# Configuration +MOCK_S3_ROOT = Path(os.getenv("MOCK_S3_ROOT", "minio-data/chatui")) + + +def get_bucket_root(bucket: str) -> Path: + """Get the root path for a bucket.""" + return ensure_bucket(MOCK_S3_ROOT, bucket) + + +def extract_metadata(headers: Dict[str, str]) -> Dict[str, str]: + """Extract x-amz-meta-* headers.""" + metadata = {} + for key, value in headers.items(): + if key.lower().startswith("x-amz-meta-"): + meta_key = key[11:].lower() # Remove "x-amz-meta-" prefix + metadata[meta_key] = 
value + return metadata + + +def add_metadata_headers(response: Response, metadata: Dict[str, str]): + """Add metadata headers to response.""" + for key, value in metadata.items(): + response.headers[f"x-amz-meta-{key}"] = value + + +@app.put("/{bucket}/{key:path}") +async def put_object(bucket: str, key: str, request: Request): + """PUT Object endpoint.""" + try: + bucket_root = get_bucket_root(bucket) + body = await request.body() + + # Extract content type + content_type = request.headers.get("content-type", "application/octet-stream") + + # Extract metadata + metadata = extract_metadata(dict(request.headers)) + + # Extract tagging from query params + tags = {} + tagging_param = request.query_params.get("tagging") + if tagging_param: + # Parse URL-encoded tags: key1=value1&key2=value2 + tag_pairs = urllib.parse.parse_qs(tagging_param, keep_blank_values=True) + for tag_key, tag_values in tag_pairs.items(): + if tag_values: + tags[tag_key] = tag_values[0] + + # Save object + meta = save_object(bucket_root, key, body, content_type, metadata, tags) + + # Return response with ETag + response = Response(status_code=200) + response.headers["ETag"] = f'"{meta["etag"]}"' + return response + + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/{bucket}/{key:path}") +async def get_object(bucket: str, key: str, request: Request): + """GET Object endpoint.""" + bucket_root = get_bucket_root(bucket) + + # Check if this is a tagging request + if request.query_params.get("tagging"): + # Check if object exists + if load_meta(bucket_root, key) is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml, media_type="application/xml") + + tags = get_tags(bucket_root, key) + xml_response = create_tagging_xml(tags) + return Response(content=xml_response, media_type="application/xml") + + # Regular object GET + result = load_object(bucket_root, key) + if result is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml) + + data, meta = result + + # Create streaming response + def iter_data(): + yield data + + response = StreamingResponse(iter_data(), media_type=meta.get("content_type", "application/octet-stream")) + response.headers["ETag"] = f'"{meta["etag"]}"' + response.headers["Content-Type"] = meta.get("content_type", "application/octet-stream") + + # Add metadata headers + add_metadata_headers(response, meta.get("metadata", {})) + + return response + + +@app.head("/{bucket}/{key:path}") +async def head_object(bucket: str, key: str): + """HEAD Object endpoint.""" + bucket_root = get_bucket_root(bucket) + result = load_object(bucket_root, key) + if result is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml) + + data, meta = result + + response = Response(status_code=200) + response.headers["ETag"] = f'"{meta["etag"]}"' + response.headers["Content-Type"] = meta.get("content_type", "application/octet-stream") + + # Add metadata headers + add_metadata_headers(response, meta.get("metadata", {})) + + return response + + +@app.delete("/{bucket}/{key:path}") +async def delete_object_endpoint(bucket: str, key: str): + """DELETE Object endpoint.""" + bucket_root = get_bucket_root(bucket) + deleted = delete_object(bucket_root, key) + if not deleted: + 
error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml) + + return Response(status_code=204) + + +@app.get("/{bucket}") +async def list_objects_v2(bucket: str, request: Request): + """List Objects V2 endpoint.""" + bucket_root = get_bucket_root(bucket) + + # Check if bucket exists (has any objects) + if not any(bucket_root.rglob("*")): + error_xml = create_error_xml("NoSuchBucket", "The specified bucket does not exist.", f"/{bucket}") + raise HTTPException(status_code=404, detail=error_xml) + + prefix = request.query_params.get("prefix", "") + max_keys = int(request.query_params.get("max-keys", "1000")) + + objects = list_objects(bucket_root, prefix, max_keys) + xml_response = create_list_objects_xml(bucket, prefix, objects) + + return Response(content=xml_response, media_type="application/xml") + + +@app.put("/{bucket}/{key:path}") +async def put_object_tagging(bucket: str, key: str, request: Request): + """PUT Object Tagging endpoint.""" + bucket_root = get_bucket_root(bucket) + + # Check if object exists + if load_meta(bucket_root, key) is None: + error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}") + raise HTTPException(status_code=404, detail=error_xml) + + try: + body = await request.body() + xml_content = body.decode("utf-8") + tags = parse_tagging_xml(xml_content) + set_tags(bucket_root, key, tags) + return Response(status_code=200) + except ValueError: + error_xml = create_error_xml("MalformedXML", "The XML you provided was not well-formed or did not validate against the published schema.", f"/{bucket}/{key}?tagging") + raise HTTPException(status_code=400, detail=error_xml) + + +@app.exception_handler(HTTPException) +async def http_exception_handler(request: Request, exc: HTTPException): + """Handle HTTP exceptions with XML error responses.""" + if exc.detail and exc.detail.startswith(""): + return Response(content=exc.detail, media_type="application/xml", status_code=exc.status_code) + return Response(content=str(exc.detail), status_code=exc.status_code) + + +if __name__ == "__main__": + port = int(os.getenv("PORT", "9002")) + uvicorn.run(app, host="0.0.0.0", port=port) diff --git a/mocks/s3-mock/s3_xml.py b/mocks/s3-mock/s3_xml.py new file mode 100644 index 0000000..e1188f0 --- /dev/null +++ b/mocks/s3-mock/s3_xml.py @@ -0,0 +1,78 @@ +import xml.etree.ElementTree as ET +from typing import Dict, List +from xml.dom import minidom + + +def create_list_objects_xml(bucket: str, prefix: str, objects: List[Dict[str, str]], is_truncated: bool = False, continuation_token: str = "") -> str: + """Generate S3 ListObjectsV2 XML response.""" + root = ET.Element("ListBucketResult", xmlns="http://s3.amazonaws.com/doc/2006-03-01/") + + ET.SubElement(root, "Name").text = bucket + ET.SubElement(root, "Prefix").text = prefix + ET.SubElement(root, "KeyCount").text = str(len(objects)) + ET.SubElement(root, "MaxKeys").text = "1000" + ET.SubElement(root, "IsTruncated").text = "true" if is_truncated else "false" + if continuation_token: + ET.SubElement(root, "ContinuationToken").text = continuation_token + + for obj in objects: + contents = ET.SubElement(root, "Contents") + ET.SubElement(contents, "Key").text = obj["Key"] + ET.SubElement(contents, "LastModified").text = obj["LastModified"] + ET.SubElement(contents, "ETag").text = f'"{obj["ETag"]}"' + ET.SubElement(contents, "Size").text = str(obj["Size"]) + ET.SubElement(contents, "StorageClass").text = 
"STANDARD" + + # Pretty print XML + rough_string = ET.tostring(root, encoding='unicode') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") + + +def create_tagging_xml(tags: Dict[str, str]) -> str: + """Generate S3 Tagging XML response.""" + root = ET.Element("Tagging") + tag_set = ET.SubElement(root, "TagSet") + + for key, value in tags.items(): + tag = ET.SubElement(tag_set, "Tag") + ET.SubElement(tag, "Key").text = key + ET.SubElement(tag, "Value").text = value + + # Pretty print XML + rough_string = ET.tostring(root, encoding='unicode') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") + + +def parse_tagging_xml(xml_content: str) -> Dict[str, str]: + """Parse S3 Tagging XML input.""" + try: + root = ET.fromstring(xml_content) + tags = {} + tag_set = root.find("TagSet") + if tag_set is not None: + for tag in tag_set.findall("Tag"): + key_elem = tag.find("Key") + value_elem = tag.find("Value") + if key_elem is not None and value_elem is not None: + key = key_elem.text or "" + value = value_elem.text or "" + tags[key] = value + return tags + except ET.ParseError: + raise ValueError("Malformed XML") + + +def create_error_xml(code: str, message: str, resource: str = "") -> str: + """Generate S3 error XML response.""" + root = ET.Element("Error") + ET.SubElement(root, "Code").text = code + ET.SubElement(root, "Message").text = message + if resource: + ET.SubElement(root, "Resource").text = resource + + # Pretty print XML + rough_string = ET.tostring(root, encoding='unicode') + reparsed = minidom.parseString(rough_string) + return reparsed.toprettyxml(indent=" ") diff --git a/mocks/s3-mock/smoke_test.py b/mocks/s3-mock/smoke_test.py new file mode 100644 index 0000000..2956172 --- /dev/null +++ b/mocks/s3-mock/smoke_test.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Smoke test for S3 Mock Server using FastAPI TestClient. +Tests all supported S3 operations to ensure compatibility. +""" + +import hashlib +import time +import os +from pathlib import Path +from fastapi.testclient import TestClient +from main import get_app + +# Test configuration +BUCKET = "test-bucket" + +def create_test_client(): + """Create FastAPI TestClient for direct testing.""" + app = get_app() + return TestClient(app) + +def test_put_get_object(): + """Test PUT and GET object operations.""" + print("Testing PUT/GET object...") + + client = create_test_client() + key = "test-file.txt" + test_data = b"Hello, S3 Mock!" 
+ metadata = {"author": "test", "version": "1.0"} + tags = "environment=test&type=smoke" + + # PUT object + headers = { + "Content-Type": "text/plain", + "x-amz-meta-author": "test", + "x-amz-meta-version": "1.0" + } + response = client.put( + f"/{BUCKET}/{key}", + content=test_data, + headers=headers, + params={"tagging": tags} + ) + assert response.status_code == 200 + etag = response.headers.get("ETag") + print(f" PUT successful, ETag: {etag}") + + # GET object + response = client.get(f"/{BUCKET}/{key}") + assert response.status_code == 200 + assert response.content == test_data, "Data mismatch" + assert response.headers["Content-Type"] == "text/plain", "Content-Type mismatch" + assert response.headers.get("ETag") == etag, "ETag mismatch" + assert response.headers.get("x-amz-meta-author") == "test", "Metadata mismatch" + assert response.headers.get("x-amz-meta-version") == "1.0", "Metadata mismatch" + + print(" GET successful, data/metadata verified") + +def test_head_object(): + """Test HEAD object operation.""" + print("Testing HEAD object...") + + client = create_test_client() + key = "test-file.txt" + + response = client.head(f"/{BUCKET}/{key}") + assert response.status_code == 200 + + assert response.headers["Content-Type"] == "text/plain", "Content-Type mismatch" + assert 'ETag' in response.headers, "ETag missing" + assert response.headers.get("x-amz-meta-author") == "test", "Metadata mismatch" + + print(" HEAD successful, headers verified") + +def test_list_objects(): + """Test ListObjectsV2 operation.""" + print("Testing ListObjectsV2...") + + client = create_test_client() + + # Create test objects for this test + test_objects = ["list-test-file.txt", "folder/file0.txt", "folder/file1.txt", "folder/file2.txt"] + + for obj_key in test_objects: + response = client.put( + f"/{BUCKET}/{obj_key}", + content=f"Content for {obj_key}".encode(), + headers={"Content-Type": "text/plain"} + ) + assert response.status_code == 200 + + # List all objects + response = client.get(f"/{BUCKET}", params={"list-type": "2"}) + assert response.status_code == 200 + + # Parse XML response + import xml.etree.ElementTree as ET + root = ET.fromstring(response.text) + + # Handle XML namespace + ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'} + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + assert len(contents) >= 4, f"Not all objects listed, found {len(contents)}" + + # Check that our test files are present + keys = [] + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + if key_elem is not None and key_elem.text: + keys.append(key_elem.text) + + print(f" Expected keys: {test_objects}") + print(f" Found keys: {keys}") + + for obj_key in test_objects: + assert obj_key in keys, f"Test file {obj_key} not in listing" + + print(f" Listed {len(contents)} objects") + + # Test prefix filtering + response = client.get(f"/{BUCKET}", params={"list-type": "2", "prefix": "folder/"}) + assert response.status_code == 200 + + root = ET.fromstring(response.text) + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + assert len(contents) == 3, f"Prefix filtering failed, found {len(contents)}" + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + key = key_elem.text if key_elem is not None else "" + assert key.startswith("folder/"), f"Prefix filter not working for {key}" + + print(" Prefix filtering works correctly") + 
+def test_tagging(): + """Test object tagging operations.""" + print("Testing object tagging...") + + client = create_test_client() + key = "tagged-file.txt" + + # Create object + response = client.put( + f"/{BUCKET}/{key}", + content=b"Tagged content", + headers={"Content-Type": "text/plain"} + ) + assert response.status_code == 200 + + # Set tags + tagging_xml = """ + + + + Environment + Test + + + Type + SmokeTest + + +""" + + response = client.put( + f"/{BUCKET}/{key}", + content=tagging_xml, + headers={"Content-Type": "application/xml"} + ) + assert response.status_code == 200 + + # Get tags + response = client.get(f"/{BUCKET}/{key}", params={"tagging": ""}) + assert response.status_code == 200 + + # Parse XML response + import xml.etree.ElementTree as ET + root = ET.fromstring(response.text) + tag_elements = root.findall(".//Tag") + + retrieved_tags = {} + for tag_elem in tag_elements: + key_elem = tag_elem.find("Key") + value_elem = tag_elem.find("Value") + if key_elem is not None and value_elem is not None: + retrieved_tags[key_elem.text] = value_elem.text + + expected_tags = {"Environment": "Test", "Type": "SmokeTest"} + + assert retrieved_tags == expected_tags, f"Tags mismatch: {retrieved_tags} != {expected_tags}" + + print(" Tagging operations successful") + +def test_delete_object(): + """Test DELETE object operation.""" + print("Testing DELETE object...") + + client = create_test_client() + key = "to-delete.txt" + + # Create object + response = client.put( + f"/{BUCKET}/{key}", + content=b"Delete me", + headers={"Content-Type": "text/plain"} + ) + assert response.status_code == 200 + + # Verify it exists + response = client.head(f"/{BUCKET}/{key}") + assert response.status_code == 200 + + # Delete object + response = client.delete(f"/{BUCKET}/{key}") + assert response.status_code == 204 + + # Verify it's gone + response = client.head(f"/{BUCKET}/{key}") + assert response.status_code == 404 + + print(" DELETE successful, object no longer exists") + +def cleanup(): + """Clean up test objects.""" + print("Cleaning up test objects...") + + client = create_test_client() + + # List all objects + response = client.get(f"/{BUCKET}", params={"list-type": "2"}) + if response.status_code == 200: + import xml.etree.ElementTree as ET + root = ET.fromstring(response.text) + ns = {'s3': 'http://s3.amazonaws.com/doc/2006-03-01/'} + contents = root.findall(".//s3:Contents", ns) or root.findall(".//Contents") + + for content in contents: + key_elem = content.find("s3:Key", ns) + if key_elem is None: + key_elem = content.find("Key") + if key_elem is not None and key_elem.text: + key = key_elem.text + client.delete(f"/{BUCKET}/{key}") + print(f" Deleted {key}") + +def main(): + """Run all smoke tests.""" + print("S3 Mock Server Smoke Test") + print("=" * 40) + + try: + # Test all operations + test_put_get_object() + test_head_object() + test_list_objects() + test_tagging() + test_delete_object() + + print("\n" + "=" * 40) + print("✅ All tests passed!") + + except Exception as e: + print(f"\n❌ Test failed: {e}") + raise + finally: + # Always cleanup + try: + cleanup() + except Exception as e: + print(f"Warning: Cleanup failed: {e}") + +if __name__ == "__main__": + main() diff --git a/mocks/s3-mock/storage.py b/mocks/s3-mock/storage.py new file mode 100644 index 0000000..8d09e15 --- /dev/null +++ b/mocks/s3-mock/storage.py @@ -0,0 +1,135 @@ +import os +import json +import hashlib +from datetime import datetime, timezone +from pathlib import Path +from typing import Dict, Tuple, Optional, List + 
+HTTP_DATE_FMT = "%a, %d %b %Y %H:%M:%S GMT" + + +def ensure_bucket(root: Path, bucket: str) -> Path: + bucket_path = root / bucket + bucket_path.mkdir(parents=True, exist_ok=True) + return bucket_path + + +def object_paths(bucket_root: Path, key: str) -> Tuple[Path, Path]: + # Sanitize key to prevent traversal + key = key.lstrip("/") + safe_parts = [] + for part in key.split("/"): + if part in ("..", ""): + continue + safe_parts.append(part) + safe_key = "/".join(safe_parts) + obj_path = bucket_root / safe_key + meta_path = bucket_root / f"{safe_key}.meta.json" + obj_path.parent.mkdir(parents=True, exist_ok=True) + return obj_path, meta_path + + +def calc_etag(data: bytes) -> str: + return hashlib.md5(data).hexdigest() + + +def httpdate(ts: float) -> str: + return datetime.fromtimestamp(ts, tz=timezone.utc).strftime(HTTP_DATE_FMT) + + +def iso8601(ts: float) -> str: + return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat().replace("+00:00", "Z") + + +def save_object(bucket_root: Path, key: str, data: bytes, content_type: str, metadata: Dict[str, str], tags: Dict[str, str]) -> Dict[str, str]: + obj_path, meta_path = object_paths(bucket_root, key) + tmp_path = obj_path.with_suffix(obj_path.suffix + ".tmp") + # Atomic write + with open(tmp_path, "wb") as f: + f.write(data) + os.replace(tmp_path, obj_path) + etag = calc_etag(data) + meta = { + "content_type": content_type or "application/octet-stream", + "metadata": metadata or {}, + "tags": tags or {}, + "etag": etag, + "last_modified": httpdate(os.path.getmtime(obj_path)), + } + with open(meta_path, "w", encoding="utf-8") as f: + json.dump(meta, f) + return meta + + +def load_meta(bucket_root: Path, key: str) -> Optional[Dict[str, str]]: + _, meta_path = object_paths(bucket_root, key) + if not meta_path.exists(): + return None + try: + with open(meta_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception: + return None + + +def load_object(bucket_root: Path, key: str) -> Optional[Tuple[bytes, Dict[str, str]]]: + obj_path, _ = object_paths(bucket_root, key) + if not obj_path.exists(): + return None + with open(obj_path, "rb") as f: + data = f.read() + meta = load_meta(bucket_root, key) or {} + # Refresh last_modified if needed + meta["last_modified"] = httpdate(os.path.getmtime(obj_path)) + return data, meta + + +def delete_object(bucket_root: Path, key: str) -> bool: + obj_path, meta_path = object_paths(bucket_root, key) + deleted = False + if obj_path.exists(): + obj_path.unlink() + deleted = True + if meta_path.exists(): + meta_path.unlink() + return deleted + + +def list_objects(bucket_root: Path, prefix: str = "", max_keys: int = 100) -> List[Dict[str, str]]: + items: List[Dict[str, str]] = [] + base = bucket_root + for root, _, files in os.walk(base): + for name in files: + if name.endswith(".meta.json"): + continue + rel_path = Path(root).joinpath(name).relative_to(base) + key = str(rel_path).replace(os.sep, "/") + if not key.startswith(prefix): + continue + obj_path = base / rel_path + size = obj_path.stat().st_size + meta = load_meta(bucket_root, key) or {} + items.append({ + "Key": key, + "Size": size, + "ETag": meta.get("etag") or calc_etag(open(obj_path, "rb").read()), + "LastModified": iso8601(obj_path.stat().st_mtime), + }) + if len(items) >= max_keys: + return items + # Sort by last modified desc to mimic S3 behavior often seen by clients + items.sort(key=lambda x: x["LastModified"], reverse=True) + return items + + +def set_tags(bucket_root: Path, key: str, tags: Dict[str, str]) -> None: + meta = 
load_meta(bucket_root, key) or {"metadata": {}, "content_type": "application/octet-stream", "etag": ""}
+    meta["tags"] = tags
+    _, meta_path = object_paths(bucket_root, key)
+    with open(meta_path, "w", encoding="utf-8") as f:
+        json.dump(meta, f)
+
+
+def get_tags(bucket_root: Path, key: str) -> Dict[str, str]:
+    meta = load_meta(bucket_root, key) or {}
+    return meta.get("tags", {})

From 400b005fc87ec46861e9df4b2e5042b978969070 Mon Sep 17 00:00:00 2001
From: Anthony
Date: Sun, 2 Nov 2025 18:30:47 +0000
Subject: [PATCH 4/6] refactor: improve logging, exception handling, and S3 mock structure

- Adjust log levels from info to debug in message_builder.py and remove debug prefixes from warnings for cleaner output
- Replace bare except with specific ET.ParseError in mock_s3_client.py for better error handling
- Update mocks/s3-mock/README.md to use uv pip for installation
- Consolidate object tagging logic into the main PUT endpoint in main.py, removing the redundant function for a simpler API structure
---
 .../chat/preprocessors/message_builder.py  |  6 +--
 .../modules/file_storage/mock_s3_client.py |  2 +-
 mocks/s3-mock/README.md                    |  2 +-
 mocks/s3-mock/main.py                      | 44 +++++++++----------
 mocks/s3-mock/smoke_test.py                |  5 ---
 mocks/s3-mock/storage.py                   |  6 ++-
 6 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/backend/application/chat/preprocessors/message_builder.py b/backend/application/chat/preprocessors/message_builder.py
index bc0f95e..98021bc 100644
--- a/backend/application/chat/preprocessors/message_builder.py
+++ b/backend/application/chat/preprocessors/message_builder.py
@@ -56,12 +56,12 @@ async def build_messages(
     if include_files_manifest:
         session_context = build_session_context(session)
         files_in_context = session_context.get("files", {})
-        logger.info(f"DEBUG: Session has {len(files_in_context)} files: {list(files_in_context.keys())}")
+        logger.debug(f"Session has {len(files_in_context)} files: {list(files_in_context.keys())}")
         files_manifest = file_utils.build_files_manifest(session_context)
         if files_manifest:
-            logger.info(f"DEBUG: Adding files manifest to messages: {files_manifest['content'][:100]}")
+            logger.debug(f"Adding files manifest to messages: {files_manifest['content'][:100]}")
             messages.append(files_manifest)
         else:
-            logger.warning("DEBUG: No files manifest generated despite include_files_manifest=True")
+            logger.warning("No files manifest generated despite include_files_manifest=True")
 
     return messages
diff --git a/backend/modules/file_storage/mock_s3_client.py b/backend/modules/file_storage/mock_s3_client.py
index 35e6867..2d31119 100644
--- a/backend/modules/file_storage/mock_s3_client.py
+++ b/backend/modules/file_storage/mock_s3_client.py
@@ -202,7 +202,7 @@ async def get_file(self, user_email: str, file_key: str) -> Dict[str, Any]:
                     value_elem = tag_elem.find("Value")
                     if key_elem is not None and value_elem is not None:
                         tags[key_elem.text] = value_elem.text
-            except:
+            except ET.ParseError:
                 pass
 
             # Extract filename from metadata headers
diff --git a/mocks/s3-mock/README.md b/mocks/s3-mock/README.md
index 01af194..90a30ce 100644
--- a/mocks/s3-mock/README.md
+++ b/mocks/s3-mock/README.md
@@ -37,7 +37,7 @@ A minimal FastAPI server that mimics S3's REST API for development and testing p
 ### Prerequisites
 
 ```bash
-pip install fastapi uvicorn
+uv pip install fastapi uvicorn
 ```
 
 ### Start the server
diff --git a/mocks/s3-mock/main.py b/mocks/s3-mock/main.py
index 52bafde..5c97f19 100644
--- a/mocks/s3-mock/main.py
+++ b/mocks/s3-mock/main.py
@@ -1,7 +1,7 @@
 import os
 import urllib.parse
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict
 
 from fastapi import FastAPI, HTTPException, Request, Response
 from fastapi.responses import StreamingResponse
@@ -58,6 +58,26 @@ def add_metadata_headers(response: Response, metadata: Dict[str, str]):
 @app.put("/{bucket}/{key:path}")
 async def put_object(bucket: str, key: str, request: Request):
     """PUT Object endpoint."""
+    # Check if this is a tagging request with XML body
+    if request.query_params.get("tagging") and request.headers.get("content-type") == "application/xml":
+        bucket_root = get_bucket_root(bucket)
+
+        # Check if object exists
+        if load_meta(bucket_root, key) is None:
+            error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}")
+            raise HTTPException(status_code=404, detail=error_xml)
+
+        try:
+            body = await request.body()
+            xml_content = body.decode("utf-8")
+            tags = parse_tagging_xml(xml_content)
+            set_tags(bucket_root, key, tags)
+            return Response(status_code=200)
+        except ValueError:
+            error_xml = create_error_xml("MalformedXML", "The XML you provided was not well-formed or did not validate against the published schema.", f"/{bucket}/{key}?tagging")
+            raise HTTPException(status_code=400, detail=error_xml)
+
+    # Regular object PUT
     try:
         bucket_root = get_bucket_root(bucket)
         body = await request.body()
@@ -180,26 +200,6 @@ async def list_objects_v2(bucket: str, request: Request):
     return Response(content=xml_response, media_type="application/xml")
 
 
-@app.put("/{bucket}/{key:path}")
-async def put_object_tagging(bucket: str, key: str, request: Request):
-    """PUT Object Tagging endpoint."""
-    bucket_root = get_bucket_root(bucket)
-
-    # Check if object exists
-    if load_meta(bucket_root, key) is None:
-        error_xml = create_error_xml("NoSuchKey", "The specified key does not exist.", f"/{bucket}/{key}")
-        raise HTTPException(status_code=404, detail=error_xml)
-
-    try:
-        body = await request.body()
-        xml_content = body.decode("utf-8")
-        tags = parse_tagging_xml(xml_content)
-        set_tags(bucket_root, key, tags)
-        return Response(status_code=200)
-    except ValueError:
-        error_xml = create_error_xml("MalformedXML", "The XML you provided was not well-formed or did not validate against the published schema.", f"/{bucket}/{key}?tagging")
-        raise HTTPException(status_code=400, detail=error_xml)
-
 
 @app.exception_handler(HTTPException)
 async def http_exception_handler(request: Request, exc: HTTPException):
@@ -210,5 +210,5 @@
 
 if __name__ == "__main__":
-    port = int(os.getenv("PORT", "9002"))
+    port = int(os.getenv("PORT", "9001"))
     uvicorn.run(app, host="0.0.0.0", port=port)
diff --git a/mocks/s3-mock/smoke_test.py b/mocks/s3-mock/smoke_test.py
index 2956172..2f8b7e0 100644
--- a/mocks/s3-mock/smoke_test.py
+++ b/mocks/s3-mock/smoke_test.py
@@ -4,10 +4,6 @@
 Tests all supported S3 operations to ensure compatibility.
 """
 
-import hashlib
-import time
-import os
-from pathlib import Path
 from fastapi.testclient import TestClient
 
 from main import get_app
@@ -26,7 +22,6 @@ def test_put_get_object():
     client = create_test_client()
     key = "test-file.txt"
     test_data = b"Hello, S3 Mock!"
-    metadata = {"author": "test", "version": "1.0"}
     tags = "environment=test&type=smoke"
 
     # PUT object
diff --git a/mocks/s3-mock/storage.py b/mocks/s3-mock/storage.py
index 8d09e15..5b93394 100644
--- a/mocks/s3-mock/storage.py
+++ b/mocks/s3-mock/storage.py
@@ -109,10 +109,14 @@ def list_objects(bucket_root: Path, prefix: str = "", max_keys: int = 100) -> Li
             obj_path = base / rel_path
             size = obj_path.stat().st_size
             meta = load_meta(bucket_root, key) or {}
+            etag = meta.get("etag")
+            if not etag:
+                with open(obj_path, "rb") as f:
+                    etag = calc_etag(f.read())
             items.append({
                 "Key": key,
                 "Size": size,
-                "ETag": meta.get("etag") or calc_etag(open(obj_path, "rb").read()),
+                "ETag": etag,
                 "LastModified": iso8601(obj_path.stat().st_mtime),
             })
             if len(items) >= max_keys:

From 28a05f877ff626be2347121f3211d00e5576b9de Mon Sep 17 00:00:00 2001
From: Anthony
Date: Sun, 2 Nov 2025 18:47:28 +0000
Subject: [PATCH 5/6] docs: add critical security warnings for S3 mock server

- Remove outdated comment about enabling mock S3 in README.md
- Add production usage warning in main README guidelines
- Expand S3 mock README with detailed security limitations and alternatives
- Ensure developers understand the risks and avoid deploying the insecure mock in production
---
 README.md               |  6 ++----
 mocks/s3-mock/README.md | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d7e7045..db8312b 100644
--- a/README.md
+++ b/README.md
@@ -51,10 +51,7 @@ cp .env.example .env  # Edit with your API keys
 # Build frontend
 cd frontend && npm install && npm run build
 
-# there is a mock s3 that you might want to enable. Switching to minio sooon.
-cd mocks/s3-mocks && python main.py
-
-# Start backend
+# Start backend
 cd ../backend && python main.py
 
 # OR the quickest way to start is to use the agent_start.sh
@@ -79,6 +76,7 @@ bash agent_start.sh
 - **Use `npm run build`** instead of `npm run dev` for frontend development
 - **File limit**: Maximum 400 lines per file for maintainability
 - **Container Environment**: Use Fedora latest for Docker containers (GitHub Actions uses Ubuntu runners)
+- **Mock S3**: The included S3 mock (`mocks/s3-mock/`) is for development/testing only and must NEVER be used in production due to lack of authentication, encryption, and other critical security features.
 
 ## License
diff --git a/mocks/s3-mock/README.md b/mocks/s3-mock/README.md
index 90a30ce..41014e8 100644
--- a/mocks/s3-mock/README.md
+++ b/mocks/s3-mock/README.md
@@ -2,6 +2,25 @@
 
 A minimal FastAPI server that mimics S3's REST API for development and testing purposes. Compatible with botocore/boto3 expectations.
 
+## CRITICAL WARNING: DEVELOPMENT ONLY
+
+**THIS MOCK SERVER MUST NEVER BE USED IN PRODUCTION ENVIRONMENTS.**
+
+This implementation is intended ONLY for local development and testing. It has critical security limitations that make it completely unsuitable for production use:
+
+- **No authentication or authorization** - Accepts all requests regardless of credentials
+- **No signature verification** - Does not validate S3v4 signatures (security bypass)
+- **No encryption** - Data stored in plain text on disk
+- **No access control** - No bucket policies, IAM, or ACLs
+- **No audit logging** - No CloudTrail equivalent for compliance
+- **No data durability guarantees** - Simple filesystem storage without redundancy
+- **Minimal error handling** - May expose internal paths or system information
+
+For production deployments, use:
+- **AWS S3** - For AWS-hosted applications
+- **MinIO** - For self-hosted S3-compatible storage with proper security
+- Other production-grade S3-compatible services
+
 ## Features
 
 - **Path-style addressing**: `http://localhost:9001/{bucket}/{key}`

From 2343ecf1b8a8bf778986486d8865b72608e922a4 Mon Sep 17 00:00:00 2001
From: Anthony
Date: Sun, 2 Nov 2025 19:06:34 +0000
Subject: [PATCH 6/6] refactor: improve logging messages and enhance security checks in S3 mock implementation

---
 agent_start.sh                              |  8 +++----
 .../modules/file_storage/mock_s3_client.py  |  5 ++--
 mocks/s3-mock/smoke_test.py                 |  4 ++--
 mocks/s3-mock/storage.py                    | 23 ++++++++++++++-----
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/agent_start.sh b/agent_start.sh
index 8d6cf67..15182d2 100755
--- a/agent_start.sh
+++ b/agent_start.sh
@@ -29,16 +29,16 @@ fi
 
 # Only start MinIO if not using mock S3
 if [ "$USE_MOCK_S3" = "true" ]; then
-    echo "✅ Using Mock S3 (no Docker required)"
+    echo "Using Mock S3 (no Docker required)"
 else
     # Check if MinIO is running
     if ! docker ps | grep -q atlas-minio; then
-        echo "⚠️ MinIO is not running. Starting MinIO with docker-compose..."
+        echo "MinIO is not running. Starting MinIO with docker-compose..."
         docker-compose up -d minio minio-init
-        echo "✅ MinIO started successfully"
+        echo "MinIO started successfully"
         sleep 3
     else
-        echo "✅ MinIO is already running"
+        echo "MinIO is already running"
     fi
 fi
diff --git a/backend/modules/file_storage/mock_s3_client.py b/backend/modules/file_storage/mock_s3_client.py
index 2d31119..7feb355 100644
--- a/backend/modules/file_storage/mock_s3_client.py
+++ b/backend/modules/file_storage/mock_s3_client.py
@@ -79,7 +79,7 @@ def _generate_s3_key(self, user_email: str, filename: str, source_type: str = "u
 
     def _calculate_etag(self, content_bytes: bytes) -> str:
         """Calculate ETag for file content."""
-        return hashlib.md5(content_bytes).hexdigest()
+        return hashlib.md5(content_bytes, usedforsecurity=False).hexdigest()
 
     async def upload_file(
         self,
@@ -203,7 +203,8 @@ async def get_file(self, user_email: str, file_key: str) -> Dict[str, Any]:
                     if key_elem is not None and value_elem is not None:
                         tags[key_elem.text] = value_elem.text
             except ET.ParseError:
-                pass
+                # Failed to parse tags XML; tags will be left empty. This is non-fatal as tags are optional.
+                logger.warning(f"Failed to parse tags XML for file {sanitize_for_logging(file_key)}", exc_info=True)
 
             # Extract filename from metadata headers
             filename = response.headers.get("x-amz-meta-original_filename", file_key.split('/')[-1])
diff --git a/mocks/s3-mock/smoke_test.py b/mocks/s3-mock/smoke_test.py
index 2f8b7e0..0db88d1 100644
--- a/mocks/s3-mock/smoke_test.py
+++ b/mocks/s3-mock/smoke_test.py
@@ -257,10 +257,10 @@ def main():
         test_delete_object()
 
         print("\n" + "=" * 40)
-        print("✅ All tests passed!")
+        print("All tests passed!")
 
     except Exception as e:
-        print(f"\n❌ Test failed: {e}")
+        print(f"\nERROR: Test failed: {e}")
         raise
     finally:
         # Always cleanup
diff --git a/mocks/s3-mock/storage.py b/mocks/s3-mock/storage.py
index 5b93394..4f0efad 100644
--- a/mocks/s3-mock/storage.py
+++ b/mocks/s3-mock/storage.py
@@ -15,22 +15,33 @@ def ensure_bucket(root: Path, bucket: str) -> Path:
 
 
 def object_paths(bucket_root: Path, key: str) -> Tuple[Path, Path]:
-    # Sanitize key to prevent traversal
+    # Sanitize key to prevent traversal and reject dangerous keys
     key = key.lstrip("/")
+    if os.path.isabs(key):
+        raise ValueError("Absolute paths are not allowed in S3 keys")
     safe_parts = []
     for part in key.split("/"):
-        if part in ("..", ""):
-            continue
+        if part in ("..", ".", ""):
+            raise ValueError(f"Invalid path component in S3 key: {part!r}")
+        # On Windows, reject drive letters (e.g., C:)
+        if os.name == "nt" and len(part) == 2 and part[1] == ":" and part[0].isalpha():
+            raise ValueError(f"Drive letter not allowed in S3 key: {part!r}")
         safe_parts.append(part)
     safe_key = "/".join(safe_parts)
-    obj_path = bucket_root / safe_key
-    meta_path = bucket_root / f"{safe_key}.meta.json"
+    obj_path = (bucket_root / safe_key).resolve()
+    meta_path = (bucket_root / f"{safe_key}.meta.json").resolve()
+    # Ensure the resolved paths are within the bucket root
+    bucket_root_resolved = bucket_root.resolve()
+    if not str(obj_path).startswith(str(bucket_root_resolved)):
+        raise ValueError("Path traversal detected in S3 key")
+    if not str(meta_path).startswith(str(bucket_root_resolved)):
+        raise ValueError("Path traversal detected in S3 key (meta)")
     obj_path.parent.mkdir(parents=True, exist_ok=True)
     return obj_path, meta_path
 
 
 def calc_etag(data: bytes) -> str:
-    return hashlib.md5(data).hexdigest()
+    return hashlib.md5(data, usedforsecurity=False).hexdigest()
 
 
 def httpdate(ts: float) -> str: