Skip to content

Commit

Permalink
update test.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrew Wang committed Nov 20, 2024
1 parent f6de5fe commit 5df87ad
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 59 deletions.
41 changes: 25 additions & 16 deletions services/APIService/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
)
from shared.prompt_types import PromptTracker
from shared.podcast_types import SavedPodcast, SavedPodcastWithAudio, Conversation
from shared.pdf_types import PDFFileUpload, FileContentTuple
from shared.pdf_types import FileContentTuple
from shared.connection import ConnectionManager
from shared.storage import StorageManager
from shared.otel import OpenTelemetryInstrumentation, OpenTelemetryConfig
Expand All @@ -35,7 +35,7 @@
import logging
import time
import asyncio
from typing import Dict, List, Union, Annotated
from typing import Dict, List

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -98,6 +98,7 @@
)
logger.info(f"CORS configured with allowed origins: {allowed_origins}")


@app.websocket("/ws/status/{job_id}")
async def websocket_endpoint(websocket: WebSocket, job_id: str):
try:
Expand Down Expand Up @@ -182,9 +183,12 @@ def process_pdf_task(
"application/pdf",
transcription_params,
)
logger.info(
f"Stored {len(files)} original PDFs for {job_id} in storage"
)
logger.info(f"Stored {len(files)} original PDFs for {job_id} in storage")
files_for_request = []
for idx, (content, file_type) in enumerate(files):
files_for_request.append(
("files", (f"file_{idx}.pdf", content, "application/pdf"))
)
logger.info(
f"Sending {len(files)} PDFs to PDF Service for {job_id} with VDB task: {transcription_params.vdb_task}"
)
Expand Down Expand Up @@ -289,30 +293,35 @@ def process_pdf_task(
@app.post("/process_pdf", status_code=202)
async def process_pdf(
background_tasks: BackgroundTasks,
files: Annotated[Union[PDFFileUpload, List[PDFFileUpload]], File(...)],
files: List[UploadFile] = File(...),
file_types: List[str] = Form(...),
transcription_params: str = Form(...),
):
with telemetry.tracer.start_as_current_span("api.process_pdf") as span:
# Convert single file to list for consistent handling
files = [files] if isinstance(files, PDFFileUpload) else files
if len(files) != len(file_types):
raise HTTPException(
status_code=400,
detail="Number of files must match number of file types",
)

span.set_attribute("request", transcription_params)
span.set_attribute("num_files", len(files))
if len(files) == 1 and files[0].type != "target":

if len(files) == 1 and file_types[0] != "target":
raise HTTPException(
status_code=400,
detail="Single file must be designated as 'target'"
status_code=400, detail="Single file must be designated as 'target'"
)

# Ensure at least one target file
if not any(f.type == "target" for f in files):
if not any(ft == "target" for ft in file_types):
raise HTTPException(
status_code=400,
detail="At least one file must be designated as 'target'"
detail="At least one file must be designated as 'target'",
)

# Validate all files are PDFs
for file in files:
if file.file.content_type != "application/pdf":
if file.content_type != "application/pdf":
span.set_status(
status=StatusCode.ERROR, description="invalid file type"
)
Expand All @@ -334,8 +343,8 @@ async def process_pdf(

# Read all files
files_data: List[FileContentTuple] = []
for file_upload, file_type in files:
content = await file_upload.file.read()
for file, file_type in zip(files, file_types):
content = await file.read()
files_data.append((content, file_type))

# Start processing
Expand Down
5 changes: 4 additions & 1 deletion services/AgentService/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ def test_transcribe_api():

# Create a proper TranscriptionRequest
pdf_metadata_1 = PDFMetadata(
filename="sample.pdf", markdown="Sample markdown content", summary="", type="target"
filename="sample.pdf",
markdown="Sample markdown content",
summary="",
type="target",
)

pdf_metadata_2 = PDFMetadata(
Expand Down
7 changes: 6 additions & 1 deletion services/PDFService/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
import asyncio
import ujson as json
from typing import List
from shared.pdf_types import PDFConversionResult, ConversionStatus, PDFMetadata, FileContentTuple
from shared.pdf_types import (
PDFConversionResult,
ConversionStatus,
PDFMetadata,
FileContentTuple,
)
from shared.api_types import ServiceType, JobStatus, StatusResponse

logging.basicConfig(level=logging.INFO)
Expand Down
13 changes: 3 additions & 10 deletions shared/shared/pdf_types.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from fastapi import UploadFile, Form, File
from fastapi import UploadFile
from pydantic import BaseModel, Field
from typing import Optional, Union, Literal, Tuple
from datetime import datetime
from enum import Enum


class ConversionStatus(str, Enum):
SUCCESS = "success"
FAILED = "failed"
Expand All @@ -25,14 +26,6 @@ class PDFMetadata(BaseModel):
error: Optional[str] = None
created_at: datetime = Field(default_factory=datetime.utcnow)

class PDFFileUpload:
def __init__(
self,
file: UploadFile = File(...),
type: Literal["target", "context"] = Form(...)
):
self.file = file
self.type = type

FileTypeTuple = Tuple[UploadFile, Literal["target", "context"]]
FileContentTuple = Tuple[bytes, Literal["target", "context"]]
FileContentTuple = Tuple[bytes, Literal["target", "context"]]
2 changes: 1 addition & 1 deletion tests/prod-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ python3 test.py --monologue \
db-context.pdf \
gs-context.pdf \
hsbc-context.pdf \
investorpres-main.pdf \
investorpres-main.pdf target \
jpm-context.pdf \
keybanc-context.pdf
107 changes: 77 additions & 30 deletions tests/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import asyncio
from urllib.parse import urljoin
import argparse
from typing import List
from typing import List, Tuple

# User ID used for all requests issued by this test script
TEST_USER_ID = "test-userid"
Expand Down Expand Up @@ -218,7 +218,10 @@ def test_saved_podcasts(base_url: str, job_id: str, max_retries=5, retry_delay=5


def test_api(
base_url: str, pdf_files: List[str], monologue: bool = False, vdb: bool = False
base_url: str,
pdf_files_with_types: List[Tuple[str, str]],
monologue: bool = False,
vdb: bool = False,
):
voice_mapping = {
"speaker-1": "iP95p4xoKVk53GoZ742B",
Expand All @@ -230,19 +233,18 @@ def test_api(
process_url = f"{base_url}/process_pdf"

# Locate the samples/ directory, which sits next to the directory containing this script
current_dir = os.path.dirname(
os.path.abspath(__file__)
) # This gets /tests directory
project_root = os.path.dirname(current_dir) # Go up one level to project root
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
samples_dir = os.path.join(project_root, "samples")

# Absolute paths are used as-is; relative names are looked up in samples/
sample_pdf_paths = []
for pdf_file in pdf_files:
sample_pdf_paths_with_types = []
for pdf_file, file_type in pdf_files_with_types:
if os.path.isabs(pdf_file):
sample_pdf_paths.append(pdf_file)
sample_pdf_paths_with_types.append((pdf_file, file_type))
else:
sample_pdf_paths.append(os.path.join(samples_dir, pdf_file))
sample_pdf_paths_with_types.append(
(os.path.join(samples_dir, pdf_file), file_type)
)

# Build the transcription_params payload (including userId)
transcription_params = {
Expand All @@ -265,30 +267,40 @@ def test_api(
)
print(f"Using voices: {voice_mapping}")

pdf_files = [open(path, "rb") for path in sample_pdf_paths]
# Prepare multipart form data
form_data = []
file_types = []

# Add each file to the form data
for path, file_type in sample_pdf_paths_with_types:
with open(path, "rb") as pdf_file:
content = pdf_file.read()
form_data.append(
("files", (os.path.basename(path), content, "application/pdf"))
)
file_types.append(file_type)

# Add the file types as separate form fields
for file_type in file_types:
form_data.append(("file_types", (None, file_type)))

# Add transcription parameters
form_data.append(("transcription_params", (None, json.dumps(transcription_params))))

try:
files = [
("files", (os.path.basename(path), pdf_file, "application/pdf"))
for path, pdf_file in zip(sample_pdf_paths, pdf_files)
]

response = requests.post(
process_url,
files=files,
data={"transcription_params": json.dumps(transcription_params)},
)
response = requests.post(process_url, files=form_data)

assert (
response.status_code == 202
), f"Expected status code 202, but got {response.status_code}"
), f"Expected status code 202, but got {response.status_code}. Response: {response.text}"
job_data = response.json()
assert "job_id" in job_data, "Response missing job_id"
job_id = job_data["job_id"]
print(f"[{datetime.now().strftime('%H:%M:%S')}] Job ID received: {job_id}")

finally:
for f in pdf_files:
f.close()
except Exception as e:
print(f"Error during PDF submission: {e}")
raise

# Step 2: Start monitoring status via WebSocket
monitor = StatusMonitor(base_url, job_id)
Expand Down Expand Up @@ -340,9 +352,29 @@ def test_api(

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process PDF files for audio conversion"
description="Process PDF files for audio conversion",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Process single file (defaults to context)
python test.py file1.pdf
# Process single file as target
python test.py file1.pdf target
# Process multiple files with explicit types
python test.py file1.pdf target file2.pdf context file3.pdf context
# Process multiple files (defaulting to context)
python test.py file1.pdf target file2.pdf file3.pdf
""",
)

parser.add_argument(
"files",
nargs="+",
help="PDF files and their types (optional). Format: <file> [type] <file> [type] ...",
)
parser.add_argument("pdf_files", nargs="+", help="PDF files to process")
parser.add_argument(
"--api-url",
default=os.getenv("API_SERVICE_URL", "http://localhost:8002"),
Expand All @@ -360,10 +392,25 @@ def test_api(
)

args = parser.parse_args()

# Process the files argument to pair files with their types
pdf_files_with_types = []
i = 0
while i < len(args.files):
pdf_file = args.files[i]
# Check if next argument is a type specification
if i + 1 < len(args.files) and args.files[i + 1] in ["target", "context"]:
file_type = args.files[i + 1]
i += 2
else:
file_type = "context" # default type
i += 1
pdf_files_with_types.append((pdf_file, file_type))

print(f"API URL: {args.api_url}")
print(f"Processing PDF files: {args.pdf_files}")
print(f"Processing PDF files: {pdf_files_with_types}")
print(f"Monologue mode: {args.monologue}")
print(f"VDB mode: {args.vdb}")
print(f"Using test user ID: {TEST_USER_ID}")

test_api(args.api_url, args.pdf_files, args.monologue, args.vdb)
test_api(args.api_url, pdf_files_with_types, args.monologue, args.vdb)

0 comments on commit 5df87ad

Please sign in to comment.