Process pdf problem sets (#2402)

shanbady · web-flow · commit 6fac716c8ff8 · 2025-07-31T10:34:06.000-04:00
* adding transcription model as setting

* adding initial pdf-based approach

* adding dependencies for pdf to image conversion

* constraining pdf size and moving prompt to settings

* removing unused pydantic model

* adding custom settings for litellm and fixing types for test method
diff --git a/Dockerfile b/Dockerfile
@@ -11,6 +11,7 @@ COPY apt.txt /tmp/apt.txt
 RUN apt-get update && \
     apt-get install -y --no-install-recommends $(grep -vE "^\s*#" apt.txt | tr "\n" " ") && \
     apt-get install libpq-dev postgresql-client -y --no-install-recommends && \
+    apt-get install poppler-utils -y && \
     apt-get clean && \
     apt-get purge &&  \
     rm -rf /var/lib/apt/lists/*
diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
@@ -1,13 +1,18 @@
+import base64
 import logging
 import sys
 import zipfile
 from collections import defaultdict
 from collections.abc import Generator
+from io import BytesIO
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
 from defusedxml import ElementTree
 from django.conf import settings
+from litellm import completion
+from pdf2image import convert_from_path
+from PIL import Image
 
 from learning_resources.constants import (
     VALID_TUTOR_PROBLEM_TYPES,
@@ -180,6 +185,7 @@ def transform_canvas_problem_files(
             problem_file_data = {
                 key: file_data[key] for key in keys_to_keep if key in file_data
             }
+
             path = file_data["source_path"]
             path = path[len(settings.CANVAS_TUTORBOT_FOLDER) :]
             path_parts = path.split("/", 1)
@@ -188,7 +194,15 @@ def transform_canvas_problem_files(
                 if problem_type in path_parts[1].lower():
                     problem_file_data["type"] = problem_type
                     break
-
+            if (
+                problem_file_data["file_extension"].lower() == ".pdf"
+                and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
+            ):
+                markdown_content = _pdf_to_markdown(
+                    Path(olx_path) / Path(problem_file_data["source_path"])
+                )
+                if markdown_content:
+                    problem_file_data["content"] = markdown_content
             yield problem_file_data
 
 
@@ -269,3 +283,75 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
                 {"title": title, "files": files, "type": resource.get("type")}
             )
     return dict(resources_dict)
+
+
+def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
+    """
+    Render each page of a PDF and return the pages as base64 encoded images.
+
+    Oversized pages are scaled down to limit payload size while staying
+    legible enough for OCR.
+
+    Args:
+        pdf_path (str): Path to the PDF file
+        dpi (int): DPI used to rasterize the pages (default: 200)
+        fmt (str): 'JPEG' (any case); any other value is saved as PNG
+        max_size (int): Maximum width/height in pixels (default: 2000)
+        quality (int): JPEG quality (1-100, default: 85); unused for PNG
+
+    Returns:
+        list: List of base64 encoded strings (one per page)
+    """
+    encoded_pages = []
+    for page in convert_from_path(pdf_path, dpi=dpi):
+        width, height = page.size
+        if width > max_size or height > max_size:
+            # thumbnail() shrinks in place and preserves the aspect ratio
+            page.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+
+        # Only the JPEG encoder is given a quality; everything else is PNG
+        save_kwargs = {"format": "PNG", "optimize": True}
+        if fmt.upper() == "JPEG":
+            save_kwargs = {"format": "JPEG", "quality": quality, "optimize": True}
+
+        buffer = BytesIO()
+        page.save(buffer, **save_kwargs)
+        encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
+        encoded_pages.append(encoded)
+
+    return encoded_pages
+
+
+def _pdf_to_markdown(pdf_path):
+    """
+    Transcribe a PDF to markdown, one LLM call (via litellm) per rendered page.
+
+    Args:
+        pdf_path (str or Path): Path to the PDF file
+
+    Returns:
+        str: Non-empty page transcriptions joined by blank lines
+    """
+    pages = []
+    # pdf_to_base64_images defaults to JPEG, matching the data URL below
+    for im in pdf_to_base64_images(pdf_path):
+        prompt = {"type": "text", "text": settings.CANVAS_TRANSCRIPTION_PROMPT}
+        image = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{im}"},
+        }
+        response = completion(
+            api_base=settings.LITELLM_API_BASE,
+            custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
+            model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL,
+            messages=[{"role": "user", "content": [prompt, image]}],
+        )
+        # A null content field would crash removeprefix; treat it as empty
+        content = response.json()["choices"][0]["message"]["content"] or ""
+        # Strip the ```markdown fence the model may wrap its answer in
+        snippet = content.removeprefix("```markdown\n").removesuffix("\n```")
+        if snippet:
+            pages.append(snippet)
+    # Blank line between pages so the end of one page cannot fuse with the
+    # start of the next (e.g. a sentence running into a "# Heading")
+    return "\n\n".join(pages)
diff --git a/learning_resources/etl/canvas_test.py b/learning_resources/etl/canvas_test.py
@@ -11,6 +11,7 @@
     parse_module_meta,
     run_for_canvas_archive,
     transform_canvas_content_files,
+    transform_canvas_problem_files,
 )
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.utils import get_edx_module_id
@@ -32,6 +33,18 @@ def canvas_platform():
     return LearningResourcePlatformFactory.create(code=PlatformType.canvas.name)
 
 
+def canvas_zip_with_problem_files(tmp_path: str, files: list[tuple[str, bytes]]) -> str:
+    """
+    Create a Canvas zip (under pytest's `tmp_path`) with problem files in the
+    tutorbot folder. `files` is a list of (filename, content_bytes) tuples.
+    """
+    zip_path = tmp_path / "canvas_course_with_problems.zip"
+    with zipfile.ZipFile(zip_path, "w") as zf:
+        for filename, content in files:
+            zf.writestr(f"tutorbot/{filename}", content)
+    return zip_path
+
+
 @pytest.fixture
 def canvas_settings_zip(tmp_path):
     # Create a minimal XML for course_settings.xml
@@ -338,3 +351,80 @@ def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_
 
     # Ensure unpublished content is deleted and unpublished actions called
     bulk_unpub.assert_called_once_with([unpublished_cf.id], CONTENT_FILE_TYPE)
+
+
+def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
+    tmp_path, mocker, settings
+):
+    """
+    A PDF problem file should have its content replaced by the markdown
+    returned from _pdf_to_markdown when a transcription model is configured.
+    """
+    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
+    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
+
+    pdf_name = "problemset1/problem.pdf"
+    archive = canvas_zip_with_problem_files(
+        tmp_path, [(pdf_name, b"%PDF-1.4 fake pdf content")]
+    )
+
+    # Stub the archive walk so it yields exactly one PDF file record
+    pdf_record = {
+        "run": "run",
+        "content": "original pdf content",
+        "archive_checksum": "checksum",
+        "source_path": f"tutorbot/{pdf_name}",
+        "file_extension": ".pdf",
+    }
+    mocker.patch(
+        "learning_resources.etl.canvas._process_olx_path",
+        return_value=iter([pdf_record]),
+    )
+
+    # Stub the transcription so no PDF rendering or LLM call takes place
+    transcribe = mocker.patch(
+        "learning_resources.etl.canvas._pdf_to_markdown",
+        return_value="markdown content from pdf",
+    )
+
+    run = mocker.Mock()
+    results = list(transform_canvas_problem_files(archive, run, overwrite=True))
+
+    transcribe.assert_called_once()
+    assert results[0]["content"] == "markdown content from pdf"
+    assert results[0]["problem_title"] == "problemset1"
+
+
+def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown(
+    tmp_path, mocker, settings
+):
+    """
+    A non-PDF problem file must keep its original content and never be sent
+    through _pdf_to_markdown, even with a transcription model configured.
+    """
+    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
+    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"
+    html_name = "problemset2/problem.html"
+    archive = canvas_zip_with_problem_files(
+        tmp_path, [(html_name, b"<html>problem</html>")]
+    )
+
+    # Stub the archive walk so it yields exactly one HTML file record
+    html_record = {
+        "run": "run",
+        "content": "original html content",
+        "archive_checksum": "checksum",
+        "source_path": f"tutorbot/{html_name}",
+        "file_extension": ".html",
+    }
+    mocker.patch(
+        "learning_resources.etl.canvas._process_olx_path",
+        return_value=iter([html_record]),
+    )
+    transcribe = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")
+    run = mocker.Mock()
+    results = list(transform_canvas_problem_files(archive, run, overwrite=True))
+
+    transcribe.assert_not_called()
+    assert results[0]["content"] == "original html content"
+    assert results[0]["problem_title"] == "problemset2"
diff --git a/main/settings_course_etl.py b/main/settings_course_etl.py
@@ -68,6 +68,14 @@
 CANVAS_COURSE_BUCKET_PREFIX = get_string(
     "CANVAS_COURSE_BUCKET_PREFIX", "canvas/course_content"
 )
+# Model used to transcribe PDF problem sets; transcription is skipped if unset
+CANVAS_PDF_TRANSCRIPTION_MODEL = get_string("CANVAS_PDF_TRANSCRIPTION_MODEL", None)
+# Prompt sent along with each rendered PDF page
+CANVAS_TRANSCRIPTION_PROMPT = get_string(
+    name="CANVAS_TRANSCRIPTION_PROMPT",
+    default="""Transcribe the contents of this file into markdown.
+    Do not include anything but the markdown content in your response""",
+)
 # More MIT URLs
 SEE_API_URL = get_string("SEE_API_URL", None)
 SEE_API_ACCESS_TOKEN_URL = get_string("SEE_API_ACCESS_TOKEN_URL", None)
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -111,6 +111,7 @@ uwsgi = "^2.0.29"
 uwsgitop = "^0.12"
 wrapt = "^1.14.1"
 youtube-transcript-api = "^1.0.0"
+pdf2image = "^1.17.0"
 
 [tool.poetry.group.dev.dependencies]
 bpython = "^0.25"