Skip to content

Commit 6fac716

Browse files
authored
Process pdf problem sets (#2402)
* adding transcription model as setting * adding initial pdf based approach * adding dependencies for pdf to image conversion * constraining pdf size and moving prompt to settings * removing unused pydantic model * adding custom settings for litellm and fixing types for test method
1 parent 2a7701d commit 6fac716

File tree

6 files changed

+203
-2
lines changed

6 files changed

+203
-2
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ COPY apt.txt /tmp/apt.txt
1111
RUN apt-get update && \
1212
apt-get install -y --no-install-recommends $(grep -vE "^\s*#" apt.txt | tr "\n" " ") && \
1313
apt-get install libpq-dev postgresql-client -y --no-install-recommends && \
14+
apt-get install poppler-utils -y && \
1415
apt-get clean && \
1516
apt-get purge && \
1617
rm -rf /var/lib/apt/lists/*

learning_resources/etl/canvas.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
1+
import base64
12
import logging
23
import sys
34
import zipfile
45
from collections import defaultdict
56
from collections.abc import Generator
7+
from io import BytesIO
68
from pathlib import Path
79
from tempfile import TemporaryDirectory
810

911
from defusedxml import ElementTree
1012
from django.conf import settings
13+
from litellm import completion
14+
from pdf2image import convert_from_path
15+
from PIL import Image
1116

1217
from learning_resources.constants import (
1318
VALID_TUTOR_PROBLEM_TYPES,
@@ -180,6 +185,7 @@ def transform_canvas_problem_files(
180185
problem_file_data = {
181186
key: file_data[key] for key in keys_to_keep if key in file_data
182187
}
188+
183189
path = file_data["source_path"]
184190
path = path[len(settings.CANVAS_TUTORBOT_FOLDER) :]
185191
path_parts = path.split("/", 1)
@@ -188,7 +194,15 @@ def transform_canvas_problem_files(
188194
if problem_type in path_parts[1].lower():
189195
problem_file_data["type"] = problem_type
190196
break
191-
197+
if (
198+
problem_file_data["file_extension"].lower() == ".pdf"
199+
and settings.CANVAS_PDF_TRANSCRIPTION_MODEL
200+
):
201+
markdown_content = _pdf_to_markdown(
202+
Path(olx_path) / Path(problem_file_data["source_path"])
203+
)
204+
if markdown_content:
205+
problem_file_data["content"] = markdown_content
192206
yield problem_file_data
193207

194208

@@ -269,3 +283,75 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
269283
{"title": title, "files": files, "type": resource.get("type")}
270284
)
271285
return dict(resources_dict)
286+
287+
288+
def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
    """
    Render each page of a PDF to an image and return the pages base64 encoded.

    Pages whose longest side exceeds ``max_size`` are shrunk in place
    (aspect ratio preserved) before being serialized.

    Args:
        pdf_path (str): Path to the PDF file
        dpi (int): DPI used when rendering the pages (default: 200)
        fmt (str): Output format; 'JPEG' (case-insensitive) selects JPEG,
            any other value is written as PNG (default: 'JPEG')
        max_size (int): Maximum width/height in pixels (default: 2000)
        quality (int): JPEG quality (1-100, default: 85); unused for PNG

    Returns:
        list: List of base64 encoded strings (one per page, in page order)
    """
    pages = convert_from_path(pdf_path, dpi=dpi)

    def _encode_page(page):
        # Shrink oversized pages; thumbnail() keeps the aspect ratio
        if max(page.size) > max_size:
            page.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)

        # Pick the encoder options for the requested output format
        if fmt.upper() == "JPEG":
            save_options = {"format": "JPEG", "quality": quality, "optimize": True}
        else:  # PNG
            save_options = {"format": "PNG", "optimize": True}

        buffer = BytesIO()
        page.save(buffer, **save_options)
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    return [_encode_page(page) for page in pages]
323+
324+
325+
def _pdf_to_markdown(pdf_path):
    """
    Transcribe a PDF file into markdown, one model call per page.

    Every page image produced by ``pdf_to_base64_images`` is sent, together
    with ``settings.CANVAS_TRANSCRIPTION_PROMPT``, to the model named by
    ``settings.CANVAS_PDF_TRANSCRIPTION_MODEL`` through litellm. A wrapping
    markdown code fence is stripped from each reply and the per-page results
    are concatenated in page order.

    Args:
        pdf_path (str or Path): Path to the PDF file

    Returns:
        str: The concatenated markdown for all pages ("" when no page
            yields any content)
    """
    page_snippets = []
    for page_image in pdf_to_base64_images(pdf_path):
        response = completion(
            api_base=settings.LITELLM_API_BASE,
            custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
            model=settings.CANVAS_PDF_TRANSCRIPTION_MODEL,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": settings.CANVAS_TRANSCRIPTION_PROMPT,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # pdf_to_base64_images emits JPEG by default
                                "url": f"data:image/jpeg;base64,{page_image}",
                            },
                        },
                    ],
                }
            ],
        )
        # The message content can be null in a chat completion reply; treat
        # that as an empty page instead of failing on None.removeprefix().
        content = response.json()["choices"][0]["message"]["content"] or ""
        # Models often wrap the answer in a ```markdown fence; drop it.
        page_snippets.append(
            content.removeprefix("```markdown\n").removesuffix("\n```")
        )
    return "".join(page_snippets)

learning_resources/etl/canvas_test.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
parse_module_meta,
1212
run_for_canvas_archive,
1313
transform_canvas_content_files,
14+
transform_canvas_problem_files,
1415
)
1516
from learning_resources.etl.constants import ETLSource
1617
from learning_resources.etl.utils import get_edx_module_id
@@ -32,6 +33,18 @@ def canvas_platform():
3233
return LearningResourcePlatformFactory.create(code=PlatformType.canvas.name)
3334

3435

36+
def canvas_zip_with_problem_files(tmp_path, files: list[tuple[str, bytes]]):
    """
    Create a Canvas zip with problem files in the tutorbot folder.

    Args:
        tmp_path: Directory in which to create the archive. It must support
            the ``/`` operator (e.g. a ``pathlib.Path`` such as pytest's
            ``tmp_path`` fixture); a plain ``str`` will not work.
        files: A list of tuples: (filename, content_bytes). Each file is
            stored in the archive as ``tutorbot/<filename>``.

    Returns:
        The path of the created zip file
        (``tmp_path / "canvas_course_with_problems.zip"``).
    """
    zip_path = tmp_path / "canvas_course_with_problems.zip"
    with zipfile.ZipFile(zip_path, "w") as zf:
        for filename, content in files:
            zf.writestr(f"tutorbot/{filename}", content)
    return zip_path
46+
47+
3548
@pytest.fixture
3649
def canvas_settings_zip(tmp_path):
3750
# Create a minimal XML for course_settings.xml
@@ -338,3 +351,80 @@ def test_transform_canvas_content_files_removes_unpublished_content(mocker, tmp_
338351

339352
# Ensure unpublished content is deleted and unpublished actions called
340353
bulk_unpub.assert_called_once_with([unpublished_cf.id], CONTENT_FILE_TYPE)
354+
355+
356+
def test_transform_canvas_problem_files_pdf_calls_pdf_to_markdown(
    tmp_path, mocker, settings
):
    """
    A PDF problem file is transcribed: _pdf_to_markdown is called once and
    its return value replaces the file's content.
    """
    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"

    pdf_name = "problemset1/problem.pdf"
    archive_path = canvas_zip_with_problem_files(
        tmp_path, [(pdf_name, b"%PDF-1.4 fake pdf content")]
    )

    # Stub the transcription helper so no model call is made
    transcribe_mock = mocker.patch(
        "learning_resources.etl.canvas._pdf_to_markdown",
        return_value="markdown content from pdf",
    )

    # Make the archive processing yield a single file with a .pdf extension
    pdf_file_data = {
        "run": "run",
        "content": "original pdf content",
        "archive_checksum": "checksum",
        "source_path": f"tutorbot/{pdf_name}",
        "file_extension": ".pdf",
    }
    mocker.patch(
        "learning_resources.etl.canvas._process_olx_path",
        return_value=iter([pdf_file_data]),
    )

    # Stand-in for the learning resource run passed to the transform
    run = mocker.Mock()

    transformed = list(
        transform_canvas_problem_files(archive_path, run, overwrite=True)
    )

    transcribe_mock.assert_called_once()
    first_result = transformed[0]
    assert first_result["content"] == "markdown content from pdf"
    assert first_result["problem_title"] == "problemset1"
396+
397+
398+
def test_transform_canvas_problem_files_non_pdf_does_not_call_pdf_to_markdown(
    tmp_path, mocker, settings
):
    """
    A non-PDF problem file keeps its original content and never reaches
    _pdf_to_markdown, even when a transcription model is configured.
    """
    settings.CANVAS_TUTORBOT_FOLDER = "tutorbot/"
    settings.CANVAS_PDF_TRANSCRIPTION_MODEL = "fake-model"

    html_name = "problemset2/problem.html"
    archive_path = canvas_zip_with_problem_files(
        tmp_path, [(html_name, b"<html>problem</html>")]
    )

    transcribe_mock = mocker.patch("learning_resources.etl.canvas._pdf_to_markdown")

    # Make the archive processing yield a single file with an .html extension
    html_file_data = {
        "run": "run",
        "content": "original html content",
        "archive_checksum": "checksum",
        "source_path": f"tutorbot/{html_name}",
        "file_extension": ".html",
    }
    mocker.patch(
        "learning_resources.etl.canvas._process_olx_path",
        return_value=iter([html_file_data]),
    )

    # Stand-in for the learning resource run passed to the transform
    run = mocker.Mock()

    transformed = list(
        transform_canvas_problem_files(archive_path, run, overwrite=True)
    )

    transcribe_mock.assert_not_called()
    first_result = transformed[0]
    assert first_result["content"] == "original html content"
    assert first_result["problem_title"] == "problemset2"

main/settings_course_etl.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@
6868
CANVAS_COURSE_BUCKET_PREFIX = get_string(
6969
"CANVAS_COURSE_BUCKET_PREFIX", "canvas/course_content"
7070
)
71+
CANVAS_PDF_TRANSCRIPTION_MODEL = get_string(
72+
name="CANVAS_PDF_TRANSCRIPTION_MODEL", default=None
73+
)
74+
CANVAS_TRANSCRIPTION_PROMPT = get_string(
75+
"CANVAS_TRANSCRIPTION_PROMPT",
76+
"""Transcribe the contents of this file into markdown.
77+
Do not include anything but the markdown content in your response""",
78+
)
7179
# More MIT URLs
7280
SEE_API_URL = get_string("SEE_API_URL", None)
7381
SEE_API_ACCESS_TOKEN_URL = get_string("SEE_API_ACCESS_TOKEN_URL", None)

poetry.lock

Lines changed: 16 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ uwsgi = "^2.0.29"
111111
uwsgitop = "^0.12"
112112
wrapt = "^1.14.1"
113113
youtube-transcript-api = "^1.0.0"
114+
pdf2image = "^1.17.0"
114115

115116
[tool.poetry.group.dev.dependencies]
116117
bpython = "^0.25"

0 commit comments

Comments
 (0)