Skip to content

Commit e840928

Browse files
committed
update pdf image extractor code to fit the new version of PyMuPDF
1 parent 94ee964 commit e840928

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

web-scraping/pdf-image-extractor/pdf_image_extractor.py

+4-3
Original file line number | Diff line number | Diff line change
@@ -10,17 +10,18 @@
1010
for page_index in range(len(pdf_file)):
1111
# get the page itself
1212
page = pdf_file[page_index]
13-
image_list = page.getImageList()
13+
# get image list
14+
image_list = page.get_images()
1415
# printing number of images found in this page
1516
if image_list:
1617
print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
1718
else:
1819
print("[!] No images found on page", page_index)
19-
for image_index, img in enumerate(page.getImageList(), start=1):
20+
for image_index, img in enumerate(image_list, start=1):
2021
# get the XREF of the image
2122
xref = img[0]
2223
# extract the image bytes
23-
base_image = pdf_file.extractImage(xref)
24+
base_image = pdf_file.extract_image(xref)
2425
image_bytes = base_image["image"]
2526
# get the image extension
2627
image_ext = base_image["ext"]

0 commit comments

Comments
 (0)