-
Notifications
You must be signed in to change notification settings - Fork 680
Description
Description of the bug
Hi, I'm trying to process PDFs in order to translate them. While working with widgets, I am unable to update their text correctly. I understand that I can set the button_caption field to update the text, but I also need the original text in order to translate it. However, when I read the original button caption, it is None.
This is a sample dict representation of the widget:
{'border_color': None, 'border_style': 'Solid', 'border_width': 1, 'border_dashes': None, 'choice_values': None, 'rb_parent': None, 'field_name': 'TOC.Page 2', 'field_label': None, 'field_value': '', 'field_flags': 65536, 'field_display': 0, 'field_type': 1, 'field_type_string': 'Button', 'fill_color': None, 'button_caption': None, 'is_signed': None, 'text_color': (0, 0, 0), 'text_font': 'Helv', 'text_fontsize': 0, 'text_maxlen': 0, 'text_format': 0, '_text_da': '', 'script': None, 'script_stroke': None, 'script_format': None, 'script_change': None, 'script_calc': None, 'script_blur': None, 'script_focus': None, 'rect': Rect(778.302001953125, 6.64398193359375, 793.7509765625, 29.593994140625), 'xref': 654, 'parent': <weakproxy at 0x741d3acba2a0 to Page at 0x741d3ac47580>, '_annot': 'Widget' annotation on page 1 of <memory, doc# 1>}
heree None {'border_color': None, 'border_style': 'Solid', 'border_width': 1, 'border_dashes': None, 'choice_values': None, 'rb_parent': None, 'field_name': 'References.Page 2', 'field_label': None, 'field_value': '', 'field_flags': 65536, 'field_display': 0, 'field_type': 1, 'field_type_string': 'Button', 'fill_color': None, 'button_caption': None, 'is_signed': None, 'text_color': (0, 0, 0), 'text_font': 'Helv', 'text_fontsize': 0, 'text_maxlen': 0, 'text_format': 0, '_text_da': '', 'script': None, 'script_stroke': None, 'script_format': None, 'script_change': None, 'script_calc': None, 'script_blur': None, 'script_focus': None, 'rect': Rect(808.3410034179688, 6.957000732421875, 821.7559814453125, 29.593994140625), 'xref': 653, 'parent': <weakproxy at 0x741d3acba2a0 to Page at 0x741d3ac47580>, '_annot': 'Widget' annotation on page 1 of <memory, doc# 1>}
The field_name is what I see rendered on the PDF (i.e., "TOC" in this case), and this is true for all the widgets in this PDF. My question is: is that the norm? Can I generalize this to all PDFs, i.e., obtain the button text by taking the field name minus the page number? Or is there more to it? I would appreciate any guidance that helps cover all bases.
How to reproduce the bug
You can try this pdf: superstar-messaging.pdf
Code:
def extract_text_blocks_1(self, doc):
    """Collect the non-blank text spans of every page in ``doc``.

    Each page is read with ``page.get_text("dict", ...)`` and its
    blocks -> lines -> spans are walked. Spans whose text is empty or
    whitespace-only are dropped.

    Args:
        doc: Document object supporting ``len()`` and ``load_page(index)``.

    Returns:
        A list with one entry per page, in page order. Each entry is a list
        of dicts with the keys ``"text"``, ``"bbox"``, ``"font_properties"``
        and ``"angle"``.
    """
    pages_blocks = []
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        page_dict = page.get_text(
            "dict", sort=True, clip=pymupdf.INFINITE_RECT()
        )
        page_blocks = []
        for block in page_dict["blocks"]:
            # Blocks without a "lines" key contribute nothing.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"]
                    # NOTE(review): the snippet was posted without
                    # indentation; this assumes the append was nested under
                    # the non-blank check - confirm.
                    if not text.strip():
                        continue

                    bbox = span["bbox"]
                    if isinstance(bbox, (list, tuple)) and len(bbox) == 4:
                        bbox = [float(coord) for coord in bbox]

                    # Decompose the flags once per kept span instead of
                    # twice per span (including the discarded ones).
                    decomposed = self.flags_decomposer(span["flags"])
                    # NOTE(review): "bold" and "italic" are both derived
                    # from the same truthiness test, so a span is always
                    # either both or neither. Looks unintended - confirm
                    # what flags_decomposer returns and test each property
                    # separately.
                    font_properties = {
                        "size": span["size"],
                        "color": "#%06x" % span["color"],
                        "font-family": "%s" % span["font"],
                        "font-weight": "bold" if decomposed else "normal",
                        "font-style": "italic" if decomposed else "normal",
                    }
                    page_blocks.append({
                        "text": text,
                        "bbox": bbox,
                        "font_properties": font_properties,
                        "angle": self.get_angle(bbox, line["dir"]),
                    })
        pages_blocks.append(page_blocks)
    return pages_blocks
# Open the PDF from an in-memory stream and extract its text spans per page.
doc = pymupdf.open(stream=file, filetype="pdf")
pages_blocks = self.extract_text_blocks_1(doc)
count_rc = 0
logger.info(f"{len(pages_blocks)} Pages found")

# Only the first two pages are visited.
for page_num, page_blocks in enumerate(pages_blocks[:2]):
    page = doc.load_page(page_num)
    logger.info(f"Page no: {page_num}")
    for widget in page.widgets():
        # Only button widgets are of interest here.
        if widget.field_type != pymupdf.PDF_WIDGET_TYPE_BUTTON:
            continue
        # Debug output: the caption is reported as None for these buttons.
        print("heree", widget.button_caption, widget.__dict__)
        text = ""  # how to get text?
        contains_letter = any(c.isalpha() for c in text)
        if not contains_letter:
            continue
        translated_text = translate_file_text_function(text)  # how to get text?
        widget.button_caption = translated_text
        # NOTE(review): update() is assumed to belong inside the letter
        # check; the posted snippet lost its indentation - confirm.
        widget.update()
PyMuPDF version
1.25.4
Operating system
Linux
Python version
3.10