Skip to content

Commit

Permalink
Fix issue with unpickling HOCRResult
Browse files Browse the repository at this point in the history
Fixes [Bug]: HOCRResult.from_json() not unpickling correctly #1427
  • Loading branch information
jbarlow83 committed Nov 10, 2024
1 parent 5d128a9 commit dbd3c93
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 27 deletions.
45 changes: 19 additions & 26 deletions src/ocrmypdf/_pipelines/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,23 @@ class PageResult(NamedTuple):
"""Orientation correction in degrees."""


class HOCRResultEncoder(json.JSONEncoder):
    """JSON encoder that writes ``Path`` values as ``{'Path': '<text>'}``."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        ``Path`` instances become a one-key dict tagged ``'Path'`` whose value
        is the path's string form; every other object is deferred to
        ``json.JSONEncoder.default``.
        """
        if not isinstance(obj, Path):
            return super().default(obj)
        return {'Path': str(obj)}


class HOCRResultDecoder(json.JSONDecoder):
    """JSON decoder that turns ``{'Path': '<text>'}`` objects into ``Path``.

    Only objects whose single key is ``'Path'`` are converted; any other
    object is returned as the plain dict the JSON parser produced.
    """

    def __init__(self, *args, **kwargs):
        """Install :meth:`dict_to_object` as the decoder's ``object_hook``.

        All other arguments are forwarded unchanged to ``json.JSONDecoder``.
        """
        # Positional unpacking is written before the keyword argument so the
        # call reads in the order Python binds the arguments.
        super().__init__(*args, object_hook=self.dict_to_object, **kwargs)

    def dict_to_object(self, d):
        """Return ``Path(d['Path'])`` for a tagged dict, otherwise *d* itself."""
        # Require the tag to be the ONLY key: an object that merely happens to
        # contain a 'Path' entry next to other data must not be collapsed into
        # a Path, which would silently drop the sibling keys.
        if len(d) == 1 and 'Path' in d:
            return Path(d['Path'])
        return d


@dataclass
class HOCRResult:
"""Result when hOCR is finished processing."""
Expand All @@ -123,38 +140,14 @@ class HOCRResult:
orientation_correction: int = 0
"""Orientation correction in degrees."""

def __getstate__(self):
    """Return the instance state with path fields encoded as strings.

    The ``pdf_page_from_image``, ``hocr`` and ``textpdf`` values are written
    as ``'Path://'`` followed by their string form unless they are ``None``;
    every other attribute is copied unchanged.
    """
    path_fields = ('pdf_page_from_image', 'hocr', 'textpdf')
    state = {}
    for name, value in self.__dict__.items():
        if name in path_fields and value is not None:
            value = 'Path://' + str(value)
        state[name] = value
    return state

def __setstate__(self, state):
    """Restore attributes from a state dict in the ``__getstate__`` format.

    Non-``None`` values of ``pdf_page_from_image``, ``hocr`` and ``textpdf``
    have their ``'Path://'`` prefix removed and are wrapped in ``Path``;
    every other entry is stored as given.
    """
    path_fields = ('pdf_page_from_image', 'hocr', 'textpdf')
    restored = {}
    for name, value in state.items():
        if name in path_fields and value is not None:
            value = Path(value.removeprefix('Path://'))
        restored[name] = value
    # Apply everything in one step, after all values converted successfully.
    self.__dict__.update(restored)

@classmethod
def from_json(cls, json_str: str) -> HOCRResult:
    """Create an instance from a JSON string in the ``to_json`` format.

    The string is parsed with ``HOCRResultDecoder`` and the resulting
    mapping is passed to the constructor as keyword arguments.
    """
    # A single return: an earlier plain ``json.loads`` return made this
    # decoder-based one unreachable.
    return cls(**json.loads(json_str, cls=HOCRResultDecoder))

def to_json(self) -> str:
    """Serialize the instance attributes to a JSON string.

    ``self.__dict__`` is dumped with ``HOCRResultEncoder``, the counterpart
    of the decoder used by ``from_json``.
    """
    # A single return: an earlier ``__getstate__``-based return made this
    # encoder-based one unreachable.
    return json.dumps(self.__dict__, cls=HOCRResultEncoder)


def configure_debug_logging(
Expand Down
30 changes: 29 additions & 1 deletion tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

from __future__ import annotations

import pickle
from io import BytesIO
from pathlib import Path

import pytest
from pdfminer.high_level import extract_text

import ocrmypdf
import ocrmypdf._pipelines
import ocrmypdf.api


Expand All @@ -35,7 +37,7 @@ def test_sidecar_stringio(resources: Path, outdir: Path, outpdf: Path):
resources / 'ccitt.pdf',
outpdf,
plugins=['tests/plugins/tesseract_cache.py'],
sidecar=s
sidecar=s,
)
s.seek(0)
assert b'the' in s.getvalue()
Expand Down Expand Up @@ -75,3 +77,29 @@ def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path):
text = extract_text(outpdf)
assert 'hocr' in text and 'the' not in text


def test_hocr_result_json():
    """Check the exact JSON form of an HOCRResult and that it round-trips."""
    hocr_result_cls = ocrmypdf._pipelines._common.HOCRResult
    original = hocr_result_cls(
        pageno=1,
        pdf_page_from_image=Path('a'),
        hocr=Path('b'),
        textpdf=Path('c'),
        orientation_correction=180,
    )
    expected_json = (
        '{"pageno": 1, '
        '"pdf_page_from_image": {"Path": "a"}, '
        '"hocr": {"Path": "b"}, '
        '"textpdf": {"Path": "c"}, '
        '"orientation_correction": 180}'
    )
    assert original.to_json() == expected_json

    # Decoding the serialized form must give back an equal instance.
    restored = hocr_result_cls.from_json(original.to_json())
    assert restored == original


def test_hocr_result_pickle():
    """Check that an HOCRResult compares equal to its pickle round trip."""
    field_values = {
        'pageno': 1,
        'pdf_page_from_image': Path('a'),
        'hocr': Path('b'),
        'textpdf': Path('c'),
        'orientation_correction': 180,
    }
    before = ocrmypdf._pipelines._common.HOCRResult(**field_values)
    after = pickle.loads(pickle.dumps(before))
    assert before == after

0 comments on commit dbd3c93

Please sign in to comment.