Skip to content

Commit 78c292c

Browse files
committed
added support Tesseract config variables
1 parent 3a67ea6 commit 78c292c

File tree

5 files changed

+149
-30
lines changed

5 files changed

+149
-30
lines changed

.tool-versions

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
python 3.12.0
1+
python 3.12.1

README.md

Lines changed: 109 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,67 +18,123 @@ pip install aiopytesseract
1818

1919
## Usage
2020

21-
```python
22-
from pathlib import Path
21+
### List all languages available in the Tesseract installation
2322

23+
``` python
2424
import aiopytesseract
2525

26-
27-
# list all available languages by tesseract installation
2826
await aiopytesseract.languages()
2927
await aiopytesseract.get_languages()
28+
```
3029

30+
### Tesseract version
31+
32+
``` python
33+
import aiopytesseract
3134

32-
# tesseract version
3335
await aiopytesseract.tesseract_version()
3436
await aiopytesseract.get_tesseract_version()
37+
```
38+
39+
### Tesseract parameters
3540

41+
``` python
42+
import aiopytesseract
3643

37-
# tesseract parameters
3844
await aiopytesseract.tesseract_parameters()
45+
```
3946

47+
### Confidence only info
48+
49+
``` python
50+
import aiopytesseract
4051

41-
# confidence only info
4252
await aiopytesseract.confidence("tests/samples/file-sample_150kB.png")
53+
```
4354

55+
### Deskew info
56+
57+
``` python
58+
import aiopytesseract
4459

45-
# deskew info
4660
await aiopytesseract.deskew("tests/samples/file-sample_150kB.png")
61+
```
4762

63+
### Extract text from an image: locally or bytes
64+
65+
``` python
66+
from pathlib import Path
67+
68+
import aiopytesseract
4869

49-
# extract text from an image: locally or bytes
5070
await aiopytesseract.image_to_string("tests/samples/file-sample_150kB.png")
5171
await aiopytesseract.image_to_string(
52-
Path("tests/samples/file-sample_150kB.png")read_bytes(), dpi=220, lang='eng+por'
72+
Path("tests/samples/file-sample_150kB.png").read_bytes(), dpi=220, lang='eng+por'
5373
)
74+
```
75+
76+
### Box estimates
5477

78+
``` python
79+
from pathlib import Path
80+
81+
import aiopytesseract
5582

56-
# box estimates
5783
await aiopytesseract.image_to_boxes("tests/samples/file-sample_150kB.png")
5884
await aiopytesseract.image_to_boxes(Path("tests/samples/file-sample_150kB.png"))
85+
```
86+
87+
### Boxes, confidence and page numbers
88+
89+
``` python
90+
from pathlib import Path
5991

92+
import aiopytesseract
6093

61-
# boxes, confidence and page numbers
6294
await aiopytesseract.image_to_data("tests/samples/file-sample_150kB.png")
6395
await aiopytesseract.image_to_data(Path("tests/samples/file-sample_150kB.png"))
96+
```
6497

98+
### Information about orientation and script detection
99+
100+
``` python
101+
from pathlib import Path
102+
103+
import aiopytesseract
65104

66-
# information about orientation and script detection
67105
await aiopytesseract.image_to_osd("tests/samples/file-sample_150kB.png")
68106
await aiopytesseract.image_to_osd(Path("tests/samples/file-sample_150kB.png"))
107+
```
108+
109+
### Generate a searchable PDF
110+
111+
``` python
112+
from pathlib import Path
69113

114+
import aiopytesseract
70115

71-
# generate a searchable PDF
72116
await aiopytesseract.image_to_pdf("tests/samples/file-sample_150kB.png")
73117
await aiopytesseract.image_to_pdf(Path("tests/samples/file-sample_150kB.png"))
118+
```
119+
120+
### Generate HOCR output
121+
122+
``` python
123+
from pathlib import Path
74124

125+
import aiopytesseract
75126

76-
# generate HOCR output
77127
await aiopytesseract.image_to_hocr("tests/samples/file-sample_150kB.png")
78128
await aiopytesseract.image_to_hocr(Path("tests/samples/file-sample_150kB.png"))
129+
```
79130

131+
### Multi output
132+
133+
``` python
134+
from pathlib import Path
135+
136+
import aiopytesseract
80137

81-
# multi ouput
82138
async with aiopytesseract.run(
83139
Path('tests/samples/file-sample_150kB.png').read_bytes(),
84140
'output',
@@ -89,7 +145,43 @@ async with aiopytesseract.run(
89145
alto_file, tsv_file, txt_file = resp
90146
```
91147

92-
For more details on Tesseract best practices and the aiopytesseract, see the folder: `docs`.
148+
### Config variables
149+
150+
``` python
151+
from pathlib import Path
152+
153+
import aiopytesseract
154+
155+
async with aiopytesseract.run(
156+
Path('tests/samples/text-with-chars-and-numbers.png').read_bytes(),
157+
'output',
158+
'alto tsv txt',
159+
config=[("tessedit_char_whitelist", "0123456789")]
160+
) as resp:
161+
# will generate (output.xml, output.tsv and output.txt)
162+
print(resp)
163+
alto_file, tsv_file, txt_file = resp
164+
```
165+
166+
``` python
167+
from pathlib import Path
168+
169+
import aiopytesseract
170+
171+
await aiopytesseract.image_to_string(
172+
"tests/samples/text-with-chars-and-numbers.png",
173+
config=[("tessedit_char_whitelist", "0123456789")]
174+
)
175+
176+
await aiopytesseract.image_to_string(
177+
Path("tests/samples/text-with-chars-and-numbers.png").read_bytes(),
178+
dpi=220,
179+
lang='eng+por',
180+
config=[("tessedit_char_whitelist", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")]
181+
)
182+
```
183+
184+
> For more details on Tesseract best practices and on aiopytesseract itself, see the `docs` folder.
93185

94186
## Examples
95187

aiopytesseract/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
)
1717
from .models import OSD, Box, Data, Parameter
1818

19-
__version__ = "0.13.0"
19+
__version__ = "0.14.0"
2020
__all__ = [
2121
"__version__",
2222
"OSD",

aiopytesseract/base_command.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ async def execute(
5151
user_words: Union[None, str] = None,
5252
user_patterns: Union[None, str] = None,
5353
tessdata_dir: Union[None, str] = None,
54+
config: Union[None, List[Tuple[str, str]]] = None,
55+
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
5456
) -> bytes:
5557
raise NotImplementedError
5658

@@ -67,6 +69,8 @@ async def _(
6769
user_words: Union[None, str] = None,
6870
user_patterns: Union[None, str] = None,
6971
tessdata_dir: Union[None, str] = None,
72+
config: Union[None, List[Tuple[str, str]]] = None,
73+
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
7074
) -> bytes:
7175
await file_exists(image)
7276
response: bytes = await execute(
@@ -80,6 +84,8 @@ async def _(
8084
user_words=user_words,
8185
user_patterns=user_patterns,
8286
tessdata_dir=tessdata_dir,
87+
config=config,
88+
encoding=encoding,
8389
)
8490
return response
8591

@@ -89,24 +95,26 @@ async def _(
8995
image: bytes,
9096
output_format: str,
9197
dpi: int,
92-
lang: Union[None, str],
9398
psm: int,
9499
oem: int,
95100
timeout: float,
101+
lang: Union[None, str] = None,
96102
user_words: Union[None, str] = None,
97103
user_patterns: Union[None, str] = None,
98104
tessdata_dir: Union[None, str] = None,
105+
config: Union[None, List[Tuple[str, str]]] = None,
99106
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
100107
) -> bytes:
101108
cmd_args = await _build_cmd_args(
102109
output_extension=output_format,
103110
dpi=dpi,
104111
psm=psm,
105112
oem=oem,
113+
lang=lang,
106114
user_words=user_words,
107115
user_patterns=user_patterns,
108116
tessdata_dir=tessdata_dir,
109-
lang=lang,
117+
config=config,
110118
)
111119
try:
112120
proc = await asyncio.wait_for(
@@ -142,6 +150,7 @@ async def execute_multi_output_cmd(
142150
user_words: Union[None, str] = None,
143151
user_patterns: Union[None, str] = None,
144152
tessdata_dir: Union[None, str] = None,
153+
config: Union[None, List[Tuple[str, str]]] = None,
145154
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
146155
) -> Tuple[str, ...]:
147156
cmd_args = await _build_cmd_args(
@@ -154,6 +163,7 @@ async def execute_multi_output_cmd(
154163
tessdata_dir=tessdata_dir,
155164
lang=lang,
156165
output=output_file,
166+
config=config,
157167
)
158168
try:
159169
proc = await asyncio.wait_for(
@@ -187,6 +197,7 @@ async def _build_cmd_args(
187197
tessdata_dir: Union[None, str] = None,
188198
lang: Union[None, str] = None,
189199
output: str = "stdout",
200+
config: Union[None, List[Tuple[str, str]]] = None,
190201
) -> List[str]:
191202
await asyncio.gather(psm_is_valid(psm), oem_is_valid(oem))
192203
# OCR options must occur before any configfile.
@@ -212,6 +223,11 @@ async def _build_cmd_args(
212223
cmd_args.append("-l")
213224
cmd_args.append(lang)
214225

226+
if config:
227+
for option, value in config:
228+
cmd_args.append("-c")
229+
cmd_args.append(f"{option}={value} ")
230+
215231
extension = reversed(output_extension.split())
216232
for ext in extension:
217233
cmd_args.append(ext)

0 commit comments

Comments
 (0)