
Commit 8cd475f

Remove pandas dependency in minimal requirements

1 parent 99e019f · commit 8cd475f

24 files changed: +486 -213 lines changed

MANIFEST.in (+3)

@@ -0,0 +1,3 @@
+include *.txt
+recursive-include extras *.txt
+recursive-include textractor *

NOTICE (+1 -1)

@@ -1,2 +1,2 @@
 Amazon Textract Textractor
-Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.

README.md (+6 -5)

@@ -14,13 +14,14 @@ If you are looking for the other amazon-textract-* packages, you can find them u
 
 ## Installation
 
-Textractor is available on PyPI and can be installed with `pip install amazon-textract-textractor`. By default this will install the minimal version of textractor. The following extras can be used to add features:
+Textractor is available on PyPI and can be installed with `pip install amazon-textract-textractor`. By default this will install the minimal version of Textractor which is suitable for lambda execution. The following extras can be used to add features:
 
-- `pdf` (`pip install amazon-textract-textractor[pdf]`) includes `pdf2image` and enables PDF rasterization in Textractor. Note that this is **not** necessary to call Textract with a PDF file.
-- `torch` (`pip install amazon-textract-textractor[torch]`) includes `sentence_transformers` for better word search and matching. This will work on CPU but be noticeably slower than non-machine learning based approaches.
-- `dev` (`pip install amazon-textract-textractor[dev]`) includes all the dependencies above and everything else needed to test the code.
+- `pandas` (`pip install "amazon-textract-textractor[pandas]"`) installs pandas which is used to enable DataFrame and CSV exports.
+- `pdf` (`pip install "amazon-textract-textractor[pdf]"`) includes `pdf2image` and enables PDF rasterization in Textractor. Note that this is **not** necessary to call Textract with a PDF file.
+- `torch` (`pip install "amazon-textract-textractor[torch]"`) includes `sentence_transformers` for better word search and matching. This will work on CPU but be noticeably slower than non-machine learning based approaches.
+- `dev` (`pip install "amazon-textract-textractor[dev]"`) includes all the dependencies above and everything else needed to test the code.
 
-You can pick several extras by separating the labels with commas like this `pip install amazon-textract-textractor[pdf,torch]`.
+You can pick several extras by separating the labels with commas like this `pip install "amazon-textract-textractor[pdf,torch]"`.
 
 
 ## Documentation

docs/source/notebooks/imgs/excel.png

401 KB

docs/source/notebooks/table_data_to_various_formats.ipynb (+73 -11)

Large diffs are not rendered by default.
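The notebook update above walks through exporting recognized table data, which is what the new `pandas` extra enables. A minimal sketch of that workflow, assuming a saved Textract response and that tables expose `to_pandas()`/`to_csv()` helpers (names inferred from the notebook topic, not shown in this diff):

```python
# Sketch only: requires the optional pandas extra
# (pip install "amazon-textract-textractor[pandas]").
from textractor.entities.document import Document

# "response.json" is a hypothetical saved AnalyzeDocument response with TABLES enabled.
document = Document.open("response.json")

for i, table in enumerate(document.tables):
    df = table.to_pandas()  # assumed helper returning a pandas DataFrame
    df.to_csv(f"table_{i}.csv", index=False)
    print(df.head())
```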

extras/pandas.txt (+2)

@@ -0,0 +1,2 @@
+numpy==1.21.*
+pandas

requirements.txt (-2)

@@ -1,12 +1,10 @@
-numpy==1.21.*
 awscli
 amazon-textract-response-parser==0.1.33
 amazon-textract-caller==0.0.24
 boto3==1.24.*
 botocore==1.27.90
 jsonschema
 Pillow
-pandas
 tabulate==0.8.10
 XlsxWriter==3.0.3
 pyxDamerauLevenshtein==1.7.1

setup.py (+1 -1)

@@ -16,7 +16,7 @@ def read_requirements(path):
 setup(
     # include data files
     name="amazon-textract-textractor",
-    version="1.0.5",
+    version="1.0.6",
     description="A package to use AWS Textract services.",
     long_description=long_description,
     long_description_content_type="text/markdown",
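The `read_requirements` helper visible in the hunk header is presumably how the new `extras/pandas.txt` file gets mapped to a pip extra; the actual `extras_require` block is outside this hunk. A hedged sketch of what that wiring could look like (the other extras files are assumptions):

```python
# Hypothetical wiring of extras/*.txt into extras_require; the real setup.py may differ.
from setuptools import setup


def read_requirements(path):
    """Read one requirement per line, skipping blanks and comments."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip() and not line.startswith("#")]


setup(
    name="amazon-textract-textractor",
    version="1.0.6",
    install_requires=read_requirements("requirements.txt"),
    extras_require={
        "pandas": read_requirements("extras/pandas.txt"),  # added in this commit
        "pdf": read_requirements("extras/pdf.txt"),        # assumed pre-existing extras files
        "torch": read_requirements("extras/torch.txt"),
        "dev": read_requirements("extras/dev.txt"),
    },
)
```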

textractor/cli/cli.py (+212 -64)

Large diffs are not rendered by default.

textractor/data/constants.py (+2)

@@ -209,6 +209,7 @@ class AnalyzeIDFields(Enum):
     # Only available in passports
     PLACE_OF_BIRTH = "PLACE_OF_BIRTH"
 
+
 class CLIPrint(Enum):
     ALL = 0
     TEXT = 1
@@ -218,6 +219,7 @@ class CLIPrint(Enum):
     EXPENSES = 5
     IDS = 6
 
+
 class CLIOverlay(Enum):
     ALL = 0
     WORDS = 1

textractor/entities/bbox.py (+7 -1)

@@ -3,7 +3,13 @@
 
 from abc import ABC
 from typing import Tuple
-import numpy as np
+
+try:
+    import numpy as np
+except ImportError:
+    # Used in an export_as_numpy function which won't be called if the user doesn't have numpy.
+    pass
+
 from typing import Dict
 from dataclasses import dataclass
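The guarded import above is what lets the minimal install drop numpy: the module still loads, and only a numpy-dependent export path would fail. A generic sketch of the pattern (illustrative only, not the actual `export_as_numpy` implementation, which is outside this hunk):

```python
# Generic optional-dependency pattern; illustrative, not the real bbox.py code.
try:
    import numpy as np
    _HAS_NUMPY = True
except ImportError:
    _HAS_NUMPY = False  # numpy is only needed when exporting as an array


def export_as_numpy(x: float, y: float, width: float, height: float):
    """Return the bounding box as a numpy array; needs the optional numpy dependency."""
    if not _HAS_NUMPY:
        raise ImportError("numpy is required for export_as_numpy(); install numpy to use it.")
    return np.array([x, y, width, height])
```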

textractor/entities/document.py (+29 -26)

@@ -46,7 +46,7 @@ class Document(SpatialObject):
     """
 
     @classmethod
-    def open(self, fp: Union[dict, str, IO[AnyStr]]):
+    def open(cls, fp: Union[dict, str, IO[AnyStr]]):
         """Create a Document object from a JSON file path, file handle or response dictionary
 
         :param fp: _description_
@@ -297,13 +297,12 @@ def filter_checkboxes(
         :return: Returns checkboxes that match the conditions set by the flags.
         :rtype: EntityList[KeyValue]
         """
-
+
         checkboxes = EntityList([])
         for page in self.pages:
-            checkboxes.extend(page.filter_checkboxes(
-                selected=selected,
-                not_selected=not_selected
-            ))
+            checkboxes.extend(
+                page.filter_checkboxes(selected=selected, not_selected=not_selected)
+            )
         return checkboxes
 
     def get_words_by_type(self, text_type: TextTypes = TextTypes.PRINTED) -> List[Word]:
@@ -328,7 +327,7 @@ def search_words(
         self,
         keyword: str,
         top_k: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: float = 0.6,
     ) -> List[Word]:
         """
@@ -350,12 +349,14 @@ def search_words(
 
         top_n_words = []
         for page in self.pages:
-            top_n_words.extend(page._search_words_with_similarity(
-                keyword=keyword,
-                top_k=top_k,
-                similarity_metric=similarity_metric,
-                similarity_threshold=similarity_threshold,
-            ))
+            top_n_words.extend(
+                page._search_words_with_similarity(
+                    keyword=keyword,
+                    top_k=top_k,
+                    similarity_metric=similarity_metric,
+                    similarity_threshold=similarity_threshold,
+                )
+            )
 
         top_n_words = sorted(top_n_words, key=lambda x: x[0], reverse=True)[:top_k]
         top_n_words = EntityList([ent[1] for ent in top_n_words])
@@ -366,7 +367,7 @@ def search_lines(
         self,
         keyword: str,
         top_k: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: float = 0.6,
     ) -> List[Line]:
         """
@@ -392,12 +393,14 @@ def search_lines(
 
         top_n_lines = []
         for page in self.pages:
-            top_n_lines.extend(page._search_lines_with_similarity(
-                keyword=keyword,
-                top_k=top_k,
-                similarity_metric=similarity_metric,
-                similarity_threshold=similarity_threshold,
-            ))
+            top_n_lines.extend(
+                page._search_lines_with_similarity(
+                    keyword=keyword,
+                    top_k=top_k,
+                    similarity_metric=similarity_metric,
+                    similarity_threshold=similarity_threshold,
+                )
+            )
 
         top_n_lines = EntityList([ent[1] for ent in top_n_lines][:top_k])
 
@@ -408,7 +411,7 @@ def get_value_by_key(
         self,
         key: str,
         top_k_matches: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: float = 0.6,
     ):
         """
@@ -457,7 +460,9 @@ def get_value_by_key(
                 for word in edited_document_key.split(" ")
             ]
             similarity.append(
-                SearchUtils.get_word_similarity(key, edited_document_key, similarity_metric)
+                SearchUtils.get_word_similarity(
+                    key, edited_document_key, similarity_metric
+                )
             )
 
         similarity = (
@@ -612,11 +617,9 @@ def return_duplicates(self):
         :rtype: Dict[page_num, List[EntityList[DocumentEntity]]]
         """
         document_duplicates = defaultdict(list)
-
+
         for page in self.pages:
-            document_duplicates[
-                page.page_num
-            ].extend(page.return_duplicates())
+            document_duplicates[page.page_num].extend(page.return_duplicates())
 
         return document_duplicates
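With `open` now taking `cls` as its first argument it behaves as a proper classmethod, and the reformatted search helpers keep the same signatures. A usage sketch based on the signatures in this diff (the file path and import paths are assumptions):

```python
from textractor.entities.document import Document
from textractor.data.constants import SimilarityMetric  # assumed import path

# Hypothetical saved Textract response.
document = Document.open("analyze_document_response.json")

# Fuzzy word search with the defaults shown above (Levenshtein, threshold 0.6).
matches = document.search_words(
    keyword="Total",
    top_k=5,
    similarity_metric=SimilarityMetric.LEVENSHTEIN,
    similarity_threshold=0.6,
)

# Checkbox filtering is aggregated across all pages in a single call.
selected_boxes = document.filter_checkboxes(selected=True, not_selected=False)
```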

textractor/entities/expense_document.py (+3 -1)

@@ -96,4 +96,6 @@ def keys(self) -> List[str]:
         return self._summary_fields.keys()
 
     def __repr__(self) -> str:
-        return os.linesep.join([f"{str(k)}: {str(v)}" for k, v in self._summary_fields.items()])
+        return os.linesep.join(
+            [f"{str(k)}: {str(v)}" for k, v in self._summary_fields.items()]
+        )

textractor/entities/identity_field.py (+1 -1)

@@ -15,6 +15,6 @@ def value(self) -> str:
     @property
     def confidence(self) -> float:
         return self._confidence
-
+
     def __repr__(self) -> str:
         return self.value

textractor/entities/key_value.py (+6 -2)

@@ -183,10 +183,14 @@ def is_selected(self) -> bool:
             if len(self.value.children) == 1:
                 return self.value.children[0].is_selected()
             else:
-                logging.info("is_checked() was called on a KeyValue that contains more than one checkbox. Returning first checkbox")
+                logging.info(
+                    "is_checked() was called on a KeyValue that contains more than one checkbox. Returning first checkbox"
+                )
                 return self.value.children[0].is_selected()
         else:
-            logging.info("is_checked() was called on a KeyValue that does not contain checkboxes. Returning False")
+            logging.info(
+                "is_checked() was called on a KeyValue that does not contain checkboxes. Returning False"
+            )
             return False
 
     def __repr__(self) -> str:

textractor/entities/page.py (+37 -27)

@@ -315,7 +315,7 @@ def _search_words_with_similarity(
         self,
         keyword: str,
         top_k: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: float = 0.6,
     ) -> List[Tuple[Word, float]]:
         """
@@ -346,7 +346,9 @@ def _search_words_with_similarity(
         lowest_similarity = similarity_threshold
 
         for word in self.words:
-            similarity = SearchUtils.get_word_similarity(keyword, word.text, similarity_metric)
+            similarity = SearchUtils.get_word_similarity(
+                keyword, word.text, similarity_metric
+            )
             similarity = (
                 similarity
                 if similarity_metric == SimilarityMetric.COSINE
@@ -361,14 +363,14 @@ def _search_words_with_similarity(
                 continue
             top_n_words = sorted(top_n_words, key=lambda x: x[0], reverse=True)
             lowest_similarity = top_n_words[-1][0]
-
+
         return top_n_words
 
     def search_words(
         self,
         keyword: str,
         top_k: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: float = 0.6,
     ) -> EntityList[Word]:
         """
@@ -387,23 +389,25 @@ def search_words(
         :rtype: EntityList[Word]
         """
 
-        top_n_words = EntityList([
-            ent[1]
-            for ent in self._search_words_with_similarity(
-                keyword=keyword,
-                top_k=top_k,
-                similarity_metric=similarity_metric,
-                similarity_threshold=similarity_threshold,
-            )
-        ])
+        top_n_words = EntityList(
+            [
+                ent[1]
+                for ent in self._search_words_with_similarity(
+                    keyword=keyword,
+                    top_k=top_k,
+                    similarity_metric=similarity_metric,
+                    similarity_threshold=similarity_threshold,
+                )
+            ]
+        )
 
         return top_n_words
 
     def _search_lines_with_similarity(
         self,
         keyword: str,
         top_k: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: int = 0.6,
     ) -> List[Tuple[Line, float]]:
         """
@@ -441,7 +445,9 @@ def _search_lines_with_similarity(
                 for word in line.__repr__().split(" ")
             ]
             similarity.append(
-                SearchUtils.get_word_similarity(keyword, line.__repr__(), similarity_metric)
+                SearchUtils.get_word_similarity(
+                    keyword, line.__repr__(), similarity_metric
+                )
             )
             similarity = (
                 max(similarity)
@@ -464,7 +470,7 @@ def search_lines(
         self,
         keyword: str,
         top_k: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: int = 0.6,
     ) -> EntityList[Line]:
         """
@@ -484,15 +490,17 @@ def search_lines(
         :rtype: EntityList[Line]
         """
 
-        top_n_lines = EntityList([
-            ent[1]
-            for ent in self._search_lines_with_similarity(
-                keyword=keyword,
-                top_k=top_k,
-                similarity_metric=similarity_metric,
-                similarity_threshold=similarity_threshold,
-            )
-        ])
+        top_n_lines = EntityList(
+            [
+                ent[1]
+                for ent in self._search_lines_with_similarity(
+                    keyword=keyword,
+                    top_k=top_k,
+                    similarity_metric=similarity_metric,
+                    similarity_threshold=similarity_threshold,
+                )
+            ]
+        )
 
         return top_n_lines
 
@@ -501,7 +509,7 @@ def get_value_by_key(
         self,
         key: str,
         top_k_matches: int = 1,
-        similarity_metric: SimilarityMetric= SimilarityMetric.LEVENSHTEIN,
+        similarity_metric: SimilarityMetric = SimilarityMetric.LEVENSHTEIN,
         similarity_threshold: float = 0.6,
     ) -> EntityList[KeyValue]:
         """
@@ -551,7 +559,9 @@ def get_value_by_key(
                 for word in edited_document_key.split(" ")
             ]
             similarity.append(
-                SearchUtils.get_word_similarity(key, edited_document_key, similarity_metric)
+                SearchUtils.get_word_similarity(
+                    key, edited_document_key, similarity_metric
+                )
            )
 
         similarity = (
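The page-level helpers mirror the document-level ones, including `get_value_by_key`, which these hunks only reformat. A short hedged example of that key-value lookup (file path, key text, and attribute access are illustrative):

```python
from textractor.entities.document import Document
from textractor.data.constants import SimilarityMetric  # assumed import path

document = Document.open("analyze_document_response.json")  # hypothetical saved response
page = document.pages[0]

# Illustrative key-value lookup using the signature shown above; the key text is made up.
kv_matches = page.get_value_by_key(
    key="Date of Birth",
    top_k_matches=1,
    similarity_metric=SimilarityMetric.LEVENSHTEIN,
)
for kv in kv_matches:
    print(kv.key, "->", kv.value)
```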
