-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsrc.py
159 lines (148 loc) · 6.3 KB
/
src.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_bytes
from io import StringIO
import os
from PIL import Image, ImageDraw
import re
class TextReport:
    """OCR a scanned PDF report and produce a censored copy of it.

    The PDF is rasterised page by page at construction time. Text is then
    extracted with Tesseract, and words matching name/date patterns can be
    blacked out on the page images before they are saved as a new PDF.
    """

    # Patterns are compiled once for the class instead of once per call.
    # Text following a "Nom ... :" label.
    _RE_NOM = re.compile(r"Nom.*: *([A-Za-zÀ-ÿ- ]+)")
    # Capitalised word(s) followed by UPPERCASE word(s)
    # (presumably "Firstname SURNAME" -- confirm against real reports).
    _RE_NOM2 = re.compile(r"(([A-Z][a-zÀ-ÿ-]{3,} ?)+ ([A-Z-]{3,} ?)+)")
    # UPPERCASE word(s) followed by capitalised word(s)
    # (presumably "SURNAME Firstname" -- confirm against real reports).
    _RE_NOM3 = re.compile(r"(([A-Z-]{3,} ?)+ ([A-Z][a-zÀ-ÿ-]{3,} ?)+)")
    # Numeric dates separated by "." or "/", optionally wrapped in
    # parentheses or dots.
    _RE_DATE = re.compile(
        r"([(\.]?[0-9]{1,2}[\.\/][0-9]{1,2}[\.\/][0-9]{1,4}[()\.]?)"
    )
    # Identifier candidates: digits after a colon, or after "Bps N°".
    _RE_BIOPSY_NUMBER = re.compile(r": *([0-9]{4,8}[-\/]?[0-9]{0,3})")
    _RE_BIOPSY_NUMBER2 = re.compile(r"Bps +N° +([0-9-\/]{1,10})")

    # Every match of these patterns is scheduled for censoring.
    _CENSOR_PATTERNS = (_RE_NOM, _RE_NOM2, _RE_NOM3, _RE_DATE)

    def __init__(self, file_obj, lang):
        """Read the PDF and rasterise its pages.

        Args:
            file_obj (file object): Binary file object of the PDF. It is read
                in full here.
            lang (str): Tesseract language code of the PDF (fra for French,
                eng for English).
        """
        self.file_obj = file_obj
        self.lang = lang
        self.raw_text = ""
        self.text_as_list = []
        self.header_text = []
        self.results_match_dict = {}
        # One PIL image per PDF page.
        self.image_stack = convert_from_bytes(self.file_obj.read())

    def get_grayscale(self, image):
        """Convert a BGR image to grayscale.

        Args:
            image (numpy array): BGR image as a numpy array.

        Returns:
            numpy array: Single-channel grayscale image.
        """
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    def thresholding(self, image):
        """Binarise a grayscale image with Otsu's threshold.

        Args:
            image (numpy array): Grayscale image as a numpy array.

        Returns:
            numpy array: Thresholded image (the second element returned by
            ``cv2.threshold``).
        """
        return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def _tesseract_config(self):
        """Return the Tesseract command-line options (language, OEM 1, PSM 1)."""
        return f"-l {self.lang} --oem 1 --psm 1 "

    def _preprocess(self, image):
        """Turn a page image into the binarised array handed to Tesseract."""
        rgb_array = np.array(image)
        # Reverse the channel axis (RGB -> BGR) because get_grayscale uses
        # COLOR_BGR2GRAY.
        bgr_array = rgb_array[:, :, ::-1].copy()
        return self.thresholding(self.get_grayscale(bgr_array))

    def _source_basename(self):
        """Return the bare file name of the source PDF.

        Falls back to ``document.pdf`` when the file object has no ``name``
        attribute (e.g. an in-memory buffer).
        """
        return os.path.basename(str(getattr(self.file_obj, "name", "document.pdf")))

    def pdf_to_text(self):
        """OCR every page of the PDF with Tesseract (OEM 1, PSM 1).

        Stores the result in ``self.raw_text`` and its line-split form in
        ``self.text_as_list``.

        Returns:
            str: Raw text of all pages, joined by newlines.
        """
        config = self._tesseract_config()
        page_list = [
            pytesseract.image_to_string(self._preprocess(image), config=config)
            for image in self.image_stack
        ]
        self.raw_text = "\n".join(page_list)
        self.text_as_list = self.raw_text.split("\n")
        return self.raw_text

    def pdf_censor(self, output_folder, mode="regex", to_censor_list=None):
        """Black out names and dates on every page and save the result as a PDF.

        For each page, the OCR text is scanned by ``_regex_match``. The words
        to hide are located in Tesseract's word-level data table and covered
        with black rectangles. The pages kept in ``self.image_stack`` are left
        untouched: drawing happens on copies.

        Args:
            output_folder (str): Directory in which the censored PDF is written.
            mode (str): When equal to ``"openAI"``, the terms of
                ``to_censor_list`` are censored in addition to the regex
                matches. Any other value uses the regex matches only.
            to_censor_list (list of str, optional): Extra terms to censor in
                ``"openAI"`` mode. ``None`` or empty means no extra terms.

        Returns:
            str: Path of the written PDF. If the target already exists, the
            file name is prefixed with ``dup_`` until it is unique.

        Raises:
            IndexError: If the PDF produced no page image.
        """
        source_name = self._source_basename()
        new_filename = source_name + "_censored.pdf"
        config = self._tesseract_config()
        # Guard against the default None, which cannot be passed to extend().
        extra_terms = (
            list(to_censor_list) if mode == "openAI" and to_censor_list else []
        )

        censored_image_stack = []
        for image in self.image_stack:
            ocr_input = self._preprocess(image)
            text_page = pytesseract.image_to_string(ocr_input, config=config)
            # Word-level OCR output as TSV; kept on the instance as before.
            self.image_data_table_as_str = pytesseract.image_to_data(
                ocr_input, config=config
            )
            image_data_table = pd.read_csv(
                StringIO(self.image_data_table_as_str),
                sep="\t",
                quoting=3,  # csv.QUOTE_NONE: quotes in OCR text are literal
                on_bad_lines="skip",
            )

            biopsy_id, to_censor = self._regex_match(text_page.split("\n"))
            to_censor.extend(extra_terms)
            if biopsy_id is not None:
                # Use the bare file name so a path in file_obj.name cannot
                # leak into (or break) the output path.
                new_filename = biopsy_id.replace("/", "-") + "_" + source_name

            list_bbox = self._get_bbox(image_data_table, to_censor)
            # Draw on a copy so the original page stays available for OCR.
            censored_image_stack.append(self._draw_censor(image.copy(), list_bbox))

        output_path = os.path.join(output_folder, new_filename)
        while os.path.isfile(output_path):
            new_filename = "dup_" + new_filename
            output_path = os.path.join(output_folder, new_filename)

        censored_image_stack[0].save(
            output_path, save_all=True, append_images=censored_image_stack[1:]
        )
        return output_path

    def _regex_match(self, text_as_list):
        """Collect the terms to censor and the biopsy identifier from OCR lines.

        Every match of the name and date patterns on every line is collected,
        not only the first one per line.

        Args:
            text_as_list (list of str): OCR text, one entry per line.

        Returns:
            tuple: ``(biopsy_id, to_censor)``. ``biopsy_id`` is the last
            identifier matched (the ``Bps N°`` pattern takes precedence on a
            given line) or ``None``. ``to_censor`` is the list of matched
            strings.
        """
        to_censor = []
        biopsy_id = None
        for line in text_as_list:
            for pattern in self._CENSOR_PATTERNS:
                to_censor.extend(match.group(1) for match in pattern.finditer(line))

            match_number = self._RE_BIOPSY_NUMBER.search(line)
            if match_number is not None:
                biopsy_id = match_number.group(1)
            match_number2 = self._RE_BIOPSY_NUMBER2.search(line)
            if match_number2 is not None:
                biopsy_id = match_number2.group(1)
        return biopsy_id, to_censor

    def _get_bbox(self, ocr_datatable, to_censor):
        """Find the bounding boxes of the words to censor.

        Args:
            ocr_datatable (pandas.DataFrame): Tesseract word-level data with
                the columns ``text``, ``left``, ``top``, ``width`` and
                ``height``.
            to_censor (list of str): Terms to hide. Each term is split on
                spaces and every word is matched exactly against ``text``.

        Returns:
            list: One ``[x0, y0, x1, y1]`` list per matching table row.
        """
        list_bbox = []
        for term in to_censor:
            for word in term.split(" "):
                if not word:
                    # Leading, trailing or doubled spaces yield empty tokens.
                    continue
                hits = ocr_datatable.loc[
                    ocr_datatable["text"] == word,
                    ["left", "top", "width", "height"],
                ]
                for left, top, width, height in hits.itertuples(
                    index=False, name=None
                ):
                    list_bbox.append(
                        [int(left), int(top), int(left + width), int(top + height)]
                    )
        return list_bbox

    def _draw_censor(self, image, list_bbox):
        """Paint a black rectangle over each bounding box.

        Args:
            image (PIL image): Page to draw on; it is modified in place.
            list_bbox (list): ``[x0, y0, x1, y1]`` boxes to fill.

        Returns:
            PIL image: The same ``image`` object, after drawing.
        """
        im_draw = ImageDraw.Draw(image)
        for bbox in list_bbox:
            im_draw.rectangle(bbox, fill="black")
        return image