Skip to content

Commit 9694f1f

Browse files
committed
No more broken lines in output extracted from PDF.
1 parent 4f2bafd commit 9694f1f

File tree

1 file changed

+46
-48
lines changed

1 file changed

+46
-48
lines changed

note_point_extractor/library/text_handler.py

+46-48
Original file line number | Diff line number | Diff line change
@@ -2,15 +2,16 @@
22

33
# Sample annotation export that the __main__ demo passes to process_content().
# Notes are delimited by tag pairs such as "#g ... g#" and may span more than
# one line of the text.
corestring = """Extracted Annotations (7/13/2018, 9:14:41 PM)
44
sample notes extracted from PDF
5-
#g asd asd asd g#
5+
#g eita ek
6+
line g#
67
#b asd asd
78
asd b#
89
#p asd asd asd p#
910
#i asd asd asd i#
1011
#g asd asd asd g#
1112
#b asd asd asd b#
12-
#p asd asd
13-
asd p#
13+
#p point point
14+
point p#
1415
1516
"""
1617

@@ -26,7 +27,7 @@ def __count_all_tags(text: str) -> int:
2627
__count_tags(text, "#p", "p#") + __count_tags(text, "#c", "c#")
2728

2829

29-
def __find_tag(text: str):
30+
def __find_start_tag_in_line(text: str):
3031
if list(tag_pairs)[0] in text:
3132
return list(tag_pairs)[0]
3233
if list(tag_pairs)[1] in text:
@@ -37,50 +38,41 @@ def __find_tag(text: str):
3738
return list(tag_pairs)[3]
3839

3940

40-
# all_lines: List[str] = corestring.splitlines()
41-
42-
# good_points: List[str] = []
43-
# bad_points: List[str] = []
44-
# comments: List[str] = []
45-
# i_points: List[str] = []
46-
# all_comments = {"b#": bad_points, "g#": good_points,
47-
# "c#": comments, "p#": i_points}
48-
49-
# flag = False
50-
# end_tag: str = ""
51-
# for line in all_lines:
52-
# if list(tag_pairs)[0] in line or list(tag_pairs)[1] in line or \
53-
# list(tag_pairs)[2] in line or list(tag_pairs)[3] in line:
54-
# flag = True
55-
# end_tag = tag_pairs[__find_tag(line)]
56-
# if flag:
57-
# all_comments[end_tag].append(line)
58-
# if end_tag in line:
59-
# flag = False
60-
61-
6241
def __beautify_output_lines(lines: List[str], tag_type: str, start_tag: str,
                            markdown: bool = False) -> str:
    """Join the raw lines collected for one tag type into a titled section.

    A note may be spread over several consecutive entries of ``lines``. The
    pieces are glued together with single spaces, and a line break is emitted
    only after the piece that contains the closing tag, so a note that was
    broken across lines in the input comes out on one line.

    Args:
        lines: Raw lines gathered for this tag type, tags still included.
        tag_type: Section title; it is upper-cased for the heading.
        start_tag: Opening tag. The closing tag is looked up as
            ``tag_pairs[start_tag]`` (``tag_pairs`` is defined elsewhere in
            this module, presumably a dict of opening -> closing tags).
        markdown: When true the heading is prefixed with ``"# "`` and every
            piece containing the opening tag is prefixed with ``"-"``.

    Returns:
        The section text preceded by a newline, or ``""`` when ``lines`` is
        empty (a notice is printed in that case).
    """
    if not lines:
        print("no lines found, exiting")
        return ""

    end_tag: str = tag_pairs[start_tag]

    heading: str = tag_type.upper()
    if markdown:
        heading = "# " + heading
    pieces: List[str] = [heading + "\n"]

    for line in lines:
        if markdown and start_tag in line:
            # A new note begins here: open a markdown bullet.
            pieces.append("-")
        pieces.append(" " + line.strip())
        if end_tag in line:
            # Only the closing tag ends the output line.
            pieces.append("\n")

    combined_line: str = "".join(pieces)
    combined_line = combined_line.replace(start_tag, "")
    combined_line = combined_line.replace(end_tag, "")
    # Removing a tag leaves two adjacent spaces (e.g. "-  text"); collapse
    # them. The pattern must be TWO spaces: replacing one space with one
    # space is a no-op.
    combined_line = combined_line.replace("  ", " ")
    return "\n" + combined_line
8267

83-
def process_content(value: str, markdown: bool=False)->str:
68+
69+
def __start_tag_in_line(line: str):
    """Report whether ``line`` contains one of the opening tags.

    Only the first four keys of the module-level ``tag_pairs`` are consulted,
    in order, and the search stops at the first one found in ``line``.
    """
    opening_tags = list(tag_pairs)
    # Index explicitly rather than slicing so that a ``tag_pairs`` with fewer
    # than four entries fails in the same way as a direct subscript would.
    for position in range(4):
        if opening_tags[position] in line:
            return True
    return False
73+
74+
75+
def process_content(value: str, markdown: bool=False) -> str:
8476
global tag_pairs
8577
tag_count = __count_all_tags(value)
8678
if tag_count % 2 is not 0:
@@ -93,25 +85,31 @@ def process_content(value: str, markdown: bool=False)->str:
9385
i_points: List[str] = []
9486
all_comments = {"b#": bad_points, "g#": good_points,
9587
"c#": comments, "p#": i_points}
96-
flag = False
88+
is_looking_for_end_tag = False
9789
end_tag: str = ""
9890
for line in all_lines:
99-
if list(tag_pairs)[0] in line or list(tag_pairs)[1] in line or \
100-
list(tag_pairs)[2] in line or list(tag_pairs)[3] in line:
101-
flag = True
102-
end_tag = tag_pairs[__find_tag(line)]
103-
if flag:
91+
# print("processing: "+line)
92+
if __start_tag_in_line(line):
93+
is_looking_for_end_tag = True
94+
end_tag = tag_pairs[__find_start_tag_in_line(line)]
95+
if is_looking_for_end_tag:
96+
# print("looking for end tag: " + line)
10497
all_comments[end_tag].append(line)
10598
if end_tag in line:
106-
flag = False
99+
# print("found end tag: " + line)
100+
is_looking_for_end_tag = False
107101
full_content: str = __beautify_output_lines(
108-
good_points, "Good Points", "#g", markdown) + "\n"
102+
good_points, "Good Points", "#g", markdown)
109103
full_content += __beautify_output_lines(bad_points,
110104
"Bad Points",
111-
"#b", markdown) + "\n"
105+
"#b", markdown)
112106
full_content += __beautify_output_lines(comments,
113-
"Comments", "#c", markdown) + "\n"
107+
"Comments", "#c", markdown)
114108
full_content += __beautify_output_lines(i_points,
115109
"Intersting Points",
116-
"#p", markdown) + "\n"
110+
"#p", markdown)
117111
return full_content
112+
113+
114+
if __name__ == "__main__":
    # Demo run: render the bundled sample notes as markdown and print them.
    rendered_notes = process_content(corestring, markdown=True)
    print(rendered_notes)

0 commit comments

Comments
 (0)