preprocessors.py
from bs4 import BeautifulSoup, SoupStrainer


class GithubBlogpostPreprocessor:
    """
    A class to preprocess HTML content from GitHub blog posts, focusing on
    specific parts of the post like the title, header, and content.
    """

    def __init__(self):
        pass

    def get_text(self, html_content):
        """
        Extracts and returns clean text from the post content, title, and header.

        Returns:
            str: Cleaned text from the specified parts of the HTML content.
        """
        # Parse only the elements that carry the post itself, skipping
        # navigation, sidebars, and other page chrome.
        only_post_text = SoupStrainer(class_=["post-content", "post-title", "post-header"])
        soup = BeautifulSoup(html_content, "html.parser", parse_only=only_post_text)
        cleaned_text = soup.get_text()
        return cleaned_text
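

# A minimal usage sketch, assuming `post.html` is a blog post page that has
# already been fetched and saved locally (the file name is hypothetical):
#
#     with open("post.html", encoding="utf-8") as f:
#         text = GithubBlogpostPreprocessor().get_text(f.read())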


class ArxivHtmlPaperPreprocessor:
    """
    A class to preprocess the HTML rendering of an arXiv paper, extracting the
    title, the authors with their affiliations, the abstract, and the numbered
    sections with their subheadings.
    """

    def __init__(self):
        pass

    def get_text(self, html_content):
        """
        Extracts and returns clean text from the main parts of the paper.

        Returns:
            str: Title, authors, abstract, and sections separated by blank lines.
        """
        title = self._extract_title(html_content)
        authors_affiliations = self._extract_authors_and_affiliations(html_content)
        abstract = self._extract_abstract(html_content)
        sections = []
        # arXiv HTML numbers top-level sections "S1", "S2", ...; papers with
        # more than eight sections would need a wider range here.
        for i in range(1, 9):
            section_id = "S" + str(i)
            section_text = self._extract_section_with_subheadings(html_content, section_id)
            # Skip section ids the paper does not use instead of emitting
            # "Section not found" placeholders into the output.
            if section_text != "Section not found":
                sections.append(section_text)
        cleaned_text = title + "\n\n" + authors_affiliations + "\n\n" + abstract + "\n\n" + "\n\n".join(sections)
        return cleaned_text

    def _extract_title(self, html_content):
        # The paper title is the lone <h1> carrying the ltx_title_document class.
        strainer = SoupStrainer('h1', class_="ltx_title ltx_title_document")
        soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)
        title_text = soup.get_text()
        return title_text

    def _extract_authors_and_affiliations(self, html_content):
        strainer = SoupStrainer('div', class_="ltx_authors")
        soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)
        formatted_output = []
        for author in soup.find_all('span', class_='ltx_creator ltx_role_author'):
            # Guard against author entries without a name span.
            name_span = author.find('span', class_='ltx_personname')
            if name_span is None:
                continue
            name = name_span.get_text(strip=True)
            affiliation = ' '.join(
                span.get_text(strip=True)
                for span in author.find_all('span', class_='ltx_contact ltx_role_affiliation')
            )
            formatted_output.append(f"{name}: {affiliation}")
        output_text = "\n\n".join(formatted_output)
        return output_text

    def _extract_abstract(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        abstract_div = soup.find('div', class_='ltx_abstract')
        if abstract_div:
            abstract_title = abstract_div.find('h6', class_='ltx_title ltx_title_abstract')
            abstract_title_text = abstract_title.get_text(strip=True) if abstract_title else "Abstract"
            abstract_paragraph = abstract_div.find('p', class_='ltx_p')
            if abstract_paragraph:
                # Remove inline footnotes so they do not interrupt the text.
                for footnote in abstract_paragraph.find_all('span', class_='ltx_note'):
                    footnote.decompose()
                return f"{abstract_title_text}\n\n{abstract_paragraph.get_text(strip=True)}"
        return "Abstract not found"

    def _extract_section_with_subheadings(self, html_content, section_id):
        soup = BeautifulSoup(html_content, 'html.parser')
        section = soup.find('section', id=section_id)
        if section:
            output_text = []
            main_heading = section.find(['h2', 'h3'], class_='ltx_title')
            if main_heading:
                output_text.append(main_heading.get_text(strip=True))
            # BeautifulSoup calls a class_ callable once per individual CSS
            # class, so match the single class names; the original joined
            # string "ltx_title ltx_title_subsection" would never match.
            elements = section.find_all(
                ['p', 'h3'],
                class_=lambda x: x in ('ltx_p', 'ltx_title_subsection'),
            )
            for element in elements:
                if element.name == 'h3':
                    # Set subsection headings off with an extra blank line.
                    output_text.append("\n\n" + element.get_text(strip=True))
                else:
                    output_text.append(element.get_text(strip=True))
            return '\n\n'.join(output_text)
        return "Section not found"