Skip to content

Commit 2671f57

Browse files
authored
refactor(readability): simplify the regexes in internal/reader/readability/readability.go
- Use strings.ToLower() on the input instead of case-insensitive regexes
- Remove overlapping words in the regexes
- Split a condition to increase readability
1 parent 2f56ebd commit 2671f57

File tree

1 file changed

+12
-7
lines changed

1 file changed

+12
-7
lines changed

internal/reader/readability/readability.go

+12-7
Original file line number | Diff line number | Diff line change
@@ -26,12 +26,12 @@ var (
2626
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
2727
sentenceRegexp = regexp.MustCompile(`\.( |$)`)
2828

29-
blacklistCandidatesRegexp = regexp.MustCompile(`(?i)popupbody|-ad|g-plus`)
30-
okMaybeItsACandidateRegexp = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
31-
unlikelyCandidatesRegexp = regexp.MustCompile(`(?i)banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
29+
blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
30+
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
31+
unlikelyCandidatesRegexp = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
3232

33-
negativeRegexp = regexp.MustCompile(`(?i)hidden|^hid$|hid$|hid|^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author`)
34-
positiveRegexp = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
33+
negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
34+
positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
3535
)
3636

3737
type candidate struct {
@@ -154,9 +154,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
154154
}
155155
class, _ := s.Attr("class")
156156
id, _ := s.Attr("id")
157-
str := class + id
157+
str := strings.ToLower(class + id)
158158

159-
if blacklistCandidatesRegexp.MatchString(str) || (unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str)) {
159+
if blacklistCandidatesRegexp.MatchString(str) {
160+
removeNodes(s)
161+
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
160162
removeNodes(s)
161163
}
162164
})
@@ -277,6 +279,9 @@ func getClassWeight(s *goquery.Selection) float32 {
277279
class, _ := s.Attr("class")
278280
id, _ := s.Attr("id")
279281

282+
class = strings.ToLower(class)
283+
id = strings.ToLower(id)
284+
280285
if class != "" {
281286
if negativeRegexp.MatchString(class) {
282287
weight -= 25

0 commit comments

Comments
 (0)