@@ -26,12 +26,12 @@ var (
26
26
divToPElementsRegexp = regexp .MustCompile (`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)` )
27
27
sentenceRegexp = regexp .MustCompile (`\.( |$)` )
28
28
29
- blacklistCandidatesRegexp = regexp .MustCompile (`(?i) popupbody|-ad|g-plus` )
30
- okMaybeItsACandidateRegexp = regexp .MustCompile (`(?i) and|article|body|column|main|shadow` )
31
- unlikelyCandidatesRegexp = regexp .MustCompile (`(?i) banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote` )
29
+ blacklistCandidatesRegexp = regexp .MustCompile (`popupbody|-ad|g-plus` )
30
+ okMaybeItsACandidateRegexp = regexp .MustCompile (`and|article|body|column|main|shadow` )
31
+ unlikelyCandidatesRegexp = regexp .MustCompile (`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote` )
32
32
33
- negativeRegexp = regexp .MustCompile (`(?i)hidden|^ hid$|hid$|hid|^hid | banner|combx|comment|com-|contact|foot|footer|footnote| masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby|p-author ` )
34
- positiveRegexp = regexp .MustCompile (`(?i) article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story` )
33
+ negativeRegexp = regexp .MustCompile (`hid| banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby` )
34
+ positiveRegexp = regexp .MustCompile (`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story` )
35
35
)
36
36
37
37
type candidate struct {
@@ -154,9 +154,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
154
154
}
155
155
class , _ := s .Attr ("class" )
156
156
id , _ := s .Attr ("id" )
157
- str := class + id
157
+ str := strings . ToLower ( class + id )
158
158
159
- if blacklistCandidatesRegexp .MatchString (str ) || (unlikelyCandidatesRegexp .MatchString (str ) && ! okMaybeItsACandidateRegexp .MatchString (str )) {
159
+ if blacklistCandidatesRegexp .MatchString (str ) {
160
+ removeNodes (s )
161
+ } else if unlikelyCandidatesRegexp .MatchString (str ) && ! okMaybeItsACandidateRegexp .MatchString (str ) {
160
162
removeNodes (s )
161
163
}
162
164
})
@@ -277,6 +279,9 @@ func getClassWeight(s *goquery.Selection) float32 {
277
279
class , _ := s .Attr ("class" )
278
280
id , _ := s .Attr ("id" )
279
281
282
+ class = strings .ToLower (class )
283
+ id = strings .ToLower (id )
284
+
280
285
if class != "" {
281
286
if negativeRegexp .MatchString (class ) {
282
287
weight -= 25
0 commit comments