Commit 9c9a0c5

Find feeds via sitemap
1 parent d5cfcf8 commit 9c9a0c5

2 files changed: +122 -8

internal/reader/subscription/finder.go (+82 -8)
@@ -5,6 +5,7 @@ package subscription // import "miniflux.app/v2/internal/reader/subscription"
 
 import (
     "bytes"
+    "encoding/xml"
     "fmt"
     "io"
     "log/slog"
@@ -125,6 +126,14 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
         slog.Debug("Subscriptions found with well-known URLs", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
         return subscriptions, nil
     }
+    // Step 7) Check if the website has feeds in its sitemap.
+    slog.Debug("Try to detect feeds from sitemap", slog.String("website_url", websiteURL))
+    if subscriptions, localizedError := f.FindSubscriptionsFromSitemap(websiteURL); localizedError != nil {
+        return nil, localizedError
+    } else if len(subscriptions) > 0 {
+        slog.Debug("Subscriptions found with sitemap", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
+        return subscriptions, nil
+    }
 
     return nil, nil
 }
@@ -190,14 +199,16 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL, contentTyp
 
 func (f *SubscriptionFinder) FindSubscriptionsFromWellKnownURLs(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
     knownURLs := map[string]string{
-        "atom.xml":  parser.FormatAtom,
-        "feed.xml":  parser.FormatAtom,
-        "feed/":     parser.FormatAtom,
-        "rss.xml":   parser.FormatRSS,
-        "rss/":      parser.FormatRSS,
-        "index.rss": parser.FormatRSS,
-        "index.xml": parser.FormatRSS,
-        "feed.atom": parser.FormatAtom,
+        "atom.xml":   parser.FormatAtom,
+        "feed.xml":   parser.FormatAtom,
+        "feed":       parser.FormatAtom,
+        "rss.xml":    parser.FormatRSS,
+        "rss":        parser.FormatRSS,
+        "index.rss":  parser.FormatRSS,
+        "index.xml":  parser.FormatRSS,
+        "feed.atom":  parser.FormatAtom,
+        "atom":       parser.FormatAtom,
+        "index.atom": parser.FormatAtom,
     }
 
     websiteURLRoot := urllib.RootURL(websiteURL)
@@ -316,3 +327,66 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL
 
     return nil, nil
 }
+
+func (f *SubscriptionFinder) FindSubscriptionsFromSitemap(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
+    websiteURLRoot := urllib.RootURL(websiteURL)
+
+    responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(websiteURLRoot + "/sitemap.xml"))
+    defer responseHandler.Close()
+
+    if localizedError := responseHandler.LocalizedError(); localizedError != nil {
+        slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
+        return nil, localizedError
+    }
+
+    responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
+    if localizedError != nil {
+        slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
+        return nil, localizedError
+    }
+    return findSubscriptionsFromDownloadedSitemap(bytes.NewReader(responseBody))
+}
+
+func findSubscriptionsFromDownloadedSitemap(body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
+    var subscriptions Subscriptions
+    loc := struct {
+        Content string `xml:",chardata"`
+    }{}
+
+    decoder := xml.NewDecoder(body)
+    for {
+        t, _ := decoder.Token()
+        if t == nil {
+            break
+        }
+        switch se := t.(type) {
+        case xml.StartElement:
+            if se.Name.Local != "loc" {
+                continue
+            }
+
+            if err := decoder.DecodeElement(&loc, &se); err != nil {
+                slog.Warn("Unable to decode loc", slog.Any("error", err))
+            }
+            feedUrl := loc.Content
+            switch {
+            case strings.Contains(feedUrl, ".xml"),
+                strings.Contains(feedUrl, "rss"):
+                subscriptions = append(subscriptions, &Subscription{
+                    Type:  parser.FormatRSS,
+                    Title: feedUrl,
+                    URL:   feedUrl,
+                })
+            case strings.Contains(feedUrl, "feed"),
+                strings.Contains(feedUrl, "atom"):
+                subscriptions = append(subscriptions, &Subscription{
+                    Type:  parser.FormatAtom,
+                    Title: feedUrl,
+                    URL:   feedUrl,
+                })
+            }
+        }
+    }
+
+    return subscriptions, nil
+}
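
The core of the change is findSubscriptionsFromDownloadedSitemap above: it streams the sitemap with encoding/xml, reads every <loc> value, and keeps the URLs that look feed-like. The following standalone sketch (not part of the commit; the package name and sample URLs are invented for illustration) shows the same technique in isolation:

// Standalone sketch of the sitemap-scanning idea used by the commit.
package main

import (
    "encoding/xml"
    "fmt"
    "strings"
)

func main() {
    sitemap := `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.org/rss.xml</loc></url>
  <url><loc>https://example.org/blog/feed</loc></url>
  <url><loc>https://example.org/about</loc></url>
</urlset>`

    decoder := xml.NewDecoder(strings.NewReader(sitemap))
    for {
        token, err := decoder.Token()
        if err != nil {
            break // io.EOF or malformed XML: stop scanning
        }
        start, ok := token.(xml.StartElement)
        if !ok || start.Name.Local != "loc" {
            continue
        }
        var loc struct {
            Content string `xml:",chardata"`
        }
        if err := decoder.DecodeElement(&loc, &start); err != nil {
            continue
        }
        // The ".xml"/"rss" case is checked before "feed"/"atom", as in the commit.
        switch {
        case strings.Contains(loc.Content, ".xml"), strings.Contains(loc.Content, "rss"):
            fmt.Println("probable RSS feed:", loc.Content)
        case strings.Contains(loc.Content, "feed"), strings.Contains(loc.Content, "atom"):
            fmt.Println("probable Atom feed:", loc.Content)
        }
    }
}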

internal/reader/subscription/finder_test.go (+40)
@@ -481,3 +481,43 @@ func TestParseWebPageWithNoHref(t *testing.T) {
         t.Fatal(`Incorrect number of subscriptions returned`)
     }
 }
+
+func TestParseSiteMap(t *testing.T) {
+    sitemap := `
+    <?xml version="1.0" encoding="UTF-8"?>
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url>
+            <loc>http://www.example.com/</loc>
+            <lastmod>2005-01-01</lastmod>
+            <changefreq>monthly</changefreq>
+            <priority>0.8</priority>
+        </url>
+        <url>
+            <loc>http://www.example.com/feed/myfeed</loc>
+            <lastmod>2005-01-01</lastmod>
+            <changefreq>monthly</changefreq>
+            <priority>0.8</priority>
+        </url>
+        <url>
+            <loc>http://www.example.com/myfeed.xml</loc>
+            <lastmod>2005-01-01</lastmod>
+            <changefreq>monthly</changefreq>
+            <priority>0.8</priority>
+        </url>
+        <url>
+            <loc>http://www.example.com/atom_feed.xml</loc>
+            <lastmod>2005-01-01</lastmod>
+            <changefreq>monthly</changefreq>
+            <priority>0.8</priority>
+        </url>
+    </urlset> `
+
+    subscriptions, err := findSubscriptionsFromDownloadedSitemap(strings.NewReader(sitemap))
+    if err != nil {
+        t.Fatalf(`Parsing a correctly formatted sitemap should not return any error: %v`, err)
+    }
+
+    if len(subscriptions) != 3 {
+        t.Fatal(`Incorrect number of subscriptions returned`)
+    }
+}
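
Of the four <loc> entries in the fixture, only http://www.example.com/ matches no pattern, hence the expected count of 3; atom_feed.xml falls into the ".xml" (RSS) case rather than the Atom case. A hypothetical assertion appended at the end of TestParseSiteMap (not in the commit; it assumes the parser package is imported in the test file) could also pin down the detected types:

    // Hypothetical extension of TestParseSiteMap: check the detected type per URL.
    expected := map[string]string{
        "http://www.example.com/feed/myfeed":   parser.FormatAtom, // matches "feed"
        "http://www.example.com/myfeed.xml":    parser.FormatRSS,  // matches ".xml"
        "http://www.example.com/atom_feed.xml": parser.FormatRSS,  // ".xml" is checked before "atom"
    }
    for _, subscription := range subscriptions {
        if expected[subscription.URL] != subscription.Type {
            t.Errorf(`Unexpected type %q for %q`, subscription.Type, subscription.URL)
        }
    }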
