From 54e5ea3a6a788380e183371219a85a063f0632c5 Mon Sep 17 00:00:00 2001 From: Ryan Pendleton Date: Sun, 6 Oct 2019 04:30:02 -0600 Subject: [PATCH] don't crawl files that haven't been modified since the last crawl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to RFC 7232 §4.1, when a page is returned with a 304 Not Modified header, it is recommended that headers such as Content-Type should not be sent to the client again. When verbose logging in osg is enabled, this would manifest as not crawling a page because its content type was an empty string. Instead of doing that, we should just log that the page hasn't been modified and avoid crawling it completely. This is okay since a 304 response isn't permitted to have a response body anyway, so there's no need to crawl it. --- main.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index 7935f47..5d56198 100644 --- a/main.go +++ b/main.go @@ -132,7 +132,12 @@ func (c *Crawler) Crawl(u *url.URL, root *url.URL, lastmod string) { r.Lastmod = lm.Format(SitemapTimeFormat) ct := res.Header.Get("Content-Type") var dontCrawl bool - if strings.HasPrefix(ct, "text/html") { + if res.StatusCode == http.StatusNotModified { + if *verbose { + log.Println("Not crawling", s, "(304 not modified)") + } + dontCrawl = true + } else if strings.HasPrefix(ct, "text/html") { // Proceed } else if strings.HasPrefix(ct, "text/xml") { // TODO: Read sitemaps