From 8ac5a8600c4efdbd70393435a19696e52e5ce9b1 Mon Sep 17 00:00:00 2001 From: Ryan Pendleton Date: Sun, 6 Oct 2019 05:41:09 -0600 Subject: [PATCH] ignore the fragment portion of URLs when crawling links --- main.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index 7935f47..e4234e3 100644 --- a/main.go +++ b/main.go @@ -269,8 +269,17 @@ L: } } else if t.Type == A && t.Href != "" { ignore := false - if t.Href[0] == '#' { + hashIndex := strings.Index(t.Href, "#") + if hashIndex == 0 { + if *verbose { + log.Println("Link to", t.Href, "on page", u, "has an anchor to itself; skipping link") + } ignore = true + } else if hashIndex > 0 { + if *verbose { + log.Println("Link to", t.Href, "on page", u, "has an anchor to another page; removing fragment from URL") + } + t.Href = string(t.Href[0:hashIndex]) } if strings.Contains(t.Rel, "nofollow") { if *noRobots {