Commit e7fe8ee

added link extractor tutorial
1 parent ad4d82d commit e7fe8ee

4 files changed, +149 -0 lines changed

Diff for: README.md

+1

```diff
@@ -46,5 +46,6 @@
 - [How to Extract YouTube Data in Python](https://www.thepythoncode.com/article/get-youtube-data-python). ([code](web-scraping/youtube-extractor))
 - [How to Extract Weather Data from Google in Python](https://www.thepythoncode.com/article/extract-weather-data-python). ([code](web-scraping/weather-extractor))
 - [How to Download All Images from a Web Page in Python](https://www.thepythoncode.com/article/download-web-page-images-python). ([code](web-scraping/download-images))
+- [How to Extract All Website Links in Python](https://www.thepythoncode.com/article/extract-all-website-links-python). ([code](web-scraping/link-extractor))
 
 For any feedback, please consider pulling requests.
```

Diff for: web-scraping/link-extractor/README.md

+37

# [How to Extract All Website Links in Python](https://www.thepythoncode.com/article/extract-all-website-links-python)

To run this:

- `pip3 install -r requirements.txt`
- Get the available options:
    ```
    python link_extractor.py --help
    ```
    **Output:**
    ```
    usage: link_extractor.py [-h] [-m MAX_URLS] url

    Link Extractor Tool with Python

    positional arguments:
      url                   The URL to extract links from.

    optional arguments:
      -h, --help            show this help message and exit
      -m MAX_URLS, --max-urls MAX_URLS
                            Number of max URLs to crawl, default is 30.
    ```
- For instance, to extract all links from the first 2 URLs that appear on github.com:
    ```
    python link_extractor.py https://github.com -m 2
    ```
    This will result in a long list; here are the last 5 links:
    ```
    [!] External link: https://developer.github.com/
    [*] Internal link: https://help.github.com/
    [!] External link: https://github.blog/
    [*] Internal link: https://help.github.com/articles/github-terms-of-service/
    [*] Internal link: https://help.github.com/articles/github-privacy-statement/
    [+] Total Internal links: 85
    [+] Total External links: 21
    [+] Total URLs: 106
    ```
    This will also save these URLs in `github.com_external_links.txt` for external links and `github.com_internal_links.txt` for internal links.
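
Since the script writes one URL per line, the saved files are easy to reuse later. A minimal sketch in Python, assuming the `github.com` crawl above has already been run so `github.com_internal_links.txt` exists:

```python
# load the saved internal links back into a set (one URL per line)
with open("github.com_internal_links.txt") as f:
    internal_links = {line.strip() for line in f if line.strip()}

print(f"[+] Loaded {len(internal_links)} internal links")
```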

Diff for: web-scraping/link-extractor/link_extractor.py

+108

```python
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
```
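
Because the command-line logic sits under the `if __name__ == "__main__"` guard, the crawler can also be reused from another script. A minimal sketch, assuming the file above is saved as `link_extractor.py` on the import path (the `crawl`, `internal_urls`, and `external_urls` names come from the module itself):

```python
# reuse the crawler programmatically instead of via the CLI
import link_extractor

# crawl a small number of pages; the module-level sets accumulate the results
link_extractor.crawl("https://github.com", max_urls=2)

print("internal:", len(link_extractor.internal_urls))
print("external:", len(link_extractor.external_urls))
```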

Diff for: web-scraping/link-extractor/requirements.txt

+3

```
requests
bs4
colorama
```
