added extract links from pdf tutorial

x4nth055 · x4nth055 · commit 67b14ec07148 · 2020-08-22T11:19:24.000+02:00
diff --git a/README.md b/README.md
@@ -88,6 +88,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
     - [How to Extract and Submit Web Forms from a URL using Python](https://www.thepythoncode.com/article/extracting-and-submitting-web-page-forms-in-python). ([code](web-scraping/extract-and-fill-forms))
     - [How to Get Domain Name Information in Python](https://www.thepythoncode.com/article/extracting-domain-name-information-in-python). ([code](web-scraping/get-domain-info))
     - [How to Extract YouTube Comments in Python](https://www.thepythoncode.com/article/extract-youtube-comments-in-python). ([code](web-scraping/youtube-comments-extractor))
+    - [How to Extract All PDF Links in Python](https://www.thepythoncode.com/article/extract-pdf-links-with-python). ([code](web-scraping/pdf-url-extractor))
 
 - ### [Python Standard Library](https://www.thepythoncode.com/topic/python-standard-library)
     - [How to Transfer Files in the Network using Sockets in Python](https://www.thepythoncode.com/article/send-receive-files-using-sockets-python). ([code](general/transfer-files/))
diff --git a/web-scraping/pdf-url-extractor/1710.05006.pdf b/web-scraping/pdf-url-extractor/1710.05006.pdf
diff --git a/web-scraping/pdf-url-extractor/1810.04805.pdf b/web-scraping/pdf-url-extractor/1810.04805.pdf
diff --git a/web-scraping/pdf-url-extractor/README.md b/web-scraping/pdf-url-extractor/README.md
@@ -0,0 +1,4 @@
+# [How to Extract All PDF Links in Python](https://www.thepythoncode.com/article/extract-pdf-links-with-python)
+To run this:
+- `pip3 install -r requirements.txt`
+- Use `pdf_link_extractor.py` to get clickable links, and `pdf_link_extractor_regex.py` to get links that are in text form.
diff --git a/web-scraping/pdf-url-extractor/pdf_link_extractor.py b/web-scraping/pdf-url-extractor/pdf_link_extractor.py
@@ -0,0 +1,24 @@
+import pikepdf # pip3 install pikepdf
+
+file = "1810.04805.pdf"
+# file = "1710.05006.pdf"
+urls = []
+# open the PDF; the context manager closes the file when we are done
+with pikepdf.Pdf.open(file) as pdf_file:
+    # iterate over PDF pages
+    for page in pdf_file.pages:
+        # a page without annotations has no /Annots entry, so .get() gives None
+        annotations = page.get("/Annots")
+        if annotations is None:
+            continue
+        for annotation in annotations:
+            # only annotations that carry an action dictionary (/A) can hold a /URI
+            action = annotation.get("/A")
+            if action is None:
+                continue
+            uri = action.get("/URI")
+            if uri is not None:
+                print("[+] URL Found:", uri)
+                urls.append(uri)
+
+print("[*] Total URLs extracted:", len(urls))
diff --git a/web-scraping/pdf-url-extractor/pdf_link_extractor_regex.py b/web-scraping/pdf-url-extractor/pdf_link_extractor_regex.py
@@ -0,0 +1,19 @@
+import fitz # pip install PyMuPDF
+import re
+
+# a regular expression of URLs
+url_regex = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
+# file = "1710.05006.pdf"
+file = "1810.04805.pdf"
+# open the PDF file and extract the raw text of every page;
+# get_text() is the current name of the deprecated getText() alias
+with fitz.open(file) as pdf:
+    text = "".join(page.get_text() for page in pdf)
+urls = []
+# extract all urls using the regular expression
+for match in re.finditer(url_regex, text):
+    url = match.group()
+    print("[+] URL Found:", url)
+    urls.append(url)
+print("[*] Total URLs extracted:", len(urls))
+
diff --git a/web-scraping/pdf-url-extractor/requirements.txt b/web-scraping/pdf-url-extractor/requirements.txt
@@ -0,0 +1,2 @@
+pikepdf
+PyMuPDF