Merge pull request #263 from kaustubhgupta/pdf_to_csv

powerexploit · web-flow · commit fbd51da85a46 · 2020-09-14T19:15:25.000+05:30
PDF tables to CSV files
diff --git a/System-Automation-Scripts/PDF_Tables_To_CSV/.gitignore b/System-Automation-Scripts/PDF_Tables_To_CSV/.gitignore
@@ -0,0 +1,9 @@
+# Vscode files
+.vscode
+
+# Sample Files
+sample.pdf
+sample2.pdf
+
+# Python
+__pycache__
diff --git a/System-Automation-Scripts/PDF_Tables_To_CSV/README.md b/System-Automation-Scripts/PDF_Tables_To_CSV/README.md
@@ -0,0 +1,22 @@
+# PDF to CSV
+This script converts the tables in a PDF file into CSV files. Each CSV file holds one table from the PDF, so the number of CSV files equals the number of tables in the PDF.
+
+# Requirements
+`pip install tabula-py pandas`
+
+# How to use?
+Just use the following command while executing the script:
+
+`python app.py location_of_pdf pages`
+
+Pages have two options:
+- 'all' will extract tables from the whole PDF
+- a specific page (e.g. 1, 2, 54, ...) will extract the tables from that page
+
+Example:
+- `python app.py sample.pdf all`
+- `python app.py sample2.pdf 45`
+
+# Preview
+
+![](preview.gif)
diff --git a/System-Automation-Scripts/PDF_Tables_To_CSV/app.py b/System-Automation-Scripts/PDF_Tables_To_CSV/app.py
@@ -0,0 +1,19 @@
+import tabula
+import pandas as pd
+import sys
+
+def extract(path, number_pages):
+    """Write each table tabula reads from the PDF at `path` to 'Table- N.csv'."""
+    # number_pages is forwarded unchanged as tabula's `pages` argument.
+    tables = tabula.read_pdf(path, multiple_tables=True, pages=number_pages)
+    if not tables:
+        print("No tables found !")
+        return
+    # Output files are numbered from 1, in the order tabula returned the tables.
+    for count, table in enumerate(tables, start=1):
+        print(f"Saving file -{count}")
+        table.to_csv(f'Table- {count}.csv')
+    print("All tables saved as seperate files !")
+
+if __name__ == "__main__":  # usage: python app.py location_of_pdf pages
+    extract(path=sys.argv[1], number_pages=sys.argv[2])
diff --git a/System-Automation-Scripts/PDF_Tables_To_CSV/preview.gif b/System-Automation-Scripts/PDF_Tables_To_CSV/preview.gif