Skip to content

Commit 7b60b6a

Browse files
committed
add watermark pdf tutorial
1 parent 6267704 commit 7b60b6a

File tree

5 files changed

+330
-0
lines changed

5 files changed

+330
-0
lines changed

Diff for: README.md

+1
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
9191
- [Asynchronous Tasks with Celery in Python](https://www.thepythoncode.com/article/async-tasks-with-celery-redis-and-flask-in-python). ([code](https://github.com/bassemmarji/flask_sync_async))
9292
- [How to Change Text Color in Python](https://www.thepythoncode.com/article/change-text-color-in-python). ([code](general/printing-in-colors))
9393
- [How to Create a Watchdog in Python](https://www.thepythoncode.com/article/create-a-watchdog-in-python). ([code](general/directory-watcher))
94+
- [How to Watermark PDF Files in Python](https://www.thepythoncode.com/article/watermark-in-pdf-using-python). ([code](general/add-watermark-pdf))
9495

9596

9697
- ### [Web Scraping](https://www.thepythoncode.com/topic/web-scraping)

Diff for: general/add-watermark-pdf/README.md

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# [How to Watermark PDF Files in Python](https://www.thepythoncode.com/article/watermark-in-pdf-using-python)
2+
To run this:
3+
- `pip3 install -r requirements.txt`
4+
- ```python pdf_watermarker.py --help```
5+
6+
**Output:**
7+
```
8+
9+
Available Options
10+
11+
optional arguments:
12+
-h, --help show this help message and exit
13+
-i INPUT_PATH, --input_path INPUT_PATH
14+
Enter the path of the file or the folder to process
15+
-a {watermark,unwatermark}, --action {watermark,unwatermark}
16+
Choose whether to watermark or to unwatermark
17+
-m {RAM,HDD}, --mode {RAM,HDD}
18+
Choose whether to process on the hard disk drive or in memory
19+
-w WATERMARK_TEXT, --watermark_text WATERMARK_TEXT
20+
Enter a valid watermark text
21+
-p PAGES, --pages PAGES
22+
Enter the pages to consider e.g.: [2,4]
23+
```
24+
- To add a watermark with any text to the `lorem-ipsum.pdf` file and save the result as `watermarked_lorem-ipsum.pdf`:
25+
```
26+
python pdf_watermarker.py -i lorem-ipsum.pdf -a watermark -w "text here" -o watermarked_lorem-ipsum.pdf
27+
```

Diff for: general/add-watermark-pdf/lorem-ipsum.pdf

75.3 KB
Binary file not shown.

Diff for: general/add-watermark-pdf/pdf_watermarker.py

+300
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
from PyPDF4 import PdfFileReader, PdfFileWriter
2+
from PyPDF4.pdf import ContentStream
3+
from PyPDF4.generic import TextStringObject, NameObject
4+
from PyPDF4.utils import b_
5+
import os
6+
import argparse
7+
from io import BytesIO
8+
from typing import Tuple
9+
# Import the reportlab library
10+
from reportlab.pdfgen import canvas
11+
# The size of the page supposedly A4
12+
from reportlab.lib.pagesizes import A4
13+
# The color of the watermark
14+
from reportlab.lib import colors
15+
16+
# Page size of the generated watermark template
PAGESIZE = A4
# Font face and font size used to draw the watermark text
FONTNAME = 'Helvetica-Bold'
FONTSIZE = 40
# Watermark color. Either a color taken from the reportlab colors module, e.g.
# COLOR = colors.lightgrey
# or a plain RGB tuple whose components are in the 0-255 range
# (create_watermark divides each component by 255), e.g.
# COLOR = (190, 190, 190)
COLOR = colors.red
# The position attributes of the watermark (passed to drawString)
X = 250
Y = 10
# The rotation angle in order to display the watermark diagonally if needed
ROTATION_ANGLE = 45
29+
30+
31+
def get_info(input_file: str):
    """
    Extract and print the basic information of a PDF file.

    The metadata fields (author, creator, producer, subject, title) and the
    number of pages are only collected when the file is not encrypted.

    :param input_file: path of the PDF file to inspect.
    :return: a ``(True, output)`` tuple where ``output`` is the dictionary of
        collected values, in the order they are printed.
    """
    # If PDF is encrypted the file metadata cannot be extracted
    with open(input_file, 'rb') as pdf_file:
        pdf_reader = PdfFileReader(pdf_file, strict=False)
        encrypted = pdf_reader.isEncrypted
        output = {
            "File": input_file,
            "Encrypted": "True" if encrypted else "False",
        }
        if not encrypted:
            # getDocumentInfo() may return None when the PDF carries no
            # document information dictionary; getattr() keeps every field
            # present (as None) instead of raising AttributeError.
            info = pdf_reader.getDocumentInfo()
            num_pages = pdf_reader.getNumPages()
            metadata_fields = (
                ("Author", "author"),
                ("Creator", "creator"),
                ("Producer", "producer"),
                ("Subject", "subject"),
                ("Title", "title"),
            )
            for label, attribute in metadata_fields:
                output[label] = getattr(info, attribute, None)
            output["Number of pages"] = num_pages
    # To Display collected metadata
    print("## File Information ##################################################")
    print("\n".join("{}:{}".format(key, value) for key, value in output.items()))
    print("######################################################################")
    return True, output
55+
56+
57+
def get_output_file(input_file: str, output_file: str):
    """
    Decide where the processed PDF must be written.

    A temporary file (``tmp_<input name>`` next to the input file) is needed
    when no output file was given or when the output file is the input file
    itself.

    :return: ``(True, temporary_path)`` when a temporary file is needed,
        ``(False, output_file)`` otherwise.
    """
    needs_temporary = (not output_file) or (output_file == input_file)
    if not needs_temporary:
        return False, output_file
    folder, name = os.path.dirname(input_file), os.path.basename(input_file)
    return True, os.path.join(folder, 'tmp_' + name)
69+
70+
71+
def create_watermark(wm_text: str):
    """
    Build a one-page watermark template in memory.

    The text is drawn with the module-level font, color, position and
    rotation settings.

    :param wm_text: the text of the watermark.
    :return: ``(True, buffer)`` with the generated PDF held in a ``BytesIO``
        buffer, or ``(False, None)`` when ``wm_text`` is empty.
    """
    if not wm_text:
        return False, None
    # The template is generated into a memory buffer, not on disk
    buffer = BytesIO()
    pdf_canvas = canvas.Canvas(buffer, pagesize=PAGESIZE)
    # An image could be used instead of text, e.g.
    # pdf_canvas.drawImage("logo.png", X, Y, 160, 160)
    pdf_canvas.setFont(FONTNAME, FONTSIZE)
    if isinstance(COLOR, tuple):
        # A plain tuple holds 0-255 components: scale them down to 0-1
        scaled_components = [component / 255 for component in COLOR]
        pdf_canvas.setFillColorRGB(*scaled_components)
    else:
        pdf_canvas.setFillColor(COLOR)
    # Rotate first, then draw the text at the configured position
    pdf_canvas.rotate(ROTATION_ANGLE)
    pdf_canvas.drawString(X, Y, wm_text)
    pdf_canvas.save()
    return True, buffer
97+
98+
99+
def save_watermark(wm_buffer, output_file):
    """
    Saves the generated watermark template to disk.

    :param wm_buffer: ``BytesIO`` buffer holding the watermark template.
    :param output_file: path of the file to write (overwritten if it exists).
    :return: ``True`` once the buffer has been written.
    """
    # The with-statement closes the file, no explicit close() is needed
    with open(output_file, mode='wb') as f:
        f.write(wm_buffer.getbuffer())
    return True
107+
108+
109+
def watermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """
    Adds watermark to a pdf file.

    :param input_file: path of the PDF file to watermark.
    :param wm_text: the watermark text; an empty text is reported as a failure.
    :param pages: optional collection of zero-based page indexes (strings or
        integers). When given, only these pages are watermarked.
        NOTE(review): pages that are not selected are skipped entirely, so
        they are not part of the returned writer either.
    :return: ``(True, pdf_reader, pdf_writer)`` on success; the caller is
        responsible for closing ``pdf_reader.stream``. ``(False, None, None)``
        when no watermark could be created or a page could not be merged.
    """
    result, wm_buffer = create_watermark(wm_text)
    if not result:
        # Without this guard the function would reach its final return with
        # pdf_reader / pdf_writer never bound.
        return False, None, None
    # Page indexes are compared as strings so that both ('2', '4') and (2, 4)
    # select the same pages.
    selected = {str(entry) for entry in pages} if pages else None
    wm_reader = PdfFileReader(wm_buffer)
    pdf_stream = open(input_file, 'rb')
    try:
        pdf_reader = PdfFileReader(pdf_stream, strict=False)
    except Exception:
        # Do not leak the file handle when the input cannot be read
        pdf_stream.close()
        raise
    pdf_writer = PdfFileWriter()
    try:
        for page_index in range(pdf_reader.getNumPages()):
            # If required to watermark specific pages not all the document pages
            if selected is not None and str(page_index) not in selected:
                continue
            page = pdf_reader.getPage(page_index)
            page.mergePage(wm_reader.getPage(0))
            pdf_writer.addPage(page)
    except Exception as e:
        pdf_stream.close()
        print("Exception = ", e)
        return False, None, None
    return True, pdf_reader, pdf_writer
132+
133+
134+
def unwatermark_pdf(input_file: str, wm_text: str, pages: Tuple = None):
    """
    Removes watermark from the pdf file.

    Every string shown by a ``Tj`` operator that starts with ``wm_text`` is
    replaced with an empty string in the page content stream.

    :param input_file: path of the PDF file to process.
    :param wm_text: the watermark text to remove; an empty text removes
        nothing (it would otherwise match every string of the page).
    :param pages: optional collection of zero-based page indexes (strings or
        integers). When given, only these pages are processed.
        NOTE(review): pages that are not selected are skipped entirely, so
        they are not part of the returned writer either.
    :return: ``(True, pdf_reader, pdf_writer)``; the caller is responsible for
        closing ``pdf_reader.stream``.
    """
    # Page indexes are compared as strings so that both ('2', '4') and (2, 4)
    # select the same pages.
    selected = {str(entry) for entry in pages} if pages else None
    pdf_stream = open(input_file, 'rb')
    try:
        pdf_reader = PdfFileReader(pdf_stream, strict=False)
        pdf_writer = PdfFileWriter()
        show_text_operator = b_("Tj")
        for page_index in range(pdf_reader.getNumPages()):
            # If required for specific pages
            if selected is not None and str(page_index) not in selected:
                continue
            page = pdf_reader.getPage(page_index)
            # A page without any content stream has nothing to clean up
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, pdf_reader)
                # Loop through all the page elements
                for operands, operator in content.operations:
                    # Only the Tj operator is inspected: its string operand
                    # is blanked when it starts with the watermark text
                    if operator != show_text_operator:
                        continue
                    text = operands[0]
                    if wm_text and isinstance(text, str) and text.startswith(wm_text):
                        operands[0] = TextStringObject('')
                page[NameObject('/Contents')] = content
            pdf_writer.addPage(page)
    except Exception:
        # Do not leak the file handle when the processing fails
        pdf_stream.close()
        raise
    return True, pdf_reader, pdf_writer
159+
160+
161+
def watermark_unwatermark_file(**kwargs):
    """
    Watermarks or unwatermarks a single PDF file.

    Keyword arguments:
        input_file: path of the PDF file to process.
        wm_text: the watermark text to add or to remove.
        action: ``"watermark"`` or ``"unwatermark"``.
        mode: ``"RAM"`` builds the result in memory before writing it,
            ``"HDD"`` writes it straight to the output (or temporary) file.
            Any other value writes nothing.
        pages: optional pages to consider.
        output_file: path of the result; when missing or equal to
            ``input_file`` the input file is overwritten.

    :raises ValueError: when ``action`` is not one of the supported actions.
    """
    input_file = kwargs.get('input_file')
    wm_text = kwargs.get('wm_text')
    action = kwargs.get('action')
    mode = kwargs.get('mode')
    pages = kwargs.get('pages')
    # Reject unknown actions up front: they used to fail later with an
    # UnboundLocalError on ``result``.
    if action not in ("watermark", "unwatermark"):
        raise ValueError(f"Unsupported action: {action!r}")
    temporary, output_file = get_output_file(
        input_file, kwargs.get('output_file'))
    if action == "watermark":
        result, pdf_reader, pdf_writer = watermark_pdf(
            input_file=input_file, wm_text=wm_text, pages=pages)
    else:
        result, pdf_reader, pdf_writer = unwatermark_pdf(
            input_file=input_file, wm_text=wm_text, pages=pages)
    if not result:
        return
    if mode == "RAM":
        # Generate to memory
        output_buffer = BytesIO()
        pdf_writer.write(output_buffer)
        # The input must be released before it is possibly overwritten
        pdf_reader.stream.close()
        # No need to create a temporary file in RAM Mode
        if temporary:
            output_file = input_file
        with open(output_file, mode='wb') as f:
            f.write(output_buffer.getbuffer())
    elif mode == "HDD":
        # Generate to a new file on the hard disk
        with open(output_file, 'wb') as pdf_output_file:
            pdf_writer.write(pdf_output_file)
        # The input must be released before it is replaced
        pdf_reader.stream.close()
        if temporary and os.path.isfile(input_file):
            os.replace(output_file, input_file)
    else:
        # Unknown mode: nothing is written, but the input is still released
        pdf_reader.stream.close()
203+
204+
205+
def watermark_unwatermark_folder(**kwargs):
    """
    Watermarks or unwatermarks every ``.pdf`` file found in a folder.

    Keyword arguments:
        input_folder: the folder to scan.
        wm_text: the watermark text to add or to remove.
        recursive: when true the sub-folders are scanned as well, otherwise
            only the files directly inside ``input_folder`` are processed.
        action: ``"watermark"`` or ``"unwatermark"``.
        mode: ``"RAM"`` or ``"HDD"``, forwarded to the per-file processing.
        pages: optional pages to consider.

    Each file is processed in place (no output file is passed).
    """
    root = kwargs.get('input_folder')
    recursive = kwargs.get('recursive')
    # Options forwarded unchanged to the per-file processing
    forwarded = {
        'wm_text': kwargs.get('wm_text'),
        'action': kwargs.get('action'),
        'mode': kwargs.get('mode'),
        'pages': kwargs.get('pages'),
    }
    for current_folder, _subfolders, entries in os.walk(root):
        pdf_names = [entry for entry in entries if entry.endswith('.pdf')]
        for pdf_name in pdf_names:
            pdf_path = os.path.join(current_folder, pdf_name)
            print("Processing file:", pdf_path)
            watermark_unwatermark_file(
                input_file=pdf_path, output_file=None, **forwarded)
        # os.walk yields the top folder first: stopping here skips sub-folders
        if not recursive:
            break
234+
235+
236+
def is_valid_path(path):
    """
    Validate a path entered by the user.

    :param path: the path to check.
    :return: ``path`` unchanged when it is an existing file or folder.
    :raises ValueError: when ``path`` is empty or is neither an existing file
        nor an existing folder.
    """
    if not path:
        raise ValueError("Invalid Path")
    is_file_or_folder = os.path.isfile(path) or os.path.isdir(path)
    if not is_file_or_folder:
        raise ValueError(f"Invalid Path {path}")
    return path
248+
249+
250+
def parse_args():
    """
    Get user command line parameters.

    The ``--output_file`` option is only available when the input path is a
    file and the ``--recursive`` option only when it is a folder, so the
    returned dictionary holds ``output_file`` or ``recursive`` accordingly.

    :return: the parsed arguments as a dictionary.
    """
    parser = argparse.ArgumentParser(description="Available Options")
    parser.add_argument('-i', '--input_path', dest='input_path', type=is_valid_path,
                        required=True, help="Enter the path of the file or the folder to process")
    parser.add_argument('-a', '--action', dest='action', choices=[
                        'watermark', 'unwatermark'], type=str, default='watermark',
                        help="Choose whether to watermark or to unwatermark")
    parser.add_argument('-m', '--mode', dest='mode', choices=['RAM', 'HDD'], type=str,
                        default='RAM', help="Choose whether to process on the hard disk drive or in memory")
    parser.add_argument('-w', '--watermark_text', dest='watermark_text',
                        type=str, required=True, help="Enter a valid watermark text")
    # type=tuple would split the value into single characters, which breaks
    # multi-digit page numbers; parse it into whole page indexes instead.
    parser.add_argument('-p', '--pages', dest='pages', type=_parse_pages,
                        help="Enter the pages to consider e.g.: [2,4]")
    # First pass: only the input path is needed to know which extra option
    # applies (unknown options are tolerated at this stage).
    path = parser.parse_known_args()[0].input_path
    if os.path.isfile(path):
        parser.add_argument('-o', '--output_file', dest='output_file',
                            type=str, help="Enter a valid output file")
    if os.path.isdir(path):
        parser.add_argument('-r', '--recursive', dest='recursive', default=False, type=lambda x: (
            str(x).lower() in ['true', '1', 'yes']), help="Process Recursively or Non-Recursively")
    # To Parse The Command Line Arguments
    args = vars(parser.parse_args())
    # To Display The Command Line Arguments
    print("## Command Arguments #################################################")
    print("\n".join("{}:{}".format(i, j) for i, j in args.items()))
    print("######################################################################")
    return args


def _parse_pages(value):
    """
    Convert a page list such as "[2,4]", "(2, 4)" or "2,4" into a tuple of
    page-index strings, e.g. ('2', '4').

    :raises argparse.ArgumentTypeError: when an entry is not a whole number.
    """
    stripped = str(value).strip().strip('[]()')
    tokens = [token.strip() for token in stripped.split(',')]
    tokens = [token for token in tokens if token]
    if not all(token.isdecimal() for token in tokens):
        raise argparse.ArgumentTypeError(f"Invalid pages value: {value!r}")
    # int() normalizes entries such as "02" to "2"
    return tuple(str(int(token)) for token in tokens)
280+
281+
282+
if __name__ == '__main__':
    # Parsing command line arguments entered by user
    args = parse_args()
    input_path = args['input_path']
    if os.path.isfile(input_path):
        # Single file: display its information, then process it
        get_info(input_file=input_path)
        watermark_unwatermark_file(
            input_file=input_path,
            wm_text=args['watermark_text'],
            action=args['action'],
            mode=args['mode'],
            output_file=args['output_file'],
            pages=args['pages'],
        )
    elif os.path.isdir(input_path):
        # Folder: process the PDF files it contains
        watermark_unwatermark_folder(
            input_folder=input_path,
            wm_text=args['watermark_text'],
            action=args['action'],
            mode=args['mode'],
            recursive=args['recursive'],
            pages=args['pages'],
        )

Diff for: general/add-watermark-pdf/requirements.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
PyPDF4==1.27.0
2+
reportlab==3.5.59

0 commit comments

Comments
 (0)