# paizo-dl.py
# Paizo Digital Assets Downloader: lists the digital assets available to a
# Paizo account and downloads them one at a time.
import requests
import time
import os
import argparse
import zipfile
import re
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Page that is scraped for the list of digital assets.
ASSETS_URL = "https://paizo.com/paizo/account/assets"
# Browser identity sent with every request (also stored in HEADERS below).
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
# Directory that receives the downloaded files.
DOWNLOAD_DIR = "downloads"
# How many times a download is attempted before the asset is skipped.
MAX_RETRIES = 3

# Request headers, copied from a browser-generated curl command.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9,nb;q=0.8',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://paizo.com/paizo/account/assets',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Linux"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': USER_AGENT,
}

# Cookies, copied from the same curl command.
COOKIES = {
    # Replace with your actual sessionId from the browser (see the web
    # developer tools).
    'sessionId': 'CHANGE ME',
    '_pk_id.1.638a': '156ac78b02e09b8e.1725558046.',
    '_pk_ses.1.638a': '1',
}
def sanitize_name(name):
    """Sanitize an asset name so it can safely be used as a file name.

    Three passes are applied, in order:

    1. Every character that is not a word character, whitespace or one of
       ``( ) - . ,`` is dropped.
    2. Any 'Personalizing...' status message is removed up to the end of the
       line it appears on.
    3. Runs of whitespace (newlines and tabs included) are collapsed to a
       single space and the result is stripped.

    Args:
        name: Raw asset name as scraped from the page.

    Returns:
        The cleaned, single-line name.
    """
    # Remove any non-alphanumeric characters, except for a few allowed ones
    cleaned = re.sub(r'[^\w\s\(\)\-\.\,]', '', name)
    # Not anchored with `$`: an anchored pattern only matches when the message
    # sits on the last line, so a message followed by more text would survive.
    cleaned = re.sub(r'Personalizing[^\n]*', '', cleaned)
    # Newlines must not end up inside a file name; fold all whitespace runs.
    return re.sub(r'\s+', ' ', cleaned).strip()
def fetch_assets(session, debug=False):
    """Fetch the list of digital assets and their download packages from Paizo.

    Args:
        session: Object whose ``get(url)`` returns a response exposing
            ``status_code``, ``text`` and ``content``.
        debug: When true, print the response status code, the first 1000
            characters of the HTML and the number of ``<tbody>`` elements.

    Returns:
        A list of dicts with the keys ``'hex_id'``, ``'name'`` and
        ``'download_package'``. The list is empty when the page does not
        answer with status 200 or no asset rows are found.
    """
    print(f"Fetching assets from: {ASSETS_URL}")
    response = session.get(ASSETS_URL)
    if debug:
        print(f"Response Status Code: {response.status_code}")
    if response.status_code != 200:
        print(f"Failed to load assets page. Status code: {response.status_code}")
        return []

    # Debug: Print a portion of the HTML response for inspection
    if debug:
        print("Snippet of fetched HTML (first 1000 characters):")
        print(response.text[:1000])

    # Parse the HTML response
    soup = BeautifulSoup(response.content, 'html.parser')
    tbodies = soup.find_all('tbody')
    if debug:
        print(f"Number of <tbody> elements found: {len(tbodies)}")

    assets = []
    for tbody in tbodies:
        # href=True skips anchors that carry no href attribute, which would
        # otherwise raise KeyError on the lookups below.
        asset_link = tbody.find('a', href=True)
        # Test for the exact token that is split on, so the [1] index below
        # can never run past the end of the split result.
        if asset_link is None or 'digitalAsset=' not in asset_link['href']:
            continue
        hex_id = asset_link['href'].split('digitalAsset=')[1].split('&')[0]

        # The title is taken from the <b> inside the link; fall back to the
        # link's own text when that tag is missing.
        title_tag = asset_link.find('b')
        title_source = title_tag if title_tag is not None else asset_link
        asset_name = title_source.text.strip()

        # Extract format or version (e.g., "Single File" or "File Per Chapter")
        version_span = tbody.find('span', class_='tiny no-wrap')
        asset_version = version_span.text.strip() if version_span else "Unknown Format"

        assets.append({
            'hex_id': hex_id,
            'name': sanitize_name(f"{asset_name} ({asset_version})"),
            'download_package': extract_download_package(tbody),
        })
    return assets
def extract_download_package(tbody):
    """Extract the download package id from the links of one table body.

    Every anchor that has an ``href`` is inspected, not just the first one,
    so the package is still found when another link precedes it.

    Args:
        tbody: Element exposing ``find_all('a', href=True)`` whose results
            can be indexed with ``['href']``.

    Returns:
        The value of the ``downloadPackage`` query parameter of the first
        matching link, or ``None`` when no link carries one.
    """
    for link in tbody.find_all('a', href=True):
        href = link['href']
        if 'downloadPackage=' in href:
            # Keep only the value: drop everything from the next '&' onwards.
            return href.split('downloadPackage=')[1].split('&')[0]
    return None  # Fallback if not found
def get_unique_filename(file_path):
    """Return a path that does not exist yet, derived from ``file_path``.

    When ``file_path`` is free it is returned unchanged. Otherwise
    ``-(1)``, ``-(2)``, ... is inserted before the extension until a name
    is found that is not present on disk.
    """
    if not os.path.exists(file_path):
        return file_path

    stem, suffix = os.path.splitext(file_path)
    attempt = 1
    while True:
        candidate = f"{stem}-({attempt}){suffix}"
        if not os.path.exists(candidate):
            return candidate
        attempt += 1
def start_personalization_process(session, hex_id, download_package, debug=False, wait_seconds=40):
    """Send a request to start the personalization process for the asset.

    Args:
        session: Object whose ``get(url)`` returns a response exposing
            ``status_code``.
        hex_id: Value for the ``digitalAsset`` query parameter.
        download_package: Value for the ``downloadPackage`` query parameter.
        debug: When true, print progress messages.
        wait_seconds: How long to pause after a successful request so the
            server has time to personalize the asset. Defaults to 40.

    Returns:
        ``True`` when the server answered with status 200 (after the pause),
        ``False`` when the request failed or any other status was returned.
    """
    personalization_url = f"https://paizo.com/cgi-bin/WebObjects/Store.woa/wa/Personalizer/downloadDigitalAsset?digitalAsset={hex_id}&downloadPackage={download_package}&_r=true"

    # Send the request to initiate personalization
    if debug:
        print(f"Initiating personalization for asset {hex_id}...")
    try:
        response = session.get(personalization_url)
    except requests.RequestException as exc:
        # A transport error for one asset should not abort the whole run.
        print(f"Failed to start personalization for {hex_id}: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to start personalization for {hex_id}. Status code: {response.status_code}")
        return False

    if debug:
        print(f"Personalization started for {hex_id}. Waiting for server to prepare the asset.")
    time.sleep(wait_seconds)
    return True
def _pick_file_extension(content_type, asset_name):
    """Map a Content-Type to a file extension; ``None`` means an HTML page came back."""
    if 'application/pdf' in content_type:
        return '.pdf'
    if 'application/zip' in content_type or 'application/octet-stream' in content_type:
        # If the asset name contains "ePub", assume it's an EPUB file
        return '.epub' if 'epub' in asset_name.lower() else '.zip'
    if 'html' in content_type:
        return None
    return '.bin'  # Fallback to binary if unsure


def download_asset(session, hex_id, download_package, asset_name, debug=False):
    """Download the asset after personalization completes, with retries.

    Up to ``MAX_RETRIES`` attempts are made, pausing 10 seconds after each
    failed one. An attempt fails when the request raises, the status is not
    200, or an HTML page is returned instead of a file.

    Args:
        session: Object whose ``get(url, stream=True)`` returns a response
            usable as a context manager and exposing ``status_code``,
            ``headers`` and ``iter_content``.
        hex_id: Value for the ``digitalAsset`` query parameter.
        download_package: Value for the ``downloadPackage`` query parameter.
        asset_name: Base name of the file written into ``DOWNLOAD_DIR``.
        debug: When true, print progress messages.

    Returns:
        The path of the saved file, or ``None`` when every attempt failed.
    """
    # The URL does not change between attempts, so build it once.
    download_check_url = f"https://paizo.com/cgi-bin/WebObjects/Store.woa/wa/Personalizer/downloadDigitalAsset?digitalAsset={hex_id}&downloadPackage={download_package}&_r=true"

    for attempt in range(1, MAX_RETRIES + 1):
        if debug:
            print(f"Fetching and downloading asset for {hex_id} (Attempt {attempt}/{MAX_RETRIES})...")

        try:
            response = session.get(download_check_url, stream=True)
        except requests.RequestException as exc:
            # Treat a transport error like any other failed attempt.
            print(f"Failed to download {asset_name}: {exc}")
            time.sleep(10)  # Wait before retrying
            continue

        # A streamed response keeps its connection until it is closed; the
        # `with` releases it on every path, including the retries below.
        with response:
            content_type = response.headers.get('Content-Type', '')

            if response.status_code != 200:
                print(f"Failed to download {asset_name}. Status code: {response.status_code}")
                print(f"Content-Type: {content_type}")
                time.sleep(10)  # Wait before retrying
                continue

            file_extension = _pick_file_extension(content_type, asset_name)
            if file_extension is None:
                print(f"Received HTML page instead of the file for {asset_name}. Retrying...")
                time.sleep(10)  # Wait before retrying
                continue

            if debug:
                print(f"Downloading file for asset {hex_id} (Content-Type: {content_type})...")

            # Ensure the download directory exists
            os.makedirs(DOWNLOAD_DIR, exist_ok=True)

            # Never overwrite an earlier download with the same name.
            file_path = get_unique_filename(
                os.path.join(DOWNLOAD_DIR, f"{asset_name}{file_extension}")
            )

            # Save the file
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive new chunks
                        file.write(chunk)

        print(f"{asset_name} downloaded successfully and saved to {file_path}")

        # Unzip if it's a zip file and rename contents. 'application/octet-stream'
        # is only a guess at zip, so verify before extracting.
        if file_extension == '.zip':
            if zipfile.is_zipfile(file_path):
                unzip_and_rename(file_path, asset_name)
            else:
                print(f"{file_path} is not a valid zip archive; leaving it as downloaded.")
        return file_path  # Exit after a successful download

    print(f"Failed to download {asset_name} after {MAX_RETRIES} attempts. Skipping.")
    return None
def unzip_and_rename(zip_path, asset_name):
    """Extract a zip archive and prefix every extracted file with the asset name.

    The archive is unpacked into a directory named like ``zip_path`` minus
    its extension. Each file found below that directory is then renamed to
    ``<asset_name>_<original file name>`` within its own folder.
    """
    target_dir, _ = os.path.splitext(zip_path)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(target_dir)

    # Walk the extracted tree and apply the prefix file by file.
    for folder, _subdirs, filenames in os.walk(target_dir):
        for filename in filenames:
            os.rename(
                os.path.join(folder, filename),
                os.path.join(folder, f"{asset_name}_{filename}"),
            )

    print(f"Unzipped and renamed files in: {target_dir}")
def main():
    """Command-line entry point: list the account's assets or download them.

    With ``--list-assets`` the assets are printed and nothing is downloaded.
    Otherwise each asset is personalized and downloaded, one at a time.
    ``--debug`` enables verbose output in the helper functions.
    """
    # Setup argument parser
    parser = argparse.ArgumentParser(description="Paizo Digital Assets Downloader")
    parser.add_argument('--list-assets', action='store_true', help="List all available assets without downloading.")
    parser.add_argument('--debug', action='store_true', help="Enable debug output.")
    args = parser.parse_args()

    # Point at the likely cause up front rather than leaving the user with
    # only a later "No digital assets found." message.
    if COOKIES.get('sessionId') == 'CHANGE ME':
        print("Warning: the sessionId cookie is still set to the placeholder 'CHANGE ME'. "
              "Replace it with the sessionId from your browser.")

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        session.headers.update(HEADERS)
        session.cookies.update(COOKIES)

        # Fetch all assets
        assets = fetch_assets(session, debug=args.debug)
        if not assets:
            print("No digital assets found.")
            return

        # If listing assets, print them and exit
        if args.list_assets:
            print("Available Digital Assets:")
            for idx, asset in enumerate(assets, start=1):
                print(f"{idx}. Hex ID: {asset['hex_id']} - Name: {asset['name']}")
            return

        # Iterate over each asset and download one at a time
        for asset in assets:
            start_personalization_process(session, asset['hex_id'], asset['download_package'], debug=args.debug)
            print(f"Waiting for asset {asset['name']} to be ready for download...")
            time.sleep(40)  # Adjust delay if needed to wait for personalization to complete
            download_asset(session, asset['hex_id'], asset['download_package'], asset['name'], debug=args.debug)


if __name__ == "__main__":
    main()