From d56fb9c98496c88f6ab8c0999cb300b551dc2129 Mon Sep 17 00:00:00 2001
From: Daniel Parks <dp+git@oxidized.org>
Date: Mon, 30 Dec 2024 09:31:59 -0800
Subject: [PATCH] Expand list of invalid characters in file names
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are issues with certain characters in file names (see issue #8 —
Check how Anki handles special characters in `[sound: ...]`).

I’ve listed my best guess at which characters are a problem in
`FILENAME_ILLEGAL_CHARS`, added escaping to prevent them, and added a
check to ensure that none made it into the video ID, which is embedded
directly in a file name.
---
 yanki/video.py | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/yanki/video.py b/yanki/video.py
index e6c2dcb..5c7fc49 100644
--- a/yanki/video.py
+++ b/yanki/video.py
@@ -20,6 +20,10 @@
 
 STILL_FORMATS = frozenset(['png', 'jpeg', 'jpg'])
 TIME_FORMAT = '%0.06f'
+FILENAME_ILLEGAL_CHARS = '/"[]'
+
+def chars_in(chars, input):
+  return [char for char in chars if char in input]
 
 class BadURL(ValueError):
   pass
@@ -72,7 +76,16 @@ def url_to_id(url_str):
     # Try to load the URL with yt_dlp and see what happens.
     pass
 
-  return url_str.replace('|', '||').replace('/', '|')
+  # FIXME check this against FILENAME_ILLEGAL_CHARS somehow
+  return (
+    url_str
+    .replace('\\', '\\\\')
+    .replace('|', r'\|')
+    .replace('"', r"\'")
+    .replace('[', r"\(")
+    .replace(']', r"\)")
+    .replace('/', '|')
+  )
 
 def file_url_to_path(url):
   parts = urlparse(url)
@@ -102,15 +115,17 @@ def __init__(self, url, working_dir='.', cache_path='.', reprocess=False, logger
     self.cache_path = cache_path
     self.reprocess = reprocess
     self.logger = logger
+
+    self.id = url_to_id(url)
+    invalid = chars_in(FILENAME_ILLEGAL_CHARS, self.id)
+    if invalid:
+      raise BadURL(
+        f'Invalid characters ({"".join(invalid)}) in video ID: {repr(self.id)}'
+      )
+
     self._info = None
     self._raw_metadata = None
     self._format = None
-
-    # ffmpeg parameters. Every run with the same parameters should produce the
-    # same video (assuming the source hasn’t changed).
-    self.id = url_to_id(url)
-    if '/' in self.id:
-      raise BadURL(f"Invalid '/' in video ID: {repr(self.id)}")
     self._crop = None
     self._overlay_text = ''
     self._slow_filter = None
@@ -133,7 +148,7 @@ def raw_metadata_cache_path(self):
   def processed_video_cache_path(self, prefix='processed_'):
     parameters = '_'.join(self.parameters())
 
-    if '/' in parameters or len(parameters) > 60:
+    if len(parameters) > 60 or chars_in(FILENAME_ILLEGAL_CHARS, parameters):
       parameters = hashlib.blake2b(
         parameters.encode(encoding='utf-8'),
         digest_size=16,