From c4771582c278ff9b7541f7ebab86de8444fa7ef4 Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:14:21 +0200 Subject: [PATCH 01/11] Move errors to a specfic file --- lib/errors.py | 2 ++ lib/markdown2.py | 8 ++------ 2 files changed, 4 insertions(+), 6 deletions(-) create mode 100644 lib/errors.py diff --git a/lib/errors.py b/lib/errors.py new file mode 100644 index 00000000..65f53087 --- /dev/null +++ b/lib/errors.py @@ -0,0 +1,2 @@ +class MarkdownError(Exception): + pass diff --git a/lib/markdown2.py b/lib/markdown2.py index 397a832a..d5b7ee04 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -111,6 +111,8 @@ import codecs from collections import defaultdict +from lib.errors import MarkdownError + # ---- globals DEBUG = False @@ -133,12 +135,6 @@ def _hash_text(s): # http://bumppo.net/projects/amputator/ _AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') - -# ---- exceptions -class MarkdownError(Exception): - pass - - # ---- public api def markdown_path(path, encoding="utf-8", From 102c3a9a087651db2d7f6f448bf2e5447c417477 Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:17:55 +0200 Subject: [PATCH 02/11] Move slugify to utils --- lib/markdown2.py | 19 ++----------------- lib/utils.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 17 deletions(-) create mode 100644 lib/utils.py diff --git a/lib/markdown2.py b/lib/markdown2.py index d5b7ee04..8271cba3 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -112,6 +112,7 @@ from collections import defaultdict from lib.errors import MarkdownError +from lib.utils import slugify # ---- globals @@ -1604,7 +1605,7 @@ def header_id_from_text(self, text, prefix, n): None to not have an id attribute and to exclude this header from the TOC (if the "toc" extra is specified). """ - header_id = _slugify(text) + header_id = slugify(text) if prefix and isinstance(prefix, str): header_id = prefix + '-' + header_id @@ -2504,22 +2505,6 @@ class UnicodeWithAttrs(str): metadata = None toc_html = None -## {{{ http://code.activestate.com/recipes/577257/ (r1) -_slugify_strip_re = re.compile(r'[^\w\s-]') -_slugify_hyphenate_re = re.compile(r'[-\s]+') -def _slugify(value): - """ - Normalizes string, converts to lowercase, removes non-alpha characters, - and converts spaces to hyphens. - - From Django's "django/template/defaultfilters.py". - """ - import unicodedata - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode() - value = _slugify_strip_re.sub('', value).strip().lower() - return _slugify_hyphenate_re.sub('-', value) -## end of http://code.activestate.com/recipes/577257/ }}} - # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 def _curry(*args, **kwargs): diff --git a/lib/utils.py b/lib/utils.py new file mode 100644 index 00000000..aebd152d --- /dev/null +++ b/lib/utils.py @@ -0,0 +1,23 @@ +import re + + +## {{{ http://code.activestate.com/recipes/577257/ (r1) + +_slugify_strip_re = re.compile(r"[^\w\s-]") +_slugify_hyphenate_re = re.compile(r"[-\s]+") + + +def slugify(value): + """ + Normalizes string, converts to lowercase, removes non-alpha characters, + and converts spaces to hyphens. + + From Django's "django/template/defaultfilters.py". 
+    """
+    import unicodedata
+
+    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode()
+    value = _slugify_strip_re.sub("", value).strip().lower()
+    return _slugify_hyphenate_re.sub("-", value)
+
+## end of http://code.activestate.com/recipes/577257/ }}}

From 1b1543e645c726849b861db5fe391902c719f265 Mon Sep 17 00:00:00 2001
From: Yann Rabiller
Date: Fri, 17 Jun 2022 00:22:41 +0200
Subject: [PATCH 03/11] Move calculate_toc_html to utils

---
 lib/markdown2.py | 37 +------------------------------------
 lib/utils.py     | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/lib/markdown2.py b/lib/markdown2.py
index 8271cba3..77e554a5 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -112,7 +112,7 @@
 from collections import defaultdict
 
 from lib.errors import MarkdownError
-from lib.utils import slugify
+from lib.utils import slugify, calculate_toc_html
 
 # ---- globals
 
@@ -2462,41 +2462,6 @@ class MarkdownWithExtras(Markdown):
 
 # ---- internal support functions
 
-def calculate_toc_html(toc):
-    """Return the HTML for the current TOC.
-
-    This expects the `_toc` attribute to have been set on this instance.
-    """
-    if toc is None:
-        return None
-
-    def indent():
-        return '  ' * (len(h_stack) - 1)
-    lines = []
-    h_stack = [0]  # stack of header-level numbers
-    for level, id, name in toc:
-        if level > h_stack[-1]:
-            lines.append("%s<ul>" % indent())
-            h_stack.append(level)
-        elif level == h_stack[-1]:
-            lines[-1] += "</li>"
-        else:
-            while level < h_stack[-1]:
-                h_stack.pop()
-                if not lines[-1].endswith("</li>"):
-                    lines[-1] += "</li>"
-                lines.append("%s</ul></li>" % indent())
-        lines.append('%s<li><a href="#%s">%s</a>' % (
-            indent(), id, name))
-    while len(h_stack) > 1:
-        h_stack.pop()
-        if not lines[-1].endswith("</li>"):
-            lines[-1] += "</li>"
-        lines.append("%s</ul>" % indent())
-    return '\n'.join(lines) + '\n'
-
-
 class UnicodeWithAttrs(str):
     """A subclass of unicode used for the return value of conversion to
     possibly attach some attributes. E.g. the "toc_html" attribute when
diff --git a/lib/utils.py b/lib/utils.py
index aebd152d..d26f8235 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -20,4 +20,39 @@ def slugify(value):
     value = _slugify_strip_re.sub("", value).strip().lower()
     return _slugify_hyphenate_re.sub("-", value)
 
+
 ## end of http://code.activestate.com/recipes/577257/ }}}
+
+
+def calculate_toc_html(toc):
+    """Return the HTML for the current TOC.
+
+    This expects the `_toc` attribute to have been set on this instance.
+    """
+    if toc is None:
+        return None
+
+    def indent():
+        return "  " * (len(h_stack) - 1)
+
+    lines = []
+    h_stack = [0]  # stack of header-level numbers
+    for level, id, name in toc:
+        if level > h_stack[-1]:
+            lines.append("%s<ul>" % indent())
+            h_stack.append(level)
+        elif level == h_stack[-1]:
+            lines[-1] += "</li>"
+        else:
+            while level < h_stack[-1]:
+                h_stack.pop()
+                if not lines[-1].endswith("</li>"):
+                    lines[-1] += "</li>"
+                lines.append("%s</ul></li>" % indent())
+        lines.append('%s<li><a href="#%s">%s</a>' % (indent(), id, name))
+    while len(h_stack) > 1:
+        h_stack.pop()
+        if not lines[-1].endswith("</li>"):
+            lines[-1] += "</li>"
+        lines.append("%s</ul>" % indent())
+    return "\n".join(lines) + "\n"

From 1306c713c9bc1154cfb1f804904fa696985ecb10 Mon Sep 17 00:00:00 2001
From: Yann Rabiller
Date: Fri, 17 Jun 2022 00:23:55 +0200
Subject: [PATCH 04/11] Move curry to utils

---
 lib/markdown2.py | 14 ++------------
 lib/utils.py     | 12 ++++++++++++
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/lib/markdown2.py b/lib/markdown2.py
index 77e554a5..b2221a27 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -112,7 +112,7 @@
 from collections import defaultdict
 
 from lib.errors import MarkdownError
-from lib.utils import slugify, calculate_toc_html
+from lib.utils import slugify, calculate_toc_html, curry
 
 # ---- globals
 
@@ -757,7 +757,7 @@ def _hash_html_blocks(self, text, raw=False):
             return text
 
         # Pass `raw` value into our calls to self._hash_html_block_sub.
-        hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
+        hash_html_block_sub = curry(self._hash_html_block_sub, raw=raw)
 
         # First, look for nested blocks, e.g.:
         #   <div>
    @@ -2471,16 +2471,6 @@ class UnicodeWithAttrs(str): toc_html = None -# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 -def _curry(*args, **kwargs): - function, args = args[0], args[1:] - def result(*rest, **kwrest): - combined = kwargs.copy() - combined.update(kwrest) - return function(*args + rest, **combined) - return result - - # Recipe: regex_from_encoded_pattern (1.0) def _regex_from_encoded_pattern(s): """'foo' -> re.compile(re.escape('foo')) diff --git a/lib/utils.py b/lib/utils.py index d26f8235..d5d21521 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -56,3 +56,15 @@ def indent(): lines[-1] += "" lines.append("%s" % indent()) return "\n".join(lines) + "\n" + + +# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 +def curry(*args, **kwargs): + function, args = args[0], args[1:] + + def result(*rest, **kwrest): + combined = kwargs.copy() + combined.update(kwrest) + return function(*args + rest, **combined) + + return result From 1306c713c9bc1154cfb1f804904fa696985ecb10 Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:25:31 +0200 Subject: [PATCH 05/11] Move regex_from_encoded_pattern to utils --- lib/markdown2.py | 129 +++-------------------------------------------- lib/utils.py | 118 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 123 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index b2221a27..393a90b7 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -112,7 +112,7 @@ from collections import defaultdict from lib.errors import MarkdownError -from lib.utils import slugify, calculate_toc_html, curry +from lib.utils import slugify, calculate_toc_html, curry, regex_from_encoded_pattern, dedentlines, dedent # ---- globals @@ -514,7 +514,7 @@ def parse_structured_value(value): # Multiline value if v[:3] == " >\n": - self.metadata[k.strip()] = _dedent(v[3:]).strip() + self.metadata[k.strip()] = dedent(v[3:]).strip() # Empty value elif v == "\n": @@ -944,7 +944,7 @@ def _do_numbering(self, text): def _extract_footnote_def_sub(self, match): id, text = match.groups() - text = _dedent(text, skip_first_line=not text.startswith('\n')).strip() + text = dedent(text, skip_first_line=not text.startswith('\n')).strip() normed_id = re.sub(r'\W', '-', id) # Ensure footnote text ends with a couple newlines (for some # block gamut matches). 
@@ -1031,10 +1031,10 @@ def _run_block_gamut(self, text): def _pyshell_block_sub(self, match): if "fenced-code-blocks" in self.extras: - dedented = _dedent(match.group(0)) + dedented = dedent(match.group(0)) return self._do_fenced_code_blocks("```pycon\n" + dedented + "```\n") lines = match.group(0).splitlines(0) - _dedentlines(lines) + dedentlines(lines) indent = ' ' * self.tab_width s = ('\n' # separate from possible cuddled paragraph + indent + ('\n'+indent).join(lines) @@ -2471,123 +2471,6 @@ class UnicodeWithAttrs(str): toc_html = None -# Recipe: regex_from_encoded_pattern (1.0) -def _regex_from_encoded_pattern(s): - """'foo' -> re.compile(re.escape('foo')) - '/foo/' -> re.compile('foo') - '/foo/i' -> re.compile('foo', re.I) - """ - if s.startswith('/') and s.rfind('/') != 0: - # Parse it: /PATTERN/FLAGS - idx = s.rfind('/') - _, flags_str = s[1:idx], s[idx+1:] - flag_from_char = { - "i": re.IGNORECASE, - "l": re.LOCALE, - "s": re.DOTALL, - "m": re.MULTILINE, - "u": re.UNICODE, - } - flags = 0 - for char in flags_str: - try: - flags |= flag_from_char[char] - except KeyError: - raise ValueError("unsupported regex flag: '%s' in '%s' " - "(must be one of '%s')" - % (char, s, ''.join(list(flag_from_char.keys())))) - return re.compile(s[1:idx], flags) - else: # not an encoded regex - return re.compile(re.escape(s)) - - -# Recipe: dedent (0.1.2) -def _dedentlines(lines, tabsize=8, skip_first_line=False): - """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines - - "lines" is a list of lines to dedent. - "tabsize" is the tab width to use for indent width calculations. - "skip_first_line" is a boolean indicating if the first line should - be skipped for calculating the indent width and for dedenting. - This is sometimes useful for docstrings and similar. - - Same as dedent() except operates on a sequence of lines. Note: the - lines list is modified **in-place**. - """ - DEBUG = False - if DEBUG: - print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ - % (tabsize, skip_first_line)) - margin = None - for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue - indent = 0 - for ch in line: - if ch == ' ': - indent += 1 - elif ch == '\t': - indent += tabsize - (indent % tabsize) - elif ch in '\r\n': - continue # skip all-whitespace lines - else: - break - else: - continue # skip all-whitespace lines - if DEBUG: print("dedent: indent=%d: %r" % (indent, line)) - if margin is None: - margin = indent - else: - margin = min(margin, indent) - if DEBUG: print("dedent: margin=%r" % margin) - - if margin is not None and margin > 0: - for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue - removed = 0 - for j, ch in enumerate(line): - if ch == ' ': - removed += 1 - elif ch == '\t': - removed += tabsize - (removed % tabsize) - elif ch in '\r\n': - if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line) - lines[i] = lines[i][j:] - break - else: - raise ValueError("unexpected non-whitespace char %r in " - "line %r while removing %d-space margin" - % (ch, line, margin)) - if DEBUG: - print("dedent: %r: %r -> removed %d/%d"\ - % (line, ch, removed, margin)) - if removed == margin: - lines[i] = lines[i][j+1:] - break - elif removed > margin: - lines[i] = ' '*(removed-margin) + lines[i][j+1:] - break - else: - if removed: - lines[i] = lines[i][removed:] - return lines - - -def _dedent(text, tabsize=8, skip_first_line=False): - """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text - - "text" is the text to dedent. 
- "tabsize" is the tab width to use for indent width calculations. - "skip_first_line" is a boolean indicating if the first line should - be skipped for calculating the indent width and for dedenting. - This is sometimes useful for docstrings and similar. - - textwrap.dedent(s), but don't expand tabs to spaces - """ - lines = text.splitlines(1) - _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) - return ''.join(lines) - - class _memoized(object): """Decorator that caches a function's return value each time it is called. If called later with the same arguments, the cached value is returned, and @@ -2786,7 +2669,7 @@ def main(argv=None): raise MarkdownError("%s:%d: invalid link pattern line: %r" % (opts.link_patterns_file, i+1, line)) link_patterns.append( - (_regex_from_encoded_pattern(pat), href)) + (regex_from_encoded_pattern(pat), href)) finally: f.close() else: diff --git a/lib/utils.py b/lib/utils.py index d5d21521..482f5c7e 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -68,3 +68,121 @@ def result(*rest, **kwrest): return function(*args + rest, **combined) return result + + +# Recipe: regex_from_encoded_pattern (1.0) +def regex_from_encoded_pattern(s): + """'foo' -> re.compile(re.escape('foo')) + '/foo/' -> re.compile('foo') + '/foo/i' -> re.compile('foo', re.I) + """ + if s.startswith("/") and s.rfind("/") != 0: + # Parse it: /PATTERN/FLAGS + idx = s.rfind("/") + _, flags_str = s[1:idx], s[idx + 1 :] + flag_from_char = { + "i": re.IGNORECASE, + "l": re.LOCALE, + "s": re.DOTALL, + "m": re.MULTILINE, + "u": re.UNICODE, + } + flags = 0 + for char in flags_str: + try: + flags |= flag_from_char[char] + except KeyError: + raise ValueError( + "unsupported regex flag: '%s' in '%s' " + "(must be one of '%s')" + % (char, s, "".join(list(flag_from_char.keys()))) + ) + return re.compile(s[1:idx], flags) + else: # not an encoded regex + return re.compile(re.escape(s)) + +# Recipe: dedent (0.1.2) +def dedentlines(lines, tabsize=8, skip_first_line=False): + """dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines + + "lines" is a list of lines to dedent. + "tabsize" is the tab width to use for indent width calculations. + "skip_first_line" is a boolean indicating if the first line should + be skipped for calculating the indent width and for dedenting. + This is sometimes useful for docstrings and similar. + + Same as dedent() except operates on a sequence of lines. Note: the + lines list is modified **in-place**. 
+ """ + DEBUG = False + if DEBUG: + print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ + % (tabsize, skip_first_line)) + margin = None + for i, line in enumerate(lines): + if i == 0 and skip_first_line: continue + indent = 0 + for ch in line: + if ch == ' ': + indent += 1 + elif ch == '\t': + indent += tabsize - (indent % tabsize) + elif ch in '\r\n': + continue # skip all-whitespace lines + else: + break + else: + continue # skip all-whitespace lines + if DEBUG: print("dedent: indent=%d: %r" % (indent, line)) + if margin is None: + margin = indent + else: + margin = min(margin, indent) + if DEBUG: print("dedent: margin=%r" % margin) + + if margin is not None and margin > 0: + for i, line in enumerate(lines): + if i == 0 and skip_first_line: continue + removed = 0 + for j, ch in enumerate(line): + if ch == ' ': + removed += 1 + elif ch == '\t': + removed += tabsize - (removed % tabsize) + elif ch in '\r\n': + if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line) + lines[i] = lines[i][j:] + break + else: + raise ValueError("unexpected non-whitespace char %r in " + "line %r while removing %d-space margin" + % (ch, line, margin)) + if DEBUG: + print("dedent: %r: %r -> removed %d/%d"\ + % (line, ch, removed, margin)) + if removed == margin: + lines[i] = lines[i][j+1:] + break + elif removed > margin: + lines[i] = ' '*(removed-margin) + lines[i][j+1:] + break + else: + if removed: + lines[i] = lines[i][removed:] + return lines + + +def dedent(text, tabsize=8, skip_first_line=False): + """dedent(text, tabsize=8, skip_first_line=False) -> dedented text + + "text" is the text to dedent. + "tabsize" is the tab width to use for indent width calculations. + "skip_first_line" is a boolean indicating if the first line should + be skipped for calculating the indent width and for dedenting. + This is sometimes useful for docstrings and similar. + + textwrap.dedent(s), but don't expand tabs to spaces + """ + lines = text.splitlines(1) + dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) + return ''.join(lines) \ No newline at end of file From fe7b94afd08fc125c5f42905430214d92a13f86c Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:30:58 +0200 Subject: [PATCH 06/11] Move memoized to utils --- lib/markdown2.py | 41 +++++++++++------------------------------ lib/utils.py | 29 ++++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 393a90b7..86b4306c 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -112,7 +112,15 @@ from collections import defaultdict from lib.errors import MarkdownError -from lib.utils import slugify, calculate_toc_html, curry, regex_from_encoded_pattern, dedentlines, dedent +from lib.utils import ( + slugify, + calculate_toc_html, + curry, + regex_from_encoded_pattern, + dedentlines, + dedent, + memoized, +) # ---- globals @@ -2471,33 +2479,6 @@ class UnicodeWithAttrs(str): toc_html = None -class _memoized(object): - """Decorator that caches a function's return value each time it is called. - If called later with the same arguments, the cached value is returned, and - not re-evaluated. - - http://wiki.python.org/moin/PythonDecoratorLibrary - """ - def __init__(self, func): - self.func = func - self.cache = {} - - def __call__(self, *args): - try: - return self.cache[args] - except KeyError: - self.cache[args] = value = self.func(*args) - return value - except TypeError: - # uncachable -- for instance, passing a list as an argument. 
- # Better to not cache than to blow up entirely. - return self.func(*args) - - def __repr__(self): - """Return the function's docstring.""" - return self.func.__doc__ - - def _xml_oneliner_re_from_tab_width(tab_width): """Standalone XML processing instruction regex.""" return re.compile(r""" @@ -2517,7 +2498,7 @@ def _xml_oneliner_re_from_tab_width(tab_width): (?=\n{2,}|\Z) # followed by a blank line or end of document ) """ % (tab_width - 1), re.X) -_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width) +_xml_oneliner_re_from_tab_width = memoized(_xml_oneliner_re_from_tab_width) def _hr_tag_re_from_tab_width(tab_width): @@ -2537,7 +2518,7 @@ def _hr_tag_re_from_tab_width(tab_width): (?=\n{2,}|\Z) # followed by a blank line or end of document ) """ % (tab_width - 1), re.X) -_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width) +_hr_tag_re_from_tab_width = memoized(_hr_tag_re_from_tab_width) def _xml_escape_attr(attr, skip_single_quote=True): diff --git a/lib/utils.py b/lib/utils.py index 482f5c7e..5bd65c12 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -185,4 +185,31 @@ def dedent(text, tabsize=8, skip_first_line=False): """ lines = text.splitlines(1) dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) - return ''.join(lines) \ No newline at end of file + return ''.join(lines) + + +class memoized(object): + """Decorator that caches a function's return value each time it is called. + If called later with the same arguments, the cached value is returned, and + not re-evaluated. + + http://wiki.python.org/moin/PythonDecoratorLibrary + """ + def __init__(self, func): + self.func = func + self.cache = {} + + def __call__(self, *args): + try: + return self.cache[args] + except KeyError: + self.cache[args] = value = self.func(*args) + return value + except TypeError: + # uncachable -- for instance, passing a list as an argument. + # Better to not cache than to blow up entirely. + return self.func(*args) + + def __repr__(self): + """Return the function's docstring.""" + return self.func.__doc__ \ No newline at end of file From a1d4d6c0db5c31df6993f1cf0390682d81faa844 Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:32:53 +0200 Subject: [PATCH 07/11] Move xml_oneliner_re_from_tab_width to utils --- lib/markdown2.py | 26 ++------------------------ lib/utils.py | 25 ++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 86b4306c..908b5c6b 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -120,6 +120,7 @@ dedentlines, dedent, memoized, + xml_oneliner_re_from_tab_width, ) # ---- globals @@ -852,7 +853,7 @@ def _hash_html_blocks(self, text, raw=False): # # # - _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width) + _xml_oneliner_re = xml_oneliner_re_from_tab_width(self.tab_width) text = _xml_oneliner_re.sub(hash_html_block_sub, text) return text @@ -2478,29 +2479,6 @@ class UnicodeWithAttrs(str): metadata = None toc_html = None - -def _xml_oneliner_re_from_tab_width(tab_width): - """Standalone XML processing instruction regex.""" - return re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? 
# the beginning of the doc - ) - ( # save in $1 - [ ]{0,%d} - (?: - <\?\w+\b\s+.*?\?> # XML processing instruction - | - <\w+:\w+\b\s+.*?/> # namespaced single tag - ) - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - """ % (tab_width - 1), re.X) -_xml_oneliner_re_from_tab_width = memoized(_xml_oneliner_re_from_tab_width) - - def _hr_tag_re_from_tab_width(tab_width): return re.compile(r""" (?: diff --git a/lib/utils.py b/lib/utils.py index 5bd65c12..660160e0 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -212,4 +212,27 @@ def __call__(self, *args): def __repr__(self): """Return the function's docstring.""" - return self.func.__doc__ \ No newline at end of file + return self.func.__doc__ + + + +def xml_oneliner_re_from_tab_width(tab_width): + """Standalone XML processing instruction regex.""" + return re.compile(r""" + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in $1 + [ ]{0,%d} + (?: + <\?\w+\b\s+.*?\?> # XML processing instruction + | + <\w+:\w+\b\s+.*?/> # namespaced single tag + ) + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + """ % (tab_width - 1), re.X) +xml_oneliner_re_from_tab_width = memoized(xml_oneliner_re_from_tab_width) From e84392708a2cdd33bdbc4cd0dda517eafa8cbfb6 Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:34:51 +0200 Subject: [PATCH 08/11] Move hr_tag_re_from_tab_width to utils --- lib/markdown2.py | 22 ++---------- lib/utils.py | 90 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 68 insertions(+), 44 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 908b5c6b..147e6a51 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -121,6 +121,7 @@ dedent, memoized, xml_oneliner_re_from_tab_width, + hr_tag_re_from_tab_width, ) # ---- globals @@ -787,7 +788,7 @@ def _hash_html_blocks(self, text, raw=False): # Special case just for
    . It was easier to make a special # case than to make the other regex more complicated. if "])*? # - /?> # the matching end tag - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - """ % (tab_width - 1), re.X) -_hr_tag_re_from_tab_width = memoized(_hr_tag_re_from_tab_width) - def _xml_escape_attr(attr, skip_single_quote=True): """Escape the given string for use in an HTML/XML tag attribute. diff --git a/lib/utils.py b/lib/utils.py index 660160e0..3eec2329 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -101,6 +101,7 @@ def regex_from_encoded_pattern(s): else: # not an encoded regex return re.compile(re.escape(s)) + # Recipe: dedent (0.1.2) def dedentlines(lines, tabsize=8, skip_first_line=False): """dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines @@ -116,55 +117,64 @@ def dedentlines(lines, tabsize=8, skip_first_line=False): """ DEBUG = False if DEBUG: - print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ - % (tabsize, skip_first_line)) + print( + "dedent: dedent(..., tabsize=%d, skip_first_line=%r)" + % (tabsize, skip_first_line) + ) margin = None for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue + if i == 0 and skip_first_line: + continue indent = 0 for ch in line: - if ch == ' ': + if ch == " ": indent += 1 - elif ch == '\t': + elif ch == "\t": indent += tabsize - (indent % tabsize) - elif ch in '\r\n': + elif ch in "\r\n": continue # skip all-whitespace lines else: break else: continue # skip all-whitespace lines - if DEBUG: print("dedent: indent=%d: %r" % (indent, line)) + if DEBUG: + print("dedent: indent=%d: %r" % (indent, line)) if margin is None: margin = indent else: margin = min(margin, indent) - if DEBUG: print("dedent: margin=%r" % margin) + if DEBUG: + print("dedent: margin=%r" % margin) if margin is not None and margin > 0: for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue + if i == 0 and skip_first_line: + continue removed = 0 for j, ch in enumerate(line): - if ch == ' ': + if ch == " ": removed += 1 - elif ch == '\t': + elif ch == "\t": removed += tabsize - (removed % tabsize) - elif ch in '\r\n': - if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line) + elif ch in "\r\n": + if DEBUG: + print("dedent: %r: EOL -> strip up to EOL" % line) lines[i] = lines[i][j:] break else: - raise ValueError("unexpected non-whitespace char %r in " - "line %r while removing %d-space margin" - % (ch, line, margin)) + raise ValueError( + "unexpected non-whitespace char %r in " + "line %r while removing %d-space margin" % (ch, line, margin) + ) if DEBUG: - print("dedent: %r: %r -> removed %d/%d"\ - % (line, ch, removed, margin)) + print( + "dedent: %r: %r -> removed %d/%d" % (line, ch, removed, margin) + ) if removed == margin: - lines[i] = lines[i][j+1:] + lines[i] = lines[i][j + 1 :] break elif removed > margin: - lines[i] = ' '*(removed-margin) + lines[i][j+1:] + lines[i] = " " * (removed - margin) + lines[i][j + 1 :] break else: if removed: @@ -185,7 +195,7 @@ def dedent(text, tabsize=8, skip_first_line=False): """ lines = text.splitlines(1) dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) - return ''.join(lines) + return "".join(lines) class memoized(object): @@ -195,6 +205,7 @@ class memoized(object): http://wiki.python.org/moin/PythonDecoratorLibrary """ + def __init__(self, func): self.func = func self.cache = {} @@ -215,10 +226,10 @@ def __repr__(self): return self.func.__doc__ - def xml_oneliner_re_from_tab_width(tab_width): """Standalone XML 
processing instruction regex.""" - return re.compile(r""" + return re.compile( + r""" (?: (?<=\n\n) # Starting after a blank line | # or @@ -234,5 +245,36 @@ def xml_oneliner_re_from_tab_width(tab_width): [ \t]* (?=\n{2,}|\Z) # followed by a blank line or end of document ) - """ % (tab_width - 1), re.X) + """ + % (tab_width - 1), + re.X, + ) + + xml_oneliner_re_from_tab_width = memoized(xml_oneliner_re_from_tab_width) + + +def hr_tag_re_from_tab_width(tab_width): + return re.compile( + r""" + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in \1 + [ ]{0,%d} + <(hr) # start tag = \2 + \b # word break + ([^<>])*? # + /?> # the matching end tag + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + """ + % (tab_width - 1), + re.X, + ) + + +hr_tag_re_from_tab_width = memoized(hr_tag_re_from_tab_width) From 9e01776566e616a86ee40c52ac4832edaf596a9a Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:37:43 +0200 Subject: [PATCH 09/11] Move xml_escape_attr to utils --- lib/markdown2.py | 26 +++++--------------------- lib/utils.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 147e6a51..1282e2cd 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -122,6 +122,7 @@ memoized, xml_oneliner_re_from_tab_width, hr_tag_re_from_tab_width, + xml_escape_attr, ) # ---- globals @@ -1501,7 +1502,7 @@ def _do_links(self, text): .replace('_', self._escape_table['_']) if title: title_str = ' title="%s"' % ( - _xml_escape_attr(title) + xml_escape_attr(_AMPERSAND_RE, title) .replace('*', self._escape_table['*']) .replace('_', self._escape_table['_'])) else: @@ -1510,7 +1511,7 @@ def _do_links(self, text): img_class_str = self._html_class_str_from_tag("img") result = '%s', '>')) - if not skip_single_quote: - escaped = escaped.replace("'", "'") - return escaped - - def _xml_encode_email_char_at_random(ch): r = random() # Roughly 10% raw, 45% hex, 45% dec. diff --git a/lib/utils.py b/lib/utils.py index 3eec2329..a3a84ee6 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -278,3 +278,20 @@ def hr_tag_re_from_tab_width(tab_width): hr_tag_re_from_tab_width = memoized(hr_tag_re_from_tab_width) + + +def xml_escape_attr(ampersand_re, attr, skip_single_quote=True): + """Escape the given string for use in an HTML/XML tag attribute. + + By default this doesn't bother with escaping `'` to `'`, presuming that + the tag attribute is surrounded by double quotes. 
+    """
+    escaped = ampersand_re.sub('&amp;', attr)
+
+    escaped = (attr
+        .replace('"', '&quot;')
+        .replace('<', '&lt;')
+        .replace('>', '&gt;'))
+    if not skip_single_quote:
+        escaped = escaped.replace("'", "&#39;")
+    return escaped
\ No newline at end of file

From 723a1f0f12d518e133b70d45736a46c4d32b5f58 Mon Sep 17 00:00:00 2001
From: Yann Rabiller
Date: Fri, 17 Jun 2022 00:39:10 +0200
Subject: [PATCH 10/11] Move xml_encode_email_char_at_random to utils

---
 lib/markdown2.py | 17 ++---------------
 lib/utils.py     | 24 ++++++++++++++++++------
 2 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/lib/markdown2.py b/lib/markdown2.py
index 1282e2cd..f55d84f0 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -123,6 +123,7 @@
     xml_oneliner_re_from_tab_width,
     hr_tag_re_from_tab_width,
     xml_escape_attr,
+    xml_encode_email_char_at_random,
 )
 
 # ---- globals
@@ -2363,7 +2364,7 @@ def _encode_email_address(self, addr):
         #
         # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
         # mailing list:
-        chars = [_xml_encode_email_char_at_random(ch)
+        chars = [xml_encode_email_char_at_random(ch)
                  for ch in "mailto:" + addr]
         # Strip the mailto: from the visible part.
         addr = '<a href="%s">%s</a>' \
               % (''.join(chars), ''.join(chars[7:]))
@@ -2482,20 +2483,6 @@ class UnicodeWithAttrs(str):
     toc_html = None
 
 
-def _xml_encode_email_char_at_random(ch):
-    r = random()
-    # Roughly 10% raw, 45% hex, 45% dec.
-    # '@' *must* be encoded. I [John Gruber] insist.
-    # Issue 26: '_' must be encoded.
-    if r > 0.9 and ch not in "@_":
-        return ch
-    elif r < 0.45:
-        # The [1:] is to drop leading '0': 0x63 -> x63
-        return '&#%s;' % hex(ord(ch))[1:]
-    else:
-        return '&#%s;' % ord(ch)
-
-
 def _html_escape_url(attr, safe_mode=False):
     """Replace special characters that are potentially malicious in url string."""
     escaped = (attr
diff --git a/lib/utils.py b/lib/utils.py
index a3a84ee6..18293824 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -1,3 +1,4 @@
+from random import random
 import re
 
 
@@ -286,12 +287,23 @@ def xml_escape_attr(ampersand_re, attr, skip_single_quote=True):
     By default this doesn't bother with escaping `'` to `&#39;`, presuming that
     the tag attribute is surrounded by double quotes.
     """
-    escaped = ampersand_re.sub('&amp;', attr)
+    escaped = ampersand_re.sub("&amp;", attr)
 
-    escaped = (attr
-        .replace('"', '&quot;')
-        .replace('<', '&lt;')
-        .replace('>', '&gt;'))
+    escaped = attr.replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
     if not skip_single_quote:
-        escaped = escaped.replace("'", "&#39;")
+        escaped = escaped.replace("'", "&#39;")
     return escaped
+
+
+def xml_encode_email_char_at_random(ch):
+    r = random()
+    # Roughly 10% raw, 45% hex, 45% dec.
+    # '@' *must* be encoded. I [John Gruber] insist.
+    # Issue 26: '_' must be encoded.
+ if r > 0.9 and ch not in "@_": + return ch + elif r < 0.45: + # The [1:] is to drop leading '0': 0x63 -> x63 + return "&#%s;" % hex(ord(ch))[1:] + else: + return "&#%s;" % ord(ch) From 2a56b907756bccca3614b49184a24372c72d7716 Mon Sep 17 00:00:00 2001 From: Yann Rabiller Date: Fri, 17 Jun 2022 00:40:44 +0200 Subject: [PATCH 11/11] Move html_escape_url to utils --- lib/markdown2.py | 20 +++++--------------- lib/utils.py | 12 ++++++++++++ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index f55d84f0..2a3f364f 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -124,6 +124,7 @@ hr_tag_re_from_tab_width, xml_escape_attr, xml_encode_email_char_at_random, + html_escape_url, ) # ---- globals @@ -1511,7 +1512,7 @@ def _do_links(self, text): if is_img: img_class_str = self._html_class_str_from_tag("img") result = '%s', '>')) - if safe_mode: - escaped = escaped.replace('+', ' ') - escaped = escaped.replace("'", "'") - return escaped - # ---- mainline diff --git a/lib/utils.py b/lib/utils.py index 18293824..ce0087db 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -307,3 +307,15 @@ def xml_encode_email_char_at_random(ch): return "&#%s;" % hex(ord(ch))[1:] else: return "&#%s;" % ord(ch) + + +def html_escape_url(attr, safe_mode=False): + """Replace special characters that are potentially malicious in url string.""" + escaped = (attr + .replace('"', '"') + .replace('<', '<') + .replace('>', '>')) + if safe_mode: + escaped = escaped.replace('+', ' ') + escaped = escaped.replace("'", "'") + return escaped \ No newline at end of file