Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring: split into files #452

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions lib/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class MarkdownError(Exception):
pass
347 changes: 34 additions & 313 deletions lib/markdown2.py
Original file line number Diff line number Diff line change
@@ -111,6 +111,22 @@
import codecs
from collections import defaultdict

from lib.errors import MarkdownError
from lib.utils import (
slugify,
calculate_toc_html,
curry,
regex_from_encoded_pattern,
dedentlines,
dedent,
memoized,
xml_oneliner_re_from_tab_width,
hr_tag_re_from_tab_width,
xml_escape_attr,
xml_encode_email_char_at_random,
html_escape_url,
)

# ---- globals

DEBUG = False
@@ -133,12 +149,6 @@ def _hash_text(s):
# http://bumppo.net/projects/amputator/
_AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')


# ---- exceptions
class MarkdownError(Exception):
pass


# ---- public api

def markdown_path(path, encoding="utf-8",
@@ -517,7 +527,7 @@ def parse_structured_value(value):

# Multiline value
if v[:3] == " >\n":
self.metadata[k.strip()] = _dedent(v[3:]).strip()
self.metadata[k.strip()] = dedent(v[3:]).strip()

# Empty value
elif v == "\n":
@@ -760,7 +770,7 @@ def _hash_html_blocks(self, text, raw=False):
return text

# Pass `raw` value into our calls to self._hash_html_block_sub.
hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
hash_html_block_sub = curry(self._hash_html_block_sub, raw=raw)

# First, look for nested blocks, e.g.:
# <div>
@@ -781,7 +791,7 @@ def _hash_html_blocks(self, text, raw=False):
# Special case just for <hr />. It was easier to make a special
# case than to make the other regex more complicated.
if "<hr" in text:
_hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
_hr_tag_re = hr_tag_re_from_tab_width(self.tab_width)
text = _hr_tag_re.sub(hash_html_block_sub, text)

# Special case for standalone HTML comments:
@@ -847,7 +857,7 @@ def _hash_html_blocks(self, text, raw=False):
# <?foo bar?>
#
# <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
_xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
_xml_oneliner_re = xml_oneliner_re_from_tab_width(self.tab_width)
text = _xml_oneliner_re.sub(hash_html_block_sub, text)

return text
@@ -947,7 +957,7 @@ def _do_numbering(self, text):

def _extract_footnote_def_sub(self, match):
id, text = match.groups()
text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
text = dedent(text, skip_first_line=not text.startswith('\n')).strip()
normed_id = re.sub(r'\W', '-', id)
# Ensure footnote text ends with a couple newlines (for some
# block gamut matches).
@@ -1034,10 +1044,10 @@ def _run_block_gamut(self, text):

def _pyshell_block_sub(self, match):
if "fenced-code-blocks" in self.extras:
dedented = _dedent(match.group(0))
dedented = dedent(match.group(0))
return self._do_fenced_code_blocks("```pycon\n" + dedented + "```\n")
lines = match.group(0).splitlines(0)
_dedentlines(lines)
dedentlines(lines)
indent = ' ' * self.tab_width
s = ('\n' # separate from possible cuddled paragraph
+ indent + ('\n'+indent).join(lines)
@@ -1494,16 +1504,16 @@ def _do_links(self, text):
.replace('_', self._escape_table['_'])
if title:
title_str = ' title="%s"' % (
_xml_escape_attr(title)
xml_escape_attr(_AMPERSAND_RE, title)
.replace('*', self._escape_table['*'])
.replace('_', self._escape_table['_']))
else:
title_str = ''
if is_img:
img_class_str = self._html_class_str_from_tag("img")
result = '<img src="%s" alt="%s"%s%s%s' \
% (_html_escape_url(url, safe_mode=self.safe_mode),
_xml_escape_attr(link_text),
% (html_escape_url(url, safe_mode=self.safe_mode),
xml_escape_attr(_AMPERSAND_RE, link_text),
title_str,
img_class_str,
self.empty_element_suffix)
@@ -1516,7 +1526,7 @@ def _do_links(self, text):
if self.safe_mode and not safe_link:
result_head = '<a href="#"%s>' % (title_str)
else:
result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
result_head = '<a href="%s"%s>' % (html_escape_url(url, safe_mode=self.safe_mode), title_str)
result = '%s%s</a>' % (result_head, link_text)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
@@ -1549,7 +1559,7 @@ def _do_links(self, text):
.replace('_', self._escape_table['_'])
title = self.titles.get(link_id)
if title:
title = _xml_escape_attr(title) \
title = xml_escape_attr(_AMPERSAND_RE, title) \
.replace('*', self._escape_table['*']) \
.replace('_', self._escape_table['_'])
title_str = ' title="%s"' % title
@@ -1558,8 +1568,8 @@ def _do_links(self, text):
if is_img:
img_class_str = self._html_class_str_from_tag("img")
result = '<img src="%s" alt="%s"%s%s%s' \
% (_html_escape_url(url, safe_mode=self.safe_mode),
_xml_escape_attr(link_text),
% (html_escape_url(url, safe_mode=self.safe_mode),
xml_escape_attr(_AMPERSAND_RE, link_text),
title_str,
img_class_str,
self.empty_element_suffix)
@@ -1571,7 +1581,7 @@ def _do_links(self, text):
if self.safe_mode and not self._safe_protocols.match(url):
result_head = '<a href="#"%s>' % (title_str)
else:
result_head = '<a href="%s"%s>' % (_html_escape_url(url, safe_mode=self.safe_mode), title_str)
result_head = '<a href="%s"%s>' % (html_escape_url(url, safe_mode=self.safe_mode), title_str)
result = '%s%s</a>' % (result_head, link_text)
if "smarty-pants" in self.extras:
result = result.replace('"', self._escape_table['"'])
@@ -1608,7 +1618,7 @@ def header_id_from_text(self, text, prefix, n):
None to not have an id attribute and to exclude this header from
the TOC (if the "toc" extra is specified).
"""
header_id = _slugify(text)
header_id = slugify(text)
if prefix and isinstance(prefix, str):
header_id = prefix + '-' + header_id

@@ -2355,7 +2365,7 @@ def _encode_email_address(self, addr):
#
# Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
# mailing list: <http://tinyurl.com/yu7ue>
chars = [_xml_encode_email_char_at_random(ch)
chars = [xml_encode_email_char_at_random(ch)
for ch in "mailto:" + addr]
# Strip the mailto: from the visible part.
addr = '<a href="%s">%s</a>' \
@@ -2465,41 +2475,6 @@ class MarkdownWithExtras(Markdown):

# ---- internal support functions


def calculate_toc_html(toc):
"""Return the HTML for the current TOC.
This expects the `_toc` attribute to have been set on this instance.
"""
if toc is None:
return None

def indent():
return ' ' * (len(h_stack) - 1)
lines = []
h_stack = [0] # stack of header-level numbers
for level, id, name in toc:
if level > h_stack[-1]:
lines.append("%s<ul>" % indent())
h_stack.append(level)
elif level == h_stack[-1]:
lines[-1] += "</li>"
else:
while level < h_stack[-1]:
h_stack.pop()
if not lines[-1].endswith("</li>"):
lines[-1] += "</li>"
lines.append("%s</ul></li>" % indent())
lines.append('%s<li><a href="#%s">%s</a>' % (
indent(), id, name))
while len(h_stack) > 1:
h_stack.pop()
if not lines[-1].endswith("</li>"):
lines[-1] += "</li>"
lines.append("%s</ul>" % indent())
return '\n'.join(lines) + '\n'


class UnicodeWithAttrs(str):
"""A subclass of unicode used for the return value of conversion to
possibly attach some attributes. E.g. the "toc_html" attribute when
@@ -2508,260 +2483,6 @@ class UnicodeWithAttrs(str):
metadata = None
toc_html = None

## {{{ http://code.activestate.com/recipes/577257/ (r1)
_slugify_strip_re = re.compile(r'[^\w\s-]')
_slugify_hyphenate_re = re.compile(r'[-\s]+')
def _slugify(value):
"""
Normalizes string, converts to lowercase, removes non-alpha characters,
and converts spaces to hyphens.
From Django's "django/template/defaultfilters.py".
"""
import unicodedata
value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
value = _slugify_strip_re.sub('', value).strip().lower()
return _slugify_hyphenate_re.sub('-', value)
## end of http://code.activestate.com/recipes/577257/ }}}


# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
def _curry(*args, **kwargs):
function, args = args[0], args[1:]
def result(*rest, **kwrest):
combined = kwargs.copy()
combined.update(kwrest)
return function(*args + rest, **combined)
return result


# Recipe: regex_from_encoded_pattern (1.0)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith('/') and s.rfind('/') != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind('/')
_, flags_str = s[1:idx], s[idx+1:]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError("unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))


# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
"""_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
"lines" is a list of lines to dedent.
"tabsize" is the tab width to use for indent width calculations.
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
Same as dedent() except operates on a sequence of lines. Note: the
lines list is modified **in-place**.
"""
DEBUG = False
if DEBUG:
print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
% (tabsize, skip_first_line))
margin = None
for i, line in enumerate(lines):
if i == 0 and skip_first_line: continue
indent = 0
for ch in line:
if ch == ' ':
indent += 1
elif ch == '\t':
indent += tabsize - (indent % tabsize)
elif ch in '\r\n':
continue # skip all-whitespace lines
else:
break
else:
continue # skip all-whitespace lines
if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
if margin is None:
margin = indent
else:
margin = min(margin, indent)
if DEBUG: print("dedent: margin=%r" % margin)

if margin is not None and margin > 0:
for i, line in enumerate(lines):
if i == 0 and skip_first_line: continue
removed = 0
for j, ch in enumerate(line):
if ch == ' ':
removed += 1
elif ch == '\t':
removed += tabsize - (removed % tabsize)
elif ch in '\r\n':
if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
lines[i] = lines[i][j:]
break
else:
raise ValueError("unexpected non-whitespace char %r in "
"line %r while removing %d-space margin"
% (ch, line, margin))
if DEBUG:
print("dedent: %r: %r -> removed %d/%d"\
% (line, ch, removed, margin))
if removed == margin:
lines[i] = lines[i][j+1:]
break
elif removed > margin:
lines[i] = ' '*(removed-margin) + lines[i][j+1:]
break
else:
if removed:
lines[i] = lines[i][removed:]
return lines


def _dedent(text, tabsize=8, skip_first_line=False):
"""_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
"text" is the text to dedent.
"tabsize" is the tab width to use for indent width calculations.
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
textwrap.dedent(s), but don't expand tabs to spaces
"""
lines = text.splitlines(1)
_dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
return ''.join(lines)


class _memoized(object):
"""Decorator that caches a function's return value each time it is called.
If called later with the same arguments, the cached value is returned, and
not re-evaluated.
http://wiki.python.org/moin/PythonDecoratorLibrary
"""
def __init__(self, func):
self.func = func
self.cache = {}

def __call__(self, *args):
try:
return self.cache[args]
except KeyError:
self.cache[args] = value = self.func(*args)
return value
except TypeError:
# uncachable -- for instance, passing a list as an argument.
# Better to not cache than to blow up entirely.
return self.func(*args)

def __repr__(self):
"""Return the function's docstring."""
return self.func.__doc__


def _xml_oneliner_re_from_tab_width(tab_width):
"""Standalone XML processing instruction regex."""
return re.compile(r"""
(?:
(?<=\n\n) # Starting after a blank line
| # or
\A\n? # the beginning of the doc
)
( # save in $1
[ ]{0,%d}
(?:
<\?\w+\b\s+.*?\?> # XML processing instruction
|
<\w+:\w+\b\s+.*?/> # namespaced single tag
)
[ \t]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
)
""" % (tab_width - 1), re.X)
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)


def _hr_tag_re_from_tab_width(tab_width):
return re.compile(r"""
(?:
(?<=\n\n) # Starting after a blank line
| # or
\A\n? # the beginning of the doc
)
( # save in \1
[ ]{0,%d}
<(hr) # start tag = \2
\b # word break
([^<>])*? #
/?> # the matching end tag
[ \t]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
)
""" % (tab_width - 1), re.X)
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)


def _xml_escape_attr(attr, skip_single_quote=True):
"""Escape the given string for use in an HTML/XML tag attribute.
By default this doesn't bother with escaping `'` to `&#39;`, presuming that
the tag attribute is surrounded by double quotes.
"""
escaped = _AMPERSAND_RE.sub('&amp;', attr)

escaped = (attr
.replace('"', '&quot;')
.replace('<', '&lt;')
.replace('>', '&gt;'))
if not skip_single_quote:
escaped = escaped.replace("'", "&#39;")
return escaped


def _xml_encode_email_char_at_random(ch):
r = random()
# Roughly 10% raw, 45% hex, 45% dec.
# '@' *must* be encoded. I [John Gruber] insist.
# Issue 26: '_' must be encoded.
if r > 0.9 and ch not in "@_":
return ch
elif r < 0.45:
# The [1:] is to drop leading '0': 0x63 -> x63
return '&#%s;' % hex(ord(ch))[1:]
else:
return '&#%s;' % ord(ch)


def _html_escape_url(attr, safe_mode=False):
"""Replace special characters that are potentially malicious in url string."""
escaped = (attr
.replace('"', '&quot;')
.replace('<', '&lt;')
.replace('>', '&gt;'))
if safe_mode:
escaped = escaped.replace('+', ' ')
escaped = escaped.replace("'", "&#39;")
return escaped


# ---- mainline
@@ -2850,7 +2571,7 @@ def main(argv=None):
raise MarkdownError("%s:%d: invalid link pattern line: %r"
% (opts.link_patterns_file, i+1, line))
link_patterns.append(
(_regex_from_encoded_pattern(pat), href))
(regex_from_encoded_pattern(pat), href))
finally:
f.close()
else:
321 changes: 321 additions & 0 deletions lib/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
from random import random
import re


## {{{ http://code.activestate.com/recipes/577257/ (r1)

_slugify_strip_re = re.compile(r"[^\w\s-]")
_slugify_hyphenate_re = re.compile(r"[-\s]+")


def slugify(value):
"""
Normalizes string, converts to lowercase, removes non-alpha characters,
and converts spaces to hyphens.
From Django's "django/template/defaultfilters.py".
"""
import unicodedata

value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode()
value = _slugify_strip_re.sub("", value).strip().lower()
return _slugify_hyphenate_re.sub("-", value)


## end of http://code.activestate.com/recipes/577257/ }}}


def calculate_toc_html(toc):
"""Return the HTML for the current TOC.
This expects the `_toc` attribute to have been set on this instance.
"""
if toc is None:
return None

def indent():
return " " * (len(h_stack) - 1)

lines = []
h_stack = [0] # stack of header-level numbers
for level, id, name in toc:
if level > h_stack[-1]:
lines.append("%s<ul>" % indent())
h_stack.append(level)
elif level == h_stack[-1]:
lines[-1] += "</li>"
else:
while level < h_stack[-1]:
h_stack.pop()
if not lines[-1].endswith("</li>"):
lines[-1] += "</li>"
lines.append("%s</ul></li>" % indent())
lines.append('%s<li><a href="#%s">%s</a>' % (indent(), id, name))
while len(h_stack) > 1:
h_stack.pop()
if not lines[-1].endswith("</li>"):
lines[-1] += "</li>"
lines.append("%s</ul>" % indent())
return "\n".join(lines) + "\n"


# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
def curry(*args, **kwargs):
function, args = args[0], args[1:]

def result(*rest, **kwrest):
combined = kwargs.copy()
combined.update(kwrest)
return function(*args + rest, **combined)

return result


# Recipe: regex_from_encoded_pattern (1.0)
def regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith("/") and s.rfind("/") != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind("/")
_, flags_str = s[1:idx], s[idx + 1 :]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError(
"unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, "".join(list(flag_from_char.keys())))
)
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))


# Recipe: dedent (0.1.2)
def dedentlines(lines, tabsize=8, skip_first_line=False):
"""dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
"lines" is a list of lines to dedent.
"tabsize" is the tab width to use for indent width calculations.
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
Same as dedent() except operates on a sequence of lines. Note: the
lines list is modified **in-place**.
"""
DEBUG = False
if DEBUG:
print(
"dedent: dedent(..., tabsize=%d, skip_first_line=%r)"
% (tabsize, skip_first_line)
)
margin = None
for i, line in enumerate(lines):
if i == 0 and skip_first_line:
continue
indent = 0
for ch in line:
if ch == " ":
indent += 1
elif ch == "\t":
indent += tabsize - (indent % tabsize)
elif ch in "\r\n":
continue # skip all-whitespace lines
else:
break
else:
continue # skip all-whitespace lines
if DEBUG:
print("dedent: indent=%d: %r" % (indent, line))
if margin is None:
margin = indent
else:
margin = min(margin, indent)
if DEBUG:
print("dedent: margin=%r" % margin)

if margin is not None and margin > 0:
for i, line in enumerate(lines):
if i == 0 and skip_first_line:
continue
removed = 0
for j, ch in enumerate(line):
if ch == " ":
removed += 1
elif ch == "\t":
removed += tabsize - (removed % tabsize)
elif ch in "\r\n":
if DEBUG:
print("dedent: %r: EOL -> strip up to EOL" % line)
lines[i] = lines[i][j:]
break
else:
raise ValueError(
"unexpected non-whitespace char %r in "
"line %r while removing %d-space margin" % (ch, line, margin)
)
if DEBUG:
print(
"dedent: %r: %r -> removed %d/%d" % (line, ch, removed, margin)
)
if removed == margin:
lines[i] = lines[i][j + 1 :]
break
elif removed > margin:
lines[i] = " " * (removed - margin) + lines[i][j + 1 :]
break
else:
if removed:
lines[i] = lines[i][removed:]
return lines


def dedent(text, tabsize=8, skip_first_line=False):
"""dedent(text, tabsize=8, skip_first_line=False) -> dedented text
"text" is the text to dedent.
"tabsize" is the tab width to use for indent width calculations.
"skip_first_line" is a boolean indicating if the first line should
be skipped for calculating the indent width and for dedenting.
This is sometimes useful for docstrings and similar.
textwrap.dedent(s), but don't expand tabs to spaces
"""
lines = text.splitlines(1)
dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
return "".join(lines)


class memoized(object):
"""Decorator that caches a function's return value each time it is called.
If called later with the same arguments, the cached value is returned, and
not re-evaluated.
http://wiki.python.org/moin/PythonDecoratorLibrary
"""

def __init__(self, func):
self.func = func
self.cache = {}

def __call__(self, *args):
try:
return self.cache[args]
except KeyError:
self.cache[args] = value = self.func(*args)
return value
except TypeError:
# uncachable -- for instance, passing a list as an argument.
# Better to not cache than to blow up entirely.
return self.func(*args)

def __repr__(self):
"""Return the function's docstring."""
return self.func.__doc__


def xml_oneliner_re_from_tab_width(tab_width):
"""Standalone XML processing instruction regex."""
return re.compile(
r"""
(?:
(?<=\n\n) # Starting after a blank line
| # or
\A\n? # the beginning of the doc
)
( # save in $1
[ ]{0,%d}
(?:
<\?\w+\b\s+.*?\?> # XML processing instruction
|
<\w+:\w+\b\s+.*?/> # namespaced single tag
)
[ \t]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
)
"""
% (tab_width - 1),
re.X,
)


xml_oneliner_re_from_tab_width = memoized(xml_oneliner_re_from_tab_width)


def hr_tag_re_from_tab_width(tab_width):
return re.compile(
r"""
(?:
(?<=\n\n) # Starting after a blank line
| # or
\A\n? # the beginning of the doc
)
( # save in \1
[ ]{0,%d}
<(hr) # start tag = \2
\b # word break
([^<>])*? #
/?> # the matching end tag
[ \t]*
(?=\n{2,}|\Z) # followed by a blank line or end of document
)
"""
% (tab_width - 1),
re.X,
)


hr_tag_re_from_tab_width = memoized(hr_tag_re_from_tab_width)


def xml_escape_attr(ampersand_re, attr, skip_single_quote=True):
"""Escape the given string for use in an HTML/XML tag attribute.
By default this doesn't bother with escaping `'` to `&#39;`, presuming that
the tag attribute is surrounded by double quotes.
"""
escaped = ampersand_re.sub("&amp;", attr)

escaped = attr.replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")
if not skip_single_quote:
escaped = escaped.replace("'", "&#39;")
return escaped


def xml_encode_email_char_at_random(ch):
r = random()
# Roughly 10% raw, 45% hex, 45% dec.
# '@' *must* be encoded. I [John Gruber] insist.
# Issue 26: '_' must be encoded.
if r > 0.9 and ch not in "@_":
return ch
elif r < 0.45:
# The [1:] is to drop leading '0': 0x63 -> x63
return "&#%s;" % hex(ord(ch))[1:]
else:
return "&#%s;" % ord(ch)


def html_escape_url(attr, safe_mode=False):
"""Replace special characters that are potentially malicious in url string."""
escaped = (attr
.replace('"', '&quot;')
.replace('<', '&lt;')
.replace('>', '&gt;'))
if safe_mode:
escaped = escaped.replace('+', ' ')
escaped = escaped.replace("'", "&#39;")
return escaped