Skip to content

Commit 5288737

Browse files
committed
Merge pull request #245 from gsnedders/calibre-selected-1
Selected patches from Calibre; r=nobody!
2 parents 143b0d4 + 761f3ab commit 5288737

File tree

8 files changed

+167
-126
lines changed

8 files changed

+167
-126
lines changed

README.rst

-4
Original file line number | Diff line number | Diff line change
@@ -116,10 +116,6 @@ functionality:
116116
- ``chardet`` can be used as a fallback when character encoding cannot
117117
be determined.
118118

119-
- ``ordereddict`` can be used under Python 2.6
120-
(``collections.OrderedDict`` is used instead on later versions) to
121-
serialize attributes in alphabetical order.
122-
123119

124120
Bugs
125121
----

html5lib/constants.py

+67
Original file line number | Diff line number | Diff line change
@@ -437,6 +437,73 @@
437437
(namespaces["mathml"], "mtext")
438438
])
439439

440+
# Canonical mixed-case spellings of the SVG attribute names that need
# adjusting.  Each lookup key is just the lower-cased form of the name, so it
# is derived below rather than typed out a second time; that makes a
# key/value mismatch impossible.
_svgMixedCaseAttributes = (
    "attributeName",
    "attributeType",
    "baseFrequency",
    "baseProfile",
    "calcMode",
    "clipPathUnits",
    "contentScriptType",
    "contentStyleType",
    "diffuseConstant",
    "edgeMode",
    "externalResourcesRequired",
    "filterRes",
    "filterUnits",
    "glyphRef",
    "gradientTransform",
    "gradientUnits",
    "kernelMatrix",
    "kernelUnitLength",
    "keyPoints",
    "keySplines",
    "keyTimes",
    "lengthAdjust",
    "limitingConeAngle",
    "markerHeight",
    "markerUnits",
    "markerWidth",
    "maskContentUnits",
    "maskUnits",
    "numOctaves",
    "pathLength",
    "patternContentUnits",
    "patternTransform",
    "patternUnits",
    "pointsAtX",
    "pointsAtY",
    "pointsAtZ",
    "preserveAlpha",
    "preserveAspectRatio",
    "primitiveUnits",
    "refX",
    "refY",
    "repeatCount",
    "repeatDur",
    "requiredExtensions",
    "requiredFeatures",
    "specularConstant",
    "specularExponent",
    "spreadMethod",
    "startOffset",
    "stdDeviation",
    "stitchTiles",
    "surfaceScale",
    "systemLanguage",
    "tableValues",
    "targetX",
    "targetY",
    "textLength",
    "viewBox",
    "viewTarget",
    "xChannelSelector",
    "yChannelSelector",
    "zoomAndPan",
)

# Maps a lower-cased SVG attribute name to its mixed-case spelling.
# dict() over a generator is used instead of a dict comprehension;
# NOTE(review): presumably Python 2.6 still matters, given the
# ``ordereddict`` import fallback kept by the parser -- confirm.
adjustSVGAttributes = dict((name.lower(), name)
                           for name in _svgMixedCaseAttributes)

# Maps a lower-cased MathML attribute name to its mixed-case spelling.
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
506+
440507
adjustForeignAttributes = {
441508
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
442509
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),

html5lib/html5parser.py

+36-105
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,31 @@
11
from __future__ import absolute_import, division, unicode_literals
2-
from six import with_metaclass
2+
from six import with_metaclass, viewkeys, PY3
33

44
import types
55

6+
try:
7+
from collections import OrderedDict
8+
except ImportError:
9+
from ordereddict import OrderedDict
10+
611
from . import inputstream
712
from . import tokenizer
813

914
from . import treebuilders
1015
from .treebuilders._base import Marker
1116

1217
from . import utils
13-
from . import constants
14-
from .constants import spaceCharacters, asciiUpper2Lower
15-
from .constants import specialElements
16-
from .constants import headingElements
17-
from .constants import cdataElements, rcdataElements
18-
from .constants import tokenTypes, ReparseException, namespaces
19-
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
20-
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
21-
from .constants import E
18+
from .constants import (
19+
spaceCharacters, asciiUpper2Lower,
20+
specialElements, headingElements, cdataElements, rcdataElements,
21+
tokenTypes, tagTokenTypes,
22+
namespaces,
23+
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
24+
adjustForeignAttributes as adjustForeignAttributesMap,
25+
adjustMathMLAttributes, adjustSVGAttributes,
26+
E,
27+
ReparseException
28+
)
2229

2330

2431
def parse(doc, treebuilder="etree", encoding=None,
@@ -272,96 +279,18 @@ def normalizeToken(self, token):
272279
""" HTML5 specific normalizations to the token stream """
273280

274281
if token["type"] == tokenTypes["StartTag"]:
275-
token["data"] = dict(token["data"][::-1])
282+
token["data"] = OrderedDict(token['data'][::-1])
276283

277284
return token
278285

279286
def adjustMathMLAttributes(self, token):
280-
replacements = {"definitionurl": "definitionURL"}
281-
for k, v in replacements.items():
282-
if k in token["data"]:
283-
token["data"][v] = token["data"][k]
284-
del token["data"][k]
287+
adjust_attributes(token, adjustMathMLAttributes)
285288

286289
def adjustSVGAttributes(self, token):
287-
replacements = {
288-
"attributename": "attributeName",
289-
"attributetype": "attributeType",
290-
"basefrequency": "baseFrequency",
291-
"baseprofile": "baseProfile",
292-
"calcmode": "calcMode",
293-
"clippathunits": "clipPathUnits",
294-
"contentscripttype": "contentScriptType",
295-
"contentstyletype": "contentStyleType",
296-
"diffuseconstant": "diffuseConstant",
297-
"edgemode": "edgeMode",
298-
"externalresourcesrequired": "externalResourcesRequired",
299-
"filterres": "filterRes",
300-
"filterunits": "filterUnits",
301-
"glyphref": "glyphRef",
302-
"gradienttransform": "gradientTransform",
303-
"gradientunits": "gradientUnits",
304-
"kernelmatrix": "kernelMatrix",
305-
"kernelunitlength": "kernelUnitLength",
306-
"keypoints": "keyPoints",
307-
"keysplines": "keySplines",
308-
"keytimes": "keyTimes",
309-
"lengthadjust": "lengthAdjust",
310-
"limitingconeangle": "limitingConeAngle",
311-
"markerheight": "markerHeight",
312-
"markerunits": "markerUnits",
313-
"markerwidth": "markerWidth",
314-
"maskcontentunits": "maskContentUnits",
315-
"maskunits": "maskUnits",
316-
"numoctaves": "numOctaves",
317-
"pathlength": "pathLength",
318-
"patterncontentunits": "patternContentUnits",
319-
"patterntransform": "patternTransform",
320-
"patternunits": "patternUnits",
321-
"pointsatx": "pointsAtX",
322-
"pointsaty": "pointsAtY",
323-
"pointsatz": "pointsAtZ",
324-
"preservealpha": "preserveAlpha",
325-
"preserveaspectratio": "preserveAspectRatio",
326-
"primitiveunits": "primitiveUnits",
327-
"refx": "refX",
328-
"refy": "refY",
329-
"repeatcount": "repeatCount",
330-
"repeatdur": "repeatDur",
331-
"requiredextensions": "requiredExtensions",
332-
"requiredfeatures": "requiredFeatures",
333-
"specularconstant": "specularConstant",
334-
"specularexponent": "specularExponent",
335-
"spreadmethod": "spreadMethod",
336-
"startoffset": "startOffset",
337-
"stddeviation": "stdDeviation",
338-
"stitchtiles": "stitchTiles",
339-
"surfacescale": "surfaceScale",
340-
"systemlanguage": "systemLanguage",
341-
"tablevalues": "tableValues",
342-
"targetx": "targetX",
343-
"targety": "targetY",
344-
"textlength": "textLength",
345-
"viewbox": "viewBox",
346-
"viewtarget": "viewTarget",
347-
"xchannelselector": "xChannelSelector",
348-
"ychannelselector": "yChannelSelector",
349-
"zoomandpan": "zoomAndPan"
350-
}
351-
for originalName in list(token["data"].keys()):
352-
if originalName in replacements:
353-
svgName = replacements[originalName]
354-
token["data"][svgName] = token["data"][originalName]
355-
del token["data"][originalName]
290+
adjust_attributes(token, adjustSVGAttributes)
356291

357292
def adjustForeignAttributes(self, token):
358-
replacements = adjustForeignAttributesMap
359-
360-
for originalName in token["data"].keys():
361-
if originalName in replacements:
362-
foreignName = replacements[originalName]
363-
token["data"][foreignName] = token["data"][originalName]
364-
del token["data"][originalName]
293+
adjust_attributes(token, adjustForeignAttributesMap)
365294

366295
def reparseTokenNormal(self, token):
367296
# pylint:disable=unused-argument
@@ -434,7 +363,7 @@ def getPhases(debug):
434363
def log(function):
435364
"""Logger that records which phase processes each token"""
436365
type_names = dict((value, key) for key, value in
437-
constants.tokenTypes.items())
366+
tokenTypes.items())
438367

439368
def wrapped(self, *args, **kwargs):
440369
if function.__name__.startswith("process") and len(args) > 0:
@@ -443,7 +372,7 @@ def wrapped(self, *args, **kwargs):
443372
info = {"type": type_names[token['type']]}
444373
except:
445374
raise
446-
if token['type'] in constants.tagTokenTypes:
375+
if token['type'] in tagTokenTypes:
447376
info["name"] = token['name']
448377

449378
self.parser.log.append((self.parser.tokenizer.state.__name__,
@@ -1022,17 +951,9 @@ def __init__(self, parser, tree):
1022951
self.endTagHandler.default = self.endTagOther
1023952

1024953
def isMatchingFormattingElement(self, node1, node2):
1025-
if node1.name != node2.name or node1.namespace != node2.namespace:
1026-
return False
1027-
elif len(node1.attributes) != len(node2.attributes):
1028-
return False
1029-
else:
1030-
attributes1 = sorted(node1.attributes.items())
1031-
attributes2 = sorted(node2.attributes.items())
1032-
for attr1, attr2 in zip(attributes1, attributes2):
1033-
if attr1 != attr2:
1034-
return False
1035-
return True
954+
return (node1.name == node2.name and
955+
node1.namespace == node2.namespace and
956+
node1.attributes == node2.attributes)
1036957

1037958
# helper
1038959
def addFormattingElement(self, token):
@@ -2798,6 +2719,16 @@ def processEndTag(self, token):
27982719
}
27992720

28002721

2722+
def adjust_attributes(token, replacements):
    """Rename a token's attributes according to a lookup table.

    ``token['data']`` holds the attributes as a name -> value mapping and
    ``replacements`` maps an original attribute name to its adjusted name.

    If at least one attribute name is a key of ``replacements``,
    ``token['data']`` is rebound to a new ``OrderedDict`` with the same
    values, in the same order, stored under the adjusted names.  Names
    without a replacement are kept as they are.  If nothing matches, the
    token is left untouched.

    Returns nothing; the token is updated in place.
    """
    attributes = token['data']
    # Only rebuild the mapping when something actually needs renaming.
    # any() short-circuits on the first hit and behaves the same on every
    # Python version, so no version-specific key-view handling is needed.
    if any(name in replacements for name in attributes):
        token['data'] = OrderedDict((replacements.get(name, name), value)
                                    for name, value in attributes.items())
2730+
2731+
28012732
def impliedTagToken(name, type="EndTag", attributes=None,
28022733
selfClosing=False):
28032734
if attributes is None:

html5lib/tests/test_parser2.py

+40-1
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,13 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
from six import PY2, text_type
4+
35
import io
46

57
from . import support # noqa
68

79
from html5lib.constants import namespaces
8-
from html5lib import parse
10+
from html5lib import parse, HTMLParser
911

1012

1113
# tests that aren't autogenerated from text files
@@ -49,3 +51,40 @@ def test_namespace_html_elements_1_etree():
4951

5052
def test_unicode_file():
5153
assert parse(io.StringIO("a")) is not None
54+
55+
56+
def test_duplicate_attribute():
    """A repeated attribute on one tag keeps the first value given."""
    # Lives in this module because the duplicate handling is implemented in
    # the parser rather than in the tokenizer.
    document = parse('<p class=a class=b>')
    element = document[1][0]
    assert element.get("class") == "a"
61+
62+
63+
def test_debug_log():
    """Parse a small document in debug mode and check the recorded log.

    Every expected entry is a 5-tuple of four names followed by a dict that
    describes the token: its type and, for start/end tags, its name.
    NOTE(review): the four names are presumably the tokenizer state, two
    phase names and the handler method -- confirm against the debug logger.
    """
    debug_parser = HTMLParser(debug=True)
    debug_parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")

    expected_log = [
        ('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype',
         {'type': 'Doctype'}),
        ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag',
         {'name': 'title', 'type': 'StartTag'}),
        ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag',
         {'name': 'title', 'type': 'StartTag'}),
        ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag',
         {'name': 'title', 'type': 'StartTag'}),
        ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters',
         {'type': 'Characters'}),
        ('dataState', 'TextPhase', 'TextPhase', 'processEndTag',
         {'name': 'title', 'type': 'EndTag'}),
        ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag',
         {'name': 'p', 'type': 'StartTag'}),
        ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag',
         {'name': 'p', 'type': 'StartTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag',
         {'name': 'p', 'type': 'StartTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters',
         {'type': 'Characters'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag',
         {'name': 'script', 'type': 'StartTag'}),
        ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag',
         {'name': 'script', 'type': 'StartTag'}),
        ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters',
         {'type': 'Characters'}),
        ('dataState', 'TextPhase', 'TextPhase', 'processEndTag',
         {'name': 'script', 'type': 'EndTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters',
         {'type': 'Characters'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag',
         {'name': 'p', 'type': 'EndTag'}),
        ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters',
         {'type': 'Characters'}),
    ]

    if PY2:
        # On Python 2 the text fields of each entry are encoded to ASCII
        # byte strings before comparing; the trailing dict is left as is.
        expected_log = [
            tuple(field.encode("ascii") if isinstance(field, text_type) else field
                  for field in entry)
            for entry in expected_log
        ]

    assert debug_parser.log == expected_log

0 commit comments

Comments
 (0)