@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128
128
return b"" .join (rv )
129
129
130
130
131
- def HTMLInputStream (source , encoding = None , parseMeta = True , chardet = True ):
131
+ def HTMLInputStream (source , ** kwargs ):
132
132
# Work around Python bug #20007: read(0) closes the connection.
133
133
# http://bugs.python.org/issue20007
134
134
if (isinstance (source , http_client .HTTPResponse ) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142
142
isUnicode = isinstance (source , text_type )
143
143
144
144
if isUnicode :
145
- if encoding is not None :
146
- raise TypeError ("Cannot explicitly set an encoding with a unicode string" )
145
+ encodings = [x for x in kwargs if x .endswith ("_encoding" )]
146
+ if encodings :
147
+ raise TypeError ("Cannot set an encoding with a unicode input, set %r" % encodings )
147
148
148
- return HTMLUnicodeInputStream (source )
149
+ return HTMLUnicodeInputStream (source , ** kwargs )
149
150
else :
150
- return HTMLBinaryInputStream (source , encoding , parseMeta , chardet )
151
+ return HTMLBinaryInputStream (source , ** kwargs )
151
152
152
153
153
154
class HTMLUnicodeInputStream (object ):
@@ -173,8 +174,6 @@ def __init__(self, source):
173
174
regardless of any BOM or later declaration (such as in a meta
174
175
element)
175
176
176
- parseMeta - Look for a <meta> element containing encoding information
177
-
178
177
"""
179
178
180
179
if not utils .supports_lone_surrogates :
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390
389
391
390
"""
392
391
393
- def __init__ (self , source , encoding = None , parseMeta = True , chardet = True ):
392
+ def __init__ (self , source , override_encoding = None , transport_encoding = None ,
393
+ same_origin_parent_encoding = None , likely_encoding = None ,
394
+ default_encoding = "windows-1252" , useChardet = True ):
394
395
"""Initialises the HTMLInputStream.
395
396
396
397
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403
404
regardless of any BOM or later declaration (such as in a meta
404
405
element)
405
406
406
- parseMeta - Look for a <meta> element containing encoding information
407
-
408
407
"""
409
408
# Raw Stream - for unicode objects this will encode to utf-8 and set
410
409
# self.charEncoding as appropriate
411
410
self .rawStream = self .openStream (source )
412
411
413
412
HTMLUnicodeInputStream .__init__ (self , self .rawStream )
414
413
415
- self .charEncoding = (lookupEncoding (encoding ), "certain" )
416
-
417
414
# Encoding Information
418
415
# Number of bytes to use when looking for a meta element with
419
416
# encoding information
420
417
self .numBytesMeta = 1024
421
418
# Number of bytes to use when detecting encoding using chardet
422
419
self .numBytesChardet = 100
423
- # Encoding to use if no other information can be found
424
- self .defaultEncoding = "windows-1252"
420
+ # Things from args
421
+ self .override_encoding = override_encoding
422
+ self .transport_encoding = transport_encoding
423
+ self .same_origin_parent_encoding = same_origin_parent_encoding
424
+ self .likely_encoding = likely_encoding
425
+ self .default_encoding = default_encoding
425
426
426
- # Detect encoding iff no explicit "transport level" encoding is supplied
427
- if (self .charEncoding [0 ] is None ):
428
- self .charEncoding = self .detectEncoding (parseMeta , chardet )
429
- assert self .charEncoding [0 ] is not None
427
+ # Determine encoding
428
+ self .charEncoding = self .determineEncoding (useChardet )
429
+ assert self .charEncoding [0 ] is not None
430
430
431
431
# Call superclass
432
432
self .reset ()
@@ -454,21 +454,45 @@ def openStream(self, source):
454
454
455
455
return stream
456
456
457
- def detectEncoding (self , parseMeta = True , chardet = True ):
458
- # First look for a BOM
457
+ def determineEncoding (self , chardet = True ):
458
+ # BOMs take precedence over everything
459
459
# This will also read past the BOM if present
460
- encoding = self .detectBOM ()
461
- confidence = "certain"
462
- # If there is no BOM need to look for meta elements with encoding
463
- # information
464
- if encoding is None and parseMeta :
465
- encoding = self .detectEncodingMeta ()
466
- confidence = "tentative"
460
+ charEncoding = self .detectBOM (), "certain"
461
+ if charEncoding [0 ] is not None :
462
+ return charEncoding
463
+
464
+ # If we've been overridden, we've been overridden
465
+ charEncoding = lookupEncoding (self .override_encoding ), "certain"
466
+ if charEncoding [0 ] is not None :
467
+ return charEncoding
468
+
469
+ # Now check the transport layer
470
+ charEncoding = lookupEncoding (self .transport_encoding ), "certain"
471
+ if charEncoding [0 ] is not None :
472
+ return charEncoding
473
+
474
+ # Look for meta elements with encoding information
475
+ charEncoding = self .detectEncodingMeta (), "tentative"
476
+ if charEncoding [0 ] is not None :
477
+ return charEncoding
478
+
479
+ # Parent document encoding
480
+ charEncoding = lookupEncoding (self .same_origin_parent_encoding ), "tentative"
481
+ if charEncoding [0 ] is not None and not charEncoding [0 ].name .startswith ("utf-16" ):
482
+ return charEncoding
483
+
484
+ # "likely" encoding
485
+ charEncoding = lookupEncoding (self .likely_encoding ), "tentative"
486
+ if charEncoding [0 ] is not None :
487
+ return charEncoding
488
+
467
489
# Guess with chardet, if available
468
- if encoding is None and chardet :
469
- confidence = "tentative"
490
+ if chardet :
470
491
try :
471
492
from chardet .universaldetector import UniversalDetector
493
+ except ImportError :
494
+ pass
495
+ else :
472
496
buffers = []
473
497
detector = UniversalDetector ()
474
498
while not detector .done :
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481
505
detector .close ()
482
506
encoding = lookupEncoding (detector .result ['encoding' ])
483
507
self .rawStream .seek (0 )
484
- except ImportError :
485
- pass
486
- # If all else fails use the default encoding
487
- if encoding is None :
488
- confidence = "tentative"
489
- encoding = lookupEncoding (self .defaultEncoding )
508
+ if encoding is not None :
509
+ return encoding , "tentative"
510
+
511
+ # Try the default encoding
512
+ charEncoding = lookupEncoding (self .default_encoding ), "tentative"
513
+ if charEncoding [0 ] is not None :
514
+ return charEncoding
490
515
491
- return encoding , confidence
516
+ # Fallback to html5lib's default if even that hasn't worked
517
+ return lookupEncoding ("windows-1252" ), "tentative"
492
518
493
519
def changeEncoding (self , newEncoding ):
494
520
assert self .charEncoding [1 ] != "certain"
0 commit comments