@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128
128
return b"" .join (rv )
129
129
130
130
131
- def HTMLInputStream (source , encoding = None , parseMeta = True , chardet = True ):
131
+ def HTMLInputStream (source , override_encoding = None , ** kwargs ):
132
132
# Work around Python bug #20007: read(0) closes the connection.
133
133
# http://bugs.python.org/issue20007
134
134
if (isinstance (source , http_client .HTTPResponse ) or
@@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142
142
isUnicode = isinstance (source , text_type )
143
143
144
144
if isUnicode :
145
- if encoding is not None :
146
- raise TypeError ("Cannot explicitly set an encoding with a unicode string " )
145
+ if override_encoding is not None :
146
+ raise TypeError ("Cannot set an override encoding with a unicode input " )
147
147
148
148
return HTMLUnicodeInputStream (source )
149
149
else :
150
- return HTMLBinaryInputStream (source , encoding , parseMeta , chardet )
150
+ return HTMLBinaryInputStream (source , override_encoding = override_encoding , ** kwargs )
151
151
152
152
153
153
class HTMLUnicodeInputStream (object ):
@@ -173,8 +173,6 @@ def __init__(self, source):
173
173
regardless of any BOM or later declaration (such as in a meta
174
174
element)
175
175
176
- parseMeta - Look for a <meta> element containing encoding information
177
-
178
176
"""
179
177
180
178
if not utils .supports_lone_surrogates :
@@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390
388
391
389
"""
392
390
393
- def __init__ (self , source , encoding = None , parseMeta = True , chardet = True ):
391
+ def __init__ (self , source , override_encoding = None , transport_encoding = None ,
392
+ same_origin_parent_encoding = None , likely_encoding = None ,
393
+ default_encoding = "windows-1252" , useChardet = True ):
394
394
"""Initialises the HTMLInputStream.
395
395
396
396
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403
403
regardless of any BOM or later declaration (such as in a meta
404
404
element)
405
405
406
- parseMeta - Look for a <meta> element containing encoding information
407
-
408
406
"""
409
407
# Raw Stream - for unicode objects this will encode to utf-8 and set
410
408
# self.charEncoding as appropriate
411
409
self .rawStream = self .openStream (source )
412
410
413
411
HTMLUnicodeInputStream .__init__ (self , self .rawStream )
414
412
415
- self .charEncoding = (lookupEncoding (encoding ), "certain" )
416
-
417
413
# Encoding Information
418
414
# Number of bytes to use when looking for a meta element with
419
415
# encoding information
420
416
self .numBytesMeta = 1024
421
417
# Number of bytes to use when detecting encoding using chardet
422
418
self .numBytesChardet = 100
423
- # Encoding to use if no other information can be found
424
- self .defaultEncoding = "windows-1252"
419
+ # Things from args
420
+ self .override_encoding = override_encoding
421
+ self .transport_encoding = transport_encoding
422
+ self .same_origin_parent_encoding = same_origin_parent_encoding
423
+ self .likely_encoding = likely_encoding
424
+ self .default_encoding = default_encoding
425
425
426
- # Detect encoding iff no explicit "transport level" encoding is supplied
427
- if (self .charEncoding [0 ] is None ):
428
- self .charEncoding = self .detectEncoding (parseMeta , chardet )
429
- assert self .charEncoding [0 ] is not None
426
+ # Determine encoding
427
+ self .charEncoding = self .determineEncoding (useChardet )
428
+ assert self .charEncoding [0 ] is not None
430
429
431
430
# Call superclass
432
431
self .reset ()
@@ -454,21 +453,45 @@ def openStream(self, source):
454
453
455
454
return stream
456
455
457
- def detectEncoding (self , parseMeta = True , chardet = True ):
458
- # First look for a BOM
456
+ def determineEncoding (self , chardet = True ):
457
+ # BOMs take precedence over everything
459
458
# This will also read past the BOM if present
460
- encoding = self .detectBOM ()
461
- confidence = "certain"
462
- # If there is no BOM need to look for meta elements with encoding
463
- # information
464
- if encoding is None and parseMeta :
465
- encoding = self .detectEncodingMeta ()
466
- confidence = "tentative"
459
+ charEncoding = self .detectBOM (), "certain"
460
+ if charEncoding [0 ] is not None :
461
+ return charEncoding
462
+
463
+ # If we've been overridden, we've been overridden
464
+ charEncoding = lookupEncoding (self .override_encoding ), "certain"
465
+ if charEncoding [0 ] is not None :
466
+ return charEncoding
467
+
468
+ # Now check the transport layer
469
+ charEncoding = lookupEncoding (self .transport_encoding ), "certain"
470
+ if charEncoding [0 ] is not None :
471
+ return charEncoding
472
+
473
+ # Look for meta elements with encoding information
474
+ charEncoding = self .detectEncodingMeta (), "tentative"
475
+ if charEncoding [0 ] is not None :
476
+ return charEncoding
477
+
478
+ # Parent document encoding
479
+ charEncoding = lookupEncoding (self .same_origin_parent_encoding ), "tentative"
480
+ if charEncoding [0 ] is not None and not charEncoding [0 ].name .startswith ("utf-16" ):
481
+ return charEncoding
482
+
483
+ # "likely" encoding
484
+ charEncoding = lookupEncoding (self .likely_encoding ), "tentative"
485
+ if charEncoding [0 ] is not None :
486
+ return charEncoding
487
+
467
488
# Guess with chardet, if available
468
- if encoding is None and chardet :
469
- confidence = "tentative"
489
+ if chardet :
470
490
try :
471
491
from chardet .universaldetector import UniversalDetector
492
+ except ImportError :
493
+ pass
494
+ else :
472
495
buffers = []
473
496
detector = UniversalDetector ()
474
497
while not detector .done :
@@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481
504
detector .close ()
482
505
encoding = lookupEncoding (detector .result ['encoding' ])
483
506
self .rawStream .seek (0 )
484
- except ImportError :
485
- pass
486
- # If all else fails use the default encoding
487
- if encoding is None :
488
- confidence = "tentative"
489
- encoding = lookupEncoding (self .defaultEncoding )
507
+ if encoding is not None :
508
+ return encoding , "tentative"
509
+
510
+ # Try the default encoding
511
+ charEncoding = lookupEncoding (self .default_encoding ), "tentative"
512
+ if charEncoding [0 ] is not None :
513
+ return charEncoding
490
514
491
- return encoding , confidence
515
+ # Fall back to html5lib's default if even that hasn't worked
516
+ return lookupEncoding ("windows-1252" ), "tentative"
492
517
493
518
def changeEncoding (self , newEncoding ):
494
519
assert self .charEncoding [1 ] != "certain"
0 commit comments