5
5
import sys
6
6
import tempfile
7
7
import textwrap
8
+ import typing
9
+ import urllib .parse
10
+ import urllib .request
8
11
from pathlib import Path
9
12
from typing import Iterable , Iterator , Union
10
13
11
14
from .handler import _check_log_handler , logger
12
15
from .pandoc_download import DEFAULT_TARGET_FOLDER , download_pandoc
13
- from .py3compat import cast_bytes , cast_unicode , url2path , urlparse
14
16
15
17
__author__ = "Juho Vepsäläinen"
16
18
__author_email__ = "[email protected] "
53
55
54
56
55
57
def convert_text (
56
- source : str ,
58
+ source : typing . Union [ str , bytes ] ,
57
59
to : str ,
58
60
format : str ,
59
61
extra_args : Iterable = (),
@@ -66,7 +68,7 @@ def convert_text(
66
68
) -> str :
67
69
"""Converts given `source` from `format` to `to`.
68
70
69
- :param str source: Unicode string or bytes (see encoding)
71
+ :param source: Unicode string or bytes (see encoding)
70
72
71
73
:param str to: format into which the input should be converted;
72
74
can be one of `pypandoc.get_pandoc_formats()[1]`
@@ -106,7 +108,10 @@ def convert_text(
106
108
if pandoc is not found; make sure it has been installed
107
109
and is available at path.
108
110
"""
109
- source = _as_unicode (source , encoding )
111
+
112
+ if isinstance (source , bytes ):
113
+ source = source .decode (encoding , errors = "ignore" )
114
+
110
115
return _convert_input (
111
116
source ,
112
117
format ,
@@ -286,7 +291,7 @@ def _identify_path(source) -> bool:
286
291
if not is_path :
287
292
try :
288
293
# check if it's a URL
289
- result = urlparse (source )
294
+ result = urllib . parse . urlparse (source )
290
295
if result .scheme in ["http" , "https" ]:
291
296
is_path = True
292
297
elif result .scheme and result .netloc and result .path :
@@ -303,7 +308,7 @@ def _identify_path(source) -> bool:
303
308
def _is_network_path (source ):
304
309
try :
305
310
# check if it's a URL
306
- result = urlparse (source )
311
+ result = urllib . parse . urlparse (source )
307
312
if result .scheme in ["http" , "https" ]:
308
313
return True
309
314
elif result .scheme and result .netloc and result .path :
@@ -320,17 +325,6 @@ def _identify_format_from_path(sourcefile: str, format: str) -> str:
320
325
return format or os .path .splitext (sourcefile )[1 ].strip ("." )
321
326
322
327
323
- def _as_unicode (source : any , encoding : str ) -> any :
324
- if encoding != "utf-8" :
325
- # if a source and a different encoding is given,
326
- # try to decode the source into a string
327
- try :
328
- source = cast_unicode (source , encoding = encoding )
329
- except (UnicodeDecodeError , UnicodeEncodeError ):
330
- pass
331
- return source
332
-
333
-
334
328
def normalize_format (fmt ):
335
329
formats = {
336
330
"dbk" : "docbook" ,
@@ -404,7 +398,7 @@ def _validate_formats(format, to, outputfile):
404
398
405
399
406
400
def _convert_input (
407
- source ,
401
+ source : str ,
408
402
format ,
409
403
input_type ,
410
404
to ,
@@ -509,19 +503,9 @@ def _convert_input(
509
503
)
510
504
511
505
if string_input :
512
- try :
513
- source = cast_bytes (source , encoding = "utf-8" )
514
- except (UnicodeDecodeError , UnicodeEncodeError ):
515
- # assume that it is already a utf-8 encoded string
516
- pass
517
- try :
518
- stdout , stderr = p .communicate (source if string_input else None )
519
- except OSError :
520
- # this is happening only on Py2.6 when pandoc dies before reading all
521
- # the input. We treat that the same as when we exit with an error...
522
- raise RuntimeError (
523
- 'Pandoc died with exitcode "%s" during conversion.' % (p .returncode )
524
- )
506
+ if isinstance (source , str ):
507
+ source = source .encode ("utf-8" )
508
+ stdout , stderr = p .communicate (source if string_input else None )
525
509
526
510
try :
527
511
if not (to in ["odt" , "docx" , "epub" , "epub3" , "pdf" ] and outputfile == "-" ):
@@ -957,6 +941,10 @@ def ensure_pandoc_installed(
957
941
_ensure_pandoc_path ()
958
942
959
943
944
def url2path(url):
    """Return the local filesystem path named by the path component of ``url``.

    Only the path portion of the parsed URL is used; scheme, network
    location, query and fragment are discarded. The path is then handed to
    ``urllib.request.url2pathname`` to turn it into a local path string.

    :param url: URL string, e.g. ``file:///tmp/input.md``

    :returns: the result of ``url2pathname`` applied to the URL's path
    """
    parsed = urllib.parse.urlparse(url)
    return urllib.request.url2pathname(parsed.path)
946
+
947
+
960
948
# -----------------------------------------------------------------------------
961
949
# Internal state management
962
950
# -----------------------------------------------------------------------------
0 commit comments