Skip to content

Commit e82d093

Browse files
kawochen authored and jreback committed
API: to_msgpack and read_msgpack encoding defaults to utf-8
closes #12170

Author: Ka Wo Chen <[email protected]>

Closes #12277 from kawochen/API-12170 and squashes the following commits:

5adcf3b [Ka Wo Chen] API: to_msgpack and read_msgpack encoding defaults to utf-8
1 parent 81e3303 commit e82d093

File tree

4 files changed

+27
-14
lines changed

4 files changed

+27
-14
lines changed

doc/source/whatsnew/v0.18.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -346,6 +346,7 @@ Backwards incompatible API changes
346346
- ``DataFrame.round()`` leaves non-numeric columns unchanged in its return, rather than raises. (:issue:`11885`)
347347
- ``DataFrame.head(0)`` and ``DataFrame.tail(0)`` return empty frames, rather than ``self``. (:issue:`11937`)
348348
- ``Series.head(0)`` and ``Series.tail(0)`` return empty series, rather than ``self``. (:issue:`11937`)
349+
- ``to_msgpack`` and ``read_msgpack`` encoding now defaults to ``'utf-8'``. (:issue:`12170`)
349350

350351
NaT and Timedelta operations
351352
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

pandas/core/generic.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -939,7 +939,7 @@ def to_hdf(self, path_or_buf, key, **kwargs):
939939
from pandas.io import pytables
940940
return pytables.to_hdf(path_or_buf, key, self, **kwargs)
941941

942-
def to_msgpack(self, path_or_buf=None, **kwargs):
942+
def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
943943
"""
944944
msgpack (serialize) object to input file path
945945
@@ -957,7 +957,8 @@ def to_msgpack(self, path_or_buf=None, **kwargs):
957957
"""
958958

959959
from pandas.io import packers
960-
return packers.to_msgpack(path_or_buf, self, **kwargs)
960+
return packers.to_msgpack(path_or_buf, self, encoding=encoding,
961+
**kwargs)
961962

962963
def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail',
963964
index=True, index_label=None, chunksize=None, dtype=None):

pandas/io/packers.py

+8-6
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,7 @@ def to_msgpack(path_or_buf, *args, **kwargs):
7575
path_or_buf : string File path, buffer-like, or None
7676
if None, return generated string
7777
args : an object or objects to serialize
78+
encoding: encoding for unicode objects
7879
append : boolean whether to append to an existing msgpack
7980
(default is False)
8081
compress : type of compressor (zlib or blosc), default to None (no
@@ -103,7 +104,7 @@ def writer(fh):
103104
writer(path_or_buf)
104105

105106

106-
def read_msgpack(path_or_buf, iterator=False, **kwargs):
107+
def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
107108
"""
108109
Load msgpack pandas object from the specified
109110
file path
@@ -114,6 +115,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
114115
Parameters
115116
----------
116117
path_or_buf : string File path, BytesIO like or string
118+
encoding: Encoding for decoding msgpack str type
117119
iterator : boolean, if True, return an iterator to the unpacker
118120
(default is False)
119121
@@ -127,7 +129,7 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs):
127129
return Iterator(path_or_buf)
128130

129131
def read(fh):
130-
l = list(unpack(fh, **kwargs))
132+
l = list(unpack(fh, encoding=encoding, **kwargs))
131133
if len(l) == 1:
132134
return l[0]
133135
return l
@@ -573,7 +575,7 @@ def create_block(b):
573575

574576

575577
def pack(o, default=encode,
576-
encoding='latin1', unicode_errors='strict', use_single_float=False,
578+
encoding='utf-8', unicode_errors='strict', use_single_float=False,
577579
autoreset=1, use_bin_type=1):
578580
"""
579581
Pack an object and return the packed bytes.
@@ -587,7 +589,7 @@ def pack(o, default=encode,
587589

588590

589591
def unpack(packed, object_hook=decode,
590-
list_hook=None, use_list=False, encoding='latin1',
592+
list_hook=None, use_list=False, encoding='utf-8',
591593
unicode_errors='strict', object_pairs_hook=None,
592594
max_buffer_size=0, ext_hook=ExtType):
593595
"""
@@ -607,7 +609,7 @@ def unpack(packed, object_hook=decode,
607609
class Packer(_Packer):
608610

609611
def __init__(self, default=encode,
610-
encoding='latin1',
612+
encoding='utf-8',
611613
unicode_errors='strict',
612614
use_single_float=False,
613615
autoreset=1,
@@ -624,7 +626,7 @@ class Unpacker(_Unpacker):
624626

625627
def __init__(self, file_like=None, read_size=0, use_list=False,
626628
object_hook=decode,
627-
object_pairs_hook=None, list_hook=None, encoding='latin1',
629+
object_pairs_hook=None, list_hook=None, encoding='utf-8',
628630
unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
629631
super(Unpacker, self).__init__(file_like=file_like,
630632
read_size=read_size,

pandas/io/tests/test_packers.py

+15-6
Original file line number | Diff line number | Diff line change
@@ -299,11 +299,8 @@ def test_multi_index(self):
299299
def test_unicode(self):
300300
i = tm.makeUnicodeIndex(100)
301301

302-
# this currently fails
303-
self.assertRaises(UnicodeEncodeError, self.encode_decode, i)
304-
305-
# i_rec = self.encode_decode(i)
306-
# self.assertTrue(i.equals(i_rec))
302+
i_rec = self.encode_decode(i)
303+
self.assertTrue(i.equals(i_rec))
307304

308305

309306
class TestSeries(TestPackers):
@@ -615,6 +612,14 @@ def test_utf(self):
615612
result = self.encode_decode(frame, encoding=encoding)
616613
assert_frame_equal(result, frame)
617614

615+
def test_default_encoding(self):
616+
for frame in compat.itervalues(self.frame):
617+
result = frame.to_msgpack()
618+
expected = frame.to_msgpack(encoding='utf8')
619+
self.assertEqual(result, expected)
620+
result = self.encode_decode(frame)
621+
assert_frame_equal(result, frame)
622+
618623

619624
class TestMsgpack():
620625
"""
@@ -652,7 +657,11 @@ def check_min_structure(self, data):
652657
typ], '"{0}" not found in data["{1}"]'.format(kind, typ)
653658

654659
def compare(self, vf, version):
655-
data = read_msgpack(vf)
660+
# GH12277 encoding default used to be latin-1, now utf-8
661+
if LooseVersion(version) < '0.18.0':
662+
data = read_msgpack(vf, encoding='latin-1')
663+
else:
664+
data = read_msgpack(vf)
656665
self.check_min_structure(data)
657666
for typ, dv in data.items():
658667
assert typ in self.all_data, ('unpacked data contains '

0 commit comments

Comments
 (0)