From 2cafb82d0ee4d45ce9135872258dccb8f74d14a5 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 27 Apr 2014 11:27:41 -0400
Subject: [PATCH] PERF: improved performance of compatible pickles (GH6899)

---
 doc/source/release.rst |  1 +
 doc/source/v0.14.0.txt |  2 ++
 pandas/io/pickle.py    | 18 +++++++++++++++---
 vb_suite/packers.py    | 31 ++++++++++++++++++++-----------
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index fce5f2f93e68b..d100541ecbf92 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -292,6 +292,7 @@ Improvements to existing features
   specified (:issue:`6607`)
 - ``read_excel`` can now read milliseconds in Excel dates and times with xlrd >= 0.9.3. (:issue:`5945`)
 - ``pivot_table`` can now accept ``Grouper`` by ``index`` and ``columns`` keywords (:issue:`6913`)
+- Improved performance of compatible pickles (:issue:`6899`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index acacbd1c0b43c..43096b133f26e 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -522,6 +522,8 @@ Performance
   (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`)
 - Improve performance of ``CustomBusinessDay`` (:issue:`6584`)
 - improve performance of slice indexing on Series with string keys (:issue:`6341`, :issue:`6372`)
+- Performance improvements in timedelta conversions for integer dtypes (:issue:`6754`)
+- Improved performance of compatible pickles (:issue:`6899`)
 
 Experimental
 ~~~~~~~~~~~~
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 915c1e9ae1574..e80bfec9c8dba 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -34,16 +34,28 @@ def read_pickle(path):
     """
 
     def try_read(path, encoding=None):
+        # try with cPickle
         # try with current pickle, if we have a Type Error then
         # try with the compat pickle to handle subclass changes
         # pass encoding only if its not None as py2 doesn't handle
         # the param
+
+        # cpickle
+        # GH 6899
         try:
             with open(path, 'rb') as fh:
-                return pc.load(fh, encoding=encoding, compat=False)
+                return pkl.load(fh)
         except:
-            with open(path, 'rb') as fh:
-                return pc.load(fh, encoding=encoding, compat=True)
+
+            # reg/patched pickle
+            try:
+                with open(path, 'rb') as fh:
+                    return pc.load(fh, encoding=encoding, compat=False)
+
+            # compat pickle
+            except:
+                with open(path, 'rb') as fh:
+                    return pc.load(fh, encoding=encoding, compat=True)
 
     try:
         return try_read(path)
diff --git a/vb_suite/packers.py b/vb_suite/packers.py
index f2eac0e28cd44..ca0193e9b2c10 100644
--- a/vb_suite/packers.py
+++ b/vb_suite/packers.py
@@ -7,6 +7,7 @@
 import os
 import pandas as pd
 from pandas.core import common as com
+from random import randrange
 
 f = '__test__.msg'
 def remove(f):
@@ -15,10 +16,18 @@ def remove(f):
     except:
         pass
 
-index = date_range('20000101',periods=50000,freq='H')
-df = DataFrame({'float1' : randn(50000),
-                'float2' : randn(50000)},
+N=100000
+C=5
+index = date_range('20000101',periods=N,freq='H')
+df = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
                index=index)
+
+N=100000
+C=5
+index = date_range('20000101',periods=N,freq='H')
+df2 = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
+               index=index)
+df2['object'] = ['%08x'%randrange(16**8) for _ in range(N)]
 remove(f)
 """
 
 #----------------------------------------------------------------------
 # msgpack
 
 setup = common_setup + """
-df.to_msgpack(f)
+df2.to_msgpack(f)
 """
 
 packers_read_pack = Benchmark("pd.read_msgpack(f)", setup, start_date=start_date)
 
 setup = common_setup + """
 """
-packers_write_pack = Benchmark("df.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_pack = Benchmark("df2.to_msgpack(f)", setup, cleanup="remove(f)", start_date=start_date)
 
 #----------------------------------------------------------------------
 # pickle
 
 setup = common_setup + """
-df.to_pickle(f)
+df2.to_pickle(f)
 """
 
 packers_read_pickle = Benchmark("pd.read_pickle(f)", setup, start_date=start_date)
 
 setup = common_setup + """
 """
-packers_write_pickle = Benchmark("df.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_pickle = Benchmark("df2.to_pickle(f)", setup, cleanup="remove(f)", start_date=start_date)
 
 #----------------------------------------------------------------------
 # csv
 
 #----------------------------------------------------------------------
 # hdf store
 
 setup = common_setup + """
-df.to_hdf(f,'df')
+df2.to_hdf(f,'df')
 """
 
 packers_read_hdf_store = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date)
 
 setup = common_setup + """
 """
-packers_write_hdf_store = Benchmark("df.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_hdf_store = Benchmark("df2.to_hdf(f,'df')", setup, cleanup="remove(f)", start_date=start_date)
 
 #----------------------------------------------------------------------
 # hdf table
 
 setup = common_setup + """
-df.to_hdf(f,'df',table=True)
+df2.to_hdf(f,'df',table=True)
 """
 
 packers_read_hdf_table = Benchmark("pd.read_hdf(f,'df')", setup, start_date=start_date)
 
 setup = common_setup + """
 """
-packers_write_hdf_table = Benchmark("df.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date)
+packers_write_hdf_table = Benchmark("df2.to_hdf(f,'df',table=True)", setup, cleanup="remove(f)", start_date=start_date)
 
 #----------------------------------------------------------------------
 # json
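
Note on the change in pandas/io/pickle.py: ``read_pickle`` now tries the fast standard pickle loader first and only falls back to the slower compatibility loaders when that attempt raises. Below is a minimal sketch of that tiered-fallback pattern; ``compat_load`` is a hypothetical stand-in for pandas' internal ``pickle_compat.load`` (imported as ``pc`` in the patch), wired to the standard library only so the sketch runs on its own. It is not pandas API and not the exact code in the patch.

```python
import pickle


def compat_load(fh, encoding=None, compat=False):
    # Hypothetical stand-in for pandas' internal compat loader. Here it just
    # delegates to the standard library; the real loader additionally remaps
    # renamed/moved pandas classes so old pickles can still be resolved.
    if encoding is not None:
        return pickle.load(fh, encoding=encoding)
    return pickle.load(fh)


def read_pickle_sketch(path, encoding=None):
    # 1. Fast path: plain pickle handles files written by a compatible version.
    try:
        with open(path, 'rb') as fh:
            return pickle.load(fh)
    except Exception:
        pass

    # 2. Next, the patched loader without full class-name remapping.
    try:
        with open(path, 'rb') as fh:
            return compat_load(fh, encoding=encoding, compat=False)
    except Exception:
        pass

    # 3. Last resort: full compatibility mode for pickles from old versions.
    with open(path, 'rb') as fh:
        return compat_load(fh, encoding=encoding, compat=True)
```

Ordering the attempts this way means pickles written by the current version pay only the cost of the plain loader, while older pickles still load through the compatibility path, which is the performance point of GH6899.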
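
For reference, the benchmark data that the updated vb_suite/packers.py builds inside its vbench setup string can be reproduced standalone roughly as follows (numpy and pandas assumed installed). The frame shape, file name, and cleanup mirror the patch, but this script itself is illustrative and not part of it.

```python
# Rebuild the df2 used by the read/write benchmarks: C float columns plus
# one object (string) column of random 8-digit hex values.
import os
from random import randrange

import numpy as np
import pandas as pd

N = 100000
C = 5
index = pd.date_range('20000101', periods=N, freq='H')
df2 = pd.DataFrame({'float{0}'.format(i): np.random.randn(N) for i in range(C)},
                   index=index)
df2['object'] = ['%08x' % randrange(16 ** 8) for _ in range(N)]

f = '__test__.msg'
df2.to_pickle(f)              # roughly what packers_write_pickle times
result = pd.read_pickle(f)    # roughly what packers_read_pickle times
os.remove(f)                  # mirrors the remove(f) cleanup in the suite
```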