Why do pickle + gzip outperform h5py on repetitive datasets?
Question: I am saving a NumPy array that contains repetitive data: import numpy as np import gzip import cPickle as pkl import h5py a = np.random.randn(100000, 10) b = np.hstack( [a[cnt:a.shape[0]-10+cnt+1] for cnt in range(10)] ) f_pkl_gz = gzip.open('noise.pkl.gz', 'w') pkl.dump(b, f_pkl_gz, protocol = pkl.HIGHEST_PROTOCOL) f_pkl_gz.close() f_pkl = open('noise.pkl', 'w') pkl.dump(b, f_pkl, protocol = pkl.HIGHEST_PROTOCOL) f_pkl.close() f_hdf5 = h5py.File('noise.hdf5', 'w') f_hdf5.create_dataset('b',