CofeehousePy/deps/numpy/benchmarks/benchmarks/bench_io.py

245 lines
7.7 KiB
Python

from .common import Benchmark, get_squares
import numpy as np
from io import StringIO
class Copy(Benchmark):
params = ["int8", "int16", "float32", "float64",
"complex64", "complex128"]
param_names = ['type']
def setup(self, typename):
dtype = np.dtype(typename)
self.d = np.arange((50 * 500), dtype=dtype).reshape((500, 50))
self.e = np.arange((50 * 500), dtype=dtype).reshape((50, 500))
self.e_d = self.e.reshape(self.d.shape)
self.dflat = np.arange((50 * 500), dtype=dtype)
def time_memcpy(self, typename):
self.d[...] = self.e_d
def time_memcpy_large_out_of_place(self, typename):
l = np.ones(1024**2, dtype=np.dtype(typename))
l.copy()
def time_cont_assign(self, typename):
self.d[...] = 1
def time_strided_copy(self, typename):
self.d[...] = self.e.T
def time_strided_assign(self, typename):
self.dflat[::2] = 2
class CopyTo(Benchmark):
def setup(self):
self.d = np.ones(50000)
self.e = self.d.copy()
self.m = (self.d == 1)
self.im = (~ self.m)
self.m8 = self.m.copy()
self.m8[::8] = (~ self.m[::8])
self.im8 = (~ self.m8)
def time_copyto(self):
np.copyto(self.d, self.e)
def time_copyto_sparse(self):
np.copyto(self.d, self.e, where=self.m)
def time_copyto_dense(self):
np.copyto(self.d, self.e, where=self.im)
def time_copyto_8_sparse(self):
np.copyto(self.d, self.e, where=self.m8)
def time_copyto_8_dense(self):
np.copyto(self.d, self.e, where=self.im8)
class Savez(Benchmark):
def setup(self):
self.squares = get_squares()
def time_vb_savez_squares(self):
np.savez('tmp.npz', **self.squares)
class LoadtxtCSVComments(Benchmark):
# benchmarks for np.loadtxt comment handling
# when reading in CSV files
params = [10, int(1e2), int(1e4), int(1e5)]
param_names = ['num_lines']
def setup(self, num_lines):
data = [u'1,2,3 # comment'] * num_lines
# unfortunately, timeit will only run setup()
# between repeat events, but not for iterations
# within repeats, so the StringIO object
# will have to be rewinded in the benchmark proper
self.data_comments = StringIO(u'\n'.join(data))
def time_comment_loadtxt_csv(self, num_lines):
# benchmark handling of lines with comments
# when loading in from csv files
# inspired by similar benchmark in pandas
# for read_csv
# need to rewind StringIO object (unfortunately
# confounding timing result somewhat) for every
# call to timing test proper
np.loadtxt(self.data_comments,
delimiter=u',')
self.data_comments.seek(0)
class LoadtxtCSVdtypes(Benchmark):
# benchmarks for np.loadtxt operating with
# different dtypes parsed / cast from CSV files
params = (['float32', 'float64', 'int32', 'int64',
'complex128', 'str', 'object'],
[10, int(1e2), int(1e4), int(1e5)])
param_names = ['dtype', 'num_lines']
def setup(self, dtype, num_lines):
data = [u'5, 7, 888'] * num_lines
self.csv_data = StringIO(u'\n'.join(data))
def time_loadtxt_dtypes_csv(self, dtype, num_lines):
# benchmark loading arrays of various dtypes
# from csv files
# state-dependent timing benchmark requires
# rewind of StringIO object
np.loadtxt(self.csv_data,
delimiter=u',',
dtype=dtype)
self.csv_data.seek(0)
class LoadtxtCSVStructured(Benchmark):
# benchmarks for np.loadtxt operating with
# a structured data type & CSV file
def setup(self):
num_lines = 50000
data = [u"M, 21, 72, X, 155"] * num_lines
self.csv_data = StringIO(u'\n'.join(data))
def time_loadtxt_csv_struct_dtype(self):
# obligate rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.csv_data,
delimiter=u',',
dtype=[('category_1', 'S1'),
('category_2', 'i4'),
('category_3', 'f8'),
('category_4', 'S1'),
('category_5', 'f8')])
self.csv_data.seek(0)
class LoadtxtCSVSkipRows(Benchmark):
# benchmarks for loadtxt row skipping when
# reading in csv file data; a similar benchmark
# is present in the pandas asv suite
params = [0, 500, 10000]
param_names = ['skiprows']
def setup(self, skiprows):
np.random.seed(123)
test_array = np.random.rand(100000, 3)
self.fname = 'test_array.csv'
np.savetxt(fname=self.fname,
X=test_array,
delimiter=',')
def time_skiprows_csv(self, skiprows):
np.loadtxt(self.fname,
delimiter=',',
skiprows=skiprows)
class LoadtxtReadUint64Integers(Benchmark):
# pandas has a similar CSV reading benchmark
# modified to suit np.loadtxt
params = [550, 1000, 10000]
param_names = ['size']
def setup(self, size):
arr = np.arange(size).astype('uint64') + 2**63
self.data1 = StringIO(u'\n'.join(arr.astype(str).tolist()))
arr = arr.astype(object)
arr[500] = -1
self.data2 = StringIO(u'\n'.join(arr.astype(str).tolist()))
def time_read_uint64(self, size):
# mandatory rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.data1)
self.data1.seek(0)
def time_read_uint64_neg_values(self, size):
# mandatory rewind of StringIO object
# between iterations of a repeat:
np.loadtxt(self.data2)
self.data2.seek(0)
class LoadtxtUseColsCSV(Benchmark):
# benchmark selective column reading from CSV files
# using np.loadtxt
params = [2, [1, 3], [1, 3, 5, 7]]
param_names = ['usecols']
def setup(self, usecols):
num_lines = 5000
data = [u'0, 1, 2, 3, 4, 5, 6, 7, 8, 9'] * num_lines
self.csv_data = StringIO(u'\n'.join(data))
def time_loadtxt_usecols_csv(self, usecols):
# must rewind StringIO because of state
# dependence of file reading
np.loadtxt(self.csv_data,
delimiter=u',',
usecols=usecols)
self.csv_data.seek(0)
class LoadtxtCSVDateTime(Benchmark):
# benchmarks for np.loadtxt operating with
# datetime data in a CSV file
params = [20, 200, 2000, 20000]
param_names = ['num_lines']
def setup(self, num_lines):
# create the equivalent of a two-column CSV file
# with date strings in the first column and random
# floating point data in the second column
dates = np.arange('today', 20, dtype=np.datetime64)
np.random.seed(123)
values = np.random.rand(20)
date_line = u''
for date, value in zip(dates, values):
date_line += (str(date) + ',' + str(value) + '\n')
# expand data to specified number of lines
data = date_line * (num_lines // 20)
self.csv_data = StringIO(data)
def time_loadtxt_csv_datetime(self, num_lines):
# rewind StringIO object -- the timing iterations
# are state-dependent
X = np.loadtxt(self.csv_data,
delimiter=u',',
dtype=([('dates', 'M8[us]'),
('values', 'float64')]))
self.csv_data.seek(0)