PERF: In loadtxt, rely on implicit string conversion.
This patch takes advantage of the possibility of assigning a tuple of
*strs* to a structured dtype with e.g. float fields, and having the strs
be implicitly converted to floats by numpy at the C level.  (A
Python-level fallback is kept to support e.g. hex floats.)  Together
with the previous commit, this provides a massive speedup (~2x on the
loadtxt_dtypes_csv benchmark for 10_000+ ints or floats), but is
beneficial with as few as 100 rows.  Very small reads (10 rows) are
still slower (nearly 2x for object), as are reads using object
dtypes (due to the extra copy), but the tradeoff seems worthwhile.
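
A minimal sketch (not part of the commit) of the trick the message
describes: assigning a tuple of strs to a row of a structured array lets
numpy parse the strings itself, while inputs it rejects (such as hex
floats) raise ValueError and need the Python-level fallback.

    import numpy as np

    row_dtype = np.dtype([('a', np.float64), ('b', np.int64)])
    X = np.zeros(2, dtype=row_dtype)
    X[0] = ('1.5', '42')  # strs parsed to float/int at the C level
    assert (X['a'][0], X['b'][0]) == (1.5, 42)
    try:
        X[1] = ('0x1p-1', '0')  # hex floats are not parsed at the C level
    except ValueError:
        X[1] = (float.fromhex('0x1p-1'), 0)  # Python-level fallback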
anntzer committed Aug 17, 2021
1 parent 048f370 commit 6668564
Showing 1 changed file with 104 additions and 63 deletions.
numpy/lib/npyio.py
@@ -774,17 +774,25 @@ def _floatconv(x):
raise # Raise the original exception, which makes more sense.


_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
(np.bool_, lambda x: bool(int(x))),
(np.uint64, np.uint64),
(np.int64, np.int64),
(np.integer, lambda x: int(float(x))),
(np.longdouble, np.longdouble),
(np.floating, _floatconv),
(complex, lambda x: complex(x.replace('+-', '-'))),
(np.bytes_, methodcaller('encode', 'latin-1')),
(np.unicode_, str),
]
# These converters only ever get str (not bytes) as input.
_CONVERTER_DICT = {
np.bool_: int, # Implicitly converted to bool.
np.uint64: np.uint64,
np.int64: np.int64,
np.integer: lambda x: int(float(x)),
np.longdouble: np.longdouble,
np.floating: _floatconv,
complex: lambda x: complex(x.replace('+-', '-')),
np.bytes_: methodcaller('encode', 'latin-1'),
np.unicode_: str,
}
# These conversions can be done implicitly at the C-level, i.e., assigning
# a str to an array of that dtype will either work as if the conversion was
# explicitly applied first, or will throw a ValueError (_floatconv and complex
# accept more inputs), but will not result in the wrong item being stored.
_IMPLICIT_CONVERTERS = {
_CONVERTER_DICT[tp] for tp in [
np.uint64, np.int64, np.integer, np.longdouble, np.floating, complex]}
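
An illustrative check (not in the diff) of the guarantee stated in the
comment above: for dtypes whose converter is in _IMPLICIT_CONVERTERS,
assigning the raw str either matches the explicit conversion or raises
ValueError, but never silently stores a wrong value.

    import numpy as np

    arr = np.zeros(1, dtype=np.int64)
    arr[0] = '7'                 # same result as the explicit np.int64('7')
    assert arr[0] == np.int64('7')
    try:
        arr[0] = 'not a number'  # rejected at the C level
    except ValueError:
        pass                     # caller falls back to explicit converters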


def _getconv(dtype):
@@ -794,7 +802,7 @@ def _getconv(dtype):
Even when a lambda is returned, it is defined at the toplevel, to allow
testing for equality and enabling optimization for single-type data.
"""
for base, conv in _CONVERTERS:
for base, conv in _CONVERTER_DICT.items():
if issubclass(dtype.type, base):
return conv
return str
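
As an aside (illustrative, not in the diff), insertion order matters for
this issubclass dispatch: np.uint64 and np.int64 are subclasses of
np.integer, so the specific entries must come first in _CONVERTER_DICT to
be picked over the generic int(float(x)) fallback.

    import numpy as np

    assert issubclass(np.uint64, np.integer)   # specific entry wins
    assert issubclass(np.float32, np.floating) # dispatches to _floatconv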
@@ -962,33 +970,6 @@ def split_line(line: str):
line = line.strip('\r\n')
return line.split(delimiter) if line else []

def read_data(lineno_words_iter, chunk_size):
"""
Parse each line, including the first.
Parameters
----------
lineno_words_iter : Iterator[tuple[int, list[str]]]
Iterator returning line numbers and non-empty lines already split
into words.
chunk_size : int
At most `chunk_size` lines are read at a time, with iteration
until all lines are read.
"""
X = []
for lineno, words in lineno_words_iter:
if usecols:
words = usecols_getter(words)
elif len(words) != ncols:
raise ValueError(f"Wrong number of columns at line {lineno}")
# Convert each value according to its column.
X.append(convert_row(words))
if len(X) > chunk_size:
yield X
X = []
if X:
yield X

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Main body of loadtxt.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -1137,6 +1118,7 @@ def tobytes_first(conv, x):
fencode = methodcaller("encode", fencoding)
converters = [conv if conv is not bytes else fencode
for conv in converters]

if len(set(converters)) == 1:
# Optimize single-type data. Note that this is only reached if
# `_getconv` returns equal callables (i.e. not local lambdas) on
@@ -1147,30 +1129,86 @@ def convert_row(vals, _conv=converters[0]):
def convert_row(vals):
return tuple(conv(val) for conv, val in zip(converters, vals))

# read data in chunks and fill it into an array via resize
# over-allocating and shrinking the array later may be faster but is
# probably not relevant compared to the cost of actually reading and
# converting the data
X = None
for x in read_data(lineno_words_iter, _loadtxt_chunksize):
if X is None:
if infer_dtype_size:
X = np.array(x, dtype)
else:
X = np.array(x, row_dtype)
if _IMPLICIT_CONVERTERS.issuperset(converters):

X = np.zeros(256, dtype=row_dtype)
i = None # Just in case there's no entry whatsoever.
for i, (lineno, words) in enumerate(lineno_words_iter):
if usecols:
words = usecols_getter(words)
elif len(words) != ncols:
raise ValueError(
f"Wrong number of columns at line {lineno}")
try:
X[i] = tuple(words) # Try implicit conversion of strs.
continue # OK, done.
except IndexError:
# Resize, and, for simplicity, use explicit converters too.
X.resize(2 * len(X), refcheck=False)
except ValueError:
# Fallback to explicit converters.
pass
X[i] = convert_row(words)
if i is None:
X = None
else:
# If using unsized string or byte dtype, make sure that the
# existing array is capable of storing the new data. If not,
# change the dtype so it is capable of doing so.
if infer_dtype_size:
x = np.array(x, dtype)
if x.dtype.itemsize > X.dtype.itemsize:
X = X.astype(x.dtype)
nshape = list(X.shape)
pos = nshape[0]
nshape[0] += len(x)
X.resize(nshape, refcheck=False)
X[pos:, ...] = x
X.resize(i + 1, refcheck=False)

else:

def read_data(lineno_words_iter, chunk_size):
"""
Parse each line, including the first.
Parameters
----------
lineno_words_iter : Iterator[tuple[int, list[str]]]
Iterator returning line numbers and non-empty lines already
split into words.
chunk_size : int
At most `chunk_size` lines are read at a time, with
iteration until all lines are read.
"""
X = []
for lineno, words in lineno_words_iter:
if usecols:
words = usecols_getter(words)
elif len(words) != ncols:
raise ValueError(
f"Wrong number of columns at line {lineno}")
# Convert each value according to its column.
X.append(convert_row(words))
if len(X) > chunk_size:
yield X
X = []
if X:
yield X

# read data in chunks and fill it into an array via resize
# over-allocating and shrinking the array later may be faster but
# is probably not relevant compared to the cost of actually reading
# and converting the data
X = None
for x in read_data(lineno_words_iter, _loadtxt_chunksize):
if X is None:
if infer_dtype_size:
X = np.array(x, dtype)
else:
X = np.array(x, row_dtype)
else:
# If using unsized string or byte dtype, make sure that the
# existing array is capable of storing the new data. If
# not, change the dtype so it is capable of doing so.
if infer_dtype_size:
x = np.array(x, dtype)
if x.dtype.itemsize > X.dtype.itemsize:
X = X.astype(x.dtype)
nshape = list(X.shape)
pos = nshape[0]
nshape[0] += len(x)
X.resize(nshape, refcheck=False)
X[pos:, ...] = x

finally:
if fown:
fh.close()
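
A standalone sketch (not part of the diff) of the growth strategy the
fast path above uses: try the row assignment, double the array via
ndarray.resize on IndexError, and trim to the number of rows actually
read at the end.

    import numpy as np

    def fill_doubling(rows, row_dtype):
        X = np.zeros(4, dtype=row_dtype)
        i = -1  # stays -1 if rows is empty
        for i, row in enumerate(rows):
            try:
                X[i] = row
            except IndexError:  # out of room: double and retry
                X.resize(2 * len(X), refcheck=False)
                X[i] = row
        X.resize(i + 1, refcheck=False)  # trim the unused tail
        return X

    out = fill_doubling([('1.5', '2'), ('3.25', '4')],
                        np.dtype([('a', float), ('b', int)]))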
@@ -1187,7 +1225,10 @@ def convert_row(vals):
X = np.asarray(X, dtype)
else:
X = X.view(dtype)
X = X.reshape((nrows, -1))
if nrows == 0:
X = X.reshape(0)
else:
X = X.reshape((nrows, -1))

# Multicolumn data are returned with shape (1, N, M), i.e.
# (1, 1, M) for a single row - remove the singleton dimension there
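
Why the nrows == 0 branch above is needed (an illustrative aside): numpy
cannot infer a -1 dimension for an array of size 0, so the unconditional
reshape would raise on empty input.

    import numpy as np

    np.zeros(0).reshape(0)            # fine: explicit empty shape
    try:
        np.zeros(0).reshape((0, -1))  # -1 cannot be inferred from size 0
    except ValueError:
        pass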
