PERF: In loadtxt, rely on implicit string conversion.
This patch takes advantage of the possibility of assigning a tuple of
*strs* to a structured dtype with e.g. float fields, and having the strs
be implicitly converted to floats by numpy at the C level.  (A
Python-level fallback is kept to support e.g. hex floats.)  Together
with the previous commit, this provides a massive speedup (~2x on the
loadtxt_dtypes_csv benchmark for 10_000+ ints or floats), but is
beneficial with as few as 100 rows.  Very small reads (10 rows) are
still slower (nearly 2x for object), as are reads using object
dtypes (due to the extra copy), but the tradeoff seems worthwhile.
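
A minimal sketch (not part of the commit) of the trick the message
describes: assigning a tuple of strs to a row of a structured array lets
numpy parse the strings itself, while inputs it rejects (such as hex
floats) raise ValueError and need the Python-level fallback.

    import numpy as np

    row_dtype = np.dtype([('a', np.float64), ('b', np.int64)])
    X = np.zeros(2, dtype=row_dtype)
    X[0] = ('1.5', '42')  # strs parsed to float/int at the C level
    assert (X['a'][0], X['b'][0]) == (1.5, 42)
    try:
        X[1] = ('0x1p-1', '0')  # hex floats are not parsed at the C level
    except ValueError:
        X[1] = (float.fromhex('0x1p-1'), 0)  # Python-level fallback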
anntzer committed Aug 17, 2021
1 parent 048f370 commit 6668564
Showing 1 changed file with 104 additions and 63 deletions.
numpy/lib/npyio.py
@@ -774,17 +774,25 @@ def _floatconv(x):
raise # Raise the original exception, which makes more sense.


_CONVERTERS = [ # These converters only ever get strs (not bytes) as input.
(np.bool_, lambda x: bool(int(x))),
(np.uint64, np.uint64),
(np.int64, np.int64),
(np.integer, lambda x: int(float(x))),
(np.longdouble, np.longdouble),
(np.floating, _floatconv),
(complex, lambda x: complex(x.replace('+-', '-'))),
(np.bytes_, methodcaller('encode', 'latin-1')),
(np.unicode_, str),
]
# These converters only ever get str (not bytes) as input.
_CONVERTER_DICT = {
np.bool_: int, # Implicitly converted to bool.
np.uint64: np.uint64,
np.int64: np.int64,
np.integer: lambda x: int(float(x)),
np.longdouble: np.longdouble,
np.floating: _floatconv,
complex: lambda x: complex(x.replace('+-', '-')),
np.bytes_: methodcaller('encode', 'latin-1'),
np.unicode_: str,
}
# These conversions can be done implicitly at the C-level, i.e., assigning
# a str to an array of that dtype will either work as if the conversion was
# explicitly applied first, or will throw a ValueError (_floatconv and complex
# accept more inputs), but will not result in the wrong item being stored.
_IMPLICIT_CONVERTERS = {
_CONVERTER_DICT[tp] for tp in [
np.uint64, np.int64, np.integer, np.longdouble, np.floating, complex]}
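
An illustrative check (not in the diff) of the guarantee stated in the
comment above: for dtypes whose converter is in _IMPLICIT_CONVERTERS,
assigning the raw str either matches the explicit conversion or raises
ValueError, but never silently stores a wrong value.

    import numpy as np

    arr = np.zeros(1, dtype=np.int64)
    arr[0] = '7'                 # same result as the explicit np.int64('7')
    assert arr[0] == np.int64('7')
    try:
        arr[0] = 'not a number'  # rejected at the C level
    except ValueError:
        pass                     # caller falls back to explicit converters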


def _getconv(dtype):
@@ -794,7 +802,7 @@ def _getconv(dtype):
Even when a lambda is returned, it is defined at the toplevel, to allow
testing for equality and enabling optimization for single-type data.
"""
for base, conv in _CONVERTERS:
for base, conv in _CONVERTER_DICT.items():
if issubclass(dtype.type, base):
return conv
return str
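
As an aside (illustrative, not in the diff), insertion order matters for
this issubclass dispatch: np.uint64 and np.int64 are subclasses of
np.integer, so the specific entries must come first in _CONVERTER_DICT to
be picked over the generic int(float(x)) fallback.

    import numpy as np

    assert issubclass(np.uint64, np.integer)   # specific entry wins
    assert issubclass(np.float32, np.floating) # dispatches to _floatconv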
@@ -962,33 +970,6 @@ def split_line(line: str):
line = line.strip('\r\n')
return line.split(delimiter) if line else []

def read_data(lineno_words_iter, chunk_size):
"""
Parse each line, including the first.
Parameters
----------
lineno_words_iter : Iterator[tuple[int, list[str]]]
Iterator returning line numbers and non-empty lines already split
into words.
chunk_size : int
At most `chunk_size` lines are read at a time, with iteration
until all lines are read.
"""
X = []
for lineno, words in lineno_words_iter:
if usecols:
words = usecols_getter(words)
elif len(words) != ncols:
raise ValueError(f"Wrong number of columns at line {lineno}")
# Convert each value according to its column.
X.append(convert_row(words))
if len(X) > chunk_size:
yield X
X = []
if X:
yield X

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Main body of loadtxt.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -1137,6 +1118,7 @@ def tobytes_first(conv, x):
fencode = methodcaller("encode", fencoding)
converters = [conv if conv is not bytes else fencode
for conv in converters]

if len(set(converters)) == 1:
# Optimize single-type data. Note that this is only reached if
# `_getconv` returns equal callables (i.e. not local lambdas) on
@@ -1147,30 +1129,86 @@ def convert_row(vals, _conv=converters[0]):
def convert_row(vals):
return tuple(conv(val) for conv, val in zip(converters, vals))

# read data in chunks and fill it into an array via resize
# over-allocating and shrinking the array later may be faster but is
# probably not relevant compared to the cost of actually reading and
# converting the data
X = None
for x in read_data(lineno_words_iter, _loadtxt_chunksize):
if X is None:
if infer_dtype_size:
X = np.array(x, dtype)
else:
X = np.array(x, row_dtype)
if _IMPLICIT_CONVERTERS.issuperset(converters):

X = np.zeros(256, dtype=row_dtype)
i = None # Just in case there's no entry whatsoever.
for i, (lineno, words) in enumerate(lineno_words_iter):
if usecols:
words = usecols_getter(words)
elif len(words) != ncols:
raise ValueError(
f"Wrong number of columns at line {lineno}")
try:
X[i] = tuple(words) # Try implicit conversion of strs.
continue # OK, done.
except IndexError:
# Resize, and, for simplicity, use explicit converters too.
X.resize(2 * len(X), refcheck=False)
except ValueError:
# Fallback to explicit converters.
pass
X[i] = convert_row(words)
if i is None:
X = None
else:
# If using unsized string or byte dtype, make sure that the
# existing array is capable of storing the new data. If not,
# change the dtype so it is capable of doing so.
if infer_dtype_size:
x = np.array(x, dtype)
if x.dtype.itemsize > X.dtype.itemsize:
X = X.astype(x.dtype)
nshape = list(X.shape)
pos = nshape[0]
nshape[0] += len(x)
X.resize(nshape, refcheck=False)
X[pos:, ...] = x
X.resize(i + 1, refcheck=False)

else:

def read_data(lineno_words_iter, chunk_size):
"""
Parse each line, including the first.
Parameters
----------
lineno_words_iter : Iterator[tuple[int, list[str]]]
Iterator returning line numbers and non-empty lines already
split into words.
chunk_size : int
At most `chunk_size` lines are read at a time, with
iteration until all lines are read.
"""
X = []
for lineno, words in lineno_words_iter:
if usecols:
words = usecols_getter(words)
elif len(words) != ncols:
raise ValueError(
f"Wrong number of columns at line {lineno}")
# Convert each value according to its column.
X.append(convert_row(words))
if len(X) > chunk_size:
yield X
X = []
if X:
yield X

# read data in chunks and fill it into an array via resize
# over-allocating and shrinking the array later may be faster but
# is probably not relevant compared to the cost of actually reading
# and converting the data
X = None
for x in read_data(lineno_words_iter, _loadtxt_chunksize):
if X is None:
if infer_dtype_size:
X = np.array(x, dtype)
else:
X = np.array(x, row_dtype)
else:
# If using unsized string or byte dtype, make sure that the
# existing array is capable of storing the new data. If
# not, change the dtype so it is capable of doing so.
if infer_dtype_size:
x = np.array(x, dtype)
if x.dtype.itemsize > X.dtype.itemsize:
X = X.astype(x.dtype)
nshape = list(X.shape)
pos = nshape[0]
nshape[0] += len(x)
X.resize(nshape, refcheck=False)
X[pos:, ...] = x

finally:
if fown:
fh.close()
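
A standalone sketch (not part of the diff) of the growth strategy the
fast path above uses: try the row assignment, double the array via
ndarray.resize on IndexError, and trim to the number of rows actually
read at the end.

    import numpy as np

    def fill_doubling(rows, row_dtype):
        X = np.zeros(4, dtype=row_dtype)
        i = -1  # stays -1 if rows is empty
        for i, row in enumerate(rows):
            try:
                X[i] = row
            except IndexError:  # out of room: double and retry
                X.resize(2 * len(X), refcheck=False)
                X[i] = row
        X.resize(i + 1, refcheck=False)  # trim the unused tail
        return X

    out = fill_doubling([('1.5', '2'), ('3.25', '4')],
                        np.dtype([('a', float), ('b', int)]))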
@@ -1187,7 +1225,10 @@ def convert_row(vals):
X = np.asarray(X, dtype)
else:
X = X.view(dtype)
X = X.reshape((nrows, -1))
if nrows == 0:
X = X.reshape(0)
else:
X = X.reshape((nrows, -1))

# Multicolumn data are returned with shape (1, N, M), i.e.
# (1, 1, M) for a single row - remove the singleton dimension there
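
Why the nrows == 0 branch above is needed (an illustrative aside): numpy
cannot infer a -1 dimension for an array of size 0, so the unconditional
reshape would raise on empty input.

    import numpy as np

    np.zeros(0).reshape(0)            # fine: explicit empty shape
    try:
        np.zeros(0).reshape((0, -1))  # -1 cannot be inferred from size 0
    except ValueError:
        pass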
