352: Enhancement: Support tp.to_numpy() for temporian (google#378)

* add to_numpy functionality * removed unnecessary files * add tests for to_numpy() * add no timestamps test * add more tests to to_numpy * format with black * reformat with black numpy_test.py * Add to_numpy to public symbols, and create a new file under docs * update docstrings
check-spelling-sandbox · Mar 7, 2024 · 6e8cd65 · 6e8cd65
1 parent e59ae4f
commit 6e8cd65
Show file tree

Hide file tree

Showing 10 changed files with 236 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,8 @@ tmp_*
 .cache/
 .env
 my_venv
+venv*
+.idea
 
 # benchmark outputs
 profile.*

diff --git a/docs/public_api_test.py b/docs/public_api_test.py
@@ -38,6 +38,7 @@
     "to_csv",
     "from_csv",
     "to_pandas",
+    "to_numpy",
     "from_pandas",
     "to_parquet",
     "from_parquet",

diff --git a/docs/src/reference/temporian/io/to_numpy.md b/docs/src/reference/temporian/io/to_numpy.md
diff --git a/temporian/BUILD b/temporian/BUILD
@@ -36,6 +36,7 @@ py_library(
         "//temporian/implementation/numpy/operators",
         "//temporian/io:csv",
         "//temporian/io:pandas",
+        "//temporian/io:numpy",
         "//temporian/io:parquet",
         "//temporian/io:tensorflow",
         "//temporian/utils:config",

diff --git a/temporian/__init__.py b/temporian/__init__.py
@@ -73,6 +73,7 @@
 from temporian.io.csv import to_csv
 from temporian.io.csv import from_csv
 from temporian.io.pandas import to_pandas
+from temporian.io.numpy import to_numpy
 from temporian.io.pandas import from_pandas
 from temporian.io.parquet import from_parquet
 from temporian.io.parquet import to_parquet

diff --git a/temporian/io/BUILD b/temporian/io/BUILD
@@ -13,6 +13,7 @@ py_library(
     deps = [
         ":csv",
         ":pandas",
+        ":numpy"
     ],
 )
 
@@ -54,6 +55,16 @@ py_library(
     ],
 )
 
+py_library(
+    name = "numpy",
+    srcs = ["numpy.py"],
+    srcs_version = "PY3",
+    deps = [
+        "//temporian/implementation/numpy/data:event_set",
+        "//temporian/implementation/numpy/data:io",
+    ],
+)
+
 py_library(
     name = "tensorflow",
     srcs = ["tensorflow.py"],

diff --git a/temporian/io/__init__.py b/temporian/io/__init__.py
@@ -19,3 +19,5 @@
 
 from temporian.io.pandas import to_pandas
 from temporian.io.pandas import from_pandas
+
+from temporian.io.numpy import to_numpy
diff --git a/temporian/io/numpy.py b/temporian/io/numpy.py
@@ -0,0 +1,94 @@
+# Copyright 2021 Google LLC.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for converting EventSets to numpy arrays and viceversa."""
+
+import numpy as np
+from numpy import ndarray
+
+from typing import Dict
+from temporian.implementation.numpy.data.event_set import EventSet
+
+
+def to_numpy(
+    evset: EventSet,
+    timestamp_to_datetime: bool = True,
+    timestamps: bool = True,
+) -> Dict[str, ndarray]:
+    """Converts an [`EventSet`][temporian.EventSet] to a flattened dictionary with
+    numpy arrays.
+
+    Usage example:
+        ```python
+        >>> from datetime import datetime
+
+        >>> evset = tp.event_set(
+        ...     timestamps=['2023-11-08T17:14:38', '2023-11-29T21:44:46'],
+        ...     features={
+        ...         "store": ['STORE_1', 'STORE_2'],
+        ...         "revenue": [1571, 6101]
+        ...     },
+        ...     indexes=["store"],
+        ... )
+
+        # Timestamps are exported as datetime64[s] if they were created as datetimes,
+        # otherwise they are floats
+        >>> res = tp.to_numpy(evset)
+        >>> res
+        {'store': array([b'STORE_2', b'STORE_1'], dtype='|S7'), 'revenue': array([6101, 1571]),
+        'timestamp': array(['2023-11-29T21:44:46', '2023-11-08T17:14:38'], dtype='datetime64[s]')}
+
+        ```
+
+    Args:
+        evset: input event set.
+        timestamp_to_datetime: If true, cast timestamps to datetime64[s] when
+            the EventSet's schema has is_unix_timestamp set to True.
+        timestamps: If true, include the timestamps under the "timestamp" key.
+
+    Returns:
+        Dictionary mapping each column name to a numpy array of its values.
+    """
+    timestamp_key = "timestamp"
+    index_names = evset.schema.index_names()
+    feature_names = evset.schema.feature_names()
+
+    column_names = index_names + feature_names
+    if timestamps:
+        column_names += [timestamp_key]
+
+    dst = {column_name: [] for column_name in column_names}
+    for index, data in evset.data.items():
+        assert isinstance(index, tuple)
+
+        if timestamps:
+            # Timestamps (as datetime64[s] for unix timestamps, if requested)
+            if evset.schema.is_unix_timestamp and timestamp_to_datetime:
+                dst[timestamp_key].append(
+                    data.timestamps.astype("datetime64[s]")
+                )
+            else:
+                dst[timestamp_key].append(data.timestamps)
+
+        # Features: one array per feature name for this index key
+        for feature_name, feature in zip(feature_names, data.features):
+            dst[feature_name].append(feature)
+
+        # Indexes: repeat each index value once per event of this index key
+        num_timestamps = len(data.timestamps)
+        for index_name, index_item in zip(index_names, index):
+            dst[index_name].append(np.repeat(index_item, num_timestamps))
+
+    # NOTE(review): np.concatenate raises ValueError on an empty list, so an
+    # EventSet with no index keys in `data` would fail here - confirm whether
+    # that case can occur.
+    dst = {k: np.concatenate(v) for k, v in dst.items()}
+    return dst
diff --git a/temporian/io/test/BUILD b/temporian/io/test/BUILD
@@ -20,6 +20,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "numpy_test",
+    srcs = ["numpy_test.py"],
+    srcs_version = "PY3",
+    deps = [
+        # already_there/absl/testing:absltest
+        # already_there/numpy
+        # already_there/absl/testing:parameterized
+        "//temporian/test:utils",
+        "//temporian/implementation/numpy/data:io",
+        "//temporian/io:numpy",
+    ],
+)
+
 py_test(
     name = "tensorflow_test",
     srcs = ["tensorflow_test.py"],

diff --git a/temporian/io/test/numpy_test.py b/temporian/io/test/numpy_test.py
@@ -0,0 +1,110 @@
+import numpy as np
+from absl.testing import absltest
+
+from temporian.implementation.numpy.data.io import event_set
+from temporian.io.numpy import to_numpy
+
+
+class NumpyTest(absltest.TestCase):
+    # Tests for to_numpy(), which converts an EventSet to a dict of numpy arrays.
+
+    def test_correct(self):
+        # Indexed EventSet: the index, feature and datetime64[s] timestamp
+        # columns are all expected in the result.
+        evset = event_set(
+            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
+            features={
+                "feature_1": [0.5, 0.6],
+                "my_index": ["red", "blue"],
+            },
+            indexes=["my_index"],
+        )
+
+        result = to_numpy(evset)
+
+        expected = {
+            "timestamp": np.array(
+                ["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
+                dtype="datetime64[s]",
+            ),
+            "feature_1": np.array([0.5, 0.6]),
+            "my_index": np.array([b"red", b"blue"]),
+        }
+
+        # Each column is sorted independently before comparing, so this checks
+        # the values present in each column but not the alignment between rows.
+        for k in expected:
+            np.testing.assert_array_equal(
+                np.sort(result[k]), np.sort(expected[k])
+            )
+
+    def test_no_index(self):
+        # No `indexes` argument: "my_index" is passed as a regular feature and
+        # is still expected as a column of the result.
+        evset = event_set(
+            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
+            features={
+                "feature_1": [0.5, 0.6],
+                "my_index": ["red", "blue"],
+            },
+        )
+
+        result = to_numpy(evset)
+
+        expected = {
+            "timestamp": np.array(
+                ["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
+                dtype="datetime64[s]",
+            ),
+            "feature_1": np.array([0.5, 0.6]),
+            "my_index": np.array([b"red", b"blue"]),
+        }
+
+        # Columns are compared after independent sorting (order-insensitive).
+        for k in expected:
+            np.testing.assert_array_equal(
+                np.sort(result[k]), np.sort(expected[k])
+            )
+
+    def test_no_timestamps(self):
+        # timestamps=False must leave the "timestamp" key out of the result.
+        evset = event_set(
+            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
+            features={
+                "feature_1": [0.5, 0.6],
+                "my_index": ["red", "blue"],
+            },
+            indexes=["my_index"],
+        )
+
+        result = to_numpy(evset, timestamps=False)
+        assert "timestamp" not in result
+
+    def test_timestamp_to_datetime_param(self):
+        # With timestamp_to_datetime=False the timestamps are expected to stay
+        # floats even though the EventSet was built from np.datetime64 values.
+        evset = event_set(
+            timestamps=[
+                np.datetime64("2022-01-01"),
+                np.datetime64("2022-01-02"),
+            ],
+            features={
+                "feature_1": [0.5, 0.6],
+                "my_index": ["red", "blue"],
+            },
+            indexes=["my_index"],
+        )
+
+        result = to_numpy(evset, timestamp_to_datetime=False)
+
+        assert "timestamp" in result
+        assert np.issubdtype(result["timestamp"].dtype, np.float64)
+
+    def test_empty_event_set(self):
+        # Despite the name, this EventSet has two events; it only lacks
+        # features and indexes, so just the timestamp column is checked.
+        evset = event_set(
+            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"]
+        )
+        result = to_numpy(evset)
+
+        expected = {
+            "timestamp": np.array(
+                ["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
+                dtype="datetime64[s]",
+            )
+        }
+
+        np.testing.assert_array_equal(
+            np.sort(result["timestamp"]), np.sort(expected["timestamp"])
+        )
+
+
+if __name__ == "__main__":
+    absltest.main()