Skip to content

Commit

Permalink
352: Enhancement: Support tp.to_numpy() for temporian (google#378)
Browse files Browse the repository at this point in the history
* add to_numpy functionality

* removed unnecessary files

* add tests for to_numpy()

* add no timestamps test

* add more tests to to_numpy

* format with black

* reformat with black numpy_test.py

* Add to_numpy to public symbols, and create a new file under docs

* update docstrings
  • Loading branch information
nagavenkateshgavini committed Mar 7, 2024
1 parent e59ae4f commit 6e8cd65
Show file tree
Hide file tree
Showing 10 changed files with 236 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -9,6 +9,8 @@ tmp_*
.cache/
.env
my_venv
venv*
.idea

# benchmark outputs
profile.*
Expand Down
1 change: 1 addition & 0 deletions docs/public_api_test.py
Expand Up @@ -38,6 +38,7 @@
"to_csv",
"from_csv",
"to_pandas",
"to_numpy",
"from_pandas",
"to_parquet",
"from_parquet",
Expand Down
Empty file.
1 change: 1 addition & 0 deletions temporian/BUILD
Expand Up @@ -36,6 +36,7 @@ py_library(
"//temporian/implementation/numpy/operators",
"//temporian/io:csv",
"//temporian/io:pandas",
"//temporian/io:numpy",
"//temporian/io:parquet",
"//temporian/io:tensorflow",
"//temporian/utils:config",
Expand Down
1 change: 1 addition & 0 deletions temporian/__init__.py
Expand Up @@ -73,6 +73,7 @@
from temporian.io.csv import to_csv
from temporian.io.csv import from_csv
from temporian.io.pandas import to_pandas
from temporian.io.numpy import to_numpy
from temporian.io.pandas import from_pandas
from temporian.io.parquet import from_parquet
from temporian.io.parquet import to_parquet
Expand Down
11 changes: 11 additions & 0 deletions temporian/io/BUILD
Expand Up @@ -13,6 +13,7 @@ py_library(
deps = [
":csv",
":pandas",
":numpy"
],
)

Expand Down Expand Up @@ -54,6 +55,16 @@ py_library(
],
)

# Library target for numpy.py.
py_library(
    name = "numpy",
    srcs = ["numpy.py"],
    srcs_version = "PY3",
    deps = [
        "//temporian/implementation/numpy/data:event_set",
        "//temporian/implementation/numpy/data:io",
    ],
)

py_library(
name = "tensorflow",
srcs = ["tensorflow.py"],
Expand Down
2 changes: 2 additions & 0 deletions temporian/io/__init__.py
Expand Up @@ -19,3 +19,5 @@

from temporian.io.pandas import to_pandas
from temporian.io.pandas import from_pandas

from temporian.io.numpy import to_numpy
94 changes: 94 additions & 0 deletions temporian/io/numpy.py
@@ -0,0 +1,94 @@
# Copyright 2021 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for converting EventSets to numpy arrays and viceversa."""

import numpy as np
from numpy import ndarray

from typing import Dict
from temporian.implementation.numpy.data.event_set import EventSet


def to_numpy(
    evset: EventSet,
    timestamp_to_datetime: bool = True,
    timestamps: bool = True,
) -> Dict[str, ndarray]:
    """Converts an [`EventSet`][temporian.EventSet] to a flattened dictionary
    with numpy arrays.

    The events of all the index groups are concatenated: every index and every
    feature becomes one 1-D array, and the index values are repeated once per
    event of their group so that all the arrays are aligned row by row.

    Usage example:
        ```python
        >>> evset = tp.event_set(
        ...     timestamps=['2023-11-08T17:14:38', '2023-11-29T21:44:46'],
        ...     features={
        ...         "store": ['STORE_1', 'STORE_2'],
        ...         "revenue": [1571, 6101]
        ...     },
        ...     indexes=["store"],
        ... )

        # Timestamps are exported as datetime64[s] if they were created as
        # datetimes, otherwise they are floats
        >>> res = tp.to_numpy(evset)
        >>> res
        {'store': array([b'STORE_2', b'STORE_1'], dtype='|S7'), 'revenue': array([6101, 1571]),
        'timestamp': array(['2023-11-29T21:44:46', '2023-11-08T17:14:38'], dtype='datetime64[s]')}

        ```

    Args:
        evset: Input event set.
        timestamp_to_datetime: If true, cast Temporian timestamps to
            datetime64 when is_unix_timestamp is set to True.
        timestamps: If true, the timestamps are included as a column named
            "timestamp".

    Returns:
        Dictionary mapping each index name, feature name and (optionally)
        "timestamp" to a numpy array. If the EventSet contains no index
        group, every array is empty; in that case only the timestamp array is
        guaranteed to have a meaningful dtype (the other arrays use numpy's
        default dtype).

    Raises:
        ValueError: If `timestamps` is true and an index or feature is already
            named "timestamp".
    """
    timestamp_key = "timestamp"
    index_names = evset.schema.index_names()
    feature_names = evset.schema.feature_names()

    column_names = index_names + feature_names
    if timestamps:
        # Without this check, the timestamps and the homonymous index/feature
        # would be appended to the same list and silently mixed together.
        if timestamp_key in column_names:
            raise ValueError(
                f'Cannot export the timestamps as column "{timestamp_key}"'
                " because the EventSet already has an index or feature with"
                " that name. Rename it or call to_numpy with"
                " timestamps=False."
            )
        column_names.append(timestamp_key)

    # Loop invariant: whether the timestamps are exported as datetime64[s].
    as_datetime = evset.schema.is_unix_timestamp and timestamp_to_datetime

    # One list of per-index-group chunks for each output column.
    columns: Dict[str, list] = {name: [] for name in column_names}
    for index_key, index_data in evset.data.items():
        assert isinstance(index_key, tuple)

        if timestamps:
            group_timestamps = index_data.timestamps
            if as_datetime:
                group_timestamps = group_timestamps.astype("datetime64[s]")
            columns[timestamp_key].append(group_timestamps)

        for feature_name, feature in zip(feature_names, index_data.features):
            columns[feature_name].append(feature)

        # The index value is constant within a group: repeat it for each event
        # so that index columns line up with the feature columns.
        num_events = len(index_data.timestamps)
        for index_name, index_value in zip(index_names, index_key):
            columns[index_name].append(np.repeat(index_value, num_events))

    if not any(columns.values()):
        # No index group at all: np.concatenate raises ValueError on an empty
        # sequence, so build the empty columns explicitly.
        empty = {name: np.array([]) for name in columns}
        if timestamps:
            empty[timestamp_key] = np.array(
                [], dtype="datetime64[s]" if as_datetime else np.float64
            )
        return empty

    return {name: np.concatenate(chunks) for name, chunks in columns.items()}
14 changes: 14 additions & 0 deletions temporian/io/test/BUILD
Expand Up @@ -20,6 +20,20 @@ py_test(
],
)

# Test target for numpy_test.py; depends on the //temporian/io:numpy library.
py_test(
    name = "numpy_test",
    srcs = ["numpy_test.py"],
    srcs_version = "PY3",
    deps = [
        # already_there/absl/testing:absltest
        # already_there/numpy
        # already_there/absl/testing:parameterized
        "//temporian/test:utils",
        "//temporian/implementation/numpy/data:io",
        "//temporian/io:numpy",
    ],
)

py_test(
name = "tensorflow_test",
srcs = ["tensorflow_test.py"],
Expand Down
110 changes: 110 additions & 0 deletions temporian/io/test/numpy_test.py
@@ -0,0 +1,110 @@
import numpy as np
from absl.testing import absltest

from temporian.implementation.numpy.data.io import event_set
from temporian.io.numpy import to_numpy


class NumpyTest(absltest.TestCase):
    """Tests for the export of EventSets to dictionaries of numpy arrays."""

    def _assert_same_columns(self, result, expected):
        """Checks that `result` has exactly the expected columns and values.

        Each column is sorted before being compared, so the check does not
        depend on the order in which the index groups are exported.
        """
        self.assertCountEqual(list(result), list(expected))
        for key, expected_values in expected.items():
            np.testing.assert_array_equal(
                np.sort(result[key]), np.sort(expected_values)
            )

    def test_correct(self):
        """Indexed EventSet: index, feature and timestamps are exported."""
        evset = event_set(
            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
            features={
                "feature_1": [0.5, 0.6],
                "my_index": ["red", "blue"],
            },
            indexes=["my_index"],
        )

        result = to_numpy(evset)

        expected = {
            "timestamp": np.array(
                ["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
                dtype="datetime64[s]",
            ),
            "feature_1": np.array([0.5, 0.6]),
            "my_index": np.array([b"red", b"blue"]),
        }
        self._assert_same_columns(result, expected)

    def test_no_index(self):
        """Without `indexes`, "my_index" is exported as a regular feature."""
        evset = event_set(
            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
            features={
                "feature_1": [0.5, 0.6],
                "my_index": ["red", "blue"],
            },
        )

        result = to_numpy(evset)

        expected = {
            "timestamp": np.array(
                ["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
                dtype="datetime64[s]",
            ),
            "feature_1": np.array([0.5, 0.6]),
            "my_index": np.array([b"red", b"blue"]),
        }
        self._assert_same_columns(result, expected)

    def test_no_timestamps(self):
        """`timestamps=False` drops the timestamp column only."""
        evset = event_set(
            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
            features={
                "feature_1": [0.5, 0.6],
                "my_index": ["red", "blue"],
            },
            indexes=["my_index"],
        )

        result = to_numpy(evset, timestamps=False)

        # unittest assertions (unlike bare `assert`) are not stripped by -O.
        self.assertNotIn("timestamp", result)
        self.assertIn("feature_1", result)
        self.assertIn("my_index", result)

    def test_timestamp_to_datetime_param(self):
        """`timestamp_to_datetime=False` keeps the timestamps as floats."""
        evset = event_set(
            timestamps=[
                np.datetime64("2022-01-01"),
                np.datetime64("2022-01-02"),
            ],
            features={
                "feature_1": [0.5, 0.6],
                "my_index": ["red", "blue"],
            },
            indexes=["my_index"],
        )

        result = to_numpy(evset, timestamp_to_datetime=False)

        self.assertIn("timestamp", result)
        self.assertTrue(np.issubdtype(result["timestamp"].dtype, np.float64))

    def test_empty_event_set(self):
        """EventSet without features or indexes: only timestamps remain."""
        evset = event_set(
            timestamps=["2023-11-08T17:14:38", "2023-11-29T21:44:46"]
        )

        result = to_numpy(evset)

        expected = {
            "timestamp": np.array(
                ["2023-11-08T17:14:38", "2023-11-29T21:44:46"],
                dtype="datetime64[s]",
            )
        }
        self._assert_same_columns(result, expected)


if __name__ == "__main__":
    absltest.main()

0 comments on commit 6e8cd65

Please sign in to comment.