Unoperate · kboroszko · Dec 2, 2021 · Nov 26, 2021 · Nov 26, 2021 · Nov 26, 2021
diff --git a/WORKSPACE b/WORKSPACE
@@ -109,9 +109,21 @@ http_archive(
 )
 
 # Note com_google_googleapis is placed earlier as we need to adjust switched_rules_by_language option
+# Note: we have to rename one identifier in field_behavior.proto (and in the pubsub protos patched
+# below) so that it compiles on Windows.
+# For more information, please refer to https://github.com/protocolbuffers/protobuf/issues/7076
+# Because of a bug in protocol buffers (protocolbuffers/protobuf#7076), new versions of this project
+# fail to compile on Windows. The problem hinges on OPTIONAL being defined as an empty string under
+# Windows. This makes the preprocessor remove every mention of OPTIONAL from the code, which causes
+# compilation failures. This temporary workaround renames the protobuf value OPTIONAL to
+# OPIONAL. This should be safe as it does not affect the generated protobufs.
 http_archive(
     name = "com_google_googleapis",
     build_file = "@com_github_googleapis_google_cloud_cpp//bazel:googleapis.BUILD",
+    patch_cmds = [
+        """sed -i.bak 's/OPTIONAL/OPIONAL/g' google/api/field_behavior.proto""",
+        """sed -i.bak 's/OPTIONAL/OPIONAL/g' google/pubsub/v1beta2/pubsub.proto""",
+        """sed -i.bak 's/OPTIONAL/OPIONAL/g' google/pubsub/v1/pubsub.proto""",
+    ],
     sha256 = "a53e15405f81d5a32594d7f6486e649131fadda5431cf28377dff4ae54d45d16",
     strip_prefix = "googleapis-d4d09eb3aec152015f35717102f9b423988b94f7",
     urls = [

diff --git a/tensorflow_io/core/kernels/bigtable/serialization.cc b/tensorflow_io/core/kernels/bigtable/serialization.cc
@@ -16,16 +16,73 @@ limitations under the License.
 
 #include "tensorflow_io/core/kernels/bigtable/serialization.h"
 
-#include "rpc/types.h"
-#include "rpc/xdr.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/statusor.h"
 
+namespace cbt = ::google::cloud::bigtable;
+
 namespace tensorflow {
 namespace io {
 namespace {
 
-inline StatusOr<float> BytesToFloat(std::string const& s) {
+#ifdef _WIN32
+
+#include <winsock.h>
+
+// Decodes the cell value as a 4-byte big-endian (network byte order) int32_t.
+// Returns InvalidArgument when the cell value is not exactly 4 bytes long.
+inline StatusOr<int32_t> BytesToInt32(const cbt::Cell& cell) {
+  std::string const& bytes = cell.value();
+  // Validate the length first so we never copy from a too-short buffer.
+  if (bytes.size() != sizeof(uint32_t)) {
+    return errors::InvalidArgument("Invalid int32 representation.");
+  }
+  // Copy the raw bytes with memcpy rather than reading an inactive union
+  // member, which is undefined behavior in C++.
+  uint32_t network_order;
+  memcpy(&network_order, bytes.data(), sizeof(network_order));
+  // ntohl converts from network (big-endian) to host byte order.
+  return static_cast<int32_t>(ntohl(network_order));
+}
+
+// Decodes the cell value as a big-endian int64_t using the Bigtable client's
+// own decoder. Returns InvalidArgument when that decoder reports an error.
+inline StatusOr<int64_t> BytesToInt64(const cbt::Cell& cell) {
+  auto maybe_value = cell.decode_big_endian_integer<int64_t>();
+  if (!maybe_value.ok()) {
+    // Report the type actually being decoded (int64, not int32).
+    return errors::InvalidArgument("Invalid int64 representation.");
+  }
+  return maybe_value.value();
+}
+
+// Decodes the cell value as a 4-byte big-endian float.
+// The bytes are first decoded as a big-endian int32_t and the resulting bit
+// pattern is then reinterpreted as a float; this assumes that integer and
+// float endianness match on the host. Propagates the error from BytesToInt32
+// when the cell value is not a valid 4-byte representation.
+inline StatusOr<float> BytesToFloat(const cbt::Cell& cell) {
+  auto const int_rep = BytesToInt32(cell);
+  if (!int_rep.ok()) {
+    // Return the status explicitly instead of relying on an implicit
+    // StatusOr<int32_t> -> StatusOr<float> conversion.
+    return int_rep.status();
+  }
+  static_assert(sizeof(float) == sizeof(int32_t),
+                "float must be 32 bits wide to be decoded from an int32_t");
+  // memcpy is the well-defined way to reinterpret the bit pattern; reading an
+  // inactive union member is undefined behavior in C++.
+  int32_t const bits = *int_rep;
+  float res;
+  memcpy(&res, &bits, sizeof(res));
+  return res;
+}
+
+// Decodes the cell value as an 8-byte big-endian double.
+// The bytes are first decoded as a big-endian int64_t and the resulting bit
+// pattern is then reinterpreted as a double; this assumes that integer and
+// float endianness match on the host. Propagates the error from BytesToInt64
+// when the cell value cannot be decoded.
+inline StatusOr<double> BytesToDouble(const cbt::Cell& cell) {
+  auto const int_rep = BytesToInt64(cell);
+  if (!int_rep.ok()) {
+    // Return the status explicitly instead of relying on an implicit
+    // StatusOr<int64_t> -> StatusOr<double> conversion.
+    return int_rep.status();
+  }
+  static_assert(sizeof(double) == sizeof(int64_t),
+                "double must be 64 bits wide to be decoded from an int64_t");
+  // memcpy is the well-defined way to reinterpret the bit pattern; reading an
+  // inactive union member is undefined behavior in C++.
+  int64_t const bits = *int_rep;
+  double res;
+  memcpy(&res, &bits, sizeof(res));
+  return res;
+}
+
+#else  // _WIN32
+
+#include "rpc/types.h"
+#include "rpc/xdr.h"
+
+inline StatusOr<float> BytesToFloat(const cbt::Cell& cell) {
+  std::string const& s = cell.value();
   float v;
   XDR xdrs;
   xdrmem_create(&xdrs, const_cast<char*>(s.data()), sizeof(v), XDR_DECODE);
@@ -35,7 +92,8 @@ inline StatusOr<float> BytesToFloat(std::string const& s) {
   return v;
 }
 
-inline StatusOr<double> BytesToDouble(std::string const& s) {
+inline StatusOr<double> BytesToDouble(const cbt::Cell& cell) {
+  std::string const& s = cell.value();
   double v;
   XDR xdrs;
   xdrmem_create(&xdrs, const_cast<char*>(s.data()), sizeof(v), XDR_DECODE);
@@ -45,7 +103,8 @@ inline StatusOr<double> BytesToDouble(std::string const& s) {
   return v;
 }
 
-inline StatusOr<int64_t> BytesToInt64(std::string const& s) {
+inline StatusOr<int64_t> BytesToInt64(const cbt::Cell& cell) {
+  std::string const& s = cell.value();
   int64_t v;
   XDR xdrs;
   xdrmem_create(&xdrs, const_cast<char*>(s.data()), sizeof(v), XDR_DECODE);
@@ -55,7 +114,8 @@ inline StatusOr<int64_t> BytesToInt64(std::string const& s) {
   return v;
 }
 
-inline StatusOr<int32_t> BytesToInt32(std::string const& s) {
+inline StatusOr<int32_t> BytesToInt32(const cbt::Cell& cell) {
+  std::string const& s = cell.value();
   int32_t v;
   XDR xdrs;
   xdrmem_create(&xdrs, const_cast<char*>(s.data()), sizeof(v), XDR_DECODE);
@@ -65,16 +125,18 @@ inline StatusOr<int32_t> BytesToInt32(std::string const& s) {
   return v;
 }
 
-inline StatusOr<bool_t> BytesToBool(std::string const& s) {
-  bool_t v;
-  XDR xdrs;
-  xdrmem_create(&xdrs, const_cast<char*>(s.data()), sizeof(v), XDR_DECODE);
-  if (!xdr_bool(&xdrs, &v)) {
-    return errors::InvalidArgument("Error reading bool from byte array.");
+#endif  // _WIN32
+
+inline StatusOr<bool> BytesToBool(const cbt::Cell& cell) {  // Decodes a one-byte cell value as a bool.
+  std::string const& bytes = cell.value();
+  if (bytes.size() != 1U) {  // Any length other than exactly one byte is rejected.
+    return errors::InvalidArgument("Invalid bool representation.");
   }
-  return v;
+  return (*bytes.data()) != 0;  // Any non-zero byte decodes to true.
 }
 
+}  // namespace
+
 Status PutCellValueInTensor(Tensor& tensor, size_t index, DataType cell_type,
                             google::cloud::bigtable::Cell const& cell) {
   switch (cell_type) {
@@ -84,39 +146,39 @@ Status PutCellValueInTensor(Tensor& tensor, size_t index, DataType cell_type,
     } break;
     case DT_BOOL: {
       auto tensor_data = tensor.tensor<bool, 1>();
-      auto maybe_parsed_data = BytesToBool(cell.value());
+      auto maybe_parsed_data = BytesToBool(cell);
       if (!maybe_parsed_data.ok()) {
         return maybe_parsed_data.status();
       }
       tensor_data(index) = maybe_parsed_data.ValueOrDie();
     } break;
     case DT_INT32: {
       auto tensor_data = tensor.tensor<int32_t, 1>();
-      auto maybe_parsed_data = BytesToInt32(cell.value());
+      auto maybe_parsed_data = BytesToInt32(cell);
       if (!maybe_parsed_data.ok()) {
         return maybe_parsed_data.status();
       }
       tensor_data(index) = maybe_parsed_data.ValueOrDie();
     } break;
     case DT_INT64: {
       auto tensor_data = tensor.tensor<int64_t, 1>();
-      auto maybe_parsed_data = BytesToInt64(cell.value());
+      auto maybe_parsed_data = BytesToInt64(cell);
       if (!maybe_parsed_data.ok()) {
         return maybe_parsed_data.status();
       }
       tensor_data(index) = maybe_parsed_data.ValueOrDie();
     } break;
     case DT_FLOAT: {
       auto tensor_data = tensor.tensor<float, 1>();
-      auto maybe_parsed_data = BytesToFloat(cell.value());
+      auto maybe_parsed_data = BytesToFloat(cell);
       if (!maybe_parsed_data.ok()) {
         return maybe_parsed_data.status();
       }
       tensor_data(index) = maybe_parsed_data.ValueOrDie();
     } break;
     case DT_DOUBLE: {
       auto tensor_data = tensor.tensor<double, 1>();
-      auto maybe_parsed_data = BytesToDouble(cell.value());
+      auto maybe_parsed_data = BytesToDouble(cell);
       if (!maybe_parsed_data.ok()) {
         return maybe_parsed_data.status();
       }
@@ -128,6 +190,5 @@ Status PutCellValueInTensor(Tensor& tensor, size_t index, DataType cell_type,
   return Status::OK();
 }
 
-}  // namespace
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow_io/core/kernels/bigtable/serialization.h b/tensorflow_io/core/kernels/bigtable/serialization.h
@@ -18,15 +18,24 @@ limitations under the License.
 
 #include "google/cloud/bigtable/table.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/statusor.h"
 
 namespace tensorflow {
 namespace io {
 
 // Bigtable only stores values as byte buffers - except for int64 the server
 // side does not have any notion of types. Tensorflow, needs to store shorter
-// integers, floats, doubles, so we needed to decide on how. We chose to follow
-// what HBase does, since there is a path for migrating from HBase to Bigtable.
-// XDR seems to match what HBase does.
+// integers, floats, doubles, so we needed to decide on how. We chose to
+// follow what HBase does, since there is a path for migrating from HBase to
+// Bigtable. HBase stores integers as big-endian and floats as IEEE754
+// (also big-endian). Given that integer endianness does not always match
+// float endianness, and the fact that there are architectures where it is
+// neither little nor big (BE-32), implementing this properly is non-trivial.
+// Ideally, we would use a library to do that. XDR matches what HBase does,
+// but it is not easily available on Windows, so we decided to go with a
+// hybrid approach. On Windows we assume that integer endianness matches float
+// endianness and implement the deserialization ourselves; everywhere else we
+// use XDR. For that reason we provide two implementations of the
+// deserialization.
 Status PutCellValueInTensor(Tensor& tensor, size_t index, DataType cell_type,
                             google::cloud::bigtable::Cell const& cell);