buffer: add buffer.isUtf8 for utf8 validation

anonrig · juanarbol · commit 16ee02f2eb4d · 2023-01-31T08:04:11.000-05:00
PR-URL: #45947 Reviewed-By: Robert Nagy <ronagy@icloud.com> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Luigi Pinca <luigipinca@gmail.com> Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl> Reviewed-By: Anna Henningsen <anna@addaleax.net>
diff --git a/doc/api/buffer.md b/doc/api/buffer.md
@@ -5126,6 +5126,17 @@ For code running using Node.js APIs, converting between base64-encoded strings
 and binary data should be performed using `Buffer.from(str, 'base64')` and
 `buf.toString('base64')`.**
 
+### `buffer.isUtf8(input)`
+
+<!-- YAML
+added: REPLACEME
+-->
+
+* `input` {Buffer | ArrayBuffer | TypedArray} The input to validate.
+* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8.
+
+Validates the bytes of `input`; throws if it is a detached `ArrayBuffer`.
+
 ### `buffer.INSPECT_MAX_BYTES`
 
 <!-- YAML
diff --git a/lib/buffer.js b/lib/buffer.js
@@ -57,6 +57,7 @@ const {
   compareOffset,
   createFromString,
   fill: bindingFill,
+  isUtf8: bindingIsUtf8,
   indexOfBuffer,
   indexOfNumber,
   indexOfString,
@@ -83,7 +84,8 @@ const {
 const {
   isAnyArrayBuffer,
   isArrayBufferView,
-  isUint8Array
+  isUint8Array,
+  isTypedArray,
 } = require('internal/util/types');
 const {
   inspect: utilInspect
@@ -1322,13 +1324,22 @@ function atob(input) {
   return Buffer.from(input, 'base64').toString('latin1');
 }
 
+// True iff `input` (TypedArray, Buffer, or any ArrayBuffer) is valid UTF-8.
+function isUtf8(input) {
+  if (isTypedArray(input) || isAnyArrayBuffer(input)) {
+    return bindingIsUtf8(input);
+  }
+  throw new ERR_INVALID_ARG_TYPE('input', ['ArrayBuffer', 'Buffer', 'TypedArray'], input);
+}
+
 module.exports = {
   Blob,
   File,
   resolveObjectURL,
   Buffer,
   SlowBuffer,
   transcode,
+  isUtf8,
 
   // Legacy
   kMaxLength,
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
@@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) {
   results[1] = written;
 }
 
+static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);
+  CHECK_EQ(args.Length(), 1);
+  CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() ||
+        args[0]->IsSharedArrayBuffer());
+  ArrayBufferViewContents<char> abv(args[0]);
+  // A detached buffer is reported as an error instead of being validated.
+  if (abv.WasDetached()) {
+    return node::THROW_ERR_INVALID_STATE(
+        env, "Cannot validate on a detached buffer");
+  }
+  // Return simdutf's validation result for the viewed bytes to the caller.
+  args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length()));
+}
 
 void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) {
   Environment* env = Environment::GetCurrent(args);
@@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target,
   SetMethod(context, target, "encodeInto", EncodeInto);
   SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String);
 
+  SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);
+
   target
       ->Set(context,
             FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"),
@@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
   registry->Register(EncodeInto);
   registry->Register(EncodeUtf8String);
 
+  registry->Register(IsUtf8);
+
   registry->Register(StringSlice<ASCII>);
   registry->Register(StringSlice<BASE64>);
   registry->Register(StringSlice<BASE64URL>);
diff --git a/src/node_errors.h b/src/node_errors.h
@@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, bool is_heap_oom);
   V(ERR_INVALID_ARG_TYPE, TypeError)                                           \
   V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError)                             \
   V(ERR_INVALID_MODULE, Error)                                                 \
+  V(ERR_INVALID_STATE, Error)                                                  \
   V(ERR_INVALID_THIS, TypeError)                                               \
   V(ERR_INVALID_TRANSFER_OBJECT, TypeError)                                    \
   V(ERR_MEMORY_ALLOCATION_FAILED, Error)                                       \
diff --git a/src/util-inl.h b/src/util-inl.h
@@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
     auto ab = buf.As<v8::ArrayBuffer>();
     length_ = ab->ByteLength();
     data_ = static_cast<T*>(ab->Data());
+    was_detached_ = ab->WasDetached();
   } else {
     CHECK(buf->IsSharedArrayBuffer());
     auto sab = buf.As<v8::SharedArrayBuffer>();
diff --git a/src/util.h b/src/util.h
@@ -510,6 +510,7 @@ class ArrayBufferViewContents {
   inline void Read(v8::Local<v8::ArrayBufferView> abv);
   inline void ReadValue(v8::Local<v8::Value> buf);
 
+  inline bool WasDetached() const { return was_detached_; }
   inline const T* data() const { return data_; }
   inline size_t length() const { return length_; }
 
@@ -524,6 +525,7 @@ class ArrayBufferViewContents {
   T stack_storage_[kStackStorageSize];
   T* data_ = nullptr;
   size_t length_ = 0;
+  bool was_detached_ = false;
 };
 
 class Utf8Value : public MaybeStackBuffer<char> {
diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js
@@ -0,0 +1,86 @@
+'use strict';
+
+require('../common');
+const assert = require('assert');
+const { isUtf8, Buffer } = require('buffer');
+const { TextEncoder } = require('util');
+
+const encoder = new TextEncoder();
+
+assert.strictEqual(isUtf8(encoder.encode('hello')), true);
+assert.strictEqual(isUtf8(encoder.encode('ğ')), true);
+assert.strictEqual(isUtf8(Buffer.from([])), true);
+
+// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
+[
+  [0xFF], // 'invalid code'
+  [0xC0], // 'ends early'
+  [0xE0], // 'ends early 2'
+  [0xC0, 0x00], // 'invalid trail'
+  [0xC0, 0xC0], // 'invalid trail 2'
+  [0xE0, 0x00], // 'invalid trail 3'
+  [0xE0, 0xC0], // 'invalid trail 4'
+  [0xE0, 0x80, 0x00], // 'invalid trail 5'
+  [0xE0, 0x80, 0xC0], // 'invalid trail 6'
+  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
+  [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'
+
+  // Overlong encodings
+  [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
+  [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
+  [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'
+
+  [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
+  [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
+  [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'
+
+  [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
+  [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
+  [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'
+
+  [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
+  [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
+  [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'
+
+  [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
+  [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'
+
+  // UTF-16 surrogates encoded as code points in UTF-8
+  [0xED, 0xA0, 0x80], // 'lead surrogate'
+  [0xED, 0xB0, 0x80], // 'trail surrogate'
+  [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
+].forEach((input) => {
+  assert.strictEqual(isUtf8(Buffer.from(input)), false);
+});
+
+[
+  null,
+  undefined,
+  'hello',
+  true,
+  false,
+].forEach((input) => {
+  assert.throws(
+    () => { isUtf8(input); },
+    {
+      code: 'ERR_INVALID_ARG_TYPE',
+    },
+  );
+});
+
+{
+  // Transferring the ArrayBuffer detaches it, so isUtf8() must throw.
+  const arrayBuffer = new ArrayBuffer(1024);
+  structuredClone(arrayBuffer, { transfer: [arrayBuffer] });
+  assert.throws(
+    () => { isUtf8(arrayBuffer); },
+    {
+      code: 'ERR_INVALID_STATE'
+    }
+  );
+}