From be10b367a37e19091ada77ee94edcd51a0839879 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Thu, 22 Dec 2022 16:22:12 -0500 Subject: [PATCH 1/6] buffer: add buffer.isUtf8 for utf8 validation --- doc/api/buffer.md | 11 +++++++++++ lib/buffer.js | 17 ++++++++++++++++- src/node_buffer.cc | 12 ++++++++++++ test/parallel/test-buffer-isutf8.js | 15 +++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 test/parallel/test-buffer-isutf8.js diff --git a/doc/api/buffer.md b/doc/api/buffer.md index 978c26f734eee8..d7879ccbf8a03f 100644 --- a/doc/api/buffer.md +++ b/doc/api/buffer.md @@ -5130,6 +5130,17 @@ For code running using Node.js APIs, converting between base64-encoded strings and binary data should be performed using `Buffer.from(str, 'base64')` and `buf.toString('base64')`.** +### `buffer.isUtf8(input)` + + + +* input {Buffer | ArrayBuffer | TypedArray} The input to validate. +* Returns: {boolean} Returns true if and only if the input is valid UTF-8. + +This function is used to check if input contains UTF-8 code points (characters). + ### `buffer.INSPECT_MAX_BYTES` * input {Buffer | ArrayBuffer | TypedArray} The input to validate. -* Returns: {boolean} Returns true if and only if the input is valid UTF-8. +* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8. This function is used to check if input contains UTF-8 code points (characters). diff --git a/lib/buffer.js b/lib/buffer.js index af29341be68f09..65709ca5ed56a2 100644 --- a/lib/buffer.js +++ b/lib/buffer.js @@ -1317,15 +1317,11 @@ function atob(input) { } function isUtf8(input) { - if (isTypedArray(input) || Buffer.isBuffer(input)) { - return bindingIsUtf8(input.buffer); - } - - if (isAnyArrayBuffer(input)) { + if (isTypedArray(input) || isAnyArrayBuffer(input)) { return bindingIsUtf8(input); } - return false; + throw new ERR_INVALID_ARG_TYPE('input', ['TypedArray', 'Buffer'], input); } module.exports = { diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 9b0a7b5a31b651..a8a5ce3cb8edbc 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -1224,12 +1224,32 @@ static void EncodeInto(const FunctionCallbackInfo& args) { } static void IsUtf8(const FunctionCallbackInfo& args) { - CHECK_GE(args.Length(), 1); - CHECK(args[0]->IsArrayBuffer()); - Local input = args[0].As(); - auto external = static_cast(input->Data()); - args.GetReturnValue().Set( - simdutf::validate_utf8(external, input->ByteLength())); + Environment* env = Environment::GetCurrent(args); + Isolate* isolate = env->isolate(); + + CHECK_EQ(args.Length(), 1); + CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer()); + + Local buf; + size_t offset = 0; + size_t length = 0; + + if (args[0]->IsTypedArray()) { + Local input = args[0].As(); + buf = input->Buffer(); + offset = input->ByteOffset(); + length = input->ByteLength(); + } else { + buf = args[0].As(); + length = buf->ByteLength(); + } + + if (buf->WasDetached()) { + return node::THROW_ERR_BUFFER_CONTEXT_NOT_AVAILABLE(isolate); + } + + const char* external = static_cast(buf->Data()) + offset; + args.GetReturnValue().Set(simdutf::validate_utf8(external, length)); } void SetBufferPrototype(const FunctionCallbackInfo& args) { diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js index 85f01c0e3bb5d4..1093243ef71af7 100644 --- a/test/parallel/test-buffer-isutf8.js +++ b/test/parallel/test-buffer-isutf8.js @@ -2,7 +2,7 @@ require('../common'); const assert = require('assert'); -const { isUtf8 } = require('buffer'); +const { isUtf8, Buffer } = require('buffer'); const { TextEncoder } = require('util'); const encoder = new TextEncoder(); @@ -10,6 +10,31 @@ const encoder = new TextEncoder(); assert.strictEqual(isUtf8(encoder.encode('hello')), true); assert.strictEqual(isUtf8(encoder.encode('ğ')), true); assert.strictEqual(isUtf8(Buffer.from([0xf8])), false); +assert.strictEqual(isUtf8(encoder.encode('aé日')), true); -assert.strictEqual(isUtf8(null), false); -assert.strictEqual(isUtf8(undefined), false); +[ + null, + undefined, + 'hello', + true, + false +].forEach((input) => { + assert.throws( + () => { isUtf8(input); }, + { + code: 'ERR_INVALID_ARG_TYPE', + }, + ); +}) + +{ + // Test with detached array buffers + const arrayBuffer = new ArrayBuffer(1024); + structuredClone(arrayBuffer, { transfer: [arrayBuffer] }); + assert.throws( + () => { isUtf8(arrayBuffer); }, + { + code: 'ERR_BUFFER_CONTEXT_NOT_AVAILABLE' + } + ) +} From 6c8ac383b17d736649f5197321ec15d13da1b1d0 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 23 Dec 2022 13:04:55 -0500 Subject: [PATCH 3/6] fixup! buffer: add buffer.isUtf8 for utf8 validation --- test/parallel/test-buffer-isutf8.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js index 1093243ef71af7..28f3a93893d00e 100644 --- a/test/parallel/test-buffer-isutf8.js +++ b/test/parallel/test-buffer-isutf8.js @@ -17,7 +17,7 @@ assert.strictEqual(isUtf8(encoder.encode('aé日')), true); undefined, 'hello', true, - false + false, ].forEach((input) => { assert.throws( () => { isUtf8(input); }, @@ -25,7 +25,7 @@ assert.strictEqual(isUtf8(encoder.encode('aé日')), true); code: 'ERR_INVALID_ARG_TYPE', }, ); -}) +}); { // Test with detached array buffers @@ -33,8 +33,8 @@ assert.strictEqual(isUtf8(encoder.encode('aé日')), true); structuredClone(arrayBuffer, { transfer: [arrayBuffer] }); assert.throws( () => { isUtf8(arrayBuffer); }, - { + { code: 'ERR_BUFFER_CONTEXT_NOT_AVAILABLE' } - ) + ); } From 103e80748cdfab6cfb3ff2247554a198a36997a2 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 23 Dec 2022 15:01:35 -0500 Subject: [PATCH 4/6] fixup! buffer: add buffer.isUtf8 for utf8 validation --- src/node_buffer.cc | 24 ++++-------------------- src/util-inl.h | 1 + src/util.h | 2 ++ 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index a8a5ce3cb8edbc..5cbaec707efc26 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -1225,31 +1225,15 @@ static void EncodeInto(const FunctionCallbackInfo& args) { static void IsUtf8(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); - Isolate* isolate = env->isolate(); - CHECK_EQ(args.Length(), 1); CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer()); + ArrayBufferViewContents abv(args[0]); - Local buf; - size_t offset = 0; - size_t length = 0; - - if (args[0]->IsTypedArray()) { - Local input = args[0].As(); - buf = input->Buffer(); - offset = input->ByteOffset(); - length = input->ByteLength(); - } else { - buf = args[0].As(); - length = buf->ByteLength(); - } - - if (buf->WasDetached()) { - return node::THROW_ERR_BUFFER_CONTEXT_NOT_AVAILABLE(isolate); + if (abv.WasDetached()) { + return node::THROW_ERR_BUFFER_CONTEXT_NOT_AVAILABLE(env->isolate()); } - const char* external = static_cast(buf->Data()) + offset; - args.GetReturnValue().Set(simdutf::validate_utf8(external, length)); + args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length())); } void SetBufferPrototype(const FunctionCallbackInfo& args) { diff --git a/src/util-inl.h b/src/util-inl.h index f98bb16aa7687a..833082291a16aa 100644 --- a/src/util-inl.h +++ b/src/util-inl.h @@ -555,6 +555,7 @@ void ArrayBufferViewContents::ReadValue(v8::Local buf) { auto ab = buf.As(); length_ = ab->ByteLength(); data_ = static_cast(ab->Data()); + was_detached_ = ab->WasDetached(); } else { CHECK(buf->IsSharedArrayBuffer()); auto sab = buf.As(); diff --git a/src/util.h b/src/util.h index 399018655ec38f..7a3885ec8f7b07 100644 --- a/src/util.h +++ b/src/util.h @@ -511,6 +511,7 @@ class ArrayBufferViewContents { inline void Read(v8::Local abv); inline void ReadValue(v8::Local buf); + inline bool WasDetached() const { return was_detached_; } inline const T* data() const { return data_; } inline size_t length() const { return length_; } @@ -525,6 +526,7 @@ class ArrayBufferViewContents { T stack_storage_[kStackStorageSize]; T* data_ = nullptr; size_t length_ = 0; + bool was_detached_ = false; }; class Utf8Value : public MaybeStackBuffer { From b590f061edbbca3568f07869dc461da4717a807e Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Fri, 23 Dec 2022 23:20:52 -0500 Subject: [PATCH 5/6] fixup! buffer: add buffer.isUtf8 for utf8 validation --- src/node_buffer.cc | 6 ++++-- src/node_errors.h | 1 + test/parallel/test-buffer-isutf8.js | 14 +++++++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 5cbaec707efc26..f7b008af3689a6 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -1226,11 +1226,13 @@ static void EncodeInto(const FunctionCallbackInfo& args) { static void IsUtf8(const FunctionCallbackInfo& args) { Environment* env = Environment::GetCurrent(args); CHECK_EQ(args.Length(), 1); - CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer()); + CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() || + args[0]->IsSharedArrayBuffer()); ArrayBufferViewContents abv(args[0]); if (abv.WasDetached()) { - return node::THROW_ERR_BUFFER_CONTEXT_NOT_AVAILABLE(env->isolate()); + return node::THROW_ERR_INVALID_STATE( + env, "Cannot validate on a detached buffer"); } args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length())); diff --git a/src/node_errors.h b/src/node_errors.h index 706464acc87b5c..3f17a80a6278cc 100644 --- a/src/node_errors.h +++ b/src/node_errors.h @@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, const v8::OOMDetails& details); V(ERR_INVALID_ARG_TYPE, TypeError) \ V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError) \ V(ERR_INVALID_MODULE, Error) \ + V(ERR_INVALID_STATE, Error) \ V(ERR_INVALID_THIS, TypeError) \ V(ERR_INVALID_TRANSFER_OBJECT, TypeError) \ V(ERR_MEMORY_ALLOCATION_FAILED, Error) \ diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js index 28f3a93893d00e..c4ce62521accba 100644 --- a/test/parallel/test-buffer-isutf8.js +++ b/test/parallel/test-buffer-isutf8.js @@ -9,9 +9,21 @@ const encoder = new TextEncoder(); assert.strictEqual(isUtf8(encoder.encode('hello')), true); assert.strictEqual(isUtf8(encoder.encode('ğ')), true); +assert.strictEqual(isUtf8(Buffer.from([])), true); + +// Invalid UTF-8 assert.strictEqual(isUtf8(Buffer.from([0xf8])), false); + +// CESU-8 +assert.strictEqual(isUtf8(encoder.encode('\u0045\u0205\u10400')), true); assert.strictEqual(isUtf8(encoder.encode('aé日')), true); +// Two byte overlong encoding +assert.strictEqual(isUtf8(encoder.encode('\u0000')), true); + +// WTF-8 +assert.strictEqual(isUtf8(encoder.encode('\uD800\uDFFF')), true); + [ null, undefined, @@ -34,7 +46,7 @@ assert.strictEqual(isUtf8(encoder.encode('aé日')), true); assert.throws( () => { isUtf8(arrayBuffer); }, { - code: 'ERR_BUFFER_CONTEXT_NOT_AVAILABLE' + code: 'ERR_INVALID_STATE' } ); } From e940f594f7550aa1bf6f50916e84f427497704d5 Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Sat, 24 Dec 2022 18:29:02 -0500 Subject: [PATCH 6/6] fixup! buffer: add buffer.isUtf8 for utf8 validation --- test/parallel/test-buffer-isutf8.js | 52 ++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/test/parallel/test-buffer-isutf8.js b/test/parallel/test-buffer-isutf8.js index c4ce62521accba..204db3e6a5fe25 100644 --- a/test/parallel/test-buffer-isutf8.js +++ b/test/parallel/test-buffer-isutf8.js @@ -11,18 +11,52 @@ assert.strictEqual(isUtf8(encoder.encode('hello')), true); assert.strictEqual(isUtf8(encoder.encode('ğ')), true); assert.strictEqual(isUtf8(Buffer.from([])), true); -// Invalid UTF-8 -assert.strictEqual(isUtf8(Buffer.from([0xf8])), false); +// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js +[ + [0xFF], // 'invalid code' + [0xC0], // 'ends early' + [0xE0], // 'ends early 2' + [0xC0, 0x00], // 'invalid trail' + [0xC0, 0xC0], // 'invalid trail 2' + [0xE0, 0x00], // 'invalid trail 3' + [0xE0, 0xC0], // 'invalid trail 4' + [0xE0, 0x80, 0x00], // 'invalid trail 5' + [0xE0, 0x80, 0xC0], // 'invalid trail 6' + [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF' + [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte' + + // Overlong encodings + [0xC0, 0x80], // 'overlong U+0000 - 2 bytes' + [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes' + [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes' + [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes' + + [0xC1, 0xBF], // 'overlong U+007F - 2 bytes' + [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes' + [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes' + [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes' -// CESU-8 -assert.strictEqual(isUtf8(encoder.encode('\u0045\u0205\u10400')), true); -assert.strictEqual(isUtf8(encoder.encode('aé日')), true); + [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes' + [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes' + [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes' -// Two byte overlong encoding -assert.strictEqual(isUtf8(encoder.encode('\u0000')), true); + [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes' + [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes' + [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes' -// WTF-8 -assert.strictEqual(isUtf8(encoder.encode('\uD800\uDFFF')), true); + [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes' + [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes' + + // UTF-16 surrogates encoded as code points in UTF-8 + [0xED, 0xA0, 0x80], // 'lead surrogate' + [0xED, 0xB0, 0x80], // 'trail surrogate' + [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair' +].forEach((input) => { + assert.strictEqual(isUtf8(Buffer.from(input)), false); +}); [ null,