Skip to content

Commit

Permalink
buffer: add buffer.isUtf8 for utf8 validation
Browse files Browse the repository at this point in the history
PR-URL: #45947
Reviewed-By: Robert Nagy <ronagy@icloud.com>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
Reviewed-By: Anna Henningsen <anna@addaleax.net>
  • Loading branch information
anonrig authored and juanarbol committed Jan 31, 2023
1 parent 4063cdc commit 16ee02f
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 1 deletion.
11 changes: 11 additions & 0 deletions doc/api/buffer.md
Expand Up @@ -5126,6 +5126,17 @@ For code running using Node.js APIs, converting between base64-encoded strings
and binary data should be performed using `Buffer.from(str, 'base64')` and
`buf.toString('base64')`.**

### `buffer.isUtf8(input)`

<!-- YAML
added: REPLACEME
-->

* input {Buffer | ArrayBuffer | TypedArray} The input to validate.
* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8.

This function returns `true` if `input` contains only valid UTF-8-encoded data,
including the case in which `input` is empty.

Throws if the `input` is a detached array buffer.

### `buffer.INSPECT_MAX_BYTES`

<!-- YAML
Expand Down
13 changes: 12 additions & 1 deletion lib/buffer.js
Expand Up @@ -57,6 +57,7 @@ const {
compareOffset,
createFromString,
fill: bindingFill,
isUtf8: bindingIsUtf8,
indexOfBuffer,
indexOfNumber,
indexOfString,
Expand All @@ -83,7 +84,8 @@ const {
const {
isAnyArrayBuffer,
isArrayBufferView,
isUint8Array
isUint8Array,
isTypedArray,
} = require('internal/util/types');
const {
inspect: utilInspect
Expand Down Expand Up @@ -1322,13 +1324,22 @@ function atob(input) {
return Buffer.from(input, 'base64').toString('latin1');
}

/**
 * Validates that `input` holds only well-formed UTF-8 data by delegating to
 * the native `isUtf8` binding.
 * @param {Buffer | ArrayBuffer | TypedArray} input The bytes to validate.
 * @returns {boolean} The result reported by the native binding.
 * @throws {ERR_INVALID_ARG_TYPE} When `input` passes neither the
 *   `isTypedArray` nor the `isAnyArrayBuffer` check.
 */
function isUtf8(input) {
  if (isTypedArray(input) || isAnyArrayBuffer(input)) {
    return bindingIsUtf8(input);
  }

  // Name every accepted type so the error message agrees with the guard
  // above (array buffers are accepted, not only typed arrays).
  throw new ERR_INVALID_ARG_TYPE(
    'input',
    ['ArrayBuffer', 'Buffer', 'TypedArray'],
    input,
  );
}

module.exports = {
Blob,
File,
resolveObjectURL,
Buffer,
SlowBuffer,
transcode,
isUtf8,

// Legacy
kMaxLength,
Expand Down
18 changes: 18 additions & 0 deletions src/node_buffer.cc
Expand Up @@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) {
results[1] = written;
}

static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_EQ(args.Length(), 1);
CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() ||
args[0]->IsSharedArrayBuffer());
ArrayBufferViewContents<char> abv(args[0]);

if (abv.WasDetached()) {
return node::THROW_ERR_INVALID_STATE(
env, "Cannot validate on a detached buffer");
}

args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length()));
}

void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
Expand Down Expand Up @@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target,
SetMethod(context, target, "encodeInto", EncodeInto);
SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String);

SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);

target
->Set(context,
FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"),
Expand Down Expand Up @@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
registry->Register(EncodeInto);
registry->Register(EncodeUtf8String);

registry->Register(IsUtf8);

registry->Register(StringSlice<ASCII>);
registry->Register(StringSlice<BASE64>);
registry->Register(StringSlice<BASE64URL>);
Expand Down
1 change: 1 addition & 0 deletions src/node_errors.h
Expand Up @@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, bool is_heap_oom);
V(ERR_INVALID_ARG_TYPE, TypeError) \
V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError) \
V(ERR_INVALID_MODULE, Error) \
V(ERR_INVALID_STATE, Error) \
V(ERR_INVALID_THIS, TypeError) \
V(ERR_INVALID_TRANSFER_OBJECT, TypeError) \
V(ERR_MEMORY_ALLOCATION_FAILED, Error) \
Expand Down
1 change: 1 addition & 0 deletions src/util-inl.h
Expand Up @@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
auto ab = buf.As<v8::ArrayBuffer>();
length_ = ab->ByteLength();
data_ = static_cast<T*>(ab->Data());
was_detached_ = ab->WasDetached();
} else {
CHECK(buf->IsSharedArrayBuffer());
auto sab = buf.As<v8::SharedArrayBuffer>();
Expand Down
2 changes: 2 additions & 0 deletions src/util.h
Expand Up @@ -510,6 +510,7 @@ class ArrayBufferViewContents {
inline void Read(v8::Local<v8::ArrayBufferView> abv);
inline void ReadValue(v8::Local<v8::Value> buf);

inline bool WasDetached() const { return was_detached_; }
inline const T* data() const { return data_; }
inline size_t length() const { return length_; }

Expand All @@ -524,6 +525,7 @@ class ArrayBufferViewContents {
T stack_storage_[kStackStorageSize];
T* data_ = nullptr;
size_t length_ = 0;
bool was_detached_ = false;
};

class Utf8Value : public MaybeStackBuffer<char> {
Expand Down
86 changes: 86 additions & 0 deletions test/parallel/test-buffer-isutf8.js
@@ -0,0 +1,86 @@
'use strict';

require('../common');
const assert = require('assert');
const { isUtf8, Buffer } = require('buffer');
const { TextEncoder } = require('util');

const encoder = new TextEncoder();

// Well-formed inputs (ASCII, a multi-byte character, and the empty buffer)
// must all be reported as valid UTF-8.
const validInputs = [
  encoder.encode('hello'),
  encoder.encode('ğ'),
  Buffer.from([]),
];
for (const input of validInputs) {
  assert.strictEqual(isUtf8(input), true);
}

// Byte sequences that must be rejected as malformed UTF-8.
// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
const invalidSequences = [
  [0xFF], // 'invalid code'
  [0xC0], // 'ends early'
  [0xE0], // 'ends early 2'
  [0xC0, 0x00], // 'invalid trail'
  [0xC0, 0xC0], // 'invalid trail 2'
  [0xE0, 0x00], // 'invalid trail 3'
  [0xE0, 0xC0], // 'invalid trail 4'
  [0xE0, 0x80, 0x00], // 'invalid trail 5'
  [0xE0, 0x80, 0xC0], // 'invalid trail 6'
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
  [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'

  // Overlong encodings
  [0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
  [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
  [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
  [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'

  [0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
  [0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
  [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
  [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'

  [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
  [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
  [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
  [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'

  [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
  [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
  [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'

  [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
  [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'

  // UTF-16 surrogates encoded as code points in UTF-8
  [0xED, 0xA0, 0x80], // 'lead surrogate'
  [0xED, 0xB0, 0x80], // 'trail surrogate'
  [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
];

for (const bytes of invalidSequences) {
  assert.strictEqual(isUtf8(Buffer.from(bytes)), false);
}

// Nullish and primitive arguments must be rejected with ERR_INVALID_ARG_TYPE.
for (const input of [null, undefined, 'hello', true, false]) {
  assert.throws(() => isUtf8(input), { code: 'ERR_INVALID_ARG_TYPE' });
}

{
  // Transferring an ArrayBuffer detaches it; validating the detached buffer
  // afterwards must throw ERR_INVALID_STATE.
  const detached = new ArrayBuffer(1024);
  structuredClone(detached, { transfer: [detached] });
  assert.throws(() => isUtf8(detached), { code: 'ERR_INVALID_STATE' });
}

0 comments on commit 16ee02f

Please sign in to comment.