Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

buffer: add buffer.isUtf8 for utf8 validation #45947

Merged
merged 6 commits into from Dec 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions doc/api/buffer.md
Expand Up @@ -5130,6 +5130,17 @@ For code running using Node.js APIs, converting between base64-encoded strings
and binary data should be performed using `Buffer.from(str, 'base64')` and
`buf.toString('base64')`.**

### `buffer.isUtf8(input)`

<!-- YAML
added: REPLACEME
-->

* input {Buffer | ArrayBuffer | TypedArray} The input to validate.
* Returns: {boolean} Returns `true` if and only if the input is valid UTF-8.

This function returns `true` if `input` contains only valid UTF-8-encoded data,
including the case in which `input` is empty. It throws if `input` is a
detached array buffer.

### `buffer.INSPECT_MAX_BYTES`

<!-- YAML
Expand Down
13 changes: 12 additions & 1 deletion lib/buffer.js
Expand Up @@ -57,6 +57,7 @@ const {
compareOffset,
createFromString,
fill: bindingFill,
isUtf8: bindingIsUtf8,
indexOfBuffer,
indexOfNumber,
indexOfString,
Expand Down Expand Up @@ -84,7 +85,8 @@ const {
const {
isAnyArrayBuffer,
isArrayBufferView,
isUint8Array
isUint8Array,
isTypedArray,
} = require('internal/util/types');
const {
inspect: utilInspect
Expand Down Expand Up @@ -1314,10 +1316,19 @@ function atob(input) {
return Buffer.from(input, 'base64').toString('latin1');
}

/**
 * Reports whether `input` consists entirely of valid UTF-8 encoded data.
 *
 * The validation itself is delegated to the native `isUtf8` binding, which
 * throws `ERR_INVALID_STATE` when handed a detached `ArrayBuffer`.
 * @param {Buffer | ArrayBuffer | TypedArray} input The bytes to validate.
 * @returns {boolean} `true` if and only if `input` is valid UTF-8.
 * @throws {ERR_INVALID_ARG_TYPE} If `input` is neither a TypedArray
 *   (which includes `Buffer`) nor any kind of ArrayBuffer.
 */
function isUtf8(input) {
  if (isTypedArray(input) || isAnyArrayBuffer(input)) {
    return bindingIsUtf8(input);
  }

  // Name every accepted type so that the error message agrees with the guard
  // above and with the documented signature.
  throw new ERR_INVALID_ARG_TYPE(
    'input',
    ['ArrayBuffer', 'Buffer', 'TypedArray'],
    input,
  );
}

module.exports = {
Buffer,
SlowBuffer,
transcode,
isUtf8,

// Legacy
kMaxLength,
Expand Down
18 changes: 18 additions & 0 deletions src/node_buffer.cc
Expand Up @@ -1223,6 +1223,20 @@ static void EncodeInto(const FunctionCallbackInfo<Value>& args) {
results[1] = written;
}

// Native implementation of the `isUtf8` binding method.
//
// Expects exactly one argument that is a TypedArray, an ArrayBuffer or a
// SharedArrayBuffer; both preconditions are asserted with CHECK, so argument
// validation is expected to have happened before this is called.
// Sets the return value to the result of simdutf::validate_utf8() over the
// argument's bytes, or throws ERR_INVALID_STATE if the buffer was detached.
static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_EQ(args.Length(), 1);
CHECK(args[0]->IsTypedArray() || args[0]->IsArrayBuffer() ||
args[0]->IsSharedArrayBuffer());
// Exposes the argument's backing bytes as a (data, length) pair.
ArrayBufferViewContents<char> abv(args[0]);

// A detached buffer has no accessible contents to validate, so it is reported
// as an error rather than given a true/false answer.
if (abv.WasDetached()) {
return node::THROW_ERR_INVALID_STATE(
env, "Cannot validate on a detached buffer");
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know it's after the fact but why does the buffer being detached matter here? It would be otherwise indistinguishable from zero-length which we should just return false for anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Detached buffers create false sense of UTF8 validation, if there isn’t an error in here, since there is no way of accessing the underlying data store, and validating for UTF-8, I believe this error is valid.


args.GetReturnValue().Set(simdutf::validate_utf8(abv.data(), abv.length()));
}

void SetBufferPrototype(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
Expand Down Expand Up @@ -1358,6 +1372,8 @@ void Initialize(Local<Object> target,
SetMethod(context, target, "encodeInto", EncodeInto);
SetMethodNoSideEffect(context, target, "encodeUtf8String", EncodeUtf8String);

SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);

target
->Set(context,
FIXED_ONE_BYTE_STRING(isolate, "kMaxLength"),
Expand Down Expand Up @@ -1413,6 +1429,8 @@ void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
registry->Register(EncodeInto);
registry->Register(EncodeUtf8String);

registry->Register(IsUtf8);

registry->Register(StringSlice<ASCII>);
registry->Register(StringSlice<BASE64>);
registry->Register(StringSlice<BASE64URL>);
Expand Down
1 change: 1 addition & 0 deletions src/node_errors.h
Expand Up @@ -68,6 +68,7 @@ void OOMErrorHandler(const char* location, const v8::OOMDetails& details);
V(ERR_INVALID_ARG_TYPE, TypeError) \
V(ERR_INVALID_OBJECT_DEFINE_PROPERTY, TypeError) \
V(ERR_INVALID_MODULE, Error) \
V(ERR_INVALID_STATE, Error) \
V(ERR_INVALID_THIS, TypeError) \
V(ERR_INVALID_TRANSFER_OBJECT, TypeError) \
V(ERR_MEMORY_ALLOCATION_FAILED, Error) \
Expand Down
1 change: 1 addition & 0 deletions src/util-inl.h
Expand Up @@ -555,6 +555,7 @@ void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
auto ab = buf.As<v8::ArrayBuffer>();
length_ = ab->ByteLength();
data_ = static_cast<T*>(ab->Data());
was_detached_ = ab->WasDetached();
} else {
CHECK(buf->IsSharedArrayBuffer());
auto sab = buf.As<v8::SharedArrayBuffer>();
Expand Down
2 changes: 2 additions & 0 deletions src/util.h
Expand Up @@ -511,6 +511,7 @@ class ArrayBufferViewContents {
inline void Read(v8::Local<v8::ArrayBufferView> abv);
inline void ReadValue(v8::Local<v8::Value> buf);

inline bool WasDetached() const { return was_detached_; }
inline const T* data() const { return data_; }
inline size_t length() const { return length_; }

Expand All @@ -525,6 +526,7 @@ class ArrayBufferViewContents {
T stack_storage_[kStackStorageSize];
T* data_ = nullptr;
size_t length_ = 0;
bool was_detached_ = false;
anonrig marked this conversation as resolved.
Show resolved Hide resolved
};

class Utf8Value : public MaybeStackBuffer<char> {
Expand Down
86 changes: 86 additions & 0 deletions test/parallel/test-buffer-isutf8.js
@@ -0,0 +1,86 @@
'use strict';

// Tests for `isUtf8()` exported by the `buffer` module.
require('../common');
const assert = require('assert');
const { isUtf8, Buffer } = require('buffer');
const { TextEncoder } = require('util');

// Used to produce encoded byte sequences from string literals.
const encoder = new TextEncoder();

// Encoded ASCII text and an encoded non-ASCII character must both be accepted.
assert.strictEqual(isUtf8(encoder.encode('hello')), true);
assert.strictEqual(isUtf8(encoder.encode('ğ')), true);
// An empty buffer is expected to be reported as valid.
assert.strictEqual(isUtf8(Buffer.from([])), true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does a zero length buffer return true? I would expect this to be false.

Copy link
Member Author

@anonrig anonrig Dec 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because it does not include an invalid code point. Is there a similar Node function that has a different behavior?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the stated description of the API is, "This function is used to check if input contains UTF-8 code points"... An empty buffer does not contain UTF-8 code points so it really can't return true. Other methods we have that accept ArrayBuffer or TypedArray, with the exception of Web Streams which have specifically defined handling for detached, will treat those as indistinguishable from a zero-length input.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, that's correct. What do you recommend?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would just follow up with an additional pr that returned false for zero-length, removing the detached check and error entirely.

Copy link
Member

@lpinca lpinca Dec 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion it should not be changed. It should return true. It's like the empty string, which is valid UTF-8. I would be surprised if `isUtf8(encoder.encode(''))` returned false.

To avoid confusion the documentation can be updated like this "This function returns false if the input contains invalid UTF-8 code points, else true".

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The challenge there is that with that logic the empty buffer would pass any encoding check. isASCII? Yes. isUTF16le? Yes. IsUTF32be? Yes. Is Shift-JIS? Yes.... Which just simply isn't useful. If you want the inverse check, isInvalidUtf8() then implement that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created a pull request: #45973

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The UTF-8 RFC specifies...

An octet sequence is valid UTF-8 only if it matches the
   following syntax, which is derived from the rules for encoding UTF-8
   and is expressed in the ABNF of [[RFC2234](https://www.rfc-editor.org/rfc/rfc2234)].

   UTF8-octets = *( UTF8-char )
...

Reference: https://www.rfc-editor.org/rfc/rfc3629

So UTF-8 explicitly, by its ABNF, includes the empty string.

Note that, in general, from a non-empty buffer alone, we cannot determine uniquely the character encoding. A BOM may help but UTF-8 is BOM-less.

A string of bytes may be interpreted under different encodings... and in some cases, it is by design. Thus, for example, ASCII buffers are always both valid UTF-8 buffers and valid Latin1 buffers (by design).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The challenge there is that with that logic the empty buffer would pass any encoding check.

Yes, I think it makes sense and that is how it works in some other popular programming languages.


// Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js
[
[0xFF], // 'invalid code'
[0xC0], // 'ends early'
[0xE0], // 'ends early 2'
[0xC0, 0x00], // 'invalid trail'
[0xC0, 0xC0], // 'invalid trail 2'
[0xE0, 0x00], // 'invalid trail 3'
[0xE0, 0xC0], // 'invalid trail 4'
[0xE0, 0x80, 0x00], // 'invalid trail 5'
[0xE0, 0x80, 0xC0], // 'invalid trail 6'
[0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF'
[0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte'

// Overlong encodings
[0xC0, 0x80], // 'overlong U+0000 - 2 bytes'
[0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes'
[0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes'
[0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes'
[0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes'

[0xC1, 0xBF], // 'overlong U+007F - 2 bytes'
[0xE0, 0x81, 0xBF], // 'overlong U+007F - 3 bytes'
[0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes'
[0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes'
[0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes'

[0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes'
[0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes'
[0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes'
[0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes'

[0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes'
[0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes'
[0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes'

[0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes'
[0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes'

// UTF-16 surrogates encoded as code points in UTF-8
[0xED, 0xA0, 0x80], // 'lead surrogate'
[0xED, 0xB0, 0x80], // 'trail surrogate'
[0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair'
].forEach((input) => {
assert.strictEqual(isUtf8(Buffer.from(input)), false);
});

[
null,
undefined,
'hello',
true,
false,
].forEach((input) => {
assert.throws(
() => { isUtf8(input); },
{
code: 'ERR_INVALID_ARG_TYPE',
},
);
});

{
// Test with detached array buffers
const arrayBuffer = new ArrayBuffer(1024);
structuredClone(arrayBuffer, { transfer: [arrayBuffer] });
assert.throws(
() => { isUtf8(arrayBuffer); },
{
code: 'ERR_INVALID_STATE'
}
);
}