Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v18.x backport] backport utf8 performance improvements #45650

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
37 changes: 37 additions & 0 deletions benchmark/util/text-decoder.js
@@ -0,0 +1,37 @@
'use strict';

const common = require('../common.js');

// Benchmark configuration for TextDecoder#decode(). Each key below is one of
// the parameters that main() destructures from its argument.
const bench = common.createBenchmark(main, {
// Encoding label given to the TextDecoder constructor.
encoding: ['utf-8', 'latin1', 'iso-8859-3'],
// Forwarded as the `ignoreBOM` option of the TextDecoder constructor.
ignoreBOM: [0, 1],
// Byte length of the input that gets decoded.
len: [256, 1024 * 16, 1024 * 512],
// Number of decode() calls made between bench.start() and bench.end().
n: [1e2],
// Kind of container main() allocates to hold the input bytes.
type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer']
});

// Benchmark body: builds a TextDecoder for `encoding`, allocates `len` bytes
// in the container selected by `type`, then times `n` decode() calls on it.
function main({ encoding, len, n, ignoreBOM, type }) {
  const decoder = new TextDecoder(encoding, { ignoreBOM });

  // Allocate the input. An unrecognized `type` leaves `buf` undefined.
  let buf;
  if (type === 'SharedArrayBuffer') {
    buf = new SharedArrayBuffer(len);
  } else if (type === 'ArrayBuffer') {
    buf = new ArrayBuffer(len);
  } else if (type === 'Buffer') {
    buf = Buffer.allocUnsafe(len);
  }

  // Only the decode loop sits inside the measured region.
  bench.start();
  for (let iteration = 0; iteration < n; iteration++) {
    decoder.decode(buf);
  }
  bench.end(n);
}
47 changes: 30 additions & 17 deletions lib/internal/encoding.js
Expand Up @@ -4,6 +4,7 @@
// https://encoding.spec.whatwg.org

const {
Boolean,
ObjectCreate,
ObjectDefineProperties,
ObjectGetOwnPropertyDescriptors,
Expand All @@ -18,7 +19,6 @@ const {
} = primordials;

const {
ERR_ENCODING_INVALID_ENCODED_DATA,
ERR_ENCODING_NOT_SUPPORTED,
ERR_INVALID_ARG_TYPE,
ERR_INVALID_THIS,
Expand All @@ -29,6 +29,8 @@ const kFlags = Symbol('flags');
const kEncoding = Symbol('encoding');
const kDecoder = Symbol('decoder');
const kEncoder = Symbol('encoder');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const {
getConstructorOf,
Expand All @@ -50,7 +52,8 @@ const {

const {
encodeInto,
encodeUtf8String
encodeUtf8String,
decodeUTF8,
} = internalBinding('buffer');

let Buffer;
Expand Down Expand Up @@ -398,26 +401,40 @@ function makeTextDecoderICU() {
flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
}

const handle = getConverter(enc, flags);
if (handle === undefined)
throw new ERR_ENCODING_NOT_SUPPORTED(encoding);
// Only support fast path for UTF-8 without FATAL flag
const fastPathAvailable = enc === 'utf-8' && !(options?.fatal);

this[kDecoder] = true;
this[kHandle] = handle;
this[kFlags] = flags;
this[kEncoding] = enc;
this[kIgnoreBOM] = Boolean(options?.ignoreBOM);
this[kUTF8FastPath] = fastPathAvailable;
this[kHandle] = undefined;

if (!fastPathAvailable) {
this.#prepareConverter();
}
}

#prepareConverter() {
if (this[kHandle] !== undefined) return;
const handle = getConverter(this[kEncoding], this[kFlags]);
if (handle === undefined)
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
this[kHandle] = handle;
}

decode(input = empty, options = kEmptyObject) {
validateDecoder(this);
if (isAnyArrayBuffer(input)) {
input = lazyBuffer().from(input);
} else if (!isArrayBufferView(input)) {
throw new ERR_INVALID_ARG_TYPE('input',
['ArrayBuffer', 'ArrayBufferView'],
input);

this[kUTF8FastPath] &&= !(options?.stream);

if (this[kUTF8FastPath]) {
return decodeUTF8(input, this[kIgnoreBOM]);
}

this.#prepareConverter();

validateObject(options, 'options', {
nullable: true,
allowArray: true,
Expand All @@ -428,11 +445,7 @@ function makeTextDecoderICU() {
if (options !== null)
flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH;

const ret = _decode(this[kHandle], input, flags);
if (typeof ret === 'number') {
throw new ERR_ENCODING_INVALID_ENCODED_DATA(this.encoding, ret);
}
return ret.toString('ucs2');
return _decode(this[kHandle], input, flags, this.encoding);
}
}

Expand Down
45 changes: 45 additions & 0 deletions src/node_buffer.cc
Expand Up @@ -24,6 +24,7 @@
#include "node_blob.h"
#include "node_errors.h"
#include "node_external_reference.h"
#include "node_i18n.h"
#include "node_internals.h"

#include "env-inl.h"
Expand Down Expand Up @@ -565,6 +566,48 @@ void StringSlice(const FunctionCallbackInfo<Value>& args) {
args.GetReturnValue().Set(ret);
}

// Decodes the bytes of the input as UTF-8 and returns them as a JS string.
//
//   args[0] (input):     ArrayBuffer, SharedArrayBuffer or ArrayBufferView
//                        holding the bytes to decode.
//   args[1] (ignoreBOM): boolean. When false, a leading UTF-8 byte order mark
//                        (EF BB BF) is skipped before decoding; when true the
//                        bytes are decoded as-is.
//
// Throws ERR_INVALID_ARG_TYPE when args[0] is none of the accepted types, and
// rethrows the error produced by StringBytes::Encode() if encoding fails.
// An empty input (after BOM removal) yields the empty string.
void DecodeUTF8(const FunctionCallbackInfo<Value>& args) {
  Environment* env = Environment::GetCurrent(args);  // input, ignoreBOM

  if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
        args[0]->IsArrayBufferView())) {
    // The argument is named "input" to match TextDecoder#decode(input) and the
    // message thrown by the ICU-based ConverterObject::Decode().
    return node::THROW_ERR_INVALID_ARG_TYPE(
        env->isolate(),
        "The \"input\" argument must be an instance of SharedArrayBuffer, "
        "ArrayBuffer or ArrayBufferView.");
  }

  ArrayBufferViewContents<char> buffer(args[0]);

  CHECK(args[1]->IsBoolean());
  bool ignore_bom = args[1]->IsTrue();

  const char* data = buffer.data();
  size_t length = buffer.length();

  // Strip the byte order mark unless the caller asked to keep it.
  if (!ignore_bom && length >= 3) {
    if (memcmp(data, "\xEF\xBB\xBF", 3) == 0) {
      data += 3;
      length -= 3;
    }
  }

  if (length == 0) return args.GetReturnValue().SetEmptyString();

  Local<Value> error;
  MaybeLocal<Value> maybe_ret =
      StringBytes::Encode(env->isolate(), data, length, UTF8, &error);
  Local<Value> ret;

  if (!maybe_ret.ToLocal(&ret)) {
    // Encode() reports failure through `error`; surface it to JS.
    CHECK(!error.IsEmpty());
    env->isolate()->ThrowException(error);
    return;
  }

  args.GetReturnValue().Set(ret);
}

// bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd])
void Copy(const FunctionCallbackInfo<Value> &args) {
Expand Down Expand Up @@ -1282,6 +1325,7 @@ void Initialize(Local<Object> target,

SetMethod(context, target, "setBufferPrototype", SetBufferPrototype);
SetMethodNoSideEffect(context, target, "createFromString", CreateFromString);
SetMethodNoSideEffect(context, target, "decodeUTF8", DecodeUTF8);

SetMethodNoSideEffect(context, target, "byteLengthUtf8", ByteLengthUtf8);
SetMethod(context, target, "copy", Copy);
Expand Down Expand Up @@ -1339,6 +1383,7 @@ void Initialize(Local<Object> target,
void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
registry->Register(SetBufferPrototype);
registry->Register(CreateFromString);
registry->Register(DecodeUTF8);

registry->Register(ByteLengthUtf8);
registry->Register(Copy);
Expand Down
1 change: 1 addition & 0 deletions src/node_errors.h
Expand Up @@ -60,6 +60,7 @@ void OOMErrorHandler(const char* location, bool is_heap_oom);
V(ERR_CRYPTO_JOB_INIT_FAILED, Error) \
V(ERR_DLOPEN_DISABLED, Error) \
V(ERR_DLOPEN_FAILED, Error) \
V(ERR_ENCODING_INVALID_ENCODED_DATA, TypeError) \
V(ERR_EXECUTION_ENVIRONMENT_NOT_AVAILABLE, Error) \
V(ERR_INVALID_ADDRESS, Error) \
V(ERR_INVALID_ARG_VALUE, TypeError) \
Expand Down
58 changes: 42 additions & 16 deletions src/node_i18n.cc
Expand Up @@ -50,6 +50,7 @@
#include "node_buffer.h"
#include "node_errors.h"
#include "node_internals.h"
#include "string_bytes.h"
#include "util-inl.h"
#include "v8.h"

Expand Down Expand Up @@ -96,7 +97,6 @@ using v8::NewStringType;
using v8::Object;
using v8::ObjectTemplate;
using v8::String;
using v8::Uint8Array;
using v8::Value;

namespace i18n {
Expand Down Expand Up @@ -436,16 +436,27 @@ void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);

CHECK_GE(args.Length(), 3); // Converter, Buffer, Flags
CHECK_GE(args.Length(), 4); // Converter, Buffer, Flags, Encoding

ConverterObject* converter;
ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());

if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() ||
args[1]->IsArrayBufferView())) {
return node::THROW_ERR_INVALID_ARG_TYPE(
env->isolate(),
"The \"input\" argument must be an instance of SharedArrayBuffer, "
"ArrayBuffer or ArrayBufferView.");
}

ArrayBufferViewContents<char> input(args[1]);
int flags = args[2]->Uint32Value(env->context()).ToChecked();

CHECK(args[3]->IsString());
Local<String> from_encoding = args[3].As<String>();

UErrorCode status = U_ZERO_ERROR;
MaybeStackBuffer<UChar> result;
MaybeLocal<Object> ret;

UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;

Expand Down Expand Up @@ -501,23 +512,38 @@ void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
converter->set_bom_seen(true);
}
}
ret = ToBufferEndian(env, &result);
if (omit_initial_bom && !ret.IsEmpty()) {

Local<Value> error;
UChar* output = result.out();
size_t beginning = 0;
size_t length = result.length() * sizeof(UChar);

if (omit_initial_bom) {
// Perform `ret = ret.slice(2)`.
CHECK(ret.ToLocalChecked()->IsUint8Array());
Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
ret = Buffer::New(env,
orig_ret->Buffer(),
orig_ret->ByteOffset() + 2,
orig_ret->ByteLength() - 2)
.FromMaybe(Local<Uint8Array>());
beginning += 2;
length -= 2;
}

char* value = reinterpret_cast<char*>(output) + beginning;

if (IsBigEndian()) {
SwapBytes16(value, length);
}

MaybeLocal<Value> encoded =
StringBytes::Encode(env->isolate(), value, length, UCS2, &error);

Local<Value> ret;
if (encoded.ToLocal(&ret)) {
args.GetReturnValue().Set(ret);
return;
}
if (!ret.IsEmpty())
args.GetReturnValue().Set(ret.ToLocalChecked());
return;
}

args.GetReturnValue().Set(status);
node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
env->isolate(),
"The encoded data was not valid for encoding %s",
*node::Utf8Value(env->isolate(), from_encoding));
}

ConverterObject::ConverterObject(
Expand Down
25 changes: 23 additions & 2 deletions src/util-inl.h
Expand Up @@ -513,8 +513,9 @@ SlicedArguments::SlicedArguments(
template <typename T, size_t S>
ArrayBufferViewContents<T, S>::ArrayBufferViewContents(
v8::Local<v8::Value> value) {
CHECK(value->IsArrayBufferView());
Read(value.As<v8::ArrayBufferView>());
DCHECK(value->IsArrayBufferView() || value->IsSharedArrayBuffer() ||
value->IsArrayBuffer());
ReadValue(value);
}

template <typename T, size_t S>
Expand Down Expand Up @@ -542,6 +543,26 @@ void ArrayBufferViewContents<T, S>::Read(v8::Local<v8::ArrayBufferView> abv) {
}
}

// Fills data_ and length_ from `buf`, which must be an ArrayBufferView, an
// ArrayBuffer or a SharedArrayBuffer. Views are delegated to Read(); for the
// two buffer kinds the pointer returned by Data() and the ByteLength() are
// stored directly (nothing is copied here).
template <typename T, size_t S>
void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
  static_assert(sizeof(T) == 1, "Only supports one-byte data at the moment");
  DCHECK(buf->IsArrayBufferView() || buf->IsSharedArrayBuffer() ||
         buf->IsArrayBuffer());

  if (buf->IsArrayBufferView()) {
    Read(buf.As<v8::ArrayBufferView>());
    return;
  }

  if (buf->IsArrayBuffer()) {
    v8::Local<v8::ArrayBuffer> array_buffer = buf.As<v8::ArrayBuffer>();
    length_ = array_buffer->ByteLength();
    data_ = static_cast<T*>(array_buffer->Data());
    return;
  }

  // Anything else has to be a SharedArrayBuffer; enforced in release builds.
  CHECK(buf->IsSharedArrayBuffer());
  v8::Local<v8::SharedArrayBuffer> shared_buffer =
      buf.As<v8::SharedArrayBuffer>();
  length_ = shared_buffer->ByteLength();
  data_ = static_cast<T*>(shared_buffer->Data());
}

// ECMA262 20.1.2.5
inline bool IsSafeJsInt(v8::Local<v8::Value> v) {
if (!v->IsNumber()) return false;
Expand Down
1 change: 1 addition & 0 deletions src/util.h
Expand Up @@ -505,6 +505,7 @@ class ArrayBufferViewContents {
explicit inline ArrayBufferViewContents(v8::Local<v8::Object> value);
explicit inline ArrayBufferViewContents(v8::Local<v8::ArrayBufferView> abv);
inline void Read(v8::Local<v8::ArrayBufferView> abv);
inline void ReadValue(v8::Local<v8::Value> buf);

inline const T* data() const { return data_; }
inline size_t length() const { return length_; }
Expand Down
2 changes: 1 addition & 1 deletion test/parallel/test-whatwg-encoding-custom-textdecoder.js
Expand Up @@ -113,7 +113,7 @@ if (common.hasIntl) {
' fatal: false,\n' +
' ignoreBOM: true,\n' +
' [Symbol(flags)]: 4,\n' +
' [Symbol(handle)]: Converter {}\n' +
' [Symbol(handle)]: undefined\n' +
'}'
);
} else {
Expand Down