diff --git a/benchmark/util/text-decoder.js b/benchmark/util/text-decoder.js
new file mode 100644
index 00000000000000..3d1ccc34bb8995
--- /dev/null
+++ b/benchmark/util/text-decoder.js
@@ -0,0 +1,41 @@
+'use strict';
+
+const common = require('../common.js');
+
+const bench = common.createBenchmark(main, {
+  encoding: ['utf-8', 'latin1', 'iso-8859-3'],
+  ignoreBOM: [0, 1],
+  len: [256, 1024 * 16, 1024 * 512],
+  n: [1e2],
+  type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer']
+});
+
+// Measures TextDecoder#decode() for each encoding, ignoreBOM setting, input
+// size and input container type.
+function main({ encoding, len, n, ignoreBOM, type }) {
+  const decoder = new TextDecoder(encoding, { ignoreBOM });
+  let buf;
+
+  switch (type) {
+    case 'SharedArrayBuffer': {
+      buf = new SharedArrayBuffer(len);
+      break;
+    }
+    case 'ArrayBuffer': {
+      buf = new ArrayBuffer(len);
+      break;
+    }
+    case 'Buffer': {
+      // Zero-filled like the (Shared)ArrayBuffer cases so every `type`
+      // decodes the same bytes and runs are reproducible.
+      buf = Buffer.alloc(len);
+      break;
+    }
+  }
+
+  bench.start();
+  for (let i = 0; i < n; i++) {
+    decoder.decode(buf);
+  }
+  bench.end(n);
+}
diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js
index f9fa43228d5f57..6ea554d5fbc892 100644
--- a/lib/internal/encoding.js
+++ b/lib/internal/encoding.js
@@ -4,6 +4,7 @@
 // https://encoding.spec.whatwg.org
 
 const {
+  Boolean,
   ObjectCreate,
   ObjectDefineProperties,
   ObjectGetOwnPropertyDescriptors,
@@ -18,7 +19,6 @@ const {
 } = primordials;
 
 const {
-  ERR_ENCODING_INVALID_ENCODED_DATA,
   ERR_ENCODING_NOT_SUPPORTED,
   ERR_INVALID_ARG_TYPE,
   ERR_INVALID_THIS,
@@ -29,6 +29,8 @@ const kFlags = Symbol('flags');
 const kEncoding = Symbol('encoding');
 const kDecoder = Symbol('decoder');
 const kEncoder = Symbol('encoder');
+const kUTF8FastPath = Symbol('kUTF8FastPath');
+const kIgnoreBOM = Symbol('kIgnoreBOM');
 
 const {
   getConstructorOf,
@@ -50,7 +52,8 @@ const {
 
 const {
   encodeInto,
-  encodeUtf8String
+  encodeUtf8String,
+  decodeUTF8,
 } = internalBinding('buffer');
 
 let Buffer;
@@ -398,26 +401,44 @@ function makeTextDecoderICU() {
         flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0;
       }
 
-      const handle = getConverter(enc, flags);
-      if (handle === undefined)
-        throw new ERR_ENCODING_NOT_SUPPORTED(encoding);
+      // Only support fast path for UTF-8 without FATAL flag
+      const fastPathAvailable = enc === 'utf-8' && !(options?.fatal);
 
       this[kDecoder] = true;
-      this[kHandle] = handle;
       this[kFlags] = flags;
       this[kEncoding] = enc;
+      this[kIgnoreBOM] = Boolean(options?.ignoreBOM);
+      this[kUTF8FastPath] = fastPathAvailable;
+      this[kHandle] = undefined;
+
+      if (!fastPathAvailable) {
+        this.#prepareConverter();
+      }
     }
 
+    // Creates the ICU converter on first use; a no-op once a handle exists.
+    // Throws ERR_ENCODING_NOT_SUPPORTED when no converter is available.
+    #prepareConverter() {
+      if (this[kHandle] !== undefined) return;
+      const handle = getConverter(this[kEncoding], this[kFlags]);
+      if (handle === undefined)
+        throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
+      this[kHandle] = handle;
+    }
 
     decode(input = empty, options = kEmptyObject) {
       validateDecoder(this);
-      if (isAnyArrayBuffer(input)) {
-        input = lazyBuffer().from(input);
-      } else if (!isArrayBufferView(input)) {
-        throw new ERR_INVALID_ARG_TYPE('input',
-                                       ['ArrayBuffer', 'ArrayBufferView'],
-                                       input);
+
+      // Once any call requests streaming, the fast path stays disabled for
+      // this decoder; later calls go through the stateful converter.
+      this[kUTF8FastPath] &&= !(options?.stream);
+
+      if (this[kUTF8FastPath]) {
+        return decodeUTF8(input, this[kIgnoreBOM]);
       }
+
+      this.#prepareConverter();
+
       validateObject(options, 'options', {
         nullable: true,
         allowArray: true,
@@ -428,11 +449,7 @@ function makeTextDecoderICU() {
       if (options !== null)
         flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH;
 
-      const ret = _decode(this[kHandle], input, flags);
-      if (typeof ret === 'number') {
-        throw new ERR_ENCODING_INVALID_ENCODED_DATA(this.encoding, ret);
-      }
-      return ret.toString('ucs2');
+      return _decode(this[kHandle], input, flags, this.encoding);
     }
   }
 
diff --git a/src/node_buffer.cc b/src/node_buffer.cc
index eb8e541c68635d..acec3c420ce1d2 100644
--- a/src/node_buffer.cc
+++ b/src/node_buffer.cc
@@ -24,6 +24,7 @@
 #include "node_blob.h"
 #include "node_errors.h"
 #include "node_external_reference.h"
+#include "node_i18n.h"
 #include "node_internals.h"
 
 #include "env-inl.h"
@@ -565,6 +566,48 @@ void StringSlice(const FunctionCallbackInfo<Value>& args) {
   args.GetReturnValue().Set(ret);
 }
 
+// Decodes UTF-8 `input` to a JS string; strips a leading BOM unless ignore_bom.
+void DecodeUTF8(const FunctionCallbackInfo<Value>& args) {
+  Environment* env = Environment::GetCurrent(args);  // input, ignoreBOM
+
+  if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
+        args[0]->IsArrayBufferView())) {
+    return node::THROW_ERR_INVALID_ARG_TYPE(
+        env->isolate(),
+        "The \"input\" argument must be an instance of SharedArrayBuffer, "
+        "ArrayBuffer or ArrayBufferView.");
+  }
+
+  ArrayBufferViewContents<char> buffer(args[0]);
+
+  CHECK(args[1]->IsBoolean());
+  bool ignore_bom = args[1]->IsTrue();
+
+  const char* data = buffer.data();
+  size_t length = buffer.length();
+
+  if (!ignore_bom && length >= 3) {
+    if (memcmp(data, "\xEF\xBB\xBF", 3) == 0) {
+      data += 3;
+      length -= 3;
+    }
+  }
+
+  if (length == 0) return args.GetReturnValue().SetEmptyString();
+
+  Local<Value> error;
+  MaybeLocal<Value> maybe_ret =
+      StringBytes::Encode(env->isolate(), data, length, UTF8, &error);
+  Local<Value> ret;
+
+  if (!maybe_ret.ToLocal(&ret)) {
+    CHECK(!error.IsEmpty());
+    env->isolate()->ThrowException(error);
+    return;
+  }
+
+  args.GetReturnValue().Set(ret);
+}
 
 // bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd])
 void Copy(const FunctionCallbackInfo<Value> &args) {
@@ -1282,6 +1325,7 @@ void Initialize(Local<Object> target,
 
   SetMethod(context, target, "setBufferPrototype", SetBufferPrototype);
   SetMethodNoSideEffect(context, target, "createFromString", CreateFromString);
+  SetMethodNoSideEffect(context, target, "decodeUTF8", DecodeUTF8);
 
   SetMethodNoSideEffect(context, target, "byteLengthUtf8", ByteLengthUtf8);
   SetMethod(context, target, "copy", Copy);
@@ -1339,6 +1383,7 @@ void Initialize(Local<Object> target,
 void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
   registry->Register(SetBufferPrototype);
   registry->Register(CreateFromString);
+  registry->Register(DecodeUTF8);
 
   registry->Register(ByteLengthUtf8);
   registry->Register(Copy);
diff --git a/src/node_errors.h b/src/node_errors.h
index 5587c234862610..926f54286ec72f 100644
--- a/src/node_errors.h
+++ b/src/node_errors.h
@@ -60,6 +60,7 @@ void OOMErrorHandler(const char* location, bool is_heap_oom);
   V(ERR_CRYPTO_JOB_INIT_FAILED, Error)                                         \
   V(ERR_DLOPEN_DISABLED, Error)                                                \
   V(ERR_DLOPEN_FAILED, Error)                                                  \
+  V(ERR_ENCODING_INVALID_ENCODED_DATA, TypeError)                              \
   V(ERR_EXECUTION_ENVIRONMENT_NOT_AVAILABLE, Error)                            \
   V(ERR_INVALID_ADDRESS, Error)                                                \
   V(ERR_INVALID_ARG_VALUE, TypeError)                                          \
diff --git a/src/node_i18n.cc b/src/node_i18n.cc
index 581d52a7d05738..441c2b32763a96 100644
--- a/src/node_i18n.cc
+++ b/src/node_i18n.cc
@@ -50,6 +50,7 @@
 #include "node_buffer.h"
 #include "node_errors.h"
 #include "node_internals.h"
+#include "string_bytes.h"
 #include "util-inl.h"
 #include "v8.h"
 
@@ -96,7 +97,6 @@ using v8::NewStringType;
 using v8::Object;
 using v8::ObjectTemplate;
 using v8::String;
-using v8::Uint8Array;
 using v8::Value;
 
 namespace i18n {
@@ -436,16 +436,27 @@ void ConverterObject::Create(const FunctionCallbackInfo<Value>& args) {
 void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
   Environment* env = Environment::GetCurrent(args);
 
-  CHECK_GE(args.Length(), 3);  // Converter, Buffer, Flags
+  CHECK_GE(args.Length(), 4);  // Converter, Buffer, Flags, Encoding
 
   ConverterObject* converter;
   ASSIGN_OR_RETURN_UNWRAP(&converter, args[0].As<Object>());
+
+  if (!(args[1]->IsArrayBuffer() || args[1]->IsSharedArrayBuffer() ||
+        args[1]->IsArrayBufferView())) {
+    return node::THROW_ERR_INVALID_ARG_TYPE(
+        env->isolate(),
+        "The \"input\" argument must be an instance of SharedArrayBuffer, "
+        "ArrayBuffer or ArrayBufferView.");
+  }
+
   ArrayBufferViewContents<char> input(args[1]);
   int flags = args[2]->Uint32Value(env->context()).ToChecked();
 
+  CHECK(args[3]->IsString());
+  Local<String> from_encoding = args[3].As<String>();
+
   UErrorCode status = U_ZERO_ERROR;
   MaybeStackBuffer<UChar> result;
-  MaybeLocal<Object> ret;
 
   UBool flush = (flags & CONVERTER_FLAGS_FLUSH) == CONVERTER_FLAGS_FLUSH;
 
@@ -501,23 +512,44 @@ void ConverterObject::Decode(const FunctionCallbackInfo<Value>& args) {
         converter->set_bom_seen(true);
       }
     }
-    ret = ToBufferEndian(env, &result);
-    if (omit_initial_bom && !ret.IsEmpty()) {
-      // Perform `ret = ret.slice(2)`.
-      CHECK(ret.ToLocalChecked()->IsUint8Array());
-      Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
-      ret = Buffer::New(env,
-                        orig_ret->Buffer(),
-                        orig_ret->ByteOffset() + 2,
-                        orig_ret->ByteLength() - 2)
-                            .FromMaybe(Local<Uint8Array>());
+
+    Local<Value> error;
+    UChar* output = result.out();
+    size_t beginning = 0;
+    size_t length = result.length() * sizeof(UChar);
+
+    if (omit_initial_bom) {
+      // Skip the leading BOM: one UTF-16 code unit (2 bytes).
+      beginning += 2;
+      length -= 2;
+    }
+
+    char* value = reinterpret_cast<char*>(output) + beginning;
+
+    if (IsBigEndian()) {
+      SwapBytes16(value, length);
+    }
+
+    MaybeLocal<Value> encoded =
+        StringBytes::Encode(env->isolate(), value, length, UCS2, &error);
+
+    Local<Value> ret;
+    if (!encoded.ToLocal(&ret)) {
+      // Creating the string failed (not a decoding error): rethrow the real
+      // error instead of reporting the input as invalid encoded data.
+      CHECK(!error.IsEmpty());
+      env->isolate()->ThrowException(error);
+      return;
     }
-    if (!ret.IsEmpty())
-      args.GetReturnValue().Set(ret.ToLocalChecked());
+
+    args.GetReturnValue().Set(ret);
     return;
   }
 
-  args.GetReturnValue().Set(status);
+  node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
+      env->isolate(),
+      "The encoded data was not valid for encoding %s",
+      *node::Utf8Value(env->isolate(), from_encoding));
 }
 
 ConverterObject::ConverterObject(
diff --git a/src/util-inl.h b/src/util-inl.h
index bdf9732e853485..18f829d2775f0d 100644
--- a/src/util-inl.h
+++ b/src/util-inl.h
@@ -513,8 +513,9 @@ SlicedArguments::SlicedArguments(
 template <typename T, size_t S>
 ArrayBufferViewContents<T, S>::ArrayBufferViewContents(
     v8::Local<v8::Value> value) {
-  CHECK(value->IsArrayBufferView());
-  Read(value.As<v8::ArrayBufferView>());
+  DCHECK(value->IsArrayBufferView() || value->IsSharedArrayBuffer() ||
+         value->IsArrayBuffer());
+  ReadValue(value);
 }
 
 template <typename T, size_t S>
@@ -542,6 +543,28 @@ void ArrayBufferViewContents<T, S>::Read(v8::Local<v8::ArrayBufferView> abv) {
   }
 }
 
+// Accepts an ArrayBufferView, ArrayBuffer or SharedArrayBuffer. For the two
+// buffer kinds, data_ points directly at the buffer's Data() (no copy).
+template <typename T, size_t S>
+void ArrayBufferViewContents<T, S>::ReadValue(v8::Local<v8::Value> buf) {
+  static_assert(sizeof(T) == 1, "Only supports one-byte data at the moment");
+  DCHECK(buf->IsArrayBufferView() || buf->IsSharedArrayBuffer() ||
+         buf->IsArrayBuffer());
+
+  if (buf->IsArrayBufferView()) {
+    Read(buf.As<v8::ArrayBufferView>());
+  } else if (buf->IsArrayBuffer()) {
+    auto ab = buf.As<v8::ArrayBuffer>();
+    length_ = ab->ByteLength();
+    data_ = static_cast<T*>(ab->Data());
+  } else {
+    CHECK(buf->IsSharedArrayBuffer());
+    auto sab = buf.As<v8::SharedArrayBuffer>();
+    length_ = sab->ByteLength();
+    data_ = static_cast<T*>(sab->Data());
+  }
+}
+
 // ECMA262 20.1.2.5
 inline bool IsSafeJsInt(v8::Local<v8::Value> v) {
   if (!v->IsNumber()) return false;
diff --git a/src/util.h b/src/util.h
index 0fe821e9fa97ce..7e02c232de4f1b 100644
--- a/src/util.h
+++ b/src/util.h
@@ -505,6 +505,7 @@ class ArrayBufferViewContents {
   explicit inline ArrayBufferViewContents(v8::Local<v8::Object> value);
   explicit inline ArrayBufferViewContents(v8::Local<v8::ArrayBufferView> abv);
   inline void Read(v8::Local<v8::ArrayBufferView> abv);
+  inline void ReadValue(v8::Local<v8::Value> buf);
 
   inline const T* data() const { return data_; }
   inline size_t length() const { return length_; }
diff --git a/test/parallel/test-whatwg-encoding-custom-textdecoder.js b/test/parallel/test-whatwg-encoding-custom-textdecoder.js
index 74c6a002223255..f4ad73e1c61da8 100644
--- a/test/parallel/test-whatwg-encoding-custom-textdecoder.js
+++ b/test/parallel/test-whatwg-encoding-custom-textdecoder.js
@@ -113,7 +113,7 @@ if (common.hasIntl) {
       '  fatal: false,\n' +
       '  ignoreBOM: true,\n' +
       '  [Symbol(flags)]: 4,\n' +
-      '  [Symbol(handle)]: Converter {}\n' +
+      '  [Symbol(handle)]: undefined\n' +
       '}'
     );
   } else {