From fab72b1677c94818d6185a4c5e6954d803c064c7 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Tue, 10 Jan 2023 12:25:19 +0100 Subject: [PATCH] src: use simdutf for converting externalized builtins to UTF-16 Remove the dependency on ICU for this part, as well as the hacky way of converting embedder main sources to UTF-8 via V8 APIs. Allow `UnionBytes` to own the memory its pointing to in order to simplify the code on the `BuiltinLoader` side. PR-URL: https://github.com/nodejs/node/pull/46119 Reviewed-By: Yagiz Nizipli Reviewed-By: James M Snell Reviewed-By: Rafael Gonzaga Reviewed-By: Darshan Sen --- configure.py | 6 +--- node.gyp | 1 + src/api/environment.cc | 15 ++-------- src/env-inl.h | 5 ---- src/env.h | 6 ---- src/node_builtins.cc | 26 ++++++++-------- src/node_builtins.h | 9 +----- src/node_union_bytes.h | 64 ++++++---------------------------------- src/util.cc | 62 ++++++++++++++++++++++++++++++++++++++ test/cctest/test_util.cc | 17 ++++++++++- 10 files changed, 106 insertions(+), 105 deletions(-) diff --git a/configure.py b/configure.py index e146f67f639ca3..0a45c07f587ed8 100755 --- a/configure.py +++ b/configure.py @@ -2077,11 +2077,7 @@ def make_bin_override(): for builtin in shareable_builtins: builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path' if getattr(options, builtin_id): - if options.with_intl == 'none': - option_name = '--shared-builtin-' + builtin + '-path' - error(option_name + ' is incompatible with --with-intl=none' ) - else: - output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)] + output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)] else: output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]] diff --git a/node.gyp b/node.gyp index 498df0ad3a735d..21e3ed50befd58 100644 --- a/node.gyp +++ b/node.gyp @@ -1211,6 +1211,7 @@ 'node_dtrace_header', 'node_dtrace_ustack', 'node_dtrace_provider', + 'deps/simdutf/simdutf.gyp:simdutf', ], 'includes': [ diff --git a/src/api/environment.cc b/src/api/environment.cc index 339b9a1822f3e9..5e4f69921b2635 100644 --- a/src/api/environment.cc +++ b/src/api/environment.cc @@ -464,21 +464,10 @@ MaybeLocal LoadEnvironment( Environment* env, const char* main_script_source_utf8) { CHECK_NOT_NULL(main_script_source_utf8); - Isolate* isolate = env->isolate(); return LoadEnvironment( - env, - [&](const StartExecutionCallbackInfo& info) -> MaybeLocal { - // This is a slightly hacky way to convert UTF-8 to UTF-16. - Local str = - String::NewFromUtf8(isolate, - main_script_source_utf8).ToLocalChecked(); - auto main_utf16 = std::make_unique(isolate, str); - - // TODO(addaleax): Avoid having a global table for all scripts. + env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal { std::string name = "embedder_main_" + std::to_string(env->thread_id()); - builtins::BuiltinLoader::Add( - name.c_str(), UnionBytes(**main_utf16, main_utf16->length())); - env->set_main_utf16(std::move(main_utf16)); + builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8); Realm* realm = env->principal_realm(); // Arguments must match the parameters specified in diff --git a/src/env-inl.h b/src/env-inl.h index 65710f7670c0ac..bea695bd8c92d4 100644 --- a/src/env-inl.h +++ b/src/env-inl.h @@ -799,11 +799,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) { cleanup_queue_.Remove(fn, arg); } -void Environment::set_main_utf16(std::unique_ptr str) { - CHECK(!main_utf16_); - main_utf16_ = std::move(str); -} - void Environment::set_process_exit_handler( std::function&& handler) { process_exit_handler_ = std::move(handler); diff --git a/src/env.h b/src/env.h index bcbaf761914c95..6cfba33607e216 100644 --- a/src/env.h +++ b/src/env.h @@ -950,7 +950,6 @@ class Environment : public MemoryRetainer { #endif // HAVE_INSPECTOR - inline void set_main_utf16(std::unique_ptr); inline void set_process_exit_handler( std::function&& handler); @@ -1122,11 +1121,6 @@ class Environment : public MemoryRetainer { std::unique_ptr principal_realm_ = nullptr; - // Keeps the main script source alive is one was passed to LoadEnvironment(). - // We should probably find a way to just use plain `v8::String`s created from - // the source passed to LoadEnvironment() directly instead. - std::unique_ptr main_utf16_; - // Used by allocate_managed_buffer() and release_managed_buffer() to keep // track of the BackingStore for a given pointer. std::unordered_map> diff --git a/src/node_builtins.cc b/src/node_builtins.cc index 9d82a06adcefe3..ca2efa633be88d 100644 --- a/src/node_builtins.cc +++ b/src/node_builtins.cc @@ -3,6 +3,7 @@ #include "env-inl.h" #include "node_external_reference.h" #include "node_internals.h" +#include "simdutf.h" #include "util-inl.h" namespace node { @@ -32,7 +33,6 @@ BuiltinLoader BuiltinLoader::instance_; BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) { LoadJavaScriptSource(); -#if defined(NODE_HAVE_I18N_SUPPORT) #ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH AddExternalizedBuiltin( "internal/deps/cjs-module-lexer/lexer", @@ -49,7 +49,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) { AddExternalizedBuiltin("internal/deps/undici/undici", STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH)); #endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH -#endif // NODE_HAVE_I18N_SUPPORT } BuiltinLoader* BuiltinLoader::GetInstance() { @@ -237,7 +236,6 @@ MaybeLocal BuiltinLoader::LoadBuiltinSource(Isolate* isolate, #endif // NODE_BUILTIN_MODULES_PATH } -#if defined(NODE_HAVE_I18N_SUPPORT) void BuiltinLoader::AddExternalizedBuiltin(const char* id, const char* filename) { std::string source; @@ -249,16 +247,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id, return; } - icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8( - icu::StringPiece(source.data(), source.length())); - auto source_utf16 = std::make_unique(utf16); - Add(id, - UnionBytes(reinterpret_cast((*source_utf16).getBuffer()), - utf16.length())); - // keep source bytes for builtin alive while BuiltinLoader exists - GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16)); + Add(id, source); +} + +bool BuiltinLoader::Add(const char* id, std::string_view utf8source) { + size_t expected_u16_length = + simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length()); + auto out = std::make_shared>(expected_u16_length); + size_t u16_length = simdutf::convert_utf8_to_utf16le( + utf8source.data(), + utf8source.length(), + reinterpret_cast(out->data())); + out->resize(u16_length); + return Add(id, UnionBytes(out)); } -#endif // NODE_HAVE_I18N_SUPPORT // Returns Local of the compiled module if return_code_cache // is false (we are only compiling the function). diff --git a/src/node_builtins.h b/src/node_builtins.h index a6fbff98ccb0a7..90b158b84bb2a6 100644 --- a/src/node_builtins.h +++ b/src/node_builtins.h @@ -8,9 +8,6 @@ #include #include #include -#if defined(NODE_HAVE_I18N_SUPPORT) -#include -#endif // NODE_HAVE_I18N_SUPPORT #include #include "node_mutex.h" #include "node_union_bytes.h" @@ -59,6 +56,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader { static v8::Local GetConfigString(v8::Isolate* isolate); static bool Exists(const char* id); static bool Add(const char* id, const UnionBytes& source); + static bool Add(const char* id, std::string_view utf8source); static bool CompileAllBuiltins(v8::Local context); static void RefreshCodeCache(const std::vector& in); @@ -124,18 +122,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader { static void HasCachedBuiltins( const v8::FunctionCallbackInfo& args); -#if defined(NODE_HAVE_I18N_SUPPORT) static void AddExternalizedBuiltin(const char* id, const char* filename); -#endif // NODE_HAVE_I18N_SUPPORT static BuiltinLoader instance_; BuiltinCategories builtin_categories_; BuiltinSourceMap source_; BuiltinCodeCacheMap code_cache_; UnionBytes config_; -#if defined(NODE_HAVE_I18N_SUPPORT) - std::list> externalized_source_bytes_; -#endif // NODE_HAVE_I18N_SUPPORT // Used to synchronize access to the code cache map Mutex code_cache_mutex_; diff --git a/src/node_union_bytes.h b/src/node_union_bytes.h index d296e67f005f04..6e1a3afb614323 100644 --- a/src/node_union_bytes.h +++ b/src/node_union_bytes.h @@ -11,57 +11,20 @@ namespace node { -class NonOwningExternalOneByteResource - : public v8::String::ExternalOneByteStringResource { - public: - explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length) - : data_(data), length_(length) {} - ~NonOwningExternalOneByteResource() override = default; - - const char* data() const override { - return reinterpret_cast(data_); - } - size_t length() const override { return length_; } - - NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) = - delete; - NonOwningExternalOneByteResource& operator=( - const NonOwningExternalOneByteResource&) = delete; - - private: - const uint8_t* data_; - size_t length_; -}; - -class NonOwningExternalTwoByteResource - : public v8::String::ExternalStringResource { - public: - explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length) - : data_(data), length_(length) {} - ~NonOwningExternalTwoByteResource() override = default; - - const uint16_t* data() const override { return data_; } - size_t length() const override { return length_; } - - NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) = - delete; - NonOwningExternalTwoByteResource& operator=( - const NonOwningExternalTwoByteResource&) = delete; - - private: - const uint16_t* data_; - size_t length_; -}; - // Similar to a v8::String, but it's independent from Isolates // and can be materialized in Isolates as external Strings -// via ToStringChecked. The data pointers are owned by the caller. +// via ToStringChecked. class UnionBytes { public: UnionBytes(const uint16_t* data, size_t length) : one_bytes_(nullptr), two_bytes_(data), length_(length) {} UnionBytes(const uint8_t* data, size_t length) : one_bytes_(data), two_bytes_(nullptr), length_(length) {} + template // T = uint8_t or uint16_t + explicit UnionBytes(std::shared_ptr> data) + : UnionBytes(data->data(), data->size()) { + owning_ptr_ = data; + } UnionBytes(const UnionBytes&) = default; UnionBytes& operator=(const UnionBytes&) = default; @@ -77,23 +40,14 @@ class UnionBytes { CHECK_NOT_NULL(one_bytes_); return one_bytes_; } - v8::Local ToStringChecked(v8::Isolate* isolate) const { - if (is_one_byte()) { - NonOwningExternalOneByteResource* source = - new NonOwningExternalOneByteResource(one_bytes_data(), length_); - return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked(); - } else { - NonOwningExternalTwoByteResource* source = - new NonOwningExternalTwoByteResource(two_bytes_data(), length_); - return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked(); - } - } - size_t length() { return length_; } + v8::Local ToStringChecked(v8::Isolate* isolate) const; + size_t length() const { return length_; } private: const uint8_t* one_bytes_; const uint16_t* two_bytes_; size_t length_; + std::shared_ptr owning_ptr_; }; } // namespace node diff --git a/src/util.cc b/src/util.cc index 584a6a32d6cc5b..d2afd9323017e5 100644 --- a/src/util.cc +++ b/src/util.cc @@ -475,4 +475,66 @@ void SetConstructorFunction(Local context, that->Set(context, name, tmpl->GetFunction(context).ToLocalChecked()).Check(); } +namespace { + +class NonOwningExternalOneByteResource + : public v8::String::ExternalOneByteStringResource { + public: + explicit NonOwningExternalOneByteResource(const UnionBytes& source) + : source_(source) {} + ~NonOwningExternalOneByteResource() override = default; + + const char* data() const override { + return reinterpret_cast(source_.one_bytes_data()); + } + size_t length() const override { return source_.length(); } + + NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) = + delete; + NonOwningExternalOneByteResource& operator=( + const NonOwningExternalOneByteResource&) = delete; + + private: + const UnionBytes source_; +}; + +class NonOwningExternalTwoByteResource + : public v8::String::ExternalStringResource { + public: + explicit NonOwningExternalTwoByteResource(const UnionBytes& source) + : source_(source) {} + ~NonOwningExternalTwoByteResource() override = default; + + const uint16_t* data() const override { return source_.two_bytes_data(); } + size_t length() const override { return source_.length(); } + + NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) = + delete; + NonOwningExternalTwoByteResource& operator=( + const NonOwningExternalTwoByteResource&) = delete; + + private: + const UnionBytes source_; +}; + +} // anonymous namespace + +Local UnionBytes::ToStringChecked(Isolate* isolate) const { + if (UNLIKELY(length() == 0)) { + // V8 requires non-null data pointers for empty external strings, + // but we don't guarantee that. Solve this by not creating an + // external string at all in that case. + return String::Empty(isolate); + } + if (is_one_byte()) { + NonOwningExternalOneByteResource* source = + new NonOwningExternalOneByteResource(*this); + return String::NewExternalOneByte(isolate, source).ToLocalChecked(); + } else { + NonOwningExternalTwoByteResource* source = + new NonOwningExternalTwoByteResource(*this); + return String::NewExternalTwoByte(isolate, source).ToLocalChecked(); + } +} + } // namespace node diff --git a/test/cctest/test_util.cc b/test/cctest/test_util.cc index 75861ba00d52ba..443a03117c09fc 100644 --- a/test/cctest/test_util.cc +++ b/test/cctest/test_util.cc @@ -1,7 +1,8 @@ -#include "util-inl.h" #include "debug_utils-inl.h" #include "env-inl.h" #include "gtest/gtest.h" +#include "simdutf.h" +#include "util-inl.h" using node::Calloc; using node::Malloc; @@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) { const std::string with_zero = std::string("a") + '\0' + 'b'; EXPECT_EQ(SPrintF("%s", with_zero), with_zero); } + +TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) { + // In simdutf, "LE" does *not* refer to Little Endian, it refers + // to 16-byte code units that are stored using *host* endianness. + // This is weird and confusing naming, and so we add this assertion + // here to verify that this is actually the case (so that CI tells + // us if it changed, because for most people Little Endian is + // host endianness, so locally everything would work fine). + const char utf8source[] = "\xe7\x8c\xab"; + char16_t u16output; + size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output); + EXPECT_EQ(u16len, 1u); + EXPECT_EQ(u16output, 0x732B); +}