Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

src: use simdutf for converting externalized builtins to UTF-16 #46119

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 1 addition & 5 deletions configure.py
Expand Up @@ -2024,11 +2024,7 @@ def make_bin_override():
for builtin in shareable_builtins:
builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path'
if getattr(options, builtin_id):
if options.with_intl == 'none':
option_name = '--shared-builtin-' + builtin + '-path'
error(option_name + ' is incompatible with --with-intl=none' )
else:
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)]
else:
output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]]

Expand Down
1 change: 1 addition & 0 deletions node.gyp
Expand Up @@ -973,6 +973,7 @@
'deps/googletest/googletest.gyp:gtest_main',
'deps/histogram/histogram.gyp:histogram',
'deps/uvwasi/uvwasi.gyp:uvwasi',
'deps/simdutf/simdutf.gyp:simdutf',
],

'includes': [
Expand Down
15 changes: 2 additions & 13 deletions src/api/environment.cc
Expand Up @@ -467,21 +467,10 @@ MaybeLocal<Value> LoadEnvironment(
Environment* env,
const char* main_script_source_utf8) {
CHECK_NOT_NULL(main_script_source_utf8);
Isolate* isolate = env->isolate();
return LoadEnvironment(
env,
[&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
// This is a slightly hacky way to convert UTF-8 to UTF-16.
Local<String> str =
String::NewFromUtf8(isolate,
main_script_source_utf8).ToLocalChecked();
auto main_utf16 = std::make_unique<String::Value>(isolate, str);

// TODO(addaleax): Avoid having a global table for all scripts.
env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal<Value> {
std::string name = "embedder_main_" + std::to_string(env->thread_id());
builtins::BuiltinLoader::Add(
name.c_str(), UnionBytes(**main_utf16, main_utf16->length()));
env->set_main_utf16(std::move(main_utf16));
builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8);
Realm* realm = env->principal_realm();

return realm->ExecuteBootstrapper(name.c_str());
Expand Down
5 changes: 0 additions & 5 deletions src/env-inl.h
Expand Up @@ -805,11 +805,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) {
cleanup_queue_.Remove(fn, arg);
}

void Environment::set_main_utf16(std::unique_ptr<v8::String::Value> str) {
CHECK(!main_utf16_);
main_utf16_ = std::move(str);
}

void Environment::set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler) {
process_exit_handler_ = std::move(handler);
Expand Down
6 changes: 0 additions & 6 deletions src/env.h
Expand Up @@ -961,7 +961,6 @@ class Environment : public MemoryRetainer {

#endif // HAVE_INSPECTOR

inline void set_main_utf16(std::unique_ptr<v8::String::Value>);
inline void set_process_exit_handler(
std::function<void(Environment*, ExitCode)>&& handler);

Expand Down Expand Up @@ -1144,11 +1143,6 @@ class Environment : public MemoryRetainer {

std::unique_ptr<Realm> principal_realm_ = nullptr;

// Keeps the main script source alive is one was passed to LoadEnvironment().
// We should probably find a way to just use plain `v8::String`s created from
// the source passed to LoadEnvironment() directly instead.
std::unique_ptr<v8::String::Value> main_utf16_;

// Used by allocate_managed_buffer() and release_managed_buffer() to keep
// track of the BackingStore for a given pointer.
std::unordered_map<char*, std::unique_ptr<v8::BackingStore>>
Expand Down
25 changes: 13 additions & 12 deletions src/node_builtins.cc
Expand Up @@ -3,6 +3,7 @@
#include "env-inl.h"
#include "node_external_reference.h"
#include "node_internals.h"
#include "simdutf.h"
#include "util-inl.h"

namespace node {
Expand Down Expand Up @@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_;

BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
LoadJavaScriptSource();
#if defined(NODE_HAVE_I18N_SUPPORT)
#ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH
AddExternalizedBuiltin(
"internal/deps/cjs-module-lexer/lexer",
Expand All @@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) {
AddExternalizedBuiltin("internal/deps/undici/undici",
STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH));
#endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH
#endif // NODE_HAVE_I18N_SUPPORT
}

BuiltinLoader* BuiltinLoader::GetInstance() {
Expand Down Expand Up @@ -240,7 +239,6 @@ MaybeLocal<String> BuiltinLoader::LoadBuiltinSource(Isolate* isolate,
#endif // NODE_BUILTIN_MODULES_PATH
}

#if defined(NODE_HAVE_I18N_SUPPORT)
void BuiltinLoader::AddExternalizedBuiltin(const char* id,
const char* filename) {
std::string source;
Expand All @@ -252,16 +250,19 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id,
return;
}

icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8(
icu::StringPiece(source.data(), source.length()));
auto source_utf16 = std::make_unique<icu::UnicodeString>(utf16);
Add(id,
UnionBytes(reinterpret_cast<const uint16_t*>((*source_utf16).getBuffer()),
utf16.length()));
// keep source bytes for builtin alive while BuiltinLoader exists
GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16));
Add(id, source);
}

bool BuiltinLoader::Add(const char* id, std::string_view utf8source) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can just create a UnionBytes constructor that takes a std::string_view that does this? (then I guess that UnionBytes can own the vector?))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, UnionBytes that supports storing UTF-8 directly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fwiw, I’ll merge this in order to unblock #45942, but I’m still happy to iterate further here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A union for everything ;)

size_t expected_u16_length =
simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length());
std::shared_ptr<uint16_t[]> out{new uint16_t[expected_u16_length]};
size_t u16_length =
simdutf::convert_utf8_to_utf16le(utf8source.data(),
utf8source.length(),
reinterpret_cast<char16_t*>(out.get()));
return Add(id, UnionBytes(out, u16_length));
}
#endif // NODE_HAVE_I18N_SUPPORT

// Returns Local<Function> of the compiled module if return_code_cache
// is false (we are only compiling the function).
Expand Down
9 changes: 1 addition & 8 deletions src/node_builtins.h
Expand Up @@ -8,9 +8,6 @@
#include <memory>
#include <set>
#include <string>
#if defined(NODE_HAVE_I18N_SUPPORT)
#include <unicode/unistr.h>
#endif // NODE_HAVE_I18N_SUPPORT
#include <vector>
#include "node_mutex.h"
#include "node_union_bytes.h"
Expand Down Expand Up @@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static v8::Local<v8::String> GetConfigString(v8::Isolate* isolate);
static bool Exists(const char* id);
static bool Add(const char* id, const UnionBytes& source);
static bool Add(const char* id, std::string_view utf8source);

static bool CompileAllBuiltins(v8::Local<v8::Context> context);
static void RefreshCodeCache(const std::vector<CodeCacheInfo>& in);
Expand Down Expand Up @@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader {
static void HasCachedBuiltins(
const v8::FunctionCallbackInfo<v8::Value>& args);

#if defined(NODE_HAVE_I18N_SUPPORT)
static void AddExternalizedBuiltin(const char* id, const char* filename);
#endif // NODE_HAVE_I18N_SUPPORT

static BuiltinLoader instance_;
BuiltinCategories builtin_categories_;
BuiltinSourceMap source_;
BuiltinCodeCacheMap code_cache_;
UnionBytes config_;
#if defined(NODE_HAVE_I18N_SUPPORT)
std::list<std::unique_ptr<icu::UnicodeString>> externalized_source_bytes_;
#endif // NODE_HAVE_I18N_SUPPORT

// Used to synchronize access to the code cache map
Mutex code_cache_mutex_;
Expand Down
8 changes: 7 additions & 1 deletion src/node_union_bytes.h
Expand Up @@ -55,13 +55,18 @@ class NonOwningExternalTwoByteResource

// Similar to a v8::String, but it's independent from Isolates
// and can be materialized in Isolates as external Strings
// via ToStringChecked. The data pointers are owned by the caller.
// via ToStringChecked.
class UnionBytes {
public:
UnionBytes(const uint16_t* data, size_t length)
: one_bytes_(nullptr), two_bytes_(data), length_(length) {}
UnionBytes(const uint8_t* data, size_t length)
: one_bytes_(data), two_bytes_(nullptr), length_(length) {}
template <typename T> // T = uint8_t or uint16_t
UnionBytes(std::shared_ptr<T[]> data, size_t length)
: UnionBytes(data.get(), length) {
owning_ptr_ = data;
}

UnionBytes(const UnionBytes&) = default;
UnionBytes& operator=(const UnionBytes&) = default;
Expand Down Expand Up @@ -94,6 +99,7 @@ class UnionBytes {
const uint8_t* one_bytes_;
const uint16_t* two_bytes_;
size_t length_;
std::shared_ptr<void> owning_ptr_;
};

} // namespace node
Expand Down
17 changes: 16 additions & 1 deletion test/cctest/test_util.cc
@@ -1,7 +1,8 @@
#include "util-inl.h"
#include "debug_utils-inl.h"
#include "env-inl.h"
#include "gtest/gtest.h"
#include "simdutf.h"
#include "util-inl.h"

using node::Calloc;
using node::Malloc;
Expand Down Expand Up @@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) {
const std::string with_zero = std::string("a") + '\0' + 'b';
EXPECT_EQ(SPrintF("%s", with_zero), with_zero);
}

TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) {
// In simdutf, "LE" does *not* refer to Little Endian, it refers
anonrig marked this conversation as resolved.
Show resolved Hide resolved
// to 16-byte code units that are stored using *host* endianness.
// This is weird and confusing naming, and so we add this assertion
// here to verify that this is actually the case (so that CI tells
// us if it changed, because for most people Little Endian is
// host endianness, so locally everything would work fine).
const char utf8source[] = "\xe7\x8c\xab";
char16_t u16output;
size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output);
EXPECT_EQ(u16len, 1u);
EXPECT_EQ(u16output, 0x732B);
}