Skip to content

Commit

Permalink
Precompute embedded string literals hash code
Browse files Browse the repository at this point in the history
With embedded strings, we often have some space left in the slot, which
we can use to store the string's hash code.

It's probably only worth it for string literals, as they are the ones
likely to be used as hash keys.

We chose to store the hash code right after the string terminator so as to
make it easy/fast to compute, and to avoid requiring one more union in RString.

```
compare-ruby: ruby 3.4.0dev (2024-04-22T06:32:21Z main f77618c) [arm64-darwin23]
built-ruby: ruby 3.4.0dev (2024-04-22T10:13:03Z interned-string-ha.. 8a1a32331b) [arm64-darwin23]
last_commit=Precompute embedded string literals hash code

|            |compare-ruby|built-ruby|
|:-----------|-----------:|---------:|
|symbol      |     39.275M|   39.753M|
|            |           -|     1.01x|
|dyn_symbol  |     37.348M|   37.704M|
|            |           -|     1.01x|
|small_lit   |     29.514M|   33.948M|
|            |           -|     1.15x|
|frozen_lit  |     27.180M|   33.056M|
|            |           -|     1.22x|
|iseq_lit    |     27.391M|   32.242M|
|            |           -|     1.18x|
```

Co-Authored-By: Étienne Barrié <etienne.barrie@gmail.com>
  • Loading branch information
byroot and etiennebarrie committed Apr 22, 2024
1 parent f77618c commit 5281aa9
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 13 deletions.
20 changes: 20 additions & 0 deletions benchmark/hash_aref_str_lit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Benchmarks Hash#[] lookups keyed by symbols and by frozen string literals.
prelude: |
  # frozen_string_literal: true
  hash = 10.times.to_h do |i|
    [i, i]
  end
  dyn_sym = "dynamic_symbol".to_sym
  binary = RubyVM::InstructionSequence.compile("# frozen_string_literal: true\n'iseq_load'").to_binary
  iseq_literal_string = RubyVM::InstructionSequence.load_from_binary(binary).eval
  hash[:some_symbol] = 1
  hash[dyn_sym] = 2
  hash["small"] = 3
  hash["frozen_string_literal"] = 4
  hash[iseq_literal_string] = 5
benchmark:
  symbol: hash[:some_symbol]
  dyn_symbol: hash[dyn_sym]
  small_lit: hash["small"]
  frozen_lit: hash["frozen_string_literal"]
  iseq_lit: hash[iseq_literal_string]
2 changes: 1 addition & 1 deletion compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -13501,7 +13501,7 @@ ibf_load_object_string(const struct ibf_load *load, const struct ibf_object_head

VALUE str;
if (header->frozen && !header->internal) {
str = rb_enc_interned_str(ptr, len, rb_enc_from_index(encindex));
str = rb_enc_literal_str(ptr, len, rb_enc_from_index(encindex));
}
else {
str = rb_enc_str_new(ptr, len, rb_enc_from_index(encindex));
Expand Down
1 change: 1 addition & 0 deletions internal/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ VALUE rb_str_concat_literals(size_t num, const VALUE *strary);
VALUE rb_str_eql(VALUE str1, VALUE str2);
VALUE rb_id_quote_unprintable(ID);
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc);
VALUE rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc);

struct rb_execution_context_struct;
VALUE rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled);
Expand Down
2 changes: 1 addition & 1 deletion prism_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ parse_static_literal_string(rb_iseq_t *iseq, const pm_scope_node_t *scope_node,
encoding = scope_node->encoding;
}

VALUE value = rb_enc_interned_str((const char *) pm_string_source(string), pm_string_length(string), encoding);
VALUE value = rb_enc_literal_str((const char *) pm_string_source(string), pm_string_length(string), encoding);
rb_enc_str_coderange(value);

if (ISEQ_COMPILE_DATA(iseq)->option->debug_frozen_string_literal || RTEST(ruby_debug)) {
Expand Down
2 changes: 1 addition & 1 deletion ruby_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,7 @@ rb_parser_build_script_lines_from(rb_parser_ary_t *lines)
VALUE
rb_str_new_parser_string(rb_parser_string_t *str)
{
VALUE string = rb_enc_interned_str(str->ptr, str->len, str->enc);
VALUE string = rb_enc_literal_str(str->ptr, str->len, str->enc);
rb_enc_str_coderange(string);
return string;
}
Expand Down
73 changes: 63 additions & 10 deletions string.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ VALUE rb_cSymbol;
* another string (the shared root).
* 3: STR_CHILLED (will be frozen in a future version)
* The string appears frozen but can be mutated with a warning.
* 4: STR_PRECOMPUTED_HASH
* The string is embedded and its precomputed hash code is stored
* right after the string terminator.
* 5: STR_SHARED_ROOT
* Other strings may point to the contents of this string. When this
* flag is set, STR_SHARED must not be set.
Expand Down Expand Up @@ -116,6 +118,7 @@ VALUE rb_cSymbol;
*/

#define RUBY_MAX_CHAR_LEN 16
#define STR_PRECOMPUTED_HASH FL_USER4
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED FL_USER6
#define STR_TMPLOCK FL_USER7
Expand Down Expand Up @@ -257,6 +260,7 @@ static VALUE str_new(VALUE klass, const char *ptr, long len);
static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
static inline void str_modifiable(VALUE str);
static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
static inline VALUE str_alloc_embed(VALUE klass, size_t capa);

static inline void
str_make_independent(VALUE str)
Expand Down Expand Up @@ -334,7 +338,7 @@ mustnot_wchar(VALUE str)

static int fstring_cmp(VALUE a, VALUE b);

static VALUE register_fstring(VALUE str, bool copy);
static VALUE register_fstring(VALUE str, bool copy, bool precompute_hash);

const struct st_hash_type rb_fstring_hash_type = {
fstring_cmp,
Expand All @@ -343,9 +347,28 @@ const struct st_hash_type rb_fstring_hash_type = {

#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)

/*
 * Computes the hash of the embedded string `str`, caches it in the unused
 * bytes that follow the string terminator, and sets STR_PRECOMPUTED_HASH.
 *
 * The caller must pass an embedded string that does not carry the flag yet
 * and that has at least sizeof(st_index_t) free bytes after the terminator
 * (the free-space requirement is only checked when RUBY_DEBUG is enabled).
 *
 * Returns `str`.
 */
static VALUE
str_precompute_hash(VALUE str)
{
    RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH));
    RUBY_ASSERT(STR_EMBED_P(str));

#if RUBY_DEBUG
    size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str));
    size_t free_bytes = str_embed_capa(str) - used_bytes;
    RUBY_ASSERT(free_bytes >= sizeof(st_index_t));
#endif

    /* Compute the hash while the flag is still clear: once it is set,
     * rb_str_hash() returns the stored value instead of hashing. */
    st_index_t hash = rb_str_hash(str);

    /* The slot starts at byte offset len + termlen, which is not guaranteed
     * to be aligned for st_index_t. Copy the bytes instead of storing
     * through a cast pointer, which would be a misaligned access. */
    memcpy(RSTRING_END(str) + TERM_LEN(str), &hash, sizeof(hash));
    FL_SET(str, STR_PRECOMPUTED_HASH);

    return str;
}

/*
 * Options bundle built by register_fstring() (its `args` local) and read by
 * fstr_update_callback() through its `arg` pointer.
 */
struct fstr_update_arg {
/* NOTE(review): presumably carries the resulting fstring back to the
 * caller; its use is not visible in this view -- confirm. */
VALUE fstr;
/* When set and the looked-up string is a fake string (STR_FAKESTR), the
 * callback allocates a new String instead of using the fake one. */
bool copy;
/* When set, and the copy fits in an embedded slot with sizeof(st_index_t)
 * extra bytes, the copy is allocated embedded and its hash is stored via
 * str_precompute_hash(). */
bool precompute_hash;
};

static int
Expand All @@ -370,7 +393,20 @@ fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int exist
else {
if (FL_TEST_RAW(str, STR_FAKESTR)) {
if (arg->copy) {
VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
VALUE new_str;
long len = RSTRING_LEN(str);
long capa = len + sizeof(st_index_t);
int term_len = TERM_LEN(str);

if (arg->precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) {
new_str = str_alloc_embed(rb_cString, capa + term_len);
memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len);
STR_SET_LEN(new_str, RSTRING_LEN(str));
str_precompute_hash(new_str);
}
else {
new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
}
rb_enc_copy(new_str, str);
str = new_str;
}
Expand Down Expand Up @@ -428,7 +464,7 @@ rb_fstring(VALUE str)
if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED))
rb_str_resize(str, RSTRING_LEN(str));

fstr = register_fstring(str, FALSE);
fstr = register_fstring(str, false, false);

if (!bare) {
str_replace_shared_without_enc(str, fstr);
Expand All @@ -439,10 +475,12 @@ rb_fstring(VALUE str)
}

static VALUE
register_fstring(VALUE str, bool copy)
register_fstring(VALUE str, bool copy, bool precompute_hash)
{
struct fstr_update_arg args;
args.copy = copy;
struct fstr_update_arg args = {
.copy = copy,
.precompute_hash = precompute_hash
};

RB_VM_LOCK_ENTER();
{
Expand Down Expand Up @@ -500,14 +538,14 @@ VALUE
rb_fstring_new(const char *ptr, long len)
{
struct RString fake_str;
return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false);
}

VALUE
rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
{
struct RString fake_str;
return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false);
}

VALUE
Expand Down Expand Up @@ -3655,6 +3693,10 @@ rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
st_index_t
rb_str_hash(VALUE str)
{
if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) {
return *(st_index_t *)(RSTRING_END(str) + TERM_LEN(str));
}

st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
if (e && !is_ascii_string(str)) {
Expand Down Expand Up @@ -12130,7 +12172,7 @@ VALUE
rb_interned_str(const char *ptr, long len)
{
struct RString fake_str;
return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false);
}

VALUE
Expand All @@ -12147,7 +12189,18 @@ rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
}

struct RString fake_str;
return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false);
}

/*
 * Registers the bytes ptr[0, len) with encoding `enc` through
 * register_fstring(), requesting both a copy and a precomputed hash.
 *
 * If `enc` is non-NULL and still pending autoload, it is autoloaded first.
 * The lookup key is a fake string set up on the stack; it differs from
 * rb_enc_interned_str() only in passing precompute_hash = true.
 */
VALUE
rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc)
{
    struct RString fake_str;
    VALUE lookup_key;

    if (enc != NULL) {
        if (UNLIKELY(rb_enc_autoload_p(enc))) {
            rb_enc_autoload(enc);
        }
    }

    lookup_key = rb_setup_fake_str(&fake_str, ptr, len, enc);
    return register_fstring(lookup_key, true, true);
}

VALUE
Expand Down

0 comments on commit 5281aa9

Please sign in to comment.