Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prefetching optimisations for sweeping #9934

Merged
merged 3 commits into from
Feb 3, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ Working version
including the search path for shared stub libraries.
(David Allsopp, review by Xavier Leroy)

- #9934: Optimise sweeping using prefetching.
(Stephen Dolan and Will Hasenplaugh, review by David Allsopp, Xavier
Leroy and Damien Doligez, benchmarking by Shubham Kumar and KC
Sivaramakrishnan)

- #10025: Track custom blocks (e.g. Bigarray) with Statmemprof
(Stephen Dolan, review by Leo White, Gabriel Scherer and Jacques-Henri
Jourdan)
Expand Down
11 changes: 11 additions & 0 deletions runtime/caml/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,17 @@ CAMLdeprecated_typedef(addr, char *);
#error "How do I align values on this platform?"
#endif

/* Prefetching */

#ifdef CAML_INTERNALS
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
#define caml_prefetch(p) __builtin_prefetch((p), 1, 3)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think – but I have not experimented – that the MSVC equivalent is `#include <winnt.h>` and `PreFetchCacheLine((p), PF_NON_TEMPORAL_LEVEL_ALL)` (I'm not sure about the constant).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: why only on x86? `__builtin_prefetch` exists on all GCC-supported platforms, even though it can be a no-op. And I'm sure ARM and others would benefit too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I would document a bit:

#define caml_prefetch(p) __builtin_prefetch((p), 1, 3)
/* 1 = intent to write; 3 = all cache levels */

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> Just curious: why only on x86? `__builtin_prefetch` exists on all GCC-supported platforms, even though it can be a no-op. And I'm sure ARM and others would benefit too.

I left others out because I don't really know anything about non-x86 memory hierarchies. We can turn it on for ARM if you like, but I can't judge how much / whether it'll help, and I don't have the expertise / time to do any serious benchmarking.

(I'll add the comments)

/* 1 = intent to write; 3 = all cache levels */
#else
#define caml_prefetch(p)
#endif
#endif

/* CAMLunused is preserved for compatibility reasons.
Instead of the legacy GCC/Clang-only
CAMLunused foo;
Expand Down
1 change: 1 addition & 0 deletions runtime/freelist.c
Original file line number Diff line number Diff line change
Expand Up @@ -1662,6 +1662,7 @@ static header_t *bf_merge_block (value bp, char *limit)
}
caml_fl_cur_wsz += Whsize_val (cur);
next:
caml_prefetch(Hp_val(cur + 4096));
cur = Next_in_mem (cur);
if (Hp_val (cur) >= (header_t *) limit){
CAMLassert (Hp_val (cur) == (header_t *) limit);
Expand Down
27 changes: 15 additions & 12 deletions runtime/major_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ extern value caml_fl_merge; /* Defined in freelist.c. */
redarkening required */
static char *redarken_first_chunk = NULL;

static char *sweep_chunk, *sweep_limit;
static char *sweep_chunk;
static double p_backlog = 0.0; /* backlog for the gc speedup parameter */

int caml_gc_subphase; /* Subphase_{mark_roots,mark_main,mark_final} */
Expand Down Expand Up @@ -397,7 +397,6 @@ static void init_sweep_phase(void)
caml_gc_phase = Phase_sweep;
sweep_chunk = caml_heap_start;
caml_gc_sweep_hp = sweep_chunk;
sweep_limit = sweep_chunk + Chunk_size (sweep_chunk);
caml_fl_wsz_at_phase_change = caml_fl_cur_wsz;
if (caml_major_gc_hook) (*caml_major_gc_hook)();
}
Expand Down Expand Up @@ -698,21 +697,24 @@ static void clean_slice (intnat work)

static void sweep_slice (intnat work)
{
char *hp;
char *hp, *sweep_hp, *limit;
header_t hd;

caml_gc_message (0x40, "Sweeping %"
ARCH_INTNAT_PRINTF_FORMAT "d words\n", work);
sweep_hp = caml_gc_sweep_hp;
limit = sweep_chunk + Chunk_size(sweep_chunk);
while (work > 0){
if (caml_gc_sweep_hp < sweep_limit){
hp = caml_gc_sweep_hp;
if (sweep_hp < limit){
caml_prefetch(sweep_hp + 4000);
hp = sweep_hp;
hd = Hd_hp (hp);
work -= Whsize_hd (hd);
caml_gc_sweep_hp += Bhsize_hd (hd);
sweep_hp += Bhsize_hd (hd);
switch (Color_hd (hd)){
case Caml_white:
caml_gc_sweep_hp =
(char *)caml_fl_merge_block(Val_hp (hp), sweep_limit);
caml_gc_sweep_hp = sweep_hp;
sweep_hp = (char *) caml_fl_merge_block (Val_hp (hp), limit);
break;
case Caml_blue:
/* Only the blocks of the free-list are blue. See [freelist.c]. */
Expand All @@ -723,21 +725,23 @@ static void sweep_slice (intnat work)
Hd_hp (hp) = Whitehd_hd (hd);
break;
}
CAMLassert (caml_gc_sweep_hp <= sweep_limit);
CAMLassert (sweep_hp <= limit);
}else{
sweep_chunk = Chunk_next (sweep_chunk);
if (sweep_chunk == NULL){
/* Sweeping is done. */
caml_gc_sweep_hp = sweep_hp;
++ Caml_state->stat_major_collections;
work = 0;
caml_gc_phase = Phase_idle;
caml_request_minor_gc ();
}else{
caml_gc_sweep_hp = sweep_chunk;
sweep_limit = sweep_chunk + Chunk_size (sweep_chunk);
sweep_hp = sweep_chunk;
limit = sweep_chunk + Chunk_size (sweep_chunk);
}
}
}
caml_gc_sweep_hp = sweep_hp;
}

/* The main entry point for the major GC. Called about once for each
Expand Down Expand Up @@ -1085,7 +1089,6 @@ void caml_finalise_heap (void)
caml_gc_phase = Phase_sweep;
sweep_chunk = caml_heap_start;
caml_gc_sweep_hp = sweep_chunk;
sweep_limit = sweep_chunk + Chunk_size (sweep_chunk);
while (caml_gc_phase == Phase_sweep)
sweep_slice (LONG_MAX);
}
Expand Down