ocaml · damiendoligez · Feb 3, 2021 · Sep 22, 2020 · Sep 22, 2020 · Jan 20, 2021
diff --git a/Changes b/Changes
@@ -14,6 +14,11 @@ Working version
   including the search path for shared stub libraries.
   (David Allsopp, review by Xavier Leroy)
 
+- #9934: Optimise sweeping using prefetching.
+  (Stephen Dolan and Will Hasenplaugh, review by David Allsopp, Xavier
+   Leroy and Damien Doligez, benchmarking by Shubham Kumar and KC
+   Sivaramakrishnan)
+
 - #10025: Track custom blocks (e.g. Bigarray) with Statmemprof
   (Stephen Dolan, review by Leo White, Gabriel Scherer and Jacques-Henri
    Jourdan)

diff --git a/runtime/caml/misc.h b/runtime/caml/misc.h
@@ -113,6 +113,17 @@ CAMLdeprecated_typedef(addr, char *);
 #error "How do I align values on this platform?"
 #endif
 
+/* Prefetching: a cache hint on GCC-compatible x86, a no-op elsewhere */
+
+#ifdef CAML_INTERNALS
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define caml_prefetch(p) __builtin_prefetch((p), 1, 3)
+/* rw = 1: prefetch for writing; locality = 3: keep in all cache levels */
+#else
+#define caml_prefetch(p)
+#endif
+#endif
+
 /* CAMLunused is preserved for compatibility reasons.
    Instead of the legacy GCC/Clang-only
      CAMLunused foo;

diff --git a/runtime/freelist.c b/runtime/freelist.c
@@ -1662,6 +1662,7 @@ static header_t *bf_merge_block (value bp, char *limit)
     }
     caml_fl_cur_wsz += Whsize_val (cur);
   next:
+    caml_prefetch(Hp_val(cur + 4096));
     cur = Next_in_mem (cur);
     if (Hp_val (cur) >= (header_t *) limit){
       CAMLassert (Hp_val (cur) == (header_t *) limit);

diff --git a/runtime/major_gc.c b/runtime/major_gc.c
@@ -72,7 +72,7 @@ extern value caml_fl_merge;  /* Defined in freelist.c. */
   redarkening required */
 static char *redarken_first_chunk = NULL;
 
-static char *sweep_chunk, *sweep_limit;
+static char *sweep_chunk;
 static double p_backlog = 0.0; /* backlog for the gc speedup parameter */
 
 int caml_gc_subphase;     /* Subphase_{mark_roots,mark_main,mark_final} */
@@ -397,7 +397,6 @@ static void init_sweep_phase(void)
   caml_gc_phase = Phase_sweep;
   sweep_chunk = caml_heap_start;
   caml_gc_sweep_hp = sweep_chunk;
-  sweep_limit = sweep_chunk + Chunk_size (sweep_chunk);
   caml_fl_wsz_at_phase_change = caml_fl_cur_wsz;
   if (caml_major_gc_hook) (*caml_major_gc_hook)();
 }
@@ -698,21 +697,24 @@ static void clean_slice (intnat work)
 
 static void sweep_slice (intnat work)
 {
-  char *hp;
+  char *hp, *sweep_hp, *limit;
   header_t hd;
 
   caml_gc_message (0x40, "Sweeping %"
                    ARCH_INTNAT_PRINTF_FORMAT "d words\n", work);
+  sweep_hp = caml_gc_sweep_hp;
+  limit = sweep_chunk + Chunk_size(sweep_chunk);
   while (work > 0){
-    if (caml_gc_sweep_hp < sweep_limit){
-      hp = caml_gc_sweep_hp;
+    if (sweep_hp < limit){
+      caml_prefetch(sweep_hp + 4000);
+      hp = sweep_hp;
       hd = Hd_hp (hp);
       work -= Whsize_hd (hd);
-      caml_gc_sweep_hp += Bhsize_hd (hd);
+      sweep_hp += Bhsize_hd (hd);
       switch (Color_hd (hd)){
       case Caml_white:
-        caml_gc_sweep_hp =
-            (char *)caml_fl_merge_block(Val_hp (hp), sweep_limit);
+        caml_gc_sweep_hp = sweep_hp;
+        sweep_hp = (char *) caml_fl_merge_block (Val_hp (hp), limit);
         break;
       case Caml_blue:
         /* Only the blocks of the free-list are blue.  See [freelist.c]. */
@@ -723,21 +725,23 @@ static void sweep_slice (intnat work)
         Hd_hp (hp) = Whitehd_hd (hd);
         break;
       }
-      CAMLassert (caml_gc_sweep_hp <= sweep_limit);
+      CAMLassert (sweep_hp <= limit);
     }else{
       sweep_chunk = Chunk_next (sweep_chunk);
       if (sweep_chunk == NULL){
         /* Sweeping is done. */
+        caml_gc_sweep_hp = sweep_hp;
         ++ Caml_state->stat_major_collections;
         work = 0;
         caml_gc_phase = Phase_idle;
         caml_request_minor_gc ();
       }else{
-        caml_gc_sweep_hp = sweep_chunk;
-        sweep_limit = sweep_chunk + Chunk_size (sweep_chunk);
+        sweep_hp = sweep_chunk;
+        limit = sweep_chunk + Chunk_size (sweep_chunk);
       }
     }
   }
+  caml_gc_sweep_hp = sweep_hp;
 }
 
 /* The main entry point for the major GC. Called about once for each
@@ -1085,7 +1089,6 @@ void caml_finalise_heap (void)
   caml_gc_phase = Phase_sweep;
   sweep_chunk = caml_heap_start;
   caml_gc_sweep_hp = sweep_chunk;
-  sweep_limit = sweep_chunk + Chunk_size (sweep_chunk);
   while (caml_gc_phase == Phase_sweep)
     sweep_slice (LONG_MAX);
 }