Skip to content

Commit

Permalink
Allow CSE of immutable loads across stores
Browse files Browse the repository at this point in the history
  • Loading branch information
stedolan committed May 26, 2021
1 parent fd59303 commit 9b6b7e9
Show file tree
Hide file tree
Showing 23 changed files with 57 additions and 49 deletions.
3 changes: 3 additions & 0 deletions Changes
Expand Up @@ -103,6 +103,9 @@ Working version
(Vincent Laviron, with help from Sebastien Hinderer, review by Stephen Dolan
and David Allsopp)

- #9562, #367: Allow CSE of immutable loads across stores
(Stephen Dolan, review by Mark Shinwell)

- #9876: do not cache the young_limit GC variable in a processor register.
This affects the ARM64, PowerPC and RISC-V ports, making signal handling
and minor GC triggers more reliable, at the cost of a small slowdown.
Expand Down
36 changes: 18 additions & 18 deletions asmcomp/CSEgen.ml
Expand Up @@ -25,7 +25,7 @@ type valnum = int
type op_class =
| Op_pure (* pure arithmetic, produce one or several result *)
| Op_checkbound (* checkbound-style: no result, can raise an exn *)
| Op_load (* memory load *)
| Op_load of Asttypes.mutable_flag (* memory load *)
| Op_store of bool (* memory store, false = init, true = assign *)
| Op_other (* anything else that does not allocate nor store in memory *)

Expand All @@ -40,29 +40,29 @@ module Equations = struct
Map.Make(struct type t = rhs let compare = Stdlib.compare end)

type 'a t =
{ load_equations : 'a Rhs_map.t;
{ mutable_load_equations : 'a Rhs_map.t;
other_equations : 'a Rhs_map.t }

let empty =
{ load_equations = Rhs_map.empty;
{ mutable_load_equations = Rhs_map.empty;
other_equations = Rhs_map.empty }

let add op_class op v m =
match op_class with
| Op_load ->
{ m with load_equations = Rhs_map.add op v m.load_equations }
| Op_load Mutable ->
{ m with mutable_load_equations = Rhs_map.add op v m.mutable_load_equations }
| _ ->
{ m with other_equations = Rhs_map.add op v m.other_equations }

let find op_class op m =
match op_class with
| Op_load ->
Rhs_map.find op m.load_equations
| Op_load Mutable ->
Rhs_map.find op m.mutable_load_equations
| _ ->
Rhs_map.find op m.other_equations

let remove_loads m =
{ load_equations = Rhs_map.empty;
let remove_mutable_loads m =
{ mutable_load_equations = Rhs_map.empty;
other_equations = m.other_equations }
end

Expand Down Expand Up @@ -190,8 +190,8 @@ let set_unknown_regs n rs =

(* Keep only the equations satisfying the given predicate. *)

let remove_load_numbering n =
{ n with num_eqs = Equations.remove_loads n.num_eqs }
let remove_mutable_load_numbering n =
{ n with num_eqs = Equations.remove_mutable_loads n.num_eqs }

(* Forget everything we know about registers of type [Addr]. *)

Expand Down Expand Up @@ -225,7 +225,7 @@ method class_of_operation op =
| Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
| Iextcall _ | Iopaque -> assert false (* treated specially *)
| Istackoffset _ -> Op_other
| Iload(_,_) -> Op_load
| Iload(_,_,mut) -> Op_load mut
| Istore(_,_,asg) -> Op_store asg
| Ialloc _ -> assert false (* treated specially *)
| Iintop(Icheckbound) -> Op_checkbound
Expand All @@ -243,11 +243,11 @@ method is_cheap_operation op =
| Iconst_int _ -> true
| _ -> false

(* Forget all equations involving memory loads. Performed after a
non-initializing store *)
(* Forget all equations involving mutable memory loads.
Performed after a non-initializing store *)

method private kill_loads n =
remove_load_numbering n
remove_mutable_load_numbering n

(* Perform CSE on the given instruction [i] and its successors.
[n] is the value numbering current at the beginning of [i]. *)
Expand Down Expand Up @@ -289,13 +289,13 @@ method private cse n i =
Moreover, allocation can trigger the asynchronous execution
of arbitrary Caml code (finalizer, signal handler, context
switch), which can contain non-initializing stores.
Hence, all equations over loads must be removed. *)
Hence, all equations over mutable loads must be removed. *)
let n1 = kill_addr_regs (self#kill_loads n) in
let n2 = set_unknown_regs n1 i.res in
{i with next = self#cse n2 i.next}
| Iop op ->
begin match self#class_of_operation op with
| (Op_pure | Op_checkbound | Op_load) as op_class ->
| (Op_pure | Op_checkbound | Op_load _) as op_class ->
let (n1, varg) = valnum_regs n i.arg in
let n2 = set_unknown_regs n1 (Proc.destroyed_at_oper i.desc) in
begin match find_equation op_class n1 (op, varg) with
Expand Down Expand Up @@ -333,7 +333,7 @@ method private cse n i =
{i with next = self#cse n2 i.next}
| Op_store true ->
(* A non-initializing store can invalidate
anything we know about prior loads. *)
anything we know about prior mutable loads. *)
let n1 = set_unknown_regs n (Proc.destroyed_at_oper i.desc) in
let n2 = set_unknown_regs n1 i.res in
let n3 = self#kill_loads n2 in
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/CSEgen.mli
Expand Up @@ -19,7 +19,7 @@
type op_class =
| Op_pure (* pure, produce one result *)
| Op_checkbound (* checkbound-style: no result, can raise an exn *)
| Op_load (* memory load *)
| Op_load of Asttypes.mutable_flag (* memory load *)
| Op_store of bool (* memory store, false = init, true = assign *)
| Op_other (* anything else that does not allocate nor store in memory *)

Expand Down
2 changes: 1 addition & 1 deletion asmcomp/amd64/CSE.ml
Expand Up @@ -30,7 +30,7 @@ method! class_of_operation op =
| Ilea _ | Isextend32 | Izextend32 -> Op_pure
| Istore_int(_, _, is_asg) -> Op_store is_asg
| Ioffset_loc(_, _) -> Op_store true
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load Mutable
| Ibswap _ | Isqrtf -> super#class_of_operation op
end
| _ -> super#class_of_operation op
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/amd64/emit.mlp
Expand Up @@ -546,7 +546,7 @@ let emit_instr env fallthrough i =
if n <> 0
then cfi_adjust_cfa_offset n;
env.stack_offset <- env.stack_offset + n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let dest = res i 0 in
begin match chunk with
| Word_int | Word_val ->
Expand Down
6 changes: 3 additions & 3 deletions asmcomp/arm/emit.mlp
Expand Up @@ -525,10 +525,10 @@ let emit_instr env i =
let ninstr = emit_stack_adjustment (-n) in
env.stack_offset <- env.stack_offset + n;
ninstr
| Lop(Iload(Single, addr)) when !fpu >= VFPv2 ->
| Lop(Iload(Single, addr, _mut)) when !fpu >= VFPv2 ->
` flds s14, {emit_addressing addr i.arg 0}\n`;
` fcvtds {emit_reg i.res.(0)}, s14\n`; 2
| Lop(Iload((Double | Double_u), addr)) when !fpu = Soft ->
| Lop(Iload((Double | Double_u), addr, _mut)) when !fpu = Soft ->
(* Use LDM or LDRD if possible *)
begin match i.res.(0), i.res.(1), addr with
{loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
Expand All @@ -547,7 +547,7 @@ let emit_instr env i =
` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
end; 2
end
| Lop(Iload(size, addr)) ->
| Lop(Iload(size, addr, _mut)) ->
let r = i.res.(0) in
let instr =
match size with
Expand Down
5 changes: 3 additions & 2 deletions asmcomp/arm/proc.ml
Expand Up @@ -301,7 +301,8 @@ let destroyed_at_oper = function
| Iop(Iintop (Icomp _) | Iintop_imm(Icomp _, _))
when !arch >= ARMv8 && !thumb ->
[| phys_reg 3 |] (* r3 destroyed *)
| Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _, _)) ->
| Iop(Iintoffloat | Ifloatofint
| Iload(Single, _, _) | Istore(Single, _, _)) ->
[| phys_reg 107 |] (* d7 (s14-s15) destroyed *)
| _ -> [||]

Expand All @@ -325,7 +326,7 @@ let max_register_pressure = function
| Ialloc _ -> if abi = EABI then [| 7; 0; 0 |] else [| 7; 8; 8 |]
| Iconst_symbol _ when !Clflags.pic_code -> [| 7; 16; 32 |]
| Iintoffloat | Ifloatofint
| Iload(Single, _) | Istore(Single, _, _) -> [| 9; 15; 31 |]
| Iload(Single, _, _) | Istore(Single, _, _) -> [| 9; 15; 31 |]
| Iintop Imulh when !arch < ARMv6 -> [| 8; 16; 32 |]
| _ -> [| 9; 16; 32 |]

Expand Down
2 changes: 1 addition & 1 deletion asmcomp/arm/scheduling.ml
Expand Up @@ -29,7 +29,7 @@ method oper_latency = function
(* Loads have a latency of two cycles in general *)
Iconst_symbol _
| Iconst_float _
| Iload(_, _)
| Iload(_, _, _)
| Ireload
| Ifloatofint (* mcr/mrc count as memory access *)
| Iintoffloat -> 2
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/arm64/emit.mlp
Expand Up @@ -454,7 +454,7 @@ module BR = Branch_relaxation.Make (struct
| Lop (Iextcall { alloc = false; }) -> 1
| Lop (Iextcall { alloc = true; }) -> 3
| Lop (Istackoffset _) -> 2
| Lop (Iload (size, addr)) | Lop (Istore (size, addr, _)) ->
| Lop (Iload (size, addr, _)) | Lop (Istore (size, addr, _)) ->
let based = match addr with Iindexed _ -> 0 | Ibased _ -> 1 in
based + begin match size with Single -> 2 | _ -> 1 end
| Lop (Ialloc _) when f.fun_fast -> 5
Expand Down Expand Up @@ -673,7 +673,7 @@ let emit_instr env i =
assert (n mod 16 = 0);
emit_stack_adjustment (-n);
env.stack_offset <- env.stack_offset + n
| Lop(Iload(size, addr)) ->
| Lop(Iload(size, addr, _mut)) ->
let dst = i.res.(0) in
let base =
match addr with
Expand Down
5 changes: 3 additions & 2 deletions asmcomp/arm64/proc.ml
Expand Up @@ -258,7 +258,8 @@ let destroyed_at_oper = function
destroyed_at_c_call
| Iop(Ialloc _) ->
[| reg_x8 |]
| Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _, _)) ->
| Iop( Iintoffloat | Ifloatofint
| Iload(Single, _, _) | Istore(Single, _, _)) ->
[| reg_d7 |] (* d7 / s7 destroyed *)
| _ -> [||]

Expand All @@ -277,7 +278,7 @@ let max_register_pressure = function
| Iextcall _ -> [| 7; 8 |] (* 7 integer callee-saves, 8 FP callee-saves *)
| Ialloc _ -> [| 22; 32 |]
| Iintoffloat | Ifloatofint
| Iload(Single, _) | Istore(Single, _, _) -> [| 23; 31 |]
| Iload(Single, _, _) | Istore(Single, _, _) -> [| 23; 31 |]
| _ -> [| 23; 32 |]

(* Layout of the stack *)
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/CSE.ml
Expand Up @@ -29,7 +29,7 @@ method! class_of_operation op =
(* Operations that affect the floating-point stack cannot be factored *)
| Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Iintoffloat | Ifloatofint
| Iload((Single | Double | Double_u), _) -> Op_other
| Iload((Single | Double | Double_u), _, _) -> Op_other
(* Specific ops *)
| Ispecific(Ilea _) -> Op_pure
| Ispecific(Istore_int(_, _, is_asg)) -> Op_store is_asg
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/emit.mlp
Expand Up @@ -540,7 +540,7 @@ let emit_instr env fallthrough i =
else I.sub (int n) esp;
cfi_adjust_cfa_offset n;
env.stack_offset <- env.stack_offset + n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let dest = i.res.(0) in
begin match chunk with
| Word_int | Word_val | Thirtytwo_signed | Thirtytwo_unsigned ->
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/selection.ml
Expand Up @@ -133,7 +133,7 @@ let pseudoregs_for_operation op arg res =
(* For floating-point operations and floating-point loads,
the result is always left at the top of the floating-point stack *)
| Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Ifloatofint | Iload((Single | Double | Double_u), _)
| Ifloatofint | Iload((Single | Double | Double_u), _, _)
| Ispecific(Isubfrev | Idivfrev | Ifloatarithmem _ | Ifloatspecial _) ->
(arg, [| tos |], false) (* don't move it immediately *)
(* For storing a byte, the argument must be in eax...edx.
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/mach.ml
Expand Up @@ -51,7 +51,7 @@ type operation =
ty_res : Cmm.machtype; ty_args : Cmm.exttype list;
alloc : bool; }
| Istackoffset of int
| Iload of Cmm.memory_chunk * Arch.addressing_mode
| Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag
| Istore of Cmm.memory_chunk * Arch.addressing_mode * bool
| Ialloc of { bytes : int; dbginfo : Debuginfo.alloc_dbginfo; }
| Iintop of integer_operation
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/mach.mli
Expand Up @@ -51,7 +51,7 @@ type operation =
ty_res : Cmm.machtype; ty_args : Cmm.exttype list;
alloc : bool; }
| Istackoffset of int
| Iload of Cmm.memory_chunk * Arch.addressing_mode
| Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag
| Istore of Cmm.memory_chunk * Arch.addressing_mode * bool
(* false = initialization, true = assignment *)
| Ialloc of { bytes : int; dbginfo : Debuginfo.alloc_dbginfo; }
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/power/emit.mlp
Expand Up @@ -464,7 +464,7 @@ module BR = Branch_relaxation.Make (struct
size 3 (2 + tocload_size()) (2 + tocload_size())
| Lop(Iextcall { alloc = false; _}) -> size 1 2 2
| Lop(Istackoffset _) -> 1
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
if chunk = Byte_signed
then load_store_size addr + 1
else load_store_size addr
Expand Down Expand Up @@ -732,7 +732,7 @@ let emit_instr env i =
| Lop(Istackoffset n) ->
` addi 1, 1, {emit_int (-n)}\n`;
adjust_stack_offset env n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let loadinstr =
match chunk with
| Byte_unsigned -> "lbz"
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/power/scheduling.ml
Expand Up @@ -26,7 +26,7 @@ inherit Schedgen.scheduler_generic

method oper_latency = function
Ireload -> 2
| Iload(_, _) -> 2
| Iload(_, _, _) -> 2
| Iconst_float _ -> 2 (* turned into a load *)
| Iconst_symbol _ -> 1
| Iintop(Imul | Imulh) -> 9
Expand All @@ -46,7 +46,7 @@ method! reload_retaddr_latency = 12

method oper_issue_cycles = function
Iconst_float _ | Iconst_symbol _ -> 2
| Iload(_, Ibased(_, _)) -> 2
| Iload(_, Ibased(_, _), _) -> 2
| Istore(_, Ibased(_, _), _) -> 2
| Ialloc _ -> 4
| Iintop(Imod) -> 40 (* assuming full stall *)
Expand Down
5 changes: 4 additions & 1 deletion asmcomp/printmach.ml
Expand Up @@ -120,9 +120,12 @@ let operation op arg ppf res =
(if alloc then "" else " (noalloc)")
| Istackoffset n ->
fprintf ppf "offset stack %i" n
| Iload(chunk, addr) ->
| Iload(chunk, addr, Immutable) ->
fprintf ppf "%s[%a]"
(Printcmm.chunk chunk) (Arch.print_addressing reg addr) arg
| Iload(chunk, addr, Mutable) ->
fprintf ppf "%s mut[%a]"
(Printcmm.chunk chunk) (Arch.print_addressing reg addr) arg
| Istore(chunk, addr, is_assign) ->
fprintf ppf "%s[%a] := %a %s"
(Printcmm.chunk chunk)
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/riscv/emit.mlp
Expand Up @@ -302,10 +302,10 @@ let emit_instr env i =
assert (n mod 16 = 0);
emit_stack_adjustment (-n);
env.stack_offset <- env.stack_offset + n
| Lop(Iload(Single, Iindexed ofs)) ->
| Lop(Iload(Single, Iindexed ofs, _mut)) ->
` flw {emit_reg i.res.(0)}, {emit_int ofs}({emit_reg i.arg.(0)})\n`;
` fcvt.d.s {emit_reg i.res.(0)}, {emit_reg i.res.(0)}\n`
| Lop(Iload(chunk, Iindexed ofs)) ->
| Lop(Iload(chunk, Iindexed ofs, _mut)) ->
let instr =
match chunk with
| Byte_unsigned -> "lbu"
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/s390x/emit.mlp
Expand Up @@ -359,7 +359,7 @@ let emit_instr env i =
emit_stack_adjust n;
env.stack_offset <- env.stack_offset + n

| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let loadinstr =
match chunk with
Byte_unsigned -> "llgc"
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/s390x/scheduling.ml
Expand Up @@ -35,7 +35,7 @@ inherit Schedgen.scheduler_generic

method oper_latency = function
Ireload -> 4
| Iload(_, _) -> 4
| Iload(_, _, _) -> 4
| Iconst_float _ -> 4 (* turned into a load *)
| Iintop(Imul) -> 10
| Iintop_imm(Imul, _) -> 10
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/schedgen.ml
Expand Up @@ -135,7 +135,7 @@ let rec remove_instr node = function

(* We treat Lreloadretaddr as a word-sized load *)

let some_load = (Iload(Cmm.Word_int, Arch.identity_addressing))
let some_load = (Iload(Cmm.Word_int, Arch.identity_addressing, Mutable))

(* The generic scheduler *)

Expand Down Expand Up @@ -181,7 +181,7 @@ method is_store = function
| _ -> false

method is_load = function
Iload(_, _) -> true
Iload(_, _, _) -> true
| _ -> false

method is_checkbound = function
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/selectgen.ml
Expand Up @@ -443,9 +443,9 @@ method select_operation op args _dbg =
(Icall_ind, args)
| (Cextcall(func, ty_res, ty_args, alloc), _) ->
Iextcall { func; ty_res; ty_args; alloc; }, args
| (Cload (chunk, _mut), [arg]) ->
| (Cload (chunk, mut), [arg]) ->
let (addr, eloc) = self#select_addressing chunk arg in
(Iload(chunk, addr), [eloc])
(Iload(chunk, addr, mut), [eloc])
| (Cstore (chunk, init), [arg1; arg2]) ->
let (addr, eloc) = self#select_addressing chunk arg1 in
let is_assign =
Expand Down

0 comments on commit 9b6b7e9

Please sign in to comment.