Skip to content

Commit

Permalink
Allow CSE of immutable loads across stores (ocaml#9562)
Browse files Browse the repository at this point in the history
(cherry picked from commit 756dd27)
  • Loading branch information
stedolan authored and edwintorok committed Jun 26, 2021
1 parent acb6f3e commit 886da52
Show file tree
Hide file tree
Showing 24 changed files with 65 additions and 50 deletions.
8 changes: 7 additions & 1 deletion .depend
Expand Up @@ -2023,15 +2023,18 @@ asmcomp/CSEgen.cmo : \
asmcomp/proc.cmi \
asmcomp/mach.cmi \
asmcomp/cmm.cmi \
parsing/asttypes.cmi \
asmcomp/CSEgen.cmi
asmcomp/CSEgen.cmx : \
asmcomp/reg.cmx \
asmcomp/proc.cmx \
asmcomp/mach.cmx \
asmcomp/cmm.cmx \
parsing/asttypes.cmi \
asmcomp/CSEgen.cmi
asmcomp/CSEgen.cmi : \
asmcomp/mach.cmi
asmcomp/mach.cmi \
parsing/asttypes.cmi
asmcomp/afl_instrument.cmo : \
lambda/lambda.cmi \
asmcomp/cmm.cmi \
Expand Down Expand Up @@ -2644,6 +2647,7 @@ asmcomp/mach.cmo : \
lambda/debuginfo.cmi \
asmcomp/cmm.cmi \
middle_end/backend_var.cmi \
parsing/asttypes.cmi \
asmcomp/arch.cmo \
asmcomp/mach.cmi
asmcomp/mach.cmx : \
Expand All @@ -2654,6 +2658,7 @@ asmcomp/mach.cmx : \
lambda/debuginfo.cmx \
asmcomp/cmm.cmx \
middle_end/backend_var.cmx \
parsing/asttypes.cmi \
asmcomp/arch.cmx \
asmcomp/mach.cmi
asmcomp/mach.cmi : \
Expand All @@ -2663,6 +2668,7 @@ asmcomp/mach.cmi : \
lambda/debuginfo.cmi \
asmcomp/cmm.cmi \
middle_end/backend_var.cmi \
parsing/asttypes.cmi \
asmcomp/arch.cmo
asmcomp/printcmm.cmo : \
utils/targetint.cmi \
Expand Down
3 changes: 3 additions & 0 deletions Changes
Expand Up @@ -7,6 +7,9 @@ OCaml 4.12, maintenance version
platforms without AFL support.
(David Allsopp, review by Xavier Leroy)

- #9562, #367: Allow CSE of immutable loads across stores
(Stephen Dolan, review by Mark Shinwell)

- #10349: Fix destroyed_at_c_call on RISC-V
(Mark Shinwell, review by Nicolás Ojeda Bär)

Expand Down
37 changes: 19 additions & 18 deletions asmcomp/CSEgen.ml
Expand Up @@ -25,7 +25,7 @@ type valnum = int
type op_class =
| Op_pure (* pure arithmetic, produce one or several result *)
| Op_checkbound (* checkbound-style: no result, can raise an exn *)
| Op_load (* memory load *)
| Op_load of Asttypes.mutable_flag (* memory load *)
| Op_store of bool (* memory store, false = init, true = assign *)
| Op_other (* anything else that does not allocate nor store in memory *)

Expand All @@ -40,29 +40,30 @@ module Equations = struct
Map.Make(struct type t = rhs let compare = Stdlib.compare end)

type 'a t =
{ load_equations : 'a Rhs_map.t;
{ mutable_load_equations : 'a Rhs_map.t;
other_equations : 'a Rhs_map.t }

let empty =
{ load_equations = Rhs_map.empty;
{ mutable_load_equations = Rhs_map.empty;
other_equations = Rhs_map.empty }

let add op_class op v m =
match op_class with
| Op_load ->
{ m with load_equations = Rhs_map.add op v m.load_equations }
| Op_load Mutable ->
{ m with mutable_load_equations =
Rhs_map.add op v m.mutable_load_equations }
| _ ->
{ m with other_equations = Rhs_map.add op v m.other_equations }

let find op_class op m =
match op_class with
| Op_load ->
Rhs_map.find op m.load_equations
| Op_load Mutable ->
Rhs_map.find op m.mutable_load_equations
| _ ->
Rhs_map.find op m.other_equations

let remove_loads m =
{ load_equations = Rhs_map.empty;
let remove_mutable_loads m =
{ mutable_load_equations = Rhs_map.empty;
other_equations = m.other_equations }
end

Expand Down Expand Up @@ -190,8 +191,8 @@ let set_unknown_regs n rs =

(* Keep only the equations satisfying the given predicate. *)

let remove_load_numbering n =
{ n with num_eqs = Equations.remove_loads n.num_eqs }
let remove_mutable_load_numbering n =
{ n with num_eqs = Equations.remove_mutable_loads n.num_eqs }

(* Forget everything we know about registers of type [Addr]. *)

Expand Down Expand Up @@ -225,7 +226,7 @@ method class_of_operation op =
| Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
| Iextcall _ | Iopaque -> assert false (* treated specially *)
| Istackoffset _ -> Op_other
| Iload(_,_) -> Op_load
| Iload(_,_,mut) -> Op_load mut
| Istore(_,_,asg) -> Op_store asg
| Ialloc _ -> assert false (* treated specially *)
| Iintop(Icheckbound) -> Op_checkbound
Expand All @@ -244,11 +245,11 @@ method is_cheap_operation op =
| Iconst_int _ -> true
| _ -> false

(* Forget all equations involving memory loads. Performed after a
non-initializing store *)
(* Forget all equations involving mutable memory loads.
Performed after a non-initializing store *)

method private kill_loads n =
remove_load_numbering n
remove_mutable_load_numbering n

(* Perform CSE on the given instruction [i] and its successors.
[n] is the value numbering current at the beginning of [i]. *)
Expand Down Expand Up @@ -290,13 +291,13 @@ method private cse n i =
Moreover, allocation can trigger the asynchronous execution
of arbitrary Caml code (finalizer, signal handler, context
switch), which can contain non-initializing stores.
Hence, all equations over loads must be removed. *)
Hence, all equations over mutable loads must be removed. *)
let n1 = kill_addr_regs (self#kill_loads n) in
let n2 = set_unknown_regs n1 i.res in
{i with next = self#cse n2 i.next}
| Iop op ->
begin match self#class_of_operation op with
| (Op_pure | Op_checkbound | Op_load) as op_class ->
| (Op_pure | Op_checkbound | Op_load _) as op_class ->
let (n1, varg) = valnum_regs n i.arg in
let n2 = set_unknown_regs n1 (Proc.destroyed_at_oper i.desc) in
begin match find_equation op_class n1 (op, varg) with
Expand Down Expand Up @@ -334,7 +335,7 @@ method private cse n i =
{i with next = self#cse n2 i.next}
| Op_store true ->
(* A non-initializing store can invalidate
anything we know about prior loads. *)
anything we know about prior mutable loads. *)
let n1 = set_unknown_regs n (Proc.destroyed_at_oper i.desc) in
let n2 = set_unknown_regs n1 i.res in
let n3 = self#kill_loads n2 in
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/CSEgen.mli
Expand Up @@ -19,7 +19,7 @@
type op_class =
| Op_pure (* pure, produce one result *)
| Op_checkbound (* checkbound-style: no result, can raise an exn *)
| Op_load (* memory load *)
| Op_load of Asttypes.mutable_flag (* memory load *)
| Op_store of bool (* memory store, false = init, true = assign *)
| Op_other (* anything else that does not allocate nor store in memory *)

Expand Down
2 changes: 1 addition & 1 deletion asmcomp/amd64/CSE.ml
Expand Up @@ -30,7 +30,7 @@ method! class_of_operation op =
| Ilea _ | Isextend32 | Izextend32 -> Op_pure
| Istore_int(_, _, is_asg) -> Op_store is_asg
| Ioffset_loc(_, _) -> Op_store true
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load Mutable
| Ibswap _ | Isqrtf -> super#class_of_operation op
end
| _ -> super#class_of_operation op
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/amd64/emit.mlp
Expand Up @@ -546,7 +546,7 @@ let emit_instr env fallthrough i =
if n <> 0
then cfi_adjust_cfa_offset n;
env.stack_offset <- env.stack_offset + n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let dest = res i 0 in
begin match chunk with
| Word_int | Word_val ->
Expand Down
6 changes: 3 additions & 3 deletions asmcomp/arm/emit.mlp
Expand Up @@ -525,10 +525,10 @@ let emit_instr env i =
let ninstr = emit_stack_adjustment (-n) in
env.stack_offset <- env.stack_offset + n;
ninstr
| Lop(Iload(Single, addr)) when !fpu >= VFPv2 ->
| Lop(Iload(Single, addr, _mut)) when !fpu >= VFPv2 ->
` flds s14, {emit_addressing addr i.arg 0}\n`;
` fcvtds {emit_reg i.res.(0)}, s14\n`; 2
| Lop(Iload((Double | Double_u), addr)) when !fpu = Soft ->
| Lop(Iload((Double | Double_u), addr, _mut)) when !fpu = Soft ->
(* Use LDM or LDRD if possible *)
begin match i.res.(0), i.res.(1), addr with
{loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
Expand All @@ -547,7 +547,7 @@ let emit_instr env i =
` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
end; 2
end
| Lop(Iload(size, addr)) ->
| Lop(Iload(size, addr, _mut)) ->
let r = i.res.(0) in
let instr =
match size with
Expand Down
5 changes: 3 additions & 2 deletions asmcomp/arm/proc.ml
Expand Up @@ -301,7 +301,8 @@ let destroyed_at_oper = function
| Iop(Iintop (Icomp _) | Iintop_imm(Icomp _, _))
when !arch >= ARMv8 && !thumb ->
[| phys_reg 3 |] (* r3 destroyed *)
| Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _, _)) ->
| Iop(Iintoffloat | Ifloatofint
| Iload(Single, _, _) | Istore(Single, _, _)) ->
[| phys_reg 107 |] (* d7 (s14-s15) destroyed *)
| _ -> [||]

Expand All @@ -325,7 +326,7 @@ let max_register_pressure = function
| Ialloc _ -> if abi = EABI then [| 7; 0; 0 |] else [| 7; 8; 8 |]
| Iconst_symbol _ when !Clflags.pic_code -> [| 7; 16; 32 |]
| Iintoffloat | Ifloatofint
| Iload(Single, _) | Istore(Single, _, _) -> [| 9; 15; 31 |]
| Iload(Single, _, _) | Istore(Single, _, _) -> [| 9; 15; 31 |]
| Iintop Imulh when !arch < ARMv6 -> [| 8; 16; 32 |]
| _ -> [| 9; 16; 32 |]

Expand Down
2 changes: 1 addition & 1 deletion asmcomp/arm/scheduling.ml
Expand Up @@ -29,7 +29,7 @@ method oper_latency = function
(* Loads have a latency of two cycles in general *)
Iconst_symbol _
| Iconst_float _
| Iload(_, _)
| Iload(_, _, _)
| Ireload
| Ifloatofint (* mcr/mrc count as memory access *)
| Iintoffloat -> 2
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/arm64/emit.mlp
Expand Up @@ -462,7 +462,7 @@ module BR = Branch_relaxation.Make (struct
| Lop (Iextcall { alloc = false; }) -> 1
| Lop (Iextcall { alloc = true; }) -> 3
| Lop (Istackoffset _) -> 2
| Lop (Iload (size, addr)) | Lop (Istore (size, addr, _)) ->
| Lop (Iload (size, addr, _)) | Lop (Istore (size, addr, _)) ->
let based = match addr with Iindexed _ -> 0 | Ibased _ -> 1 in
based + begin match size with Single -> 2 | _ -> 1 end
| Lop (Ialloc _) when f.fun_fast -> 5
Expand Down Expand Up @@ -681,7 +681,7 @@ let emit_instr env i =
assert (n mod 16 = 0);
emit_stack_adjustment (-n);
env.stack_offset <- env.stack_offset + n
| Lop(Iload(size, addr)) ->
| Lop(Iload(size, addr, _mut)) ->
let dst = i.res.(0) in
let base =
match addr with
Expand Down
5 changes: 3 additions & 2 deletions asmcomp/arm64/proc.ml
Expand Up @@ -258,7 +258,8 @@ let destroyed_at_oper = function
destroyed_at_c_call
| Iop(Ialloc _) ->
[| reg_x8 |]
| Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _, _)) ->
| Iop( Iintoffloat | Ifloatofint
| Iload(Single, _, _) | Istore(Single, _, _)) ->
[| reg_d7 |] (* d7 / s7 destroyed *)
| _ -> [||]

Expand All @@ -277,7 +278,7 @@ let max_register_pressure = function
| Iextcall _ -> [| 10; 8 |]
| Ialloc _ -> [| 22; 32 |]
| Iintoffloat | Ifloatofint
| Iload(Single, _) | Istore(Single, _, _) -> [| 23; 31 |]
| Iload(Single, _, _) | Istore(Single, _, _) -> [| 23; 31 |]
| _ -> [| 23; 32 |]

(* Layout of the stack *)
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/CSE.ml
Expand Up @@ -29,7 +29,7 @@ method! class_of_operation op =
(* Operations that affect the floating-point stack cannot be factored *)
| Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Iintoffloat | Ifloatofint
| Iload((Single | Double | Double_u), _) -> Op_other
| Iload((Single | Double | Double_u), _, _) -> Op_other
(* Specific ops *)
| Ispecific(Ilea _) -> Op_pure
| Ispecific(Istore_int(_, _, is_asg)) -> Op_store is_asg
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/emit.mlp
Expand Up @@ -540,7 +540,7 @@ let emit_instr env fallthrough i =
else I.sub (int n) esp;
cfi_adjust_cfa_offset n;
env.stack_offset <- env.stack_offset + n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let dest = i.res.(0) in
begin match chunk with
| Word_int | Word_val | Thirtytwo_signed | Thirtytwo_unsigned ->
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/selection.ml
Expand Up @@ -133,7 +133,7 @@ let pseudoregs_for_operation op arg res =
(* For floating-point operations and floating-point loads,
the result is always left at the top of the floating-point stack *)
| Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Ifloatofint | Iload((Single | Double | Double_u), _)
| Ifloatofint | Iload((Single | Double | Double_u), _, _)
| Ispecific(Isubfrev | Idivfrev | Ifloatarithmem _ | Ifloatspecial _) ->
(arg, [| tos |], false) (* don't move it immediately *)
(* For storing a byte, the argument must be in eax...edx.
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/mach.ml
Expand Up @@ -51,7 +51,7 @@ type operation =
ty_res : Cmm.machtype; ty_args : Cmm.exttype list;
alloc : bool; }
| Istackoffset of int
| Iload of Cmm.memory_chunk * Arch.addressing_mode
| Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag
| Istore of Cmm.memory_chunk * Arch.addressing_mode * bool
| Ialloc of { bytes : int; dbginfo : Debuginfo.alloc_dbginfo; }
| Iintop of integer_operation
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/mach.mli
Expand Up @@ -51,7 +51,7 @@ type operation =
ty_res : Cmm.machtype; ty_args : Cmm.exttype list;
alloc : bool; }
| Istackoffset of int
| Iload of Cmm.memory_chunk * Arch.addressing_mode
| Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag
| Istore of Cmm.memory_chunk * Arch.addressing_mode * bool
(* false = initialization, true = assignment *)
| Ialloc of { bytes : int; dbginfo : Debuginfo.alloc_dbginfo; }
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/power/emit.mlp
Expand Up @@ -459,7 +459,7 @@ module BR = Branch_relaxation.Make (struct
size 3 (2 + tocload_size()) (2 + tocload_size())
| Lop(Iextcall { alloc = false; _}) -> size 1 2 2
| Lop(Istackoffset _) -> 1
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
if chunk = Byte_signed
then load_store_size addr + 1
else load_store_size addr
Expand Down Expand Up @@ -728,7 +728,7 @@ let emit_instr env i =
| Lop(Istackoffset n) ->
` addi 1, 1, {emit_int (-n)}\n`;
adjust_stack_offset env n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let loadinstr =
match chunk with
| Byte_unsigned -> "lbz"
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/power/scheduling.ml
Expand Up @@ -26,7 +26,7 @@ inherit Schedgen.scheduler_generic

method oper_latency = function
Ireload -> 2
| Iload(_, _) -> 2
| Iload(_, _, _) -> 2
| Iconst_float _ -> 2 (* turned into a load *)
| Iconst_symbol _ -> 1
| Iintop(Imul | Imulh) -> 9
Expand All @@ -46,7 +46,7 @@ method! reload_retaddr_latency = 12

method oper_issue_cycles = function
Iconst_float _ | Iconst_symbol _ -> 2
| Iload(_, Ibased(_, _)) -> 2
| Iload(_, Ibased(_, _), _) -> 2
| Istore(_, Ibased(_, _), _) -> 2
| Ialloc _ -> 4
| Iintop(Imod) -> 40 (* assuming full stall *)
Expand Down
5 changes: 4 additions & 1 deletion asmcomp/printmach.ml
Expand Up @@ -122,9 +122,12 @@ let operation op arg ppf res =
(if alloc then "" else " (noalloc)")
| Istackoffset n ->
fprintf ppf "offset stack %i" n
| Iload(chunk, addr) ->
| Iload(chunk, addr, Immutable) ->
fprintf ppf "%s[%a]"
(Printcmm.chunk chunk) (Arch.print_addressing reg addr) arg
| Iload(chunk, addr, Mutable) ->
fprintf ppf "%s mut[%a]"
(Printcmm.chunk chunk) (Arch.print_addressing reg addr) arg
| Istore(chunk, addr, is_assign) ->
fprintf ppf "%s[%a] := %a %s"
(Printcmm.chunk chunk)
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/riscv/emit.mlp
Expand Up @@ -302,10 +302,10 @@ let emit_instr env i =
assert (n mod 16 = 0);
emit_stack_adjustment (-n);
env.stack_offset <- env.stack_offset + n
| Lop(Iload(Single, Iindexed ofs)) ->
| Lop(Iload(Single, Iindexed ofs, _mut)) ->
` flw {emit_reg i.res.(0)}, {emit_int ofs}({emit_reg i.arg.(0)})\n`;
` fcvt.d.s {emit_reg i.res.(0)}, {emit_reg i.res.(0)}\n`
| Lop(Iload(chunk, Iindexed ofs)) ->
| Lop(Iload(chunk, Iindexed ofs, _mut)) ->
let instr =
match chunk with
| Byte_unsigned -> "lbu"
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/s390x/emit.mlp
Expand Up @@ -359,7 +359,7 @@ let emit_instr env i =
emit_stack_adjust n;
env.stack_offset <- env.stack_offset + n

| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let loadinstr =
match chunk with
Byte_unsigned -> "llgc"
Expand Down

0 comments on commit 886da52

Please sign in to comment.