Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow CSE of immutable loads across stores #9562

Merged
merged 1 commit into from
May 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 7 additions & 1 deletion .depend
Original file line number Diff line number Diff line change
Expand Up @@ -2124,15 +2124,18 @@ asmcomp/CSEgen.cmo : \
asmcomp/proc.cmi \
asmcomp/mach.cmi \
asmcomp/cmm.cmi \
parsing/asttypes.cmi \
asmcomp/CSEgen.cmi
asmcomp/CSEgen.cmx : \
asmcomp/reg.cmx \
asmcomp/proc.cmx \
asmcomp/mach.cmx \
asmcomp/cmm.cmx \
parsing/asttypes.cmi \
asmcomp/CSEgen.cmi
asmcomp/CSEgen.cmi : \
asmcomp/mach.cmi
asmcomp/mach.cmi \
parsing/asttypes.cmi
asmcomp/afl_instrument.cmo : \
lambda/lambda.cmi \
asmcomp/cmm.cmi \
Expand Down Expand Up @@ -2762,20 +2765,23 @@ asmcomp/mach.cmo : \
lambda/lambda.cmi \
lambda/debuginfo.cmi \
asmcomp/cmm.cmi \
parsing/asttypes.cmi \
asmcomp/arch.cmo \
asmcomp/mach.cmi
asmcomp/mach.cmx : \
asmcomp/reg.cmx \
lambda/lambda.cmx \
lambda/debuginfo.cmx \
asmcomp/cmm.cmx \
parsing/asttypes.cmi \
asmcomp/arch.cmx \
asmcomp/mach.cmi
asmcomp/mach.cmi : \
asmcomp/reg.cmi \
lambda/lambda.cmi \
lambda/debuginfo.cmi \
asmcomp/cmm.cmi \
parsing/asttypes.cmi \
asmcomp/arch.cmo
asmcomp/printcmm.cmo : \
utils/targetint.cmi \
Expand Down
3 changes: 3 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ Working version
(Vincent Laviron, with help from Sebastien Hinderer, review by Stephen Dolan
and David Allsopp)

- #9562, #367: Allow CSE of immutable loads across stores
(Stephen Dolan, review by Mark Shinwell)

- #9876: do not cache the young_limit GC variable in a processor register.
This affects the ARM64, PowerPC and RISC-V ports, making signal handling
and minor GC triggers more reliable, at the cost of a small slowdown.
Expand Down
37 changes: 19 additions & 18 deletions asmcomp/CSEgen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ type valnum = int
type op_class =
| Op_pure (* pure arithmetic, produce one or several result *)
| Op_checkbound (* checkbound-style: no result, can raise an exn *)
| Op_load (* memory load *)
| Op_load of Asttypes.mutable_flag (* memory load *)
| Op_store of bool (* memory store, false = init, true = assign *)
| Op_other (* anything else that does not allocate nor store in memory *)

Expand All @@ -40,29 +40,30 @@ module Equations = struct
Map.Make(struct type t = rhs let compare = Stdlib.compare end)

type 'a t =
{ load_equations : 'a Rhs_map.t;
{ mutable_load_equations : 'a Rhs_map.t;
other_equations : 'a Rhs_map.t }

let empty =
{ load_equations = Rhs_map.empty;
{ mutable_load_equations = Rhs_map.empty;
other_equations = Rhs_map.empty }

let add op_class op v m =
match op_class with
| Op_load ->
{ m with load_equations = Rhs_map.add op v m.load_equations }
| Op_load Mutable ->
{ m with mutable_load_equations =
Rhs_map.add op v m.mutable_load_equations }
| _ ->
{ m with other_equations = Rhs_map.add op v m.other_equations }

let find op_class op m =
match op_class with
| Op_load ->
Rhs_map.find op m.load_equations
| Op_load Mutable ->
Rhs_map.find op m.mutable_load_equations
| _ ->
Rhs_map.find op m.other_equations

let remove_loads m =
{ load_equations = Rhs_map.empty;
let remove_mutable_loads m =
{ mutable_load_equations = Rhs_map.empty;
other_equations = m.other_equations }
end

Expand Down Expand Up @@ -190,8 +191,8 @@ let set_unknown_regs n rs =

(* Keep only the equations satisfying the given predicate. *)

let remove_load_numbering n =
{ n with num_eqs = Equations.remove_loads n.num_eqs }
let remove_mutable_load_numbering n =
{ n with num_eqs = Equations.remove_mutable_loads n.num_eqs }

(* Forget everything we know about registers of type [Addr]. *)

Expand Down Expand Up @@ -225,7 +226,7 @@ method class_of_operation op =
| Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
| Iextcall _ | Iopaque -> assert false (* treated specially *)
| Istackoffset _ -> Op_other
| Iload(_,_) -> Op_load
| Iload(_,_,mut) -> Op_load mut
| Istore(_,_,asg) -> Op_store asg
| Ialloc _ -> assert false (* treated specially *)
| Iintop(Icheckbound) -> Op_checkbound
Expand All @@ -243,11 +244,11 @@ method is_cheap_operation op =
| Iconst_int _ -> true
| _ -> false

(* Forget all equations involving memory loads. Performed after a
non-initializing store *)
(* Forget all equations involving mutable memory loads.
Performed after a non-initializing store *)

method private kill_loads n =
remove_load_numbering n
remove_mutable_load_numbering n

(* Perform CSE on the given instruction [i] and its successors.
[n] is the value numbering current at the beginning of [i]. *)
Expand Down Expand Up @@ -289,13 +290,13 @@ method private cse n i =
Moreover, allocation can trigger the asynchronous execution
of arbitrary Caml code (finalizer, signal handler, context
switch), which can contain non-initializing stores.
Hence, all equations over loads must be removed. *)
Hence, all equations over mutable loads must be removed. *)
let n1 = kill_addr_regs (self#kill_loads n) in
let n2 = set_unknown_regs n1 i.res in
{i with next = self#cse n2 i.next}
| Iop op ->
begin match self#class_of_operation op with
| (Op_pure | Op_checkbound | Op_load) as op_class ->
| (Op_pure | Op_checkbound | Op_load _) as op_class ->
let (n1, varg) = valnum_regs n i.arg in
let n2 = set_unknown_regs n1 (Proc.destroyed_at_oper i.desc) in
begin match find_equation op_class n1 (op, varg) with
Expand Down Expand Up @@ -333,7 +334,7 @@ method private cse n i =
{i with next = self#cse n2 i.next}
| Op_store true ->
(* A non-initializing store can invalidate
anything we know about prior loads. *)
anything we know about prior mutable loads. *)
let n1 = set_unknown_regs n (Proc.destroyed_at_oper i.desc) in
let n2 = set_unknown_regs n1 i.res in
let n3 = self#kill_loads n2 in
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/CSEgen.mli
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
type op_class =
| Op_pure (* pure, produce one result *)
| Op_checkbound (* checkbound-style: no result, can raise an exn *)
| Op_load (* memory load *)
| Op_load of Asttypes.mutable_flag (* memory load *)
| Op_store of bool (* memory store, false = init, true = assign *)
| Op_other (* anything else that does not allocate nor store in memory *)

Expand Down
2 changes: 1 addition & 1 deletion asmcomp/amd64/CSE.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ method! class_of_operation op =
| Ilea _ | Isextend32 | Izextend32 -> Op_pure
| Istore_int(_, _, is_asg) -> Op_store is_asg
| Ioffset_loc(_, _) -> Op_store true
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load
| Ifloatarithmem _ | Ifloatsqrtf _ -> Op_load Mutable
| Ibswap _ | Isqrtf -> super#class_of_operation op
end
| _ -> super#class_of_operation op
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ let emit_instr env fallthrough i =
if n <> 0
then cfi_adjust_cfa_offset n;
env.stack_offset <- env.stack_offset + n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let dest = res i 0 in
begin match chunk with
| Word_int | Word_val ->
Expand Down
6 changes: 3 additions & 3 deletions asmcomp/arm/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -525,10 +525,10 @@ let emit_instr env i =
let ninstr = emit_stack_adjustment (-n) in
env.stack_offset <- env.stack_offset + n;
ninstr
| Lop(Iload(Single, addr)) when !fpu >= VFPv2 ->
| Lop(Iload(Single, addr, _mut)) when !fpu >= VFPv2 ->
` flds s14, {emit_addressing addr i.arg 0}\n`;
` fcvtds {emit_reg i.res.(0)}, s14\n`; 2
| Lop(Iload((Double | Double_u), addr)) when !fpu = Soft ->
| Lop(Iload((Double | Double_u), addr, _mut)) when !fpu = Soft ->
(* Use LDM or LDRD if possible *)
begin match i.res.(0), i.res.(1), addr with
{loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
Expand All @@ -547,7 +547,7 @@ let emit_instr env i =
` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
end; 2
end
| Lop(Iload(size, addr)) ->
| Lop(Iload(size, addr, _mut)) ->
let r = i.res.(0) in
let instr =
match size with
Expand Down
5 changes: 3 additions & 2 deletions asmcomp/arm/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,8 @@ let destroyed_at_oper = function
| Iop(Iintop (Icomp _) | Iintop_imm(Icomp _, _))
when !arch >= ARMv8 && !thumb ->
[| phys_reg 3 |] (* r3 destroyed *)
| Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _, _)) ->
| Iop(Iintoffloat | Ifloatofint
| Iload(Single, _, _) | Istore(Single, _, _)) ->
[| phys_reg 107 |] (* d7 (s14-s15) destroyed *)
| _ -> [||]

Expand All @@ -325,7 +326,7 @@ let max_register_pressure = function
| Ialloc _ -> if abi = EABI then [| 7; 0; 0 |] else [| 7; 8; 8 |]
| Iconst_symbol _ when !Clflags.pic_code -> [| 7; 16; 32 |]
| Iintoffloat | Ifloatofint
| Iload(Single, _) | Istore(Single, _, _) -> [| 9; 15; 31 |]
| Iload(Single, _, _) | Istore(Single, _, _) -> [| 9; 15; 31 |]
| Iintop Imulh when !arch < ARMv6 -> [| 8; 16; 32 |]
| _ -> [| 9; 16; 32 |]

Expand Down
2 changes: 1 addition & 1 deletion asmcomp/arm/scheduling.ml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ method oper_latency = function
(* Loads have a latency of two cycles in general *)
Iconst_symbol _
| Iconst_float _
| Iload(_, _)
| Iload(_, _, _)
| Ireload
| Ifloatofint (* mcr/mrc count as memory access *)
| Iintoffloat -> 2
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/arm64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@ module BR = Branch_relaxation.Make (struct
| Lop (Iextcall { alloc = false; }) -> 1
| Lop (Iextcall { alloc = true; }) -> 3
| Lop (Istackoffset _) -> 2
| Lop (Iload (size, addr)) | Lop (Istore (size, addr, _)) ->
| Lop (Iload (size, addr, _)) | Lop (Istore (size, addr, _)) ->
let based = match addr with Iindexed _ -> 0 | Ibased _ -> 1 in
based + begin match size with Single -> 2 | _ -> 1 end
| Lop (Ialloc _) when f.fun_fast -> 5
Expand Down Expand Up @@ -673,7 +673,7 @@ let emit_instr env i =
assert (n mod 16 = 0);
emit_stack_adjustment (-n);
env.stack_offset <- env.stack_offset + n
| Lop(Iload(size, addr)) ->
| Lop(Iload(size, addr, _mut)) ->
let dst = i.res.(0) in
let base =
match addr with
Expand Down
5 changes: 3 additions & 2 deletions asmcomp/arm64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,8 @@ let destroyed_at_oper = function
destroyed_at_c_call
| Iop(Ialloc _) ->
[| reg_x8 |]
| Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _, _)) ->
| Iop( Iintoffloat | Ifloatofint
| Iload(Single, _, _) | Istore(Single, _, _)) ->
[| reg_d7 |] (* d7 / s7 destroyed *)
| _ -> [||]

Expand All @@ -277,7 +278,7 @@ let max_register_pressure = function
| Iextcall _ -> [| 7; 8 |] (* 7 integer callee-saves, 8 FP callee-saves *)
| Ialloc _ -> [| 22; 32 |]
| Iintoffloat | Ifloatofint
| Iload(Single, _) | Istore(Single, _, _) -> [| 23; 31 |]
| Iload(Single, _, _) | Istore(Single, _, _) -> [| 23; 31 |]
| _ -> [| 23; 32 |]

(* Layout of the stack *)
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/CSE.ml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ method! class_of_operation op =
(* Operations that affect the floating-point stack cannot be factored *)
| Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Iintoffloat | Ifloatofint
| Iload((Single | Double | Double_u), _) -> Op_other
| Iload((Single | Double | Double_u), _, _) -> Op_other
(* Specific ops *)
| Ispecific(Ilea _) -> Op_pure
| Ispecific(Istore_int(_, _, is_asg)) -> Op_store is_asg
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ let emit_instr env fallthrough i =
else I.sub (int n) esp;
cfi_adjust_cfa_offset n;
env.stack_offset <- env.stack_offset + n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let dest = i.res.(0) in
begin match chunk with
| Word_int | Word_val | Thirtytwo_signed | Thirtytwo_unsigned ->
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/i386/selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ let pseudoregs_for_operation op arg res =
(* For floating-point operations and floating-point loads,
the result is always left at the top of the floating-point stack *)
| Iconst_float _ | Inegf | Iabsf | Iaddf | Isubf | Imulf | Idivf
| Ifloatofint | Iload((Single | Double | Double_u), _)
| Ifloatofint | Iload((Single | Double | Double_u), _, _)
| Ispecific(Isubfrev | Idivfrev | Ifloatarithmem _ | Ifloatspecial _) ->
(arg, [| tos |], false) (* don't move it immediately *)
(* For storing a byte, the argument must be in eax...edx.
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/mach.ml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ type operation =
ty_res : Cmm.machtype; ty_args : Cmm.exttype list;
alloc : bool; }
| Istackoffset of int
| Iload of Cmm.memory_chunk * Arch.addressing_mode
| Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag
| Istore of Cmm.memory_chunk * Arch.addressing_mode * bool
| Ialloc of { bytes : int; dbginfo : Debuginfo.alloc_dbginfo; }
| Iintop of integer_operation
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/mach.mli
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ type operation =
ty_res : Cmm.machtype; ty_args : Cmm.exttype list;
alloc : bool; }
| Istackoffset of int
| Iload of Cmm.memory_chunk * Arch.addressing_mode
| Iload of Cmm.memory_chunk * Arch.addressing_mode * Asttypes.mutable_flag
| Istore of Cmm.memory_chunk * Arch.addressing_mode * bool
(* false = initialization, true = assignment *)
| Ialloc of { bytes : int; dbginfo : Debuginfo.alloc_dbginfo; }
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/power/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ module BR = Branch_relaxation.Make (struct
size 3 (2 + tocload_size()) (2 + tocload_size())
| Lop(Iextcall { alloc = false; _}) -> size 1 2 2
| Lop(Istackoffset _) -> 1
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
if chunk = Byte_signed
then load_store_size addr + 1
else load_store_size addr
Expand Down Expand Up @@ -732,7 +732,7 @@ let emit_instr env i =
| Lop(Istackoffset n) ->
` addi 1, 1, {emit_int (-n)}\n`;
adjust_stack_offset env n
| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let loadinstr =
match chunk with
| Byte_unsigned -> "lbz"
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/power/scheduling.ml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ inherit Schedgen.scheduler_generic

method oper_latency = function
Ireload -> 2
| Iload(_, _) -> 2
| Iload(_, _, _) -> 2
| Iconst_float _ -> 2 (* turned into a load *)
| Iconst_symbol _ -> 1
| Iintop(Imul | Imulh) -> 9
Expand All @@ -46,7 +46,7 @@ method! reload_retaddr_latency = 12

method oper_issue_cycles = function
Iconst_float _ | Iconst_symbol _ -> 2
| Iload(_, Ibased(_, _)) -> 2
| Iload(_, Ibased(_, _), _) -> 2
| Istore(_, Ibased(_, _), _) -> 2
| Ialloc _ -> 4
| Iintop(Imod) -> 40 (* assuming full stall *)
Expand Down
5 changes: 4 additions & 1 deletion asmcomp/printmach.ml
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,12 @@ let operation op arg ppf res =
(if alloc then "" else " (noalloc)")
| Istackoffset n ->
fprintf ppf "offset stack %i" n
| Iload(chunk, addr) ->
| Iload(chunk, addr, Immutable) ->
fprintf ppf "%s[%a]"
(Printcmm.chunk chunk) (Arch.print_addressing reg addr) arg
| Iload(chunk, addr, Mutable) ->
fprintf ppf "%s mut[%a]"
(Printcmm.chunk chunk) (Arch.print_addressing reg addr) arg
| Istore(chunk, addr, is_assign) ->
fprintf ppf "%s[%a] := %a %s"
(Printcmm.chunk chunk)
Expand Down
4 changes: 2 additions & 2 deletions asmcomp/riscv/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -302,10 +302,10 @@ let emit_instr env i =
assert (n mod 16 = 0);
emit_stack_adjustment (-n);
env.stack_offset <- env.stack_offset + n
| Lop(Iload(Single, Iindexed ofs)) ->
| Lop(Iload(Single, Iindexed ofs, _mut)) ->
` flw {emit_reg i.res.(0)}, {emit_int ofs}({emit_reg i.arg.(0)})\n`;
` fcvt.d.s {emit_reg i.res.(0)}, {emit_reg i.res.(0)}\n`
| Lop(Iload(chunk, Iindexed ofs)) ->
| Lop(Iload(chunk, Iindexed ofs, _mut)) ->
let instr =
match chunk with
| Byte_unsigned -> "lbu"
Expand Down
2 changes: 1 addition & 1 deletion asmcomp/s390x/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ let emit_instr env i =
emit_stack_adjust n;
env.stack_offset <- env.stack_offset + n

| Lop(Iload(chunk, addr)) ->
| Lop(Iload(chunk, addr, _mut)) ->
let loadinstr =
match chunk with
Byte_unsigned -> "llgc"
Expand Down