Skip to content

Commit

Permalink
Support more arguments to tail calls by passing them through the doma…
Browse files Browse the repository at this point in the history
…in state (#10595)

In 2004, commit af9b98f, the calling conventions for the i386 port of ocamlopt were changed: the first 6 integer arguments go into registers, like before, but the next 16 arguments go into a global array `caml_extra_params`, instead of being passed on stack like before.  The reason for this hack is that passing arguments in global memory does not preclude tail call optimization, unlike passing arguments on stack.  Parameters passed via `caml_extra_params` are immediately copied on stack or in registers on function entry, before another function call, a GC, or a context switch can take place, so everything is safe in OCaml, and in Multicore OCaml as long as there is only one execution domain.

This hack was justified by the paucity of registers provided by the i386 architecture.  It was believed that other architectures provide enough registers for parameter passing that most if not all reasonable tail calls can be accommodated.

Now it's 2021 and users want tail calls with more arguments than available registers on all the architectures we support.

So, biting the bullet and swallowing some pride, this commit extends the 2004 i386 hack to all the architectures supported by OCaml.  Once the registers available for passing function arguments are exhausted, the next 64 arguments are passed in a memory area that is part of the domain state.  This argument passing is compatible with tail calls, so we get guaranteed tail calls up to 70 arguments (in the worst case).

The domain state is used instead of a global array so that (1) this is compatible with Multicore OCaml and concurrent execution of multiple domains, and (2) we benefit from efficient addressing from the domain state register.

For i386, we don't have a domain state register, and Multicore OCaml will support only one domain on this architecture, so we keep using a global `caml_extra_params` array; only, its size was increased to support 64 arguments.

The tests for tail calls were extended to 
- Test tail calls to other functions, not just to self
- Test up to 32 arguments.
  • Loading branch information
xavierleroy committed Sep 9, 2021
1 parent 8177967 commit 0117428
Show file tree
Hide file tree
Showing 24 changed files with 333 additions and 148 deletions.
5 changes: 5 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ Working version
and long register allocation times.
(Xavier Leroy, report by Edwin Török, review by Nicolás Ojeda Bär)

- #10595: Tail calls with up to 64 arguments are guaranteed to be compiled
as tail calls. To this end, memory locations in the domain state
are used for passing arguments that do not fit in registers.
(Xavier Leroy, review by Vincent Laviron)

### Standard library:

* #7812, #10475: `Filename.chop_suffix name suff` now checks that `suff`
Expand Down
18 changes: 12 additions & 6 deletions asmcomp/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,13 @@ let frame_size env = (* includes return address *)

let slot_offset env loc cl =
match loc with
| Incoming n -> (frame_size env) + n
| Incoming n -> frame_size env + n
| Local n ->
if cl = 0
then env.stack_offset + n * 8
else env.stack_offset + (env.f.fun_num_stack_slots.(0) + n) * 8
| Outgoing n -> n
| Domainstate _ -> assert false (* not a stack slot *)

(* Symbols *)

Expand Down Expand Up @@ -171,14 +172,18 @@ let emit_Llabel env fallthrough lbl =

(* Output a pseudo-register *)

let x86_data_type_for_stack_slot = function
| Float -> REAL8
| _ -> QWORD

let reg env = function
| { loc = Reg.Reg r } -> register_name r
| { loc = Stack s; typ = Float } as r ->
let ofs = slot_offset env s (register_class r) in
mem64 REAL8 ofs RSP
| { loc = Stack s } as r ->
| { loc = Stack(Domainstate n); typ = ty } ->
let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in
mem64 (x86_data_type_for_stack_slot ty) ofs R14
| { loc = Stack s; typ = ty } as r ->
let ofs = slot_offset env s (register_class r) in
mem64 QWORD ofs RSP
mem64 (x86_data_type_for_stack_slot ty) ofs RSP
| { loc = Unknown } ->
assert false

Expand All @@ -188,6 +193,7 @@ let reg64 = function

let arg env i n = reg env i.arg.(n)
let res env i n = reg env i.res.(n)

(* Output a reference to the lower 8, 16 or 32 bits of a register *)

let reg_low_8_name = Array.map (fun r -> Reg8L r) int_reg_name
Expand Down
39 changes: 24 additions & 15 deletions asmcomp/amd64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ let win64 = Arch.win64
stub saves them into the GC regs block).
*)

let max_arguments_for_tailcalls = 10

let int_reg_name =
match Config.ccomp_type with
| "msvc" ->
Expand Down Expand Up @@ -157,12 +155,15 @@ let word_addressed = false

(* Calling conventions *)

let calling_conventions first_int last_int first_float last_float make_stack
let size_domainstate_args = 64 * size_int

let calling_conventions first_int last_int first_float last_float
make_stack first_stack
arg =
let loc = Array.make (Array.length arg) Reg.dummy in
let int = ref first_int in
let float = ref first_float in
let ofs = ref 0 in
let ofs = ref first_stack in
for i = 0 to Array.length arg - 1 do
match arg.(i) with
| Val | Int | Addr as ty ->
Expand All @@ -183,21 +184,29 @@ let calling_conventions first_int last_int first_float last_float make_stack
ofs := !ofs + size_float
end
done;
(loc, Misc.align !ofs 16) (* keep stack 16-aligned *)

let incoming ofs = Incoming ofs
let outgoing ofs = Outgoing ofs
(loc, Misc.align (max 0 !ofs) 16) (* keep stack 16-aligned *)

let incoming ofs =
if ofs >= 0
then Incoming ofs
else Domainstate (ofs + size_domainstate_args)
let outgoing ofs =
if ofs >= 0
then Outgoing ofs
else Domainstate (ofs + size_domainstate_args)
let not_supported _ofs = fatal_error "Proc.loc_results: cannot call"

let loc_arguments arg =
calling_conventions 0 9 100 109 outgoing arg
calling_conventions 0 9 100 109 outgoing (- size_domainstate_args) arg
let loc_parameters arg =
let (loc, _ofs) =
calling_conventions 0 9 100 109 incoming arg
in
loc
calling_conventions 0 9 100 109 incoming (- size_domainstate_args) arg
in loc
let loc_results res =
let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported res in loc
let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported 0 res
in loc

let max_arguments_for_tailcalls = 10 (* in regs *) + 64 (* in domain state *)

(* C calling conventions under Unix:
first integer args in rdi, rsi, rdx, rcx, r8, r9
Expand All @@ -213,10 +222,10 @@ let loc_results res =
Return value in rax or xmm0. *)

let loc_external_results res =
let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported res in loc
let (loc, _ofs) = calling_conventions 0 0 100 100 not_supported 0 res in loc

let unix_loc_external_arguments arg =
calling_conventions 2 7 100 107 outgoing arg
calling_conventions 2 7 100 107 outgoing 0 arg

let win64_int_external_arguments =
[| 5 (*rcx*); 4 (*rdx*); 6 (*r8*); 7 (*r9*) |]
Expand Down
7 changes: 6 additions & 1 deletion asmcomp/arm/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,18 @@ let slot_offset env loc cl =
| Outgoing n ->
assert (n >= 0);
n
| Domainstate _ -> assert false (* not a stack slot *)

(* Output a stack reference *)

let emit_stack env r =
match r.loc with
| Stack (Domainstate n) ->
let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in
`[domain_state_ptr, #{emit_int ofs}]`
| Stack s ->
let ofs = slot_offset env s (register_class r) in `[sp, #{emit_int ofs}]`
let ofs = slot_offset env s (register_class r) in
`[sp, #{emit_int ofs}]`
| _ -> fatal_error "Emit_arm.emit_stack"

(* Output an addressing mode *)
Expand Down
36 changes: 23 additions & 13 deletions asmcomp/arm/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ let stack_slot slot ty =

(* Calling conventions *)

let size_domainstate_args = 64 * size_int

let loc_int last_int make_stack int ofs =
if !int <= last_int then begin
let l = phys_reg !int in
Expand Down Expand Up @@ -149,41 +151,49 @@ let loc_int_pair last_int make_stack int ofs =
[| stack_lower; stack_upper |]
end

let calling_conventions first_int last_int first_float last_float make_stack
arg =
let calling_conventions first_int last_int first_float last_float
make_stack first_stack arg =
let loc = Array.make (Array.length arg) Reg.dummy in
let int = ref first_int in
let float = ref first_float in
let ofs = ref 0 in
let ofs = ref first_stack in
for i = 0 to Array.length arg - 1 do
match arg.(i) with
| Val | Int | Addr ->
loc.(i) <- loc_int last_int make_stack int ofs
| Float ->
loc.(i) <- loc_float last_float make_stack float ofs
done;
(loc, Misc.align !ofs 8) (* keep stack 8-aligned *)

let incoming ofs = Incoming ofs
let outgoing ofs = Outgoing ofs
(loc, Misc.align (max 0 !ofs) 8) (* keep stack 8-aligned *)

let incoming ofs =
if ofs >= 0
then Incoming ofs
else Domainstate (ofs + size_domainstate_args)
let outgoing ofs =
if ofs >= 0
then Outgoing ofs
else Domainstate (ofs + size_domainstate_args)
let not_supported _ofs = fatal_error "Proc.loc_results: cannot call"

(* OCaml calling convention:
first integer args in r0...r7
first float args in d0...d15 (EABI+VFP)
remaining args on stack.
remaining args in domain state area, then on stack.
Return values in r0...r7 or d0...d15. *)

let max_arguments_for_tailcalls = 8
let max_arguments_for_tailcalls = 8 (* in regs *) + 64 (* in domain state *)

let loc_arguments arg =
calling_conventions 0 7 100 115 outgoing arg
calling_conventions 0 7 100 115 outgoing (- size_domainstate_args) arg

let loc_parameters arg =
let (loc, _) = calling_conventions 0 7 100 115 incoming arg in loc
let (loc, _) =
calling_conventions 0 7 100 115 incoming (- size_domainstate_args) arg
in loc

let loc_results res =
let (loc, _) = calling_conventions 0 7 100 115 not_supported res in loc
let (loc, _) = calling_conventions 0 7 100 115 not_supported 0 res in loc

(* C calling convention:
first integer args in r0...r3
Expand Down Expand Up @@ -218,7 +228,7 @@ let loc_external_arguments ty_args =
external_calling_conventions 0 3 100 107 outgoing ty_args

let loc_external_results res =
let (loc, _) = calling_conventions 0 1 100 100 not_supported res
let (loc, _) = calling_conventions 0 1 100 100 not_supported 0 res
in loc

let loc_exn_bucket = phys_reg 0
Expand Down
7 changes: 6 additions & 1 deletion asmcomp/arm64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,18 @@ let slot_offset env loc cl =
| Outgoing n ->
assert (n >= 0);
n
| Domainstate _ -> assert false (* not a stack slot *)

(* Output a stack reference *)

let emit_stack env r =
match r.loc with
| Stack (Domainstate n) ->
let ofs = n + Domainstate.(idx_of_field Domain_extra_params) * 8 in
`[{emit_reg reg_domain_state_ptr}, #{emit_int ofs}]`
| Stack s ->
let ofs = slot_offset env s (register_class r) in `[sp, #{emit_int ofs}]`
let ofs = slot_offset env s (register_class r) in
`[sp, #{emit_int ofs}]`
| _ -> fatal_error "Emit.emit_stack"

(* Output an addressing mode *)
Expand Down
35 changes: 23 additions & 12 deletions asmcomp/arm64/proc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ let stack_slot slot ty =

(* Calling conventions *)

let size_domainstate_args = 64 * size_int

let loc_int last_int make_stack int ofs =
if !int <= last_int then begin
let l = phys_reg !int in
Expand Down Expand Up @@ -138,43 +140,52 @@ let loc_int32 last_int make_stack int ofs =
end

let calling_conventions
first_int last_int first_float last_float make_stack arg =
first_int last_int first_float last_float make_stack first_stack arg =
let loc = Array.make (Array.length arg) Reg.dummy in
let int = ref first_int in
let float = ref first_float in
let ofs = ref 0 in
let ofs = ref first_stack in
for i = 0 to Array.length arg - 1 do
match arg.(i) with
| Val | Int | Addr ->
loc.(i) <- loc_int last_int make_stack int ofs
| Float ->
loc.(i) <- loc_float last_float make_stack float ofs
done;
(loc, Misc.align !ofs 16) (* keep stack 16-aligned *)

let incoming ofs = Incoming ofs
let outgoing ofs = Outgoing ofs
(loc, Misc.align (max 0 !ofs) 16) (* keep stack 16-aligned *)

let incoming ofs =
if ofs >= 0
then Incoming ofs
else Domainstate (ofs + size_domainstate_args)
let outgoing ofs =
if ofs >= 0
then Outgoing ofs
else Domainstate (ofs + size_domainstate_args)
let not_supported _ofs = fatal_error "Proc.loc_results: cannot call"

(* OCaml calling convention:
first integer args in r0...r15
first float args in d0...d15
remaining args on stack.
remaining args in domain state area, then on stack.
Return values in r0...r15 or d0...d15. *)

let max_arguments_for_tailcalls = 16
let max_arguments_for_tailcalls = 16 (* in regs *) + 64 (* in domain state *)

let last_int_register = if macosx then 7 else 15

let loc_arguments arg =
calling_conventions 0 last_int_register 100 115 outgoing arg
calling_conventions 0 last_int_register 100 115
outgoing (- size_domainstate_args) arg
let loc_parameters arg =
let (loc, _) =
calling_conventions 0 last_int_register 100 115 incoming arg
calling_conventions 0 last_int_register 100 115
incoming (- size_domainstate_args) arg
in
loc
let loc_results res =
let (loc, _) =
calling_conventions 0 last_int_register 100 115 not_supported res
calling_conventions 0 last_int_register 100 115 not_supported 0 res
in
loc

Expand Down Expand Up @@ -208,7 +219,7 @@ let loc_external_arguments ty_args =
external_calling_conventions 0 7 100 107 outgoing ty_args

let loc_external_results res =
let (loc, _) = calling_conventions 0 1 100 100 not_supported res in loc
let (loc, _) = calling_conventions 0 1 100 100 not_supported 0 res in loc

let loc_exn_bucket = phys_reg 0

Expand Down
23 changes: 16 additions & 7 deletions asmcomp/i386/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ let slot_offset env loc cl =
| Outgoing n ->
assert (n >= 0);
n
| Domainstate _ -> assert false (* not a stack slot *)

(* Record symbols used and defined - at the end generate extern for those
used but not defined *)
Expand Down Expand Up @@ -138,16 +139,24 @@ let domain_field f r =
let load_domain_state r =
I.mov (sym32 "Caml_state") r

let x86_data_type_for_stack_slot = function
| Float -> REAL8
| _ -> DWORD

(* The Domainstate locations are mapped to a global array "caml_extra_params"
defined in runtime/i386*. We cannot access the domain state here
because in the i386 port there is no register that always point to the
domain state. A global array works because i386 does not
support multiple domains. *)

let reg env = function
| { loc = Reg r } -> register_name r
| { loc = Stack(Incoming n | Outgoing n) } when n < 0 ->
sym32 "caml_extra_params" ~ofs:(n + 64)
| { loc = Stack s; typ = Float } as r ->
let ofs = slot_offset env s (register_class r) in
mem32 REAL8 ofs RSP
| { loc = Stack s } as r ->
| { loc = Stack(Domainstate n); typ = ty } ->
mem_sym (x86_data_type_for_stack_slot ty)
(emit_symbol "caml_extra_params") ~ofs:n
| { loc = Stack s; typ = ty } as r ->
let ofs = slot_offset env s (register_class r) in
mem32 DWORD ofs RSP
mem32 (x86_data_type_for_stack_slot ty) ofs RSP
| { loc = Unknown } ->
fatal_error "Emit_i386.reg"

Expand Down

0 comments on commit 0117428

Please sign in to comment.