ocaml · xavierleroy · Nov 26, 2020 · Aug 27, 2020 · Aug 27, 2020 · Aug 28, 2020
diff --git a/Changes b/Changes
@@ -11,6 +11,9 @@ Working version
 
 ### Code generation and optimizations:
 
+- #9937: improvements in ARM64 code generation (constants, sign extensions)
+  (Xavier Leroy, review by Stephen Dolan)
+
 ### Standard library:
 
 ### Other libraries:
@@ -38,6 +41,7 @@ Working version
 - #10005: Try expanding aliases in Ctype.nondep_type_rec
   (Stephen Dolan, review by Gabriel Scherer, Leo White and Xavier Leroy)
 
+
 OCaml 4.12.0
 ------------
 

diff --git a/asmcomp/arm64/arch.ml b/asmcomp/arm64/arch.ml
@@ -58,6 +58,7 @@ type specific_operation =
   | Isqrtf        (* floating-point square root *)
   | Ibswap of int (* endianness conversion *)
   | Imove32       (* 32-bit integer move *)
+  | Isignext of int (* sign extension *)
 
 and arith_operation =
     Ishiftadd
@@ -169,3 +170,74 @@ let print_specific_operation printreg op ppf arg =
   | Imove32 ->
       fprintf ppf "move32 %a"
         printreg arg.(0)
+  | Isignext n ->
+      fprintf ppf "signext%d %a"
+        n printreg arg.(0)
+
+(* Recognition of logical immediate arguments *)
+
+(* An automaton to recognize ( 0+1+0* | 1+0+1* )
+
+               0          1          0
+              / \        / \        / \
+              \ /        \ /        \ /
+        -0--> [1] --1--> [2] --0--> [3]
+       /
+     [0]
+       \
+        -1--> [4] --0--> [5] --1--> [6]
+              / \        / \        / \
+              \ /        \ /        \ /
+               1          0          1
+
+The accepting states are 2, 3, 5 and 6.
+
+The pattern is written in the order in which bits are consumed.
+[run_automata] feeds the bits of its input starting from the least
+significant one, so the leftmost symbol of the pattern is bit 0.
+
+State 7 (not drawn above) is the error state: it is entered from
+state 3 on a 1 and from state 6 on a 0, that is, when a fourth run of
+identical bits begins, and it is never left. *)
+
+let auto_table = [|   (* accepting?, next on 0, next on 1 *)
+  (* state 0 *) (false, 1, 4);
+  (* state 1 *) (false, 1, 2);
+  (* state 2 *) (true,  3, 2);
+  (* state 3 *) (true,  3, 7);
+  (* state 4 *) (false, 5, 4);
+  (* state 5 *) (true,  5, 6);
+  (* state 6 *) (true,  7, 6);
+  (* state 7 *) (false, 7, 7)   (* error state *)
+|]
+
+(* [run_automata nbits state input] runs the automaton described by
+   [auto_table] from [state] over the [nbits] low-order bits of [input],
+   least significant bit first.  It returns [true] iff the state reached
+   after consuming those bits is accepting. *)
+let rec run_automata nbits state input =
+  let (accepting, on_zero, on_one) = auto_table.(state) in
+  if nbits > 0 then begin
+    (* Consume bit 0 of [input], then carry on with the remaining bits *)
+    let next_state =
+      if Nativeint.logand input 1n = 0n then on_zero else on_one in
+    run_automata (nbits - 1) next_state
+                 (Nativeint.shift_right_logical input 1)
+  end else
+    accepting
+
+(* [logical_imm_length x] returns the smallest length [e] among
+   64, 32, 16, 8, 4, 2 such that [x] is a repetition [BB...B] of a
+   bit pattern [B] of length [e].
+   NOTE(review): the shift by 32 presumes a 64-bit [nativeint]; this
+   should hold wherever ARM64 code is generated -- to be confirmed. *)
+
+let logical_imm_length x =
+  (* [same_halves n] holds when bits [0..n-1] of [x] are equal to
+     bits [n..2n-1] of [x], i.e. the low [2n] bits have the form [BB]. *)
+  let same_halves n =
+    let mask = Nativeint.pred (Nativeint.shift_left 1n n) in
+    let lo = Nativeint.logand x mask
+    and hi = Nativeint.logand (Nativeint.shift_right_logical x n) mask in
+    Nativeint.equal lo hi in
+  (* Invariant: [x] is a repetition of its low [e] bits.  If the two
+     halves of those [e] bits agree, [x] is also a repetition of its
+     low [e/2] bits.  Keep halving until the halves differ or the
+     minimal length 2 is reached. *)
+  let rec shrink e =
+    if e > 2 && same_halves (e / 2) then shrink (e / 2) else e in
+  shrink 64
+
+(* [is_logical_immediate x] tells whether [x] is a valid logical
+   immediate, that is, whether
+- [x] is neither [0] nor [-1];
+- [x] is a repetition [BB...B] of a bit pattern [B] of length [e]; and
+- [B], i.e. the low [e] bits of [x], matches [0+1+0*] or [1+0+1*].
+*)
+
+let is_logical_immediate x =
+  match x with
+  | 0n | -1n -> false
+  | _ -> run_automata (logical_imm_length x) 0 x
diff --git a/asmcomp/arm64/emit.mlp b/asmcomp/arm64/emit.mlp
@@ -226,63 +226,55 @@ let name_for_int_operation = function
   | Iasr -> "asr"
   | _ -> assert false
 
+(* Split the 64-bit integer [n] into its four 16-bit fragments and return,
+   in order of increasing bit position, the pairs [(fragment, position)]
+   (position = 0, 16, 32 or 48) whose fragment differs from [default].
+   [default] is meant to be 16 zeros ([0x0000n]) or 16 ones ([0xFFFFn]). *)
+
+let decompose_int default n =
+  let fragment_at pos =
+    let frag = Nativeint.(logand (shift_right_logical n pos) 0xFFFFn) in
+    if frag = default then None else Some (frag, pos)
+  in
+  List.filter_map fragment_at [0; 16; 32; 48]
+
 (* Load an integer constant into a register *)
 
+(* [emit_movk dst (f, p)] emits a [movk] instruction on register [dst]
+   with immediate [f] shifted left by [p] bits.  [(f, p)] is a
+   (fragment, position) pair as produced by [decompose_int]. *)
+let emit_movk dst (f, p) =
+    `	movk	{emit_reg dst}, #{emit_nativeint f}, lsl #{emit_int p}\n`
+
+(* [emit_intconst dst n] emits code that loads the constant [n] into
+   register [dst]:
+   - a single [orr] from [xzr] when [n] is a logical immediate;
+   - otherwise, either [movz] followed by [movk]s for the 16-bit
+     fragments of [n] that are not 0, or [movn] followed by [movk]s for
+     the fragments that are not 0xFFFF, whichever sequence is shorter
+     (the [movz] form is used on ties).
+   [num_instructions_for_intconst] mirrors these choices and must be
+   kept consistent with this function. *)
 let emit_intconst dst n =
-  let rec emit_pos first shift =
-    if shift < 0 then begin
-      if first then `	mov	{emit_reg dst}, xzr\n`
-    end else begin
-      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
-      if s = 0n then emit_pos first (shift - 16) else begin
-        if first then
-          `	movz	{emit_reg dst}, #{emit_nativeint s}, lsl #{emit_int shift}\n`
-        else
-           `	movk	{emit_reg dst}, #{emit_nativeint s}, lsl #{emit_int shift}\n`;
-        emit_pos false (shift - 16)
-      end
-    end
-  and emit_neg first shift =
-    if shift < 0 then begin
-      if first then `	movn	{emit_reg dst}, #0\n`
+  if is_logical_immediate n then
+    `	orr	{emit_reg dst}, xzr, #{emit_nativeint n}\n`
+  else begin
+    (* [dz]: fragments to set when starting from all zeros;
+       [dn]: fragments to set when starting from all ones. *)
+    let dz = decompose_int 0x0000n n
+    and dn = decompose_int 0xFFFFn n in
+    if List.length dz <= List.length dn then begin
+      match dz with
+      (* [dz] is empty only when [n] is 0 *)
+      | [] ->
+          `	mov	{emit_reg dst}, xzr\n`
+      | (f, p) :: l ->
+          `	movz	{emit_reg dst}, #{emit_nativeint f}, lsl #{emit_int p}\n`;
+          List.iter (emit_movk dst) l
     end else begin
-      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
-      if s = 0xFFFFn then emit_neg first (shift - 16) else begin
-        if first then
-          `	movn	{emit_reg dst}, #{emit_nativeint (Nativeint.logxor s 0xFFFFn)}, lsl #{emit_int shift}\n`
-        else
-           `	movk	{emit_reg dst}, #{emit_nativeint s}, lsl #{emit_int shift}\n`;
-        emit_neg false (shift - 16)
-      end
+      match dn with
+      (* [dn] is empty only when [n] is -1 *)
+      | [] ->
+          `	movn	{emit_reg dst}, #0\n`
+      | (f, p) :: l ->
+          (* [movn] writes the bitwise complement of its shifted
+             immediate: pass the complemented fragment so that [f] ends
+             up at position [p] and every other bit is 1. *)
+          let nf = Nativeint.logxor f 0xFFFFn in
+          `	movn	{emit_reg dst}, #{emit_nativeint nf}, lsl #{emit_int p}\n`;
+          List.iter (emit_movk dst) l
     end
-  in
-    if n < 0n then emit_neg true 48 else emit_pos true 48
+  end
 
+(* [num_instructions_for_intconst n] returns the number of instructions
+   that [emit_intconst] emits to load [n]: 1 for a logical immediate,
+   otherwise the length of the shorter of the two decompositions of [n]
+   (fragments that are not 0, fragments that are not 0xFFFF). *)
 let num_instructions_for_intconst n =
-  let num_instructions = ref 0 in
-  let rec count_pos first shift =
-    if shift < 0 then begin
-      if first then incr num_instructions
-    end else begin
-      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
-      if s = 0n then count_pos first (shift - 16) else begin
-        incr num_instructions;
-        count_pos false (shift - 16)
-      end
-    end
-  and count_neg first shift =
-    if shift < 0 then begin
-      if first then incr num_instructions
-    end else begin
-      let s = Nativeint.(logand (shift_right_logical n shift) 0xFFFFn) in
-      if s = 0xFFFFn then count_neg first (shift - 16) else begin
-        incr num_instructions;
-        count_neg false (shift - 16)
-      end
-    end
-  in
-  if n < 0n then count_neg true 48 else count_pos true 48;
-  !num_instructions
+  if is_logical_immediate n then 1 else begin
+    let dz = decompose_int 0x0000n n
+    and dn = decompose_int 0xFFFFn n in
+    (* An empty decomposition (n = 0 or n = -1) still costs the single
+       [mov] / [movn] emitted by [emit_intconst], hence [max 1]. *)
+    max 1 (min (List.length dz) (List.length dn))
+  end
 
 (* Recognize float constants appropriate for FMOV dst, #fpimm instruction:
    "a normalized binary floating point encoding with 1 sign bit, 4
@@ -534,6 +526,7 @@ module BR = Branch_relaxation.Make (struct
     | Lop (Ispecific (Ibswap 16)) -> 2
     | Lop (Ispecific (Ibswap _)) -> 1
     | Lop (Ispecific Imove32) -> 1
+    | Lop (Ispecific (Isignext _)) -> 1
     | Lop (Iname_for_debugger _) -> 0
     | Lreloadretaddr -> 0
     | Lreturn -> epilogue_size ()
@@ -880,6 +873,8 @@ let emit_instr i =
         | _ ->
             assert false
         end
+    | Lop(Ispecific(Isignext size)) ->
+        `	sbfm	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #0, #{emit_int (size - 1)}\n`
     | Lop (Iname_for_debugger _) -> ()
     | Lreloadretaddr ->
         ()

diff --git a/asmcomp/arm64/selection.ml b/asmcomp/arm64/selection.ml
@@ -34,47 +34,8 @@ let is_offset chunk n =
     | Word_int | Word_val | Double | Double_u ->
         n land 7 = 0 && n lsr 3 < 0x1000)
 
-(* An automaton to recognize ( 0+1+0* | 1+0+1* )
-
-               0          1          0
-              / \        / \        / \
-              \ /        \ /        \ /
-        -0--> [1] --1--> [2] --0--> [3]
-       /
-     [0]
-       \
-        -1--> [4] --0--> [5] --1--> [6]
-              / \        / \        / \
-              \ /        \ /        \ /
-               1          0          1
-
-The accepting states are 2, 3, 5 and 6. *)
-
-let auto_table = [|   (* accepting?, next on 0, next on 1 *)
-  (* state 0 *) (false, 1, 4);
-  (* state 1 *) (false, 1, 2);
-  (* state 2 *) (true,  3, 2);
-  (* state 3 *) (true,  3, 7);
-  (* state 4 *) (false, 5, 4);
-  (* state 5 *) (true,  5, 6);
-  (* state 6 *) (true,  7, 6);
-  (* state 7 *) (false, 7, 7)   (* error state *)
-|]
-
-let rec run_automata nbits state input =
-  let (acc, next0, next1) = auto_table.(state) in
-  if nbits <= 0
-  then acc
-  else run_automata (nbits - 1)
-                    (if input land 1 = 0 then next0 else next1)
-                    (input asr 1)
-
-(* We are very conservative wrt what ARM64 supports: we don't support
-   repetitions of a 000111000 or 1110000111 pattern, just a single
-   pattern of this kind. *)
-
+(* [is_logical_immediate n] tells whether the integer [n] is a valid
+   logical immediate.  The check is delegated to
+   [Arch.is_logical_immediate], applied to [n] converted to
+   [nativeint]. *)
 let is_logical_immediate n =
-  n <> 0 && n <> -1 && run_automata 64 0 n
+  Arch.is_logical_immediate (Nativeint.of_int n)
 
 (* Signed immediates are simpler *)
 
@@ -199,6 +160,14 @@ method! select_operation op args dbg =
       | _ ->
           super#select_operation op args dbg
       end
+  (* Recognize sign extension *)
+  | Casr ->
+      begin match args with
+        [Cop(Clsl, [k; Cconst_int (n, _)], _); Cconst_int (n', _)]
+        when n' = n && 0 < n && n < 64 ->
+          (Ispecific (Isignext (64 - n)), [k])
+        | _ -> super#select_operation op args dbg
+      end
   (* Recognize floating-point negate and multiply *)
   | Cnegf ->
       begin match args with