From 870a670cd34a1c3fa4d1f64cae410d3bb394bdf8 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <jkun@google.com>
Date: Tue, 26 Mar 2024 16:36:22 -0700
Subject: [PATCH] Add `heir-simd-vectorizer` pipeline

This is intended to be a consistent bundling of the ported HECO optimizations, so that all tests can use them identically.

Fixes https://github.com/google/heir/issues/556
Depends on https://github.com/google/heir/pull/531 for changes to hamming-distance test

PiperOrigin-RevId: 619348713
---
 docs/content/en/docs/pipelines.md             | 22 +++++++++++++
 tests/{simd => heir_simd_vectorizer}/BUILD    |  0
 .../box_blur_4x4.mlir                         |  6 ++--
 .../box_blur_64x64.mlir                       |  7 ++--
 .../hamming_distance.mlir                     | 10 +++---
 .../simple_sum.mlir                           |  6 ++--
 tools/BUILD                                   |  3 ++
 tools/heir-opt.cpp                            | 33 +++++++++++++++++++
 8 files changed, 70 insertions(+), 17 deletions(-)
 rename tests/{simd => heir_simd_vectorizer}/BUILD (100%)
 rename tests/{simd => heir_simd_vectorizer}/box_blur_4x4.mlir (85%)
 rename tests/{simd => heir_simd_vectorizer}/box_blur_64x64.mlir (92%)
 rename tests/{simd => heir_simd_vectorizer}/hamming_distance.mlir (69%)
 rename tests/{simd => heir_simd_vectorizer}/simple_sum.mlir (72%)

diff --git a/docs/content/en/docs/pipelines.md b/docs/content/en/docs/pipelines.md
index ba77f23a9..4f07e6606 100644
--- a/docs/content/en/docs/pipelines.md
+++ b/docs/content/en/docs/pipelines.md
@@ -7,6 +7,27 @@ weight: 9
 
 ## `heir-opt`
 
+### `--heir-simd-vectorizer`
+
+Run scheme-agnostic passes to convert FHE programs that operate on scalar types
+to equivalent programs that operate on vectors.
+
+This pipeline is intended to process FHE programs that are known to be good for
+SIMD, but for which a specific FHE scheme has not yet been chosen. It expects
+`arith` ops operating on `tensor` types (with or without `secret.generic`).
+
+The pipeline unrolls all loops, then applies a series of passes that convert
+scalar operations on tensor elements to SIMD operations on full tensors. This
+uses the FHE computational model common to BGV, BFV, and CKKS, in which data is
+packed in polynomial ciphertexts, interpreted as vectors of individual data
+elements, and arithmetic can be applied across entire ciphertexts, with some
+limited support for rotations via automorphisms of the underlying ring.
+
+Along the way, this pipeline applies heuristic optimizations to minimize the
+number of rotations needed, relying on the implicit cost model that rotations
+are generally expensive. The specific set of passes can be found in
+`tools/heir-opt.cpp` where the pipeline is defined.
+
 ### `--heir-tosa-to-arith`
 
 Lowers a TOSA MLIR model to `func`, `arith`, and `memref`.
@@ -26,6 +47,7 @@ tool can lower a TFLite FlatBuffer to textual MLIR with
 [hello_world.tosa.mlir](https://github.com/google/heir/blob/main/tests/hello_world.tosa.mlir)
 for an example.
 
+
 ### `--yosys-optimizer`
 
 Uses Yosys to booleanize and optimize MLIR functions.
diff --git a/tests/simd/BUILD b/tests/heir_simd_vectorizer/BUILD
similarity index 100%
rename from tests/simd/BUILD
rename to tests/heir_simd_vectorizer/BUILD
diff --git a/tests/simd/box_blur_4x4.mlir b/tests/heir_simd_vectorizer/box_blur_4x4.mlir
similarity index 85%
rename from tests/simd/box_blur_4x4.mlir
rename to tests/heir_simd_vectorizer/box_blur_4x4.mlir
index 007ba1d35..c99e48f55 100644
--- a/tests/simd/box_blur_4x4.mlir
+++ b/tests/heir_simd_vectorizer/box_blur_4x4.mlir
@@ -1,13 +1,11 @@
 // RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse \
-// RUN:   --full-loop-unroll \
-// RUN:   --insert-rotate --cse --canonicalize --collapse-insertion-chains --canonicalize --cse \
-// RUN:   %s | FileCheck %s
+// RUN:   --heir-simd-vectorizer %s | FileCheck %s
 
 module  {
   // CHECK-LABEL: @box_blur
   // CHECK-NOT: tensor.extract
   // CHECK-COUNT-7: tensor_ext.rotate
-  func.func @box_blur(%arg0: tensor<16xi16> {secret.secret}) -> tensor<16xi16> {
+  func.func @box_blur(%arg0: tensor<16xi16>) -> tensor<16xi16> {
     %c16 = arith.constant 16 : index
     %c4 = arith.constant 4 : index
     %0 = affine.for %x = 0 to 4 iter_args(%arg0_x = %arg0) -> (tensor<16xi16>) {
diff --git a/tests/simd/box_blur_64x64.mlir b/tests/heir_simd_vectorizer/box_blur_64x64.mlir
similarity index 92%
rename from tests/simd/box_blur_64x64.mlir
rename to tests/heir_simd_vectorizer/box_blur_64x64.mlir
index 21fb28d95..8e6fb5d57 100644
--- a/tests/simd/box_blur_64x64.mlir
+++ b/tests/heir_simd_vectorizer/box_blur_64x64.mlir
@@ -1,6 +1,5 @@
-// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse --full-loop-unroll \
-// RUN:   --insert-rotate --cse --canonicalize --collapse-insertion-chains --canonicalize --cse \
-// RUN:   %s | FileCheck %s
+// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse \
+// RUN:   --heir-simd-vectorizer %s | FileCheck %s
 
 module  {
   // CHECK-LABEL: @box_blur
@@ -30,7 +29,7 @@ module  {
   // CHECK-NEXT:     secret.yield %[[v15]]
   // CHECK-NEXT:   } -> !secret.secret<tensor<4096xi16>>
   // CHECK-NEXT:   return %[[v0]]
-  func.func @box_blur(%arg0: tensor<4096xi16> {secret.secret}) -> tensor<4096xi16> {
+  func.func @box_blur(%arg0: tensor<4096xi16>) -> tensor<4096xi16> {
     %c4096 = arith.constant 4096 : index
     %c64 = arith.constant 64 : index
     %0 = affine.for %x = 0 to 64 iter_args(%arg0_x = %arg0) -> (tensor<4096xi16>) {
diff --git a/tests/simd/hamming_distance.mlir b/tests/heir_simd_vectorizer/hamming_distance.mlir
similarity index 69%
rename from tests/simd/hamming_distance.mlir
rename to tests/heir_simd_vectorizer/hamming_distance.mlir
index f8aa39500..84151900f 100644
--- a/tests/simd/hamming_distance.mlir
+++ b/tests/heir_simd_vectorizer/hamming_distance.mlir
@@ -1,6 +1,5 @@
 // RUN: heir-opt --secretize=entry-function=hamming --wrap-generic --canonicalize --cse \
-// RUN:   --full-loop-unroll --cse --canonicalize --insert-rotate --cse --canonicalize \
-// RUN:   %s | FileCheck %s
+// RUN:   --heir-simd-vectorizer %s | FileCheck %s
 
 // CHECK-LABEL: @hamming
 // CHECK: secret.generic
@@ -15,10 +14,11 @@
 // CHECK-NEXT: tensor.extract
 // CHECK-NEXT: secret.yield
 
-// TODO(#521): support rotate-and-reduce when the input is already a series of incremental rotations,
-// as this IR is currently lowered to 4-1 rotate operations to sum after doing (x-y)**2 in SIMD.
+// TODO(#521): Fix rotate-and-reduce to work on this IR.
+// The problem is that the lattice identifies the rotate-version of this IR as
+// being overdetermined.
 
-func.func @hamming(%arg0: tensor<4xi16> {secret.secret}, %arg1: tensor<4xi16> {secret.secret}) -> i16 {
+func.func @hamming(%arg0: tensor<4xi16>, %arg1: tensor<4xi16>) -> i16 {
   %c0 = arith.constant 0 : index
   %c0_si16 = arith.constant 0 : i16
   %0 = affine.for %arg2 = 0 to 4 iter_args(%arg6 = %c0_si16) -> i16 {
diff --git a/tests/simd/simple_sum.mlir b/tests/heir_simd_vectorizer/simple_sum.mlir
similarity index 72%
rename from tests/simd/simple_sum.mlir
rename to tests/heir_simd_vectorizer/simple_sum.mlir
index 2af7833dc..7596a87ee 100644
--- a/tests/simd/simple_sum.mlir
+++ b/tests/heir_simd_vectorizer/simple_sum.mlir
@@ -1,14 +1,12 @@
 // RUN: heir-opt --secretize=entry-function=simple_sum --wrap-generic --canonicalize --cse \
-// RUN:   --full-loop-unroll --insert-rotate --cse --canonicalize \
-// RUN:   --rotate-and-reduce --canonicalize \
-// RUN:   %s | FileCheck %s
+// RUN:   --heir-simd-vectorizer %s | FileCheck %s
 
 // Sum all entries of a tensor into a single scalar
 // CHECK-LABEL: @simple_sum
 // CHECK: secret.generic
 // CHECK-COUNT-5: tensor_ext.rotate
 // CHECK-NOT: tensor_ext.rotate
-func.func @simple_sum(%arg0: tensor<32xi16> {secret.secret}) -> i16 {
+func.func @simple_sum(%arg0: tensor<32xi16>) -> i16 {
   %c0 = arith.constant 0 : index
   %c0_si16 = arith.constant 0 : i16
   %0 = affine.for %i = 0 to 32 iter_args(%sum_iter = %c0_si16) -> i16 {
diff --git a/tools/BUILD b/tools/BUILD
index df1e9b490..fa54343e8 100644
--- a/tools/BUILD
+++ b/tools/BUILD
@@ -56,6 +56,9 @@ cc_binary(
         "@heir//lib/Dialect/Secret/Transforms:DistributeGeneric",
         "@heir//lib/Dialect/TensorExt/IR:Dialect",
         "@heir//lib/Dialect/TensorExt/Transforms",
+        "@heir//lib/Dialect/TensorExt/Transforms:CollapseInsertionChains",
+        "@heir//lib/Dialect/TensorExt/Transforms:InsertRotate",
+        "@heir//lib/Dialect/TensorExt/Transforms:RotateAndReduce",
         "@heir//lib/Dialect/TfheRust/IR:Dialect",
         "@heir//lib/Dialect/TfheRustBool/IR:Dialect",
         "@heir//lib/Transforms/ElementwiseToAffine",
diff --git a/tools/heir-opt.cpp b/tools/heir-opt.cpp
index 0d06e0a15..a39cb8169 100644
--- a/tools/heir-opt.cpp
+++ b/tools/heir-opt.cpp
@@ -22,7 +22,10 @@
 #include "include/Dialect/Secret/Transforms/DistributeGeneric.h"
 #include "include/Dialect/Secret/Transforms/Passes.h"
 #include "include/Dialect/TensorExt/IR/TensorExtDialect.h"
+#include "include/Dialect/TensorExt/Transforms/CollapseInsertionChains.h"
+#include "include/Dialect/TensorExt/Transforms/InsertRotate.h"
 #include "include/Dialect/TensorExt/Transforms/Passes.h"
+#include "include/Dialect/TensorExt/Transforms/RotateAndReduce.h"
 #include "include/Dialect/TfheRust/IR/TfheRustDialect.h"
 #include "include/Dialect/TfheRustBool/IR/TfheRustBoolDialect.h"
 #include "include/Transforms/ElementwiseToAffine/ElementwiseToAffine.h"
@@ -174,6 +177,29 @@ void polynomialToLLVMPipelineBuilder(OpPassManager &manager) {
   manager.addPass(createSymbolDCEPass());
 }
 
+void heirSIMDVectorizerPipelineBuilder(OpPassManager &manager) {
+  // Builds the `heir-simd-vectorizer` pipeline. For now, loops are fully
+  // unrolled to enable insert-rotate; an affine loop analysis would be smarter.
+  manager.addPass(createFullLoopUnroll());
+
+  // Insert rotations aligned to slot targets. Future work should provide
+  // alternative methods to optimally align rotations, and allow the user to
+  // configure this via pipeline options.
+  manager.addPass(tensor_ext::createInsertRotate());
+  manager.addPass(createCSEPass());
+  manager.addPass(createCanonicalizerPass());
+
+  manager.addPass(tensor_ext::createCollapseInsertionChains());
+  manager.addPass(createCSEPass());
+  manager.addPass(createSCCPPass());
+  manager.addPass(createCanonicalizerPass());
+
+  manager.addPass(tensor_ext::createRotateAndReduce());
+  manager.addPass(createCSEPass());
+  manager.addPass(createSCCPPass());
+  manager.addPass(createCanonicalizerPass());
+}
+
 #ifndef HEIR_NO_YOSYS
 struct TosaToBooleanTfheOptions
     : public PassPipelineOptions<TosaToBooleanTfheOptions> {
@@ -333,6 +359,13 @@ int main(int argc, char **argv) {
       "Run passes to lower the polynomial dialect to LLVM",
       polynomialToLLVMPipelineBuilder);
 
+  PassPipelineRegistration<>(
+      "heir-simd-vectorizer",
+      "Run scheme-agnostic passes to convert FHE programs that operate on "
+      "scalar types to equivalent programs that operate on vectors and use "
+      "tensor_ext.rotate",
+      heirSIMDVectorizerPipelineBuilder);
+
   return asMainReturnCode(
       MlirOptMain(argc, argv, "HEIR Pass Driver", registry));
 }