From 870a670cd34a1c3fa4d1f64cae410d3bb394bdf8 Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Tue, 26 Mar 2024 16:36:22 -0700 Subject: [PATCH] Add `heir-simd-vectorizer` pipeline This is intended to be a consistent bundling of the ported HECO optimizations, so that all tests can use them identically. Fixes https://github.com/google/heir/issues/556 Depends on https://github.com/google/heir/pull/531 for changes to hamming-distance test PiperOrigin-RevId: 619348713 --- docs/content/en/docs/pipelines.md | 22 +++++++++++++ tests/{simd => heir_simd_vectorizer}/BUILD | 0 .../box_blur_4x4.mlir | 6 ++-- .../box_blur_64x64.mlir | 7 ++-- .../hamming_distance.mlir | 10 +++--- .../simple_sum.mlir | 6 ++-- tools/BUILD | 3 ++ tools/heir-opt.cpp | 33 +++++++++++++++++++ 8 files changed, 70 insertions(+), 17 deletions(-) rename tests/{simd => heir_simd_vectorizer}/BUILD (100%) rename tests/{simd => heir_simd_vectorizer}/box_blur_4x4.mlir (85%) rename tests/{simd => heir_simd_vectorizer}/box_blur_64x64.mlir (92%) rename tests/{simd => heir_simd_vectorizer}/hamming_distance.mlir (69%) rename tests/{simd => heir_simd_vectorizer}/simple_sum.mlir (72%) diff --git a/docs/content/en/docs/pipelines.md b/docs/content/en/docs/pipelines.md index ba77f23a9..4f07e6606 100644 --- a/docs/content/en/docs/pipelines.md +++ b/docs/content/en/docs/pipelines.md @@ -7,6 +7,27 @@ weight: 9 ## `heir-opt` +### `--heir-simd-vectorizer` + +Run scheme-agnostic passes to convert FHE programs that operate on scalar types +to equivalent programs that operate on vectors. + +This pipeline is intended to process FHE programs that are known to be good for +SIMD, but for which a specific FHE scheme has not yet been chosen. It expects to handle +`arith` ops operating on `tensor` types (with or without `secret.generic`). + +The pipeline unrolls all loops, then applies a series of passes that convert scalar +operations on tensor elements to SIMD operations on full tensors.
This uses the +FHE computational model common to BGV, BFV, and CKKS, in which data is packed +in polynomial ciphertexts, interpreted as vectors of individual data elements, +and arithmetic can be applied across entire ciphertexts, with some limited +support for rotations via automorphisms of the underlying ring. + +Along the way, this pipeline applies heuristic optimizations to minimize the +number of rotations needed, relying on the implicit cost model that rotations +are generally expensive. The specific set of passes can be found in +`tools/heir-opt.cpp` where the pipeline is defined. + ### `--heir-tosa-to-arith` Lowers a TOSA MLIR model to `func`, `arith`, and `memref`. @@ -26,6 +47,7 @@ tool can lower a TFLite FlatBuffer to textual MLIR with [hello_world.tosa.mlir](https://github.com/google/heir/blob/main/tests/hello_world.tosa.mlir) for an example. + ### `--yosys-optimizer` Uses Yosys to booleanize and optimize MLIR functions. diff --git a/tests/simd/BUILD b/tests/heir_simd_vectorizer/BUILD similarity index 100% rename from tests/simd/BUILD rename to tests/heir_simd_vectorizer/BUILD diff --git a/tests/simd/box_blur_4x4.mlir b/tests/heir_simd_vectorizer/box_blur_4x4.mlir similarity index 85% rename from tests/simd/box_blur_4x4.mlir rename to tests/heir_simd_vectorizer/box_blur_4x4.mlir index 007ba1d35..c99e48f55 100644 --- a/tests/simd/box_blur_4x4.mlir +++ b/tests/heir_simd_vectorizer/box_blur_4x4.mlir @@ -1,13 +1,11 @@ // RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse \ -// RUN: --full-loop-unroll \ -// RUN: --insert-rotate --cse --canonicalize --collapse-insertion-chains --canonicalize --cse \ -// RUN: %s | FileCheck %s +// RUN: --heir-simd-vectorizer %s | FileCheck %s module { // CHECK-LABEL: @box_blur // CHECK-NOT: tensor.extract // CHECK-COUNT-7: tensor_ext.rotate - func.func @box_blur(%arg0: tensor<16xi16> {secret.secret}) -> tensor<16xi16> { + func.func @box_blur(%arg0: tensor<16xi16>) -> tensor<16xi16> { %c16 = 
arith.constant 16 : index %c4 = arith.constant 4 : index %0 = affine.for %x = 0 to 4 iter_args(%arg0_x = %arg0) -> (tensor<16xi16>) { diff --git a/tests/simd/box_blur_64x64.mlir b/tests/heir_simd_vectorizer/box_blur_64x64.mlir similarity index 92% rename from tests/simd/box_blur_64x64.mlir rename to tests/heir_simd_vectorizer/box_blur_64x64.mlir index 21fb28d95..8e6fb5d57 100644 --- a/tests/simd/box_blur_64x64.mlir +++ b/tests/heir_simd_vectorizer/box_blur_64x64.mlir @@ -1,6 +1,5 @@ -// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse --full-loop-unroll \ -// RUN: --insert-rotate --cse --canonicalize --collapse-insertion-chains --canonicalize --cse \ -// RUN: %s | FileCheck %s +// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse \ +// RUN: --heir-simd-vectorizer %s | FileCheck %s module { // CHECK-LABEL: @box_blur @@ -30,7 +29,7 @@ module { // CHECK-NEXT: secret.yield %[[v15]] // CHECK-NEXT: } -> !secret.secret> // CHECK-NEXT: return %[[v0]] - func.func @box_blur(%arg0: tensor<4096xi16> {secret.secret}) -> tensor<4096xi16> { + func.func @box_blur(%arg0: tensor<4096xi16>) -> tensor<4096xi16> { %c4096 = arith.constant 4096 : index %c64 = arith.constant 64 : index %0 = affine.for %x = 0 to 64 iter_args(%arg0_x = %arg0) -> (tensor<4096xi16>) { diff --git a/tests/simd/hamming_distance.mlir b/tests/heir_simd_vectorizer/hamming_distance.mlir similarity index 69% rename from tests/simd/hamming_distance.mlir rename to tests/heir_simd_vectorizer/hamming_distance.mlir index f8aa39500..84151900f 100644 --- a/tests/simd/hamming_distance.mlir +++ b/tests/heir_simd_vectorizer/hamming_distance.mlir @@ -1,6 +1,5 @@ // RUN: heir-opt --secretize=entry-function=hamming --wrap-generic --canonicalize --cse \ -// RUN: --full-loop-unroll --cse --canonicalize --insert-rotate --cse --canonicalize \ -// RUN: %s | FileCheck %s +// RUN: --heir-simd-vectorizer %s | FileCheck %s // CHECK-LABEL: @hamming // CHECK: 
secret.generic @@ -15,10 +14,11 @@ // CHECK-NEXT: tensor.extract // CHECK-NEXT: secret.yield -// TODO(#521): support rotate-and-reduce when the input is already a series of incremental rotations, -// as this IR is currently lowered to 4-1 rotate operations to sum after doing (x-y)**2 in SIMD. +// TODO(#521): Fix rotate-and-reduce to work on this IR. +// The problem is that the lattice identifies the rotate-version of this IR as +// being overdetermined. -func.func @hamming(%arg0: tensor<4xi16> {secret.secret}, %arg1: tensor<4xi16> {secret.secret}) -> i16 { +func.func @hamming(%arg0: tensor<4xi16>, %arg1: tensor<4xi16>) -> i16 { %c0 = arith.constant 0 : index %c0_si16 = arith.constant 0 : i16 %0 = affine.for %arg2 = 0 to 4 iter_args(%arg6 = %c0_si16) -> i16 { diff --git a/tests/simd/simple_sum.mlir b/tests/heir_simd_vectorizer/simple_sum.mlir similarity index 72% rename from tests/simd/simple_sum.mlir rename to tests/heir_simd_vectorizer/simple_sum.mlir index 2af7833dc..7596a87ee 100644 --- a/tests/simd/simple_sum.mlir +++ b/tests/heir_simd_vectorizer/simple_sum.mlir @@ -1,14 +1,12 @@ // RUN: heir-opt --secretize=entry-function=simple_sum --wrap-generic --canonicalize --cse \ -// RUN: --full-loop-unroll --insert-rotate --cse --canonicalize \ -// RUN: --rotate-and-reduce --canonicalize \ -// RUN: %s | FileCheck %s +// RUN: --heir-simd-vectorizer %s | FileCheck %s // Sum all entries of a tensor into a single scalar // CHECK-LABEL: @simple_sum // CHECK: secret.generic // CHECK-COUNT-5: tensor_ext.rotate // CHECK-NOT: tensor_ext.rotate -func.func @simple_sum(%arg0: tensor<32xi16> {secret.secret}) -> i16 { +func.func @simple_sum(%arg0: tensor<32xi16>) -> i16 { %c0 = arith.constant 0 : index %c0_si16 = arith.constant 0 : i16 %0 = affine.for %i = 0 to 32 iter_args(%sum_iter = %c0_si16) -> i16 { diff --git a/tools/BUILD b/tools/BUILD index df1e9b490..fa54343e8 100644 --- a/tools/BUILD +++ b/tools/BUILD @@ -56,6 +56,9 @@ cc_binary( 
"@heir//lib/Dialect/Secret/Transforms:DistributeGeneric", "@heir//lib/Dialect/TensorExt/IR:Dialect", "@heir//lib/Dialect/TensorExt/Transforms", + "@heir//lib/Dialect/TensorExt/Transforms:CollapseInsertionChains", + "@heir//lib/Dialect/TensorExt/Transforms:InsertRotate", + "@heir//lib/Dialect/TensorExt/Transforms:RotateAndReduce", "@heir//lib/Dialect/TfheRust/IR:Dialect", "@heir//lib/Dialect/TfheRustBool/IR:Dialect", "@heir//lib/Transforms/ElementwiseToAffine", diff --git a/tools/heir-opt.cpp b/tools/heir-opt.cpp index 0d06e0a15..a39cb8169 100644 --- a/tools/heir-opt.cpp +++ b/tools/heir-opt.cpp @@ -22,7 +22,10 @@ #include "include/Dialect/Secret/Transforms/DistributeGeneric.h" #include "include/Dialect/Secret/Transforms/Passes.h" #include "include/Dialect/TensorExt/IR/TensorExtDialect.h" +#include "include/Dialect/TensorExt/Transforms/CollapseInsertionChains.h" +#include "include/Dialect/TensorExt/Transforms/InsertRotate.h" #include "include/Dialect/TensorExt/Transforms/Passes.h" +#include "include/Dialect/TensorExt/Transforms/RotateAndReduce.h" #include "include/Dialect/TfheRust/IR/TfheRustDialect.h" #include "include/Dialect/TfheRustBool/IR/TfheRustBoolDialect.h" #include "include/Transforms/ElementwiseToAffine/ElementwiseToAffine.h" @@ -174,6 +177,29 @@ void polynomialToLLVMPipelineBuilder(OpPassManager &manager) { manager.addPass(createSymbolDCEPass()); } +void heirSIMDVectorizerPipelineBuilder(OpPassManager &manager) { + // For now we unroll loops to enable insert-rotate, but we would like to be + // smarter about this and do an affine loop analysis. + manager.addPass(createFullLoopUnroll()); + + // Insert rotations aligned to slot targets. Future work should provide + // alternative methods to optimally align rotations, and allow the user to + // configure this via pipeline options. 
+ manager.addPass(tensor_ext::createInsertRotate()); + manager.addPass(createCSEPass()); + manager.addPass(createCanonicalizerPass()); + + manager.addPass(tensor_ext::createCollapseInsertionChains()); + manager.addPass(createCSEPass()); + manager.addPass(createSCCPPass()); + manager.addPass(createCanonicalizerPass()); + + manager.addPass(tensor_ext::createRotateAndReduce()); + manager.addPass(createCSEPass()); + manager.addPass(createSCCPPass()); + manager.addPass(createCanonicalizerPass()); +} + #ifndef HEIR_NO_YOSYS struct TosaToBooleanTfheOptions : public PassPipelineOptions { @@ -333,6 +359,13 @@ int main(int argc, char **argv) { "Run passes to lower the polynomial dialect to LLVM", polynomialToLLVMPipelineBuilder); + PassPipelineRegistration<>( + "heir-simd-vectorizer", + "Run scheme-agnostic passes to convert FHE programs that operate on " + "scalar types to equivalent programs that operate on vectors and use " + "tensor_ext.rotate", + heirSIMDVectorizerPipelineBuilder); + return asMainReturnCode( MlirOptMain(argc, argv, "HEIR Pass Driver", registry)); }