Skip to content

Commit

Permalink
Add heir-simd-vectorizer pipeline
Browse files Browse the repository at this point in the history
This is intended to be a consistent bundling of the ported HECO optimizations, so that all tests can use them identically.

Fixes #556
Depends on #531 for changes to hamming-distance test

PiperOrigin-RevId: 619348713
  • Loading branch information
j2kun authored and Copybara-Service committed Mar 27, 2024
1 parent c67f0a0 commit 6085ea8
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 28 deletions.
22 changes: 22 additions & 0 deletions docs/content/en/docs/pipelines.md
Expand Up @@ -7,6 +7,27 @@ weight: 9

## `heir-opt`

### `--heir-simd-vectorizer`

Run scheme-agnostic passes to convert FHE programs that operate on scalar types
to equivalent programs that operate on vectors.

This pipeline is intended to process FHE programs that are known to be good
candidates for SIMD, but for which a specific FHE scheme has not yet been
chosen. It expects to handle `arith` ops operating on `tensor` types (with or
without `secret.generic`).

The pipeline unrolls all loops, then applies a series of passes that convert
scalar operations on tensor elements to SIMD operations on full tensors. This
uses the FHE computational model common to BGV, BFV, and CKKS, in which data is
packed in polynomial ciphertexts, interpreted as vectors of individual data
elements, and arithmetic can be applied across entire ciphertexts, with some
limited support for rotations via automorphisms of the underlying ring.

Along the way, this pipeline applies heuristic optimizations to minimize the
number of rotations needed, relying on the implicit cost model that rotations
are generally expensive. The specific set of passes can be found in
`tools/heir-opt.cpp` where the pipeline is defined.

### `--heir-tosa-to-arith`

Lowers a TOSA MLIR model to `func`, `arith`, and `memref`.
Expand All @@ -26,6 +47,7 @@ tool can lower a TFLite FlatBuffer to textual MLIR with
[hello_world.tosa.mlir](https://github.com/google/heir/blob/main/tests/hello_world.tosa.mlir)
for an example.


### `--yosys-optimizer`

Uses Yosys to booleanize and optimize MLIR functions.
Expand Down
31 changes: 20 additions & 11 deletions scripts/lit_to_bazel.py
@@ -1,6 +1,6 @@
from collections import deque
import os
import pathlib
from collections import deque

import fire

Expand All @@ -11,6 +11,7 @@
IN_REDIRECT = "<"
RUN_PREFIX = "// RUN:"


def strip_run_prefix(line):
if RUN_PREFIX in line:
return line.split(RUN_PREFIX)[1]
Expand All @@ -28,8 +29,8 @@ def convert_to_run_commands(run_lines):

line = strip_run_prefix(line)

if '|' in line:
first, second = line.split('|', maxsplit=1)
if "|" in line:
first, second = line.split("|", maxsplit=1)
current_command += " " + first.strip()
cmds.append(current_command.strip())
current_command = ""
Expand All @@ -43,8 +44,8 @@ def convert_to_run_commands(run_lines):
current_command = ""
continue

if line.strip().endswith('\\'):
current_command += " " + line.replace('\\', '').strip()
if line.strip().endswith("\\"):
current_command += " " + line.replace("\\", "").strip()
continue

current_command += line
Expand All @@ -56,6 +57,7 @@ def convert_to_run_commands(run_lines):

def lit_to_bazel(
lit_test_file: str,
git_root: str = "",
):
"""A helper CLI that converts MLIR test files to bazel run commands.
Expand All @@ -64,9 +66,11 @@ def lit_to_bazel(
command.
"""

git_root = pathlib.Path(__file__).parent.parent
if not os.path.isdir(git_root / ".git"):
raise RuntimeError(f"Could not find git root, looked at {git_root}")
if not git_root:
git_root = pathlib.Path(__file__).parent.parent
if not os.path.isdir(git_root / ".git"):
raise RuntimeError(f"Could not find git root, looked at {git_root}")
# if git root is manually specified, just trust it

if not lit_test_file:
raise ValueError("lit_test_file must be provided")
Expand All @@ -81,7 +85,7 @@ def lit_to_bazel(
run_lines.append(line)

commands = convert_to_run_commands(run_lines)
commands = [x for x in commands if 'FileCheck' not in x]
commands = [x for x in commands if "FileCheck" not in x]
# remove consecutive and trailing pipes
if commands[-1] == PIPE:
commands.pop()
Expand All @@ -95,8 +99,13 @@ def lit_to_bazel(
# I would consider using bazel-bin/tools/heir-opt, but the yosys
# requirement requires additional env vars to be set for the yosys and ABC
# paths, which is not yet worth doing for this script.
joined = joined.replace("heir-opt", "bazel run --noallow_analysis_cache_discard //tools:heir-opt --")
joined = joined.replace("heir-translate", f"{git_root}/bazel-bin/tools/heir-translate")
joined = joined.replace(
"heir-opt",
"bazel run --noallow_analysis_cache_discard //tools:heir-opt --",
)
joined = joined.replace(
"heir-translate", f"{git_root}/bazel-bin/tools/heir-translate"
)
joined = joined.replace("%s", str(pathlib.Path(lit_test_file).absolute()))
print(joined)

Expand Down
File renamed without changes.
@@ -1,13 +1,11 @@
// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse \
// RUN: --full-loop-unroll \
// RUN: --insert-rotate --cse --canonicalize --collapse-insertion-chains --canonicalize --cse \
// RUN: %s | FileCheck %s
// RUN: --heir-simd-vectorizer %s | FileCheck %s

module {
// CHECK-LABEL: @box_blur
// CHECK-NOT: tensor.extract
// CHECK-COUNT-7: tensor_ext.rotate
func.func @box_blur(%arg0: tensor<16xi16> {secret.secret}) -> tensor<16xi16> {
func.func @box_blur(%arg0: tensor<16xi16>) -> tensor<16xi16> {
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%0 = affine.for %x = 0 to 4 iter_args(%arg0_x = %arg0) -> (tensor<16xi16>) {
Expand Down
@@ -1,6 +1,5 @@
// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse --full-loop-unroll \
// RUN: --insert-rotate --cse --canonicalize --collapse-insertion-chains --canonicalize --cse \
// RUN: %s | FileCheck %s
// RUN: heir-opt --secretize=entry-function=box_blur --wrap-generic --canonicalize --cse \
// RUN: --heir-simd-vectorizer %s | FileCheck %s

module {
// CHECK-LABEL: @box_blur
Expand Down Expand Up @@ -30,7 +29,7 @@ module {
// CHECK-NEXT: secret.yield %[[v15]]
// CHECK-NEXT: } -> !secret.secret<tensor<4096xi16>>
// CHECK-NEXT: return %[[v0]]
func.func @box_blur(%arg0: tensor<4096xi16> {secret.secret}) -> tensor<4096xi16> {
func.func @box_blur(%arg0: tensor<4096xi16>) -> tensor<4096xi16> {
%c4096 = arith.constant 4096 : index
%c64 = arith.constant 64 : index
%0 = affine.for %x = 0 to 64 iter_args(%arg0_x = %arg0) -> (tensor<4096xi16>) {
Expand Down
@@ -1,6 +1,5 @@
// RUN: heir-opt --secretize=entry-function=hamming --wrap-generic --canonicalize --cse \
// RUN: --full-loop-unroll --cse --canonicalize --insert-rotate --cse --canonicalize \
// RUN: %s | FileCheck %s
// RUN: --heir-simd-vectorizer %s | FileCheck %s

// CHECK-LABEL: @hamming
// CHECK: secret.generic
Expand All @@ -15,10 +14,11 @@
// CHECK-NEXT: tensor.extract
// CHECK-NEXT: secret.yield

// TODO(#521): support rotate-and-reduce when the input is already a series of incremental rotations,
// as this IR is currently lowered to 4-1 rotate operations to sum after doing (x-y)**2 in SIMD.
// TODO(#521): Fix rotate-and-reduce to work on this IR.
// The problem is that the lattice identifies the rotate-version of this IR as
// being overdetermined.

func.func @hamming(%arg0: tensor<4xi16> {secret.secret}, %arg1: tensor<4xi16> {secret.secret}) -> i16 {
func.func @hamming(%arg0: tensor<4xi16>, %arg1: tensor<4xi16>) -> i16 {
%c0 = arith.constant 0 : index
%c0_si16 = arith.constant 0 : i16
%0 = affine.for %arg2 = 0 to 4 iter_args(%arg6 = %c0_si16) -> i16 {
Expand Down
@@ -1,14 +1,12 @@
// RUN: heir-opt --secretize=entry-function=simple_sum --wrap-generic --canonicalize --cse \
// RUN: --full-loop-unroll --insert-rotate --cse --canonicalize \
// RUN: --rotate-and-reduce --canonicalize \
// RUN: %s | FileCheck %s
// RUN: --heir-simd-vectorizer %s | FileCheck %s

// Sum all entries of a tensor into a single scalar
// CHECK-LABEL: @simple_sum
// CHECK: secret.generic
// CHECK-COUNT-5: tensor_ext.rotate
// CHECK-NOT: tensor_ext.rotate
func.func @simple_sum(%arg0: tensor<32xi16> {secret.secret}) -> i16 {
func.func @simple_sum(%arg0: tensor<32xi16>) -> i16 {
%c0 = arith.constant 0 : index
%c0_si16 = arith.constant 0 : i16
%0 = affine.for %i = 0 to 32 iter_args(%sum_iter = %c0_si16) -> i16 {
Expand Down
3 changes: 3 additions & 0 deletions tools/BUILD
Expand Up @@ -56,6 +56,9 @@ cc_binary(
"@heir//lib/Dialect/Secret/Transforms:DistributeGeneric",
"@heir//lib/Dialect/TensorExt/IR:Dialect",
"@heir//lib/Dialect/TensorExt/Transforms",
"@heir//lib/Dialect/TensorExt/Transforms:CollapseInsertionChains",
"@heir//lib/Dialect/TensorExt/Transforms:InsertRotate",
"@heir//lib/Dialect/TensorExt/Transforms:RotateAndReduce",
"@heir//lib/Dialect/TfheRust/IR:Dialect",
"@heir//lib/Dialect/TfheRustBool/IR:Dialect",
"@heir//lib/Transforms/ElementwiseToAffine",
Expand Down
33 changes: 33 additions & 0 deletions tools/heir-opt.cpp
Expand Up @@ -22,7 +22,10 @@
#include "include/Dialect/Secret/Transforms/DistributeGeneric.h"
#include "include/Dialect/Secret/Transforms/Passes.h"
#include "include/Dialect/TensorExt/IR/TensorExtDialect.h"
#include "include/Dialect/TensorExt/Transforms/CollapseInsertionChains.h"
#include "include/Dialect/TensorExt/Transforms/InsertRotate.h"
#include "include/Dialect/TensorExt/Transforms/Passes.h"
#include "include/Dialect/TensorExt/Transforms/RotateAndReduce.h"
#include "include/Dialect/TfheRust/IR/TfheRustDialect.h"
#include "include/Dialect/TfheRustBool/IR/TfheRustBoolDialect.h"
#include "include/Transforms/ElementwiseToAffine/ElementwiseToAffine.h"
Expand Down Expand Up @@ -174,6 +177,29 @@ void polynomialToLLVMPipelineBuilder(OpPassManager &manager) {
manager.addPass(createSymbolDCEPass());
}

// Populates `manager` with the heir-simd-vectorizer pipeline. The stages are
// added in this order: full loop unrolling, insert-rotate,
// collapse-insertion-chains, and rotate-and-reduce, with generic cleanup
// passes interleaved after each tensor_ext stage.
void heirSIMDVectorizerPipelineBuilder(OpPassManager &manager) {
  // Cleanup sequence shared by the collapse-insertion-chains and
  // rotate-and-reduce stages: CSE, then SCCP, then canonicalization.
  auto addCleanupPasses = [&manager]() {
    manager.addPass(createCSEPass());
    manager.addPass(createSCCPPass());
    manager.addPass(createCanonicalizerPass());
  };

  // Loops are fully unrolled for now so that insert-rotate can apply. The
  // longer-term goal is a smarter approach based on affine loop analysis.
  manager.addPass(createFullLoopUnroll());

  // Insert rotations aligned to slot targets. Future work should offer
  // alternative methods to optimally align rotations and expose the choice
  // to the user through pipeline options. Note that this stage is followed
  // only by CSE and canonicalization (no SCCP).
  manager.addPass(tensor_ext::createInsertRotate());
  manager.addPass(createCSEPass());
  manager.addPass(createCanonicalizerPass());

  manager.addPass(tensor_ext::createCollapseInsertionChains());
  addCleanupPasses();

  manager.addPass(tensor_ext::createRotateAndReduce());
  addCleanupPasses();
}

#ifndef HEIR_NO_YOSYS
struct TosaToBooleanTfheOptions
: public PassPipelineOptions<TosaToBooleanTfheOptions> {
Expand Down Expand Up @@ -333,6 +359,13 @@ int main(int argc, char **argv) {
"Run passes to lower the polynomial dialect to LLVM",
polynomialToLLVMPipelineBuilder);

PassPipelineRegistration<>(
"heir-simd-vectorizer",
"Run scheme-agnostic passes to convert FHE programs that operate on "
"scalar types to equivalent programs that operate on vectors and use "
"tensor_ext.rotate",
heirSIMDVectorizerPipelineBuilder);

return asMainReturnCode(
MlirOptMain(argc, argv, "HEIR Pass Driver", registry));
}

0 comments on commit 6085ea8

Please sign in to comment.