Skip to content
This repository has been archived by the owner on Dec 22, 2021. It is now read-only.

LLVM inlines constant generation for swizzle #371

Open
omnisip opened this issue Oct 5, 2020 · 1 comment
Open

LLVM inlines constant generation for swizzle #371

omnisip opened this issue Oct 5, 2020 · 1 comment

Comments

@omnisip
Copy link

omnisip commented Oct 5, 2020

#include <wasm_simd128.h>
#define SIMDPP_ARCH_X86_SSE4_1 1
#include "simdpp/simd.h"
#include <xmmintrin.h>
#include <tmmintrin.h>
#include <iomanip>
#include <array>
#include <cstdio>
#include <thread>
#include <future>
#include <iostream>

   // Byte-swizzle masks passed to wasm_v8x16_swizzle, one mask per group of
   // four consecutive source bytes (sw1: bytes 0-3, sw2: 4-7, sw3: 8-11,
   // sw4: 12-15).
   //
   // Every 32-bit lane has the form 0xffffffNN: the low byte NN is the index of
   // the source byte to select and the other three bytes are 0xff. Per the
   // WebAssembly SIMD swizzle semantics an index outside 0..15 yields 0, so each
   // mask zero-extends four 8-bit values into four 32-bit lanes.
   //
   // require_constant_initialization makes the build fail if any of these
   // would need dynamic initialization.
   static const __u32x4 sw1 __attribute__((require_constant_initialization)) =
       {0xffffff00, 0xffffff01, 0xffffff02, 0xffffff03};
   static const __u32x4 sw2 __attribute__((require_constant_initialization)) =
       {0xffffff04, 0xffffff05, 0xffffff06, 0xffffff07};
   static const __u32x4 sw3 __attribute__((require_constant_initialization)) =
       {0xffffff08, 0xffffff09, 0xffffff0a, 0xffffff0b};
   static const __u32x4 sw4 __attribute__((require_constant_initialization)) =
       {0xffffff0c, 0xffffff0d, 0xffffff0e, 0xffffff0f};
   

inline void sumMatrixPass1(uint8_t const* pImgData1, uint8_t const* pImgData2, uint32_t* pSumArray,
       
                           unsigned width, unsigned height, unsigned hBegin, unsigned hEnd) {
   using namespace simdpp;
   uint32x4 zero = make_zero();


      
  for (unsigned h = hBegin; h < hEnd; ++h)
  {
    uint32x4 lastValSplatX = zero;
    uint32x4 lastValSplatY = zero;
    uint32x4 lastValSplatXX = zero;
    uint32x4 lastValSplatYY = zero;
    uint32x4 lastValSplatXY = zero;
    unsigned w = 0;
    for (; w+15 < width; w += 16)
    {
      uint8x16 imgDataXOrig  = uint8x16(load(pImgData1 + h * width + w));
      uint8x16 imgDataYOrig = uint8x16(load(pImgData2 + h * width + w));
        {
        // a_0 a_1 a_2 a_3
            
        uint32v4::base_vector_type   x = wasm_v8x16_swizzle(imgDataXOrig.native(), sw1);
        uint32v4::base_vector_type  y = wasm_v8x16_swizzle(imgDataYOrig.native(), sw1);
        uint32v4::base_vector_type xx = mul_lo(x,x);
        uint32v4::base_vector_type yy = mul_lo(y, y);
        uint32v4::base_vector_type xy = mul_lo(x, y);

        // a_0 a_0+a_1 a_1+a_2 a_2+a_3
        x = add(x, move4_r<1>(x));
        x = add(x, move4_r<2>(x));
        x = add(x, lastValSplatX);
        lastValSplatX = permute4<3,3,3,3>(x);
        store(pSumArray+h*width+w+0*4, x);
        y = add(y, move4_r<1>(y));
        y = add(y, move4_r<2>(y));
        y = add(y, lastValSplatY);
        lastValSplatY = permute4<3,3,3,3>(y);
        store(width*height+pSumArray+h*width+w+0*4, y);
        xx = add(xx, move4_r<1>(xx));
        xx = add(xx, move4_r<2>(xx));
        xx = add(xx, lastValSplatXX);
        lastValSplatXX = permute4<3,3,3,3>(xx);
        store(2*width*height+pSumArray+h*width+w+0*4, xx);
        yy = add(yy, move4_r<1>(yy));
        yy = add(yy, move4_r<2>(yy));
        yy = add(yy, lastValSplatYY);
        lastValSplatYY = permute4<3,3,3,3>(yy);
        store(3*width*height+pSumArray+h*width+w+0*4, yy);
        xy = add(xy, move4_r<1>(xy));
        xy = add(xy, move4_r<2>(xy));
        xy = add(xy, lastValSplatXY);
        lastValSplatXY = permute4<3,3,3,3>(xy);
        store(4*width*height+pSumArray+h*width+w+0*4, xy);
      }
      {
        uint32v4::base_vector_type   x = wasm_v8x16_swizzle(imgDataXOrig.native(), sw2);
        uint32v4::base_vector_type  y = wasm_v8x16_swizzle(imgDataYOrig.native(), sw2);
        uint32v4::base_vector_type xx = mul_lo(x,x);
        uint32v4::base_vector_type yy = mul_lo(y, y);
        uint32v4::base_vector_type xy = mul_lo(x, y);
        // a_0 a_0+a_1 a_1+a_2 a_2+a_3
        x = add(x, move4_r<1>(x));
        x = add(x, move4_r<2>(x));
        x = add(x, lastValSplatX);
        lastValSplatX = permute4<3,3,3,3>(x);
        store(pSumArray+h*width+w+1*4, x);
        y = add(y, move4_r<1>(y));
        y = add(y, move4_r<2>(y));
        y = add(y, lastValSplatY);
        lastValSplatY = permute4<3,3,3,3>(y);
        store(width*height+pSumArray+h*width+w+1*4, y);
        xx = add(xx, move4_r<1>(xx));
        xx = add(xx, move4_r<2>(xx));
        xx = add(xx, lastValSplatXX);
        lastValSplatXX = permute4<3,3,3,3>(xx);
        store(2*width*height+pSumArray+h*width+w+1*4, xx);
        yy = add(yy, move4_r<1>(yy));
        yy = add(yy, move4_r<2>(yy));
        yy = add(yy, lastValSplatYY);
        lastValSplatYY = permute4<3,3,3,3>(yy);
        store(3*width*height+pSumArray+h*width+w+1*4, yy);
        xy = add(xy, move4_r<1>(xy));
        xy = add(xy, move4_r<2>(xy));
        xy = add(xy, lastValSplatXY);
        lastValSplatXY = permute4<3,3,3,3>(xy);
        store(4*width*height+pSumArray+h*width+w+1*4, xy);
      }
      
    {
        uint32v4::base_vector_type   x = wasm_v8x16_swizzle(imgDataXOrig.native(), sw3);
        uint32v4::base_vector_type  y = wasm_v8x16_swizzle(imgDataYOrig.native(), sw3);
        uint32v4::base_vector_type xx = mul_lo(x,x);
        uint32v4::base_vector_type yy = mul_lo(y, y);
        uint32v4::base_vector_type xy = mul_lo(x, y);
        // a_0 a_0+a_1 a_1+a_2 a_2+a_3
        x = add(x, move4_r<1>(x));
        x = add(x, move4_r<2>(x));
        x = add(x, lastValSplatX);
        lastValSplatX = permute4<3,3,3,3>(x);
        store(pSumArray+h*width+w+2*4, x);
        y = add(y, move4_r<1>(y));
        y = add(y, move4_r<2>(y));
        y = add(y, lastValSplatY);
        lastValSplatY = permute4<3,3,3,3>(y);
        store(width*height+pSumArray+h*width+w+2*4, y);
        xx = add(xx, move4_r<1>(xx));
        xx = add(xx, move4_r<2>(xx));
        xx = add(xx, lastValSplatXX);
        lastValSplatXX = permute4<3,3,3,3>(xx);
        store(2*width*height+pSumArray+h*width+w+2*4, xx);
        yy = add(yy, move4_r<1>(yy));
        yy = add(yy, move4_r<2>(yy));
        yy = add(yy, lastValSplatYY);
        lastValSplatYY = permute4<3,3,3,3>(yy);
        store(3*width*height+pSumArray+h*width+w+2*4, yy);
        xy = add(xy, move4_r<1>(xy));
        xy = add(xy, move4_r<2>(xy));
        xy = add(xy, lastValSplatXY);
        lastValSplatXY = permute4<3,3,3,3>(xy);
        store(4*width*height+pSumArray+h*width+w+2*4, xy);
      }
      
          {
        uint32v4::base_vector_type   x = wasm_v8x16_swizzle(imgDataXOrig.native(), sw4);
        uint32v4::base_vector_type  y = wasm_v8x16_swizzle(imgDataYOrig.native(), sw4);
        uint32v4::base_vector_type xx = mul_lo(x,x);
        uint32v4::base_vector_type yy = mul_lo(y, y);
        uint32v4::base_vector_type xy = mul_lo(x, y);
        // a_0 a_0+a_1 a_1+a_2 a_2+a_3
        x = add(x, move4_r<1>(x));
        x = add(x, move4_r<2>(x));
        x = add(x, lastValSplatX);
        lastValSplatX = permute4<3,3,3,3>(x);
        store(pSumArray+h*width+w+2*4, x);
        y = add(y, move4_r<1>(y));
        y = add(y, move4_r<2>(y));
        y = add(y, lastValSplatY);
        lastValSplatY = permute4<3,3,3,3>(y);
        store(width*height+pSumArray+h*width+w+3*4, y);
        xx = add(xx, move4_r<1>(xx));
        xx = add(xx, move4_r<2>(xx));
        xx = add(xx, lastValSplatXX);
        lastValSplatXX = permute4<3,3,3,3>(xx);
        store(2*width*height+pSumArray+h*width+w+3*4, xx);
        yy = add(yy, move4_r<1>(yy));
        yy = add(yy, move4_r<2>(yy));
        yy = add(yy, lastValSplatYY);
        lastValSplatYY = permute4<3,3,3,3>(yy);
        store(3*width*height+pSumArray+h*width+w+3*4, yy);
        xy = add(xy, move4_r<1>(xy));
        xy = add(xy, move4_r<2>(xy));
        xy = add(xy, lastValSplatXY);
        lastValSplatXY = permute4<3,3,3,3>(xy);
        store(4*width*height+pSumArray+h*width+w+3*4, xy);
      }
      
    }
  }
}

.Ltmp149:
        .loc    2 44 68                         # rgb2y-sample.cpp:44:68
        local.get       15
        local.get       4
        i32.add 
.Ltmp150:
        .loc    52 34 9                         # ./simdpp/detail/insn/load.h:34:9
        v128.load       0
        local.tee       26
        i64.const       -1090921693440
.Ltmp151:
        .loc    2 60 42                         # rgb2y-sample.cpp:60:42
        i64x2.splat
        i64.const       -1082331758846
        i64x2.replace_lane      1
        local.tee       27
        v8x16.swizzle

In this particular code sample, only two of the four swizzles regenerate the shuffle mask inside the loop (the `i64.const` / `i64x2.splat` / `i64x2.replace_lane` sequence shown above). The other two treat their masks as constants.

@omnisip
Copy link
Author

omnisip commented Oct 5, 2020

@tlively This is what you asked for, I think.

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Projects
None yet
Development

No branches or pull requests

2 participants