Skip to content

Commit

Permalink
fse: Optimize compression (#745)
Browse files Browse the repository at this point in the history
* fse: Optimize table building

Skipping the loop body when v == 0 helps endzerobits and normcount2. Not
writing to s.symbolLen in every iteration helps the other benchmarks.

name                     old speed      new speed      delta
Compress/gettysburg-8     181MB/s ± 1%   183MB/s ± 0%   +1.15%  (p=0.002 n=10+8)
Compress/digits-8         241MB/s ± 0%   241MB/s ± 1%     ~     (p=0.434 n=9+10)
Compress/twain-8          218MB/s ± 0%   218MB/s ± 0%     ~     (p=0.755 n=10+10)
Compress/low-ent-8        239MB/s ± 0%   239MB/s ± 1%     ~     (p=0.853 n=10+10)
Compress/superlow-ent-8   208MB/s ± 1%   208MB/s ± 0%     ~     (p=0.408 n=9+7)
Compress/endzerobits-8   11.5MB/s ± 1%  13.3MB/s ± 1%  +16.35%  (p=0.000 n=10+9)
Compress/pngdata.001-8    224MB/s ± 0%   224MB/s ± 1%   +0.38%  (p=0.004 n=8+10)
Compress/normcount2-8    35.7MB/s ± 1%  36.6MB/s ± 1%   +2.66%  (p=0.000 n=10+9)

* fse: Skip bounds checks

Each occurrence of

	v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]

now incurs three bounds checks instead of four. I haven't found a way to
eliminate the remaining three.

name                     old speed      new speed      delta
Compress/gettysburg-8     183MB/s ± 0%   189MB/s ± 0%  +3.32%  (p=0.000 n=8+9)
Compress/digits-8         241MB/s ± 1%   251MB/s ± 1%  +4.14%  (p=0.000 n=10+9)
Compress/twain-8          218MB/s ± 0%   228MB/s ± 0%  +4.36%  (p=0.000 n=10+10)
Compress/low-ent-8        239MB/s ± 1%   244MB/s ± 1%  +1.90%  (p=0.000 n=10+10)
Compress/superlow-ent-8   208MB/s ± 0%   210MB/s ± 0%  +0.89%  (p=0.000 n=7+8)
Compress/endzerobits-8   13.3MB/s ± 1%  13.4MB/s ± 1%  +0.40%  (p=0.019 n=9+10)
Compress/pngdata.001-8    224MB/s ± 1%   225MB/s ± 1%  +0.41%  (p=0.006 n=10+9)
Compress/normcount2-8    36.6MB/s ± 1%  36.4MB/s ± 1%  -0.62%  (p=0.012 n=9+10)
  • Loading branch information
greatroar committed Jan 21, 2023
1 parent 5f40643 commit e766bf7
Showing 1 changed file with 15 additions and 16 deletions.
31 changes: 15 additions & 16 deletions fse/compress.go
Expand Up @@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error {
c1.encodeZero(tt[src[ip-2]])
ip -= 2
}
src = src[:ip]

// Main compression loop.
switch {
case !s.zeroBits && s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush.
// We do not need to check if any output is 0 bits.
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
c2.encode(tt[v2])
c1.encode(tt[v3])
ip -= 4
}
case !s.zeroBits:
// We do not need to check if any output is 0 bits.
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encode(tt[v0])
c1.encode(tt[v1])
s.bw.flush32()
c2.encode(tt[v2])
c1.encode(tt[v3])
ip -= 4
}
case s.actualTableLog <= 8:
// We can encode 4 symbols without requiring a flush
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
ip -= 4
}
default:
for ip >= 4 {
for ; len(src) >= 4; src = src[:len(src)-4] {
s.bw.flush32()
v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
c2.encodeZero(tt[v0])
c1.encodeZero(tt[v1])
s.bw.flush32()
c2.encodeZero(tt[v2])
c1.encodeZero(tt[v3])
ip -= 4
}
}

Expand Down Expand Up @@ -459,15 +456,17 @@ func (s *Scratch) countSimple(in []byte) (max int) {
for _, v := range in {
s.count[v]++
}
m := uint32(0)
m, symlen := uint32(0), s.symbolLen
for i, v := range s.count[:] {
if v == 0 {
continue
}
if v > m {
m = v
}
if v > 0 {
s.symbolLen = uint16(i) + 1
}
symlen = uint16(i) + 1
}
s.symbolLen = symlen
return int(m)
}

Expand Down

0 comments on commit e766bf7

Please sign in to comment.