update kcp-go package

This commit is contained in:
fatedier
2019-03-17 17:09:54 +08:00
parent 87a4de4370
commit fdcdccb0c2
122 changed files with 14490 additions and 2469 deletions

View File

@@ -1,40 +0,0 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
/.idea
/backup
/loopunroll/
cpu.out
mathtool/galois/
mathtool/matrix/
mem.out
/examples/
/.DS_Store
/mathtool/cntinverse
/invert
/bakcup
/buf.svg
*.svg
*.out
/escape

View File

@@ -1,9 +0,0 @@
language: go
go:
- 1.9
install:
- go get github.com/templexxx/reedsolomon
script:
- go test -v

View File

@@ -1,23 +0,0 @@
MIT License
Copyright (c) 2017 Templexxx
Copyright (c) 2015 Klaus Post
Copyright (c) 2015 Backblaze
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,108 +0,0 @@
# Reed-Solomon
[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8]
[1]: https://godoc.org/github.com/templexxx/reedsolomon?status.svg
[2]: https://godoc.org/github.com/templexxx/reedsolomon
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://travis-ci.org/templexxx/reedsolomon.svg?branch=master
[6]: https://travis-ci.org/templexxx/reedsolomon
[7]: https://goreportcard.com/badge/github.com/templexxx/reedsolomon
[8]: https://goreportcard.com/report/github.com/templexxx/reedsolomon
## Introduction:
1. Reed-Solomon Erasure Code engine in pure Go.
2. Super fast: more than 10GB/s per physical core (10+4, 4KB per vector, MacBook Pro 2.8 GHz Intel Core i7)
## Installation
To get the package use the standard:
```bash
go get github.com/templexxx/reedsolomon
```
## Documentation
See the associated [GoDoc](http://godoc.org/github.com/templexxx/reedsolomon)
## Specification
### GOARCH
1. All architectures are supported
2. Version 0.1.0 needs Go 1.9 for sync.Map on AMD64
### Math
1. Coding over GF(2^8)
2. Primitive Polynomial: x^8 + x^4 + x^3 + x^2 + 1 (0x1d)
3. mathtool/gentbls.go: generates the primitive polynomial and its log table, exp table, multiply table, inverse table, etc. It is a good place to learn more about how the Galois field works
4. mathtool/cntinverse.go: calculates how many inverse matrices exist for different RS code configurations
5. Both Cauchy and Vandermonde matrices are supported. Vandermonde needs more operations to preserve the property that any square subset of rows is invertible
### Why so fast?
These three parts cost the most time:
1. looking up Galois-field tables
2. reading/writing memory
3. calculating the inverse matrix in the reconstruct process
SIMD solves no.1.
Cache-friendly code helps with no.2 & no.3. In addition, a sync.Map is used to cache inverse matrices, which saves about 1000ns when the same matrix is needed again.
## Performance
Performance depends mainly on:
1. CPU instruction extension( AVX2 or SSSE3 or none )
2. number of data/parity vects
3. unit size of calculation ( see it in rs_amd64.go )
4. size of shards
5. speed of memory (waste so much time on read/write mem, :D )
6. performance of CPU
7. the way of using ( reuse memory)
Keep in mind that the benchmark results are quite different from encoding/decoding in practice.
In the benchmark loops the CPU cache helps a lot. In practice, memory must be reused to make the performance as good as in the benchmarks.
Example of performance on my MacBook 2017 i7 2.8GHz. 10+4 (with 0.1.0).
### Encoding:
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 7655.02 |
| 4KB | 10551.37 |
| 64KB | 9297.25 |
| 1MB | 6829.89 |
| 16MB | 6312.83 |
### Reconstruct (use nil to point which one need repair):
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 4124.85 |
| 4KB | 5715.45 |
| 64KB | 6050.06 |
| 1MB | 5001.21 |
| 16MB | 5043.04 |
### ReconstructWithPos (use a position list to point which one need repair, reuse the memory):
| Vector size | Speed (MB/S) |
|----------------|--------------|
| 1400B | 6170.24 |
| 4KB | 9444.86 |
| 64KB | 9311.30 |
| 1MB | 6781.06 |
| 16MB | 6285.34 |
**The reconstruct benchmarks here run with the inverse matrix cache; without the cache, reconstruction costs more time (about 1000ns).**
## Who is using this?
1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang
## Links & Thanks
* [Klauspost ReedSolomon](https://github.com/klauspost/reedsolomon)
* [intel ISA-L](https://github.com/01org/isa-l)
* [GF SIMD](http://www.ssrc.ucsc.edu/papers/plank-fast13.pdf)

View File

@@ -1,156 +0,0 @@
package reedsolomon
import "errors"
type matrix []byte
// genEncMatrixCauchy builds the (d+p) x d encoding matrix, stored row-major.
// The first d rows are the identity matrix; every entry of the remaining p
// rows is inverseTbl[row^col].
func genEncMatrixCauchy(d, p int) matrix {
	total := d + p
	enc := make([]byte, total*d)
	// Identity block on top.
	for row := 0; row < d; row++ {
		enc[row*d+row] = 1
	}
	// Parity rows follow directly after the d*d identity block.
	pos := d * d
	for row := d; row < total; row++ {
		for col := 0; col < d; col++ {
			enc[pos] = byte(inverseTbl[row^col])
			pos++
		}
	}
	return enc
}
// gfExp raises b to the n-th power using the log/exp tables.
// Anything to the power 0 is 1 (including b == 0); 0 to any other power is 0.
func gfExp(b byte, n int) byte {
	switch {
	case n == 0:
		return 1
	case b == 0:
		return 0
	}
	// log(b^n) = n*log(b), reduced into the range [0, 255) before the lookup.
	exp := int(logTbl[b]) * n
	for exp >= 255 {
		exp -= 255
	}
	return byte(expTbl[exp])
}
// genVandMatrix fills vm, a row-major t x d matrix, with
// vm[row][col] = gfExp(byte(row), col).
func genVandMatrix(vm []byte, t, d int) {
	idx := 0
	for row := 0; row < t; row++ {
		for col := 0; col < d; col++ {
			vm[idx] = gfExp(byte(row), col)
			idx++
		}
	}
}
// mul writes the product m x right into r. m is rows x cols, right is
// cols x cols and r is rows x cols, all row-major. Products come from gfMul
// and are accumulated with XOR.
func (m matrix) mul(right matrix, rows, cols int, r []byte) {
	for i := 0; i < rows; i++ {
		rowOff := i * cols
		for j := 0; j < cols; j++ {
			var acc byte
			for k := 0; k < cols; k++ {
				acc ^= gfMul(m[rowOff+k], right[k*cols+j])
			}
			r[rowOff+j] = acc
		}
	}
}
// genEncMatrixVand builds the (d+p) x d encoding matrix from a Vandermonde
// matrix: the full Vandermonde matrix is multiplied by the inverse of its own
// top d x d square. It returns the error from invert if that square cannot be
// inverted.
func genEncMatrixVand(d, p int) (matrix, error) {
	t := d + p
	td := t * d
	dd := d * d
	// One allocation, carved into:
	// vand (t*d) | top (d*d) | raw (2*d*d) | inv (d*d) | out (t*d)
	buf := make([]byte, 2*td+4*dd)
	vand := buf[:td]
	genVandMatrix(vand, t, d)
	top := buf[td : td+dd]
	copy(top, vand[:dd])
	raw := buf[td+dd : td+3*dd]
	inv := buf[td+3*dd : td+4*dd]
	if err := matrix(top).invert(raw, d, inv); err != nil {
		return nil, err
	}
	out := buf[td+4*dd : 2*td+4*dd]
	matrix(vand).mul(inv, t, d, out)
	return matrix(out), nil
}
// subMatrix extracts the right half of an n x 2n row-major matrix into r
// (n x n, row-major): [I|m'] -> [m'].
func (m matrix) subMatrix(n int, r []byte) {
	width := 2 * n
	for row := 0; row < n; row++ {
		copy(r[row*n:(row+1)*n], m[row*width+n:(row+1)*width])
	}
}
// invert computes the inverse of the n x n matrix m and stores it in im.
// raw is scratch space for the n x 2n augmented matrix [m|I]. Only the m
// half and the diagonal ones are written here, so raw must arrive zeroed
// (every caller in this package passes freshly allocated memory).
// It returns the error from gauss when elimination fails.
func (m matrix) invert(raw matrix, n int, im []byte) error {
	// [m] -> [m|I]
	width := 2 * n
	for row := 0; row < n; row++ {
		base := row * width
		copy(raw[base:base+n], m[row*n:row*n+n])
		raw[base+n+row] = 1
	}
	if err := gauss(raw, n); err != nil {
		return err
	}
	raw.subMatrix(n, im)
	return nil
}
// swap exchanges rows i and j of m, where each row is n bytes wide.
func (m matrix) swap(i, j, n int) {
	rowI, rowJ := i*n, j*n
	for k := 0; k < n; k++ {
		m[rowI+k], m[rowJ+k] = m[rowJ+k], m[rowI+k]
	}
}
// gfMul returns the product of a and b by looking it up in the precomputed
// multiplication table mulTbl.
func gfMul(a, b byte) byte {
return mulTbl[a][b]
}
// errSingular is returned by gauss (and therefore by invert) when no non-zero
// pivot can be found for a column.
var errSingular = errors.New("rs.invert: matrix is singular")
// gauss reduces the n x 2n row-major augmented matrix m in place:
// [m|I] -> [I|m'].
// It returns errSingular when a column has no usable pivot.
func gauss(m matrix, n int) error {
n2 := 2 * n
// Forward pass: make the left half upper triangular with ones on the diagonal.
for i := 0; i < n; i++ {
// Zero pivot: swap in the first lower row that has a non-zero entry
// in column i.
if m[i*n2+i] == 0 {
for j := i + 1; j < n; j++ {
if m[j*n2+i] != 0 {
m.swap(i, j, n2)
break
}
}
}
// Still zero after the search: the matrix cannot be inverted.
if m[i*n2+i] == 0 {
return errSingular
}
// Scale row i by the inverse of its pivot so the pivot becomes 1.
if m[i*n2+i] != 1 {
d := m[i*n2+i]
scale := inverseTbl[d]
for c := 0; c < n2; c++ {
m[i*n2+c] = gfMul(m[i*n2+c], scale)
}
}
// Clear column i in every row below the pivot (XOR is the field addition
// and subtraction).
for j := i + 1; j < n; j++ {
if m[j*n2+i] != 0 {
scale := m[j*n2+i]
for c := 0; c < n2; c++ {
m[j*n2+c] ^= gfMul(scale, m[i*n2+c])
}
}
}
}
// Backward pass: clear the entries above each pivot.
for k := 0; k < n; k++ {
for j := 0; j < k; j++ {
if m[j*n2+k] != 0 {
scale := m[j*n2+k]
for c := 0; c < n2; c++ {
m[j*n2+c] ^= gfMul(scale, m[k*n2+c])
}
}
}
}
return nil
}

View File

@@ -1,280 +0,0 @@
/*
Reed-Solomon Codes over GF(2^8)
Primitive Polynomial: x^8+x^4+x^3+x^2+1
Galois Field arithmetic using Intel SIMD instructions (AVX2 or SSSE3)
*/
package reedsolomon
import "errors"
// Encoder is the interface for Reed-Solomon encoding and reconstruction.
//
// In every method vects holds the data vects followed by the parity vects.
// Repaired vects are written back to their original position: if vects[0]
// was lost, the recovered data ends up in vects[0].
type Encoder interface {
// Encode multiplies the generator matrix with the data vects and stores
// the result in the parity vects.
// len(vects) must equal the number of data+parity vects.
Encode(vects [][]byte) error
// Reconstruct repairs lost data and parity vects.
// Set a vect to nil to mark it as lost.
Reconstruct(vects [][]byte) error
// ReconstructData repairs lost data vects only.
// Set a vect to nil to mark it as lost.
ReconstructData(vects [][]byte) error
// ReconstWithPos repairs lost data and parity vects, given the positions
// of the surviving ("has") and lost ("dLost", "pLost") vects.
// Compared with Reconstruct it saves bandwidth and disk I/O when fewer
// vects are lost than there are parity vects.
// With erasure codes the caller must know which vect is broken, so an API
// that takes the positions is necessary.
// len(has) must equal the number of data vects.
// Example:
// in 3+2, the whole position list is [0,1,2,3,4]
// if vects[0] is lost
// "has" could be [1,2,3] or [1,2,4] or ...
// and vects[1], vects[2], vects[3] must then hold correct data (if "has" is [1,2,3])
// "dLost" will be [0]
// Notes:
// 1. the lists above must be in increasing order (TODO: support out-of-order)
// 2. every vect must already have the same length; do not set any to nil,
// so no slice has to be allocated
ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error
// ReconstDataWithPos repairs lost data vects, given the positions of the
// surviving and lost vects.
// Positions of lost parity vects do not need to be passed.
ReconstDataWithPos(vects [][]byte, has, dLost []int) error
}
// checkCfg validates a code configuration: both the data count d and the
// parity count p must be positive, and d+p must stay below 256.
func checkCfg(d, p int) error {
	switch {
	case d <= 0 || p <= 0:
		return errors.New("rs.New: data or parity <= 0")
	case d+p >= 256:
		return errors.New("rs.New: data+parity >= 256")
	default:
		return nil
	}
}
// New creates an Encoder that uses a Vandermonde-derived matrix as its
// encoding matrix. It returns an error when the configuration is rejected by
// checkCfg or when the encoding matrix cannot be generated.
func New(data, parity int) (enc Encoder, err error) {
	if err = checkCfg(data, parity); err != nil {
		return nil, err
	}
	em, err := genEncMatrixVand(data, parity)
	if err != nil {
		return nil, err
	}
	return newRS(data, parity, em), nil
}
// NewCauchy creates an Encoder that uses a Cauchy matrix as its generator
// matrix. It returns an error when the configuration is rejected by checkCfg.
func NewCauchy(data, parity int) (enc Encoder, err error) {
	if err = checkCfg(data, parity); err != nil {
		return nil, err
	}
	return newRS(data, parity, genEncMatrixCauchy(data, parity)), nil
}
// encBase is the Encoder implementation that uses only the table-driven
// mulVect/mulVectAdd routines.
type encBase struct {
// data is the number of data vects.
data int
// parity is the number of parity vects.
parity int
// encode is the encoding matrix, stored row-major with data bytes per row.
encode []byte
// gen holds the generator rows: one row of data coefficients per output vect.
gen []byte
}
// checkEnc validates the vects passed to an encode call: there must be
// exactly d+p of them, and all must share the same non-zero length.
// It returns that common length. On a length mismatch the length of vs[0] is
// still returned alongside the error.
func checkEnc(d, p int, vs [][]byte) (size int, err error) {
	if len(vs) != d+p {
		return 0, errors.New("rs.checkER: vects not match rs args")
	}
	size = len(vs[0])
	if size == 0 {
		return 0, errors.New("rs.checkER: vects size = 0")
	}
	for _, v := range vs[1:] {
		if len(v) != size {
			return size, errors.New("rs.checkER: vects size mismatch")
		}
	}
	return size, nil
}
// Encode computes the parity vects vects[data:] from the data vects
// vects[:data] using the generator rows in e.gen. It returns the error from
// checkEnc when vects does not match the configuration.
func (e *encBase) Encode(vects [][]byte) (err error) {
	d, p := e.data, e.parity
	if _, err = checkEnc(d, p, vects); err != nil {
		return err
	}
	gen := e.gen
	dataVects, parityVects := vects[:d], vects[d:]
	for i, src := range dataVects {
		for j, dst := range parityVects {
			if i == 0 {
				// The first data vect overwrites the parity vect, so any
				// previous content of the output is discarded.
				mulVect(gen[j*d], src, dst)
				continue
			}
			mulVectAdd(gen[j*d+i], src, dst)
		}
	}
	return nil
}
// mulVect sets b[i] = c*a[i] for every index of a, using row c of mulTbl.
// b must be at least as long as a.
func mulVect(c byte, a, b []byte) {
	t := mulTbl[c]
	for i, v := range a {
		b[i] = t[v]
	}
}
// mulVectAdd XORs c*a[i] into b[i] for every index of a, using row c of
// mulTbl. b must be at least as long as a.
func mulVectAdd(c byte, a, b []byte) {
	t := mulTbl[c]
	for i, v := range a {
		b[i] ^= t[v]
	}
}
// Reconstruct repairs lost data and parity vects; nil entries of vects are
// treated as lost.
func (e *encBase) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
// ReconstructData repairs lost data vects only; nil entries of vects are
// treated as lost.
func (e *encBase) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
// ReconstWithPos repairs the data vects listed in dLost and the parity vects
// listed in pLost, using the vects listed in has as input.
func (e *encBase) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
// ReconstDataWithPos repairs the data vects listed in dLost, using the vects
// listed in has as input. Parity vects are not repaired.
func (e *encBase) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
// reconst rebuilds the vects listed in dLost (data) and, unless dataOnly is
// set, the vects listed in pLost (parity), writing the results into vects.
// has lists the positions of the surviving vects used as input. Lost vects
// whose length is zero are allocated here with the length of vects[has[0]].
// It returns the error from invert or Encode when either fails.
func (e *encBase) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
d := e.data
em := e.encode
dCnt := len(dLost)
// Length used for any output vect that has to be allocated.
size := len(vects[has[0]])
if dCnt != 0 {
// vtmp = [surviving vects | lost data vects]: the layout Encode expects
// for a code with d inputs and dCnt outputs.
vtmp := make([][]byte, d+dCnt)
for i, p := range has {
vtmp[i] = vects[p]
}
for i, p := range dLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
// matrixbuf layout: m (d*d) | raw (2*d*d) | im (d*d) | g (dCnt*d).
matrixbuf := make([]byte, 4*d*d+dCnt*d)
// m collects the encoding-matrix rows of the surviving vects.
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return err2
}
// g takes the rows of the inverse that belong to the lost data vects.
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
// Run Encode with g as the generator to compute the lost data vects.
etmp := &encBase{data: d, parity: dCnt, gen: g}
err2 = etmp.Encode(vtmp[:d+dCnt])
if err2 != nil {
return err2
}
}
if dataOnly {
return
}
pCnt := len(pLost)
if pCnt != 0 {
vtmp := make([][]byte, d+pCnt)
// g takes the encoding-matrix rows of the lost parity vects.
g := make([]byte, pCnt*d)
for i, l := range pLost {
copy(g[i*d:i*d+d], em[l*d:l*d+d])
}
// Parity is recomputed from the first d vects, i.e. the data vects
// (repaired above when they were listed in dLost).
for i := 0; i < d; i++ {
vtmp[i] = vects[i]
}
for i, p := range pLost {
if len(vects[p]) == 0 {
vects[p] = make([]byte, size)
}
vtmp[i+d] = vects[p]
}
etmp := &encBase{data: d, parity: pCnt, gen: g}
err2 := etmp.Encode(vtmp[:d+pCnt])
if err2 != nil {
return err2
}
}
return
}
// reconstWithPos validates the position lists and then delegates to reconst.
// has must contain exactly e.data positions, and neither dLost nor pLost may
// list more than e.parity positions.
func (e *encBase) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
	// TODO check more, maybe element in has show in lost & deal with len(has) > d
	d, p := e.data, e.parity
	switch {
	case len(has) != d, len(dLost) > p, len(pLost) > p:
		return errors.New("rs.Reconst: not enough vects")
	}
	return e.reconst(vects, has, dLost, pLost, dataOnly)
}
// reconstruct scans vects for nil entries, treats them as lost, and repairs
// them through reconst. The first e.data non-nil vects are used as input.
// When dataOnly is set only lost data vects are repaired.
//
// It returns an error when vects holds fewer than data+parity entries, when
// more than e.parity data vects or more than e.parity parity vects are
// missing, or when fewer than e.data vects survive.
func (e *encBase) reconstruct(vects [][]byte, dataOnly bool) (err error) {
	d := e.data
	p := e.parity
	t := d + p
	// The scan below indexes vects[0..t); reject a short slice up front
	// instead of panicking with an index out of range.
	if len(vects) < t {
		return errors.New("rs.Reconst: not enough vects")
	}
	// One allocation shared by the three position lists.
	listBuf := make([]int, t+p)
	has := listBuf[:d]
	dLost := listBuf[d:t]
	pLost := listBuf[t : t+p]
	hasCnt, dCnt, pCnt := 0, 0, 0
	for i := 0; i < t; i++ {
		switch {
		case vects[i] != nil:
			// Only the first d survivors are needed as input.
			if hasCnt < d {
				has[hasCnt] = i
				hasCnt++
			}
		case i < d:
			if dCnt >= p {
				return errors.New("rs.Reconst: not enough vects")
			}
			dLost[dCnt] = i
			dCnt++
		default:
			if pCnt >= p {
				return errors.New("rs.Reconst: not enough vects")
			}
			pLost[pCnt] = i
			pCnt++
		}
	}
	if hasCnt != d {
		return errors.New("rs.Reconst: not enough vects")
	}
	return e.reconst(vects, has, dLost[:dCnt], pLost[:pCnt], dataOnly)
}

View File

@@ -1,868 +0,0 @@
package reedsolomon
import (
"errors"
"sync"
"github.com/templexxx/cpufeat"
)
// SIMD instruction extensions that select the Encoder implementation.
const (
// none: no SIMD extension is used.
none = iota
// avx2: the AVX2 routines are used.
avx2
// ssse3: the SSSE3 routines are used.
ssse3
)
// extension is the extension in use. It defaults to none and is set by getEXT.
var extension = none
// init sets extension from the CPU features when the package is loaded.
func init() {
getEXT()
}
// getEXT sets extension from the CPU features reported by cpufeat:
// AVX2 is preferred, then SSSE3, otherwise none.
func getEXT() {
	switch {
	case cpufeat.X86.HasAVX2:
		extension = avx2
	case cpufeat.X86.HasSSSE3:
		extension = ssse3
	default:
		extension = none
	}
}
// copy32B has no Go body; it is implemented outside this file (presumably in
// assembly; confirm). initTbl calls it with a 32-byte destination and a
// 32-byte source.
//go:noescape
func copy32B(dst, src []byte) // Need SSE2(introduced in 2001)
// initTbl expands the generator matrix g (rows x cols, row-major) into tbl:
// the 32-byte lowhighTbl entry of every coefficient is copied to tbl in
// column-major order (all rows of column 0, then column 1, ...).
// tbl must hold at least rows*cols*32 bytes.
func initTbl(g matrix, rows, cols int, tbl []byte) {
	pos := 0
	for col := 0; col < cols; col++ {
		for row := 0; row < rows; row++ {
			coef := g[row*cols+col]
			copy32B(tbl[pos:pos+32], lowhighTbl[coef][:])
			pos += 32
		}
	}
}
// okCache reports whether the inverse-matrix cache should be enabled for the
// given configuration.
// At most 3060 inverse matrices exist when data=14, parity=4 (calculated by
// mathtool/cntinverse). In practice, data is usually below 12 and parity
// below 5.
// The limits can be changed, but data+parity must not exceed 32 (see the
// code that builds the inverse-matrix cache key).
func okCache(data, parity int) bool {
	return data < 15 && parity < 5
}
type (
// encSSSE3 is the Encoder implementation built on the SSSE3 routines.
encSSSE3 encSIMD
// encAVX2 is the Encoder implementation built on the AVX2 routines.
encAVX2 encSIMD
// encSIMD is the state shared by the SIMD encoders.
encSIMD struct {
// data is the number of data vects.
data int
// parity is the number of parity vects.
parity int
// encode is the encoding matrix, stored row-major with data bytes per row.
encode matrix
// gen holds the generator rows: one row of data coefficients per output vect.
gen matrix
// tbl holds one 32-byte lookup table per generator coefficient, as
// filled in by initTbl.
tbl []byte
// The inverse matrix cache is designed for small vect sizes ( < 4KB ).
// It saves the time needed to calculate the inverse matrix,
// which matters less for big vect sizes.
enableCache bool
inverseCache iCache
}
// iCache is a map of inverse matrices guarded by a read/write mutex.
// makeGen keys it by a uint32 built from the surviving positions.
iCache struct {
sync.RWMutex
data map[uint32][]byte
}
)
// newRS returns the Encoder implementation that matches the detected
// extension. em is the full encoding matrix; everything after its first d*d
// bytes (em[d*d:]) is used as the generator matrix.
func newRS(d, p int, em matrix) (enc Encoder) {
	gen := em[d*d:]
	if extension == none {
		return &encBase{data: d, parity: p, encode: em, gen: gen}
	}
	// Expand every generator coefficient into its 32-byte lookup table once.
	tbl := make([]byte, d*p*32)
	initTbl(gen, p, d, tbl)
	cacheOK := okCache(d, p)
	if extension != avx2 {
		return &encSSSE3{
			data:         d,
			parity:       p,
			encode:       em,
			gen:          gen,
			tbl:          tbl,
			enableCache:  cacheOK,
			inverseCache: iCache{data: make(map[uint32][]byte)},
		}
	}
	return &encAVX2{
		data:         d,
		parity:       p,
		encode:       em,
		gen:          gen,
		tbl:          tbl,
		enableCache:  cacheOK,
		inverseCache: iCache{data: make(map[uint32][]byte)},
	}
}
// unit is the size in bytes of the sub-vector processed per step.
const unit int = 16 * 1024

// getDo returns the step size used to walk a vect of n bytes: unit when n is
// at least unit or smaller than 16, otherwise n rounded down to a multiple
// of 16.
func getDo(n int) int {
	if n >= unit {
		return unit
	}
	// Clearing the low four bits rounds n down to a multiple of 16.
	if rounded := n &^ 15; rounded != 0 {
		return rounded
	}
	return unit
}
// Encode computes the parity vects vects[data:] from the data vects
// vects[:data]. The vects are processed in steps of getDo(size) bytes; a
// final piece shorter than one step is handled by matrixMulRemain.
// It returns the error from checkEnc when vects does not match the
// configuration.
func (e *encAVX2) Encode(vects [][]byte) (err error) {
	d, p := e.data, e.parity
	size, err := checkEnc(d, p, vects)
	if err != nil {
		return err
	}
	dv, pv := vects[:d], vects[d:]
	step := getDo(size)
	for start := 0; start < size; {
		end := start + step
		if end > size {
			e.matrixMulRemain(start, size, dv, pv)
			break
		}
		e.matrixMul(start, end, dv, pv)
		start = end
	}
	return nil
}
// mulVectAVX2 has no Go body (implemented in assembly). Callers pass a
// 32-byte lookup table, the source vect d and the destination vect p, and use
// it for the first data vect of a row so that p is overwritten.
//go:noescape
func mulVectAVX2(tbl, d, p []byte)
// mulVectAddAVX2 has no Go body (implemented in assembly). It takes the same
// arguments as mulVectAVX2; callers use it for every data vect after the
// first, so it presumably accumulates into p (confirm against the assembly).
//go:noescape
func mulVectAddAVX2(tbl, d, p []byte)
// matrixMul encodes the byte range [start:end) of every parity vect from the
// same range of the data vects, using the precomputed tables in e.tbl.
// The table of data vect i and parity vect j starts at (i*parity+j)*32.
func (e *encAVX2) matrixMul(start, end int, dv, pv [][]byte) {
	d, p := e.data, e.parity
	tbl := e.tbl
	for i := 0; i < d; i++ {
		for j := 0; j < p; j++ {
			off := (i*p + j) * 32
			t := tbl[off : off+32]
			if i == 0 {
				// The first data vect overwrites the output range.
				mulVectAVX2(t, dv[i][start:end], pv[j][start:end])
			} else {
				mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
			}
		}
	}
}
// matrixMulRemain encodes the final range [start:end) of the vects when it is
// shorter than one full step. The assembly routines are only called on ranges
// whose length is a multiple of 16, so the range is handled in up to two
// parts: the leading multiple of 16 bytes, then whatever is left over.
func (e *encAVX2) matrixMulRemain(start, end int, dv, pv [][]byte) {
undone := end - start
// do is undone rounded down to a multiple of 16.
do := (undone >> 4) << 4
d := e.data
p := e.parity
tbl := e.tbl
if do >= 16 {
// Part 1: the leading do bytes, [start:end2).
end2 := start + do
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
}
off += 32
}
}
start = end
}
if undone > do {
// Part 2: fewer than 16 bytes are left. When the vect holds at least 16
// bytes, the last 16 bytes [end-16:end) are encoded again from scratch
// (the first data vect overwrites the output).
// may recalculate some data, but still improve a lot
start2 := end - 16
if start2 >= 0 {
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
}
off += 32
}
}
} else {
// The vect is shorter than 16 bytes: use the table-driven Go routines
// with the generator coefficients.
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// encodeGen encodes like Encode, but takes the lookup tables straight from
// the generator matrix (lowhighTbl[coefficient]) instead of a precomputed
// e.tbl. It is designed for reconstruction:
// for small vects, initTbl would cost too much time, so it is skipped;
// for big vects, the precomputed tables would not help much, because the
// cache will be filled with the vects' data.
func (e *encAVX2) encodeGen(vects [][]byte) (err error) {
	d, p := e.data, e.parity
	size, err := checkEnc(d, p, vects)
	if err != nil {
		return err
	}
	dv, pv := vects[:d], vects[d:]
	step := getDo(size)
	for start := 0; start < size; {
		end := start + step
		if end > size {
			e.matrixMulRemainGen(start, size, dv, pv)
			break
		}
		e.matrixMulGen(start, end, dv, pv)
		start = end
	}
	return nil
}
// matrixMulGen encodes the byte range [start:end) of every output vect from
// the same range of the input vects, looking up the table of each generator
// coefficient e.gen[j*data+i] in lowhighTbl.
func (e *encAVX2) matrixMulGen(start, end int, dv, pv [][]byte) {
	d, p := e.data, e.parity
	gen := e.gen
	for i := 0; i < d; i++ {
		for j := 0; j < p; j++ {
			t := lowhighTbl[gen[j*d+i]][:]
			if i == 0 {
				// The first input vect overwrites the output range.
				mulVectAVX2(t, dv[i][start:end], pv[j][start:end])
			} else {
				mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
			}
		}
	}
}
// matrixMulRemainGen is the generator-matrix counterpart of matrixMulRemain:
// it encodes the final range [start:end) when it is shorter than one full
// step, taking the tables from lowhighTbl. The assembly routines are only
// called on ranges whose length is a multiple of 16.
func (e *encAVX2) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
undone := end - start
// do is undone rounded down to a multiple of 16.
do := (undone >> 4) << 4
d := e.data
p := e.parity
g := e.gen
if do >= 16 {
// Part 1: the leading do bytes, [start:end2).
end2 := start + do
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
}
}
}
start = end
}
if undone > do {
// Part 2: fewer than 16 bytes are left. When the vect holds at least 16
// bytes, the last 16 bytes [end-16:end) are encoded again from scratch.
start2 := end - 16
if start2 >= 0 {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
}
}
}
} else {
// The vect is shorter than 16 bytes: use the table-driven Go routines.
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// Reconstruct repairs lost data and parity vects; nil entries of vects are
// treated as lost.
func (e *encAVX2) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
// ReconstructData repairs lost data vects only; nil entries of vects are
// treated as lost.
func (e *encAVX2) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
// ReconstWithPos repairs the data vects listed in dLost and the parity vects
// listed in pLost, using the vects listed in has as input.
func (e *encAVX2) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
// ReconstDataWithPos repairs the data vects listed in dLost, using the vects
// listed in has as input. Parity vects are not repaired.
func (e *encAVX2) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
// makeGen builds the generator rows needed to rebuild the data vects listed
// in dLost from the surviving vects listed in has: the encoding-matrix rows
// of has are inverted, and the rows of that inverse belonging to dLost are
// returned. When the cache is enabled the inverse is stored and looked up
// under a key derived from has. It returns the error from invert on failure.
func (e *encAVX2) makeGen(has, dLost []int) (gen []byte, err error) {
d := e.data
em := e.encode
cnt := len(dLost)
if !e.enableCache {
// No cache: always compute the inverse.
// matrixbuf layout: m (d*d) | raw (2*d*d) | im (d*d) | g (cnt*d).
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
// Cache key: 1<<p added up over the positions in has (a bitmask as long
// as the positions are distinct and below 32).
var ikey uint32
for _, p := range has {
ikey += 1 << uint8(p)
}
e.inverseCache.RLock()
v, ok := e.inverseCache.data[ikey]
if ok {
// Cache hit: copy the needed rows out while the read lock is held.
im := v
g := make([]byte, cnt*d)
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
e.inverseCache.RUnlock()
return g, nil
}
e.inverseCache.RUnlock()
// Cache miss: compute the inverse outside of any lock, then store it.
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
// The cached slice shares its backing array with matrixbuf.
e.inverseCache.Lock()
e.inverseCache.data[ikey] = im
e.inverseCache.Unlock()
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
// reconst rebuilds the vects listed in dLost (data) and, unless dataOnly is
// set, the vects listed in pLost (parity), writing the results into vects.
// has lists the positions of the surviving vects used as input. Lost vects
// whose length is zero are allocated here with the length of vects[has[0]].
//
// It returns the error from makeGen (for example a singular matrix) or from
// encodeGen when either fails.
func (e *encAVX2) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
	d := e.data
	em := e.encode
	dCnt := len(dLost)
	// Length used for any output vect that has to be allocated.
	size := len(vects[has[0]])
	if dCnt != 0 {
		// vtmp = [surviving vects | lost data vects]: the layout encodeGen
		// expects for a code with d inputs and dCnt outputs.
		vtmp := make([][]byte, d+dCnt)
		for i, p := range has {
			vtmp[i] = vects[p]
		}
		for i, p := range dLost {
			if len(vects[p]) == 0 {
				vects[p] = make([]byte, size)
			}
			vtmp[i+d] = vects[p]
		}
		g, err2 := e.makeGen(has, dLost)
		if err2 != nil {
			// Report the failure to the caller. Returning the (nil) named
			// result here would claim success while the lost vects were
			// never rebuilt.
			return err2
		}
		etmp := &encAVX2{data: d, parity: dCnt, gen: g}
		if err2 = etmp.encodeGen(vtmp); err2 != nil {
			return err2
		}
	}
	if dataOnly {
		return nil
	}
	pCnt := len(pLost)
	if pCnt != 0 {
		// g takes the encoding-matrix rows of the lost parity vects.
		g := make([]byte, pCnt*d)
		for i, l := range pLost {
			copy(g[i*d:i*d+d], em[l*d:l*d+d])
		}
		// Parity is recomputed from the first d vects, i.e. the data vects
		// (repaired above when they were listed in dLost).
		vtmp := make([][]byte, d+pCnt)
		for i := 0; i < d; i++ {
			vtmp[i] = vects[i]
		}
		for i, p := range pLost {
			if len(vects[p]) == 0 {
				vects[p] = make([]byte, size)
			}
			vtmp[i+d] = vects[p]
		}
		etmp := &encAVX2{data: d, parity: pCnt, gen: g}
		if err2 := etmp.encodeGen(vtmp); err2 != nil {
			return err2
		}
	}
	return nil
}
// reconstWithPos validates the position lists and then delegates to reconst.
// has must contain exactly e.data positions, and neither dLost nor pLost may
// list more than e.parity positions.
func (e *encAVX2) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
	d, p := e.data, e.parity
	switch {
	case len(has) != d, len(dLost) > p, len(pLost) > p:
		return errors.New("rs.Reconst: not enough vects")
	}
	return e.reconst(vects, has, dLost, pLost, dataOnly)
}
// reconstruct scans vects for nil entries, treats them as lost, and repairs
// them through reconst. The first e.data non-nil vects are used as input.
// When dataOnly is set only lost data vects are repaired.
//
// It returns an error when vects holds fewer than data+parity entries, when
// more than e.parity data vects or more than e.parity parity vects are
// missing, or when fewer than e.data vects survive.
func (e *encAVX2) reconstruct(vects [][]byte, dataOnly bool) (err error) {
	d := e.data
	p := e.parity
	t := d + p
	// The scan below indexes vects[0..t); reject a short slice up front
	// instead of panicking with an index out of range.
	if len(vects) < t {
		return errors.New("rs.Reconst: not enough vects")
	}
	// One allocation shared by the three position lists.
	listBuf := make([]int, t+p)
	has := listBuf[:d]
	dLost := listBuf[d:t]
	pLost := listBuf[t : t+p]
	hasCnt, dCnt, pCnt := 0, 0, 0
	for i := 0; i < t; i++ {
		switch {
		case vects[i] != nil:
			// Only the first d survivors are needed as input.
			if hasCnt < d {
				has[hasCnt] = i
				hasCnt++
			}
		case i < d:
			if dCnt >= p {
				return errors.New("rs.Reconst: not enough vects")
			}
			dLost[dCnt] = i
			dCnt++
		default:
			if pCnt >= p {
				return errors.New("rs.Reconst: not enough vects")
			}
			pLost[pCnt] = i
			pCnt++
		}
	}
	if hasCnt != d {
		return errors.New("rs.Reconst: not enough vects")
	}
	return e.reconst(vects, has, dLost[:dCnt], pLost[:pCnt], dataOnly)
}
// Encode computes the parity vects vects[data:] from the data vects
// vects[:data]. The vects are processed in steps of getDo(size) bytes; a
// final piece shorter than one step is handled by matrixMulRemain.
// It returns the error from checkEnc when vects does not match the
// configuration.
func (e *encSSSE3) Encode(vects [][]byte) (err error) {
	d, p := e.data, e.parity
	size, err := checkEnc(d, p, vects)
	if err != nil {
		return err
	}
	dv, pv := vects[:d], vects[d:]
	step := getDo(size)
	for start := 0; start < size; {
		end := start + step
		if end > size {
			e.matrixMulRemain(start, size, dv, pv)
			break
		}
		e.matrixMul(start, end, dv, pv)
		start = end
	}
	return nil
}
// mulVectSSSE3 has no Go body (presumably implemented in assembly; confirm).
// Callers pass a 32-byte lookup table, the source vect d and the destination
// vect p, and use it for the first data vect of a row so that p is
// overwritten.
//go:noescape
func mulVectSSSE3(tbl, d, p []byte)
// mulVectAddSSSE3 has no Go body (presumably implemented in assembly;
// confirm). It takes the same arguments as mulVectSSSE3; callers use it for
// every data vect after the first, so it presumably accumulates into p.
//go:noescape
func mulVectAddSSSE3(tbl, d, p []byte)
// matrixMul encodes the byte range [start:end) of every parity vect from the
// same range of the data vects, using the precomputed tables in e.tbl.
// The table of data vect i and parity vect j starts at (i*parity+j)*32.
func (e *encSSSE3) matrixMul(start, end int, dv, pv [][]byte) {
	d, p := e.data, e.parity
	tbl := e.tbl
	for i := 0; i < d; i++ {
		for j := 0; j < p; j++ {
			off := (i*p + j) * 32
			t := tbl[off : off+32]
			if i == 0 {
				// The first data vect overwrites the output range.
				mulVectSSSE3(t, dv[i][start:end], pv[j][start:end])
			} else {
				mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
			}
		}
	}
}
// matrixMulRemain encodes the final range [start:end) of the vects when it is
// shorter than one full step. The assembly routines are only called on ranges
// whose length is a multiple of 16, so the range is handled in up to two
// parts: the leading multiple of 16 bytes, then whatever is left over.
func (e *encSSSE3) matrixMulRemain(start, end int, dv, pv [][]byte) {
undone := end - start
// do is undone rounded down to a multiple of 16.
do := (undone >> 4) << 4
d := e.data
p := e.parity
tbl := e.tbl
if do >= 16 {
// Part 1: the leading do bytes, [start:end2).
end2 := start + do
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
}
off += 32
}
}
start = end
}
if undone > do {
// Part 2: fewer than 16 bytes are left. When the vect holds at least 16
// bytes, the last 16 bytes [end-16:end) are encoded again from scratch
// (the first data vect overwrites the output).
start2 := end - 16
if start2 >= 0 {
off := 0
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := tbl[off : off+32]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
}
off += 32
}
}
} else {
// The vect is shorter than 16 bytes: use the table-driven Go routines
// with the generator coefficients.
g := e.gen
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// encodeGen encodes like Encode, but takes the lookup tables straight from
// the generator matrix (lowhighTbl[coefficient]) instead of a precomputed
// e.tbl. It is designed for reconstruction:
// for small vects, initTbl would cost too much time, so it is skipped;
// for big vects, the precomputed tables would not help much, because the
// cache will be filled with the vects' data.
func (e *encSSSE3) encodeGen(vects [][]byte) (err error) {
	d, p := e.data, e.parity
	size, err := checkEnc(d, p, vects)
	if err != nil {
		return err
	}
	dv, pv := vects[:d], vects[d:]
	step := getDo(size)
	for start := 0; start < size; {
		end := start + step
		if end > size {
			e.matrixMulRemainGen(start, size, dv, pv)
			break
		}
		e.matrixMulGen(start, end, dv, pv)
		start = end
	}
	return nil
}
// matrixMulGen encodes the byte range [start:end) of every output vect from
// the same range of the input vects, looking up the table of each generator
// coefficient e.gen[j*data+i] in lowhighTbl.
func (e *encSSSE3) matrixMulGen(start, end int, dv, pv [][]byte) {
	d, p := e.data, e.parity
	gen := e.gen
	for i := 0; i < d; i++ {
		for j := 0; j < p; j++ {
			t := lowhighTbl[gen[j*d+i]][:]
			if i == 0 {
				// The first input vect overwrites the output range.
				mulVectSSSE3(t, dv[i][start:end], pv[j][start:end])
			} else {
				mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
			}
		}
	}
}
// matrixMulRemainGen is the generator-matrix counterpart of matrixMulRemain:
// it encodes the final range [start:end) when it is shorter than one full
// step, taking the tables from lowhighTbl. The assembly routines are only
// called on ranges whose length is a multiple of 16.
func (e *encSSSE3) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
undone := end - start
// do is undone rounded down to a multiple of 16.
do := (undone >> 4) << 4
d := e.data
p := e.parity
g := e.gen
if do >= 16 {
// Part 1: the leading do bytes, [start:end2).
end2 := start + do
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
} else {
mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
}
}
}
start = end
}
if undone > do {
// Part 2: fewer than 16 bytes are left. When the vect holds at least 16
// bytes, the last 16 bytes [end-16:end) are encoded again from scratch.
start2 := end - 16
if start2 >= 0 {
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
t := lowhighTbl[g[j*d+i]][:]
if i != 0 {
mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
} else {
mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
}
}
}
} else {
// The vect is shorter than 16 bytes: use the table-driven Go routines.
for i := 0; i < d; i++ {
for j := 0; j < p; j++ {
if i != 0 {
mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
} else {
mulVect(g[j*d], dv[0][start:], pv[j][start:])
}
}
}
}
}
}
// Reconstruct repairs lost data and parity vects; nil entries of vects are
// treated as lost.
func (e *encSSSE3) Reconstruct(vects [][]byte) (err error) {
return e.reconstruct(vects, false)
}
// ReconstructData repairs lost data vects only; nil entries of vects are
// treated as lost.
func (e *encSSSE3) ReconstructData(vects [][]byte) (err error) {
return e.reconstruct(vects, true)
}
// ReconstWithPos repairs the data vects listed in dLost and the parity vects
// listed in pLost, using the vects listed in has as input.
func (e *encSSSE3) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
return e.reconstWithPos(vects, has, dLost, pLost, false)
}
// ReconstDataWithPos repairs the data vects listed in dLost, using the vects
// listed in has as input. Parity vects are not repaired.
func (e *encSSSE3) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
return e.reconstWithPos(vects, has, dLost, nil, true)
}
// makeGen builds the generator rows needed to rebuild the data vects listed
// in dLost from the surviving vects listed in has: the encoding-matrix rows
// of has are inverted, and the rows of that inverse belonging to dLost are
// returned. When the cache is enabled the inverse is stored and looked up
// under a key derived from has. It returns the error from invert on failure.
func (e *encSSSE3) makeGen(has, dLost []int) (gen []byte, err error) {
d := e.data
em := e.encode
cnt := len(dLost)
if !e.enableCache {
// No cache: always compute the inverse.
// matrixbuf layout: m (d*d) | raw (2*d*d) | im (d*d) | g (cnt*d).
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
// Cache key: 1<<p added up over the positions in has (a bitmask as long
// as the positions are distinct and below 32).
var ikey uint32
for _, p := range has {
ikey += 1 << uint8(p)
}
e.inverseCache.RLock()
v, ok := e.inverseCache.data[ikey]
if ok {
// Cache hit: copy the needed rows out while the read lock is held.
im := v
g := make([]byte, cnt*d)
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
e.inverseCache.RUnlock()
return g, nil
}
e.inverseCache.RUnlock()
// Cache miss: compute the inverse outside of any lock, then store it.
matrixbuf := make([]byte, 4*d*d+cnt*d)
m := matrixbuf[:d*d]
for i, l := range has {
copy(m[i*d:i*d+d], em[l*d:l*d+d])
}
raw := matrixbuf[d*d : 3*d*d]
im := matrixbuf[3*d*d : 4*d*d]
err2 := matrix(m).invert(raw, d, im)
if err2 != nil {
return nil, err2
}
// The cached slice shares its backing array with matrixbuf.
e.inverseCache.Lock()
e.inverseCache.data[ikey] = im
e.inverseCache.Unlock()
g := matrixbuf[4*d*d:]
for i, l := range dLost {
copy(g[i*d:i*d+d], im[l*d:l*d+d])
}
return g, nil
}
// reconst rebuilds the vects listed in dLost (data) and, unless dataOnly is
// set, the vects listed in pLost (parity), writing the results into vects.
// has lists the positions of the surviving vects used as input. Lost vects
// whose length is zero are allocated here with the length of vects[has[0]].
//
// It returns the error from makeGen (for example a singular matrix) or from
// encodeGen when either fails.
func (e *encSSSE3) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
	d := e.data
	em := e.encode
	dCnt := len(dLost)
	// Length used for any output vect that has to be allocated.
	size := len(vects[has[0]])
	if dCnt != 0 {
		// vtmp = [surviving vects | lost data vects]: the layout encodeGen
		// expects for a code with d inputs and dCnt outputs.
		vtmp := make([][]byte, d+dCnt)
		for i, p := range has {
			vtmp[i] = vects[p]
		}
		for i, p := range dLost {
			if len(vects[p]) == 0 {
				vects[p] = make([]byte, size)
			}
			vtmp[i+d] = vects[p]
		}
		g, err2 := e.makeGen(has, dLost)
		if err2 != nil {
			// Report the failure to the caller. Returning the (nil) named
			// result here would claim success while the lost vects were
			// never rebuilt.
			return err2
		}
		etmp := &encSSSE3{data: d, parity: dCnt, gen: g}
		if err2 = etmp.encodeGen(vtmp); err2 != nil {
			return err2
		}
	}
	if dataOnly {
		return nil
	}
	pCnt := len(pLost)
	if pCnt != 0 {
		// g takes the encoding-matrix rows of the lost parity vects.
		g := make([]byte, pCnt*d)
		for i, l := range pLost {
			copy(g[i*d:i*d+d], em[l*d:l*d+d])
		}
		// Parity is recomputed from the first d vects, i.e. the data vects
		// (repaired above when they were listed in dLost).
		vtmp := make([][]byte, d+pCnt)
		for i := 0; i < d; i++ {
			vtmp[i] = vects[i]
		}
		for i, p := range pLost {
			if len(vects[p]) == 0 {
				vects[p] = make([]byte, size)
			}
			vtmp[i+d] = vects[p]
		}
		etmp := &encSSSE3{data: d, parity: pCnt, gen: g}
		if err2 := etmp.encodeGen(vtmp); err2 != nil {
			return err2
		}
	}
	return nil
}
// reconstWithPos validates the position lists and then delegates to reconst.
// has must contain exactly e.data positions, and neither dLost nor pLost may
// list more than e.parity positions.
func (e *encSSSE3) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
	d, p := e.data, e.parity
	switch {
	case len(has) != d, len(dLost) > p, len(pLost) > p:
		return errors.New("rs.Reconst: not enough vects")
	}
	return e.reconst(vects, has, dLost, pLost, dataOnly)
}
// reconstruct scans vects for nil entries, treats them as lost, and repairs
// them through reconst. The first e.data non-nil vects are used as input.
// When dataOnly is set only lost data vects are repaired.
//
// It returns an error when vects holds fewer than data+parity entries, when
// more than e.parity data vects or more than e.parity parity vects are
// missing, or when fewer than e.data vects survive.
func (e *encSSSE3) reconstruct(vects [][]byte, dataOnly bool) (err error) {
	d := e.data
	p := e.parity
	t := d + p
	// The scan below indexes vects[0..t); reject a short slice up front
	// instead of panicking with an index out of range.
	if len(vects) < t {
		return errors.New("rs.Reconst: not enough vects")
	}
	// One allocation shared by the three position lists.
	listBuf := make([]int, t+p)
	has := listBuf[:d]
	dLost := listBuf[d:t]
	pLost := listBuf[t : t+p]
	hasCnt, dCnt, pCnt := 0, 0, 0
	for i := 0; i < t; i++ {
		switch {
		case vects[i] != nil:
			// Only the first d survivors are needed as input.
			if hasCnt < d {
				has[hasCnt] = i
				hasCnt++
			}
		case i < d:
			if dCnt >= p {
				return errors.New("rs.Reconst: not enough vects")
			}
			dLost[dCnt] = i
			dCnt++
		default:
			if pCnt >= p {
				return errors.New("rs.Reconst: not enough vects")
			}
			pLost[pCnt] = i
			pCnt++
		}
	}
	if hasCnt != d {
		return errors.New("rs.Reconst: not enough vects")
	}
	return e.reconst(vects, has, dLost[:dCnt], pLost[:pCnt], dataOnly)
}

View File

@@ -1,401 +0,0 @@
// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
#include "textflag.h"
#define low_tbl Y0
#define high_tbl Y1
#define mask Y2
#define in0 Y3
#define in1 Y4
#define in2 Y5
#define in3 Y6
#define in4 Y7
#define in5 Y8
#define in0_h Y10
#define in1_h Y11
#define in2_h Y12
#define in3_h Y13
#define in4_h Y14
#define in5_h Y15
#define in BX
#define out DI
#define len R8
#define pos R9
#define tmp0 R10
#define low_tblx X0
#define high_tblx X1
#define maskx X2
#define in0x X3
#define in0_hx X10
#define tmp0x X9
#define tmp1x X11
#define tmp2x X12
#define tmp3x X13
// func mulVectAVX2(tbl, d, p []byte)
//
// mulVectAVX2 maps every input byte b to low_tbl[b & 0x0f] XOR high_tbl[b >> 4]
// and stores the result at the same offset of the output, where low_tbl is
// tbl[0:16] and high_tbl is tbl[16:32].
// (Presumably tbl is a nibble-split GF(2^8) multiplication table for a single
// coefficient, as in the referenced paper -- confirm against the table builder.)
//
// Frame layout as read below: input base at 24(FP) with its length at 32(FP),
// output base at 48(FP), i.e. the 2nd and 3rd slices of the prototype comment.
// Only the input length is read; the output is assumed to be at least as long
// -- TODO confirm.
//
// Length handling:
//   - if len is not a multiple of 32, the last 16 bytes are done with XMM first;
//   - if len is then not a multiple of 256, the tail is done 32 bytes at a time,
//     walking backwards from the end;
//   - whatever remains is done 256 bytes per iteration from the start.
//
// NOTE(review): the branches only terminate cleanly when len is a non-zero
// multiple of 16 (len == 0 falls straight into the 256-byte loop) -- verify
// that callers guarantee this.
// NOTE(review): the frame names i/o/in_len differ from the d/p names in the
// prototype comment; go vet's asmdecl check may flag that.
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
// Load the two 16-byte lookup tables.
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
// Build the nibble mask: put 0x0f in byte 0 of maskx, then broadcast it to
// every byte of maskx.
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
// Not a multiple of 32: peel one 16-byte chunk off the end first.
TESTQ $31, len
JNZ one16b
ymm:
// Duplicate tables and mask into the upper 128-bit lane; VPSHUFB looks up
// within each 128-bit lane independently.
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
// Not a multiple of 256: handle the tail in 32-byte steps first.
TESTQ $255, len
JNZ not_aligned
// 256bytes/loop
aligned:
MOVQ $0, pos
loop256b:
// bytes [pos, pos+32): split into nibbles, look both up, XOR, store.
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, (out)(pos*1)
// bytes [pos+32, pos+64)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 32(out)(pos*1)
// bytes [pos+64, pos+96)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VMOVDQU in2, 64(out)(pos*1)
// bytes [pos+96, pos+128)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VMOVDQU in3, 96(out)(pos*1)
// bytes [pos+128, pos+160)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VMOVDQU in4, 128(out)(pos*1)
// bytes [pos+160, pos+192)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VMOVDQU in5, 160(out)(pos*1)
// bytes [pos+192, pos+224): in0/in0_h are reused, their result is already stored.
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, 192(out)(pos*1)
// bytes [pos+224, pos+256): in1/in1_h are reused.
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 224(out)(pos*1)
// Advance by 256 bytes and loop until pos reaches len.
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
// Zero the upper YMM halves before returning.
VZEROUPPER
RET
not_aligned:
// tmp0 = number of tail bytes beyond the last multiple of 256.
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
// Process the last 32 bytes of the still-unprocessed range, then shrink len.
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
// Tail done; if at least 256 bytes remain, finish them in the 256-byte loop.
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
// Process the last 16 bytes with 128-bit registers so the remaining length
// becomes a multiple of 32 (given a multiple-of-16 input).
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
// Continue with the YMM paths unless those 16 bytes were the whole input.
CMPQ len, $0
JNE ymm
RET
// func mulVectAddAVX2(tbl, d, p []byte)
//
// mulVectAddAVX2 computes, for every input byte b,
// low_tbl[b & 0x0f] XOR high_tbl[b >> 4] (low_tbl = tbl[0:16],
// high_tbl = tbl[16:32]) and XORs that value into the byte already present at
// the same offset of the output. It differs from mulVectAVX2 only by the extra
// VPXOR with the existing output before each store.
//
// Frame layout as read below: input base at 24(FP) with its length at 32(FP),
// output base at 48(FP). Only the input length is read; the output is assumed
// to be at least as long -- TODO confirm.
//
// Length handling: an odd 16-byte chunk is peeled off the end with XMM, then a
// non-multiple-of-256 tail is processed backwards in 32-byte steps, then the
// rest is processed 256 bytes per iteration from the start.
//
// NOTE(review): the branches only terminate cleanly when len is a non-zero
// multiple of 16 -- verify that callers guarantee this.
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
// Load the two 16-byte lookup tables.
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
// Build the 0x0f nibble mask in every byte of maskx. The LONG/WORD pair is the
// same byte sequence that mulVectAVX2 annotates as
// VPINSRB $0x00, EDX, XMM2, XMM2.
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
// Not a multiple of 32: peel one 16-byte chunk off the end first.
TESTQ $31, len
JNZ one16b
ymm:
// Duplicate tables and mask into the upper 128-bit lane for the YMM paths.
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
// Not a multiple of 256: handle the tail in 32-byte steps first.
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
// bytes [pos, pos+32): nibble lookups, XOR together, XOR into output, store.
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR (out)(pos*1), in0, in0
VMOVDQU in0, (out)(pos*1)
// bytes [pos+32, pos+64)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 32(out)(pos*1), in1, in1
VMOVDQU in1, 32(out)(pos*1)
// bytes [pos+64, pos+96)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VPXOR 64(out)(pos*1), in2, in2
VMOVDQU in2, 64(out)(pos*1)
// bytes [pos+96, pos+128)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VPXOR 96(out)(pos*1), in3, in3
VMOVDQU in3, 96(out)(pos*1)
// bytes [pos+128, pos+160)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VPXOR 128(out)(pos*1), in4, in4
VMOVDQU in4, 128(out)(pos*1)
// bytes [pos+160, pos+192)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VPXOR 160(out)(pos*1), in5, in5
VMOVDQU in5, 160(out)(pos*1)
// bytes [pos+192, pos+224): in0/in0_h are reused, their result is already stored.
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR 192(out)(pos*1), in0, in0
VMOVDQU in0, 192(out)(pos*1)
// bytes [pos+224, pos+256): in1/in1_h are reused.
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 224(out)(pos*1), in1, in1
VMOVDQU in1, 224(out)(pos*1)
// Advance by 256 bytes and loop until pos reaches len.
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
// Zero the upper YMM halves before returning.
VZEROUPPER
RET
not_aligned:
// tmp0 = number of tail bytes beyond the last multiple of 256.
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
// Process the last 32 bytes of the still-unprocessed range, then shrink len.
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR -32(out)(len*1), in0, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
// Tail done; if at least 256 bytes remain, finish them in the 256-byte loop.
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
// Process the last 16 bytes with 128-bit registers so the remaining length
// becomes a multiple of 32 (given a multiple-of-16 input).
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VPXOR -16(out)(len*1), in0x, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
// Continue with the YMM paths unless those 16 bytes were the whole input.
CMPQ len, $0
JNE ymm
RET
// func mulVectSSSE3(tbl, d, p []byte)
//
// mulVectSSSE3 maps every input byte b to low_tbl[b & 0x0f] XOR high_tbl[b >> 4]
// (low_tbl = tbl[0:16], high_tbl = tbl[16:32]) and stores the result at the
// same offset of the output, 16 bytes per iteration.
//
// Frame layout as read below: input base at 24(FP) with its length at 32(FP),
// output base at 48(FP). Only the input length is read.
//
// NOTE(review): the loop body runs before the block counter is tested, so a
// length below 16 is not handled, and any remainder of len/16 is ignored --
// verify that callers pass a non-zero multiple of 16.
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
// Load the two 16-byte lookup tables.
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
// Build the nibble mask: set the low byte of tmp0 to 0x0f, move it into maskx,
// then PSHUFB with an all-zero index register replicates byte 0 into all 16
// bytes of maskx.
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
// Convert the byte length into a count of 16-byte blocks.
SHRQ $4, len
loop:
// Load 16 input bytes and split them into low (in0x) and high (in0_hx) nibbles.
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
// PSHUFB overwrites its table operand, so look up in copies of the tables.
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
// Combine both lookups and store the result.
PXOR tmp1x, tmp2x
MOVOU tmp2x, (out)
// Advance both pointers and count down the remaining blocks.
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func mulVectAddSSSE3(tbl, d, p []byte)
//
// mulVectAddSSSE3 computes, for every input byte b,
// low_tbl[b & 0x0f] XOR high_tbl[b >> 4] (low_tbl = tbl[0:16],
// high_tbl = tbl[16:32]) and XORs that value into the byte already present at
// the same offset of the output, 16 bytes per iteration.
//
// Frame layout as read below: input base at 24(FP) with its length at 32(FP),
// output base at 48(FP). Only the input length is read.
//
// NOTE(review): the loop body runs before the block counter is tested, so a
// length below 16 is not handled, and any remainder of len/16 is ignored --
// verify that callers pass a non-zero multiple of 16.
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
// Load the two 16-byte lookup tables.
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
// Build the nibble mask: 0x0f in byte 0 of maskx, replicated to all 16 bytes
// by PSHUFB with an all-zero index register.
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
// Convert the byte length into a count of 16-byte blocks.
SHRQ $4, len
loop:
// Load 16 input bytes and split them into low (in0x) and high (in0_hx) nibbles.
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
// PSHUFB overwrites its table operand, so look up in copies of the tables.
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
// Fold the result into the existing output bytes and write them back.
MOVOU (out), tmp3x
PXOR tmp3x, tmp2x
MOVOU tmp2x, (out)
// Advance both pointers and count down the remaining blocks.
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func copy32B(dst, src []byte)
//
// copy32B copies exactly 32 bytes from the start of src to the start of dst
// using two unaligned 16-byte loads followed by two 16-byte stores.
// Neither slice length is read, so both must be at least 32 bytes long.
TEXT ·copy32B(SB), NOSPLIT, $0
// Base pointers of dst (0(FP)) and src (24(FP)).
MOVQ dst+0(FP), SI
MOVQ src+24(FP), DX
// Load all 32 source bytes before storing anything.
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU X0, (SI)
MOVOU X1, 16(SI)
RET

View File

@@ -1,8 +0,0 @@
// +build !amd64
package reedsolomon
// newRS returns the encoder used on non-amd64 builds: an *encBase carrying the
// data/parity counts, the full encoding matrix em, and gen, the part of em that
// follows its first d*d elements.
func newRS(d, p int, em matrix) (enc Encoder) {
	enc = &encBase{
		data:   d,
		parity: p,
		encode: em,
		gen:    em[d*d:],
	}
	return enc
}

File diff suppressed because one or more lines are too long