update kcp-go package

2026-04-14 21:19:11 +08:00 · 2019-03-17 17:09:54 +08:00
parent 87a4de4370
commit fdcdccb0c2
122 changed files with 14490 additions and 2469 deletions
--- a/vendor/github.com/templexxx/reedsolomon/.gitignore
+++ b/vendor/github.com/templexxx/reedsolomon/.gitignore
@@ -1,40 +0,0 @@
-# Compiled Object files, Static and Dynamic libs (Shared Objects)
-*.o
-*.a
-*.so
-
-# Folders
-_obj
-_test
-
-# Architecture specific extensions/prefixes
-*.[568vq]
-[568vq].out
-
-*.cgo1.go
-*.cgo2.c
-_cgo_defun.c
-_cgo_gotypes.go
-_cgo_export.*
-
-_testmain.go
-
-*.exe
-*.test
-*.prof
-/.idea
-/backup
-/loopunroll/
-cpu.out
-mathtool/galois/
-mathtool/matrix/
-mem.out
-/examples/
-/.DS_Store
-/mathtool/cntinverse
-/invert
-/bakcup
-/buf.svg
-*.svg
-*.out
-/escape
--- a/vendor/github.com/templexxx/reedsolomon/.travis.yml
+++ b/vendor/github.com/templexxx/reedsolomon/.travis.yml
@@ -1,9 +0,0 @@
-language: go
-go:
-    - 1.9
-
-install:
-    - go get github.com/templexxx/reedsolomon
-
-script:
-    - go test -v
--- a/vendor/github.com/templexxx/reedsolomon/LICENSE
+++ b/vendor/github.com/templexxx/reedsolomon/LICENSE
@@ -1,23 +0,0 @@
-MIT License
-
-Copyright (c) 2017 Templexxx
-Copyright (c) 2015 Klaus Post
-Copyright (c) 2015 Backblaze
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--- a/vendor/github.com/templexxx/reedsolomon/README.md
+++ b/vendor/github.com/templexxx/reedsolomon/README.md
@@ -1,108 +0,0 @@
-# Reed-Solomon
-
-[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] 
-
-[1]: https://godoc.org/github.com/templexxx/reedsolomon?status.svg
-[2]: https://godoc.org/github.com/templexxx/reedsolomon
-[3]: https://img.shields.io/badge/license-MIT-blue.svg
-[4]: LICENSE
-[5]: https://travis-ci.org/templexxx/reedsolomon.svg?branch=master
-[6]: https://travis-ci.org/templexxx/reedsolomon
-[7]: https://goreportcard.com/badge/github.com/templexxx/reedsolomon
-[8]: https://goreportcard.com/report/github.com/templexxx/reedsolomon
-
-
-## Introduction:
-1.  Reed-Solomon Erasure Code engine in pure Go.
-2.  Super Fast: more than 10GB/s per physics core ( 10+4, 4KB per vector, Macbook Pro 2.8 GHz Intel Core i7 )
-
-## Installation
-To get the package use the standard:
-```bash
-go get github.com/templexxx/reedsolomon
-```
-
-## Documentation
-See the associated [GoDoc](http://godoc.org/github.com/templexxx/reedsolomon)
-
-## Specification
-### GOARCH
-1. All arch are supported
-2. 0.1.0 need go1.9 for sync.Map in AMD64
-
-### Math
-1. Coding over in GF(2^8)
-2. Primitive Polynomial: x^8 + x^4 + x^3 + x^2 + 1 (0x1d)
-3. mathtool/gentbls.go : generator Primitive Polynomial and it's log table, exp table, multiply table, inverse table etc. We can get more info about how galois field work
-4. mathtool/cntinverse.go : calculate how many inverse matrix will have in different RS codes config
-5. Both of Cauchy and Vandermonde Matrix are supported. Vandermonde need more operations for preserving the property that any square subset of rows is invertible
-
-### Why so fast?
-These three parts will cost too much time:
-
-1. lookup galois-field tables
-2. read/write memory
-3. calculate inverse matrix in the reconstruct process
-
-SIMD will solve no.1
-
-Cache-friendly codes will help to solve no.2 & no.3, and more, use a sync.Map for cache inverse matrix, it will help to save about 1000ns when we need same matrix. 
-
-## Performance
-
-Performance depends mainly on:
-
-1. CPU instruction extension( AVX2 or SSSE3 or none )
-2. number of data/parity vects
-3. unit size of calculation ( see it in rs_amd64.go )
-4. size of shards
-5. speed of memory (waste so much time on read/write mem, :D )
-6. performance of CPU
-7. the way of using ( reuse memory)
-
-And we must know the benchmark test is quite different with encoding/decoding in practice.
-
-Because in benchmark test loops, the CPU Cache will help a lot. In practice, we must reuse the memory to make the performance become as good as the benchmark test.
-
-Example of performance on my MacBook 2017 i7 2.8GHz. 10+4 (with 0.1.0).
-
-### Encoding:
-
-| Vector size | Speed (MB/S) |
-|----------------|--------------|
-| 1400B              |    7655.02  |
-| 4KB              |       10551.37  |
-| 64KB              |       9297.25 |
-| 1MB              |      6829.89 |
-| 16MB              |      6312.83 |
-
-### Reconstruct (use nil to point which one need repair):
-
-| Vector size | Speed (MB/S) |
-|----------------|--------------|
-| 1400B              |    4124.85  |
-| 4KB              |       5715.45 |
-| 64KB              |       6050.06 |
-| 1MB              |      5001.21 |
-| 16MB              |      5043.04 |
-
-### ReconstructWithPos (use a position list to point which one need repair, reuse the memory):
-
-| Vector size | Speed (MB/S) |
-|----------------|--------------|
-| 1400B              |    6170.24  |
-| 4KB              |       9444.86 |
-| 64KB              |       9311.30 |
-| 1MB              |      6781.06 |
-| 16MB              |      6285.34 |
-
-**reconstruct benchmark tests here run with inverse matrix cache, if there is no cache, it will cost more time( about 1000ns)**
-
-## Who is using this?
-
-1. https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang
-
-## Links & Thanks
-* [Klauspost ReedSolomon](https://github.com/klauspost/reedsolomon)
-* [intel ISA-L](https://github.com/01org/isa-l)
-* [GF SIMD] (http://www.ssrc.ucsc.edu/papers/plank-fast13.pdf)
--- a/vendor/github.com/templexxx/reedsolomon/matrix.go
+++ b/vendor/github.com/templexxx/reedsolomon/matrix.go
@@ -1,156 +0,0 @@
-package reedsolomon
-
-import "errors"
-
-type matrix []byte
-
-func genEncMatrixCauchy(d, p int) matrix {
-	t := d + p
-	m := make([]byte, t*d)
-	for i := 0; i < d; i++ {
-		m[i*d+i] = byte(1)
-	}
-
-	d2 := d * d
-	for i := d; i < t; i++ {
-		for j := 0; j < d; j++ {
-			d := i ^ j
-			a := inverseTbl[d]
-			m[d2] = byte(a)
-			d2++
-		}
-	}
-	return m
-}
-
-func gfExp(b byte, n int) byte {
-	if n == 0 {
-		return 1
-	}
-	if b == 0 {
-		return 0
-	}
-	a := logTbl[b]
-	ret := int(a) * n
-	for ret >= 255 {
-		ret -= 255
-	}
-	return byte(expTbl[ret])
-}
-
-func genVandMatrix(vm []byte, t, d int) {
-	for i := 0; i < t; i++ {
-		for j := 0; j < d; j++ {
-			vm[i*d+j] = gfExp(byte(i), j)
-		}
-	}
-}
-
-func (m matrix) mul(right matrix, rows, cols int, r []byte) {
-	for i := 0; i < rows; i++ {
-		for j := 0; j < cols; j++ {
-			var v byte
-			for k := 0; k < cols; k++ {
-				v ^= gfMul(m[i*cols+k], right[k*cols+j])
-			}
-			r[i*cols+j] = v
-		}
-	}
-}
-
-func genEncMatrixVand(d, p int) (matrix, error) {
-	t := d + p
-	buf := make([]byte, (2*t+4*d)*d)
-	vm := buf[:t*d]
-	genVandMatrix(vm, t, d)
-	top := buf[t*d : (t+d)*d]
-	copy(top, vm[:d*d])
-	raw := buf[(t+d)*d : (t+3*d)*d]
-	im := buf[(t+3*d)*d : (t+4*d)*d]
-	err := matrix(top).invert(raw, d, im)
-	if err != nil {
-		return nil, err
-	}
-	r := buf[(t+4*d)*d : (2*t+4*d)*d]
-	matrix(vm).mul(im, t, d, r)
-	return matrix(r), nil
-}
-
-// [I|m'] -> [m']
-func (m matrix) subMatrix(n int, r []byte) {
-	for i := 0; i < n; i++ {
-		off := i * n
-		copy(r[off:off+n], m[2*off+n:2*(off+n)])
-	}
-}
-
-func (m matrix) invert(raw matrix, n int, im []byte) error {
-	// [m] -> [m|I]
-	for i := 0; i < n; i++ {
-		t := i * n
-		copy(raw[2*t:2*t+n], m[t:t+n])
-		raw[2*t+i+n] = byte(1)
-	}
-	err := gauss(raw, n)
-	if err != nil {
-		return err
-	}
-	raw.subMatrix(n, im)
-	return nil
-}
-
-func (m matrix) swap(i, j, n int) {
-	for k := 0; k < n; k++ {
-		m[i*n+k], m[j*n+k] = m[j*n+k], m[i*n+k]
-	}
-}
-
-func gfMul(a, b byte) byte {
-	return mulTbl[a][b]
-}
-
-var errSingular = errors.New("rs.invert: matrix is singular")
-
-// [m|I] -> [I|m']
-func gauss(m matrix, n int) error {
-	n2 := 2 * n
-	for i := 0; i < n; i++ {
-		if m[i*n2+i] == 0 {
-			for j := i + 1; j < n; j++ {
-				if m[j*n2+i] != 0 {
-					m.swap(i, j, n2)
-					break
-				}
-			}
-		}
-		if m[i*n2+i] == 0 {
-			return errSingular
-		}
-		if m[i*n2+i] != 1 {
-			d := m[i*n2+i]
-			scale := inverseTbl[d]
-			for c := 0; c < n2; c++ {
-				m[i*n2+c] = gfMul(m[i*n2+c], scale)
-			}
-		}
-		for j := i + 1; j < n; j++ {
-			if m[j*n2+i] != 0 {
-				scale := m[j*n2+i]
-				for c := 0; c < n2; c++ {
-					m[j*n2+c] ^= gfMul(scale, m[i*n2+c])
-				}
-			}
-		}
-	}
-	for k := 0; k < n; k++ {
-		for j := 0; j < k; j++ {
-			if m[j*n2+k] != 0 {
-				scale := m[j*n2+k]
-				for c := 0; c < n2; c++ {
-					m[j*n2+c] ^= gfMul(scale, m[k*n2+c])
-				}
-			}
-		}
-	}
-	return nil
-}
--- a/vendor/github.com/templexxx/reedsolomon/rs.go
+++ b/vendor/github.com/templexxx/reedsolomon/rs.go
@@ -1,280 +0,0 @@
-/*
-	Reed-Solomon Codes over GF(2^8)
-	Primitive Polynomial:  x^8+x^4+x^3+x^2+1
-	Galois Filed arithmetic using Intel SIMD instructions (AVX2 or SSSE3)
-*/
-
-package reedsolomon
-
-import "errors"
-
-// Encoder implements for Reed-Solomon Encoding/Reconstructing
-type Encoder interface {
-	// Encode multiply generator-matrix with data
-	// len(vects) must be equal with num of data+parity
-	Encode(vects [][]byte) error
-	// Result of reconst will be put into origin position of vects
-	// it means if you lost vects[0], after reconst the vects[0]'s data will be back in vects[0]
-
-	// Reconstruct repair lost data & parity
-	// Set vect nil if lost
-	Reconstruct(vects [][]byte) error
-	// Reconstruct repair lost data
-	// Set vect nil if lost
-	ReconstructData(vects [][]byte) error
-	// ReconstWithPos repair lost data&parity with has&lost vects position
-	// Save bandwidth&disk I/O (cmp with Reconstruct, if the lost is less than num of parity)
-	// As erasure codes, we must know which vect is broken,
-	// so it's necessary to provide such APIs
-	// len(has) must equal num of data vects
-	// Example:
-	// in 3+2, the whole position: [0,1,2,3,4]
-	// if lost vects[0]
-	// the "has" could be [1,2,3] or [1,2,4] or ...
-	// then you must be sure that vects[1] vects[2] vects[3] have correct data (if the "has" is [1,2,3])
-	// the "dLost" will be [0]
-	// ps:
-	// 1. the above lists are in increasing orders  TODO support out-of-order
-	// 2. each vect has same len, don't set it nil
-	// so we don't need to make slice
-	ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error
-	//// ReconstWithPos repair lost data with survived&lost vects position
-	//// Don't need to append position of parity lost into "lost"
-	ReconstDataWithPos(vects [][]byte, has, dLost []int) error
-}
-
-func checkCfg(d, p int) error {
-	if (d <= 0) || (p <= 0) {
-		return errors.New("rs.New: data or parity <= 0")
-	}
-	if d+p >= 256 {
-		return errors.New("rs.New: data+parity >= 256")
-	}
-	return nil
-}
-
-// New create an Encoder (vandermonde matrix as Encoding matrix)
-func New(data, parity int) (enc Encoder, err error) {
-	err = checkCfg(data, parity)
-	if err != nil {
-		return
-	}
-	e, err := genEncMatrixVand(data, parity)
-	if err != nil {
-		return
-	}
-	return newRS(data, parity, e), nil
-}
-
-// NewCauchy create an Encoder (cauchy matrix as Generator Matrix)
-func NewCauchy(data, parity int) (enc Encoder, err error) {
-	err = checkCfg(data, parity)
-	if err != nil {
-		return
-	}
-	e := genEncMatrixCauchy(data, parity)
-	return newRS(data, parity, e), nil
-}
-
-type encBase struct {
-	data   int
-	parity int
-	encode []byte
-	gen    []byte
-}
-
-func checkEnc(d, p int, vs [][]byte) (size int, err error) {
-	total := len(vs)
-	if d+p != total {
-		err = errors.New("rs.checkER: vects not match rs args")
-		return
-	}
-	size = len(vs[0])
-	if size == 0 {
-		err = errors.New("rs.checkER: vects size = 0")
-		return
-	}
-	for i := 1; i < total; i++ {
-		if len(vs[i]) != size {
-			err = errors.New("rs.checkER: vects size mismatch")
-			return
-		}
-	}
-	return
-}
-
-func (e *encBase) Encode(vects [][]byte) (err error) {
-	d := e.data
-	p := e.parity
-	_, err = checkEnc(d, p, vects)
-	if err != nil {
-		return
-	}
-	dv := vects[:d]
-	pv := vects[d:]
-	g := e.gen
-	for i := 0; i < d; i++ {
-		for j := 0; j < p; j++ {
-			if i != 0 {
-				mulVectAdd(g[j*d+i], dv[i], pv[j])
-			} else {
-				mulVect(g[j*d], dv[0], pv[j])
-			}
-		}
-	}
-	return
-}
-
-func mulVect(c byte, a, b []byte) {
-	t := mulTbl[c]
-	for i := 0; i < len(a); i++ {
-		b[i] = t[a[i]]
-	}
-}
-
-func mulVectAdd(c byte, a, b []byte) {
-	t := mulTbl[c]
-	for i := 0; i < len(a); i++ {
-		b[i] ^= t[a[i]]
-	}
-}
-
-func (e *encBase) Reconstruct(vects [][]byte) (err error) {
-	return e.reconstruct(vects, false)
-}
-
-func (e *encBase) ReconstructData(vects [][]byte) (err error) {
-	return e.reconstruct(vects, true)
-}
-
-func (e *encBase) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
-	return e.reconstWithPos(vects, has, dLost, pLost, false)
-}
-
-func (e *encBase) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
-	return e.reconstWithPos(vects, has, dLost, nil, true)
-}
-
-func (e *encBase) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
-	d := e.data
-	em := e.encode
-	dCnt := len(dLost)
-	size := len(vects[has[0]])
-	if dCnt != 0 {
-		vtmp := make([][]byte, d+dCnt)
-		for i, p := range has {
-			vtmp[i] = vects[p]
-		}
-		for i, p := range dLost {
-			if len(vects[p]) == 0 {
-				vects[p] = make([]byte, size)
-			}
-			vtmp[i+d] = vects[p]
-		}
-		matrixbuf := make([]byte, 4*d*d+dCnt*d)
-		m := matrixbuf[:d*d]
-		for i, l := range has {
-			copy(m[i*d:i*d+d], em[l*d:l*d+d])
-		}
-		raw := matrixbuf[d*d : 3*d*d]
-		im := matrixbuf[3*d*d : 4*d*d]
-		err2 := matrix(m).invert(raw, d, im)
-		if err2 != nil {
-			return err2
-		}
-		g := matrixbuf[4*d*d:]
-		for i, l := range dLost {
-			copy(g[i*d:i*d+d], im[l*d:l*d+d])
-		}
-		etmp := &encBase{data: d, parity: dCnt, gen: g}
-		err2 = etmp.Encode(vtmp[:d+dCnt])
-		if err2 != nil {
-			return err2
-		}
-	}
-	if dataOnly {
-		return
-	}
-	pCnt := len(pLost)
-	if pCnt != 0 {
-		vtmp := make([][]byte, d+pCnt)
-		g := make([]byte, pCnt*d)
-		for i, l := range pLost {
-			copy(g[i*d:i*d+d], em[l*d:l*d+d])
-		}
-		for i := 0; i < d; i++ {
-			vtmp[i] = vects[i]
-		}
-		for i, p := range pLost {
-			if len(vects[p]) == 0 {
-				vects[p] = make([]byte, size)
-			}
-			vtmp[i+d] = vects[p]
-		}
-		etmp := &encBase{data: d, parity: pCnt, gen: g}
-		err2 := etmp.Encode(vtmp[:d+pCnt])
-		if err2 != nil {
-			return err2
-		}
-	}
-	return
-}
-
-func (e *encBase) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
-	d := e.data
-	p := e.parity
-	// TODO check more, maybe element in has show in lost & deal with len(has) > d
-	if len(has) != d {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	dCnt := len(dLost)
-	if dCnt > p {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	pCnt := len(pLost)
-	if pCnt > p {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	return e.reconst(vects, has, dLost, pLost, dataOnly)
-}
-
-func (e *encBase) reconstruct(vects [][]byte, dataOnly bool) (err error) {
-	d := e.data
-	p := e.parity
-	t := d + p
-	listBuf := make([]int, t+p)
-	has := listBuf[:d]
-	dLost := listBuf[d:t]
-	pLost := listBuf[t : t+p]
-	hasCnt, dCnt, pCnt := 0, 0, 0
-	for i := 0; i < t; i++ {
-		if vects[i] != nil {
-			if hasCnt < d {
-				has[hasCnt] = i
-				hasCnt++
-			}
-		} else {
-			if i < d {
-				if dCnt < p {
-					dLost[dCnt] = i
-					dCnt++
-				} else {
-					return errors.New("rs.Reconst: not enough vects")
-				}
-			} else {
-				if pCnt < p {
-					pLost[pCnt] = i
-					pCnt++
-				} else {
-					return errors.New("rs.Reconst: not enough vects")
-				}
-			}
-		}
-	}
-	if hasCnt != d {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	dLost = dLost[:dCnt]
-	pLost = pLost[:pCnt]
-	return e.reconst(vects, has, dLost, pLost, dataOnly)
-}
--- a/vendor/github.com/templexxx/reedsolomon/rs_amd64.go
+++ b/vendor/github.com/templexxx/reedsolomon/rs_amd64.go
@@ -1,868 +0,0 @@
-package reedsolomon
-
-import (
-	"errors"
-	"sync"
-
-	"github.com/templexxx/cpufeat"
-)
-
-// SIMD Instruction Extensions
-const (
-	none = iota
-	avx2
-	ssse3
-)
-
-var extension = none
-
-func init() {
-	getEXT()
-}
-
-func getEXT() {
-	if cpufeat.X86.HasAVX2 {
-		extension = avx2
-		return
-	} else if cpufeat.X86.HasSSSE3 {
-		extension = ssse3
-		return
-	} else {
-		extension = none
-		return
-	}
-}
-
-//go:noescape
-func copy32B(dst, src []byte) // Need SSE2(introduced in 2001)
-
-func initTbl(g matrix, rows, cols int, tbl []byte) {
-	off := 0
-	for i := 0; i < cols; i++ {
-		for j := 0; j < rows; j++ {
-			c := g[j*cols+i]
-			t := lowhighTbl[c][:]
-			copy32B(tbl[off:off+32], t)
-			off += 32
-		}
-	}
-}
-
-// At most 3060 inverse matrix (when data=14, parity=4, calc by mathtool/cntinverse)
-// In practice,  data usually below 12, parity below 5
-func okCache(data, parity int) bool {
-	if data < 15 && parity < 5 { // you can change it, but the data+parity can't be bigger than 32 (tips: see the codes about make inverse matrix)
-		return true
-	}
-	return false
-}
-
-type (
-	encSSSE3 encSIMD
-	encAVX2  encSIMD
-	encSIMD  struct {
-		data   int
-		parity int
-		encode matrix
-		gen    matrix
-		tbl    []byte
-		// inverse matrix cache is design for small vect size ( < 4KB )
-		// it will save time for calculating inverse matrix
-		// but it's not so important for big vect size
-		enableCache  bool
-		inverseCache iCache
-	}
-	iCache struct {
-		sync.RWMutex
-		data map[uint32][]byte
-	}
-)
-
-func newRS(d, p int, em matrix) (enc Encoder) {
-	g := em[d*d:]
-	if extension == none {
-		return &encBase{data: d, parity: p, encode: em, gen: g}
-	}
-	t := make([]byte, d*p*32)
-	initTbl(g, p, d, t)
-	ok := okCache(d, p)
-	if extension == avx2 {
-		e := &encAVX2{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
-			inverseCache: iCache{data: make(map[uint32][]byte)}}
-		return e
-	}
-	e := &encSSSE3{data: d, parity: p, encode: em, gen: g, tbl: t, enableCache: ok,
-		inverseCache: iCache{data: make(map[uint32][]byte)}}
-	return e
-}
-
-// Size of sub-vector
-const unit int = 16 * 1024
-
-func getDo(n int) int {
-	if n < unit {
-		c := n >> 4
-		if c == 0 {
-			return unit
-		}
-		return c << 4
-	}
-	return unit
-}
-
-func (e *encAVX2) Encode(vects [][]byte) (err error) {
-	d := e.data
-	p := e.parity
-	size, err := checkEnc(d, p, vects)
-	if err != nil {
-		return
-	}
-	dv := vects[:d]
-	pv := vects[d:]
-	start, end := 0, 0
-	do := getDo(size)
-	for start < size {
-		end = start + do
-		if end <= size {
-			e.matrixMul(start, end, dv, pv)
-			start = end
-		} else {
-			e.matrixMulRemain(start, size, dv, pv)
-			start = size
-		}
-	}
-	return
-}
-
-//go:noescape
-func mulVectAVX2(tbl, d, p []byte)
-
-//go:noescape
-func mulVectAddAVX2(tbl, d, p []byte)
-
-func (e *encAVX2) matrixMul(start, end int, dv, pv [][]byte) {
-	d := e.data
-	p := e.parity
-	tbl := e.tbl
-	off := 0
-	for i := 0; i < d; i++ {
-		for j := 0; j < p; j++ {
-			t := tbl[off : off+32]
-			if i != 0 {
-				mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
-			} else {
-				mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
-			}
-			off += 32
-		}
-	}
-}
-
-func (e *encAVX2) matrixMulRemain(start, end int, dv, pv [][]byte) {
-	undone := end - start
-	do := (undone >> 4) << 4
-	d := e.data
-	p := e.parity
-	tbl := e.tbl
-	if do >= 16 {
-		end2 := start + do
-		off := 0
-		for i := 0; i < d; i++ {
-			for j := 0; j < p; j++ {
-				t := tbl[off : off+32]
-				if i != 0 {
-					mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
-				} else {
-					mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
-				}
-				off += 32
-			}
-		}
-		start = end
-	}
-	if undone > do {
-		// may recalculate some data, but still improve a lot
-		start2 := end - 16
-		if start2 >= 0 {
-			off := 0
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					t := tbl[off : off+32]
-					if i != 0 {
-						mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
-					} else {
-						mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
-					}
-					off += 32
-				}
-			}
-		} else {
-			g := e.gen
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					if i != 0 {
-						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
-					} else {
-						mulVect(g[j*d], dv[0][start:], pv[j][start:])
-					}
-				}
-			}
-		}
-	}
-}
-
-// use generator-matrix but not tbls for encoding
-// it's design for reconstructing
-// for small vects, it cost to much time on initTbl, so drop it
-// and for big vects, the tbls can't impact much, because the cache will be filled with vects' data
-func (e *encAVX2) encodeGen(vects [][]byte) (err error) {
-	d := e.data
-	p := e.parity
-	size, err := checkEnc(d, p, vects)
-	if err != nil {
-		return
-	}
-	dv := vects[:d]
-	pv := vects[d:]
-	start, end := 0, 0
-	do := getDo(size)
-	for start < size {
-		end = start + do
-		if end <= size {
-			e.matrixMulGen(start, end, dv, pv)
-			start = end
-		} else {
-			e.matrixMulRemainGen(start, size, dv, pv)
-			start = size
-		}
-	}
-	return
-}
-
-func (e *encAVX2) matrixMulGen(start, end int, dv, pv [][]byte) {
-	d := e.data
-	p := e.parity
-	g := e.gen
-	for i := 0; i < d; i++ {
-		for j := 0; j < p; j++ {
-			t := lowhighTbl[g[j*d+i]][:]
-			if i != 0 {
-				mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
-			} else {
-				mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
-			}
-		}
-	}
-}
-
-func (e *encAVX2) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
-	undone := end - start
-	do := (undone >> 4) << 4
-	d := e.data
-	p := e.parity
-	g := e.gen
-	if do >= 16 {
-		end2 := start + do
-		for i := 0; i < d; i++ {
-			for j := 0; j < p; j++ {
-				t := lowhighTbl[g[j*d+i]][:]
-				if i != 0 {
-					mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
-				} else {
-					mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
-				}
-			}
-		}
-		start = end
-	}
-	if undone > do {
-		start2 := end - 16
-		if start2 >= 0 {
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					t := lowhighTbl[g[j*d+i]][:]
-					if i != 0 {
-						mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
-					} else {
-						mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
-					}
-				}
-			}
-		} else {
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					if i != 0 {
-						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
-					} else {
-						mulVect(g[j*d], dv[0][start:], pv[j][start:])
-					}
-				}
-			}
-		}
-	}
-}
-
-func (e *encAVX2) Reconstruct(vects [][]byte) (err error) {
-	return e.reconstruct(vects, false)
-}
-
-func (e *encAVX2) ReconstructData(vects [][]byte) (err error) {
-	return e.reconstruct(vects, true)
-}
-
-func (e *encAVX2) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
-	return e.reconstWithPos(vects, has, dLost, pLost, false)
-}
-
-func (e *encAVX2) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
-	return e.reconstWithPos(vects, has, dLost, nil, true)
-}
-
-func (e *encAVX2) makeGen(has, dLost []int) (gen []byte, err error) {
-	d := e.data
-	em := e.encode
-	cnt := len(dLost)
-	if !e.enableCache {
-		matrixbuf := make([]byte, 4*d*d+cnt*d)
-		m := matrixbuf[:d*d]
-		for i, l := range has {
-			copy(m[i*d:i*d+d], em[l*d:l*d+d])
-		}
-		raw := matrixbuf[d*d : 3*d*d]
-		im := matrixbuf[3*d*d : 4*d*d]
-		err2 := matrix(m).invert(raw, d, im)
-		if err2 != nil {
-			return nil, err2
-		}
-		g := matrixbuf[4*d*d:]
-		for i, l := range dLost {
-			copy(g[i*d:i*d+d], im[l*d:l*d+d])
-		}
-		return g, nil
-	}
-	var ikey uint32
-	for _, p := range has {
-		ikey += 1 << uint8(p)
-	}
-	e.inverseCache.RLock()
-	v, ok := e.inverseCache.data[ikey]
-	if ok {
-		im := v
-		g := make([]byte, cnt*d)
-		for i, l := range dLost {
-			copy(g[i*d:i*d+d], im[l*d:l*d+d])
-		}
-		e.inverseCache.RUnlock()
-		return g, nil
-	}
-	e.inverseCache.RUnlock()
-	matrixbuf := make([]byte, 4*d*d+cnt*d)
-	m := matrixbuf[:d*d]
-	for i, l := range has {
-		copy(m[i*d:i*d+d], em[l*d:l*d+d])
-	}
-	raw := matrixbuf[d*d : 3*d*d]
-	im := matrixbuf[3*d*d : 4*d*d]
-	err2 := matrix(m).invert(raw, d, im)
-	if err2 != nil {
-		return nil, err2
-	}
-	e.inverseCache.Lock()
-	e.inverseCache.data[ikey] = im
-	e.inverseCache.Unlock()
-	g := matrixbuf[4*d*d:]
-	for i, l := range dLost {
-		copy(g[i*d:i*d+d], im[l*d:l*d+d])
-	}
-	return g, nil
-}
-
-func (e *encAVX2) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
-	d := e.data
-	em := e.encode
-	dCnt := len(dLost)
-	size := len(vects[has[0]])
-	if dCnt != 0 {
-		vtmp := make([][]byte, d+dCnt)
-		for i, p := range has {
-			vtmp[i] = vects[p]
-		}
-		for i, p := range dLost {
-			if len(vects[p]) == 0 {
-				vects[p] = make([]byte, size)
-			}
-			vtmp[i+d] = vects[p]
-		}
-		g, err2 := e.makeGen(has, dLost)
-		if err2 != nil {
-			return
-		}
-		etmp := &encAVX2{data: d, parity: dCnt, gen: g}
-		err2 = etmp.encodeGen(vtmp)
-		if err2 != nil {
-			return err2
-		}
-	}
-	if dataOnly {
-		return
-	}
-	pCnt := len(pLost)
-	if pCnt != 0 {
-		g := make([]byte, pCnt*d)
-		for i, l := range pLost {
-			copy(g[i*d:i*d+d], em[l*d:l*d+d])
-		}
-		vtmp := make([][]byte, d+pCnt)
-		for i := 0; i < d; i++ {
-			vtmp[i] = vects[i]
-		}
-		for i, p := range pLost {
-			if len(vects[p]) == 0 {
-				vects[p] = make([]byte, size)
-			}
-			vtmp[i+d] = vects[p]
-		}
-		etmp := &encAVX2{data: d, parity: pCnt, gen: g}
-		err2 := etmp.encodeGen(vtmp)
-		if err2 != nil {
-			return err2
-		}
-	}
-	return
-}
-
-func (e *encAVX2) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
-	d := e.data
-	p := e.parity
-	if len(has) != d {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	dCnt := len(dLost)
-	if dCnt > p {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	pCnt := len(pLost)
-	if pCnt > p {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	return e.reconst(vects, has, dLost, pLost, dataOnly)
-}
-
-func (e *encAVX2) reconstruct(vects [][]byte, dataOnly bool) (err error) {
-	d := e.data
-	p := e.parity
-	t := d + p
-	listBuf := make([]int, t+p)
-	has := listBuf[:d]
-	dLost := listBuf[d:t]
-	pLost := listBuf[t : t+p]
-	hasCnt, dCnt, pCnt := 0, 0, 0
-	for i := 0; i < t; i++ {
-		if vects[i] != nil {
-			if hasCnt < d {
-				has[hasCnt] = i
-				hasCnt++
-			}
-		} else {
-			if i < d {
-				if dCnt < p {
-					dLost[dCnt] = i
-					dCnt++
-				} else {
-					return errors.New("rs.Reconst: not enough vects")
-				}
-			} else {
-				if pCnt < p {
-					pLost[pCnt] = i
-					pCnt++
-				} else {
-					return errors.New("rs.Reconst: not enough vects")
-				}
-			}
-		}
-	}
-	if hasCnt != d {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	dLost = dLost[:dCnt]
-	pLost = pLost[:pCnt]
-	return e.reconst(vects, has, dLost, pLost, dataOnly)
-}
-
-func (e *encSSSE3) Encode(vects [][]byte) (err error) {
-	d := e.data
-	p := e.parity
-	size, err := checkEnc(d, p, vects)
-	if err != nil {
-		return
-	}
-	dv := vects[:d]
-	pv := vects[d:]
-	start, end := 0, 0
-	do := getDo(size)
-	for start < size {
-		end = start + do
-		if end <= size {
-			e.matrixMul(start, end, dv, pv)
-			start = end
-		} else {
-			e.matrixMulRemain(start, size, dv, pv)
-			start = size
-		}
-	}
-	return
-}
-
-//go:noescape
-func mulVectSSSE3(tbl, d, p []byte)
-
-//go:noescape
-func mulVectAddSSSE3(tbl, d, p []byte)
-
-func (e *encSSSE3) matrixMul(start, end int, dv, pv [][]byte) {
-	d := e.data
-	p := e.parity
-	tbl := e.tbl
-	off := 0
-	for i := 0; i < d; i++ {
-		for j := 0; j < p; j++ {
-			t := tbl[off : off+32]
-			if i != 0 {
-				mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
-			} else {
-				mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
-			}
-			off += 32
-		}
-	}
-}
-
-func (e *encSSSE3) matrixMulRemain(start, end int, dv, pv [][]byte) {
-	undone := end - start
-	do := (undone >> 4) << 4
-	d := e.data
-	p := e.parity
-	tbl := e.tbl
-	if do >= 16 {
-		end2 := start + do
-		off := 0
-		for i := 0; i < d; i++ {
-			for j := 0; j < p; j++ {
-				t := tbl[off : off+32]
-				if i != 0 {
-					mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
-				} else {
-					mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
-				}
-				off += 32
-			}
-		}
-		start = end
-	}
-	if undone > do {
-		start2 := end - 16
-		if start2 >= 0 {
-			off := 0
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					t := tbl[off : off+32]
-					if i != 0 {
-						mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
-					} else {
-						mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
-					}
-					off += 32
-				}
-			}
-		} else {
-			g := e.gen
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					if i != 0 {
-						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
-					} else {
-						mulVect(g[j*d], dv[0][start:], pv[j][start:])
-					}
-				}
-			}
-		}
-	}
-}
-
-// use generator-matrix but not tbls for encoding
-// it's design for reconstructing
-// for small vects, it cost to much time on initTbl, so drop it
-// and for big vects, the tbls can't impact much, because the cache will be filled with vects' data
-func (e *encSSSE3) encodeGen(vects [][]byte) (err error) {
-	d := e.data
-	p := e.parity
-	size, err := checkEnc(d, p, vects)
-	if err != nil {
-		return
-	}
-	dv := vects[:d]
-	pv := vects[d:]
-	start, end := 0, 0
-	do := getDo(size)
-	for start < size {
-		end = start + do
-		if end <= size {
-			e.matrixMulGen(start, end, dv, pv)
-			start = end
-		} else {
-			e.matrixMulRemainGen(start, size, dv, pv)
-			start = size
-		}
-	}
-	return
-}
-
-func (e *encSSSE3) matrixMulGen(start, end int, dv, pv [][]byte) {
-	d := e.data
-	p := e.parity
-	g := e.gen
-	for i := 0; i < d; i++ {
-		for j := 0; j < p; j++ {
-			t := lowhighTbl[g[j*d+i]][:]
-			if i != 0 {
-				mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
-			} else {
-				mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
-			}
-		}
-	}
-}
-
-func (e *encSSSE3) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
-	undone := end - start
-	do := (undone >> 4) << 4
-	d := e.data
-	p := e.parity
-	g := e.gen
-	if do >= 16 {
-		end2 := start + do
-		for i := 0; i < d; i++ {
-			for j := 0; j < p; j++ {
-				t := lowhighTbl[g[j*d+i]][:]
-				if i != 0 {
-					mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
-				} else {
-					mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
-				}
-			}
-		}
-		start = end
-	}
-	if undone > do {
-		start2 := end - 16
-		if start2 >= 0 {
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					t := lowhighTbl[g[j*d+i]][:]
-					if i != 0 {
-						mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
-					} else {
-						mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
-					}
-				}
-			}
-		} else {
-			for i := 0; i < d; i++ {
-				for j := 0; j < p; j++ {
-					if i != 0 {
-						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
-					} else {
-						mulVect(g[j*d], dv[0][start:], pv[j][start:])
-					}
-				}
-			}
-		}
-	}
-}
-
-func (e *encSSSE3) Reconstruct(vects [][]byte) (err error) {
-	return e.reconstruct(vects, false)
-}
-
-func (e *encSSSE3) ReconstructData(vects [][]byte) (err error) {
-	return e.reconstruct(vects, true)
-}
-
-func (e *encSSSE3) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
-	return e.reconstWithPos(vects, has, dLost, pLost, false)
-}
-
-func (e *encSSSE3) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
-	return e.reconstWithPos(vects, has, dLost, nil, true)
-}
-
-// makeGen returns the generator rows needed to rebuild the lost data vects.
-//
-// has lists the rows of e.encode that belong to the surviving vects and
-// dLost the indexes of the lost data vects. The rows named by has are
-// gathered into a d x d matrix, that matrix is inverted, and the rows of the
-// inverse named by dLost are returned back to back (len(dLost)*d bytes).
-//
-// When e.enableCache is set the inverse is memoised in e.inverseCache under
-// a bitmask built from has. The cache keeps its own d*d copy of the inverse,
-// so an entry does not keep the much larger scratch buffer reachable.
-//
-// An error from the matrix inversion is returned unchanged with a nil gen.
-func (e *encSSSE3) makeGen(has, dLost []int) (gen []byte, err error) {
-	d := e.data
-	cnt := len(dLost)
-
-	// pickRows copies the rows of inv named by dLost into dst and returns dst.
-	pickRows := func(dst, inv []byte) []byte {
-		for i, l := range dLost {
-			copy(dst[i*d:i*d+d], inv[l*d:l*d+d])
-		}
-		return dst
-	}
-
-	// invert builds and inverts the matrix of surviving rows. A single
-	// allocation is carved into: matrix (d*d) | scratch (2*d*d) |
-	// inverse (d*d) | room for the selected rows (cnt*d).
-	invert := func() ([]byte, []byte, error) {
-		buf := make([]byte, 4*d*d+cnt*d)
-		m := buf[:d*d]
-		for i, l := range has {
-			copy(m[i*d:i*d+d], e.encode[l*d:l*d+d])
-		}
-		inv := buf[3*d*d : 4*d*d]
-		if ierr := matrix(m).invert(buf[d*d:3*d*d], d, inv); ierr != nil {
-			return nil, nil, ierr
-		}
-		return inv, buf[4*d*d:], nil
-	}
-
-	if !e.enableCache {
-		inv, out, ierr := invert()
-		if ierr != nil {
-			return nil, ierr
-		}
-		return pickRows(out, inv), nil
-	}
-
-	// One bit per surviving index. NOTE(review): indexes of 32 and above
-	// shift out of the 32-bit key; presumably the cache is only enabled for
-	// small vect counts - confirm where enableCache is set.
-	var ikey uint32
-	for _, p := range has {
-		ikey += 1 << uint8(p)
-	}
-
-	e.inverseCache.RLock()
-	if cached, ok := e.inverseCache.data[ikey]; ok {
-		g := pickRows(make([]byte, cnt*d), cached)
-		e.inverseCache.RUnlock()
-		return g, nil
-	}
-	e.inverseCache.RUnlock()
-
-	inv, out, ierr := invert()
-	if ierr != nil {
-		return nil, ierr
-	}
-	// Store a detached copy: inv is a window into the scratch buffer, and
-	// caching it directly would keep all 4*d*d+cnt*d bytes alive per entry.
-	stored := append([]byte(nil), inv...)
-	e.inverseCache.Lock()
-	e.inverseCache.data[ikey] = stored
-	e.inverseCache.Unlock()
-	return pickRows(out, inv), nil
-}
-
-// reconst rebuilds the lost vects in place.
-//
-// has lists the indexes of the surviving vects used as input, dLost the lost
-// data indexes and pLost the lost parity indexes. A lost entry of vects with
-// length zero is allocated with the length of vects[has[0]].
-//
-// Lost data is rebuilt first, with the generator rows returned by makeGen.
-// Unless dataOnly is set, lost parity is then re-encoded from vects[0:d]
-// with the matching rows of e.encode.
-//
-// Errors from makeGen and encodeGen are returned to the caller.
-func (e *encSSSE3) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
-	d := e.data
-	size := len(vects[has[0]])
-
-	if dCnt := len(dLost); dCnt != 0 {
-		vtmp := make([][]byte, d+dCnt)
-		for i, p := range has {
-			vtmp[i] = vects[p]
-		}
-		for i, p := range dLost {
-			if len(vects[p]) == 0 {
-				vects[p] = make([]byte, size)
-			}
-			vtmp[d+i] = vects[p]
-		}
-		g, genErr := e.makeGen(has, dLost)
-		if genErr != nil {
-			// Propagate the failure: returning nil here would report
-			// success while the lost data vects were never rebuilt.
-			return genErr
-		}
-		etmp := &encSSSE3{data: d, parity: dCnt, gen: g}
-		if encErr := etmp.encodeGen(vtmp); encErr != nil {
-			return encErr
-		}
-	}
-	if dataOnly {
-		return nil
-	}
-
-	if pCnt := len(pLost); pCnt != 0 {
-		// The generator for the lost parity is the matching rows of e.encode.
-		em := e.encode
-		g := make([]byte, pCnt*d)
-		for i, l := range pLost {
-			copy(g[i*d:i*d+d], em[l*d:l*d+d])
-		}
-		vtmp := make([][]byte, d+pCnt)
-		for i := 0; i < d; i++ {
-			vtmp[i] = vects[i]
-		}
-		for i, p := range pLost {
-			if len(vects[p]) == 0 {
-				vects[p] = make([]byte, size)
-			}
-			vtmp[d+i] = vects[p]
-		}
-		etmp := &encSSSE3{data: d, parity: pCnt, gen: g}
-		if encErr := etmp.encodeGen(vtmp); encErr != nil {
-			return encErr
-		}
-	}
-	return nil
-}
-
-// reconstWithPos checks the caller-supplied position lists and then hands
-// over to reconst.
-//
-// has must hold exactly e.data indexes, and neither dLost nor pLost may hold
-// more than e.parity indexes; otherwise an error is returned. The index
-// values themselves are not checked here.
-func (e *encSSSE3) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
-	switch maxLost := e.parity; {
-	case len(has) != e.data,
-		len(dLost) > maxLost,
-		len(pLost) > maxLost:
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	return e.reconst(vects, has, dLost, pLost, dataOnly)
-}
-
-// reconstruct derives the position lists from vects and then calls reconst.
-//
-// The first e.data+e.parity entries of vects are scanned: a non-nil entry is
-// a survivor (only the first e.data survivors are used as input) and a nil
-// entry is recorded as lost data when its index is below e.data, otherwise
-// as lost parity.
-//
-// An error is returned when vects holds fewer than e.data+e.parity entries,
-// when more than e.parity data or parity vects are lost, or when fewer than
-// e.data survivors exist.
-func (e *encSSSE3) reconstruct(vects [][]byte, dataOnly bool) (err error) {
-	d := e.data
-	p := e.parity
-	t := d + p
-	// Reject a short slice up front instead of panicking on vects[i] below.
-	if len(vects) < t {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-
-	// One backing array, split into three capacity-limited lists so that
-	// the appends below can neither reallocate nor run into a neighbour.
-	listBuf := make([]int, t+p)
-	has := listBuf[0:0:d]
-	dLost := listBuf[d:d:t]
-	pLost := listBuf[t : t : t+p]
-
-	for i := 0; i < t; i++ {
-		switch {
-		case vects[i] != nil:
-			if len(has) < d {
-				has = append(has, i)
-			}
-		case i < d:
-			if len(dLost) >= p {
-				return errors.New("rs.Reconst: not enough vects")
-			}
-			dLost = append(dLost, i)
-		default:
-			if len(pLost) >= p {
-				return errors.New("rs.Reconst: not enough vects")
-			}
-			pLost = append(pLost, i)
-		}
-	}
-	if len(has) != d {
-		return errors.New("rs.Reconst: not enough vects")
-	}
-	return e.reconst(vects, has, dLost, pLost, dataOnly)
-}
--- a/vendor/github.com/templexxx/reedsolomon/rs_amd64.s
+++ b/vendor/github.com/templexxx/reedsolomon/rs_amd64.s
@@ -1,401 +0,0 @@
-// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
-
-#include "textflag.h"
-
-// Register aliases for the 256-bit (Y register) code paths.
-// low_tbl and high_tbl hold the two 16-byte lookup tables read from tbl;
-// mask holds 0x0f in every byte.
-#define low_tbl Y0
-#define high_tbl Y1
-#define mask Y2
-// inN receives 32 input bytes and ends up holding the result;
-// inN_h holds the high nibbles of the same bytes.
-#define in0  Y3
-#define in1  Y4
-#define in2  Y5
-#define in3  Y6
-#define in4  Y7
-#define in5  Y8
-#define in0_h  Y10
-#define in1_h  Y11
-#define in2_h  Y12
-#define in3_h  Y13
-#define in4_h  Y14
-#define in5_h  Y15
-
-// General-purpose registers: source pointer, destination pointer, byte
-// count, and the running offset of the 256-byte loops.
-#define in  BX
-#define out DI
-#define len R8
-#define pos R9
-
-// Scratch register: first the table pointer, later a byte counter or the
-// mask seed, depending on the routine.
-#define tmp0 R10
-
-// 128-bit (X register) names used by the 16-byte tail of the AVX2 routines
-// and by the SSSE3 routines. X0, X1, X2, X3 and X10 carry the same register
-// numbers as low_tbl, high_tbl, mask, in0 and in0_h above.
-// tmp1x..tmp3x reuse the numbers of in1_h..in3_h; they are only referenced
-// by the SSSE3 routines.
-#define low_tblx X0
-#define high_tblx X1
-#define maskx X2
-#define in0x X3
-#define in0_hx X10
-#define tmp0x  X9
-#define tmp1x  X11
-#define tmp2x  X12
-#define tmp3x  X13
-
-
-// func mulVectAVX2(tbl, d, p []byte)
-//
-// For every input byte b the routine stores
-//     tbl[b & 0x0f] ^ tbl[16 + (b >> 4)]
-// at the same offset of the output (two 16-entry table lookups via VPSHUFB).
-// Presumably tbl is the low/high nibble product table of one GF(2^8)
-// coefficient, as in the paper referenced at the top of the file - confirm
-// against the table generator.
-//
-// Arguments are read from the frame at 0(FP) (tbl base), 24(FP) and 32(FP)
-// (input base and length) and 48(FP) (output base). The output length is
-// never read, so the output must be at least as long as the input.
-// NOTE(review): the symbol names i, in_len and o do not match the d and p of
-// the signature above - confirm against the Go declaration.
-//
-// NOTE(review): the control flow only works when the length is a non-zero
-// multiple of 16: a length of 0 enters loop256b, where pos can never equal
-// len. Confirm that every caller guarantees this.
-TEXT ·mulVectAVX2(SB), NOSPLIT, $0
-    MOVQ         i+24(FP), in
-	MOVQ         o+48(FP), out
-	MOVQ         tbl+0(FP), tmp0
-	// First 16 table bytes serve the low nibble, the next 16 the high nibble.
-	VMOVDQU      (tmp0), low_tblx
-	VMOVDQU      16(tmp0), high_tblx
-	// Build the 0x0f nibble mask: insert DL into byte 0 of X2, then broadcast.
-	MOVB         $0x0f, DX
-	LONG         $0x2069e3c4; WORD $0x00d2   // VPINSRB $0x00, EDX, XMM2, XMM2
-	VPBROADCASTB maskx, maskx
-	MOVQ         in_len+32(FP), len
-	// Length not a multiple of 32: handle the last 16 bytes first.
-	TESTQ        $31, len
-	JNZ          one16b
-
-// Copy the low 128 bits of the tables and the mask into the upper lane so
-// the 256-bit instructions see the same values in both lanes.
-ymm:
-    VINSERTI128  $1, low_tblx, low_tbl, low_tbl
-    VINSERTI128  $1, high_tblx, high_tbl, high_tbl
-    VINSERTI128  $1, maskx, mask, mask
-    // Length not a multiple of 256: trim the excess 32 bytes at a time first.
-    TESTQ        $255, len
-    JNZ          not_aligned
-
-// 256bytes/loop
-aligned:
-    MOVQ         $0, pos
-
-loop256b:
-	// Bytes 0..31 of the block: split into nibbles, look both up, XOR, store.
-	VMOVDQU (in)(pos*1), in0
-	VPSRLQ  $4, in0, in0_h
-	VPAND   mask, in0_h, in0_h
-	VPAND   mask, in0, in0
-	VPSHUFB in0_h, high_tbl, in0_h
-	VPSHUFB in0, low_tbl, in0
-	VPXOR   in0, in0_h, in0
-	VMOVDQU in0, (out)(pos*1)
-
-	// The seven groups below repeat the same steps for bytes 32..255.
-    VMOVDQU 32(in)(pos*1), in1
-	VPSRLQ  $4, in1, in1_h
-	VPAND   mask, in1_h, in1_h
-	VPAND   mask, in1, in1
-	VPSHUFB in1_h, high_tbl, in1_h
-	VPSHUFB in1, low_tbl, in1
-	VPXOR   in1, in1_h, in1
-	VMOVDQU in1, 32(out)(pos*1)
-
-    VMOVDQU 64(in)(pos*1), in2
-	VPSRLQ  $4, in2, in2_h
-	VPAND   mask, in2_h, in2_h
-	VPAND   mask, in2, in2
-	VPSHUFB in2_h, high_tbl, in2_h
-	VPSHUFB in2, low_tbl, in2
-	VPXOR   in2, in2_h, in2
-	VMOVDQU in2, 64(out)(pos*1)
-
-    VMOVDQU 96(in)(pos*1), in3
-	VPSRLQ  $4, in3, in3_h
-	VPAND   mask, in3_h, in3_h
-	VPAND   mask, in3, in3
-	VPSHUFB in3_h, high_tbl, in3_h
-	VPSHUFB in3, low_tbl, in3
-	VPXOR   in3, in3_h, in3
-	VMOVDQU in3, 96(out)(pos*1)
-
-    VMOVDQU 128(in)(pos*1), in4
-	VPSRLQ  $4, in4, in4_h
-	VPAND   mask, in4_h, in4_h
-	VPAND   mask, in4, in4
-	VPSHUFB in4_h, high_tbl, in4_h
-	VPSHUFB in4, low_tbl, in4
-	VPXOR   in4, in4_h, in4
-	VMOVDQU in4, 128(out)(pos*1)
-
-    VMOVDQU 160(in)(pos*1), in5
-	VPSRLQ  $4, in5, in5_h
-	VPAND   mask, in5_h, in5_h
-	VPAND   mask, in5, in5
-	VPSHUFB in5_h, high_tbl, in5_h
-	VPSHUFB in5, low_tbl, in5
-	VPXOR   in5, in5_h, in5
-	VMOVDQU in5, 160(out)(pos*1)
-
-    VMOVDQU 192(in)(pos*1), in0
-	VPSRLQ  $4, in0, in0_h
-	VPAND   mask, in0_h, in0_h
-	VPAND   mask, in0, in0
-	VPSHUFB in0_h, high_tbl, in0_h
-	VPSHUFB in0, low_tbl, in0
-	VPXOR   in0, in0_h, in0
-	VMOVDQU in0, 192(out)(pos*1)
-
-    VMOVDQU 224(in)(pos*1), in1
-	VPSRLQ  $4, in1, in1_h
-	VPAND   mask, in1_h, in1_h
-	VPAND   mask, in1, in1
-	VPSHUFB in1_h, high_tbl, in1_h
-	VPSHUFB in1, low_tbl, in1
-	VPXOR   in1, in1_h, in1
-	VMOVDQU in1, 224(out)(pos*1)
-
-	// Advance to the next block; stop once pos reaches len.
-	ADDQ    $256, pos
-	CMPQ    len, pos
-	JNE     loop256b
-	VZEROUPPER
-	RET
-
-// tmp0 = len mod 256: the number of bytes to trim from the end.
-not_aligned:
-    MOVQ    len, tmp0
-    ANDQ    $255, tmp0
-
-// Process the last 32 bytes of the remaining range, then shrink len and the
-// trim counter by 32, until the counter is used up.
-loop32b:
-    VMOVDQU -32(in)(len*1), in0
-	VPSRLQ  $4, in0, in0_h
-	VPAND   mask, in0_h, in0_h
-	VPAND   mask, in0, in0
-	VPSHUFB in0_h, high_tbl, in0_h
-	VPSHUFB in0, low_tbl, in0
-	VPXOR   in0, in0_h, in0
-	VMOVDQU in0, -32(out)(len*1)
-	SUBQ    $32, len
-	SUBQ    $32, tmp0
-	JG      loop32b
-	// Whatever is left is a multiple of 256: run the main loop if non-empty.
-	CMPQ    len, $256
-	JGE     aligned
-	VZEROUPPER
-	RET
-
-// Handle the last 16 bytes with 128-bit registers, shrink len by 16 and, if
-// anything is left, continue with the 256-bit path.
-one16b:
-    VMOVDQU  -16(in)(len*1), in0x
-    VPSRLQ   $4, in0x, in0_hx
-    VPAND    maskx, in0x, in0x
-    VPAND    maskx, in0_hx, in0_hx
-    VPSHUFB  in0_hx, high_tblx, in0_hx
-    VPSHUFB  in0x, low_tblx, in0x
-    VPXOR    in0x, in0_hx, in0x
-	VMOVDQU  in0x, -16(out)(len*1)
-	SUBQ     $16, len
-	CMPQ     len, $0
-	JNE      ymm
-	RET
-
-// func mulVectAddAVX2(tbl, d, p []byte)
-//
-// Same table lookup as mulVectAVX2, but the result is XORed into the output
-// instead of replacing it: for every input byte b,
-//     out ^= tbl[b & 0x0f] ^ tbl[16 + (b >> 4)]
-//
-// Arguments are read from the frame at 0(FP) (tbl base), 24(FP) and 32(FP)
-// (input base and length) and 48(FP) (output base). The output length is
-// never read, so the output must be at least as long as the input.
-//
-// NOTE(review): as in mulVectAVX2, the control flow only works when the
-// length is a non-zero multiple of 16 - confirm that callers guarantee this.
-TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
-    MOVQ         i+24(FP), in
-	MOVQ         o+48(FP), out
-	MOVQ         tbl+0(FP), tmp0
-	// First 16 table bytes serve the low nibble, the next 16 the high nibble.
-	VMOVDQU      (tmp0), low_tblx
-	VMOVDQU      16(tmp0), high_tblx
-	// Build the 0x0f nibble mask: insert DL into byte 0 of X2, then broadcast.
-	MOVB         $0x0f, DX
-	LONG         $0x2069e3c4; WORD $0x00d2   // VPINSRB $0x00, EDX, XMM2, XMM2
-	VPBROADCASTB maskx, maskx
-	MOVQ         in_len+32(FP), len
-	// Length not a multiple of 32: handle the last 16 bytes first.
-	TESTQ        $31, len
-	JNZ          one16b
-
-// Copy the low 128 bits of the tables and the mask into the upper lane.
-ymm:
-    VINSERTI128  $1, low_tblx, low_tbl, low_tbl
-    VINSERTI128  $1, high_tblx, high_tbl, high_tbl
-    VINSERTI128  $1, maskx, mask, mask
-    // Length not a multiple of 256: trim the excess 32 bytes at a time first.
-    TESTQ        $255, len
-    JNZ          not_aligned
-
-aligned:
-    MOVQ         $0, pos
-
-loop256b:
-    // Bytes 0..31 of the block: look up both nibbles, XOR them together,
-    // XOR with the bytes already in the output, store.
-    VMOVDQU (in)(pos*1), in0
-	VPSRLQ  $4, in0, in0_h
-	VPAND   mask, in0_h, in0_h
-	VPAND   mask, in0, in0
-	VPSHUFB in0_h, high_tbl, in0_h
-	VPSHUFB in0, low_tbl, in0
-	VPXOR   in0, in0_h, in0
-	VPXOR   (out)(pos*1), in0, in0
-	VMOVDQU in0, (out)(pos*1)
-
-    // The seven groups below repeat the same steps for bytes 32..255.
-    VMOVDQU 32(in)(pos*1), in1
-	VPSRLQ  $4, in1, in1_h
-	VPAND   mask, in1_h, in1_h
-	VPAND   mask, in1, in1
-	VPSHUFB in1_h, high_tbl, in1_h
-	VPSHUFB in1, low_tbl, in1
-	VPXOR   in1, in1_h, in1
-	VPXOR   32(out)(pos*1), in1, in1
-	VMOVDQU in1, 32(out)(pos*1)
-
-    VMOVDQU 64(in)(pos*1), in2
-	VPSRLQ  $4, in2, in2_h
-	VPAND   mask, in2_h, in2_h
-	VPAND   mask, in2, in2
-	VPSHUFB in2_h, high_tbl, in2_h
-	VPSHUFB in2, low_tbl, in2
-	VPXOR   in2, in2_h, in2
-	VPXOR   64(out)(pos*1), in2, in2
-	VMOVDQU in2, 64(out)(pos*1)
-
-    VMOVDQU 96(in)(pos*1), in3
-	VPSRLQ  $4, in3, in3_h
-	VPAND   mask, in3_h, in3_h
-	VPAND   mask, in3, in3
-	VPSHUFB in3_h, high_tbl, in3_h
-	VPSHUFB in3, low_tbl, in3
-	VPXOR   in3, in3_h, in3
-	VPXOR   96(out)(pos*1), in3, in3
-	VMOVDQU in3, 96(out)(pos*1)
-
-    VMOVDQU 128(in)(pos*1), in4
-	VPSRLQ  $4, in4, in4_h
-	VPAND   mask, in4_h, in4_h
-	VPAND   mask, in4, in4
-	VPSHUFB in4_h, high_tbl, in4_h
-	VPSHUFB in4, low_tbl, in4
-	VPXOR   in4, in4_h, in4
-	VPXOR   128(out)(pos*1), in4, in4
-	VMOVDQU in4, 128(out)(pos*1)
-
-    VMOVDQU 160(in)(pos*1), in5
-	VPSRLQ  $4, in5, in5_h
-	VPAND   mask, in5_h, in5_h
-	VPAND   mask, in5, in5
-	VPSHUFB in5_h, high_tbl, in5_h
-	VPSHUFB in5, low_tbl, in5
-	VPXOR   in5, in5_h, in5
-	VPXOR   160(out)(pos*1), in5, in5
-	VMOVDQU in5, 160(out)(pos*1)
-
-    VMOVDQU 192(in)(pos*1), in0
-	VPSRLQ  $4, in0, in0_h
-	VPAND   mask, in0_h, in0_h
-	VPAND   mask, in0, in0
-	VPSHUFB in0_h, high_tbl, in0_h
-	VPSHUFB in0, low_tbl, in0
-	VPXOR   in0, in0_h, in0
-	VPXOR   192(out)(pos*1), in0, in0
-	VMOVDQU in0, 192(out)(pos*1)
-
-    VMOVDQU 224(in)(pos*1), in1
-	VPSRLQ  $4, in1, in1_h
-	VPAND   mask, in1_h, in1_h
-	VPAND   mask, in1, in1
-	VPSHUFB in1_h, high_tbl, in1_h
-	VPSHUFB in1, low_tbl, in1
-	VPXOR   in1, in1_h, in1
-	VPXOR   224(out)(pos*1), in1, in1
-	VMOVDQU in1, 224(out)(pos*1)
-
-	// Advance to the next block; stop once pos reaches len.
-	ADDQ    $256, pos
-	CMPQ    len, pos
-	JNE     loop256b
-	VZEROUPPER
-	RET
-
-// tmp0 = len mod 256: the number of bytes to trim from the end.
-not_aligned:
-    MOVQ    len, tmp0
-    ANDQ    $255, tmp0
-
-// Process the last 32 bytes of the remaining range, then shrink len and the
-// trim counter by 32, until the counter is used up.
-loop32b:
-    VMOVDQU -32(in)(len*1), in0
-	VPSRLQ  $4, in0, in0_h
-	VPAND   mask, in0_h, in0_h
-	VPAND   mask, in0, in0
-	VPSHUFB in0_h, high_tbl, in0_h
-	VPSHUFB in0, low_tbl, in0
-	VPXOR   in0, in0_h, in0
-	VPXOR   -32(out)(len*1), in0, in0
-	VMOVDQU in0, -32(out)(len*1)
-	SUBQ    $32, len
-	SUBQ    $32, tmp0
-	JG      loop32b
-	// Whatever is left is a multiple of 256: run the main loop if non-empty.
-	CMPQ    len, $256
-	JGE     aligned
-	VZEROUPPER
-	RET
-
-// Handle the last 16 bytes with 128-bit registers, shrink len by 16 and, if
-// anything is left, continue with the 256-bit path.
-one16b:
-    VMOVDQU  -16(in)(len*1), in0x
-    VPSRLQ   $4, in0x, in0_hx
-    VPAND    maskx, in0x, in0x
-    VPAND    maskx, in0_hx, in0_hx
-    VPSHUFB  in0_hx, high_tblx, in0_hx
-    VPSHUFB  in0x, low_tblx, in0x
-    VPXOR    in0x, in0_hx, in0x
-    VPXOR    -16(out)(len*1), in0x, in0x
-	VMOVDQU  in0x, -16(out)(len*1)
-	SUBQ     $16, len
-	CMPQ     len, $0
-	JNE      ymm
-	RET
-
-// func mulVectSSSE3(tbl, d, p []byte)
-//
-// 128-bit version of the table lookup: for every input byte b the routine
-// stores tbl[b & 0x0f] ^ tbl[16 + (b >> 4)] at the same offset of the
-// output, 16 bytes per iteration.
-//
-// Arguments are read from the frame at 0(FP) (tbl base), 24(FP) and 32(FP)
-// (input base and length) and 48(FP) (output base). The length is divided by
-// 16, so trailing bytes beyond a multiple of 16 are not processed.
-// NOTE(review): the loop tests its counter only after the first iteration,
-// so a length below 16 would make the counter wrap - callers must pass at
-// least 16 bytes; confirm.
-TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
-    MOVQ    i+24(FP), in
-	MOVQ    o+48(FP), out
-	MOVQ    tbl+0(FP), tmp0
-	MOVOU   (tmp0), low_tblx
-	MOVOU   16(tmp0), high_tblx
-    // Build the nibble mask: put 15 in the low byte, then let PSHUFB with an
-    // all-zero index register copy byte 0 into all 16 bytes.
-    MOVB    $15, tmp0
-    MOVQ    tmp0, maskx
-    PXOR    tmp0x, tmp0x
-   	PSHUFB  tmp0x, maskx
-	MOVQ    in_len+32(FP), len
-	SHRQ    $4, len            // len now counts 16-byte blocks
-
-loop:
-	MOVOU  (in), in0x
-	MOVOU  in0x, in0_hx
-	PSRLQ  $4, in0_hx          // high nibbles (after the mask below)
-	PAND   maskx, in0x         // low nibbles
-	PAND   maskx, in0_hx
-	// PSHUFB overwrites its destination, so look up in copies of the tables.
-	MOVOU  low_tblx, tmp1x
-	MOVOU  high_tblx, tmp2x
-	PSHUFB in0x, tmp1x
-	PSHUFB in0_hx, tmp2x
-	PXOR   tmp1x, tmp2x
-	MOVOU  tmp2x, (out)
-	ADDQ   $16, in
-	ADDQ   $16, out
-	SUBQ   $1, len
-	JNZ    loop
-	RET
-
-// func mulVectAddSSSE3(tbl, d, p []byte)
-//
-// Same lookup as mulVectSSSE3, but the result is XORed into the output:
-// for every input byte b, out ^= tbl[b & 0x0f] ^ tbl[16 + (b >> 4)],
-// 16 bytes per iteration.
-//
-// Arguments are read from the frame at 0(FP) (tbl base), 24(FP) and 32(FP)
-// (input base and length) and 48(FP) (output base). The length is divided by
-// 16, so trailing bytes beyond a multiple of 16 are not processed.
-// NOTE(review): the loop tests its counter only after the first iteration,
-// so a length below 16 would make the counter wrap - callers must pass at
-// least 16 bytes; confirm.
-TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
-    MOVQ    i+24(FP), in
-	MOVQ    o+48(FP), out
-	MOVQ    tbl+0(FP), tmp0
-	MOVOU   (tmp0), low_tblx
-	MOVOU   16(tmp0), high_tblx
-    // Build the nibble mask: put 15 in the low byte, then let PSHUFB with an
-    // all-zero index register copy byte 0 into all 16 bytes.
-    MOVB    $15, tmp0
-    MOVQ    tmp0, maskx
-    PXOR    tmp0x, tmp0x
-   	PSHUFB  tmp0x, maskx
-	MOVQ    in_len+32(FP), len
-	SHRQ    $4, len            // len now counts 16-byte blocks
-
-loop:
-	MOVOU  (in), in0x
-	MOVOU  in0x, in0_hx
-	PSRLQ  $4, in0_hx          // high nibbles (after the mask below)
-	PAND   maskx, in0x         // low nibbles
-	PAND   maskx, in0_hx
-	// PSHUFB overwrites its destination, so look up in copies of the tables.
-	MOVOU  low_tblx, tmp1x
-	MOVOU  high_tblx, tmp2x
-	PSHUFB in0x, tmp1x
-	PSHUFB in0_hx, tmp2x
-	PXOR   tmp1x, tmp2x
-	// Fold the product into the bytes already present in the output.
-	MOVOU  (out), tmp3x
-	PXOR   tmp3x, tmp2x
-	MOVOU  tmp2x, (out)
-	ADDQ   $16, in
-	ADDQ   $16, out
-	SUBQ   $1, len
-	JNZ    loop
-	RET
-
-// func copy32B(dst, src []byte)
-//
-// Copies exactly 32 bytes from the start of src to the start of dst with two
-// unaligned 16-byte loads and stores. Neither slice length is read, so both
-// slices must hold at least 32 bytes.
-TEXT ·copy32B(SB), NOSPLIT, $0
-    MOVQ dst+0(FP), SI
-    MOVQ src+24(FP), DX
-    MOVOU (DX), X0
-    MOVOU 16(DX), X1
-    MOVOU X0, (SI)
-    MOVOU X1, 16(SI)
-    RET
-	
--- a/vendor/github.com/templexxx/reedsolomon/rs_other.go
+++ b/vendor/github.com/templexxx/reedsolomon/rs_other.go
@@ -1,8 +0,0 @@
-// +build !amd64
-
-package reedsolomon
-
-// newRS returns the encoder used on non-amd64 builds: an encBase holding the
-// full encode matrix em and, as its generator, the part of em that follows
-// the first d*d entries.
-func newRS(d, p int, em matrix) (enc Encoder) {
-	return &encBase{
-		data:   d,
-		parity: p,
-		encode: em,
-		gen:    em[d*d:],
-	}
-}
--- a/vendor/github.com/templexxx/reedsolomon/tbl.go
+++ b/vendor/github.com/templexxx/reedsolomon/tbl.go