vendor: add kcp-go package

2026-03-17 23:39:16 +08:00 · 2017-06-04 20:07:03 +08:00
parent 80ba931326
commit 84341b7fcc
139 changed files with 17429 additions and 0 deletions
--- a/vendor/github.com/klauspost/cpuid/.gitignore
+++ b/vendor/github.com/klauspost/cpuid/.gitignore
@@ -0,0 +1,24 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
--- a/vendor/github.com/klauspost/cpuid/.travis.yml
+++ b/vendor/github.com/klauspost/cpuid/.travis.yml
@@ -0,0 +1,8 @@
+language: go
+
+go:
+  - 1.3
+  - 1.4
+  - 1.5
+  - 1.6
+  - tip
--- a/vendor/github.com/klauspost/cpuid/LICENSE
+++ b/vendor/github.com/klauspost/cpuid/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/vendor/github.com/klauspost/cpuid/README.md
+++ b/vendor/github.com/klauspost/cpuid/README.md
@@ -0,0 +1,145 @@
+# cpuid
+Package cpuid provides information about the CPU running the current program.
+
+CPU features are detected on startup, and kept for fast access through the life of the application.
+Currently x86 / x64 (AMD64) is supported, and no external C (cgo) code is used, which should make the library very easy to use.
+
+You can access the CPU information by accessing the shared CPU variable of the cpuid library.
+
+Package home: https://github.com/klauspost/cpuid
+
+[![GoDoc][1]][2] [![Build Status][3]][4]
+
+[1]: https://godoc.org/github.com/klauspost/cpuid?status.svg
+[2]: https://godoc.org/github.com/klauspost/cpuid
+[3]: https://travis-ci.org/klauspost/cpuid.svg
+[4]: https://travis-ci.org/klauspost/cpuid
+
+# features
+## CPU Instructions
+*  **CMOV** (i686 CMOV)
+*  **NX** (NX (No-Execute) bit)
+*  **AMD3DNOW** (AMD 3DNOW)
+*  **AMD3DNOWEXT** (AMD 3DNowExt)
+*  **MMX** (standard MMX)
+*  **MMXEXT** (SSE integer functions or AMD MMX ext)
+*  **SSE** (SSE functions)
+*  **SSE2** (P4 SSE functions)
+*  **SSE3** (Prescott SSE3 functions)
+*  **SSSE3** (Conroe SSSE3 functions)
+*  **SSE4** (Penryn SSE4.1 functions)
+*  **SSE4A** (AMD Barcelona microarchitecture SSE4a instructions)
+*  **SSE42** (Nehalem SSE4.2 functions)
+*  **AVX** (AVX functions)
+*  **AVX2** (AVX2 functions)
+*  **FMA3** (Intel FMA 3)
+*  **FMA4** (Bulldozer FMA4 functions)
+*  **XOP** (Bulldozer XOP functions)
+*  **F16C** (Half-precision floating-point conversion)
+*  **BMI1** (Bit Manipulation Instruction Set 1)
+*  **BMI2** (Bit Manipulation Instruction Set 2)
+*  **TBM** (AMD Trailing Bit Manipulation)
+*  **LZCNT** (LZCNT instruction)
+*  **POPCNT** (POPCNT instruction)
+*  **AESNI** (Advanced Encryption Standard New Instructions)
+*  **CLMUL** (Carry-less Multiplication)
+*  **HTT** (Hyperthreading (enabled))
+*  **HLE** (Hardware Lock Elision)
+*  **RTM** (Restricted Transactional Memory)
+*  **RDRAND** (RDRAND instruction is available)
+*  **RDSEED** (RDSEED instruction is available)
+*  **ADX** (Intel ADX (Multi-Precision Add-Carry Instruction Extensions))
+*  **SHA** (Intel SHA Extensions)
+*  **AVX512F** (AVX-512 Foundation)
+*  **AVX512DQ** (AVX-512 Doubleword and Quadword Instructions)
+*  **AVX512IFMA** (AVX-512 Integer Fused Multiply-Add Instructions)
+*  **AVX512PF** (AVX-512 Prefetch Instructions)
+*  **AVX512ER** (AVX-512 Exponential and Reciprocal Instructions)
+*  **AVX512CD** (AVX-512 Conflict Detection Instructions)
+*  **AVX512BW** (AVX-512 Byte and Word Instructions)
+*  **AVX512VL** (AVX-512 Vector Length Extensions)
+*  **AVX512VBMI** (AVX-512 Vector Bit Manipulation Instructions)
+*  **MPX** (Intel MPX (Memory Protection Extensions))
+*  **ERMS** (Enhanced REP MOVSB/STOSB)
+*  **RDTSCP** (RDTSCP Instruction)
+*  **CX16** (CMPXCHG16B Instruction)
+*  **SGX** (Software Guard Extensions, with activation details)
+
+## Performance
+*  **RDTSCP()** Returns current cycle count. Can be used for benchmarking.
+*  **SSE2SLOW** (SSE2 is supported, but usually not faster)
+*  **SSE3SLOW** (SSE3 is supported, but usually not faster)
+*  **ATOM** (Atom processor, some SSSE3 instructions are slower)
+*  **Cache line** (Probable size of a cache line).
+*  **L1, L2, L3 Cache size** on newer Intel/AMD CPUs.
+
+## Cpu Vendor/VM
+* **Intel**
+* **AMD**
+* **VIA**
+* **Transmeta**
+* **NSC**
+* **KVM**  (Kernel-based Virtual Machine)
+* **MSVM** (Microsoft Hyper-V or Windows Virtual PC)
+* **VMware**
+* **XenHVM**
+
+# installing
+
+```go get github.com/klauspost/cpuid```
+
+# example
+
+```Go
+package main
+
+import (
+	"fmt"
+	"github.com/klauspost/cpuid"
+)
+
+func main() {
+	// Print basic CPU information:
+	fmt.Println("Name:", cpuid.CPU.BrandName)
+	fmt.Println("PhysicalCores:", cpuid.CPU.PhysicalCores)
+	fmt.Println("ThreadsPerCore:", cpuid.CPU.ThreadsPerCore)
+	fmt.Println("LogicalCores:", cpuid.CPU.LogicalCores)
+	fmt.Println("Family", cpuid.CPU.Family, "Model:", cpuid.CPU.Model)
+	fmt.Println("Features:", cpuid.CPU.Features)
+	fmt.Println("Cacheline bytes:", cpuid.CPU.CacheLine)
+	fmt.Println("L1 Data Cache:", cpuid.CPU.Cache.L1D, "bytes")
+	fmt.Println("L1 Instruction Cache:", cpuid.CPU.Cache.L1I, "bytes")
+	fmt.Println("L2 Cache:", cpuid.CPU.Cache.L2, "bytes")
+	fmt.Println("L3 Cache:", cpuid.CPU.Cache.L3, "bytes")
+
+	// Test if we have a specific feature:
+	if cpuid.CPU.SSE() {
+		fmt.Println("We have Streaming SIMD Extensions")
+	}
+}
+```
+
+Sample output:
+```
+>go run main.go
+Name: Intel(R) Core(TM) i5-2540M CPU @ 2.60GHz
+PhysicalCores: 2
+ThreadsPerCore: 2
+LogicalCores: 4
+Family 6 Model: 42
+Features: CMOV,MMX,MMXEXT,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AESNI,CLMUL
+Cacheline bytes: 64
+We have Streaming SIMD Extensions
+```
+
+# private package
+
+In the "private" folder you can find an autogenerated version of the library you can include in your own packages.
+
+For this purpose all exports are removed, and functions and constants are lowercased.
+
+This is not a recommended way of using the library, but provided for convenience, if it is difficult for you to use external packages.
+
+# license
+
+This code is published under an MIT license. See LICENSE file for more information.
--- a/vendor/github.com/klauspost/cpuid/cpuid.go
+++ b/vendor/github.com/klauspost/cpuid/cpuid.go
--- a/vendor/github.com/klauspost/cpuid/cpuid_386.s
+++ b/vendor/github.com/klauspost/cpuid/cpuid_386.s
@@ -0,0 +1,42 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuid executes CPUID with AX loaded from op and CX cleared to zero,
+// then copies AX, BX, CX and DX into the four result slots that follow the
+// 4-byte argument on the frame.
+TEXT ·asmCpuid(SB), 7, $0
+	XORL CX, CX
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+4(FP)
+	MOVL BX, ebx+8(FP)
+	MOVL CX, ecx+12(FP)
+	MOVL DX, edx+16(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuidex executes CPUID with AX loaded from op and CX loaded from op2,
+// then copies AX, BX, CX and DX into the four result slots that follow the
+// two 4-byte arguments on the frame.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+//
+// asmXgetbv loads index into CX, issues XGETBV (emitted as raw opcode
+// bytes) and returns the resulting AX and DX values.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+	MOVL AX, eax+4(FP)
+	MOVL DX, edx+8(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+//
+// asmRdtscpAsm issues RDTSCP (emitted as raw opcode bytes) and stores AX,
+// BX, CX and DX into the four result slots.
+// NOTE(review): RDTSCP is documented to write only AX, DX and CX, so the
+// ebx result presumably carries whatever BX held on entry - confirm against
+// the Intel SDM.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP)
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
+++ b/vendor/github.com/klauspost/cpuid/cpuid_amd64.s
@@ -0,0 +1,42 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+//+build amd64,!gccgo
+
+// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuid executes CPUID with AX loaded from op and CX cleared to zero,
+// then copies AX, BX, CX and DX into the result slots. The results start at
+// offset 8 even though the single argument occupies only 4 bytes.
+TEXT ·asmCpuid(SB), 7, $0
+	XORQ CX, CX
+	MOVL op+0(FP), AX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+//
+// asmCpuidex executes CPUID with AX loaded from op and CX loaded from op2,
+// then copies AX, BX, CX and DX into the four result slots starting at
+// offset 8.
+TEXT ·asmCpuidex(SB), 7, $0
+	MOVL op+0(FP), AX
+	MOVL op2+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func asmXgetbv(index uint32) (eax, edx uint32)
+//
+// asmXgetbv loads index into CX, issues XGETBV (emitted as raw opcode
+// bytes) and returns the resulting AX and DX values. The results start at
+// offset 8 even though the single argument occupies only 4 bytes.
+TEXT ·asmXgetbv(SB), 7, $0
+	MOVL index+0(FP), CX
+	BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+	MOVL AX, eax+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
+
+// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+//
+// asmRdtscpAsm issues RDTSCP (emitted as raw opcode bytes) and stores AX,
+// BX, CX and DX into the four result slots.
+// NOTE(review): RDTSCP is documented to write only AX, DX and CX, so the
+// ebx result presumably carries whatever BX held on entry - confirm against
+// the Intel SDM.
+TEXT ·asmRdtscpAsm(SB), 7, $0
+	BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
+	MOVL AX, eax+0(FP)
+	MOVL BX, ebx+4(FP)
+	MOVL CX, ecx+8(FP)
+	MOVL DX, edx+12(FP)
+	RET
--- a/vendor/github.com/klauspost/cpuid/detect_intel.go
+++ b/vendor/github.com/klauspost/cpuid/detect_intel.go
@@ -0,0 +1,17 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo amd64,!gccgo
+
+package cpuid
+
+// The four functions below are declared without bodies; their
+// implementations are the TEXT symbols of the same names in cpuid_386.s and
+// cpuid_amd64.s.
+
+// asmCpuid runs CPUID for op and returns the four result registers.
+func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
+
+// asmCpuidex runs CPUID for op with op2 in CX and returns the four result
+// registers.
+func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+
+// asmXgetbv runs XGETBV with index in CX and returns AX and DX.
+func asmXgetbv(index uint32) (eax, edx uint32)
+
+// asmRdtscpAsm runs RDTSCP and returns the AX, BX, CX and DX registers.
+func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
+
+// initCPU points the package-level probe hooks at the assembly-backed
+// implementations declared in this file.
+func initCPU() {
+	cpuid, cpuidex = asmCpuid, asmCpuidex
+	xgetbv, rdtscpAsm = asmXgetbv, asmRdtscpAsm
+}
--- a/vendor/github.com/klauspost/cpuid/detect_ref.go
+++ b/vendor/github.com/klauspost/cpuid/detect_ref.go
@@ -0,0 +1,23 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build !amd64,!386 gccgo
+
+package cpuid
+
+// initCPU installs stub probe hooks for builds without the assembly
+// implementations: every hook reports all-zero registers.
+func initCPU() {
+	// Named results default to zero, so a bare return yields all zeros.
+	cpuid = func(op uint32) (eax, ebx, ecx, edx uint32) { return }
+	cpuidex = func(op, op2 uint32) (eax, ebx, ecx, edx uint32) { return }
+	xgetbv = func(index uint32) (eax, edx uint32) { return }
+	rdtscpAsm = func() (eax, ebx, ecx, edx uint32) { return }
+}
--- a/vendor/github.com/klauspost/cpuid/generate.go
+++ b/vendor/github.com/klauspost/cpuid/generate.go
@@ -0,0 +1,3 @@
+package cpuid
+
+//go:generate go run private-gen.go
--- a/vendor/github.com/klauspost/cpuid/private-gen.go
+++ b/vendor/github.com/klauspost/cpuid/private-gen.go
@@ -0,0 +1,476 @@
+// +build ignore
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"go/ast"
+	"go/parser"
+	"go/printer"
+	"go/token"
+	"io"
+	"io/ioutil"
+	"log"
+	"os"
+	"reflect"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+// inFiles lists the Go sources that main parses, rewrites and writes into
+// the output package directory.
+var inFiles = []string{"cpuid.go", "cpuid_test.go"}
+
+// copyFiles lists the files main copies unchanged into the output package
+// directory (names lacking a "cpuid" prefix get "cpuid_" prepended).
+var copyFiles = []string{"cpuid_amd64.s", "cpuid_386.s", "detect_ref.go", "detect_intel.go"}
+
+// fileSet is the shared position table used for parsing, comment mapping
+// and printing.
+var fileSet = token.NewFileSet()
+
+// reWrites are the fixed renames applied to every input file before the
+// exported identifiers are collected.
+var reWrites = []rewrite{
+	initRewrite("CPUInfo -> cpuInfo"),
+	initRewrite("Vendor -> vendor"),
+	initRewrite("Flags -> flags"),
+	initRewrite("Detect -> detect"),
+	initRewrite("CPU -> cpu"),
+}
+
+// excludeNames holds lowercased identifier names that main never turns into
+// a rename rule.
+var excludeNames = map[string]bool{"string": true, "join": true, "trim": true,
+	// cpuid_test.go
+	"t": true, "println": true, "logf": true, "log": true, "fatalf": true, "fatal": true,
+}
+
+// excludePrefixes holds lowercased prefixes; exported identifiers whose
+// lowercased name starts with one of them are skipped by main.
+var excludePrefixes = []string{"test", "benchmark"}
+
+// main generates the "private" copy of the package.
+//
+// For every file in inFiles it parses the source, applies the fixed
+// reWrites, derives a lowercase rename for each exported identifier it
+// finds (skipping excludeNames and excludePrefixes), applies those renames,
+// strips everything before the package clause and writes the result into
+// the output directory. The files in copyFiles are then copied verbatim.
+// Any failure terminates the program via log.Fatalf.
+func main() {
+	Package := "private"
+	parserMode := parser.ParseComments
+	// exported is shared across input files, so renames discovered in an
+	// earlier file are also applied to later ones.
+	exported := make(map[string]rewrite)
+	for _, file := range inFiles {
+		// ReadFile opens, reads and closes the input in one step, so no
+		// file handle stays open across iterations.
+		src, err := ioutil.ReadFile(file)
+		if err != nil {
+			log.Fatalf("reading input: %s", err)
+		}
+
+		astfile, err := parser.ParseFile(fileSet, file, src, parserMode)
+		if err != nil {
+			log.Fatalf("parsing input: %s", err)
+		}
+
+		for _, rw := range reWrites {
+			astfile = rw(astfile)
+		}
+
+		// Walk the AST and register a lowercase rename for every exported
+		// identifier that is not excluded by name or prefix.
+		ast.Inspect(astfile, func(n ast.Node) bool {
+			x, ok := n.(*ast.Ident)
+			if !ok || !x.IsExported() {
+				return true
+			}
+			t := strings.ToLower(x.Name)
+			for _, pre := range excludePrefixes {
+				if strings.HasPrefix(t, pre) {
+					return true
+				}
+			}
+			if !excludeNames[t] {
+				exported[x.Name] = initRewrite(x.Name + " -> " + t)
+			}
+			return true
+		})
+
+		for _, rw := range exported {
+			astfile = rw(astfile)
+		}
+
+		var buf bytes.Buffer
+		if err := printer.Fprint(&buf, fileSet, astfile); err != nil {
+			log.Fatalf("printing output: %s", err)
+		}
+
+		// Remove package documentation and insert information
+		s := buf.String()
+		ind := strings.Index(s, "\npackage cpuid")
+		if ind < 0 {
+			// Slicing with -1 would panic; fail with a readable message.
+			log.Fatalf("package clause not found in %s", file)
+		}
+		s = s[ind:]
+		s = "// Generated, DO NOT EDIT,\n" +
+			"// but copy it to your own project and rename the package.\n" +
+			"// See more at http://github.com/klauspost/cpuid\n" +
+			s
+
+		outputName := Package + string(os.PathSeparator) + file
+
+		err = ioutil.WriteFile(outputName, []byte(s), 0644)
+		if err != nil {
+			log.Fatalf("writing output: %s", err)
+		}
+		log.Println("Generated", outputName)
+	}
+
+	for _, file := range copyFiles {
+		// Copied files always end up with a "cpuid" name prefix.
+		dst := Package + string(os.PathSeparator) + "cpuid_" + file
+		if strings.HasPrefix(file, "cpuid") {
+			dst = Package + string(os.PathSeparator) + file
+		}
+		if err := copyFile(file, dst); err != nil {
+			log.Fatalf("copying file: %s", err)
+		}
+		log.Println("Copied", dst)
+	}
+}
+
+// copyFile copies the regular file src to dst.
+//
+// It fails if src cannot be stat'ed or is not a regular file, if dst exists
+// but is not a regular file, or if stat'ing dst fails for any reason other
+// than dst not existing. When src and dst already refer to the same file it
+// returns nil without copying; otherwise the contents are copied with
+// copyFileContents.
+func copyFile(src, dst string) (err error) {
+	srcInfo, err := os.Stat(src)
+	if err != nil {
+		return err
+	}
+	if !srcInfo.Mode().IsRegular() {
+		// Directories, symlinks, devices and the like cannot be copied.
+		return fmt.Errorf("CopyFile: non-regular source file %s (%q)", srcInfo.Name(), srcInfo.Mode().String())
+	}
+
+	dstInfo, err := os.Stat(dst)
+	switch {
+	case err == nil:
+		if !dstInfo.Mode().IsRegular() {
+			return fmt.Errorf("CopyFile: non-regular destination file %s (%q)", dstInfo.Name(), dstInfo.Mode().String())
+		}
+		if os.SameFile(srcInfo, dstInfo) {
+			return nil
+		}
+	case !os.IsNotExist(err):
+		return err
+	}
+
+	return copyFileContents(src, dst)
+}
+
+// copyFileContents writes the contents of the file named src into the file
+// named dst. dst is created when missing and truncated when present. The
+// data is synced before returning, and an error from closing dst is
+// reported when no earlier error occurred.
+func copyFileContents(src, dst string) (err error) {
+	source, err := os.Open(src)
+	if err != nil {
+		return err
+	}
+	defer source.Close()
+
+	target, err := os.Create(dst)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Surface the close error only if nothing failed before it.
+		if closeErr := target.Close(); err == nil {
+			err = closeErr
+		}
+	}()
+
+	_, err = io.Copy(target, source)
+	if err == nil {
+		err = target.Sync()
+	}
+	return err
+}
+
+// rewrite transforms a parsed file and returns the resulting file.
+type rewrite func(*ast.File) *ast.File
+
+// Mostly copied from gofmt
+func initRewrite(rewriteRule string) rewrite {
+	f := strings.Split(rewriteRule, "->")
+	if len(f) != 2 {
+		fmt.Fprintf(os.Stderr, "rewrite rule must be of the form 'pattern -> replacement'\n")
+		os.Exit(2)
+	}
+	pattern := parseExpr(f[0], "pattern")
+	replace := parseExpr(f[1], "replacement")
+	return func(p *ast.File) *ast.File { return rewriteFile(pattern, replace, p) }
+}
+
+// parseExpr parses s as a Go expression and returns it. what names the
+// role of s ("pattern" or "replacement") in the error message. On a parse
+// error the message is written to stderr and the program exits with
+// status 2.
+func parseExpr(s, what string) ast.Expr {
+	expr, err := parser.ParseExpr(s)
+	if err == nil {
+		return expr
+	}
+	fmt.Fprintf(os.Stderr, "parsing %s %s at %s\n", what, s, err)
+	os.Exit(2)
+	return expr
+}
+
+// Keep this function for debugging.
+/*
+func dump(msg string, val reflect.Value) {
+	fmt.Printf("%s:\n", msg)
+	ast.Print(fileSet, val.Interface())
+	fmt.Println()
+}
+*/
+
+// rewriteFile applies the rewrite rule 'pattern -> replace' to an entire file.
+//
+// The file is traversed via reflection with apply; every value that matches
+// pattern is replaced by a copy of replace positioned at the matched node.
+// The comment list of the returned file is rebuilt from a comment map taken
+// before the rewrite.
+func rewriteFile(pattern, replace ast.Expr, p *ast.File) *ast.File {
+	cmap := ast.NewCommentMap(fileSet, p, p.Comments)
+	// m records wildcard bindings for the match currently being attempted.
+	m := make(map[string]reflect.Value)
+	pat := reflect.ValueOf(pattern)
+	repl := reflect.ValueOf(replace)
+
+	var rewriteVal func(val reflect.Value) reflect.Value
+	rewriteVal = func(val reflect.Value) reflect.Value {
+		// don't bother if val is invalid to start with
+		if !val.IsValid() {
+			return reflect.Value{}
+		}
+		// m is emptied before descending into the children of val.
+		for k := range m {
+			delete(m, k)
+		}
+		// Children are rewritten first, then val itself is tested.
+		val = apply(rewriteVal, val)
+		if match(m, pat, val) {
+			val = subst(m, repl, reflect.ValueOf(val.Interface().(ast.Node).Pos()))
+		}
+		return val
+	}
+
+	r := apply(rewriteVal, reflect.ValueOf(p)).Interface().(*ast.File)
+	r.Comments = cmap.Filter(r).Comments() // recreate comments list
+	return r
+}
+
+// set assigns y to x through reflection without letting an impossible
+// assignment crash the caller. Nothing happens when x is not settable or y
+// is invalid. A panic raised by x.Set whose string message mentions
+// "type mismatch" or "not assignable" is swallowed; any other panic is
+// re-raised.
+func set(x, y reflect.Value) {
+	if !(x.CanSet() && y.IsValid()) {
+		return
+	}
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+		msg, isString := r.(string)
+		if isString && (strings.Contains(msg, "type mismatch") || strings.Contains(msg, "not assignable")) {
+			// y does not fit into x: skip this rewrite.
+			return
+		}
+		panic(r)
+	}()
+	x.Set(y)
+}
+
+// Values/types for special cases.
+var (
+	// Typed nil values that apply substitutes for *ast.Object and
+	// *ast.Scope fields.
+	objectPtrNil = reflect.ValueOf((*ast.Object)(nil))
+	scopePtrNil  = reflect.ValueOf((*ast.Scope)(nil))
+
+	// Reflected types that apply, match and subst compare against to
+	// recognise the special cases.
+	identType     = reflect.TypeOf((*ast.Ident)(nil))
+	objectPtrType = reflect.TypeOf((*ast.Object)(nil))
+	positionType  = reflect.TypeOf(token.NoPos)
+	callExprType  = reflect.TypeOf((*ast.CallExpr)(nil))
+	scopePtrType  = reflect.TypeOf((*ast.Scope)(nil))
+)
+
+// apply replaces each AST field x in val with f(x), returning val.
+// To avoid extra conversions, f operates on the reflect.Value form.
+//
+// An invalid val yields the zero reflect.Value. Values of type *ast.Object
+// and *ast.Scope are not traversed; a typed nil is returned in their place.
+// Replacement goes through set, so elements that cannot be assigned are
+// left unchanged.
+func apply(f func(reflect.Value) reflect.Value, val reflect.Value) reflect.Value {
+	if !val.IsValid() {
+		return reflect.Value{}
+	}
+
+	// *ast.Objects introduce cycles and are likely incorrect after
+	// rewrite; don't follow them but replace with nil instead
+	if val.Type() == objectPtrType {
+		return objectPtrNil
+	}
+
+	// similarly for scopes: they are likely incorrect after a rewrite;
+	// replace them with nil
+	if val.Type() == scopePtrType {
+		return scopePtrNil
+	}
+
+	// Pointers are dereferenced once; only slices, structs and interfaces
+	// have children to visit.
+	switch v := reflect.Indirect(val); v.Kind() {
+	case reflect.Slice:
+		for i := 0; i < v.Len(); i++ {
+			e := v.Index(i)
+			set(e, f(e))
+		}
+	case reflect.Struct:
+		for i := 0; i < v.NumField(); i++ {
+			e := v.Field(i)
+			set(e, f(e))
+		}
+	case reflect.Interface:
+		e := v.Elem()
+		set(v, f(e))
+	}
+	return val
+}
+
+// isWildcard reports whether s consists of exactly one rune and that rune
+// is a lowercase letter.
+func isWildcard(s string) bool {
+	r, width := utf8.DecodeRuneInString(s)
+	if width != len(s) {
+		return false
+	}
+	return unicode.IsLower(r)
+}
+
+// match returns true if pattern matches val,
+// recording wildcard submatches in m.
+// If m == nil, match checks whether pattern == val.
+//
+// Identifiers are compared by name only, *ast.Object values and token
+// positions always match, and call expressions must agree on whether an
+// ellipsis is present. Everything else is compared structurally through
+// reflection.
+func match(m map[string]reflect.Value, pattern, val reflect.Value) bool {
+	// Wildcard matches any expression.  If it appears multiple
+	// times in the pattern, it must match the same expression
+	// each time.
+	if m != nil && pattern.IsValid() && pattern.Type() == identType {
+		name := pattern.Interface().(*ast.Ident).Name
+		if isWildcard(name) && val.IsValid() {
+			// wildcards only match valid (non-nil) expressions.
+			if _, ok := val.Interface().(ast.Expr); ok && !val.IsNil() {
+				if old, ok := m[name]; ok {
+					// Already bound: the new value must equal the old
+					// binding exactly (m == nil disables wildcards).
+					return match(nil, old, val)
+				}
+				m[name] = val
+				return true
+			}
+		}
+	}
+
+	// Otherwise, pattern and val must match recursively.
+	if !pattern.IsValid() || !val.IsValid() {
+		return !pattern.IsValid() && !val.IsValid()
+	}
+	if pattern.Type() != val.Type() {
+		return false
+	}
+
+	// Special cases.
+	switch pattern.Type() {
+	case identType:
+		// For identifiers, only the names need to match
+		// (and none of the other *ast.Object information).
+		// This is a common case, handle it all here instead
+		// of recursing down any further via reflection.
+		p := pattern.Interface().(*ast.Ident)
+		v := val.Interface().(*ast.Ident)
+		return p == nil && v == nil || p != nil && v != nil && p.Name == v.Name
+	case objectPtrType, positionType:
+		// object pointers and token positions always match
+		return true
+	case callExprType:
+		// For calls, the Ellipsis fields (token.Position) must
+		// match since that is how f(x) and f(x...) are different.
+		// Check them here but fall through for the remaining fields.
+		p := pattern.Interface().(*ast.CallExpr)
+		v := val.Interface().(*ast.CallExpr)
+		if p.Ellipsis.IsValid() != v.Ellipsis.IsValid() {
+			return false
+		}
+	}
+
+	// Dereference pointers; two nil pointers match, one nil does not.
+	p := reflect.Indirect(pattern)
+	v := reflect.Indirect(val)
+	if !p.IsValid() || !v.IsValid() {
+		return !p.IsValid() && !v.IsValid()
+	}
+
+	switch p.Kind() {
+	case reflect.Slice:
+		if p.Len() != v.Len() {
+			return false
+		}
+		for i := 0; i < p.Len(); i++ {
+			if !match(m, p.Index(i), v.Index(i)) {
+				return false
+			}
+		}
+		return true
+
+	case reflect.Struct:
+		for i := 0; i < p.NumField(); i++ {
+			if !match(m, p.Field(i), v.Field(i)) {
+				return false
+			}
+		}
+		return true
+
+	case reflect.Interface:
+		return match(m, p.Elem(), v.Elem())
+	}
+
+	// Handle token integers, etc.
+	return p.Interface() == v.Interface()
+}
+
+// subst returns a copy of pattern with values from m substituted in place
+// of wildcards and pos used as the position of tokens from the pattern.
+// if m == nil, subst returns a copy of pattern and doesn't change the line
+// number information.
+//
+// Slices, structs, pointers and interfaces are copied recursively; any
+// other kind of value is returned as-is. An invalid pattern yields the
+// zero reflect.Value.
+func subst(m map[string]reflect.Value, pattern reflect.Value, pos reflect.Value) reflect.Value {
+	if !pattern.IsValid() {
+		return reflect.Value{}
+	}
+
+	// Wildcard gets replaced with map value.
+	if m != nil && pattern.Type() == identType {
+		name := pattern.Interface().(*ast.Ident).Name
+		if isWildcard(name) {
+			if old, ok := m[name]; ok {
+				// Copy the bound value without wildcard expansion and
+				// without touching its positions.
+				return subst(nil, old, reflect.Value{})
+			}
+		}
+	}
+
+	if pos.IsValid() && pattern.Type() == positionType {
+		// use new position only if old position was valid in the first place
+		if old := pattern.Interface().(token.Pos); !old.IsValid() {
+			return pattern
+		}
+		return pos
+	}
+
+	// Otherwise copy.
+	switch p := pattern; p.Kind() {
+	case reflect.Slice:
+		v := reflect.MakeSlice(p.Type(), p.Len(), p.Len())
+		for i := 0; i < p.Len(); i++ {
+			v.Index(i).Set(subst(m, p.Index(i), pos))
+		}
+		return v
+
+	case reflect.Struct:
+		v := reflect.New(p.Type()).Elem()
+		for i := 0; i < p.NumField(); i++ {
+			v.Field(i).Set(subst(m, p.Field(i), pos))
+		}
+		return v
+
+	case reflect.Ptr:
+		// A nil pointer stays nil; otherwise point at a copy of the target.
+		v := reflect.New(p.Type()).Elem()
+		if elem := p.Elem(); elem.IsValid() {
+			v.Set(subst(m, elem, pos).Addr())
+		}
+		return v
+
+	case reflect.Interface:
+		// An empty interface stays empty; otherwise wrap a copy of its value.
+		v := reflect.New(p.Type()).Elem()
+		if elem := p.Elem(); elem.IsValid() {
+			v.Set(subst(m, elem, pos))
+		}
+		return v
+	}
+
+	return pattern
+}
--- a/vendor/github.com/klauspost/reedsolomon/.gitignore
+++ b/vendor/github.com/klauspost/reedsolomon/.gitignore
@@ -0,0 +1,24 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
--- a/vendor/github.com/klauspost/reedsolomon/.travis.yml
+++ b/vendor/github.com/klauspost/reedsolomon/.travis.yml
@@ -0,0 +1,33 @@
+language: go
+
+sudo: false
+
+os:
+  - linux
+  - osx 
+
+go:
+  - 1.5
+  - 1.6
+  - 1.7
+  - 1.8
+  - master
+
+install:
+ - go get ./...
+
+script: 
+ - go vet ./...
+ - go test -v -cpu=1,2,4 .
+ - go test -v -cpu=1,2,4 -short -race .
+ - go test -tags=noasm -v -cpu=1,2,4 -short -race .
+ - go build examples/simple-decoder.go
+ - go build examples/simple-encoder.go
+ - go build examples/stream-decoder.go
+ - go build examples/stream-encoder.go
+ - diff <(gofmt -d .) <("")
+
+matrix:
+  allow_failures:
+    - go: 'master'
+  fast_finish: true
--- a/vendor/github.com/klauspost/reedsolomon/LICENSE
+++ b/vendor/github.com/klauspost/reedsolomon/LICENSE
@@ -0,0 +1,23 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+Copyright (c) 2015 Backblaze
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@@ -0,0 +1,216 @@
+# Reed-Solomon
+[![GoDoc][1]][2] [![Build Status][3]][4]
+
+[1]: https://godoc.org/github.com/klauspost/reedsolomon?status.svg
+[2]: https://godoc.org/github.com/klauspost/reedsolomon
+[3]: https://travis-ci.org/klauspost/reedsolomon.svg?branch=master
+[4]: https://travis-ci.org/klauspost/reedsolomon
+
+Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.
+
+This is a golang port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
+
+For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).
+
+Package home: https://github.com/klauspost/reedsolomon
+
+Godoc: https://godoc.org/github.com/klauspost/reedsolomon
+
+# Installation
+To get the package use the standard:
+```bash
+go get github.com/klauspost/reedsolomon
+```
+
+# Usage
+
+This section assumes you know the basics of Reed-Solomon encoding. A good start is this [Backblaze blog post](https://www.backblaze.com/blog/reed-solomon/).
+
+This package performs the calculation of the parity sets. The usage is therefore relatively simple.
+
+First of all, you need to choose your distribution of data and parity shards. A 'good' distribution is very subjective, and will depend a lot on your usage scenario. A good starting point is above 5 and below 257 data shards (the maximum supported number), and the number of parity shards to be 2 or above, and below the number of data shards.
+
+To create an encoder with 10 data shards (where your data goes) and 3 parity shards (calculated):
+```Go
+    enc, err := reedsolomon.New(10, 3)
+```
+This encoder will work for all parity sets with this distribution of data and parity shards. The error will only be set if you specify 0 or negative values in any of the parameters, or if you specify more than 256 data shards.
+
+The data you send and receive is a simple slice of byte slices; `[][]byte`. In the example above, the top slice must have a length of 13.
+```Go
+    data := make([][]byte, 13)
+```
+You should then fill the 10 first slices with *equally sized* data, and create parity shards that will be populated with parity data. In this case we create the data in memory, but you could for instance also use [mmap](https://github.com/edsrzf/mmap-go) to map files.
+
+```Go
+    // Create all shards, size them at 50000 each
+    for i := range data {
+      data[i] = make([]byte, 50000)
+    }
+    
+    
+  // Fill some data into the data shards
+    for i, in := range data[:10] {
+      for j:= range in {
+         in[j] = byte((i+j)&0xff)
+      }
+    }
+```
+
+To populate the parity shards, you simply call `Encode()` with your data.
+```Go
+    err = enc.Encode(data)
+```
+The only case where you should get an error is if the data shards aren't of equal size. The last 3 shards now contain parity data. You can verify this by calling `Verify()`:
+
+```Go
+    ok, err = enc.Verify(data)
+```
+
+The final (and important) part is to be able to reconstruct missing shards. For this to work, you need to know which parts of your data are missing. The encoder *does not know which parts are invalid*, so if data corruption is a likely scenario, you need to implement a hash check for each shard. If a byte has changed in your set, and you don't know which one it is, there is no way to reconstruct the data set.
+
+To indicate missing data, you set the shard to nil before calling `Reconstruct()`:
+
+```Go
+    // Delete two data shards
+    data[3] = nil
+    data[7] = nil
+    
+    // Reconstruct the missing shards
+    err := enc.Reconstruct(data)
+```
+The missing data and parity shards will be recreated. If more than 3 shards are missing, the reconstruction will fail.
+
+So to sum up reconstruction:
+* The number of data/parity shards must match the numbers used for encoding.
+* The order of shards must be the same as used when encoding.
+* You may only supply data you know is valid.
+* Invalid shards should be set to nil.
+
+For complete examples of an encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
+
+# Splitting/Joining Data
+
+You might have a large slice of data. To help you split this, there are some helper functions that can split and join a single byte slice.
+
+```Go
+   bigfile, _ := ioutil.ReadFile("myfile.data")
+   
+   // Split the file
+   split, err := enc.Split(bigfile)
+```
+This will split the file into the number of data shards set when creating the encoder and create empty parity shards. 
+
+An important thing to note is that you have to *keep track of the exact input size*. If the size of the input isn't divisible by the number of data shards, extra zeros will be inserted in the last shard.
+
+To join a data set, use the `Join()` function, which will join the shards and write it to the `io.Writer` you supply: 
+```Go
+   // Join a data set and write it to io.Discard.
+   err = enc.Join(io.Discard, data, len(bigfile))
+```
+
+# Streaming/Merging
+
+It might seem like a limitation that all data should be in memory, but an important property is that *as long as the number of data/parity shards are the same, you can merge/split data sets*, and they will remain valid as a separate set.
+
+```Go
+    // Split the data set of 50000 elements into two of 25000
+    splitA := make([][]byte, 13)
+    splitB := make([][]byte, 13)
+    
+    // Merge into a 100000 element set
+    merged := make([][]byte, 13)
+    
+    for i := range data {
+      splitA[i] = data[i][:25000]
+      splitB[i] = data[i][25000:]
+      
+      // Concatenate it to itself
+	  merged[i] = append(make([]byte, 0, len(data[i])*2), data[i]...)
+	  merged[i] = append(merged[i], data[i]...)
+    }
+    
+    // Each part should still verify as ok.
+    ok, err := enc.Verify(splitA)
+    if ok && err == nil {
+        log.Println("splitA ok")
+    }
+    
+    ok, err = enc.Verify(splitB)
+    if ok && err == nil {
+        log.Println("splitB ok")
+    }
+    
+    ok, err = enc.Verify(merged)
+    if ok && err == nil {
+        log.Println("merge ok")
+    }
+```
+
+This means that if you have a data set that may not fit into memory, you can split processing into smaller blocks. For the best throughput, don't use too small blocks.
+
+This also means that you can divide big input up into smaller blocks, and do reconstruction on parts of your data. This doesn't give the same flexibility of a higher number of data shards, but it will be much more performant.
+
+# Streaming API
+
+Support for a streaming API has been added to help perform fully streaming operations, enabling you to do the same operations on streams. To use the stream API, use the [`NewStream`](https://godoc.org/github.com/klauspost/reedsolomon#NewStream) function to create the encoding/decoding interfaces. You can use [`NewStreamC`](https://godoc.org/github.com/klauspost/reedsolomon#NewStreamC) to create an interface that reads/writes concurrently from the streams.
+
+Input is delivered as `[]io.Reader`, output as `[]io.Writer`, and functionality corresponds to the in-memory API. Each stream must supply the same amount of data, similar to how each slice must be the same size with the in-memory API.
+If an error occurs in relation to a stream, a [`StreamReadError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamReadError) or [`StreamWriteError`](https://godoc.org/github.com/klauspost/reedsolomon#StreamWriteError) will help you determine which stream was the offender.
+
+There is no buffering or timeouts/retry specified. If you want to add that, you need to add it to the Reader/Writer.
+
+For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
+
+# Advanced Options
+
+You can modify internal options that affect how jobs are split between and processed by goroutines.
+
+To create options, use the WithXXX functions. You can supply options to `New`, `NewStream` and `NewStreamC`. If no Options are supplied, default options are used.
+
+Example of how to supply options:
+
+ ```Go
+     enc, err := reedsolomon.New(10, 3, reedsolomon.WithMaxGoroutines(25))
+ ```
+
+
+# Performance
+Performance depends mainly on the number of parity shards. In rough terms, doubling the number of parity shards will double the encoding time.
+
+Here are the throughput numbers with some different selections of data and parity shards. For reference each shard is 1MB random data, and 2 CPU cores are used for encoding.
+
+| Data | Parity | Parity | MB/s   | SSSE3 MB/s  | SSSE3 Speed | Rel. Speed |
+|------|--------|--------|--------|-------------|-------------|------------|
+| 5    | 2      | 40%    | 576,11 | 2599,2      | 451%        | 100,00%    |
+| 10   | 2      | 20%    | 587,73 | 3100,28     | 528%        | 102,02%    |
+| 10   | 4      | 40%    | 298,38 | 2470,97     | 828%        | 51,79%     |
+| 50   | 20     | 40%    | 59,81  | 713,28      | 1193%       | 10,38%     |
+
+If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.
+
+Example of performance scaling on Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz - 4 physical cores, 8 logical cores. The example uses 10 blocks with 16MB data each and 4 parity blocks.
+
+| Threads | MB/s    | Speed |
+|---------|---------|-------|
+| 1       | 1355,11 | 100%  |
+| 2       | 2339,78 | 172%  |
+| 4       | 3179,33 | 235%  |
+| 8       | 4346,18 | 321%  |
+
+# asm2plan9s
+
+[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
+
+# Links
+* [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/).
+* [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon). Compatible java library by Backblaze.
+* [reedsolomon-c](https://github.com/jannson/reedsolomon-c). C version, compatible with output from this package.
+* [Reed-Solomon Erasure Coding in Haskell](https://github.com/NicolasT/reedsolomon). Haskell port of the package with similar performance.
+* [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests.
+* [rsraid](https://github.com/goayame/rsraid). A similar library written in Go. Slower, but supports more shards.
+* [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations.
+
+# License
+
+This code, like the original [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon), is published under an MIT license. See the LICENSE file for more information.
--- a/vendor/github.com/klauspost/reedsolomon/appveyor.yml
+++ b/vendor/github.com/klauspost/reedsolomon/appveyor.yml
@@ -0,0 +1,20 @@
+# AppVeyor (Windows) continuous-integration configuration.
+os: Visual Studio 2015
+
+platform: x64
+
+# Check the repository out inside GOPATH, under the package's import path.
+clone_folder: c:\gopath\src\github.com\klauspost\reedsolomon
+
+# environment variables
+environment:
+  GOPATH: c:\gopath
+
+# Print the environment and toolchain details for diagnostics, then download
+# the package dependencies (-d: download only).
+install:
+  - echo %PATH%
+  - echo %GOPATH%
+  - go version
+  - go env
+  - go get -d ./...
+
+# First run: all tests, verbose, with -cpu=2.
+# Second run: short tests under the race detector with -cpu=1,2,4.
+build_script:
+  - go test -v -cpu=2 ./...
+  - go test -cpu=1,2,4 -short -race ./...
--- a/vendor/github.com/klauspost/reedsolomon/galois.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois.go
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@@ -0,0 +1,73 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+// galMulSSSE3 is implemented in galois_amd64.s. For every full 16-byte block
+// of in it stores low[b&0xf] ^ high[b>>4] into the matching position of out.
+// Bytes after the last full block are not processed.
+//
+//go:noescape
+func galMulSSSE3(low, high, in, out []byte)
+
+// galMulSSSE3Xor is implemented in galois_amd64.s. It is the same as
+// galMulSSSE3, except that the result is XORed into out instead of
+// overwriting it.
+//
+//go:noescape
+func galMulSSSE3Xor(low, high, in, out []byte)
+
+// galMulAVX2Xor is implemented in galois_amd64.s. It XORs the table lookup
+// result into out, working on full 32-byte blocks of in.
+//
+//go:noescape
+func galMulAVX2Xor(low, high, in, out []byte)
+
+// galMulAVX2 is implemented in galois_amd64.s. It stores the table lookup
+// result into out, working on full 32-byte blocks of in.
+//
+//go:noescape
+func galMulAVX2(low, high, in, out []byte)
+
+// This is what the assembler routines do in blocks of 16 bytes:
+/*
+func galMulSSSE3(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] = low[l] ^ high[h]
+	}
+}
+
+func galMulSSSE3Xor(low, high, in, out []byte) {
+	for n, input := range in {
+		l := input & 0xf
+		h := input >> 4
+		out[n] ^= low[l] ^ high[h]
+	}
+}
+*/
+
+func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
+	var done int
+	if avx2 {
+		galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 5) << 5
+	} else if ssse3 {
+		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 4) << 4
+	}
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] = mt[in[i]]
+		}
+	}
+}
+
+func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
+	var done int
+	if avx2 {
+		galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 5) << 5
+	} else if ssse3 {
+		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+		done = (len(in) >> 4) << 4
+	}
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] ^= mt[in[i]]
+		}
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@@ -0,0 +1,164 @@
+//+build !noasm !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
+// and http://jerasure.org/jerasure/gf-complete/tree/master
+
+// func galMulSSSE3Xor(low, high, in, out []byte)
+//
+// For every full 16-byte block of in, computes
+//   out[i] ^= low[in[i]&0x0f] ^ high[in[i]>>4]
+// using PSHUFB as a 16-entry table lookup. Only the first 16 bytes of low
+// and high are read. The block count is derived from len(in) alone; len(out)
+// is never loaded, so out must be at least as long as the processed part of
+// in. Bytes after the last full block are not touched.
+TEXT ·galMulSSSE3Xor(SB), 7, $0
+	MOVQ   low+0(FP), SI     // SI: &low
+	MOVQ   high+24(FP), DX   // DX: &high
+	MOVOU  (SI), X6          // X6: low table
+	MOVOU  (DX), X7          // X7: high table
+	MOVQ   $15, BX           // BX: low mask
+	MOVQ   BX, X8            // X8: 0x0f in the lowest byte
+	PXOR   X5, X5            // X5: all-zero shuffle control
+	MOVQ   in+48(FP), SI     // SI: &in
+	MOVQ   in_len+56(FP), R9 // R9: len(in)
+	MOVQ   out+72(FP), DX    // DX: &out
+	PSHUFB X5, X8            // X8: lomask (unpacked): byte 0 copied to every lane
+	SHRQ   $4, R9            // len(in) / 16
+	CMPQ   R9, $0
+	JEQ    done_xor          // no full block: nothing to do
+
+loopback_xor:
+	MOVOU  (SI), X0     // in[x]
+	MOVOU  (DX), X4     // out[x]
+	MOVOU  X0, X1       // in[x]
+	MOVOU  X6, X2       // low copy
+	MOVOU  X7, X3       // high copy
+	PSRLQ  $4, X1       // X1: high input (shifted; masked below)
+	PAND   X8, X0       // X0: low input
+	PAND   X8, X1       // X1: high input
+	PSHUFB X0, X2       // X2: mul low part
+	PSHUFB X1, X3       // X3: mul high part
+	PXOR   X2, X3       // X3: Result
+	PXOR   X4, X3       // X3: Result xor existing out
+	MOVOU  X3, (DX)     // Store
+	ADDQ   $16, SI      // in+=16
+	ADDQ   $16, DX      // out+=16
+	SUBQ   $1, R9       // one block done
+	JNZ    loopback_xor
+
+done_xor:
+	RET
+
+// func galMulSSSE3(low, high, in, out []byte)
+//
+// For every full 16-byte block of in, computes
+//   out[i] = low[in[i]&0x0f] ^ high[in[i]>>4]
+// using PSHUFB as a 16-entry table lookup. Only the first 16 bytes of low
+// and high are read. The block count is derived from len(in) alone; len(out)
+// is never loaded, so out must be at least as long as the processed part of
+// in. Bytes after the last full block are not touched.
+TEXT ·galMulSSSE3(SB), 7, $0
+	MOVQ   low+0(FP), SI     // SI: &low
+	MOVQ   high+24(FP), DX   // DX: &high
+	MOVOU  (SI), X6          // X6: low table
+	MOVOU  (DX), X7          // X7: high table
+	MOVQ   $15, BX           // BX: low mask
+	MOVQ   BX, X8            // X8: 0x0f in the lowest byte
+	PXOR   X5, X5            // X5: all-zero shuffle control
+	MOVQ   in+48(FP), SI     // SI: &in
+	MOVQ   in_len+56(FP), R9 // R9: len(in)
+	MOVQ   out+72(FP), DX    // DX: &out
+	PSHUFB X5, X8            // X8: lomask (unpacked): byte 0 copied to every lane
+	SHRQ   $4, R9            // len(in) / 16
+	CMPQ   R9, $0
+	JEQ    done              // no full block: nothing to do
+
+loopback:
+	MOVOU  (SI), X0 // in[x]
+	MOVOU  X0, X1   // in[x]
+	MOVOU  X6, X2   // low copy
+	MOVOU  X7, X3   // high copy
+	PSRLQ  $4, X1   // X1: high input (shifted; masked below)
+	PAND   X8, X0   // X0: low input
+	PAND   X8, X1   // X1: high input
+	PSHUFB X0, X2   // X2: mul low part
+	PSHUFB X1, X3   // X3: mul high part
+	PXOR   X2, X3   // X3: Result
+	MOVOU  X3, (DX) // Store
+	ADDQ   $16, SI  // in+=16
+	ADDQ   $16, DX  // out+=16
+	SUBQ   $1, R9   // one block done
+	JNZ    loopback
+
+done:
+	RET
+
+// func galMulAVX2Xor(low, high, in, out []byte)
+//
+// 32-byte-block variant of galMulSSSE3Xor: the lookup result for each full
+// 32-byte block of in is XORed into out. The AVX2 instructions are emitted
+// as raw LONG/WORD/BYTE data; the mnemonics in the trailing comments are the
+// author's annotation of those encodings. The block count is derived from
+// len(in) alone; len(out) is never loaded. Bytes after the last full block
+// are not touched.
+TEXT ·galMulAVX2Xor(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5            // X5: 0x0f in the lowest byte
+	MOVOU (SI), X6          // X6: low table
+	MOVOU (DX), X7          // X7: high table
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
+	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
+	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)
+
+	SHRQ  $5, R9         // len(in) /32
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_xor_avx2  // no full block: nothing to do
+
+loopback_xor_avx2:
+	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
+	LONG $0x226ffec5             // VMOVDQU YMM4, [rdx]
+	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
+	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
+	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
+	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
+	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X3: mul high part
+	LONG $0xdbefedc5             // VPXOR   YMM3, YMM2, YMM3    ; X3: Result
+	LONG $0xe4efe5c5             // VPXOR   YMM4, YMM3, YMM4    ; X4: Result xor existing out
+	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4
+
+	ADDQ $32, SI           // in+=32
+	ADDQ $32, DX           // out+=32
+	SUBQ $1, R9            // one block done
+	JNZ  loopback_xor_avx2
+
+done_xor_avx2:
+	// VZEROUPPER
+	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
+	RET
+
+// func galMulAVX2(low, high, in, out []byte)
+//
+// 32-byte-block variant of galMulSSSE3: the lookup result for each full
+// 32-byte block of in is stored to out. The AVX2 instructions are emitted
+// as raw LONG/WORD/BYTE data; the mnemonics in the trailing comments are the
+// author's annotation of those encodings. The block count is derived from
+// len(in) alone; len(out) is never loaded. Bytes after the last full block
+// are not touched.
+TEXT ·galMulAVX2(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5            // X5: 0x0f in the lowest byte
+	MOVOU (SI), X6          // X6: low table
+	MOVOU (DX), X7          // X7: high table
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
+	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
+	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)
+
+	SHRQ  $5, R9         // len(in) /32
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_avx2      // no full block: nothing to do
+
+loopback_avx2:
+	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
+	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
+	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
+	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
+	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
+	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X3: mul high part
+	LONG $0xe3efedc5             // VPXOR   YMM4, YMM2, YMM3    ; X4: Result
+	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4
+
+	ADDQ $32, SI       // in+=32
+	ADDQ $32, DX       // out+=32
+	SUBQ $1, R9        // one block done
+	JNZ  loopback_avx2
+
+done_avx2:
+
+	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
+	RET
--- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@@ -0,0 +1,19 @@
+//+build !amd64 noasm appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+
+package reedsolomon
+
+// galMulSlice is the pure-Go fallback: it multiplies every byte of in by the
+// constant c through the mulTable lookup table and stores the products in
+// out. The ssse3 and avx2 flags are accepted for signature compatibility and
+// are not used.
+func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
+	row := mulTable[c]
+	for i := 0; i < len(in); i++ {
+		out[i] = row[in[i]]
+	}
+}
+
+// galMulSliceXor is the pure-Go fallback: it multiplies every byte of in by
+// the constant c through the mulTable lookup table and XORs the products
+// into out. The ssse3 and avx2 flags are accepted for signature
+// compatibility and are not used.
+func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
+	row := mulTable[c]
+	for i := 0; i < len(in); i++ {
+		out[i] ^= row[in[i]]
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/gentables.go
+++ b/vendor/github.com/klauspost/reedsolomon/gentables.go
@@ -0,0 +1,132 @@
+//+build ignore
+
+package main
+
+import (
+	"fmt"
+)
+
+// logTable is indexed by field element; the generator functions below use
+// logTable[x] as the logarithm of x when building the exp and multiplication
+// tables. Index 0 holds -1; the generators never read it, because zero
+// operands are special-cased and generateExpTable starts at 1.
+var logTable = [fieldSize]int16{
+	-1, 0, 1, 25, 2, 50, 26, 198,
+	3, 223, 51, 238, 27, 104, 199, 75,
+	4, 100, 224, 14, 52, 141, 239, 129,
+	28, 193, 105, 248, 200, 8, 76, 113,
+	5, 138, 101, 47, 225, 36, 15, 33,
+	53, 147, 142, 218, 240, 18, 130, 69,
+	29, 181, 194, 125, 106, 39, 249, 185,
+	201, 154, 9, 120, 77, 228, 114, 166,
+	6, 191, 139, 98, 102, 221, 48, 253,
+	226, 152, 37, 179, 16, 145, 34, 136,
+	54, 208, 148, 206, 143, 150, 219, 189,
+	241, 210, 19, 92, 131, 56, 70, 64,
+	30, 66, 182, 163, 195, 72, 126, 110,
+	107, 58, 40, 84, 250, 133, 186, 61,
+	202, 94, 155, 159, 10, 21, 121, 43,
+	78, 212, 229, 172, 115, 243, 167, 87,
+	7, 112, 192, 247, 140, 128, 99, 13,
+	103, 74, 222, 237, 49, 197, 254, 24,
+	227, 165, 153, 119, 38, 184, 180, 124,
+	17, 68, 146, 217, 35, 32, 137, 46,
+	55, 63, 209, 91, 149, 188, 207, 205,
+	144, 135, 151, 178, 220, 252, 190, 97,
+	242, 86, 211, 171, 20, 42, 93, 158,
+	132, 60, 57, 83, 71, 109, 65, 162,
+	31, 45, 67, 216, 183, 123, 164, 118,
+	196, 23, 73, 236, 127, 12, 111, 246,
+	108, 161, 59, 82, 41, 157, 85, 170,
+	251, 96, 134, 177, 187, 204, 62, 90,
+	203, 89, 95, 176, 156, 169, 160, 81,
+	11, 245, 22, 235, 122, 117, 44, 215,
+	79, 174, 213, 233, 230, 231, 173, 232,
+	116, 214, 244, 234, 168, 80, 88, 175,
+}
+
+const (
+	// The number of elements in the field.
+	fieldSize = 256
+
+	// The polynomial used to generate the logarithm table.
+	//
+	// There are a number of polynomials that work to generate
+	// a Galois field of 256 elements.  The choice is arbitrary,
+	// and we just use the first one.
+	//
+	// The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105,
+	// 113, 135, 141, 169, 195, 207, 231, and 245.
+	//
+	// NOTE(review): nothing in this file references this constant;
+	// logTable above is hard-coded rather than generated from it.
+	generatingPolynomial = 29
+)
+
+// main builds the lookup tables and prints them to standard output as Go
+// variable declarations (expTable, mulTableLow and mulTableHigh).
+func main() {
+	exp := generateExpTable()
+	fmt.Printf("var expTable = %#v\n", exp)
+	// The full 256x256 table is not emitted; to print it as well use:
+	//   fmt.Printf("var mulTable = %#v\n", generateMulTableSplit(exp))
+	mulLow, mulHigh := generateMulTableHalf(exp)
+	fmt.Printf("var mulTableLow = %#v\n", mulLow)
+	fmt.Printf("var mulTableHigh = %#v\n", mulHigh)
+}
+
+/**
+ * Generates the inverse log table.
+ */
+func generateExpTable() []byte {
+	result := make([]byte, fieldSize*2-2)
+	for i := 1; i < fieldSize; i++ {
+		log := logTable[i]
+		result[log] = byte(i)
+		result[log+fieldSize-1] = byte(i)
+	}
+	return result
+}
+
+// generateMulTable returns a flat 256*256 multiplication table. The entry at
+// index b<<8|a is the field product of a and b, looked up as
+// expTable[log(a)+log(b)]; entries where either operand is zero are zero.
+func generateMulTable(expTable []byte) []byte {
+	table := make([]byte, 256*256)
+	for idx := range table {
+		lo, hi := byte(idx), byte(idx>>8)
+		// make() already zeroed the table, so zero operands need no store.
+		if lo != 0 && hi != 0 {
+			table[idx] = expTable[int(logTable[lo])+int(logTable[hi])]
+		}
+	}
+	return table
+}
+
+// generateMulTableSplit returns the multiplication table as a 256x256 array
+// in which table[a][b] is the field product of a and b, looked up as
+// expTable[log(a)+log(b)]. Row 0 and column 0 stay zero.
+func generateMulTableSplit(expTable []byte) [256][256]byte {
+	var table [256][256]byte
+	for x := 1; x < 256; x++ {
+		logX := int(logTable[x])
+		for y := 1; y < 256; y++ {
+			table[x][y] = expTable[logX+int(logTable[y])]
+		}
+	}
+	return table
+}
+
+// generateMulTableHalf returns two 16-entry tables per multiplier a:
+// low[a][n] is the product of a and n, and high[a][n] is the product of a
+// and n<<4, for every nibble value n in 0..15. Products are looked up as
+// expTable[log(a)+log(b)], with zero operands giving zero.
+func generateMulTableHalf(expTable []byte) (low [256][16]byte, high [256][16]byte) {
+	// product multiplies two field elements through the log/exp tables.
+	product := func(x, y int) byte {
+		if x == 0 || y == 0 {
+			return 0
+		}
+		return expTable[int(logTable[x])+int(logTable[y])]
+	}
+	for a := 0; a < 256; a++ {
+		for nibble := 0; nibble < 16; nibble++ {
+			low[a][nibble] = product(a, nibble)
+			high[a][nibble] = product(a, nibble<<4)
+		}
+	}
+	return low, high
+}
--- a/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
+++ b/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
@@ -0,0 +1,160 @@
+/**
+ * A thread-safe tree which caches inverted matrices.
+ *
+ * Copyright 2016, Peter Collins
+ */
+
+package reedsolomon
+
+import (
+	"errors"
+	"sync"
+)
+
+// inversionTree caches inverted matrices, keyed by the list of invalid row
+// indices.
+//
+// The tree uses a Reader-Writer mutex to make it thread-safe
+// when accessing cached matrices and inserting new ones. The mutex is held
+// by pointer, so the copies made by the value-receiver methods share one
+// lock. root is the top node; newInversionTree stores the identity matrix
+// in it.
+type inversionTree struct {
+	mutex *sync.RWMutex
+	root  inversionNode
+}
+
+// inversionNode is one node of the inversion tree. matrix is the inverted
+// matrix cached at this node; it is nil for intermediate nodes that were
+// created only as part of a path to a deeper node. children holds the
+// sub-nodes, indexed by the next invalid row index relative to the parent
+// index the tree methods pass down.
+type inversionNode struct {
+	matrix   matrix
+	children []*inversionNode
+}
+
+// newInversionTree creates an empty cache of inverted matrices. The root
+// node carries the identity matrix, which is the answer when no shard is
+// invalid, and has one child slot per shard (data plus parity).
+func newInversionTree(dataShards, parityShards int) inversionTree {
+	// The error from identityMatrix is discarded, so a non-positive
+	// dataShards leaves the root matrix nil.
+	identity, _ := identityMatrix(dataShards)
+	return inversionTree{
+		mutex: &sync.RWMutex{},
+		root: inversionNode{
+			matrix:   identity,
+			children: make([]*inversionNode, dataShards+parityShards),
+		},
+	}
+}
+
+// GetInvertedMatrix looks up the cached inverted matrix for the given list
+// of invalid row indices. It returns nil when no matrix has been cached for
+// that key, and the root identity matrix when the list is empty.
+func (t inversionTree) GetInvertedMatrix(invalidIndices []int) matrix {
+	// Hold the read lock for the whole lookup.
+	t.mutex.RLock()
+	defer t.mutex.RUnlock()
+
+	if len(invalidIndices) > 0 {
+		// Walk down from the root; the root's parent index is 0.
+		return t.root.getInvertedMatrix(invalidIndices, 0)
+	}
+	// No invalid indices: the root identity matrix applies.
+	return t.root.matrix
+}
+
+// errAlreadySet is returned if the root node matrix is overwritten
+var errAlreadySet = errors.New("the root node identity matrix is already set")
+
+// InsertInvertedMatrix caches an inverted matrix under the given list of
+// invalid row indices. shards is the total number of shards and is used to
+// size the child lists of nodes created on the way.
+//
+// It returns errAlreadySet when invalidIndices is empty (the root identity
+// matrix cannot be replaced) and errNotSquare when matrix is not square.
+func (t inversionTree) InsertInvertedMatrix(invalidIndices []int, matrix matrix, shards int) error {
+	// Validate before taking the lock.
+	switch {
+	case len(invalidIndices) == 0:
+		return errAlreadySet
+	case !matrix.IsSquare():
+		return errNotSquare
+	}
+
+	// Hold the write lock while the tree is modified.
+	t.mutex.Lock()
+	defer t.mutex.Unlock()
+
+	// Create the path from the root (parent index 0) and store the matrix
+	// in the final node.
+	t.root.insertInvertedMatrix(invalidIndices, matrix, shards, 0)
+	return nil
+}
+
+// getInvertedMatrix walks the tree along invalidIndices and returns the
+// matrix stored in the node the path ends at. It returns nil as soon as a
+// node on the path is missing. The returned matrix can also be nil when the
+// final node exists only as an intermediate step of a longer path.
+//
+// Each child list is indexed relative to parent: the child for index i sits
+// at position i-parent, and the parent index for the next level is i+1. This
+// relies on invalidIndices being sorted in ascending order.
+func (n inversionNode) getInvertedMatrix(invalidIndices []int, parent int) matrix {
+	children := n.children
+	for {
+		idx := invalidIndices[0]
+		child := children[idx-parent]
+		if child == nil {
+			// Nothing cached along this path yet.
+			return nil
+		}
+		if len(invalidIndices) == 1 {
+			// Last index consumed: this is the target node.
+			return child.matrix
+		}
+		// Drop the first index and descend one level.
+		invalidIndices, parent, children = invalidIndices[1:], idx+1, child.children
+	}
+}
+
+// insertInvertedMatrix walks the tree along invalidIndices, creating any
+// missing nodes, and stores matrix in the node the path ends at. The caller
+// is expected to hold the write lock.
+//
+// Each child list is indexed relative to parent: the child for index i sits
+// at position i-parent, and the parent index for the next level is i+1. This
+// relies on invalidIndices being sorted in ascending order. A new node for
+// index i gets shards-i child slots.
+func (n inversionNode) insertInvertedMatrix(invalidIndices []int, matrix matrix, shards, parent int) {
+	children := n.children
+	for {
+		idx := invalidIndices[0]
+		slot := idx - parent
+		child := children[slot]
+		if child == nil {
+			// First visit of this path: create the node and link it in.
+			child = &inversionNode{
+				children: make([]*inversionNode, shards-idx),
+			}
+			children[slot] = child
+		}
+		if len(invalidIndices) == 1 {
+			// Last index consumed: cache the matrix here.
+			child.matrix = matrix
+			return
+		}
+		// Drop the first index and descend one level.
+		invalidIndices, parent, children = invalidIndices[1:], idx+1, child.children
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/matrix.go
+++ b/vendor/github.com/klauspost/reedsolomon/matrix.go
@@ -0,0 +1,279 @@
+/**
+ * Matrix Algebra over an 8-bit Galois Field
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+package reedsolomon
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+// matrix is a two-dimensional byte matrix stored row by row and indexed as
+// m[row][col].
+type matrix [][]byte
+
+// newMatrix allocates a rows x cols matrix filled with zeros. It returns
+// errInvalidRowSize when rows is not positive and errInvalidColSize when
+// cols is not positive.
+func newMatrix(rows, cols int) (matrix, error) {
+	switch {
+	case rows <= 0:
+		return nil, errInvalidRowSize
+	case cols <= 0:
+		return nil, errInvalidColSize
+	}
+
+	out := make(matrix, rows)
+	for r := 0; r < rows; r++ {
+		out[r] = make([]byte, cols)
+	}
+	return out, nil
+}
+
+// newMatrixData wraps the given row-major data as a matrix after validating
+// it with Check. The data is not copied, so the matrix shares storage with
+// the input.
+func newMatrixData(data [][]byte) (matrix, error) {
+	wrapped := matrix(data)
+	if err := wrapped.Check(); err != nil {
+		return nil, err
+	}
+	return wrapped, nil
+}
+
+// identityMatrix returns a size x size matrix with ones on the main diagonal
+// and zeros elsewhere. Errors from newMatrix (non-positive size) are passed
+// on.
+func identityMatrix(size int) (matrix, error) {
+	id, err := newMatrix(size, size)
+	if err != nil {
+		return nil, err
+	}
+	for d := 0; d < size; d++ {
+		id[d][d] = 1
+	}
+	return id, nil
+}
+
+// errInvalidRowSize will be returned if attempting to create a matrix with negative or zero row number.
+var errInvalidRowSize = errors.New("invalid row size")
+
+// errInvalidColSize will be returned if attempting to create a matrix with negative or zero column number.
+var errInvalidColSize = errors.New("invalid column size")
+
+// errColSizeMismatch is returned if the size of matrix columns mismatch.
+var errColSizeMismatch = errors.New("column size is not the same for all rows")
+
+// Check validates the shape of the matrix. It returns errInvalidRowSize when
+// there are no rows, errInvalidColSize when the first row is empty, and
+// errColSizeMismatch when any row differs in length from the first.
+func (m matrix) Check() error {
+	if len(m) == 0 {
+		return errInvalidRowSize
+	}
+	width := len(m[0])
+	if width == 0 {
+		return errInvalidColSize
+	}
+
+	// Row 0 defines the width, so only the remaining rows need comparing.
+	for i := 1; i < len(m); i++ {
+		if len(m[i]) != width {
+			return errColSizeMismatch
+		}
+	}
+	return nil
+}
+
+// String formats the matrix as nested, comma-separated lists of decimal
+// values.
+//
+// Example: [[1, 2], [3, 4]]
+func (m matrix) String() string {
+	rows := make([]string, len(m))
+	for r, row := range m {
+		cells := make([]string, len(row))
+		for c, v := range row {
+			cells[c] = strconv.Itoa(int(v))
+		}
+		rows[r] = "[" + strings.Join(cells, ", ") + "]"
+	}
+	return "[" + strings.Join(rows, ", ") + "]"
+}
+
+// Multiply multiplies this matrix (the one on the left) by another
+// matrix (the one on the right) and returns a new matrix with the result.
+// Element products are computed with galMultiply and summed with XOR.
+//
+// An error is returned when the left matrix has no rows, when the number of
+// columns on the left differs from the number of rows on the right, or when
+// the result would have no columns. Earlier versions panicked on empty
+// operands and returned (nil, nil) for a zero-column right operand.
+func (m matrix) Multiply(right matrix) (matrix, error) {
+	if len(m) == 0 {
+		return nil, errInvalidRowSize
+	}
+	if len(m[0]) != len(right) {
+		return nil, fmt.Errorf("columns on left (%d) is different than rows on right (%d)", len(m[0]), len(right))
+	}
+	if len(right) == 0 {
+		// The left side has zero columns and the right side zero rows.
+		return nil, errInvalidColSize
+	}
+	result, err := newMatrix(len(m), len(right[0]))
+	if err != nil {
+		return nil, err
+	}
+	for r, row := range result {
+		for c := range row {
+			var value byte
+			for i := range m[0] {
+				value ^= galMultiply(m[r][i], right[i][c])
+			}
+			row[c] = value
+		}
+	}
+	return result, nil
+}
+
+// Augment returns the concatenation of this matrix and the matrix on the
+// right: each result row is the row of m followed by the same row of right.
+//
+// It returns errMatrixSize when the row counts differ, errInvalidRowSize
+// when both matrices are empty, and the newMatrix error when the combined
+// width is zero. Earlier versions panicked on empty matrices and returned
+// (nil, nil) for a zero combined width.
+func (m matrix) Augment(right matrix) (matrix, error) {
+	if len(m) != len(right) {
+		return nil, errMatrixSize
+	}
+	if len(m) == 0 {
+		return nil, errInvalidRowSize
+	}
+
+	cols := len(m[0])
+	result, err := newMatrix(len(m), cols+len(right[0]))
+	if err != nil {
+		return nil, err
+	}
+	for r, row := range m {
+		for c := range row {
+			result[r][c] = row[c]
+		}
+		for c := range right[0] {
+			result[r][cols+c] = right[r][c]
+		}
+	}
+	return result, nil
+}
+
+// errMatrixSize is returned if matrix dimensions do not match.
+var errMatrixSize = errors.New("matrix sizes does not match")
+
+// SameSize returns nil when n has the same number of rows as m and every row
+// of n has the same length as the matching row of m; otherwise it returns
+// errMatrixSize.
+func (m matrix) SameSize(n matrix) error {
+	if len(m) != len(n) {
+		return errMatrixSize
+	}
+	for r, row := range m {
+		if len(row) != len(n[r]) {
+			return errMatrixSize
+		}
+	}
+	return nil
+}
+
+// SubMatrix returns a copy of rows rmin..rmax-1 and columns cmin..cmax-1 of
+// this matrix. The data is copied, so the result does not share storage with
+// m. Errors from newMatrix (an empty or negative range) are passed on.
+func (m matrix) SubMatrix(rmin, cmin, rmax, cmax int) (matrix, error) {
+	sub, err := newMatrix(rmax-rmin, cmax-cmin)
+	if err != nil {
+		return nil, err
+	}
+	// OPTME: If used heavily, use copy function to copy slice
+	for i, dst := range sub {
+		src := m[rmin+i]
+		for j := range dst {
+			dst[j] = src[cmin+j]
+		}
+	}
+	return sub, nil
+}
+
+// SwapRows exchanges rows r1 and r2 of the matrix in place. It returns
+// errInvalidRowSize when either index is outside the matrix.
+func (m matrix) SwapRows(r1, r2 int) error {
+	inRange := func(r int) bool { return 0 <= r && r < len(m) }
+	if !inRange(r1) || !inRange(r2) {
+		return errInvalidRowSize
+	}
+	m[r1], m[r2] = m[r2], m[r1]
+	return nil
+}
+
+// IsSquare reports whether the matrix has as many rows as its first row has
+// columns. A matrix with no rows is reported as not square; earlier versions
+// panicked on it while indexing the first row.
+func (m matrix) IsSquare() bool {
+	return len(m) > 0 && len(m) == len(m[0])
+}
+
+// errSingular is returned if the matrix is singular and cannot be inversed
+var errSingular = errors.New("matrix is singular")
+
+// errNotSquare is returned if attempting to inverse a non-square matrix.
+var errNotSquare = errors.New("only square matrices can be inverted")
+
+// Invert returns the inverse of this matrix, leaving m itself unmodified.
+// It returns errNotSquare when the matrix is not square and errSingular when
+// it has no inverse.
+func (m matrix) Invert() (matrix, error) {
+	if !m.IsSquare() {
+		return nil, errNotSquare
+	}
+
+	n := len(m)
+	// Build [m | I]; eliminating the left half to the identity leaves the
+	// inverse in the right half.
+	identity, _ := identityMatrix(n)
+	augmented, _ := m.Augment(identity)
+
+	if err := augmented.gaussianElimination(); err != nil {
+		return nil, err
+	}
+
+	return augmented.SubMatrix(0, n, n, 2*n)
+}
+
+// gaussianElimination reduces the matrix in place so that its leftmost
+// square part (rows x rows) becomes the identity. Every row operation is
+// applied across all columns, so for the augmented matrix [M | I] built by
+// Invert the right half ends up holding the inverse of M.
+//
+// Arithmetic is done with galMultiply/galDivide, and row subtraction is XOR.
+// It returns errSingular when no non-zero pivot can be found for a column.
+func (m matrix) gaussianElimination() error {
+	rows := len(m)
+	columns := len(m[0])
+	// Clear out the part below the main diagonal and scale the main
+	// diagonal to be 1.
+	for r := 0; r < rows; r++ {
+		// If the element on the diagonal is 0, find a row below
+		// that has a non-zero and swap them.
+		if m[r][r] == 0 {
+			for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
+				if m[rowBelow][r] != 0 {
+					// Both indices are within [0, rows), so the
+					// SwapRows error cannot occur and is ignored.
+					m.SwapRows(r, rowBelow)
+					break
+				}
+			}
+		}
+		// If we couldn't find one, the matrix is singular.
+		if m[r][r] == 0 {
+			return errSingular
+		}
+		// Scale to 1.
+		if m[r][r] != 1 {
+			scale := galDivide(1, m[r][r])
+			for c := 0; c < columns; c++ {
+				m[r][c] = galMultiply(m[r][c], scale)
+			}
+		}
+		// Make everything below the 1 be a 0 by subtracting
+		// a multiple of it.  (Subtraction and addition are
+		// both exclusive or in the Galois field.)
+		for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
+			if m[rowBelow][r] != 0 {
+				scale := m[rowBelow][r]
+				for c := 0; c < columns; c++ {
+					m[rowBelow][c] ^= galMultiply(scale, m[r][c])
+				}
+			}
+		}
+	}
+
+	// Now clear the part above the main diagonal.
+	for d := 0; d < rows; d++ {
+		for rowAbove := 0; rowAbove < d; rowAbove++ {
+			if m[rowAbove][d] != 0 {
+				scale := m[rowAbove][d]
+				for c := 0; c < columns; c++ {
+					m[rowAbove][c] ^= galMultiply(scale, m[d][c])
+				}
+
+			}
+		}
+	}
+	return nil
+}
+
+// vandermonde creates a rows x cols Vandermonde matrix in which the element
+// at (r, c) is galExp(byte(r), c). Such a matrix is guaranteed to have the
+// property that any subset of rows that forms a square matrix is invertible.
+// Errors from newMatrix (non-positive dimensions) are passed on.
+func vandermonde(rows, cols int) (matrix, error) {
+	vm, err := newMatrix(rows, cols)
+	if err != nil {
+		return nil, err
+	}
+	for r := range vm {
+		base := byte(r)
+		for c := range vm[r] {
+			vm[r][c] = galExp(base, c)
+		}
+	}
+	return vm, nil
+}
--- a/vendor/github.com/klauspost/reedsolomon/options.go
+++ b/vendor/github.com/klauspost/reedsolomon/options.go
@@ -0,0 +1,67 @@
+package reedsolomon
+
+import (
+	"runtime"
+
+	"github.com/klauspost/cpuid"
+)
+
+// Option is a function that overrides processing parameters.
+// New applies every supplied Option, in order, to its own copy of the
+// default options.
+type Option func(*options)
+
+// options holds the processing parameters of an encoder.
+//
+// maxGoroutines and minSplitSize control how coding work is split across
+// goroutines: work is only split when maxGoroutines > 1 and the shard
+// length exceeds minSplitSize, and each goroutine then handles at least
+// minSplitSize bytes (the final chunk may be shorter).
+// useAVX2 and useSSSE3 are passed through to the Galois multiplication
+// routines.
+type options struct {
+	maxGoroutines     int
+	minSplitSize      int
+	useAVX2, useSSSE3 bool
+}
+
+// defaultOptions is the starting configuration copied into every encoder
+// created by New. The SIMD flags and, when GOMAXPROCS is 1 or less, the
+// goroutine limit are filled in by init.
+var defaultOptions = options{
+	maxGoroutines: 50,
+	minSplitSize:  512,
+}
+
+// init finalises defaultOptions for the running process.
+func init() {
+	// Splitting work is disabled when at most one OS thread runs Go code.
+	if procs := runtime.GOMAXPROCS(0); procs <= 1 {
+		defaultOptions.maxGoroutines = 1
+	}
+	// Record the CPU capabilities reported by cpuid.
+	defaultOptions.useAVX2 = cpuid.CPU.AVX2()
+	defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
+}
+
+// WithMaxGoroutines sets the maximum number of goroutines used for
+// encoding & decoding.
+// Jobs will be split into this many parts, unless each goroutine would have
+// to process less than minSplitSize bytes (set with WithMinSplitSize).
+// For the best speed, keep this well above the GOMAXPROCS number for more
+// fine grained scheduling.
+// If n <= 0, the returned Option leaves the setting untouched.
+func WithMaxGoroutines(n int) Option {
+	if n <= 0 {
+		return func(*options) {}
+	}
+	return func(o *options) {
+		o.maxGoroutines = n
+	}
+}
+
+// WithMinSplitSize sets the minimum encoding size in bytes per goroutine.
+// See WithMaxGoroutines on how jobs are split.
+// If n <= 0, the returned Option leaves the setting untouched.
+func WithMinSplitSize(n int) Option {
+	if n <= 0 {
+		return func(*options) {}
+	}
+	return func(o *options) {
+		o.minSplitSize = n
+	}
+}
+
+// withSSE3 returns an Option that sets the useSSSE3 flag to enabled,
+// overriding the detected CPU capability.
+func withSSE3(enabled bool) Option {
+	setSSSE3 := func(o *options) {
+		o.useSSSE3 = enabled
+	}
+	return setSSSE3
+}
+
+// withAVX2 returns an Option that sets the useAVX2 flag to enabled,
+// overriding the detected CPU capability.
+func withAVX2(enabled bool) Option {
+	setAVX2 := func(o *options) {
+		o.useAVX2 = enabled
+	}
+	return setAVX2
+}
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@@ -0,0 +1,596 @@
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+// Package reedsolomon enables Erasure Coding in Go
+//
+// For usage and examples, see https://github.com/klauspost/reedsolomon
+//
+package reedsolomon
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"sync"
+)
+
+// Encoder is an interface to encode Reed-Solomon parity sets for your data.
+type Encoder interface {
+	// Encode computes parity for a set of data shards.
+	// Input is 'shards' containing data shards followed by parity shards.
+	// The number of shards must match the number given to New().
+	// Each shard is a byte array, and they must all be the same size.
+	// The parity shards will always be overwritten and the data shards
+	// will remain the same, so it is safe for you to read from the
+	// data shards while this is running.
+	Encode(shards [][]byte) error
+
+	// Verify returns true if the parity shards contain correct data.
+	// The data is the same format as Encode. No data is modified, so
+	// you are allowed to read from data while this is running.
+	Verify(shards [][]byte) (bool, error)
+
+	// Reconstruct will recreate the missing shards if possible.
+	//
+	// Given a list of shards, some of which contain data, fills in the
+	// ones that don't have data.
+	//
+	// The length of the array must be equal to the total number of shards.
+	// You indicate that a shard is missing by setting it to nil.
+	//
+	// If there are too few shards to reconstruct the missing
+	// ones, ErrTooFewShards will be returned.
+	//
+	// The reconstructed shard set is complete, but integrity is not verified.
+	// Use the Verify function to check if data set is ok.
+	Reconstruct(shards [][]byte) error
+
+	// Split a data slice into the number of shards given to the encoder,
+	// and create empty parity shards.
+	//
+	// The data will be split into equally sized shards.
+	// If the data size isn't divisible by the number of shards,
+	// the last shard will contain extra zeros.
+	//
+	// There must be at least 1 byte otherwise ErrShortData will be
+	// returned.
+	//
+	// The data will not be copied, except for the last shard, so you
+	// should not modify the data of the input slice afterwards.
+	Split(data []byte) ([][]byte, error)
+
+	// Join the shards and write the data segment to dst.
+	//
+	// Only the data shards are considered.
+	// You must supply the exact output size you want.
+	// If there are too few shards given, ErrTooFewShards will be returned.
+	// If the total data size is less than outSize, ErrShortData will be returned.
+	Join(dst io.Writer, shards [][]byte, outSize int) error
+}
+
+// reedSolomon contains a matrix for a specific
+// distribution of data shards and parity shards.
+// Construct it using New().
+//
+// m is the full coding matrix built by New, parity holds the rows of m
+// that follow the first DataShards rows (one per parity shard), tree
+// caches inverted matrices for Reconstruct, and o holds the options the
+// encoder was created with.
+type reedSolomon struct {
+	DataShards   int // Number of data shards, should not be modified.
+	ParityShards int // Number of parity shards, should not be modified.
+	Shards       int // Total number of shards. Calculated, and should not be modified.
+	m            matrix
+	tree         inversionTree
+	parity       [][]byte
+	o            options
+}
+
+// ErrInvShardNum will be returned by New if you attempt to create
+// an Encoder where the number of data shards or the number of parity
+// shards is zero or less.
+var ErrInvShardNum = errors.New("cannot create Encoder with zero or less data/parity shards")
+
+// ErrMaxShardNum will be returned by New if you attempt to create
+// an Encoder where the total number of data and parity shards is
+// bigger than 255, i.e. the size of the Galois field GF(2^8) minus one.
+// NOTE(review): New accepts a total of exactly 255 shards, while the
+// message below reads "255 or more".
+var ErrMaxShardNum = errors.New("cannot create Encoder with 255 or more data+parity shards")
+
+// New creates a new encoder and initializes it to
+// the number of data shards and parity shards that
+// you want to use. You can reuse this encoder.
+//
+// Both shard counts must be greater than zero, otherwise ErrInvShardNum
+// is returned. Their sum must not exceed 255, otherwise ErrMaxShardNum
+// is returned. Any error raised while building the coding matrix is
+// returned as well.
+// If no options are supplied, default options are used.
+func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
+	r := reedSolomon{
+		DataShards:   dataShards,
+		ParityShards: parityShards,
+		Shards:       dataShards + parityShards,
+		o:            defaultOptions,
+	}
+
+	for _, opt := range opts {
+		opt(&r.o)
+	}
+	if dataShards <= 0 || parityShards <= 0 {
+		return nil, ErrInvShardNum
+	}
+
+	if dataShards+parityShards > 255 {
+		return nil, ErrMaxShardNum
+	}
+
+	// Start with a Vandermonde matrix.  This matrix would work,
+	// in theory, but doesn't have the property that the data
+	// shards are unchanged after encoding.
+	vm, err := vandermonde(r.Shards, dataShards)
+	if err != nil {
+		return nil, err
+	}
+
+	// Multiply by the inverse of the top square of the matrix.
+	// This will make the top square be the identity matrix, but
+	// preserve the property that any square subset of rows is
+	// invertible.
+	// Every step reports its error instead of discarding it, so a
+	// failure surfaces here rather than as a nil matrix further down.
+	top, err := vm.SubMatrix(0, 0, dataShards, dataShards)
+	if err != nil {
+		return nil, err
+	}
+	top, err = top.Invert()
+	if err != nil {
+		return nil, err
+	}
+	r.m, err = vm.Multiply(top)
+	if err != nil {
+		return nil, err
+	}
+
+	// Inverted matrices are cached in a tree keyed by the indices
+	// of the invalid rows of the data to reconstruct.
+	// The inversion root node will have the identity matrix as
+	// its inversion matrix because it implies there are no errors
+	// with the original data.
+	r.tree = newInversionTree(dataShards, parityShards)
+
+	// The parity rows are the rows of the coding matrix that follow
+	// the first dataShards rows.
+	r.parity = make([][]byte, parityShards)
+	for i := range r.parity {
+		r.parity[i] = r.m[dataShards+i]
+	}
+
+	return &r, nil
+}
+
+// ErrTooFewShards is returned if the number of shards given to
+// Encode/Verify/Reconstruct does not match the number the encoder was
+// created with. It will also be returned from Reconstruct
+// if there were too few shards to reconstruct the missing data.
+var ErrTooFewShards = errors.New("too few shards given")
+
+// Encode computes parity for a set of data shards.
+// 'shards' holds the data shards followed by the parity shards, and its
+// length must match the number of shards given to New, otherwise
+// ErrTooFewShards is returned.
+// Each shard is a byte array, and they must all be the same non-zero
+// size; otherwise the error from checkShards is returned.
+// The parity shards will always be overwritten and the data shards
+// will remain the same.
+func (r reedSolomon) Encode(shards [][]byte) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+	if err := checkShards(shards, false); err != nil {
+		return err
+	}
+
+	// Data shards are the inputs, the remaining shards receive parity.
+	dataIn := shards[0:r.DataShards]
+	parityOut := shards[r.DataShards:]
+	r.codeSomeShards(r.parity, dataIn, parityOut, r.ParityShards, len(shards[0]))
+	return nil
+}
+
+// Verify returns true if the parity shards contain the right data.
+// The data is the same format as Encode. No data is modified.
+// ErrTooFewShards is returned when the shard count does not match the
+// encoder; an error from checkShards is returned unchanged.
+func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
+	if len(shards) != r.Shards {
+		return false, ErrTooFewShards
+	}
+	if err := checkShards(shards, false); err != nil {
+		return false, err
+	}
+
+	// Recompute parity from the data shards and compare it with the
+	// parity shards that were supplied.
+	dataIn := shards[0:r.DataShards]
+	parity := shards[r.DataShards:]
+	ok := r.checkSomeShards(r.parity, dataIn, parity, r.ParityShards, len(shards[0]))
+	return ok, nil
+}
+
+// codeSomeShards multiplies a subset of rows from a coding matrix by a
+// full set of input shards to produce some output shards.
+//
+// 'matrixRows' is the rows from the matrix to use.
+// 'inputs' is an array of byte arrays, each of which is one input shard;
+// the first r.DataShards entries are used.
+// 'outputs' are the byte arrays where the computed shards are stored.
+// 'outputCount' is the number of outputs to compute, and therefore the
+// number of matrix rows used.
+// 'byteCount' is the shard length; when it exceeds the configured minimum
+// split size and more than one goroutine is allowed, the work is handed
+// to codeSomeShardsP instead.
+func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
+	if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
+		r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
+		return
+	}
+	for col := 0; col < r.DataShards; col++ {
+		src := inputs[col]
+		for row := 0; row < outputCount; row++ {
+			coeff := matrixRows[row][col]
+			// Later inputs are accumulated on top of the result that
+			// the first input initialised.
+			if col > 0 {
+				galMulSliceXor(coeff, src, outputs[row], r.o.useSSSE3, r.o.useAVX2)
+				continue
+			}
+			galMulSlice(coeff, src, outputs[row], r.o.useSSSE3, r.o.useAVX2)
+		}
+	}
+}
+
+// codeSomeShardsP performs the same computation as codeSomeShards, but
+// splits the byte range into chunks that are processed by separate
+// goroutines. It returns once all goroutines have finished.
+func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
+	// Chunk size: an even share per goroutine, but never less than the
+	// configured minimum split size.
+	chunk := byteCount / r.o.maxGoroutines
+	if chunk < r.o.minSplitSize {
+		chunk = r.o.minSplitSize
+	}
+
+	var wg sync.WaitGroup
+	for offset := 0; offset < byteCount; offset += chunk {
+		// The final chunk is shortened to end exactly at byteCount.
+		if offset+chunk > byteCount {
+			chunk = byteCount - offset
+		}
+		wg.Add(1)
+		go func(lo, hi int) {
+			defer wg.Done()
+			for col := 0; col < r.DataShards; col++ {
+				src := inputs[col]
+				for row := 0; row < outputCount; row++ {
+					coeff := matrixRows[row][col]
+					if col > 0 {
+						galMulSliceXor(coeff, src[lo:hi], outputs[row][lo:hi], r.o.useSSSE3, r.o.useAVX2)
+						continue
+					}
+					galMulSlice(coeff, src[lo:hi], outputs[row][lo:hi], r.o.useSSSE3, r.o.useAVX2)
+				}
+			}
+		}(offset, offset+chunk)
+	}
+	wg.Wait()
+}
+
+// checkSomeShards is mostly the same as codeSomeShards, except that the
+// computed shards are written to freshly allocated buffers and then
+// compared with 'toCheck'. It returns false as soon as the comparison
+// finds a difference, and true when every computed shard matches.
+// Large inputs are delegated to checkSomeShardsP.
+func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
+	if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
+		return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
+	}
+
+	// Zeroed scratch buffers, so every input can simply be XOR-ed in.
+	calc := make([][]byte, len(toCheck))
+	for i := range calc {
+		calc[i] = make([]byte, byteCount)
+	}
+	for col := 0; col < r.DataShards; col++ {
+		src := inputs[col]
+		for row := 0; row < outputCount; row++ {
+			galMulSliceXor(matrixRows[row][col], src, calc[row], r.o.useSSSE3, r.o.useAVX2)
+		}
+	}
+
+	for i := range calc {
+		if bytes.Equal(calc[i], toCheck[i]) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+// checkSomeShardsP performs the same check as checkSomeShards, but splits
+// the byte range into chunks that are verified by separate goroutines.
+// Once any goroutine finds a mismatch, the others stop before processing
+// their next input shard. It returns true only if every chunk matched.
+func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
+	var (
+		wg   sync.WaitGroup
+		mu   sync.RWMutex // guards same
+		same = true
+	)
+
+	// Chunk size: an even share per goroutine, but never less than the
+	// configured minimum split size.
+	chunk := byteCount / r.o.maxGoroutines
+	if chunk < r.o.minSplitSize {
+		chunk = r.o.minSplitSize
+	}
+
+	for offset := 0; offset < byteCount; offset += chunk {
+		// The final chunk is shortened to end exactly at byteCount.
+		if offset+chunk > byteCount {
+			chunk = byteCount - offset
+		}
+		wg.Add(1)
+		go func(lo, size int) {
+			defer wg.Done()
+			hi := lo + size
+
+			// Zeroed scratch buffers for this chunk only.
+			calc := make([][]byte, len(toCheck))
+			for i := range calc {
+				calc[i] = make([]byte, size)
+			}
+			for col := 0; col < r.DataShards; col++ {
+				// Give up early if another goroutine already failed.
+				mu.RLock()
+				abort := !same
+				mu.RUnlock()
+				if abort {
+					return
+				}
+				src := inputs[col][lo:hi]
+				for row := 0; row < outputCount; row++ {
+					galMulSliceXor(matrixRows[row][col], src, calc[row], r.o.useSSSE3, r.o.useAVX2)
+				}
+			}
+
+			for i := range calc {
+				if bytes.Equal(calc[i], toCheck[i][lo:hi]) {
+					continue
+				}
+				mu.Lock()
+				same = false
+				mu.Unlock()
+				return
+			}
+		}(offset, chunk)
+	}
+	wg.Wait()
+	return same
+}
+
+// ErrShardNoData will be returned if there are no shards,
+// or if the length of all shards is zero. The streaming Encode and
+// Verify also return it when the very first read yields no data.
+var ErrShardNoData = errors.New("no shard data")
+
+// ErrShardSize is returned if the shard length isn't the same for all
+// shards.
+var ErrShardSize = errors.New("shard sizes does not match")
+
+// checkShards verifies that all shards have the same size.
+// When nilok is true, shards of length 0 are accepted as well.
+// ErrShardNoData is returned if every shard has length 0, and
+// ErrShardSize is returned for any shard with an unacceptable length.
+func checkShards(shards [][]byte, nilok bool) error {
+	size := shardSize(shards)
+	if size == 0 {
+		return ErrShardNoData
+	}
+	for _, shard := range shards {
+		switch {
+		case len(shard) == size:
+			// Matches the reference size.
+		case len(shard) == 0 && nilok:
+			// Missing shard, explicitly permitted by the caller.
+		default:
+			return ErrShardSize
+		}
+	}
+	return nil
+}
+
+// shardSize returns the size of a single shard: the length of the first
+// shard that is not empty, or 0 if all shards have length 0.
+func shardSize(shards [][]byte) int {
+	for i := range shards {
+		if n := len(shards[i]); n != 0 {
+			return n
+		}
+	}
+	return 0
+}
+
+// Reconstruct will recreate the missing shards, if possible.
+//
+// Given a list of shards, some of which contain data, fills in the
+// ones that don't have data.
+//
+// The length of the array must be equal to Shards.
+// You indicate that a shard is missing by setting it to nil
+// (any shard of length 0 is treated as missing).
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned. Errors from checkShards, from
+// inverting the decode matrix and from caching it are returned unchanged.
+//
+// The reconstructed shard set is complete, but integrity is not verified.
+// Use the Verify function to check if data set is ok.
+func (r reedSolomon) Reconstruct(shards [][]byte) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+	// Check arguments.
+	err := checkShards(shards, true)
+	if err != nil {
+		return err
+	}
+
+	// Note: from here on the local variable shadows the shardSize function.
+	shardSize := shardSize(shards)
+
+	// Quick check: are all of the shards present?  If so, there's
+	// nothing to do.
+	numberPresent := 0
+	for i := 0; i < r.Shards; i++ {
+		if len(shards[i]) != 0 {
+			numberPresent++
+		}
+	}
+	if numberPresent == r.Shards {
+		// Cool.  All of the shards have data.  We don't
+		// need to do anything.
+		return nil
+	}
+
+	// More complete sanity check
+	if numberPresent < r.DataShards {
+		return ErrTooFewShards
+	}
+
+	// Pull out an array holding just the shards that
+	// correspond to the rows of the submatrix.  These shards
+	// will be the input to the decoding process that re-creates
+	// the missing data shards.
+	//
+	// Also, create an array of indices of the valid rows we do have
+	// and the invalid rows we don't have up until we have enough valid rows.
+	subShards := make([][]byte, r.DataShards)
+	validIndices := make([]int, r.DataShards)
+	invalidIndices := make([]int, 0)
+	subMatrixRow := 0
+	for matrixRow := 0; matrixRow < r.Shards && subMatrixRow < r.DataShards; matrixRow++ {
+		if len(shards[matrixRow]) != 0 {
+			subShards[subMatrixRow] = shards[matrixRow]
+			validIndices[subMatrixRow] = matrixRow
+			subMatrixRow++
+		} else {
+			invalidIndices = append(invalidIndices, matrixRow)
+		}
+	}
+
+	// Attempt to get the cached inverted matrix out of the tree
+	// based on the indices of the invalid rows.
+	dataDecodeMatrix := r.tree.GetInvertedMatrix(invalidIndices)
+
+	// If the inverted matrix isn't cached in the tree yet we must
+	// construct it ourselves and insert it into the tree for the
+	// future.  In this way the inversion tree is lazily loaded.
+	if dataDecodeMatrix == nil {
+		// Pull out the rows of the matrix that correspond to the
+		// shards that we have and build a square matrix.  This
+		// matrix could be used to generate the shards that we have
+		// from the original data.
+		// NOTE(review): the error from newMatrix is discarded here.
+		subMatrix, _ := newMatrix(r.DataShards, r.DataShards)
+		for subMatrixRow, validIndex := range validIndices {
+			for c := 0; c < r.DataShards; c++ {
+				subMatrix[subMatrixRow][c] = r.m[validIndex][c]
+			}
+		}
+		// Invert the matrix, so we can go from the encoded shards
+		// back to the original data.  Then pull out the row that
+		// generates the shard that we want to decode.  Note that
+		// since this matrix maps back to the original data, it can
+		// be used to create a data shard, but not a parity shard.
+		dataDecodeMatrix, err = subMatrix.Invert()
+		if err != nil {
+			return err
+		}
+
+		// Cache the inverted matrix in the tree for future use keyed on the
+		// indices of the invalid rows.
+		err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.Shards)
+		if err != nil {
+			return err
+		}
+	}
+
+	// Re-create any data shards that were missing.
+	//
+	// The input to the coding is all of the shards we actually
+	// have, and the output is the missing data shards.  The computation
+	// is done using the special decode matrix we just built.
+	//
+	// outputs and matrixRows are sized by ParityShards and reused for the
+	// parity pass below.
+	outputs := make([][]byte, r.ParityShards)
+	matrixRows := make([][]byte, r.ParityShards)
+	outputCount := 0
+
+	for iShard := 0; iShard < r.DataShards; iShard++ {
+		if len(shards[iShard]) == 0 {
+			shards[iShard] = make([]byte, shardSize)
+			outputs[outputCount] = shards[iShard]
+			matrixRows[outputCount] = dataDecodeMatrix[iShard]
+			outputCount++
+		}
+	}
+	r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize)
+
+	// Now that we have all of the data shards intact, we can
+	// compute any of the parity that is missing.
+	//
+	// The input to the coding is ALL of the data shards, including
+	// any that we just calculated.  The output is whichever of the
+	// parity shards were missing.
+	outputCount = 0
+	for iShard := r.DataShards; iShard < r.Shards; iShard++ {
+		if len(shards[iShard]) == 0 {
+			shards[iShard] = make([]byte, shardSize)
+			outputs[outputCount] = shards[iShard]
+			matrixRows[outputCount] = r.parity[iShard-r.DataShards]
+			outputCount++
+		}
+	}
+	r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize)
+	return nil
+}
+
+// ErrShortData will be returned by Split() if there isn't enough data
+// to fill the number of shards, and by Join() if the shards hold less
+// data than the requested output size.
+var ErrShortData = errors.New("not enough data to fill the number of requested shards")
+
+// Split a data slice into the number of shards given to the encoder,
+// and create empty parity shards.
+//
+// The data will be split into equally sized shards.
+// If the data size isn't divisible by the number of data shards,
+// the last data shard will contain extra zeros.
+//
+// There must be at least 1 byte otherwise ErrShortData will be
+// returned.
+//
+// The input is extended with zero padding using append, so the returned
+// shards share memory with the input slice whenever its capacity is large
+// enough to hold the padding. You should therefore not modify the data of
+// the input slice afterwards.
+func (r reedSolomon) Split(data []byte) ([][]byte, error) {
+	if len(data) == 0 {
+		return nil, ErrShortData
+	}
+	// Bytes per shard, rounded up so all data fits in the data shards.
+	perShard := (len(data) + r.DataShards - 1) / r.DataShards
+
+	// Zero-pad so that the buffer covers data and parity shards alike.
+	total := r.Shards * perShard
+	data = append(data, make([]byte, total-len(data))...)
+
+	// Carve the buffer into consecutive, equally sized shards.
+	shards := make([][]byte, r.Shards)
+	for i := range shards {
+		lo := i * perShard
+		shards[i] = data[lo : lo+perShard]
+	}
+	return shards, nil
+}
+
+// ErrReconstructRequired is returned by Join if a data shard it needs is
+// nil, meaning a reconstruction is required before you can successfully
+// join the shards.
+var ErrReconstructRequired = errors.New("reconstruction required as one or more required data shards are nil")
+
+// Join the shards and write the data segment to dst.
+//
+// Only the data shards are considered.
+// You must supply the exact output size you want.
+//
+// If there are too few shards given, ErrTooFewShards will be returned.
+// If the total data size is less than outSize, ErrShortData will be returned.
+// If one or more required data shards are nil, ErrReconstructRequired will be returned.
+// An error from dst.Write is returned unchanged.
+func (r reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error {
+	// Only the leading data shards take part.
+	if len(shards) < r.DataShards {
+		return ErrTooFewShards
+	}
+	shards = shards[:r.DataShards]
+
+	// Count the available bytes, stopping once outSize is covered.
+	available := 0
+	for i := range shards {
+		if shards[i] == nil {
+			return ErrReconstructRequired
+		}
+		available += len(shards[i])
+		if available >= outSize {
+			break
+		}
+	}
+	if available < outSize {
+		return ErrShortData
+	}
+
+	// Emit whole shards until the remainder fits inside a single shard.
+	remaining := outSize
+	for i := range shards {
+		shard := shards[i]
+		if remaining < len(shard) {
+			_, err := dst.Write(shard[:remaining])
+			return err
+		}
+		n, err := dst.Write(shard)
+		if err != nil {
+			return err
+		}
+		remaining -= n
+	}
+	return nil
+}
--- a/vendor/github.com/klauspost/reedsolomon/streaming.go
+++ b/vendor/github.com/klauspost/reedsolomon/streaming.go
@@ -0,0 +1,575 @@
+/**
+ * Reed-Solomon Coding over 8-bit values.
+ *
+ * Copyright 2015, Klaus Post
+ * Copyright 2015, Backblaze, Inc.
+ */
+
+package reedsolomon
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"sync"
+)
+
+// StreamEncoder is an interface to encode Reed-Solomon parity sets for your data.
+// It provides a fully streaming interface, and processes data in blocks of up to 4MB.
+//
+// For small shard sizes, 10MB and below, it is recommended to use the in-memory interface,
+// since the streaming interface has a start up overhead.
+//
+// For all operations, readers and writers should not assume any order/size of
+// individual reads/writes.
+//
+// For usage examples, see "stream-encoder.go" and "streamdecoder.go" in the examples
+// folder.
+type StreamEncoder interface {
+	// Encode computes parity shards for a set of data shards.
+	//
+	// Input is 'data' containing readers for the data shards and 'parity'
+	// containing an io.Writer for each parity shard.
+	//
+	// The number of shards must match the number given to NewStream().
+	//
+	// Each reader must supply the same number of bytes.
+	//
+	// The parity shards will be written to the writer.
+	// The number of bytes written will match the input size.
+	//
+	// If a data stream returns an error, a StreamReadError type error
+	// will be returned. If a parity writer returns an error, a
+	// StreamWriteError will be returned.
+	Encode(data []io.Reader, parity []io.Writer) error
+
+	// Verify returns true if the parity shards contain correct data.
+	//
+	// The number of shards must match the total number of data+parity shards
+	// given to NewStream().
+	//
+	// Each reader must supply the same number of bytes.
+	// If a shard stream returns an error, a StreamReadError type error
+	// will be returned.
+	Verify(shards []io.Reader) (bool, error)
+
+	// Reconstruct will recreate the missing shards if possible.
+	//
+	// Given a list of valid shards (to read) and invalid shards (to write)
+	//
+	// You indicate that a shard is missing by setting it to nil in the 'valid'
+	// slice and at the same time setting a non-nil writer in "fill".
+	// An index cannot contain both non-nil 'valid' and 'fill' entry.
+	// If both are provided 'ErrReconstructMismatch' is returned.
+	//
+	// If there are too few shards to reconstruct the missing
+	// ones, ErrTooFewShards will be returned.
+	//
+	// The reconstructed shard set is complete, but integrity is not verified.
+	// Use the Verify function to check if data set is ok.
+	Reconstruct(valid []io.Reader, fill []io.Writer) error
+
+	// Split an input stream into the number of shards given to the encoder.
+	//
+	// The data will be split into equally sized shards.
+	// If the data size isn't divisible by the number of shards,
+	// the last shard will contain extra zeros.
+	//
+	// You must supply the total size of your input.
+	// 'ErrShortData' will be returned if it is unable to retrieve the
+	// number of bytes indicated.
+	Split(data io.Reader, dst []io.Writer, size int64) (err error)
+
+	// Join the shards and write the data segment to dst.
+	//
+	// Only the data shards are considered.
+	//
+	// You must supply the exact output size you want.
+	// If there are too few shards given, ErrTooFewShards will be returned.
+	// If the total data size is less than outSize, ErrShortData will be returned.
+	Join(dst io.Writer, shards []io.Reader, outSize int64) error
+}
+
+// StreamReadError is returned when a read error is encountered
+// that relates to a supplied stream.
+// Its Stream field lets you find out which reader has failed.
+type StreamReadError struct {
+	Err    error // The underlying read error
+	Stream int   // Index of the stream on which the error occurred
+}
+
+// Error formats the stream number and the underlying error as a string.
+func (e StreamReadError) Error() string {
+	const format = "error reading stream %d: %s"
+	return fmt.Sprintf(format, e.Stream, e.Err)
+}
+
+// String returns the same text as Error.
+func (e StreamReadError) String() string {
+	msg := e.Error()
+	return msg
+}
+
+// StreamWriteError is returned when a write error is encountered
+// that relates to a supplied stream.
+// Its Stream field lets you find out which writer has failed.
+type StreamWriteError struct {
+	Err    error // The underlying write error
+	Stream int   // Index of the stream on which the error occurred
+}
+
+// Error formats the stream number and the underlying error as a string.
+func (e StreamWriteError) Error() string {
+	const format = "error writing stream %d: %s"
+	return fmt.Sprintf(format, e.Stream, e.Err)
+}
+
+// String returns the same text as Error.
+func (e StreamWriteError) String() string {
+	msg := e.Error()
+	return msg
+}
+
+// rsStream wraps a reedSolomon encoder to provide the streaming
+// interface. Construct it using NewStream() or NewStreamC().
+//
+// Data is processed in blocks of bs bytes per shard. readShards and
+// writeShards are the functions used to fill and flush one block; the
+// constructors select either the sequential or the concurrent variants.
+// NOTE(review): creads and cwrites are not assigned by the constructors
+// visible in this file.
+type rsStream struct {
+	r  *reedSolomon
+	bs int // Block size
+	// Shard reader
+	readShards func(dst [][]byte, in []io.Reader) error
+	// Shard writer
+	writeShards func(out []io.Writer, in [][]byte) error
+	creads      bool
+	cwrites     bool
+}
+
+// NewStream creates a new streaming encoder and initializes it to
+// the number of data shards and parity shards that
+// you want to use. You can reuse this encoder.
+// The shard counts and options are validated by New, and any error it
+// reports is returned unchanged.
+// Shards are read and written sequentially, in blocks of 4MB.
+func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) {
+	enc, err := New(dataShards, parityShards, o...)
+	if err != nil {
+		return nil, err
+	}
+	stream := &rsStream{
+		r:           enc.(*reedSolomon),
+		bs:          4 << 20,
+		readShards:  readShards,
+		writeShards: writeShards,
+	}
+	return stream, nil
+}
+
+// NewStreamC creates a new streaming encoder and initializes it to
+// the number of data shards and parity shards given.
+//
+// This functions as 'NewStream', but allows you to enable CONCURRENT
+// reads (conReads) and CONCURRENT writes (conWrites) of the shards.
+func NewStreamC(dataShards, parityShards int, conReads, conWrites bool, o ...Option) (StreamEncoder, error) {
+	enc, err := New(dataShards, parityShards, o...)
+	if err != nil {
+		return nil, err
+	}
+
+	// Start from the sequential helpers and upgrade on request.
+	reader, writer := readShards, writeShards
+	if conReads {
+		reader = cReadShards
+	}
+	if conWrites {
+		writer = cWriteShards
+	}
+
+	stream := &rsStream{
+		r:           enc.(*reedSolomon),
+		bs:          4 << 20,
+		readShards:  reader,
+		writeShards: writer,
+	}
+	return stream, nil
+}
+
+// createSlice allocates n independent, zeroed byte slices that each
+// have the given length.
+func createSlice(n, length int) [][]byte {
+	shards := make([][]byte, n)
+	for i := 0; i < len(shards); i++ {
+		shards[i] = make([]byte, length)
+	}
+	return shards
+}
+
+// Encode computes parity shards for a set of data shards.
+//
+// 'data' contains the readers for the data shards and 'parity' the
+// writers for the parity shards.
+//
+// The number of readers and writers must match the shard counts given to
+// NewStream(), otherwise ErrTooFewShards is returned.
+//
+// Each reader must supply the same number of bytes.
+//
+// The parity shards will be written to the writers.
+// The number of bytes written will match the input size.
+//
+// If no data could be read at all, ErrShardNoData is returned.
+// If a data stream returns an error, a StreamReadError type error
+// will be returned. If a parity writer returns an error, a
+// StreamWriteError will be returned.
+func (r rsStream) Encode(data []io.Reader, parity []io.Writer) error {
+	if len(data) != r.r.DataShards || len(parity) != r.r.ParityShards {
+		return ErrTooFewShards
+	}
+
+	// One block-sized buffer per shard; in and out are views of all.
+	all := createSlice(r.r.Shards, r.bs)
+	in, out := all[:r.r.DataShards], all[r.r.DataShards:]
+
+	for read := 0; ; {
+		if err := r.readShards(in, data); err != nil {
+			if err != io.EOF {
+				return err
+			}
+			// End of input: fine, unless nothing was ever read.
+			if read == 0 {
+				return ErrShardNoData
+			}
+			return nil
+		}
+
+		// Match the parity buffers to the amount of data just read.
+		n := shardSize(in)
+		out = trimShards(out, n)
+		read += n
+
+		if err := r.r.Encode(all); err != nil {
+			return err
+		}
+		if err := r.writeShards(parity, out); err != nil {
+			return err
+		}
+	}
+}
+
+// trimShards re-slices every non-nil shard of in to exactly size bytes,
+// so that they all have the same length. Entries shorter than size
+// afterwards (nil entries, when size > 0) are set to nil.
+// The slice is modified in place and returned.
+func trimShards(in [][]byte, size int) [][]byte {
+	for i, shard := range in {
+		if shard != nil {
+			shard = shard[:size]
+		}
+		if len(shard) < size {
+			shard = nil
+		}
+		in[i] = shard
+	}
+	return in
+}
+
+// readShards fills each dst buffer from the matching reader in 'in', one
+// reader after the other. For a nil reader the dst entry is set to nil.
+//
+// Buffers whose reader ended early are shortened to the bytes read, and
+// all such readers must have delivered the same number of bytes,
+// otherwise ErrShardSize is returned. io.EOF is returned when the readers
+// that ended delivered no data at all. Any other read error is wrapped in
+// a StreamReadError. It panics if in and dst differ in length.
+func readShards(dst [][]byte, in []io.Reader) error {
+	if len(in) != len(dst) {
+		panic("internal error: in and dst size does not match")
+	}
+	size := -1 // bytes delivered by readers that hit the end; -1 = none yet
+	for i, src := range in {
+		if src == nil {
+			dst[i] = nil
+			continue
+		}
+		n, err := io.ReadFull(src, dst[i])
+		if err == nil {
+			continue
+		}
+		// ReadFull reports io.EOF only if no bytes were read, and
+		// io.ErrUnexpectedEOF if the stream ended part-way through.
+		if err != io.EOF && err != io.ErrUnexpectedEOF {
+			return StreamReadError{Err: err, Stream: i}
+		}
+		if size < 0 {
+			size = n
+		} else if size != n {
+			// Shard sizes must match.
+			return ErrShardSize
+		}
+		dst[i] = dst[i][:n]
+	}
+	if size == 0 {
+		return io.EOF
+	}
+	return nil
+}
+
+// writeShards writes each shard of 'in' to the matching writer in 'out',
+// one after the other. Nil writers are skipped.
+// A write error, or a write that accepted fewer bytes than the shard
+// holds (reported as io.ErrShortWrite), is returned as a
+// StreamWriteError. It panics if out and in differ in length.
+func writeShards(out []io.Writer, in [][]byte) error {
+	if len(out) != len(in) {
+		panic("internal error: in and out size does not match")
+	}
+	for i, shard := range in {
+		w := out[i]
+		if w == nil {
+			continue
+		}
+		n, err := w.Write(shard)
+		switch {
+		case err != nil:
+			return StreamWriteError{Err: err, Stream: i}
+		case n != len(shard):
+			return StreamWriteError{Err: io.ErrShortWrite, Stream: i}
+		}
+	}
+	return nil
+}
+
+// readResult is the outcome of one concurrent shard read in cReadShards:
+// n is the index of the stream, size is the number of bytes read and
+// err is the error returned by the read.
+type readResult struct {
+	n    int
+	size int
+	err  error
+}
+
+// cReadShards reads shards concurrently, using one goroutine per non-nil
+// reader, and evaluates the results once all reads have finished.
+// For a nil reader the dst entry is set to nil.
+//
+// Buffers whose reader ended early are shortened to the bytes read, and
+// all such readers must have delivered the same number of bytes,
+// otherwise ErrShardSize is returned. io.EOF is returned when the readers
+// that ended delivered no data at all. Any other read error is wrapped in
+// a StreamReadError. It panics if in and dst differ in length.
+func cReadShards(dst [][]byte, in []io.Reader) error {
+	if len(in) != len(dst) {
+		panic("internal error: in and dst size does not match")
+	}
+
+	// Buffered for one result per reader, so no goroutine ever blocks.
+	results := make(chan readResult, len(in))
+	var wg sync.WaitGroup
+	for i := range in {
+		if in[i] == nil {
+			dst[i] = nil
+			continue
+		}
+		wg.Add(1)
+		go func(idx int) {
+			defer wg.Done()
+			n, err := io.ReadFull(in[idx], dst[idx])
+			results <- readResult{n: idx, size: n, err: err}
+		}(i)
+	}
+	wg.Wait()
+	close(results)
+
+	size := -1 // bytes delivered by readers that hit the end; -1 = none yet
+	for res := range results {
+		if res.err == nil {
+			continue
+		}
+		// ReadFull reports io.EOF only if no bytes were read, and
+		// io.ErrUnexpectedEOF if the stream ended part-way through.
+		if res.err != io.EOF && res.err != io.ErrUnexpectedEOF {
+			return StreamReadError{Err: res.err, Stream: res.n}
+		}
+		if size < 0 {
+			size = res.size
+		} else if size != res.size {
+			// Shard sizes must match.
+			return ErrShardSize
+		}
+		dst[res.n] = dst[res.n][:res.size]
+	}
+	if size == 0 {
+		return io.EOF
+	}
+	return nil
+}
+
+// cWriteShards writes shards concurrently, using one goroutine per shard,
+// and waits for all of them before reporting a result.
+// Nil writers are skipped. A write error, or a write that accepted fewer
+// bytes than the shard holds (reported as io.ErrShortWrite), is returned
+// as a StreamWriteError; if several writers fail, one of the errors is
+// returned. It panics if out and in differ in length.
+func cWriteShards(out []io.Writer, in [][]byte) error {
+	if len(out) != len(in) {
+		panic("internal error: in and out size does not match")
+	}
+
+	// Buffered for one message per writer, so no goroutine ever blocks.
+	errs := make(chan error, len(out))
+	var wg sync.WaitGroup
+	for i := range in {
+		wg.Add(1)
+		go func(idx int) {
+			defer wg.Done()
+			w := out[idx]
+			if w == nil {
+				errs <- nil
+				return
+			}
+			n, err := w.Write(in[idx])
+			switch {
+			case err != nil:
+				errs <- StreamWriteError{Err: err, Stream: idx}
+			case n != len(in[idx]):
+				errs <- StreamWriteError{Err: io.ErrShortWrite, Stream: idx}
+			}
+		}(i)
+	}
+	wg.Wait()
+	close(errs)
+
+	for err := range errs {
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Verify returns true if the parity shards contain correct data.
+//
+// The number of shards must match the total number of data+parity shards
+// given to NewStream(); otherwise ErrTooFewShards is returned.
+//
+// Each reader must supply the same number of bytes.
+// If a shard stream returns an error, a StreamReadError type error
+// will be returned. ErrShardNoData is returned when the streams end
+// before any data was read.
+func (r rsStream) Verify(shards []io.Reader) (bool, error) {
+	if len(shards) != r.r.Shards {
+		return false, ErrTooFewShards
+	}
+
+	buf := createSlice(r.r.Shards, r.bs)
+	total := 0
+	for {
+		// Read the next block of every shard; EOF ends the verification.
+		switch err := r.readShards(buf, shards); {
+		case err == io.EOF:
+			if total == 0 {
+				return false, ErrShardNoData
+			}
+			return true, nil
+		case err != nil:
+			return false, err
+		}
+		total += shardSize(buf)
+		ok, err := r.r.Verify(buf)
+		if err != nil || !ok {
+			return ok, err
+		}
+	}
+}
+
+// ErrReconstructMismatch is returned by Reconstruct if you supply
+// both a non-nil "valid" reader and a non-nil "fill" writer on the same index.
+// In that case it is impossible to tell whether you consider the shard valid
+// or would like to have it reconstructed.
+var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutually exclusive")
+
+// Reconstruct will recreate the missing shards if possible.
+//
+// It takes a list of valid shards (to read from) and a list of missing
+// shards (to write to). Both lists must have exactly one entry per shard,
+// otherwise ErrTooFewShards is returned.
+//
+// You indicate that a shard is missing by setting it to nil in the 'valid'
+// slice and at the same time setting a non-nil writer in "fill".
+// An index cannot contain both a non-nil 'valid' and 'fill' entry;
+// ErrReconstructMismatch is returned if one does.
+//
+// If there are too few shards to reconstruct the missing
+// ones, ErrTooFewShards will be returned. ErrShardNoData is returned
+// when the valid streams end before any data was read.
+//
+// The reconstructed shard set is complete, but integrity is not verified.
+// Use the Verify function to check if data set is ok.
+func (r rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error {
+	if len(valid) != r.r.Shards || len(fill) != r.r.Shards {
+		return ErrTooFewShards
+	}
+
+	buf := createSlice(r.r.Shards, r.bs)
+	for idx, src := range valid {
+		if src != nil && fill[idx] != nil {
+			return ErrReconstructMismatch
+		}
+	}
+
+	total := 0
+	for {
+		// Read the next block from every valid stream; EOF ends the loop.
+		switch err := r.readShards(buf, valid); {
+		case err == io.EOF:
+			if total == 0 {
+				return ErrShardNoData
+			}
+			return nil
+		case err != nil:
+			return err
+		}
+		total += shardSize(buf)
+		buf = trimShards(buf, shardSize(buf))
+
+		if err := r.r.Reconstruct(buf); err != nil {
+			return err
+		}
+		if err := r.writeShards(fill, buf); err != nil {
+			return err
+		}
+	}
+}
+
+// Join the shards and write the data segment to dst.
+//
+// Only the data shards are considered; any parity shards passed in are
+// ignored.
+//
+// You must supply the exact output size you want.
+// If there are too few shards given, ErrTooFewShards will be returned.
+// If a data shard reader is nil, a StreamReadError wrapping ErrShardNoData
+// is returned.
+// If the total data size is less than outSize, ErrShortData will be returned.
+func (r rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error {
+	dataShards := r.r.DataShards
+
+	// Do we have enough shards?
+	if len(shards) < dataShards {
+		return ErrTooFewShards
+	}
+
+	// Drop the parity shards, if any, and make sure no data shard is missing.
+	shards = shards[:dataShards]
+	for idx, shard := range shards {
+		if shard == nil {
+			return StreamReadError{Err: ErrShardNoData, Stream: idx}
+		}
+	}
+
+	// Concatenate the data shards and copy exactly outSize bytes to dst.
+	written, err := io.CopyN(dst, io.MultiReader(shards...), outSize)
+	switch {
+	case err == io.EOF:
+		return ErrShortData
+	case err != nil:
+		return err
+	case written != outSize:
+		return ErrShortData
+	}
+	return nil
+}
+
+// Split an input stream into the number of data shards given to the encoder.
+//
+// The data will be split into equally sized shards.
+// If the data size isn't divisible by the number of shards,
+// the last shard will contain extra zeros.
+//
+// dst must contain exactly one non-nil writer per data shard; otherwise
+// ErrInvShardNum or a StreamWriteError wrapping ErrShardNoData is returned.
+//
+// You must supply the total size of your input.
+// 'ErrShortData' will be returned if size is not positive or if it is
+// unable to retrieve the number of bytes indicated.
+func (r rsStream) Split(data io.Reader, dst []io.Writer, size int64) error {
+	if size <= 0 {
+		return ErrShortData
+	}
+	if len(dst) != r.r.DataShards {
+		return ErrInvShardNum
+	}
+
+	for i := range dst {
+		if dst[i] == nil {
+			return StreamWriteError{Err: ErrShardNoData, Stream: i}
+		}
+	}
+
+	// Calculate number of bytes per shard (size divided by the number of
+	// data shards, rounded up).
+	dataShards := int64(r.r.DataShards)
+	perShard := (size + dataShards - 1) / dataShards
+
+	// Pad data to DataShards*perShard. Only the data shards are written
+	// here, so padding any further (e.g. up to the total shard count) would
+	// let a reader that delivers fewer than size bytes be silently
+	// zero-filled instead of being reported as ErrShortData.
+	padding := make([]byte, dataShards*perShard-size)
+	data = io.MultiReader(data, bytes.NewBuffer(padding))
+
+	// Split into equal-length shards and copy.
+	for i := range dst {
+		n, err := io.CopyN(dst[i], data, perShard)
+		if err != io.EOF && err != nil {
+			return err
+		}
+		// io.CopyN reports io.EOF when the source ran dry early; that is
+		// surfaced as short data rather than as a raw EOF.
+		if n != perShard {
+			return ErrShortData
+		}
+	}
+
+	return nil
+}
--- a/vendor/github.com/xtaci/kcp-go/.gitignore
+++ b/vendor/github.com/xtaci/kcp-go/.gitignore
@@ -0,0 +1,24 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
--- a/vendor/github.com/xtaci/kcp-go/.travis.yml
+++ b/vendor/github.com/xtaci/kcp-go/.travis.yml
@@ -0,0 +1,15 @@
+language: go
+go:
+    - 1.8
+
+before_install:
+    - go get -t -v ./...
+
+install:
+    - go get github.com/xtaci/kcp-go
+
+script:
+    - go test -coverprofile=coverage.txt -covermode=atomic -bench .
+
+after_success:
+    - bash <(curl -s https://codecov.io/bash)
--- a/vendor/github.com/xtaci/kcp-go/LICENSE
+++ b/vendor/github.com/xtaci/kcp-go/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Daniel Fu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- a/vendor/github.com/xtaci/kcp-go/README.md
+++ b/vendor/github.com/xtaci/kcp-go/README.md
@@ -0,0 +1,171 @@
+<img src="kcp-go.png" alt="kcp-go" height="50px" />
+
+
+[![GoDoc][1]][2] [![Powered][9]][10] [![MIT licensed][11]][12] [![Build Status][3]][4] [![Go Report Card][5]][6] [![Coverage Status][7]][8]
+
+[1]: https://godoc.org/github.com/xtaci/kcp-go?status.svg
+[2]: https://godoc.org/github.com/xtaci/kcp-go
+[3]: https://travis-ci.org/xtaci/kcp-go.svg?branch=master
+[4]: https://travis-ci.org/xtaci/kcp-go
+[5]: https://goreportcard.com/badge/github.com/xtaci/kcp-go
+[6]: https://goreportcard.com/report/github.com/xtaci/kcp-go
+[7]: https://codecov.io/gh/xtaci/kcp-go/branch/master/graph/badge.svg
+[8]: https://codecov.io/gh/xtaci/kcp-go
+[9]: https://img.shields.io/badge/KCP-Powered-blue.svg
+[10]: https://github.com/skywind3000/kcp
+[11]: https://img.shields.io/badge/license-MIT-blue.svg
+[12]: LICENSE
+
+## Introduction
+
+**kcp-go** is a **Production-Grade Reliable-UDP** library for [golang](https://golang.org/). 
+
+It provides **fast, ordered and error-checked** delivery of streams over **UDP** packets, and has been well tested with the open-source project [kcptun](https://github.com/xtaci/kcptun). Millions of devices (from low-end MIPS routers to high-end servers) are currently running **kcp-go**, including applications like **online games, live broadcasting, file synchronization and network acceleration**.
+
+[Latest Release](https://github.com/xtaci/kcp-go/releases)
+
+## Features
+
+1. Optimized for **Realtime Multiplayer Games, Audio/Video Streaming**.
+1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with language specific optimizations.
+1. **Cache friendly** and **Memory optimized** design, offers extremely **High Performance** core.
+1. Compatible with [net.Conn](https://golang.org/pkg/net/#Conn) and [net.Listener](https://golang.org/pkg/net/#Listener), easy to use.
+1. [FEC(Forward Error Correction)](https://en.wikipedia.org/wiki/Forward_error_correction) Support with [Reed-Solomon Codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction)
+1. Packet level encryption support with [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard), [TEA](https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm), [3DES](https://en.wikipedia.org/wiki/Triple_DES), [Blowfish](https://en.wikipedia.org/wiki/Blowfish_(cipher)), [Cast5](https://en.wikipedia.org/wiki/CAST-128), [Salsa20]( https://en.wikipedia.org/wiki/Salsa20), etc. in [CFB](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_Feedback_.28CFB.29) mode.
+1. **O(1) goroutines** created for the entire server application, minimized goroutine context switch.
+
+## Conventions
+
+Control messages like **SYN/FIN/RST** in TCP **are not defined** in KCP, you need some **keepalive/heartbeat mechanism** in the application-level. A real world example is to use some **multiplexing** protocol over session, such as [smux](https://github.com/xtaci/smux)(with embedded keepalive mechanism), see [kcptun](https://github.com/xtaci/kcptun) for example.
+
+## Documentation
+
+For complete documentation, see the associated [Godoc](https://godoc.org/github.com/xtaci/kcp-go).
+
+## Specification
+
+<img src="frame.png" alt="Frame Format" height="109px" />
+
+```
+-----------------+
+| SESSION         |
+-----------------+
+| KCP(ARQ)        |
+-----------------+
+| FEC(OPTIONAL)   |
+-----------------+
+| CRYPTO(OPTIONAL)|
+-----------------+
+| UDP(PACKET)     |
+-----------------+
+| IP              |
+-----------------+
+| LINK            |
+-----------------+
+| PHY             |
+-----------------+
+(LAYER MODEL OF KCP-GO)
+```
+
+
+## Usage
+
+Client:   [full demo](https://github.com/xtaci/kcptun/blob/master/client/main.go)
+```go
+kcpconn, err := kcp.DialWithOptions("192.168.0.1:10000", nil, 10, 3)
+```
+Server:   [full demo](https://github.com/xtaci/kcptun/blob/master/server/main.go)
+```go
+lis, err := kcp.ListenWithOptions(":10000", nil, 10, 3)
+```
+
+## Performance
+```
+  Model Name:	MacBook Pro
+  Model Identifier:	MacBookPro12,1
+  Processor Name:	Intel Core i5
+  Processor Speed:	2.7 GHz
+  Number of Processors:	1
+  Total Number of Cores:	2
+  L2 Cache (per Core):	256 KB
+  L3 Cache:	3 MB
+  Memory:	8 GB
+```
+```
+$ go test -v -run=^$ -bench .
+beginning tests, encryption:salsa20, fec:10/3
+BenchmarkAES128-4          	  200000	      8256 ns/op	 363.33 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAES192-4          	  200000	      9153 ns/op	 327.74 MB/s	       0 B/op	       0 allocs/op
+BenchmarkAES256-4          	  200000	     10079 ns/op	 297.64 MB/s	       0 B/op	       0 allocs/op
+BenchmarkTEA-4             	  100000	     18643 ns/op	 160.91 MB/s	       0 B/op	       0 allocs/op
+BenchmarkXOR-4             	 5000000	       316 ns/op	9486.46 MB/s	       0 B/op	       0 allocs/op
+BenchmarkBlowfish-4        	   50000	     35643 ns/op	  84.17 MB/s	       0 B/op	       0 allocs/op
+BenchmarkNone-4            	30000000	        56.2 ns/op	53371.83 MB/s	       0 B/op	       0 allocs/op
+BenchmarkCast5-4           	   30000	     44744 ns/op	  67.05 MB/s	       0 B/op	       0 allocs/op
+Benchmark3DES-4            	    2000	    639839 ns/op	   4.69 MB/s	       2 B/op	       0 allocs/op
+BenchmarkTwofish-4         	   30000	     43368 ns/op	  69.17 MB/s	       0 B/op	       0 allocs/op
+BenchmarkXTEA-4            	   30000	     57673 ns/op	  52.02 MB/s	       0 B/op	       0 allocs/op
+BenchmarkSalsa20-4         	  300000	      3917 ns/op	 765.80 MB/s	       0 B/op	       0 allocs/op
+BenchmarkFlush-4           	10000000	       226 ns/op	       0 B/op	       0 allocs/op
+BenchmarkEchoSpeed4K-4     	    5000	    300030 ns/op	  13.65 MB/s	    5672 B/op	     177 allocs/op
+BenchmarkEchoSpeed64K-4    	     500	   3202335 ns/op	  20.47 MB/s	   73295 B/op	    2198 allocs/op
+BenchmarkEchoSpeed512K-4   	      50	  24926924 ns/op	  21.03 MB/s	  659339 B/op	   17602 allocs/op
+BenchmarkEchoSpeed1M-4     	      20	  64857821 ns/op	  16.17 MB/s	 1772437 B/op	   42869 allocs/op
+BenchmarkSinkSpeed4K-4     	   30000	     50230 ns/op	  81.54 MB/s	    2058 B/op	      48 allocs/op
+BenchmarkSinkSpeed64K-4    	    2000	    648718 ns/op	 101.02 MB/s	   31165 B/op	     687 allocs/op
+BenchmarkSinkSpeed256K-4   	     300	   4635905 ns/op	 113.09 MB/s	  286229 B/op	    5516 allocs/op
+BenchmarkSinkSpeed1M-4     	     200	   9566933 ns/op	 109.60 MB/s	  463771 B/op	   10701 allocs/op
+PASS
+ok  	_/Users/xtaci/.godeps/src/github.com/xtaci/kcp-go	39.689s
+```
+
+## Design Considerations
+
+1. slice vs. container/list
+
+`kcp.flush()` loops through the send queue for retransmission checking for every 20ms(interval).
+
+I've written a benchmark comparing a sequential loop through a *slice* and a *container/list* here:
+
+https://github.com/xtaci/notes/blob/master/golang/benchmark2/cachemiss_test.go
+
+```
+BenchmarkLoopSlice-4   	2000000000	         0.39 ns/op
+BenchmarkLoopList-4    	100000000	        54.6 ns/op
+```
+
+List structure introduces **heavy cache misses** compared to slice which owns better **locality**, 5000 connections with 32 window size and 20ms interval will cost 6us/0.03%(cpu) using slice, and 8.7ms/43.5%(cpu) for list for each `kcp.flush()`.
+
+2. Timing accuracy vs. syscall clock_gettime
+
+Timing is **critical** to **RTT estimator**, inaccurate timing introduces false retransmissions in KCP, but calling `time.Now()` costs 42 cycles(10.5ns on 4GHz CPU, 15.6ns on my MacBook Pro 2.7GHz), the benchmark for time.Now():
+
+https://github.com/xtaci/notes/blob/master/golang/benchmark2/syscall_test.go
+
+```
+BenchmarkNow-4         	100000000	        15.6 ns/op
+```
+
+In kcp-go, after each `kcp.output()` function call, the current time will be updated upon return, and each `kcp.flush()` will get the current time once. Most of the time, 5000 connections cost 5000 * 15.6ns = 78us (when no packet needs to be sent by `kcp.output()`); as for 10MB/s data transferring with 1400 MTU, `kcp.output()` will be called around 7500 times and costs 117us for `time.Now()` in **every second**.
+
+
+## Tuning
+
+Q: I'm running > 3000 connections on my server. the CPU utilization is high.
+
+A: A standalone `agent` or `gate` server for kcp-go is suggested, not only for CPU utilization, but also for the **precision** of RTT measurements, which indirectly affects retransmission. Increasing the update `interval` with `SetNoDelay`, e.g. `conn.SetNoDelay(1, 40, 1, 1)`, will dramatically reduce system load.
+
+## Who is using this?
+
+1. https://github.com/xtaci/kcptun -- A Secure Tunnel Based On KCP over UDP.
+2. https://github.com/getlantern/lantern -- Lantern delivers fast access to the open Internet. 
+3. https://github.com/smallnest/rpcx -- A RPC service framework based on net/rpc like alibaba Dubbo and weibo Motan.
+4. https://github.com/gonet2/agent -- A gateway for games with stream multiplexing.
+5. https://github.com/syncthing/syncthing -- Open Source Continuous File Synchronization.
+6. https://play.google.com/store/apps/details?id=com.k17game.k3 -- Battle Zone - Earth 2048, a world-wide strategy game.
+
+## Links
+
+1. https://github.com/xtaci/libkcp -- FEC enhanced KCP session library for iOS/Android in C++
+2. https://github.com/skywind3000/kcp -- A Fast and Reliable ARQ Protocol
+3. https://github.com/klauspost/reedsolomon -- Reed-Solomon Erasure Coding in Go
--- a/vendor/github.com/xtaci/kcp-go/crypt.go
+++ b/vendor/github.com/xtaci/kcp-go/crypt.go
@@ -0,0 +1,263 @@
+package kcp
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/des"
+	"crypto/sha1"
+
+	"golang.org/x/crypto/blowfish"
+	"golang.org/x/crypto/cast5"
+	"golang.org/x/crypto/pbkdf2"
+	"golang.org/x/crypto/salsa20"
+	"golang.org/x/crypto/tea"
+	"golang.org/x/crypto/twofish"
+	"golang.org/x/crypto/xtea"
+)
+
+var (
+	// initialVector is the fixed input block that encrypt and decrypt run
+	// through the block cipher to obtain the first keystream block.
+	initialVector = []byte{167, 115, 79, 156, 18, 172, 27, 1, 164, 21, 242, 193, 252, 120, 230, 107}
+	// saltxor is the PBKDF2 salt used by NewSimpleXORBlockCrypt.
+	saltxor       = `sH3CIVoF#rWLtJo6`
+)
+
+// BlockCrypt defines encryption/decryption methods for a given byte slice.
+// Note for implementers: the data to be encrypted contains a builtin
+// nonce at the first 16 bytes.
+type BlockCrypt interface {
+	// Encrypt encrypts the whole block in src into dst.
+	// Dst and src may point at the same memory.
+	Encrypt(dst, src []byte)
+
+	// Decrypt decrypts the whole block in src into dst.
+	// Dst and src may point at the same memory.
+	Decrypt(dst, src []byte)
+}
+
+// salsa20BlockCrypt is a BlockCrypt backed by the Salsa20 stream cipher.
+type salsa20BlockCrypt struct {
+	key [32]byte
+}
+
+// NewSalsa20BlockCrypt https://en.wikipedia.org/wiki/Salsa20
+//
+// The key is copied into a fixed 32-byte array: longer keys are truncated
+// and shorter keys leave the remaining bytes zero. The returned error is
+// always nil.
+func NewSalsa20BlockCrypt(key []byte) (BlockCrypt, error) {
+	var c salsa20BlockCrypt
+	copy(c.key[:], key)
+	return &c, nil
+}
+
+// Encrypt uses the first 8 bytes of src as the Salsa20 nonce, XORs the rest
+// of src with the key stream into dst, and copies the nonce to dst unchanged.
+func (c *salsa20BlockCrypt) Encrypt(dst, src []byte) {
+	nonce := src[:8]
+	salsa20.XORKeyStream(dst[8:], src[8:], nonce, &c.key)
+	copy(dst[:8], nonce)
+}
+
+// Decrypt performs the same XOR operation as Encrypt, which reverses it.
+func (c *salsa20BlockCrypt) Decrypt(dst, src []byte) {
+	nonce := src[:8]
+	salsa20.XORKeyStream(dst[8:], src[8:], nonce, &c.key)
+	copy(dst[:8], nonce)
+}
+
+// twofishBlockCrypt is a BlockCrypt backed by a Twofish block cipher. It
+// keeps separate scratch buffers for the encrypt and decrypt paths.
+type twofishBlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewTwofishBlockCrypt https://en.wikipedia.org/wiki/Twofish
+//
+// It returns the error from twofish.NewCipher if the key is rejected.
+func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := twofish.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &twofishBlockCrypt{
+		encbuf: make([]byte, twofish.BlockSize),
+		decbuf: make([]byte, 2*twofish.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *twofishBlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *twofishBlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// tripleDESBlockCrypt is a BlockCrypt backed by a Triple DES block cipher.
+// It keeps separate scratch buffers for the encrypt and decrypt paths.
+type tripleDESBlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewTripleDESBlockCrypt https://en.wikipedia.org/wiki/Triple_DES
+//
+// It returns the error from des.NewTripleDESCipher if the key is rejected.
+func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := des.NewTripleDESCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &tripleDESBlockCrypt{
+		encbuf: make([]byte, des.BlockSize),
+		decbuf: make([]byte, 2*des.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *tripleDESBlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *tripleDESBlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// cast5BlockCrypt is a BlockCrypt backed by a CAST5 block cipher. It keeps
+// separate scratch buffers for the encrypt and decrypt paths.
+type cast5BlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewCast5BlockCrypt https://en.wikipedia.org/wiki/CAST-128
+//
+// It returns the error from cast5.NewCipher if the key is rejected.
+func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := cast5.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &cast5BlockCrypt{
+		encbuf: make([]byte, cast5.BlockSize),
+		decbuf: make([]byte, 2*cast5.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *cast5BlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *cast5BlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// blowfishBlockCrypt is a BlockCrypt backed by a Blowfish block cipher. It
+// keeps separate scratch buffers for the encrypt and decrypt paths.
+type blowfishBlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewBlowfishBlockCrypt https://en.wikipedia.org/wiki/Blowfish_(cipher)
+//
+// It returns the error from blowfish.NewCipher if the key is rejected.
+func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := blowfish.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &blowfishBlockCrypt{
+		encbuf: make([]byte, blowfish.BlockSize),
+		decbuf: make([]byte, 2*blowfish.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *blowfishBlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *blowfishBlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// aesBlockCrypt is a BlockCrypt backed by an AES block cipher. It keeps
+// separate scratch buffers for the encrypt and decrypt paths.
+type aesBlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewAESBlockCrypt https://en.wikipedia.org/wiki/Advanced_Encryption_Standard
+//
+// It returns the error from aes.NewCipher if the key is rejected.
+func NewAESBlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := aes.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &aesBlockCrypt{
+		encbuf: make([]byte, aes.BlockSize),
+		decbuf: make([]byte, 2*aes.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *aesBlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *aesBlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// teaBlockCrypt is a BlockCrypt backed by a TEA block cipher. It keeps
+// separate scratch buffers for the encrypt and decrypt paths.
+type teaBlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewTEABlockCrypt https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm
+//
+// The cipher is created with 16 rounds. It returns the error from
+// tea.NewCipherWithRounds if the key is rejected.
+func NewTEABlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := tea.NewCipherWithRounds(key, 16)
+	if err != nil {
+		return nil, err
+	}
+	return &teaBlockCrypt{
+		encbuf: make([]byte, tea.BlockSize),
+		decbuf: make([]byte, 2*tea.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *teaBlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *teaBlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// xteaBlockCrypt is a BlockCrypt backed by an XTEA block cipher. It keeps
+// separate scratch buffers for the encrypt and decrypt paths.
+type xteaBlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewXTEABlockCrypt https://en.wikipedia.org/wiki/XTEA
+//
+// It returns the error from xtea.NewCipher if the key is rejected.
+func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) {
+	blk, err := xtea.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &xteaBlockCrypt{
+		encbuf: make([]byte, xtea.BlockSize),
+		decbuf: make([]byte, 2*xtea.BlockSize), // decrypt needs two blocks of scratch space
+		block:  blk,
+	}, nil
+}
+
+// Encrypt encrypts src into dst using the package-level encrypt helper.
+func (c *xteaBlockCrypt) Encrypt(dst, src []byte) {
+	encrypt(c.block, dst, src, c.encbuf)
+}
+
+// Decrypt decrypts src into dst using the package-level decrypt helper.
+func (c *xteaBlockCrypt) Decrypt(dst, src []byte) {
+	decrypt(c.block, dst, src, c.decbuf)
+}
+
+// simpleXORBlockCrypt is a BlockCrypt that XORs data with a table derived
+// from the key.
+type simpleXORBlockCrypt struct {
+	xortbl []byte
+}
+
+// NewSimpleXORBlockCrypt simple xor with key expanding
+//
+// The key is expanded with PBKDF2 (SHA-1, 32 iterations, salt saltxor) into
+// a table of mtuLimit bytes. The returned error is always nil.
+func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) {
+	table := pbkdf2.Key(key, []byte(saltxor), 32, mtuLimit, sha1.New)
+	return &simpleXORBlockCrypt{xortbl: table}, nil
+}
+
+// Encrypt XORs src with the key table into dst.
+func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) {
+	xorBytes(dst, src, c.xortbl)
+}
+
+// Decrypt XORs src with the key table into dst, reversing Encrypt.
+func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) {
+	xorBytes(dst, src, c.xortbl)
+}
+
+// noneBlockCrypt is a BlockCrypt that performs no encryption at all.
+type noneBlockCrypt struct{}
+
+// NewNoneBlockCrypt does nothing but copying
+//
+// The key is ignored and the returned error is always nil.
+func NewNoneBlockCrypt(key []byte) (BlockCrypt, error) {
+	return &noneBlockCrypt{}, nil
+}
+
+// Encrypt copies src to dst unchanged.
+func (c *noneBlockCrypt) Encrypt(dst, src []byte) {
+	copy(dst, src)
+}
+
+// Decrypt copies src to dst unchanged.
+func (c *noneBlockCrypt) Decrypt(dst, src []byte) {
+	copy(dst, src)
+}
+
+// packet encryption with local CFB mode
+//
+// encrypt writes the ciphertext of src to dst. buf is scratch space and must
+// hold at least one cipher block. The first keystream block is the
+// encryption of initialVector; each following keystream block is the
+// encryption of the ciphertext block just produced.
+// NOTE(review): xorWords and xorBytes are defined elsewhere; they are
+// presumably dst = a XOR b over the common length -- confirm.
+func encrypt(block cipher.Block, dst, src, buf []byte) {
+	blocksize := block.BlockSize()
+	tbl := buf[:blocksize]
+	block.Encrypt(tbl, initialVector)
+	n := len(src) / blocksize // number of complete blocks in src
+	base := 0
+	for i := 0; i < n; i++ {
+		xorWords(dst[base:], src[base:], tbl)
+		// feed the fresh ciphertext block back in to get the next keystream block
+		block.Encrypt(tbl, dst[base:])
+		base += blocksize
+	}
+	// trailing partial block, if any
+	xorBytes(dst[base:], src[base:], tbl)
+}
+
+// decrypt is the inverse of encrypt: it writes the plaintext of src to dst.
+// buf is scratch space and must hold at least two cipher blocks, one for the
+// current keystream block and one for the next.
+// NOTE(review): xorWords and xorBytes are defined elsewhere; they are
+// presumably dst = a XOR b over the common length -- confirm.
+func decrypt(block cipher.Block, dst, src, buf []byte) {
+	blocksize := block.BlockSize()
+	tbl := buf[:blocksize]
+	next := buf[blocksize:]
+	block.Encrypt(tbl, initialVector)
+	n := len(src) / blocksize // number of complete blocks in src
+	base := 0
+	for i := 0; i < n; i++ {
+		// Derive the next keystream block from the ciphertext BEFORE writing
+		// dst: dst and src may share memory, so the XOR below can overwrite
+		// the ciphertext block.
+		block.Encrypt(next, src[base:])
+		xorWords(dst[base:], src[base:], tbl)
+		tbl, next = next, tbl
+		base += blocksize
+	}
+	// trailing partial block, if any
+	xorBytes(dst[base:], src[base:], tbl)
+}
--- a/vendor/github.com/xtaci/kcp-go/donate.png
+++ b/vendor/github.com/xtaci/kcp-go/donate.png
--- a/vendor/github.com/xtaci/kcp-go/fec.go
+++ b/vendor/github.com/xtaci/kcp-go/fec.go
@@ -0,0 +1,303 @@
+package kcp
+
+import (
+	"encoding/binary"
+	"sync/atomic"
+
+	"github.com/klauspost/reedsolomon"
+)
+
+const (
+	// fecHeaderSize is the size of the FEC header: a 4-byte sequence id
+	// followed by a 2-byte type flag.
+	fecHeaderSize      = 6
+	fecHeaderSizePlus2 = fecHeaderSize + 2 // plus 2B data size
+	// typeData marks a packet that carries a data shard.
+	typeData           = 0xf1
+	// typeFEC marks a packet that carries a parity shard.
+	typeFEC            = 0xf2
+)
+
+type (
+	// fecPacket is a decoded FEC packet
+	fecPacket struct {
+		seqid uint32 // sequence id read from the FEC header
+		flag  uint16 // typeData or typeFEC
+		data  []byte // packet content after the FEC header, held in a pooled buffer
+	}
+
+	// fecDecoder for decoding incoming packets
+	fecDecoder struct {
+		rxlimit      int // queue size limit
+		dataShards   int
+		parityShards int
+		shardSize    int // dataShards + parityShards
+		rx           []fecPacket // ordered receive queue
+
+		// caches
+		decodeCache [][]byte
+		flagCache   []bool
+
+		// RS decoder
+		codec reedsolomon.Encoder
+	}
+)
+
+// newFECDecoder creates a decoder for the given data/parity shard layout
+// whose receive queue holds at most rxlimit packets.
+//
+// It returns nil when either shard count is not positive, when rxlimit is
+// smaller than the total shard count, or when the Reed-Solomon codec cannot
+// be created.
+func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder {
+	if dataShards <= 0 || parityShards <= 0 {
+		return nil
+	}
+	total := dataShards + parityShards
+	if rxlimit < total {
+		return nil
+	}
+
+	codec, err := reedsolomon.New(dataShards, parityShards, reedsolomon.WithMaxGoroutines(1))
+	if err != nil {
+		return nil
+	}
+	return &fecDecoder{
+		rxlimit:      rxlimit,
+		dataShards:   dataShards,
+		parityShards: parityShards,
+		shardSize:    total,
+		codec:        codec,
+		decodeCache:  make([][]byte, total),
+		flagCache:    make([]bool, total),
+	}
+}
+
+// decodeBytes a fec packet
+func (dec *fecDecoder) decodeBytes(data []byte) fecPacket {
+	var pkt fecPacket
+	pkt.seqid = binary.LittleEndian.Uint32(data)
+	pkt.flag = binary.LittleEndian.Uint16(data[4:])
+	// allocate memory & copy
+	buf := xmitBuf.Get().([]byte)[:len(data)-6]
+	copy(buf, data[6:])
+	pkt.data = buf
+	return pkt
+}
+
+// decode a fec packet
+//
+// decode inserts pkt into the ordered receive queue (dropping duplicates)
+// and then inspects the shard group that pkt belongs to. When the group
+// holds all of its data shards it is simply released; when data shards are
+// missing but enough shards in total are present, the missing data shards
+// are rebuilt with Reed-Solomon and returned in recovered. Finally the queue
+// is kept within rxlimit by dropping the oldest packet.
+// NOTE(review): _itimediff is defined elsewhere; it is presumably a signed
+// difference of two sequence ids -- confirm.
+func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
+	// insertion
+	// scan from the newest entry backwards to find the insert position
+	n := len(dec.rx) - 1
+	insertIdx := 0
+	for i := n; i >= 0; i-- {
+		if pkt.seqid == dec.rx[i].seqid { // de-duplicate
+			xmitBuf.Put(pkt.data)
+			return nil
+		} else if _itimediff(pkt.seqid, dec.rx[i].seqid) > 0 { // insertion
+			insertIdx = i + 1
+			break
+		}
+	}
+
+	// insert into ordered rx queue
+	if insertIdx == n+1 {
+		dec.rx = append(dec.rx, pkt)
+	} else {
+		dec.rx = append(dec.rx, fecPacket{})
+		copy(dec.rx[insertIdx+1:], dec.rx[insertIdx:]) // shift right
+		dec.rx[insertIdx] = pkt
+	}
+
+	// shard range for current packet
+	// (first and last sequence id of the group containing pkt)
+	shardBegin := pkt.seqid - pkt.seqid%uint32(dec.shardSize)
+	shardEnd := shardBegin + uint32(dec.shardSize) - 1
+
+	// max search range in ordered queue for current shard
+	searchBegin := insertIdx - int(pkt.seqid%uint32(dec.shardSize))
+	if searchBegin < 0 {
+		searchBegin = 0
+	}
+	searchEnd := searchBegin + dec.shardSize - 1
+	if searchEnd >= len(dec.rx) {
+		searchEnd = len(dec.rx) - 1
+	}
+
+	// re-construct datashards
+	// only worth trying when the search window can hold dataShards packets
+	if searchEnd-searchBegin+1 >= dec.dataShards {
+		var numshard, numDataShard, first, maxlen int
+
+		// zero cache
+		shards := dec.decodeCache
+		shardsflag := dec.flagCache
+		for k := range dec.decodeCache {
+			shards[k] = nil
+			shardsflag[k] = false
+		}
+
+		// shard assembly
+		// collect the queued packets of this group, indexed by their
+		// position within the group
+		for i := searchBegin; i <= searchEnd; i++ {
+			seqid := dec.rx[i].seqid
+			if _itimediff(seqid, shardEnd) > 0 {
+				break
+			} else if _itimediff(seqid, shardBegin) >= 0 {
+				shards[seqid%uint32(dec.shardSize)] = dec.rx[i].data
+				shardsflag[seqid%uint32(dec.shardSize)] = true
+				numshard++
+				if dec.rx[i].flag == typeData {
+					numDataShard++
+				}
+				if numshard == 1 {
+					first = i // queue index of the group's first packet
+				}
+				if len(dec.rx[i].data) > maxlen {
+					maxlen = len(dec.rx[i].data)
+				}
+			}
+		}
+
+		if numDataShard == dec.dataShards {
+			// case 1:  no lost data shards
+			dec.rx = dec.freeRange(first, numshard, dec.rx)
+		} else if numshard >= dec.dataShards {
+			// case 2: data shard lost, but  recoverable from parity shard
+			// Extend every present shard to maxlen so all shards have equal
+			// length. NOTE(review): this relies on the pooled buffers having
+			// capacity >= maxlen, and the xorBytes call presumably zeroes the
+			// extended tail (x XOR x) -- confirm.
+			for k := range shards {
+				if shards[k] != nil {
+					dlen := len(shards[k])
+					shards[k] = shards[k][:maxlen]
+					xorBytes(shards[k][dlen:], shards[k][dlen:], shards[k][dlen:])
+				}
+			}
+			if err := dec.codec.Reconstruct(shards); err == nil {
+				// report only the data shards that had not been received
+				for k := range shards[:dec.dataShards] {
+					if !shardsflag[k] {
+						recovered = append(recovered, shards[k])
+					}
+				}
+			}
+			dec.rx = dec.freeRange(first, numshard, dec.rx)
+		}
+	}
+
+	// keep rxlimit
+	if len(dec.rx) > dec.rxlimit {
+		if dec.rx[0].flag == typeData { // record unrecoverable data
+			atomic.AddUint64(&DefaultSnmp.FECShortShards, 1)
+		}
+		dec.rx = dec.freeRange(0, 1, dec.rx)
+	}
+	return
+}
+
+// freeRange removes the n packets starting at index first from q. Their
+// data buffers are handed back to xmitBuf, the packets behind them are
+// shifted down, and the data references in the vacated tail slots are
+// cleared so the garbage collector can reclaim them. The shortened queue is
+// returned.
+func (dec *fecDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
+	end := first + n
+	for idx := first; idx < end; idx++ { // return buffers to the pool
+		xmitBuf.Put(q[idx].data)
+	}
+	copy(q[first:], q[end:])
+	last := len(q) - 1
+	for k := 0; k < n; k++ { // drop stale references in the tail
+		q[last-k].data = nil
+	}
+	return q[:len(q)-n]
+}
+
+type (
+	// fecEncoder for encoding outgoing packets
+	fecEncoder struct {
+		dataShards   int
+		parityShards int
+		shardSize    int // dataShards + parityShards
+		paws         uint32 // Protect Against Wrapped Sequence numbers
+		next         uint32 // next seqid
+
+		shardCount int // count the number of datashards collected
+		maxSize    int // record maximum data length in datashard
+
+		headerOffset  int // FEC header offset
+		payloadOffset int // FEC payload offset
+
+		// caches
+		shardCache  [][]byte // one mtuLimit-sized buffer per shard
+		encodeCache [][]byte // equal-length views into shardCache passed to the codec
+
+		// RS encoder
+		codec reedsolomon.Encoder
+	}
+)
+
+// newFECEncoder creates an encoder for the given data/parity shard layout.
+// offset is the position of the FEC header inside each outgoing packet; the
+// FEC payload starts fecHeaderSize bytes after it.
+//
+// It returns nil when either shard count is not positive or when the
+// Reed-Solomon codec cannot be created.
+func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder {
+	if dataShards <= 0 || parityShards <= 0 {
+		return nil
+	}
+	total := dataShards + parityShards
+
+	codec, err := reedsolomon.New(dataShards, parityShards, reedsolomon.WithMaxGoroutines(1))
+	if err != nil {
+		return nil
+	}
+
+	// one full-size buffer per shard (data and parity)
+	buffers := make([][]byte, total)
+	for i := range buffers {
+		buffers[i] = make([]byte, mtuLimit)
+	}
+
+	return &fecEncoder{
+		dataShards:   dataShards,
+		parityShards: parityShards,
+		shardSize:    total,
+		// sequence ids wrap at a multiple of the group size
+		paws:          (0xffffffff/uint32(total) - 1) * uint32(total),
+		headerOffset:  offset,
+		payloadOffset: offset + fecHeaderSize,
+		codec:         codec,
+		encodeCache:   make([][]byte, total),
+		shardCache:    buffers,
+	}
+}
+
+// encode the packet, output parity shards if we have enough datashards
+// the content of returned parityshards will change in next encode
+//
+// b is stamped in place with a data FEC header at headerOffset and a 2-byte
+// little-endian length at payloadOffset, then copied into the shard cache.
+// ps is nil unless this call completed a group of dataShards packets and
+// the Reed-Solomon encoding succeeded.
+func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
+	enc.markData(b[enc.headerOffset:])
+	// the stored length counts everything from payloadOffset to the end of b
+	binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:])))
+
+	// copy data to fec datashards
+	sz := len(b)
+	enc.shardCache[enc.shardCount] = enc.shardCache[enc.shardCount][:sz]
+	copy(enc.shardCache[enc.shardCount], b)
+	enc.shardCount++
+
+	// record max datashard length
+	if sz > enc.maxSize {
+		enc.maxSize = sz
+	}
+
+	//  calculate Reed-Solomon Erasure Code
+	if enc.shardCount == enc.dataShards {
+		// bzero each datashard's tail
+		// NOTE(review): xorBytes is defined elsewhere; x XOR x presumably
+		// yields the zero fill -- confirm.
+		for i := 0; i < enc.dataShards; i++ {
+			shard := enc.shardCache[i]
+			slen := len(shard)
+			xorBytes(shard[slen:enc.maxSize], shard[slen:enc.maxSize], shard[slen:enc.maxSize])
+		}
+
+		// construct equal-sized slice with stripped header
+		cache := enc.encodeCache
+		for k := range cache {
+			cache[k] = enc.shardCache[k][enc.payloadOffset:enc.maxSize]
+		}
+
+		// rs encode
+		if err := enc.codec.Encode(cache); err == nil {
+			// the parity shards are the cache entries after the data shards
+			ps = enc.shardCache[enc.dataShards:]
+			for k := range ps {
+				enc.markFEC(ps[k][enc.headerOffset:])
+				ps[k] = ps[k][:enc.maxSize]
+			}
+		}
+
+		// reset counters to zero
+		enc.shardCount = 0
+		enc.maxSize = 0
+	}
+
+	return
+}
+
+// markData writes a data-shard FEC header (little-endian sequence id
+// followed by typeData) to the start of data and advances the sequence id.
+func (enc *fecEncoder) markData(data []byte) {
+	seq := enc.next
+	binary.LittleEndian.PutUint32(data, seq)
+	binary.LittleEndian.PutUint16(data[4:], typeData)
+	enc.next = seq + 1
+}
+
+// markFEC writes a parity-shard FEC header (little-endian sequence id
+// followed by typeFEC) to the start of data and advances the sequence id,
+// wrapping it at enc.paws.
+func (enc *fecEncoder) markFEC(data []byte) {
+	seq := enc.next
+	binary.LittleEndian.PutUint32(data, seq)
+	binary.LittleEndian.PutUint16(data[4:], typeFEC)
+	enc.next = (seq + 1) % enc.paws
+}
--- a/vendor/github.com/xtaci/kcp-go/frame.png
+++ b/vendor/github.com/xtaci/kcp-go/frame.png
--- a/vendor/github.com/xtaci/kcp-go/kcp-go.png
+++ b/vendor/github.com/xtaci/kcp-go/kcp-go.png
--- a/vendor/github.com/xtaci/kcp-go/kcp.go
+++ b/vendor/github.com/xtaci/kcp-go/kcp.go
@@ -0,0 +1,998 @@
+// Package kcp - A Fast and Reliable ARQ Protocol
+package kcp
+
+import (
+	"encoding/binary"
+	"sync/atomic"
+)
+
+// Protocol constants.
+//
+// IKCP_RTO_DEF is the initial retransmission timeout and IKCP_RTO_MAX the
+// upper bound applied when it is recomputed. IKCP_WND_SND, IKCP_WND_RCV,
+// IKCP_MTU_DEF, IKCP_INTERVAL and IKCP_DEADLINK are the defaults installed by
+// NewKCP. IKCP_OVERHEAD is the size in bytes of an encoded segment header.
+// IKCP_THRESH_INIT is the initial slow start threshold and IKCP_THRESH_MIN
+// the floor it is clamped to. All time values are in milliseconds.
+const (
+	IKCP_RTO_NDL     = 30  // no delay min rto
+	IKCP_RTO_MIN     = 100 // normal min rto
+	IKCP_RTO_DEF     = 200
+	IKCP_RTO_MAX     = 60000
+	IKCP_CMD_PUSH    = 81 // cmd: push data
+	IKCP_CMD_ACK     = 82 // cmd: ack
+	IKCP_CMD_WASK    = 83 // cmd: window probe (ask)
+	IKCP_CMD_WINS    = 84 // cmd: window size (tell)
+	IKCP_ASK_SEND    = 1  // need to send IKCP_CMD_WASK
+	IKCP_ASK_TELL    = 2  // need to send IKCP_CMD_WINS
+	IKCP_WND_SND     = 32
+	IKCP_WND_RCV     = 32
+	IKCP_MTU_DEF     = 1400
+	IKCP_ACK_FAST    = 3
+	IKCP_INTERVAL    = 100
+	IKCP_OVERHEAD    = 24
+	IKCP_DEADLINK    = 20
+	IKCP_THRESH_INIT = 2
+	IKCP_THRESH_MIN  = 2
+	IKCP_PROBE_INIT  = 7000   // 7 secs to probe window size
+	IKCP_PROBE_LIMIT = 120000 // up to 120 secs to probe window
+)
+
+// output_callback is the prototype of the function KCP calls to emit a
+// packet; an implementation ought to capture a conn and call conn.Write.
+// Only the first size bytes of buf are valid. buf is KCP's internal flush
+// buffer, which is reused for the following packet.
+type output_callback func(buf []byte, size int)
+
+/* encode 8 bits unsigned int */
+func ikcp_encode8u(p []byte, c byte) []byte {
+	p[0] = c
+	return p[1:]
+}
+
+/* decode 8 bits unsigned int */
+func ikcp_decode8u(p []byte, c *byte) []byte {
+	*c = p[0]
+	return p[1:]
+}
+
+/* encode 16 bits unsigned int (lsb) */
+func ikcp_encode16u(p []byte, w uint16) []byte {
+	binary.LittleEndian.PutUint16(p, w)
+	return p[2:]
+}
+
+/* decode 16 bits unsigned int (lsb) */
+func ikcp_decode16u(p []byte, w *uint16) []byte {
+	*w = binary.LittleEndian.Uint16(p)
+	return p[2:]
+}
+
+/* encode 32 bits unsigned int (lsb) */
+func ikcp_encode32u(p []byte, l uint32) []byte {
+	binary.LittleEndian.PutUint32(p, l)
+	return p[4:]
+}
+
+/* decode 32 bits unsigned int (lsb) */
+func ikcp_decode32u(p []byte, l *uint32) []byte {
+	*l = binary.LittleEndian.Uint32(p)
+	return p[4:]
+}
+
+// _imin_ returns the smaller of a and b.
+func _imin_(a, b uint32) uint32 {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// _imax_ returns the larger of a and b.
+func _imax_(a, b uint32) uint32 {
+	if b > a {
+		return b
+	}
+	return a
+}
+
+// _ibound_ raises middle to at least lower, then caps the result at upper.
+func _ibound_(lower, middle, upper uint32) uint32 {
+	floored := _imax_(lower, middle)
+	return _imin_(floored, upper)
+}
+
+// _itimediff returns later-earlier reinterpreted as a signed 32-bit value,
+// so the result keeps the right sign when the uint32 counters have wrapped
+// around between the two samples.
+func _itimediff(later, earlier uint32) int32 {
+	delta := later - earlier
+	return int32(delta)
+}
+
+// segment defines a KCP segment.
+//
+// conv, cmd, frg, wnd, ts, sn, una and the length of data are what encode
+// writes into the segment header. rto, xmit, resendts and fastack are never
+// encoded: they are the sender's retransmission bookkeeping, maintained by
+// flush and parse_fastack. data is taken from and returned to the shared
+// xmitBuf pool by newSegment and delSegment.
+type segment struct {
+	conv     uint32
+	cmd      uint8
+	frg      uint8
+	wnd      uint16
+	ts       uint32
+	sn       uint32
+	una      uint32
+	rto      uint32
+	xmit     uint32
+	resendts uint32
+	fastack  uint32
+	data     []byte
+}
+
+// encode writes the segment header into ptr and returns the remainder of
+// ptr after it. The fields are written in wire order: conv(4) cmd(1) frg(1)
+// wnd(2) ts(4) sn(4) una(4) len(4), all little-endian. The payload itself is
+// not written; the caller copies seg.data after the header.
+//
+// Every call also increments the DefaultSnmp.OutSegs counter.
+func (seg *segment) encode(ptr []byte) []byte {
+	ptr = ikcp_encode32u(ptr, seg.conv)
+	ptr = ikcp_encode8u(ptr, seg.cmd)
+	ptr = ikcp_encode8u(ptr, seg.frg)
+	ptr = ikcp_encode16u(ptr, seg.wnd)
+	ptr = ikcp_encode32u(ptr, seg.ts)
+	ptr = ikcp_encode32u(ptr, seg.sn)
+	ptr = ikcp_encode32u(ptr, seg.una)
+	ptr = ikcp_encode32u(ptr, uint32(len(seg.data)))
+	atomic.AddUint64(&DefaultSnmp.OutSegs, 1)
+	return ptr
+}
+
+// KCP defines a single KCP connection.
+//
+// Data flows through four queues:
+//   - snd_queue: segments accepted by Send that have no sequence number yet
+//   - snd_buf:   segments that were assigned a sequence number by flush and
+//     are kept until they are acknowledged
+//   - rcv_buf:   received segments ordered by sn that cannot be delivered yet
+//   - rcv_queue: in-order segments ready to be read by Recv
+//
+// acklist collects the (sn, ts) pairs to acknowledge on the next flush,
+// buffer is the scratch space flush assembles outgoing packets in, and
+// output is the callback that transmits them.
+//
+// The methods do no locking of their own.
+type KCP struct {
+	conv, mtu, mss, state                  uint32
+	snd_una, snd_nxt, rcv_nxt              uint32
+	ssthresh                               uint32
+	rx_rttvar, rx_srtt                     int32
+	rx_rto, rx_minrto                      uint32
+	snd_wnd, rcv_wnd, rmt_wnd, cwnd, probe uint32
+	interval, ts_flush                     uint32
+	nodelay, updated                       uint32
+	ts_probe, probe_wait                   uint32
+	dead_link, incr                        uint32
+
+	fastresend     int32
+	nocwnd, stream int32
+
+	snd_queue []segment
+	rcv_queue []segment
+	snd_buf   []segment
+	rcv_buf   []segment
+
+	acklist []ackItem
+
+	buffer []byte
+	output output_callback
+}
+
+// ackItem is a pending acknowledgement: the sequence number and timestamp
+// of a received push segment, echoed back to the sender by flush.
+type ackItem struct {
+	sn uint32
+	ts uint32
+}
+
+// NewKCP creates a new KCP control object with the default window sizes,
+// MTU, RTO, flush interval, slow start threshold and dead link limit.
+// 'conv' must be equal on the two endpoints of the same connection; output
+// is called for every packet that has to be transmitted.
+func NewKCP(conv uint32, output output_callback) *KCP {
+	kcp := &KCP{
+		conv:      conv,
+		snd_wnd:   IKCP_WND_SND,
+		rcv_wnd:   IKCP_WND_RCV,
+		rmt_wnd:   IKCP_WND_RCV,
+		mtu:       IKCP_MTU_DEF,
+		mss:       IKCP_MTU_DEF - IKCP_OVERHEAD,
+		buffer:    make([]byte, (IKCP_MTU_DEF+IKCP_OVERHEAD)*3),
+		rx_rto:    IKCP_RTO_DEF,
+		rx_minrto: IKCP_RTO_MIN,
+		interval:  IKCP_INTERVAL,
+		ts_flush:  IKCP_INTERVAL,
+		ssthresh:  IKCP_THRESH_INIT,
+		dead_link: IKCP_DEADLINK,
+		output:    output,
+	}
+	return kcp
+}
+
+// newSegment returns a segment whose data is a buffer taken from the
+// xmitBuf pool and resliced to size bytes. All other fields are zero.
+func (kcp *KCP) newSegment(size int) (seg segment) {
+	buf := xmitBuf.Get().([]byte)
+	return segment{data: buf[:size]}
+}
+
+// delSegment recycles a segment by handing its data buffer back to the
+// xmitBuf pool. The segment must not be used afterwards.
+func (kcp *KCP) delSegment(seg segment) {
+	payload := seg.data
+	xmitBuf.Put(payload)
+}
+
+// PeekSize returns the size in bytes of the next message in the receive
+// queue, or -1 when the queue is empty or the message's fragments have not
+// all arrived yet.
+func (kcp *KCP) PeekSize() (length int) {
+	if len(kcp.rcv_queue) == 0 {
+		return -1
+	}
+
+	// a first segment with frg == 0 is a complete, unfragmented message
+	seg := &kcp.rcv_queue[0]
+	if seg.frg == 0 {
+		return len(seg.data)
+	}
+
+	// frg counts the fragments that still follow, so frg+1 segments are
+	// required. frg is a uint8: widen it before adding 1, otherwise
+	// frg == 255 wraps around to 0 and an incomplete message is reported as
+	// complete.
+	if len(kcp.rcv_queue) < int(seg.frg)+1 {
+		return -1
+	}
+
+	// sum the fragments up to and including the final one (frg == 0)
+	for k := range kcp.rcv_queue {
+		seg := &kcp.rcv_queue[k]
+		length += len(seg.data)
+		if seg.frg == 0 {
+			break
+		}
+	}
+	return
+}
+
+// Recv is the user/upper level receive: it copies the next complete message
+// from the receive queue into buffer and returns its size.
+//
+// A negative result means nothing was read:
+//
+//	-1: the receive queue is empty
+//	-2: the next message is still missing fragments
+//	-3: buffer is too small for the next message
+func (kcp *KCP) Recv(buffer []byte) (n int) {
+	if len(kcp.rcv_queue) == 0 {
+		return -1
+	}
+
+	peeksize := kcp.PeekSize()
+	if peeksize < 0 {
+		return -2
+	}
+
+	if peeksize > len(buffer) {
+		return -3
+	}
+
+	// remember whether the receive queue was full before reading, i.e.
+	// whether the advertised window was zero
+	var fast_recover bool
+	if len(kcp.rcv_queue) >= int(kcp.rcv_wnd) {
+		fast_recover = true
+	}
+
+	// merge fragment: copy segments up to and including the one with
+	// frg == 0, recycling each buffer once it has been copied
+	count := 0
+	for k := range kcp.rcv_queue {
+		seg := &kcp.rcv_queue[k]
+		copy(buffer, seg.data)
+		buffer = buffer[len(seg.data):]
+		n += len(seg.data)
+		count++
+		kcp.delSegment(*seg)
+		if seg.frg == 0 {
+			break
+		}
+	}
+	if count > 0 {
+		kcp.rcv_queue = kcp.remove_front(kcp.rcv_queue, count)
+	}
+
+	// move available data from rcv_buf -> rcv_queue: take the leading run
+	// of consecutively numbered segments.
+	// NOTE(review): the window test reads len(kcp.rcv_queue) before the
+	// append below, so it does not change while counting — confirm that
+	// overshooting rcv_wnd here is intended.
+	count = 0
+	for k := range kcp.rcv_buf {
+		seg := &kcp.rcv_buf[k]
+		if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) {
+			kcp.rcv_nxt++
+			count++
+		} else {
+			break
+		}
+	}
+
+	if count > 0 {
+		kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
+		kcp.rcv_buf = kcp.remove_front(kcp.rcv_buf, count)
+	}
+
+	// fast recover
+	if len(kcp.rcv_queue) < int(kcp.rcv_wnd) && fast_recover {
+		// ready to send back IKCP_CMD_WINS in ikcp_flush
+		// tell remote my window size
+		kcp.probe |= IKCP_ASK_TELL
+	}
+	return
+}
+
+// Send is the user/upper level send: it splits buffer into segments of at
+// most mss bytes and appends them to the send queue. Nothing is transmitted
+// here; that is done by flush.
+//
+// It returns 0 on success, -1 if buffer is empty and -2 if buffer would
+// need more than 255 segments.
+func (kcp *KCP) Send(buffer []byte) int {
+	var count int
+	if len(buffer) == 0 {
+		return -1
+	}
+
+	// append to previous segment in streaming mode (if possible)
+	if kcp.stream != 0 {
+		n := len(kcp.snd_queue)
+		if n > 0 {
+			seg := &kcp.snd_queue[n-1]
+			if len(seg.data) < int(kcp.mss) {
+				capacity := int(kcp.mss) - len(seg.data)
+				extend := capacity
+				if len(buffer) < capacity {
+					extend = len(buffer)
+				}
+
+				// grow slice, the underlying cap is guaranteed to
+				// be larger than kcp.mss
+				oldlen := len(seg.data)
+				seg.data = seg.data[:oldlen+extend]
+				copy(seg.data[oldlen:], buffer)
+				buffer = buffer[extend:]
+			}
+		}
+
+		// everything fitted into the previous segment
+		if len(buffer) == 0 {
+			return 0
+		}
+	}
+
+	// number of segments needed for the (remaining) data
+	if len(buffer) <= int(kcp.mss) {
+		count = 1
+	} else {
+		count = (len(buffer) + int(kcp.mss) - 1) / int(kcp.mss)
+	}
+
+	// NOTE(review): in stream mode part of buffer may already have been
+	// appended to the previous segment when this error is returned.
+	if count > 255 {
+		return -2
+	}
+
+	if count == 0 {
+		count = 1
+	}
+
+	for i := 0; i < count; i++ {
+		var size int
+		if len(buffer) > int(kcp.mss) {
+			size = int(kcp.mss)
+		} else {
+			size = len(buffer)
+		}
+		seg := kcp.newSegment(size)
+		copy(seg.data, buffer[:size])
+		// frg is the number of fragments of the message that still follow
+		if kcp.stream == 0 { // message mode
+			seg.frg = uint8(count - i - 1)
+		} else { // stream mode
+			seg.frg = 0
+		}
+		kcp.snd_queue = append(kcp.snd_queue, seg)
+		buffer = buffer[size:]
+	}
+	return 0
+}
+
+// update_ack feeds one round trip time sample (in milliseconds) into the
+// smoothed RTT estimator and recomputes the retransmission timeout rx_rto,
+// bounded to the range [rx_minrto, IKCP_RTO_MAX].
+func (kcp *KCP) update_ack(rtt int32) {
+	// https://tools.ietf.org/html/rfc6298
+	var rto uint32
+	if kcp.rx_srtt == 0 {
+		// no estimate yet: seed it from this sample
+		kcp.rx_srtt = rtt
+		kcp.rx_rttvar = rtt >> 1
+	} else {
+		// srtt += (rtt - srtt) / 8
+		delta := rtt - kcp.rx_srtt
+		kcp.rx_srtt += delta >> 3
+		if delta < 0 {
+			delta = -delta
+		}
+		if rtt < kcp.rx_srtt-kcp.rx_rttvar {
+			// if the new RTT sample is below the bottom of the range of
+			// what an RTT measurement is expected to be.
+			// give an 8x reduced weight versus its normal weighting
+			kcp.rx_rttvar += (delta - kcp.rx_rttvar) >> 5
+		} else {
+			kcp.rx_rttvar += (delta - kcp.rx_rttvar) >> 2
+		}
+	}
+	// rto = srtt + max(interval, 4*rttvar)
+	rto = uint32(kcp.rx_srtt) + _imax_(kcp.interval, uint32(kcp.rx_rttvar)<<2)
+	kcp.rx_rto = _ibound_(kcp.rx_minrto, rto, IKCP_RTO_MAX)
+}
+
+// shrink_buf recomputes snd_una: the sequence number of the first segment
+// in snd_buf, or snd_nxt when snd_buf is empty.
+func (kcp *KCP) shrink_buf() {
+	if len(kcp.snd_buf) == 0 {
+		kcp.snd_una = kcp.snd_nxt
+		return
+	}
+	kcp.snd_una = kcp.snd_buf[0].sn
+}
+
+// parse_ack handles the acknowledgement of a single sequence number: the
+// matching segment, if any, is recycled and removed from snd_buf. Sequence
+// numbers outside [snd_una, snd_nxt) are ignored.
+func (kcp *KCP) parse_ack(sn uint32) {
+	if _itimediff(sn, kcp.snd_una) < 0 || _itimediff(sn, kcp.snd_nxt) >= 0 {
+		return
+	}
+
+	for k := range kcp.snd_buf {
+		cur := kcp.snd_buf[k].sn
+		if cur == sn {
+			kcp.delSegment(kcp.snd_buf[k])
+			// close the gap and clear the vacated last slot
+			last := len(kcp.snd_buf) - 1
+			copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
+			kcp.snd_buf[last] = segment{}
+			kcp.snd_buf = kcp.snd_buf[:last]
+			return
+		}
+		// the scan stops at the first segment numbered after sn
+		if _itimediff(sn, cur) < 0 {
+			return
+		}
+	}
+}
+
+// parse_fastack increments the fastack counter of the segments at the front
+// of snd_buf whose sequence number precedes sn, i.e. the segments that were
+// skipped by an acknowledgement of sn. Sequence numbers outside
+// [snd_una, snd_nxt) are ignored.
+func (kcp *KCP) parse_fastack(sn uint32) {
+	if _itimediff(sn, kcp.snd_una) < 0 || _itimediff(sn, kcp.snd_nxt) >= 0 {
+		return
+	}
+
+	for k := range kcp.snd_buf {
+		seg := &kcp.snd_buf[k]
+		if _itimediff(sn, seg.sn) < 0 {
+			return
+		}
+		if seg.sn != sn {
+			seg.fastack++
+		}
+	}
+}
+
+// parse_una handles a cumulative acknowledgement: the leading segments of
+// snd_buf whose sequence number precedes una are recycled and removed.
+func (kcp *KCP) parse_una(una uint32) {
+	acked := 0
+	for acked < len(kcp.snd_buf) && _itimediff(una, kcp.snd_buf[acked].sn) > 0 {
+		kcp.delSegment(kcp.snd_buf[acked])
+		acked++
+	}
+	if acked > 0 {
+		kcp.snd_buf = kcp.remove_front(kcp.snd_buf, acked)
+	}
+}
+
+// ack_push queues an acknowledgement for sequence number sn, carrying the
+// timestamp ts, to be sent by the next flush.
+func (kcp *KCP) ack_push(sn, ts uint32) {
+	item := ackItem{sn: sn, ts: ts}
+	kcp.acklist = append(kcp.acklist, item)
+}
+
+// parse_data inserts a received push segment into rcv_buf, keeping it
+// ordered by sequence number, and then moves the segments that have become
+// deliverable into rcv_queue. Segments outside the receive window and
+// duplicates are recycled; duplicates also bump DefaultSnmp.RepeatSegs.
+func (kcp *KCP) parse_data(newseg segment) {
+	sn := newseg.sn
+	// drop anything outside [rcv_nxt, rcv_nxt+rcv_wnd)
+	if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
+		_itimediff(sn, kcp.rcv_nxt) < 0 {
+		kcp.delSegment(newseg)
+		return
+	}
+
+	// scan rcv_buf from the back for a duplicate or for the last segment
+	// numbered before sn; the new segment goes right after the latter
+	n := len(kcp.rcv_buf) - 1
+	insert_idx := 0
+	repeat := false
+	for i := n; i >= 0; i-- {
+		seg := &kcp.rcv_buf[i]
+		if seg.sn == sn {
+			repeat = true
+			atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
+			break
+		}
+		if _itimediff(sn, seg.sn) > 0 {
+			insert_idx = i + 1
+			break
+		}
+	}
+
+	if !repeat {
+		if insert_idx == n+1 {
+			kcp.rcv_buf = append(kcp.rcv_buf, newseg)
+		} else {
+			// grow by one, shift the tail right and fill the gap
+			kcp.rcv_buf = append(kcp.rcv_buf, segment{})
+			copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
+			kcp.rcv_buf[insert_idx] = newseg
+		}
+	} else {
+		kcp.delSegment(newseg)
+	}
+
+	// move available data from rcv_buf -> rcv_queue
+	count := 0
+	for k := range kcp.rcv_buf {
+		seg := &kcp.rcv_buf[k]
+		if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) {
+			kcp.rcv_nxt++
+			count++
+		} else {
+			break
+		}
+	}
+	if count > 0 {
+		kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
+		kcp.rcv_buf = kcp.remove_front(kcp.rcv_buf, count)
+	}
+}
+
+// Input feeds a received low level packet (eg. a UDP payload) into the
+// protocol. data may carry several concatenated segments.
+//
+// regular indicates that the packet was received normally (not recovered
+// by FEC); only regular packets update the remote window and the RTT
+// estimate. ackNoDelay requests that pending acks are flushed before
+// returning.
+//
+// It returns 0 on success, -1 if data is shorter than a segment header or
+// carries a different conv, -2 if a segment announces more payload than
+// data holds, and -3 on an unknown command. Segments preceding the
+// offending one have already been processed when an error is returned.
+func (kcp *KCP) Input(data []byte, regular, ackNoDelay bool) int {
+	// snd_una before processing, used below to detect newly acked data
+	una := kcp.snd_una
+	if len(data) < IKCP_OVERHEAD {
+		return -1
+	}
+
+	var maxack uint32
+	var lastackts uint32
+	var flag int
+	var inSegs uint64
+
+	for {
+		// NOTE: this una (the segment's field) shadows the outer one
+		var ts, sn, length, una, conv uint32
+		var wnd uint16
+		var cmd, frg uint8
+
+		if len(data) < int(IKCP_OVERHEAD) {
+			break
+		}
+
+		data = ikcp_decode32u(data, &conv)
+		if conv != kcp.conv {
+			return -1
+		}
+
+		data = ikcp_decode8u(data, &cmd)
+		data = ikcp_decode8u(data, &frg)
+		data = ikcp_decode16u(data, &wnd)
+		data = ikcp_decode32u(data, &ts)
+		data = ikcp_decode32u(data, &sn)
+		data = ikcp_decode32u(data, &una)
+		data = ikcp_decode32u(data, &length)
+		// length comes from the wire: compare in uint64 so that a value
+		// >= 2^31 cannot turn negative where int is 32 bits wide, pass
+		// the check and make the slicing below panic
+		if uint64(len(data)) < uint64(length) {
+			return -2
+		}
+
+		if cmd != IKCP_CMD_PUSH && cmd != IKCP_CMD_ACK &&
+			cmd != IKCP_CMD_WASK && cmd != IKCP_CMD_WINS {
+			return -3
+		}
+
+		// only trust window updates from regular packets. i.e: latest update
+		if regular {
+			kcp.rmt_wnd = uint32(wnd)
+		}
+		kcp.parse_una(una)
+		kcp.shrink_buf()
+
+		if cmd == IKCP_CMD_ACK {
+			kcp.parse_ack(sn)
+			kcp.shrink_buf()
+			// track the highest acked sn of this packet for fast ack
+			if flag == 0 {
+				flag = 1
+				maxack = sn
+			} else if _itimediff(sn, maxack) > 0 {
+				maxack = sn
+			}
+			lastackts = ts
+		} else if cmd == IKCP_CMD_PUSH {
+			if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) < 0 {
+				// ack everything below the window's upper edge, even
+				// segments that were already received
+				kcp.ack_push(sn, ts)
+				if _itimediff(sn, kcp.rcv_nxt) >= 0 {
+					seg := kcp.newSegment(int(length))
+					seg.conv = conv
+					seg.cmd = cmd
+					seg.frg = frg
+					seg.wnd = wnd
+					seg.ts = ts
+					seg.sn = sn
+					seg.una = una
+					copy(seg.data, data[:length])
+					kcp.parse_data(seg)
+				} else {
+					atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
+				}
+			} else {
+				atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
+			}
+		} else if cmd == IKCP_CMD_WASK {
+			// ready to send back IKCP_CMD_WINS in Ikcp_flush
+			// tell remote my window size
+			kcp.probe |= IKCP_ASK_TELL
+		} else if cmd == IKCP_CMD_WINS {
+			// do nothing
+		} else {
+			return -3
+		}
+
+		inSegs++
+		data = data[length:]
+	}
+	atomic.AddUint64(&DefaultSnmp.InSegs, inSegs)
+
+	// fast ack accounting and RTT sampling, for regular packets that
+	// carried at least one ack
+	if flag != 0 && regular {
+		kcp.parse_fastack(maxack)
+		current := currentMs()
+		if _itimediff(current, lastackts) >= 0 {
+			kcp.update_ack(_itimediff(current, lastackts))
+		}
+	}
+
+	// snd_una advanced: grow the congestion window
+	if _itimediff(kcp.snd_una, una) > 0 {
+		if kcp.cwnd < kcp.rmt_wnd {
+			mss := kcp.mss
+			if kcp.cwnd < kcp.ssthresh {
+				kcp.cwnd++
+				kcp.incr += mss
+			} else {
+				if kcp.incr < mss {
+					kcp.incr = mss
+				}
+				kcp.incr += (mss*mss)/kcp.incr + (mss / 16)
+				if (kcp.cwnd+1)*mss <= kcp.incr {
+					kcp.cwnd++
+				}
+			}
+			if kcp.cwnd > kcp.rmt_wnd {
+				kcp.cwnd = kcp.rmt_wnd
+				kcp.incr = kcp.rmt_wnd * mss
+			}
+		}
+	}
+
+	if ackNoDelay && len(kcp.acklist) > 0 { // ack immediately
+		kcp.flush(true)
+	} else if kcp.rmt_wnd == 0 && len(kcp.acklist) > 0 { // window zero
+		kcp.flush(true)
+	}
+	return 0
+}
+
+// wnd_unused returns the number of free slots in the receive queue, i.e.
+// rcv_wnd minus the number of queued segments, or 0 when the queue is full.
+func (kcp *KCP) wnd_unused() uint16 {
+	free := int(kcp.rcv_wnd) - len(kcp.rcv_queue)
+	if free <= 0 {
+		return 0
+	}
+	return uint16(free)
+}
+
+// flush sends out everything that is pending. Outgoing segments are packed
+// into kcp.buffer and handed to kcp.output whenever the next segment would
+// exceed the MTU, and once more at the end.
+//
+// In order it emits: the queued acknowledgements, window probe/tell
+// commands, and the segments of snd_buf that need a first transmission or
+// a retransmission (after moving as many segments from snd_queue to snd_buf
+// as the window allows). With ackOnly set, only acknowledgements are sent.
+// Finally the congestion state is updated from what had to be retransmitted.
+func (kcp *KCP) flush(ackOnly bool) {
+	// template for the control segments; wnd and una are also copied into
+	// every data segment sent below
+	var seg segment
+	seg.conv = kcp.conv
+	seg.cmd = IKCP_CMD_ACK
+	seg.wnd = kcp.wnd_unused()
+	seg.una = kcp.rcv_nxt
+
+	buffer := kcp.buffer
+	// flush acknowledges
+	// ptr is the unwritten remainder of buffer, so len(buffer)-len(ptr) is
+	// the number of bytes queued for output
+	ptr := buffer
+	for i, ack := range kcp.acklist {
+		size := len(buffer) - len(ptr)
+		if size+IKCP_OVERHEAD > int(kcp.mtu) {
+			kcp.output(buffer, size)
+			ptr = buffer
+		}
+		// filter jitters caused by bufferbloat: acks below rcv_nxt are
+		// already covered by una, only the last one is always sent.
+		// Sequence numbers wrap around, so compare them with _itimediff
+		// rather than with a raw >=.
+		if _itimediff(ack.sn, kcp.rcv_nxt) >= 0 || len(kcp.acklist)-1 == i {
+			seg.sn, seg.ts = ack.sn, ack.ts
+			ptr = seg.encode(ptr)
+		}
+	}
+	kcp.acklist = kcp.acklist[0:0]
+
+	if ackOnly { // flush remaining ack segments
+		size := len(buffer) - len(ptr)
+		if size > 0 {
+			kcp.output(buffer, size)
+		}
+		return
+	}
+
+	// probe window size (if remote window size equals zero)
+	if kcp.rmt_wnd == 0 {
+		current := currentMs()
+		if kcp.probe_wait == 0 {
+			kcp.probe_wait = IKCP_PROBE_INIT
+			kcp.ts_probe = current + kcp.probe_wait
+		} else {
+			if _itimediff(current, kcp.ts_probe) >= 0 {
+				if kcp.probe_wait < IKCP_PROBE_INIT {
+					kcp.probe_wait = IKCP_PROBE_INIT
+				}
+				// back off by 1.5x, up to IKCP_PROBE_LIMIT
+				kcp.probe_wait += kcp.probe_wait / 2
+				if kcp.probe_wait > IKCP_PROBE_LIMIT {
+					kcp.probe_wait = IKCP_PROBE_LIMIT
+				}
+				kcp.ts_probe = current + kcp.probe_wait
+				kcp.probe |= IKCP_ASK_SEND
+			}
+		}
+	} else {
+		kcp.ts_probe = 0
+		kcp.probe_wait = 0
+	}
+
+	// flush window probing commands (ask the remote for its window)
+	if (kcp.probe & IKCP_ASK_SEND) != 0 {
+		seg.cmd = IKCP_CMD_WASK
+		size := len(buffer) - len(ptr)
+		if size+IKCP_OVERHEAD > int(kcp.mtu) {
+			kcp.output(buffer, size)
+			ptr = buffer
+		}
+		ptr = seg.encode(ptr)
+	}
+
+	// flush window probing commands (tell the remote our window)
+	if (kcp.probe & IKCP_ASK_TELL) != 0 {
+		seg.cmd = IKCP_CMD_WINS
+		size := len(buffer) - len(ptr)
+		if size+IKCP_OVERHEAD > int(kcp.mtu) {
+			kcp.output(buffer, size)
+			ptr = buffer
+		}
+		ptr = seg.encode(ptr)
+	}
+
+	kcp.probe = 0
+
+	// calculate window size
+	cwnd := _imin_(kcp.snd_wnd, kcp.rmt_wnd)
+	if kcp.nocwnd == 0 {
+		cwnd = _imin_(kcp.cwnd, cwnd)
+	}
+
+	// sliding window, controlled by snd_nxt && snd_una+cwnd
+	newSegsCount := 0
+	for k := range kcp.snd_queue {
+		if _itimediff(kcp.snd_nxt, kcp.snd_una+cwnd) >= 0 {
+			break
+		}
+		newseg := kcp.snd_queue[k]
+		newseg.conv = kcp.conv
+		newseg.cmd = IKCP_CMD_PUSH
+		newseg.sn = kcp.snd_nxt
+		kcp.snd_buf = append(kcp.snd_buf, newseg)
+		kcp.snd_nxt++
+		newSegsCount++
+		// the buffer is owned by the snd_buf entry from now on
+		kcp.snd_queue[k].data = nil
+	}
+	if newSegsCount > 0 {
+		kcp.snd_queue = kcp.remove_front(kcp.snd_queue, newSegsCount)
+	}
+
+	// calculate resent: the fastack count that triggers a fast
+	// retransmit; fastresend <= 0 disables it
+	resent := uint32(kcp.fastresend)
+	if kcp.fastresend <= 0 {
+		resent = 0xffffffff
+	}
+
+	// check for retransmissions
+	current := currentMs()
+	var change, lost, lostSegs, fastRetransSegs, earlyRetransSegs uint64
+	for k := range kcp.snd_buf {
+		segment := &kcp.snd_buf[k]
+		needsend := false
+		if segment.xmit == 0 { // initial transmit
+			needsend = true
+			segment.rto = kcp.rx_rto
+			segment.resendts = current + segment.rto
+		} else if _itimediff(current, segment.resendts) >= 0 { // RTO
+			needsend = true
+			if kcp.nodelay == 0 {
+				segment.rto += kcp.rx_rto
+			} else {
+				segment.rto += kcp.rx_rto / 2
+			}
+			segment.resendts = current + segment.rto
+			lost++
+			lostSegs++
+		} else if segment.fastack >= resent { // fast retransmit
+			needsend = true
+			segment.fastack = 0
+			segment.rto = kcp.rx_rto
+			segment.resendts = current + segment.rto
+			change++
+			fastRetransSegs++
+		} else if segment.fastack > 0 && newSegsCount == 0 { // early retransmit
+			needsend = true
+			segment.fastack = 0
+			segment.rto = kcp.rx_rto
+			segment.resendts = current + segment.rto
+			change++
+			earlyRetransSegs++
+		}
+
+		if needsend {
+			segment.xmit++
+			segment.ts = current
+			segment.wnd = seg.wnd
+			segment.una = seg.una
+
+			size := len(buffer) - len(ptr)
+			need := IKCP_OVERHEAD + len(segment.data)
+
+			if size+need > int(kcp.mtu) {
+				kcp.output(buffer, size)
+				current = currentMs() // time update for a blocking call
+				ptr = buffer
+			}
+
+			ptr = segment.encode(ptr)
+			copy(ptr, segment.data)
+			ptr = ptr[len(segment.data):]
+
+			// too many transmissions of one segment: flag the link
+			if segment.xmit >= kcp.dead_link {
+				kcp.state = 0xFFFFFFFF
+			}
+		}
+	}
+
+	// flush remaining segments
+	size := len(buffer) - len(ptr)
+	if size > 0 {
+		kcp.output(buffer, size)
+	}
+
+	// counter updates
+	sum := lostSegs
+	if lostSegs > 0 {
+		atomic.AddUint64(&DefaultSnmp.LostSegs, lostSegs)
+	}
+	if fastRetransSegs > 0 {
+		atomic.AddUint64(&DefaultSnmp.FastRetransSegs, fastRetransSegs)
+		sum += fastRetransSegs
+	}
+	if earlyRetransSegs > 0 {
+		atomic.AddUint64(&DefaultSnmp.EarlyRetransSegs, earlyRetransSegs)
+		sum += earlyRetransSegs
+	}
+	if sum > 0 {
+		atomic.AddUint64(&DefaultSnmp.RetransSegs, sum)
+	}
+
+	// update ssthresh
+	// rate halving, https://tools.ietf.org/html/rfc6937
+	if change > 0 {
+		inflight := kcp.snd_nxt - kcp.snd_una
+		kcp.ssthresh = inflight / 2
+		if kcp.ssthresh < IKCP_THRESH_MIN {
+			kcp.ssthresh = IKCP_THRESH_MIN
+		}
+		// NOTE(review): resent is 0xffffffff when fast resend is disabled,
+		// so this sum wraps around to ssthresh-1 in that case — confirm
+		// that this is intended.
+		kcp.cwnd = kcp.ssthresh + resent
+		kcp.incr = kcp.cwnd * kcp.mss
+	}
+
+	// congestion control, https://tools.ietf.org/html/rfc5681
+	if lost > 0 {
+		kcp.ssthresh = cwnd / 2
+		if kcp.ssthresh < IKCP_THRESH_MIN {
+			kcp.ssthresh = IKCP_THRESH_MIN
+		}
+		kcp.cwnd = 1
+		kcp.incr = kcp.mss
+	}
+
+	if kcp.cwnd < 1 {
+		kcp.cwnd = 1
+		kcp.incr = kcp.mss
+	}
+}
+
+// Update updates state (call it repeatedly, every 10ms-100ms), or you can ask
+// ikcp_check when to call it again (without ikcp_input/_send calling).
+// 'current' - current timestamp in millisec.
+func (kcp *KCP) Update() {
+	var slap int32
+
+	current := currentMs()
+	if kcp.updated == 0 {
+		kcp.updated = 1
+		kcp.ts_flush = current
+	}
+
+	slap = _itimediff(current, kcp.ts_flush)
+
+	if slap >= 10000 || slap < -10000 {
+		kcp.ts_flush = current
+		slap = 0
+	}
+
+	if slap >= 0 {
+		kcp.ts_flush += kcp.interval
+		if _itimediff(current, kcp.ts_flush) >= 0 {
+			kcp.ts_flush = current + kcp.interval
+		}
+		kcp.flush(false)
+	}
+}
+
+// Check determines when Update should be invoked next, assuming no Input or
+// Send happens in between. It returns that time as a millisecond timestamp
+// on the currentMs clock; the current time is returned when a call is due
+// right away.
+//
+// Use it to avoid unnecessary Update calls, e.g. to schedule Update from an
+// epoll-like mechanism or when handling a massive number of connections.
+func (kcp *KCP) Check() uint32 {
+	current := currentMs()
+	if kcp.updated == 0 {
+		return current
+	}
+
+	// ignore a flush timestamp that is more than 10s away from the clock
+	ts_flush := kcp.ts_flush
+	if d := _itimediff(current, ts_flush); d >= 10000 || d < -10000 {
+		ts_flush = current
+	}
+	if _itimediff(current, ts_flush) >= 0 {
+		return current
+	}
+
+	// time until the next scheduled flush
+	tm_flush := _itimediff(ts_flush, current)
+
+	// time until the earliest retransmission
+	tm_packet := int32(0x7fffffff)
+	for k := range kcp.snd_buf {
+		diff := _itimediff(kcp.snd_buf[k].resendts, current)
+		if diff <= 0 {
+			return current
+		}
+		if diff < tm_packet {
+			tm_packet = diff
+		}
+	}
+
+	// the sooner of the two, capped at one interval
+	minimal := uint32(tm_packet)
+	if tm_packet >= tm_flush {
+		minimal = uint32(tm_flush)
+	}
+	if minimal >= kcp.interval {
+		minimal = kcp.interval
+	}
+
+	return current + minimal
+}
+
+// SetMtu changes the MTU size, default is 1400. The maximum segment size is
+// set to mtu minus the segment header size and the flush buffer is
+// reallocated to match.
+//
+// It returns 0 on success and -1, leaving everything unchanged, if mtu is
+// smaller than 50 bytes or than the segment header.
+func (kcp *KCP) SetMtu(mtu int) int {
+	if mtu < 50 || mtu < IKCP_OVERHEAD {
+		return -1
+	}
+	// make never returns nil, so there is no allocation failure to report
+	kcp.mtu = uint32(mtu)
+	kcp.mss = kcp.mtu - IKCP_OVERHEAD
+	kcp.buffer = make([]byte, (mtu+IKCP_OVERHEAD)*3)
+	return 0
+}
+
+// NoDelay configures the latency related options; a negative argument
+// leaves the corresponding option unchanged. It always returns 0.
+// fastest: ikcp_nodelay(kcp, 1, 20, 2, 1)
+// nodelay: 0:disable(default), 1:enable; also selects the minimum RTO
+// interval: internal update timer interval in millisec, default is 100ms,
+// clamped to the range 10..5000
+// resend: 0:disable fast resend(default), otherwise the fast resend trigger
+// nc: 0:normal congestion control(default), 1:disable congestion control
+func (kcp *KCP) NoDelay(nodelay, interval, resend, nc int) int {
+	if nodelay >= 0 {
+		kcp.nodelay = uint32(nodelay)
+		kcp.rx_minrto = IKCP_RTO_MIN
+		if nodelay != 0 {
+			kcp.rx_minrto = IKCP_RTO_NDL
+		}
+	}
+	if interval >= 0 {
+		switch {
+		case interval > 5000:
+			interval = 5000
+		case interval < 10:
+			interval = 10
+		}
+		kcp.interval = uint32(interval)
+	}
+	if resend >= 0 {
+		kcp.fastresend = int32(resend)
+	}
+	if nc >= 0 {
+		kcp.nocwnd = int32(nc)
+	}
+	return 0
+}
+
+// WndSize sets the maximum send and receive window sizes (32 each by
+// default). A value that is not positive leaves the corresponding window
+// unchanged. It always returns 0.
+func (kcp *KCP) WndSize(sndwnd, rcvwnd int) int {
+	if sndwnd >= 1 {
+		kcp.snd_wnd = uint32(sndwnd)
+	}
+	if rcvwnd >= 1 {
+		kcp.rcv_wnd = uint32(rcvwnd)
+	}
+	return 0
+}
+
+// WaitSnd returns how many segments are waiting to be sent: those in
+// snd_buf plus those still in snd_queue.
+func (kcp *KCP) WaitSnd() int {
+	buffered := len(kcp.snd_buf)
+	queued := len(kcp.snd_queue)
+	return buffered + queued
+}
+
+// remove_front drops the first n elements of q by shifting the remaining
+// ones to the front in place, and returns the shortened slice, which
+// shares q's backing array. The vacated tail is zeroed so that the garbage
+// collector can reclaim what it referenced.
+func (kcp *KCP) remove_front(q []segment, n int) []segment {
+	kept := copy(q, q[n:])
+	tail := q[kept:]
+	for i := range tail {
+		tail[i] = segment{}
+	}
+	return q[:kept]
+}
--- a/vendor/github.com/xtaci/kcp-go/sess.go
+++ b/vendor/github.com/xtaci/kcp-go/sess.go
@@ -0,0 +1,932 @@
+package kcp
+
+import (
+	"crypto/rand"
+	"encoding/binary"
+	"hash/crc32"
+	"io"
+	"net"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/pkg/errors"
+	"golang.org/x/net/ipv4"
+)
+
+// errTimeout is the error Read and Write return when a deadline has passed.
+// It reports itself as both a timeout and a temporary error, and its
+// message is "i/o timeout". The embedded error is left nil; Error is
+// overridden below.
+type errTimeout struct {
+	error
+}
+
+func (errTimeout) Timeout() bool   { return true }
+func (errTimeout) Temporary() bool { return true }
+func (errTimeout) Error() string   { return "i/o timeout" }
+
+// Sizes and limits used by the session layer.
+const (
+	// 16-bytes magic number for each packet
+	nonceSize = 16
+
+	// 4-bytes packet checksum
+	crcSize = 4
+
+	// overall crypto header size
+	cryptHeaderSize = nonceSize + crcSize
+
+	// maximum packet size; also the size of the buffers in the xmitBuf pool
+	mtuLimit = 1500
+
+	// FEC keeps rxFECMulti* (dataShard+parityShard) ordered packets in memory
+	rxFECMulti = 3
+
+	// accept backlog
+	acceptBacklog = 128
+
+	// prerouting(to session) queue
+	qlen = 128
+)
+
+// Messages of the errors created with errors.New by this package;
+// errBrokenPipe is what Read, Write and Close report on a closed session.
+const (
+	errBrokenPipe       = "broken pipe"
+	errInvalidOperation = "invalid operation"
+)
+
+var (
+	// global packet buffer
+	// shared among sending/receiving/FEC
+	// every buffer created by the pool is mtuLimit bytes long (see init)
+	xmitBuf sync.Pool
+)
+
+// init installs the allocator of the xmitBuf pool: every new buffer is a
+// byte slice of mtuLimit bytes.
+func init() {
+	newPacketBuf := func() interface{} {
+		return make([]byte, mtuLimit)
+	}
+	xmitBuf.New = newPacketBuf
+}
+
+type (
+	// UDPSession defines a KCP session implemented by UDP
+	UDPSession struct {
+		updaterIdx int            // record slice index in updater
+		conn       net.PacketConn // the underlying packet connection
+		kcp        *KCP           // KCP ARQ protocol
+		l          *Listener      // point to the Listener if it's accepted by Listener
+		block      BlockCrypt     // block encryption
+
+		// kcp receiving is based on packets
+		// recvbuf turns packets into stream
+		recvbuf []byte
+		// bufptr is the part of recvbuf not yet handed out by Read
+		bufptr []byte
+		// extended output buffer(with header)
+		ext []byte
+
+		// FEC
+		fecDecoder *fecDecoder
+		fecEncoder *fecEncoder
+
+		// settings
+		remote     net.Addr  // remote peer address
+		rd         time.Time // read deadline
+		wd         time.Time // write deadline
+		headerSize int       // the overall header size added before KCP frame
+		ackNoDelay bool      // send ack immediately for each incoming packet
+		writeDelay bool      // delay kcp.flush() for Write() for bulk transfer
+		dup        int       // duplicate udp packets
+
+		// notifications
+		die          chan struct{} // notify session has Closed
+		chReadEvent  chan struct{} // notify Read() can be called without blocking
+		chWriteEvent chan struct{} // notify Write() can be called without blocking
+		chErrorEvent chan error    // notify Read() have an error
+
+		isClosed bool // flag the session has Closed
+		// mu is held by Read, Write, Close and the Set* methods while they
+		// access kcp and the fields above
+		mu sync.Mutex
+	}
+
+	// setReadBuffer is implemented by connections that expose
+	// SetReadBuffer
+	setReadBuffer interface {
+		SetReadBuffer(bytes int) error
+	}
+
+	// setWriteBuffer is implemented by connections that expose
+	// SetWriteBuffer
+	setWriteBuffer interface {
+		SetWriteBuffer(bytes int) error
+	}
+)
+
+// newUDPSession creates a new udp session for client or server.
+//
+// conv identifies the KCP conversation, dataShards and parityShards
+// configure FEC, l is the accepting Listener (nil for a client session),
+// conn is the underlying packet connection, remote the peer address and
+// block the optional packet encryption (nil to disable).
+//
+// The session is registered with the global updater and counted in
+// DefaultSnmp. For a client session a goroutine running readLoop is
+// started as well.
+func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession {
+	sess := new(UDPSession)
+	sess.die = make(chan struct{})
+	sess.chReadEvent = make(chan struct{}, 1)
+	sess.chWriteEvent = make(chan struct{}, 1)
+	sess.chErrorEvent = make(chan error, 1)
+	sess.remote = remote
+	sess.conn = conn
+	sess.l = l
+	sess.block = block
+	sess.recvbuf = make([]byte, mtuLimit)
+
+	// FEC initialization
+	// NOTE(review): the last argument of newFECEncoder is presumably the
+	// offset of the FEC header inside a packet (after the crypto header,
+	// if any) — confirm against its definition.
+	sess.fecDecoder = newFECDecoder(rxFECMulti*(dataShards+parityShards), dataShards, parityShards)
+	if sess.block != nil {
+		sess.fecEncoder = newFECEncoder(dataShards, parityShards, cryptHeaderSize)
+	} else {
+		sess.fecEncoder = newFECEncoder(dataShards, parityShards, 0)
+	}
+
+	// calculate header size
+	if sess.block != nil {
+		sess.headerSize += cryptHeaderSize
+	}
+	if sess.fecEncoder != nil {
+		sess.headerSize += fecHeaderSizePlus2
+	}
+
+	// only allocate extended packet buffer
+	// when the extra header is required
+	if sess.headerSize > 0 {
+		sess.ext = make([]byte, mtuLimit)
+	}
+
+	// packets shorter than a KCP header are never passed on to output
+	sess.kcp = NewKCP(conv, func(buf []byte, size int) {
+		if size >= IKCP_OVERHEAD {
+			sess.output(buf[:size])
+		}
+	})
+	// leave room for the extra headers within the default MTU
+	sess.kcp.SetMtu(IKCP_MTU_DEF - sess.headerSize)
+
+	// add current session to the global updater,
+	// which periodically calls sess.update()
+	updater.addSession(sess)
+
+	if sess.l == nil { // it's a client connection
+		go sess.readLoop()
+		atomic.AddUint64(&DefaultSnmp.ActiveOpens, 1)
+	} else {
+		atomic.AddUint64(&DefaultSnmp.PassiveOpens, 1)
+	}
+	// track the high-water mark of established sessions; the CAS is tried
+	// only once, so a concurrent update may win
+	currestab := atomic.AddUint64(&DefaultSnmp.CurrEstab, 1)
+	maxconn := atomic.LoadUint64(&DefaultSnmp.MaxConn)
+	if currestab > maxconn {
+		atomic.CompareAndSwapUint64(&DefaultSnmp.MaxConn, maxconn, currestab)
+	}
+
+	return sess
+}
+
+// Read implements net.Conn.
+//
+// It blocks until data is available, the read deadline passes, the session
+// is closed or an error is delivered on chErrorEvent. A message larger than
+// b is buffered in the session and handed out over the following calls.
+// It returns an errTimeout once the deadline has passed and a "broken
+// pipe" error on a closed session.
+func (s *UDPSession) Read(b []byte) (n int, err error) {
+	for {
+		s.mu.Lock()
+		if len(s.bufptr) > 0 { // copy from buffer into b
+			n = copy(b, s.bufptr)
+			s.bufptr = s.bufptr[n:]
+			s.mu.Unlock()
+			return n, nil
+		}
+
+		if s.isClosed {
+			s.mu.Unlock()
+			return 0, errors.New(errBrokenPipe)
+		}
+
+		if size := s.kcp.PeekSize(); size > 0 { // peek data size from kcp
+			atomic.AddUint64(&DefaultSnmp.BytesReceived, uint64(size))
+			if len(b) >= size { // direct write to b
+				s.kcp.Recv(b)
+				s.mu.Unlock()
+				return size, nil
+			}
+
+			// resize kcp receive buffer
+			// to make sure recvbuf has enough capacity
+			if cap(s.recvbuf) < size {
+				s.recvbuf = make([]byte, size)
+			}
+
+			// resize recvbuf slice length
+			s.recvbuf = s.recvbuf[:size]
+			s.kcp.Recv(s.recvbuf)
+			n = copy(b, s.recvbuf)   // copy to b
+			s.bufptr = s.recvbuf[n:] // update pointer
+			s.mu.Unlock()
+			return n, nil
+		}
+
+		// read deadline
+		// c stays nil without a deadline, so its select case never fires
+		var timeout *time.Timer
+		var c <-chan time.Time
+		if !s.rd.IsZero() {
+			if time.Now().After(s.rd) {
+				s.mu.Unlock()
+				return 0, errTimeout{}
+			}
+
+			delay := s.rd.Sub(time.Now())
+			timeout = time.NewTimer(delay)
+			c = timeout.C
+		}
+		s.mu.Unlock()
+
+		// wait for read event or timeout
+		// a timeout or a close is turned into an error by the checks at
+		// the top of the next iteration
+		select {
+		case <-s.chReadEvent:
+		case <-c:
+		case <-s.die:
+		case err = <-s.chErrorEvent:
+			if timeout != nil {
+				timeout.Stop()
+			}
+			return n, err
+		}
+
+		if timeout != nil {
+			timeout.Stop()
+		}
+	}
+}
+
+// Write implements net.Conn.
+//
+// It blocks while the number of segments waiting to be sent has reached
+// the send window, until there is room, the write deadline passes or the
+// session is closed. Once accepted, b is queued as a whole and len(b) is
+// returned. It returns an errTimeout once the deadline has passed and a
+// "broken pipe" error on a closed session.
+func (s *UDPSession) Write(b []byte) (n int, err error) {
+	for {
+		s.mu.Lock()
+		if s.isClosed {
+			s.mu.Unlock()
+			return 0, errors.New(errBrokenPipe)
+		}
+
+		// api flow control
+		if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
+			n = len(b)
+			// hand b to kcp in chunks of at most mss bytes
+			for {
+				if len(b) <= int(s.kcp.mss) {
+					s.kcp.Send(b)
+					break
+				} else {
+					s.kcp.Send(b[:s.kcp.mss])
+					b = b[s.kcp.mss:]
+				}
+			}
+
+			// unless write delay is enabled, transmit right away instead
+			// of leaving it to a later flush
+			if !s.writeDelay {
+				s.kcp.flush(false)
+			}
+			s.mu.Unlock()
+			atomic.AddUint64(&DefaultSnmp.BytesSent, uint64(n))
+			return n, nil
+		}
+
+		// write deadline
+		// c stays nil without a deadline, so its select case never fires
+		var timeout *time.Timer
+		var c <-chan time.Time
+		if !s.wd.IsZero() {
+			if time.Now().After(s.wd) {
+				s.mu.Unlock()
+				return 0, errTimeout{}
+			}
+			delay := s.wd.Sub(time.Now())
+			timeout = time.NewTimer(delay)
+			c = timeout.C
+		}
+		s.mu.Unlock()
+
+		// wait for write event or timeout
+		// a timeout or a close is turned into an error by the checks at
+		// the top of the next iteration
+		select {
+		case <-s.chWriteEvent:
+		case <-c:
+		case <-s.die:
+		}
+
+		if timeout != nil {
+			timeout.Stop()
+		}
+	}
+}
+
+// Close closes the connection: the session is removed from the updater
+// and, if it was accepted by a Listener, from that listener; the die
+// channel is closed to wake up blocked Read and Write calls. For a client
+// session the underlying connection is closed too and its error returned.
+//
+// Closing an already closed session returns a "broken pipe" error.
+func (s *UDPSession) Close() error {
+	// remove this session from updater & listener(if necessary)
+	// NOTE(review): this runs before the isClosed check, so a second Close
+	// repeats both notifications — confirm that removeSession and
+	// closeSession tolerate that.
+	updater.removeSession(s)
+	if s.l != nil { // notify listener
+		s.l.closeSession(s.remote)
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.isClosed {
+		return errors.New(errBrokenPipe)
+	}
+	close(s.die)
+	s.isClosed = true
+	// adding ^uint64(0) decrements the counter by one
+	atomic.AddUint64(&DefaultSnmp.CurrEstab, ^uint64(0))
+	if s.l == nil { // client socket close
+		return s.conn.Close()
+	}
+	return nil
+}
+
+// LocalAddr returns the local network address, as reported by the
+// underlying packet connection.
+func (s *UDPSession) LocalAddr() net.Addr {
+	return s.conn.LocalAddr()
+}
+
+// RemoteAddr returns the remote network address. The same Addr value is
+// returned by every invocation of RemoteAddr, so do not modify it.
+func (s *UDPSession) RemoteAddr() net.Addr {
+	return s.remote
+}
+
+// SetDeadline sets both the read and the write deadline of the session.
+// A zero time value disables the deadlines. It always returns nil.
+func (s *UDPSession) SetDeadline(t time.Time) error {
+	s.mu.Lock()
+	s.rd, s.wd = t, t
+	s.mu.Unlock()
+	return nil
+}
+
+// SetReadDeadline implements the Conn SetReadDeadline method. A zero time
+// value disables the deadline. It always returns nil.
+func (s *UDPSession) SetReadDeadline(t time.Time) error {
+	s.mu.Lock()
+	s.rd = t
+	s.mu.Unlock()
+	return nil
+}
+
+// SetWriteDeadline implements the Conn SetWriteDeadline method by recording t
+// as the session's write deadline. It always returns nil.
+func (s *UDPSession) SetWriteDeadline(t time.Time) error {
+	s.mu.Lock()
+	s.wd = t
+	s.mu.Unlock()
+	return nil
+}
+
+// SetWriteDelay controls whether writes are held back for bulk transfer until
+// the next update interval (true) or flushed right away by Write (false).
+func (s *UDPSession) SetWriteDelay(delay bool) {
+	s.mu.Lock()
+	s.writeDelay = delay
+	s.mu.Unlock()
+}
+
+// SetWindowSize sets the maximum send and receive window sizes by calling kcp.WndSize while holding the session lock
+func (s *UDPSession) SetWindowSize(sndwnd, rcvwnd int) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.kcp.WndSize(sndwnd, rcvwnd)
+}
+
+// SetMtu sets the maximum transmission unit (not including the UDP header).
+// Values above mtuLimit are rejected with false; otherwise the session's own
+// header size is subtracted before the value is handed to kcp, and true is
+// returned.
+func (s *UDPSession) SetMtu(mtu int) bool {
+	if mtu <= mtuLimit {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+		s.kcp.SetMtu(mtu - s.headerSize)
+		return true
+	}
+	return false
+}
+
+// SetStreamMode switches kcp's stream mode on (enable == true) or off.
+func (s *UDPSession) SetStreamMode(enable bool) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	// start from "off" and flip to "on" only when requested; both writes
+	// happen under the lock, so no intermediate state is observable
+	s.kcp.stream = 0
+	if enable {
+		s.kcp.stream = 1
+	}
+}
+
+// SetACKNoDelay changes the ack flush option; set it to true to flush acks immediately.
+func (s *UDPSession) SetACKNoDelay(nodelay bool) {
+	s.mu.Lock()
+	s.ackNoDelay = nodelay
+	s.mu.Unlock()
+}
+
+// SetDUP makes output send every data packet dup additional times.
+// For testing purposes only.
+func (s *UDPSession) SetDUP(dup int) {
+	s.mu.Lock()
+	s.dup = dup
+	s.mu.Unlock()
+}
+
+// SetNoDelay forwards nodelay, interval, resend and nc to NoDelay() of kcp while holding the session lock; see
+// https://github.com/skywind3000/kcp/blob/master/README.en.md#protocol-configuration
+func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.kcp.NoDelay(nodelay, interval, resend, nc)
+}
+
+// SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener
+func (s *UDPSession) SetDSCP(dscp int) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.l == nil {
+		if nc, ok := s.conn.(*connectedUDPConn); ok {
+			return ipv4.NewConn(nc.UDPConn).SetTOS(dscp << 2)
+		} else if nc, ok := s.conn.(net.Conn); ok {
+			return ipv4.NewConn(nc).SetTOS(dscp << 2)
+		}
+	}
+	return errors.New(errInvalidOperation)
+}
+
+// SetReadBuffer sets the socket read buffer of a client session. Sessions
+// accepted from a Listener, and connections without a SetReadBuffer method,
+// yield an errInvalidOperation error.
+func (s *UDPSession) SetReadBuffer(bytes int) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.l != nil {
+		return errors.New(errInvalidOperation)
+	}
+	nc, ok := s.conn.(setReadBuffer)
+	if !ok {
+		return errors.New(errInvalidOperation)
+	}
+	return nc.SetReadBuffer(bytes)
+}
+
+// SetWriteBuffer sets the socket write buffer of a client session. Sessions
+// accepted from a Listener, and connections without a SetWriteBuffer method,
+// yield an errInvalidOperation error.
+func (s *UDPSession) SetWriteBuffer(bytes int) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.l != nil {
+		return errors.New(errInvalidOperation)
+	}
+	nc, ok := s.conn.(setWriteBuffer)
+	if !ok {
+		return errors.New(errInvalidOperation)
+	}
+	return nc.SetWriteBuffer(bytes)
+}
+
+// output is the output pipeline entry: it sends the packet in buf (plus any FEC parity packets) to s.remote.
+// steps for output data processing:
+// 0. Header extension (buf is copied behind s.headerSize reserved bytes)
+// 1. FEC
+// 2. CRC32
+// 3. Encryption
+// 4. WriteTo kernel
+func (s *UDPSession) output(buf []byte) {
+	var ecc [][]byte // extra packets returned by the FEC encoder; stays nil without FEC
+
+	// 0. extend buf's header space(if necessary)
+	ext := buf
+	if s.headerSize > 0 {
+		ext = s.ext[:s.headerSize+len(buf)] // s.ext is reused on every call
+		copy(ext[s.headerSize:], buf)
+	}
+
+	// 1. FEC encoding
+	if s.fecEncoder != nil {
+		ecc = s.fecEncoder.encode(ext)
+	}
+
+	// 2&3. crc32 & encryption (skipped entirely when no block cipher is configured)
+	if s.block != nil {
+		io.ReadFull(rand.Reader, ext[:nonceSize]) // random nonce; the read error is ignored
+		checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:])
+		binary.LittleEndian.PutUint32(ext[nonceSize:], checksum) // CRC is stored right after the nonce
+		s.block.Encrypt(ext, ext)
+
+		for k := range ecc { // each FEC packet gets the same nonce/CRC/encrypt treatment
+			io.ReadFull(rand.Reader, ecc[k][:nonceSize])
+			checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:])
+			binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum)
+			s.block.Encrypt(ecc[k], ecc[k])
+		}
+	}
+
+	// 4. WriteTo kernel; only successful writes are added to the counters
+	nbytes := 0
+	npkts := 0
+	for i := 0; i < s.dup+1; i++ { // the data packet is written 1+s.dup times
+		if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
+			nbytes += n
+			npkts++
+		}
+	}
+
+	for k := range ecc { // FEC packets are written once each
+		if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
+			nbytes += n
+			npkts++
+		}
+	}
+	atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts))
+	atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(nbytes))
+}
+
+// update performs one kcp update round under the session lock: it flushes
+// kcp, signals a write event when fewer segments are waiting than the send
+// window allows, and returns the interval after which it should be called
+// again.
+func (s *UDPSession) update() (interval time.Duration) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.kcp.flush(false)
+	if waiting := s.kcp.WaitSnd(); waiting < int(s.kcp.snd_wnd) {
+		s.notifyWriteEvent()
+	}
+	return time.Duration(s.kcp.interval) * time.Millisecond
+}
+
+// GetConv returns the conversation id of the session's kcp instance.
+func (s *UDPSession) GetConv() uint32 {
+	return s.kcp.conv
+}
+
+// notifyReadEvent posts a read event on s.chReadEvent without ever blocking.
+func (s *UDPSession) notifyReadEvent() {
+	select {
+	case s.chReadEvent <- struct{}{}:
+		// event delivered
+	default:
+		// the channel cannot take the event right now; drop it
+	}
+}
+
+// notifyWriteEvent posts a write event on s.chWriteEvent without ever blocking.
+func (s *UDPSession) notifyWriteEvent() {
+	select {
+	case s.chWriteEvent <- struct{}{}:
+		// event delivered
+	default:
+		// the channel cannot take the event right now; drop it
+	}
+}
+
+func (s *UDPSession) kcpInput(data []byte) { // kcpInput feeds one received packet through FEC (if enabled) into kcp and updates DefaultSnmp
+	var kcpInErrors, fecErrs, fecRecovered, fecParityShards uint64 // tallied locally, published after the lock is released
+
+	if s.fecDecoder != nil {
+		f := s.fecDecoder.decodeBytes(data)
+		s.mu.Lock()
+		if f.flag == typeData { // data packets carry their kcp payload after fecHeaderSizePlus2 bytes
+			if ret := s.kcp.Input(data[fecHeaderSizePlus2:], true, s.ackNoDelay); ret != 0 {
+				kcpInErrors++
+			}
+		}
+
+		if f.flag == typeData || f.flag == typeFEC { // both kinds are handed to the FEC decoder
+			if f.flag == typeFEC {
+				fecParityShards++
+			}
+
+			recovers := s.fecDecoder.decode(f) // presumably the packets rebuilt by FEC; confirm in fecDecoder
+			for _, r := range recovers {
+				if len(r) >= 2 { // must be larger than 2bytes
+					sz := binary.LittleEndian.Uint16(r) // leading size field; it counts its own two bytes
+					if int(sz) <= len(r) && sz >= 2 {
+						if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 {
+							fecRecovered++
+						} else {
+							kcpInErrors++
+						}
+					} else {
+						fecErrs++
+					}
+				} else {
+					fecErrs++
+				}
+			}
+		}
+
+		// notify reader when kcp has data ready to be read
+		if n := s.kcp.PeekSize(); n > 0 {
+			s.notifyReadEvent()
+		}
+		s.mu.Unlock()
+	} else {
+		s.mu.Lock()
+		if ret := s.kcp.Input(data, true, s.ackNoDelay); ret != 0 { // no FEC: data is passed to kcp as is
+			kcpInErrors++
+		}
+		// notify reader when kcp has data ready to be read
+		if n := s.kcp.PeekSize(); n > 0 {
+			s.notifyReadEvent()
+		}
+		s.mu.Unlock()
+	}
+
+	atomic.AddUint64(&DefaultSnmp.InPkts, 1)
+	atomic.AddUint64(&DefaultSnmp.InBytes, uint64(len(data)))
+	if fecParityShards > 0 {
+		atomic.AddUint64(&DefaultSnmp.FECParityShards, fecParityShards)
+	}
+	if kcpInErrors > 0 {
+		atomic.AddUint64(&DefaultSnmp.KCPInErrors, kcpInErrors)
+	}
+	if fecErrs > 0 {
+		atomic.AddUint64(&DefaultSnmp.FECErrs, fecErrs)
+	}
+	if fecRecovered > 0 {
+		atomic.AddUint64(&DefaultSnmp.FECRecovered, fecRecovered)
+	}
+}
+
+// receiver reads packets from s.conn into buffers taken from xmitBuf and
+// forwards them on ch. It stops when the session dies, or after reporting a
+// read error on s.chErrorEvent. Packets too short to hold the session headers
+// plus a KCP header are only counted in DefaultSnmp.InErrs.
+func (s *UDPSession) receiver(ch chan<- []byte) {
+	for {
+		data := xmitBuf.Get().([]byte)[:mtuLimit]
+		n, _, err := s.conn.ReadFrom(data)
+		switch {
+		case err != nil:
+			s.chErrorEvent <- err
+			return
+		case n < s.headerSize+IKCP_OVERHEAD:
+			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
+		default:
+			select {
+			case ch <- data[:n]:
+			case <-s.die:
+				return
+			}
+		}
+	}
+}
+
+// readLoop is the read loop for a client session. It starts a receiver
+// goroutine and, for every packet delivered, decrypts it and verifies its
+// CRC32 when a block cipher is configured, passes valid payloads to kcpInput
+// and gives the buffer back to xmitBuf. It returns once s.die is closed.
+func (s *UDPSession) readLoop() {
+	chPacket := make(chan []byte, qlen)
+	go s.receiver(chPacket)
+
+	for {
+		select {
+		case <-s.die:
+			return
+		case raw := <-chPacket:
+			payload, valid := raw, true
+			if s.block != nil {
+				s.block.Decrypt(raw, raw)
+				payload = raw[nonceSize:]
+				// the stored checksum precedes the data it covers
+				if crc32.ChecksumIEEE(payload[crcSize:]) == binary.LittleEndian.Uint32(payload) {
+					payload = payload[crcSize:]
+				} else {
+					valid = false
+					atomic.AddUint64(&DefaultSnmp.InCsumErrors, 1)
+				}
+			}
+
+			if valid {
+				s.kcpInput(payload)
+			}
+			xmitBuf.Put(raw)
+		}
+	}
+}
+
+type (
+	// Listener defines a server listening for connections
+	Listener struct {
+		block        BlockCrypt     // block encryption; nil means packets are neither decrypted nor CRC-checked
+		dataShards   int            // FEC data shard count handed to every accepted session
+		parityShards int            // FEC parity shard count handed to every accepted session
+		fecDecoder   *fecDecoder    // FEC mock initialization; the listener only tests it against nil
+		conn         net.PacketConn // the underlying packet connection
+
+		sessions        map[string]*UDPSession // all sessions accepted by this Listener, keyed by remote address string
+		chAccepts       chan *UDPSession       // Listen() backlog
+		chSessionClosed chan net.Addr          // session close queue
+		headerSize      int                    // the overall header size added before KCP frame
+		die             chan struct{}          // notify the listener has closed
+		rd              atomic.Value           // read deadline for Accept()
+		wd              atomic.Value           // write deadline; stored by SetWriteDeadline
+	}
+
+	// inPacket is an incoming packet together with its sender
+	inPacket struct {
+		from net.Addr // address the packet was received from
+		data []byte   // packet bytes as read from the socket
+	}
+)
+
+// monitor is the listener's packet loop: it validates incoming packets, routes them to sessions (creating new ones) and forgets closed sessions until l.die is closed
+func (l *Listener) monitor() {
+	chPacket := make(chan inPacket, qlen)
+	go l.receiver(chPacket) // the socket is read by a separate goroutine feeding chPacket
+	for {
+		select {
+		case p := <-chPacket:
+			raw := p.data // kept so the whole buffer can be returned to xmitBuf
+			data := p.data
+			from := p.from
+			dataValid := false
+			if l.block != nil {
+				l.block.Decrypt(data, data)
+				data = data[nonceSize:]
+				checksum := crc32.ChecksumIEEE(data[crcSize:])
+				if checksum == binary.LittleEndian.Uint32(data) { // stored CRC32 must match the rest of the packet
+					data = data[crcSize:]
+					dataValid = true
+				} else {
+					atomic.AddUint64(&DefaultSnmp.InCsumErrors, 1)
+				}
+			} else if l.block == nil {
+				dataValid = true
+			}
+
+			if dataValid {
+				addr := from.String() // sessions are keyed by the sender's address string
+				s, ok := l.sessions[addr]
+				if !ok { // new session
+					if len(l.chAccepts) < cap(l.chAccepts) { // do not let new session overwhelm accept queue
+						var conv uint32
+						convValid := false
+						if l.fecDecoder != nil {
+							isfec := binary.LittleEndian.Uint16(data[4:]) // packet type field of the FEC header
+							if isfec == typeData {
+								conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
+								convValid = true
+							}
+						} else {
+							conv = binary.LittleEndian.Uint32(data) // without FEC the packet starts with the conversation id
+							convValid = true
+						}
+
+						if convValid {
+							s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block)
+							s.kcpInput(data)
+							l.sessions[addr] = s
+							l.chAccepts <- s // hands the session to Accept/AcceptKCP
+						}
+					}
+				} else {
+					s.kcpInput(data)
+				}
+			}
+
+			xmitBuf.Put(raw)
+		case deadlink := <-l.chSessionClosed: // a session reported its own Close
+			delete(l.sessions, deadlink.String())
+		case <-l.die:
+			return
+		}
+	}
+}
+
+// receiver reads packets from l.conn into buffers taken from xmitBuf and
+// forwards them, together with their sender, on ch. It stops on the first read
+// error or once the listener dies. Packets too short to hold the listener
+// headers plus a KCP header are only counted in DefaultSnmp.InErrs.
+func (l *Listener) receiver(ch chan<- inPacket) {
+	for {
+		data := xmitBuf.Get().([]byte)[:mtuLimit]
+		n, from, err := l.conn.ReadFrom(data)
+		switch {
+		case err != nil:
+			return
+		case n < l.headerSize+IKCP_OVERHEAD:
+			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
+		default:
+			select {
+			case ch <- inPacket{from, data[:n]}:
+			case <-l.die:
+				return
+			}
+		}
+	}
+}
+
+// SetReadBuffer sets the socket read buffer for the Listener. It returns an
+// errInvalidOperation error when the connection has no SetReadBuffer method.
+func (l *Listener) SetReadBuffer(bytes int) error {
+	nc, ok := l.conn.(setReadBuffer)
+	if !ok {
+		return errors.New(errInvalidOperation)
+	}
+	return nc.SetReadBuffer(bytes)
+}
+
+// SetWriteBuffer sets the socket write buffer for the Listener. It returns an
+// errInvalidOperation error when the connection has no SetWriteBuffer method.
+func (l *Listener) SetWriteBuffer(bytes int) error {
+	nc, ok := l.conn.(setWriteBuffer)
+	if !ok {
+		return errors.New(errInvalidOperation)
+	}
+	return nc.SetWriteBuffer(bytes)
+}
+
+// SetDSCP sets the 6bit DSCP field of the IP header (written as TOS = dscp << 2).
+// It returns an errInvalidOperation error when the connection is not a net.Conn.
+func (l *Listener) SetDSCP(dscp int) error {
+	nc, ok := l.conn.(net.Conn)
+	if !ok {
+		return errors.New(errInvalidOperation)
+	}
+	return ipv4.NewConn(nc).SetTOS(dscp << 2)
+}
+
+// Accept implements the Accept method in the Listener interface; it waits for
+// the next session and returns it as a generic Conn. On failure the returned
+// Conn is a true nil interface value.
+func (l *Listener) Accept() (net.Conn, error) {
+	sess, err := l.AcceptKCP()
+	if err != nil {
+		// Returning the nil *UDPSession directly would wrap it in a non-nil
+		// net.Conn, so a caller's "conn != nil" check would wrongly succeed.
+		return nil, err
+	}
+	return sess, nil
+}
+
+// AcceptKCP accepts a KCP connection.
+//
+// It blocks until a session is queued on the accept backlog, the listener's
+// read deadline (if one is set and non-zero) passes, or the listener is
+// closed. It returns an errTimeout on deadline expiry and an errBrokenPipe
+// error when the listener has been closed.
+func (l *Listener) AcceptKCP() (*UDPSession, error) {
+	var timeout <-chan time.Time // stays nil (blocks forever) when no deadline is set
+	if tdeadline, ok := l.rd.Load().(time.Time); ok && !tdeadline.IsZero() {
+		// An explicit timer is stopped on return, so a far-away deadline does
+		// not keep a timer alive after a session has already been accepted.
+		timer := time.NewTimer(tdeadline.Sub(time.Now()))
+		defer timer.Stop()
+		timeout = timer.C
+	}
+
+	select {
+	case <-timeout:
+		return nil, &errTimeout{}
+	case c := <-l.chAccepts:
+		return c, nil
+	case <-l.die:
+		return nil, errors.New(errBrokenPipe)
+	}
+}
+
+// SetDeadline stores t as both the read and the write deadline of the
+// listener. A zero time value disables the deadline. It always returns nil.
+func (l *Listener) SetDeadline(t time.Time) error {
+	l.rd.Store(t)
+	l.wd.Store(t)
+	return nil
+}
+
+// SetReadDeadline stores t as the listener's read deadline, which AcceptKCP uses as its accept timeout; it always returns nil.
+func (l *Listener) SetReadDeadline(t time.Time) error {
+	l.rd.Store(t)
+	return nil
+}
+
+// SetWriteDeadline stores t as the listener's write deadline and always returns nil. NOTE(review): no reader of l.wd is visible in this part of the file.
+func (l *Listener) SetWriteDeadline(t time.Time) error {
+	l.wd.Store(t)
+	return nil
+}
+
+// Close stops listening on the UDP address. Already Accepted connections are not closed.
+//
+// Calling Close again after it has returned yields an errBrokenPipe error
+// instead of panicking on a second close of l.die. The check is not
+// synchronised, so Close must still not be called from several goroutines at
+// the same time.
+func (l *Listener) Close() error {
+	select {
+	case <-l.die:
+		// already closed by an earlier call
+		return errors.New(errBrokenPipe)
+	default:
+	}
+	close(l.die)
+	return l.conn.Close()
+}
+
+// closeSession notifies the listener that the session for remote has closed.
+// It blocks until the notification is taken from l.chSessionClosed (true) or
+// the listener itself has been closed (false).
+func (l *Listener) closeSession(remote net.Addr) bool {
+	select {
+	case <-l.die:
+		return false
+	case l.chSessionClosed <- remote:
+		return true
+	}
+}
+
+// Addr returns the listener's network address, i.e. the local address of the
+// underlying packet connection. The Addr returned is shared by all invocations
+// of Addr, so do not modify it.
+func (l *Listener) Addr() net.Addr {
+	return l.conn.LocalAddr()
+}
+
+// Listen listens for incoming KCP packets addressed to the local address laddr
+// on the network "udp", without encryption and with FEC parameters 0, 0.
+// On failure the returned net.Listener is a true nil interface value.
+func Listen(laddr string) (net.Listener, error) {
+	l, err := ListenWithOptions(laddr, nil, 0, 0)
+	if err != nil {
+		// Returning the nil *Listener directly would wrap it in a non-nil
+		// net.Listener, so a caller's "ln != nil" check would wrongly succeed.
+		return nil, err
+	}
+	return l, nil
+}
+
+// ListenWithOptions listens for incoming KCP packets addressed to the local
+// address laddr on the network "udp" with packet encryption (block);
+// dataShards and parityShards define the Reed-Solomon Erasure Coding
+// parameters. Address resolution and socket errors are returned wrapped.
+func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards int) (*Listener, error) {
+	addr, err := net.ResolveUDPAddr("udp", laddr)
+	if err != nil {
+		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
+	}
+
+	sock, err := net.ListenUDP("udp", addr)
+	if err != nil {
+		return nil, errors.Wrap(err, "net.ListenUDP")
+	}
+
+	return ServeConn(block, dataShards, parityShards, sock)
+}
+
+// ServeConn serves KCP protocol for a single packet connection. It builds the
+// Listener, works out how many header bytes precede each KCP frame and starts
+// the monitor goroutine. The returned error is always nil.
+func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*Listener, error) {
+	l := &Listener{
+		block:           block,
+		dataShards:      dataShards,
+		parityShards:    parityShards,
+		fecDecoder:      newFECDecoder(rxFECMulti*(dataShards+parityShards), dataShards, parityShards),
+		conn:            conn,
+		sessions:        make(map[string]*UDPSession),
+		chAccepts:       make(chan *UDPSession, acceptBacklog),
+		chSessionClosed: make(chan net.Addr),
+		die:             make(chan struct{}),
+	}
+
+	// header size: encryption header first, then the FEC header
+	if l.block != nil {
+		l.headerSize += cryptHeaderSize
+	}
+	if l.fecDecoder != nil {
+		l.headerSize += fecHeaderSizePlus2
+	}
+
+	go l.monitor()
+	return l, nil
+}
+
+// Dial connects to the remote address "raddr" on the network "udp", without
+// encryption and with FEC parameters 0, 0. On failure the returned net.Conn is
+// a true nil interface value.
+func Dial(raddr string) (net.Conn, error) {
+	sess, err := DialWithOptions(raddr, nil, 0, 0)
+	if err != nil {
+		// Returning the nil *UDPSession directly would wrap it in a non-nil
+		// net.Conn, so a caller's "conn != nil" check would wrongly succeed.
+		return nil, err
+	}
+	return sess, nil
+}
+
+// DialWithOptions connects to the remote address "raddr" on the network "udp"
+// with packet encryption (block); dataShards and parityShards are passed on to
+// NewConn. Address resolution and dial errors are returned wrapped. If the
+// session cannot be created, the freshly dialed UDP socket is closed so it
+// does not leak.
+func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
+	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
+	if err != nil {
+		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
+	}
+
+	udpconn, err := net.DialUDP("udp", nil, udpaddr)
+	if err != nil {
+		return nil, errors.Wrap(err, "net.DialUDP")
+	}
+
+	sess, err := NewConn(raddr, block, dataShards, parityShards, &connectedUDPConn{udpconn})
+	if err != nil {
+		// nobody else holds the socket yet; the creation error is the one worth reporting
+		udpconn.Close()
+		return nil, err
+	}
+	return sess, nil
+}
+
+// NewConn establishes a session and talks KCP protocol over a packet
+// connection. raddr is resolved on the network "udp" and a random conversation
+// id is drawn from rand.Reader. It returns a wrapped error when the address
+// cannot be resolved or the random id cannot be read (previously a failed read
+// was ignored and the session silently used conversation id 0).
+func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) {
+	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
+	if err != nil {
+		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
+	}
+
+	var convid uint32
+	if err := binary.Read(rand.Reader, binary.LittleEndian, &convid); err != nil {
+		return nil, errors.Wrap(err, "binary.Read")
+	}
+	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil
+}
+
+// currentMs returns the current time in milliseconds since the Unix epoch,
+// truncated to 32 bits.
+func currentMs() uint32 {
+	return uint32(time.Now().UnixNano() / int64(time.Millisecond))
+}
+
+// connectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
+// to Write syscalls that are 4 times faster on some OSes. This should only be
+// used for connections that were produced by a net.Dial* call.
+type connectedUDPConn struct{ *net.UDPConn }
+
+// WriteTo redirects all writes to the Write syscall, which is 4 times faster.
+// The addr argument is ignored.
+func (c *connectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
+	return c.Write(b)
+}
--- a/vendor/github.com/xtaci/kcp-go/snmp.go
+++ b/vendor/github.com/xtaci/kcp-go/snmp.go
@@ -0,0 +1,164 @@
+package kcp
+
+import (
+	"fmt"
+	"sync/atomic"
+)
+
+// Snmp defines network statistics indicators; the counters are updated and read with sync/atomic
+type Snmp struct {
+	BytesSent        uint64 // bytes sent from upper level
+	BytesReceived    uint64 // bytes received to upper level
+	MaxConn          uint64 // max number of connections ever reached
+	ActiveOpens      uint64 // accumulated active open connections
+	PassiveOpens     uint64 // accumulated passive open connections
+	CurrEstab        uint64 // current number of established connections
+	InErrs           uint64 // UDP read errors reported from net.PacketConn
+	InCsumErrors     uint64 // checksum errors from CRC32
+	KCPInErrors      uint64 // packet input errors reported from KCP
+	InPkts           uint64 // incoming packets count
+	OutPkts          uint64 // outgoing packets count
+	InSegs           uint64 // incoming KCP segments
+	OutSegs          uint64 // outgoing KCP segments
+	InBytes          uint64 // UDP bytes received
+	OutBytes         uint64 // UDP bytes sent
+	RetransSegs      uint64 // accumulated retransmitted segments
+	FastRetransSegs  uint64 // accumulated fast retransmitted segments
+	EarlyRetransSegs uint64 // accumulated early retransmitted segments
+	LostSegs         uint64 // number of segs inferred as lost
+	RepeatSegs       uint64 // number of segs duplicated
+	FECRecovered     uint64 // correct packets recovered from FEC
+	FECErrs          uint64 // incorrect packets recovered from FEC
+	FECParityShards  uint64 // FEC segments received
+	FECShortShards   uint64 // number of data shards that's not enough for recovery
+}
+
+// newSnmp allocates a statistics collector with every counter at zero.
+func newSnmp() *Snmp {
+	return &Snmp{}
+}
+
+// Header returns all field names, in the same order as the values produced by ToSlice
+func (s *Snmp) Header() []string {
+	return []string{
+		"BytesSent",
+		"BytesReceived",
+		"MaxConn",
+		"ActiveOpens",
+		"PassiveOpens",
+		"CurrEstab",
+		"InErrs",
+		"InCsumErrors",
+		"KCPInErrors",
+		"InPkts",
+		"OutPkts",
+		"InSegs",
+		"OutSegs",
+		"InBytes",
+		"OutBytes",
+		"RetransSegs",
+		"FastRetransSegs",
+		"EarlyRetransSegs",
+		"LostSegs",
+		"RepeatSegs",
+		"FECParityShards",
+		"FECErrs",
+		"FECRecovered",
+		"FECShortShards",
+	}
+}
+
+// ToSlice returns the current snmp info as a slice of decimal strings, one per
+// counter, in the order used by Header. The values come from a single Copy
+// snapshot.
+func (s *Snmp) ToSlice() []string {
+	snmp := s.Copy()
+	counters := []uint64{
+		snmp.BytesSent,
+		snmp.BytesReceived,
+		snmp.MaxConn,
+		snmp.ActiveOpens,
+		snmp.PassiveOpens,
+		snmp.CurrEstab,
+		snmp.InErrs,
+		snmp.InCsumErrors,
+		snmp.KCPInErrors,
+		snmp.InPkts,
+		snmp.OutPkts,
+		snmp.InSegs,
+		snmp.OutSegs,
+		snmp.InBytes,
+		snmp.OutBytes,
+		snmp.RetransSegs,
+		snmp.FastRetransSegs,
+		snmp.EarlyRetransSegs,
+		snmp.LostSegs,
+		snmp.RepeatSegs,
+		snmp.FECParityShards,
+		snmp.FECErrs,
+		snmp.FECRecovered,
+		snmp.FECShortShards,
+	}
+
+	fields := make([]string, len(counters))
+	for i, v := range counters {
+		fields[i] = fmt.Sprint(v)
+	}
+	return fields
+}
+
+// Copy makes a copy of the current snmp snapshot. Each counter is loaded
+// atomically on its own, so the copy is not one atomic snapshot of all
+// counters taken together.
+func (s *Snmp) Copy() *Snmp {
+	return &Snmp{
+		BytesSent:        atomic.LoadUint64(&s.BytesSent),
+		BytesReceived:    atomic.LoadUint64(&s.BytesReceived),
+		MaxConn:          atomic.LoadUint64(&s.MaxConn),
+		ActiveOpens:      atomic.LoadUint64(&s.ActiveOpens),
+		PassiveOpens:     atomic.LoadUint64(&s.PassiveOpens),
+		CurrEstab:        atomic.LoadUint64(&s.CurrEstab),
+		InErrs:           atomic.LoadUint64(&s.InErrs),
+		InCsumErrors:     atomic.LoadUint64(&s.InCsumErrors),
+		KCPInErrors:      atomic.LoadUint64(&s.KCPInErrors),
+		InPkts:           atomic.LoadUint64(&s.InPkts),
+		OutPkts:          atomic.LoadUint64(&s.OutPkts),
+		InSegs:           atomic.LoadUint64(&s.InSegs),
+		OutSegs:          atomic.LoadUint64(&s.OutSegs),
+		InBytes:          atomic.LoadUint64(&s.InBytes),
+		OutBytes:         atomic.LoadUint64(&s.OutBytes),
+		RetransSegs:      atomic.LoadUint64(&s.RetransSegs),
+		FastRetransSegs:  atomic.LoadUint64(&s.FastRetransSegs),
+		EarlyRetransSegs: atomic.LoadUint64(&s.EarlyRetransSegs),
+		LostSegs:         atomic.LoadUint64(&s.LostSegs),
+		RepeatSegs:       atomic.LoadUint64(&s.RepeatSegs),
+		FECParityShards:  atomic.LoadUint64(&s.FECParityShards),
+		FECErrs:          atomic.LoadUint64(&s.FECErrs),
+		FECRecovered:     atomic.LoadUint64(&s.FECRecovered),
+		FECShortShards:   atomic.LoadUint64(&s.FECShortShards),
+	}
+}
+
+// Reset sets every counter back to zero, one atomic store per counter.
+func (s *Snmp) Reset() {
+	counters := []*uint64{
+		&s.BytesSent,
+		&s.BytesReceived,
+		&s.MaxConn,
+		&s.ActiveOpens,
+		&s.PassiveOpens,
+		&s.CurrEstab,
+		&s.InErrs,
+		&s.InCsumErrors,
+		&s.KCPInErrors,
+		&s.InPkts,
+		&s.OutPkts,
+		&s.InSegs,
+		&s.OutSegs,
+		&s.InBytes,
+		&s.OutBytes,
+		&s.RetransSegs,
+		&s.FastRetransSegs,
+		&s.EarlyRetransSegs,
+		&s.LostSegs,
+		&s.RepeatSegs,
+		&s.FECParityShards,
+		&s.FECErrs,
+		&s.FECRecovered,
+		&s.FECShortShards,
+	}
+	for _, c := range counters {
+		atomic.StoreUint64(c, 0)
+	}
+}
+
+// DefaultSnmp is the global KCP connection statistics collector; it is allocated by this file's init()
+var DefaultSnmp *Snmp
+
+func init() {
+	DefaultSnmp = newSnmp()
+}
--- a/vendor/github.com/xtaci/kcp-go/updater.go
+++ b/vendor/github.com/xtaci/kcp-go/updater.go
@@ -0,0 +1,105 @@
+package kcp
+
+import (
+	"container/heap"
+	"sync"
+	"time"
+)
+
+var updater updateHeap // package-wide scheduler that calls update() on the sessions registered with it
+
+func init() {
+	updater.init()
+	go updater.updateTask() // background loop; updateTask has no exit path
+}
+
+// entry contains a session update info: which session to update and when
+type entry struct {
+	ts time.Time   // time at which s is next due for update()
+	s  *UDPSession // the session to update
+}
+
+// updateHeap is a global heap managed kcp.flush() caller, ordered by each entry's next update time
+type updateHeap struct {
+	entries  []entry       // heap storage; the entry with the earliest ts comes first
+	mu       sync.Mutex    // held around every heap operation on entries
+	chWakeUp chan struct{} // signal that makes updateTask re-examine the heap
+}
+
+// Len returns the number of scheduled entries.
+func (h *updateHeap) Len() int {
+	return len(h.entries)
+}
+
+// Less reports whether entry i is due before entry j.
+func (h *updateHeap) Less(i, j int) bool {
+	return h.entries[i].ts.Before(h.entries[j].ts)
+}
+
+// Swap exchanges entries i and j and records the new position in each
+// affected session's updaterIdx.
+func (h *updateHeap) Swap(i, j int) {
+	atI, atJ := h.entries[i], h.entries[j]
+	h.entries[i], h.entries[j] = atJ, atI
+	atJ.s.updaterIdx = i
+	atI.s.updaterIdx = j
+}
+
+// Push appends x (an entry) to the heap storage and records its index in the
+// session's updaterIdx. It is meant to be called through container/heap.
+func (h *updateHeap) Push(x interface{}) {
+	e := x.(entry)
+	h.entries = append(h.entries, e)
+	e.s.updaterIdx = len(h.entries) - 1
+}
+
+// Pop removes and returns the last entry of the heap storage, marking its
+// session as unscheduled (updaterIdx == -1). It is meant to be called through
+// container/heap.
+func (h *updateHeap) Pop() interface{} {
+	last := len(h.entries) - 1
+	x := h.entries[last]
+	x.s.updaterIdx = -1
+	h.entries[last] = entry{} // clear the slot so the session can be collected
+	h.entries = h.entries[:last]
+	return x
+}
+
+// init creates the wake-up channel with room for one pending signal.
+func (h *updateHeap) init() {
+	h.chWakeUp = make(chan struct{}, 1)
+}
+
+// addSession schedules s for an immediate update and wakes the update task.
+func (h *updateHeap) addSession(s *UDPSession) {
+	h.mu.Lock()
+	heap.Push(h, entry{ts: time.Now(), s: s})
+	h.mu.Unlock()
+
+	h.wakeup()
+}
+
+// removeSession takes s out of the heap, unless its updaterIdx is -1.
+func (h *updateHeap) removeSession(s *UDPSession) {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	if idx := s.updaterIdx; idx != -1 {
+		heap.Remove(h, idx)
+	}
+}
+
+// wakeup signals the update task without blocking; when a signal is already
+// pending the new one is dropped.
+func (h *updateHeap) wakeup() {
+	select {
+	case h.chWakeUp <- struct{}{}:
+		// signal queued
+	default:
+		// a wake-up is already pending
+	}
+}
+
+// updateTask is the scheduler loop. It sleeps until the timer fires or a
+// wake-up arrives, then updates every session whose time has come, pushes
+// each back with its next due time, and re-arms the timer for the earliest
+// entry. It never returns.
+func (h *updateHeap) updateTask() {
+	var timer <-chan time.Time // nil until the first entry has been scheduled
+	for {
+		select {
+		case <-timer:
+		case <-h.chWakeUp:
+		}
+
+		h.mu.Lock()
+		pending := h.Len()
+		now := time.Now()
+		for remaining := pending; remaining > 0; remaining-- {
+			e := heap.Pop(h).(entry)
+			due := now.After(e.ts)
+			if due {
+				e.ts = now.Add(e.s.update())
+			}
+			heap.Push(h, e)
+			if !due {
+				// the earliest entry is not due yet, so none of the others are
+				break
+			}
+		}
+
+		if pending > 0 {
+			timer = time.After(h.entries[0].ts.Sub(now))
+		}
+		h.mu.Unlock()
+	}
+}
--- a/vendor/github.com/xtaci/kcp-go/xor.go
+++ b/vendor/github.com/xtaci/kcp-go/xor.go
@@ -0,0 +1,110 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package kcp
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" // architectures on which xorBytes/xorWords use the word-at-a-time routines
+
+// fastXORBytes xors in bulk: dst[i] = a[i] ^ b[i] for the first
+// min(len(a), len(b)) bytes, and returns that count. Whole machine words go
+// through fastXORWords; the remaining bytes are handled one by one. It only
+// works on architectures that support unaligned read/writes.
+func fastXORBytes(dst, a, b []byte) int {
+	n := len(b)
+	if len(a) < n {
+		n = len(a)
+	}
+
+	tail := n % wordSize
+	if bulk := n - tail; bulk > 0 {
+		fastXORWords(dst[:bulk], a[:bulk], b[:bulk])
+	}
+
+	for i := n - tail; i < n; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+
+	return n
+}
+
+// safeXORBytes xors byte by byte: dst[i] = a[i] ^ b[i] for the first
+// min(len(a), len(b)) bytes, and returns that count. dst must be at least
+// that long.
+func safeXORBytes(dst, a, b []byte) int {
+	n := len(b)
+	if len(a) < n {
+		n = len(a)
+	}
+
+	// the n%8 leading bytes are handled individually ...
+	i := 0
+	for head := n % 8; i < head; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+
+	// ... so that the rest is a whole number of 8-byte groups
+	for ; i < n; i += 8 {
+		d, x, y := dst[i:i+8], a[i:i+8], b[i:i+8]
+		for k := 0; k < 8; k++ {
+			d[k] = x[k] ^ y[k]
+		}
+	}
+	return n
+}
+
+// xorBytes xors the bytes in a and b into dst, which is assumed to have enough
+// space, and returns the number of bytes xor'd. The word-at-a-time routine is
+// used only on architectures listed in supportsUnaligned.
+func xorBytes(dst, a, b []byte) int {
+	if !supportsUnaligned {
+		// TODO(hanwen): if (dst, a, b) have common alignment
+		// we could still try fastXORBytes. It is not clear
+		// how often this happens, and it's only worth it if
+		// the block encryption itself is hardware
+		// accelerated.
+		return safeXORBytes(dst, a, b)
+	}
+	return fastXORBytes(dst, a, b)
+}
+
+// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture),
+// one machine word at a time. The arguments are assumed to be of equal length.
+func fastXORWords(dst, a, b []byte) {
+	// View the same memory as slices of machine words. Only the element type
+	// of the slice headers changes, so the loops below are bounded by the
+	// word count computed from len(b).
+	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+	aw := *(*[]uintptr)(unsafe.Pointer(&a))
+	bw := *(*[]uintptr)(unsafe.Pointer(&b))
+	words := len(b) / wordSize
+
+	// the words%8 leading words are handled individually ...
+	i := 0
+	for head := words % 8; i < head; i++ {
+		dw[i] = aw[i] ^ bw[i]
+	}
+
+	// ... so that the rest is a whole number of 8-word groups
+	for ; i < words; i += 8 {
+		d, x, y := dw[i:i+8], aw[i:i+8], bw[i:i+8]
+		for k := 0; k < 8; k++ {
+			d[k] = x[k] ^ y[k]
+		}
+	}
+}
+
+// xorWords xors a and b into dst, using fastXORWords on architectures listed
+// in supportsUnaligned and safeXORBytes everywhere else.
+func xorWords(dst, a, b []byte) {
+	if !supportsUnaligned {
+		safeXORBytes(dst, a, b)
+		return
+	}
+	fastXORWords(dst, a, b)
+}