cmd/syncthing: Enable KCP by default

Also, use upstream library, as my changes have been merged.
2017-10-17 23:17:10 +01:00 · 2017-10-17 23:17:10 +01:00 · fb7264a663
parent 889814a1af
commit fb7264a663
55 changed files with 3016 additions and 2798 deletions
--- a/lib/config/config.go
+++ b/lib/config/config.go
@ -48,11 +48,8 @@ var (
 	DefaultListenAddresses = []string{
 		util.Address("tcp", net.JoinHostPort("0.0.0.0", strconv.Itoa(DefaultTCPPort))),
 		"dynamic+https://relays.syncthing.net/endpoint",
+		util.Address("kcp", net.JoinHostPort("0.0.0.0", strconv.Itoa(DefaultKCPPort))),
 	}
-	// DefaultKCPListenAddress gets added to the default listen address set
-	// when the appropriate feature flag is set. Feature flag stuff to be
-	// removed later.
-	DefaultKCPListenAddress = util.Address("kcp", net.JoinHostPort("0.0.0.0", strconv.Itoa(DefaultKCPPort)))
 	// DefaultDiscoveryServersV4 should be substituted when the configuration
 	// contains <globalAnnounceServer>default-v4</globalAnnounceServer>.
 	DefaultDiscoveryServersV4 = []string{
--- a/lib/config/config_test.go
+++ b/lib/config/config_test.go
@ -68,7 +68,6 @@ func TestDefaultValues(t *testing.T) {
 		WeakHashSelectionMethod: WeakHashAuto,
 		StunKeepaliveS:          24,
 		StunServers:             []string{"default"},
-		DefaultKCPEnabled:       false,
 		KCPCongestionControl:    true,
 		KCPReceiveWindowSize:    128,
 		KCPSendWindowSize:       128,
@ -214,7 +213,6 @@ func TestOverriddenValues(t *testing.T) {
 		WeakHashSelectionMethod: WeakHashNever,
 		StunKeepaliveS:          10,
 		StunServers:             []string{"a.stun.com", "b.stun.com"},
-		DefaultKCPEnabled:       true,
 		KCPCongestionControl:    false,
 		KCPReceiveWindowSize:    1280,
 		KCPSendWindowSize:       1280,
--- a/lib/config/optionsconfiguration.go
+++ b/lib/config/optionsconfiguration.go
@ -134,7 +134,6 @@ type OptionsConfiguration struct {
 	WeakHashSelectionMethod WeakHashSelectionMethod `xml:"weakHashSelectionMethod" json:"weakHashSelectionMethod"`
 	StunServers             []string                `xml:"stunServer" json:"stunServers" default:"default"`
 	StunKeepaliveS          int                     `xml:"stunKeepaliveSeconds" json:"stunKeepaliveSeconds" default:"24"`
-	DefaultKCPEnabled       bool                    `xml:"defaultKCPEnabled" json:"defaultKCPEnabled" default:"false"`
 	KCPNoDelay              bool                    `xml:"kcpNoDelay" json:"kcpNoDelay" default:"false"`
 	KCPUpdateIntervalMs     int                     `xml:"kcpUpdateIntervalMs" json:"kcpUpdateIntervalMs" default:"25"`
 	KCPFastResend           bool                    `xml:"kcpFastResend" json:"kcpFastResend" default:"false"`
--- a/lib/config/testdata/overridenvalues.xml
+++ b/lib/config/testdata/overridenvalues.xml
@ -38,7 +38,6 @@
        <stunKeepaliveSeconds>10</stunKeepaliveSeconds>
        <stunServer>a.stun.com</stunServer>
        <stunServer>b.stun.com</stunServer>
-        <defaultKCPEnabled>true</defaultKCPEnabled>
        <kcpCongestionControl>false</kcpCongestionControl>
        <kcpReceiveWindowSize>1280</kcpReceiveWindowSize>
        <kcpSendWindowSize>1280</kcpSendWindowSize>
--- a/lib/config/wrapper.go
+++ b/lib/config/wrapper.go
@ -423,9 +423,6 @@ func (w *Wrapper) ListenAddresses() []string {
 		switch addr {
 		case "default":
 			addresses = append(addresses, DefaultListenAddresses...)
-			if w.cfg.Options.DefaultKCPEnabled { // temporary feature flag
-				addresses = append(addresses, DefaultKCPListenAddress)
-			}
 		default:
 			addresses = append(addresses, addr)
 		}
--- a/lib/connections/kcp_dial.go
+++ b/lib/connections/kcp_dial.go
@ -11,9 +11,9 @@ import (
 	"net/url"
 	"time"

-	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/syncthing/syncthing/lib/config"
 	"github.com/syncthing/syncthing/lib/protocol"
+	"github.com/xtaci/kcp-go"
 	"github.com/xtaci/smux"
 )

--- a/lib/connections/kcp_listen.go
+++ b/lib/connections/kcp_listen.go
@ -15,9 +15,9 @@ import (
 	"sync/atomic"
 	"time"

-	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/AudriusButkevicius/pfilter"
 	"github.com/ccding/go-stun/stun"
+	"github.com/xtaci/kcp-go"
 	"github.com/xtaci/smux"

 	"github.com/syncthing/syncthing/lib/config"
--- a/lib/protocol/benchmark_test.go
+++ b/lib/protocol/benchmark_test.go
@ -8,8 +8,8 @@ import (
 	"net"
 	"testing"

-	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/syncthing/syncthing/lib/dialer"
+	"github.com/xtaci/kcp-go"
 )

 func BenchmarkRequestsRawTCP(b *testing.B) {
--- a/vendor/github.com/klauspost/reedsolomon/examples/simple-decoder.go
+++ b/vendor/github.com/klauspost/reedsolomon/examples/simple-decoder.go
@ -1,125 +0,0 @@
-//+build ignore
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-//
-// Simple decoder example.
-//
-// The decoder reverses the process of "simple-encoder.go"
-//
-// To build an executable use:
-//
-// go build simple-decoder.go
-//
-// Simple Encoder/Decoder Shortcomings:
-// * If the file size of the input isn't diviable by the number of data shards
-//   the output will contain extra zeroes
-//
-// * If the shard numbers isn't the same for the decoder as in the
-//   encoder, invalid output will be generated.
-//
-// * If values have changed in a shard, it cannot be reconstructed.
-//
-// * If two shards have been swapped, reconstruction will always fail.
-//   You need to supply the shards in the same order as they were given to you.
-//
-// The solution for this is to save a metadata file containing:
-//
-// * File size.
-// * The number of data/parity shards.
-// * HASH of each shard.
-// * Order of the shards.
-//
-// If you save these properties, you should abe able to detect file corruption
-// in a shard and be able to reconstruct your data if you have the needed number of shards left.
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"os"
-
-	"github.com/klauspost/reedsolomon"
-)
-
-var dataShards = flag.Int("data", 4, "Number of shards to split the data into")
-var parShards = flag.Int("par", 2, "Number of parity shards")
-var outFile = flag.String("out", "", "Alternative output path/file")
-
-func init() {
-	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "  simple-decoder [-flags] basefile.ext\nDo not add the number to the filename.\n")
-		fmt.Fprintf(os.Stderr, "Valid flags:\n")
-		flag.PrintDefaults()
-	}
-}
-
-func main() {
-	// Parse flags
-	flag.Parse()
-	args := flag.Args()
-	if len(args) != 1 {
-		fmt.Fprintf(os.Stderr, "Error: No filenames given\n")
-		flag.Usage()
-		os.Exit(1)
-	}
-	fname := args[0]
-
-	// Create matrix
-	enc, err := reedsolomon.New(*dataShards, *parShards)
-	checkErr(err)
-
-	// Create shards and load the data.
-	shards := make([][]byte, *dataShards+*parShards)
-	for i := range shards {
-		infn := fmt.Sprintf("%s.%d", fname, i)
-		fmt.Println("Opening", infn)
-		shards[i], err = ioutil.ReadFile(infn)
-		if err != nil {
-			fmt.Println("Error reading file", err)
-			shards[i] = nil
-		}
-	}
-
-	// Verify the shards
-	ok, err := enc.Verify(shards)
-	if ok {
-		fmt.Println("No reconstruction needed")
-	} else {
-		fmt.Println("Verification failed. Reconstructing data")
-		err = enc.Reconstruct(shards)
-		if err != nil {
-			fmt.Println("Reconstruct failed -", err)
-			os.Exit(1)
-		}
-		ok, err = enc.Verify(shards)
-		if !ok {
-			fmt.Println("Verification failed after reconstruction, data likely corrupted.")
-			os.Exit(1)
-		}
-		checkErr(err)
-	}
-
-	// Join the shards and write them
-	outfn := *outFile
-	if outfn == "" {
-		outfn = fname
-	}
-
-	fmt.Println("Writing data to", outfn)
-	f, err := os.Create(outfn)
-	checkErr(err)
-
-	// We don't know the exact filesize.
-	err = enc.Join(f, shards, len(shards[0])**dataShards)
-	checkErr(err)
-}
-
-func checkErr(err error) {
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
-		os.Exit(2)
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/examples/simple-encoder.go
+++ b/vendor/github.com/klauspost/reedsolomon/examples/simple-encoder.go
@ -1,112 +0,0 @@
-//+build ignore
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-//
-// Simple encoder example
-//
-// The encoder encodes a simgle file into a number of shards
-// To reverse the process see "simpledecoder.go"
-//
-// To build an executable use:
-//
-// go build simple-decoder.go
-//
-// Simple Encoder/Decoder Shortcomings:
-// * If the file size of the input isn't diviable by the number of data shards
-//   the output will contain extra zeroes
-//
-// * If the shard numbers isn't the same for the decoder as in the
-//   encoder, invalid output will be generated.
-//
-// * If values have changed in a shard, it cannot be reconstructed.
-//
-// * If two shards have been swapped, reconstruction will always fail.
-//   You need to supply the shards in the same order as they were given to you.
-//
-// The solution for this is to save a metadata file containing:
-//
-// * File size.
-// * The number of data/parity shards.
-// * HASH of each shard.
-// * Order of the shards.
-//
-// If you save these properties, you should abe able to detect file corruption
-// in a shard and be able to reconstruct your data if you have the needed number of shards left.
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"path/filepath"
-
-	"github.com/klauspost/reedsolomon"
-)
-
-var dataShards = flag.Int("data", 4, "Number of shards to split the data into, must be below 257.")
-var parShards = flag.Int("par", 2, "Number of parity shards")
-var outDir = flag.String("out", "", "Alternative output directory")
-
-func init() {
-	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "  simple-encoder [-flags] filename.ext\n\n")
-		fmt.Fprintf(os.Stderr, "Valid flags:\n")
-		flag.PrintDefaults()
-	}
-}
-
-func main() {
-	// Parse command line parameters.
-	flag.Parse()
-	args := flag.Args()
-	if len(args) != 1 {
-		fmt.Fprintf(os.Stderr, "Error: No input filename given\n")
-		flag.Usage()
-		os.Exit(1)
-	}
-	if *dataShards > 257 {
-		fmt.Fprintf(os.Stderr, "Error: Too many data shards\n")
-		os.Exit(1)
-	}
-	fname := args[0]
-
-	// Create encoding matrix.
-	enc, err := reedsolomon.New(*dataShards, *parShards)
-	checkErr(err)
-
-	fmt.Println("Opening", fname)
-	b, err := ioutil.ReadFile(fname)
-	checkErr(err)
-
-	// Split the file into equally sized shards.
-	shards, err := enc.Split(b)
-	checkErr(err)
-	fmt.Printf("File split into %d data+parity shards with %d bytes/shard.\n", len(shards), len(shards[0]))
-
-	// Encode parity
-	err = enc.Encode(shards)
-	checkErr(err)
-
-	// Write out the resulting files.
-	dir, file := filepath.Split(fname)
-	if *outDir != "" {
-		dir = *outDir
-	}
-	for i, shard := range shards {
-		outfn := fmt.Sprintf("%s.%d", file, i)
-
-		fmt.Println("Writing to", outfn)
-		err = ioutil.WriteFile(filepath.Join(dir, outfn), shard, os.ModePerm)
-		checkErr(err)
-	}
-}
-
-func checkErr(err error) {
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
-		os.Exit(2)
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/examples/stream-decoder.go
+++ b/vendor/github.com/klauspost/reedsolomon/examples/stream-decoder.go
@ -1,167 +0,0 @@
-//+build ignore
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-//
-// Stream decoder example.
-//
-// The decoder reverses the process of "stream-encoder.go"
-//
-// To build an executable use:
-//
-// go build stream-decoder.go
-//
-// Simple Encoder/Decoder Shortcomings:
-// * If the file size of the input isn't dividable by the number of data shards
-//   the output will contain extra zeroes
-//
-// * If the shard numbers isn't the same for the decoder as in the
-//   encoder, invalid output will be generated.
-//
-// * If values have changed in a shard, it cannot be reconstructed.
-//
-// * If two shards have been swapped, reconstruction will always fail.
-//   You need to supply the shards in the same order as they were given to you.
-//
-// The solution for this is to save a metadata file containing:
-//
-// * File size.
-// * The number of data/parity shards.
-// * HASH of each shard.
-// * Order of the shards.
-//
-// If you save these properties, you should abe able to detect file corruption
-// in a shard and be able to reconstruct your data if you have the needed number of shards left.
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-
-	"github.com/klauspost/reedsolomon"
-)
-
-var dataShards = flag.Int("data", 4, "Number of shards to split the data into")
-var parShards = flag.Int("par", 2, "Number of parity shards")
-var outFile = flag.String("out", "", "Alternative output path/file")
-
-func init() {
-	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "  %s [-flags] basefile.ext\nDo not add the number to the filename.\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "Valid flags:\n")
-		flag.PrintDefaults()
-	}
-}
-
-func main() {
-	// Parse flags
-	flag.Parse()
-	args := flag.Args()
-	if len(args) != 1 {
-		fmt.Fprintf(os.Stderr, "Error: No filenames given\n")
-		flag.Usage()
-		os.Exit(1)
-	}
-	fname := args[0]
-
-	// Create matrix
-	enc, err := reedsolomon.NewStream(*dataShards, *parShards)
-	checkErr(err)
-
-	// Open the inputs
-	shards, size, err := openInput(*dataShards, *parShards, fname)
-	checkErr(err)
-
-	// Verify the shards
-	ok, err := enc.Verify(shards)
-	if ok {
-		fmt.Println("No reconstruction needed")
-	} else {
-		fmt.Println("Verification failed. Reconstructing data")
-		shards, size, err = openInput(*dataShards, *parShards, fname)
-		checkErr(err)
-		// Create out destination writers
-		out := make([]io.Writer, len(shards))
-		for i := range out {
-			if shards[i] == nil {
-				dir, _ := filepath.Split(fname)
-				outfn := fmt.Sprintf("%s.%d", fname, i)
-				fmt.Println("Creating", outfn)
-				out[i], err = os.Create(filepath.Join(dir, outfn))
-				checkErr(err)
-			}
-		}
-		err = enc.Reconstruct(shards, out)
-		if err != nil {
-			fmt.Println("Reconstruct failed -", err)
-			os.Exit(1)
-		}
-		// Close output.
-		for i := range out {
-			if out[i] != nil {
-				err := out[i].(*os.File).Close()
-				checkErr(err)
-			}
-		}
-		shards, size, err = openInput(*dataShards, *parShards, fname)
-		ok, err = enc.Verify(shards)
-		if !ok {
-			fmt.Println("Verification failed after reconstruction, data likely corrupted:", err)
-			os.Exit(1)
-		}
-		checkErr(err)
-	}
-
-	// Join the shards and write them
-	outfn := *outFile
-	if outfn == "" {
-		outfn = fname
-	}
-
-	fmt.Println("Writing data to", outfn)
-	f, err := os.Create(outfn)
-	checkErr(err)
-
-	shards, size, err = openInput(*dataShards, *parShards, fname)
-	checkErr(err)
-
-	// We don't know the exact filesize.
-	err = enc.Join(f, shards, int64(*dataShards)*size)
-	checkErr(err)
-}
-
-func openInput(dataShards, parShards int, fname string) (r []io.Reader, size int64, err error) {
-	// Create shards and load the data.
-	shards := make([]io.Reader, dataShards+parShards)
-	for i := range shards {
-		infn := fmt.Sprintf("%s.%d", fname, i)
-		fmt.Println("Opening", infn)
-		f, err := os.Open(infn)
-		if err != nil {
-			fmt.Println("Error reading file", err)
-			shards[i] = nil
-			continue
-		} else {
-			shards[i] = f
-		}
-		stat, err := f.Stat()
-		checkErr(err)
-		if stat.Size() > 0 {
-			size = stat.Size()
-		} else {
-			shards[i] = nil
-		}
-	}
-	return shards, size, nil
-}
-
-func checkErr(err error) {
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
-		os.Exit(2)
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/examples/stream-encoder.go
+++ b/vendor/github.com/klauspost/reedsolomon/examples/stream-encoder.go
@ -1,142 +0,0 @@
-//+build ignore
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-//
-// Simple stream encoder example
-//
-// The encoder encodes a single file into a number of shards
-// To reverse the process see "stream-decoder.go"
-//
-// To build an executable use:
-//
-// go build stream-encoder.go
-//
-// Simple Encoder/Decoder Shortcomings:
-// * If the file size of the input isn't dividable by the number of data shards
-//   the output will contain extra zeroes
-//
-// * If the shard numbers isn't the same for the decoder as in the
-//   encoder, invalid output will be generated.
-//
-// * If values have changed in a shard, it cannot be reconstructed.
-//
-// * If two shards have been swapped, reconstruction will always fail.
-//   You need to supply the shards in the same order as they were given to you.
-//
-// The solution for this is to save a metadata file containing:
-//
-// * File size.
-// * The number of data/parity shards.
-// * HASH of each shard.
-// * Order of the shards.
-//
-// If you save these properties, you should abe able to detect file corruption
-// in a shard and be able to reconstruct your data if you have the needed number of shards left.
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"os"
-	"path/filepath"
-
-	"io"
-
-	"github.com/klauspost/reedsolomon"
-)
-
-var dataShards = flag.Int("data", 4, "Number of shards to split the data into, must be below 257.")
-var parShards = flag.Int("par", 2, "Number of parity shards")
-var outDir = flag.String("out", "", "Alternative output directory")
-
-func init() {
-	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "  %s [-flags] filename.ext\n\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "Valid flags:\n")
-		flag.PrintDefaults()
-	}
-}
-
-func main() {
-	// Parse command line parameters.
-	flag.Parse()
-	args := flag.Args()
-	if len(args) != 1 {
-		fmt.Fprintf(os.Stderr, "Error: No input filename given\n")
-		flag.Usage()
-		os.Exit(1)
-	}
-	if *dataShards > 257 {
-		fmt.Fprintf(os.Stderr, "Error: Too many data shards\n")
-		os.Exit(1)
-	}
-	fname := args[0]
-
-	// Create encoding matrix.
-	enc, err := reedsolomon.NewStream(*dataShards, *parShards)
-	checkErr(err)
-
-	fmt.Println("Opening", fname)
-	f, err := os.Open(fname)
-	checkErr(err)
-
-	instat, err := f.Stat()
-	checkErr(err)
-
-	shards := *dataShards + *parShards
-	out := make([]*os.File, shards)
-
-	// Create the resulting files.
-	dir, file := filepath.Split(fname)
-	if *outDir != "" {
-		dir = *outDir
-	}
-	for i := range out {
-		outfn := fmt.Sprintf("%s.%d", file, i)
-		fmt.Println("Creating", outfn)
-		out[i], err = os.Create(filepath.Join(dir, outfn))
-		checkErr(err)
-	}
-
-	// Split into files.
-	data := make([]io.Writer, *dataShards)
-	for i := range data {
-		data[i] = out[i]
-	}
-	// Do the split
-	err = enc.Split(f, data, instat.Size())
-	checkErr(err)
-
-	// Close and re-open the files.
-	input := make([]io.Reader, *dataShards)
-
-	for i := range data {
-		out[i].Close()
-		f, err := os.Open(out[i].Name())
-		checkErr(err)
-		input[i] = f
-		defer f.Close()
-	}
-
-	// Create parity output writers
-	parity := make([]io.Writer, *parShards)
-	for i := range parity {
-		parity[i] = out[*dataShards+i]
-		defer out[*dataShards+i].Close()
-	}
-
-	// Encode parity
-	err = enc.Encode(input, parity)
-	checkErr(err)
-	fmt.Printf("File split into %d data + %d parity shards.\n", *dataShards, *parShards)
-
-}
-
-func checkErr(err error) {
-	if err != nil {
-		fmt.Fprintf(os.Stderr, "Error: %s", err.Error())
-		os.Exit(2)
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/galois.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois.go
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@ -1,73 +0,0 @@
-//+build !noasm
-//+build !appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-package reedsolomon
-
-//go:noescape
-func galMulSSSE3(low, high, in, out []byte)
-
-//go:noescape
-func galMulSSSE3Xor(low, high, in, out []byte)
-
-//go:noescape
-func galMulAVX2Xor(low, high, in, out []byte)
-
-//go:noescape
-func galMulAVX2(low, high, in, out []byte)
-
-// This is what the assembler rountes does in blocks of 16 bytes:
-/*
-func galMulSSSE3(low, high, in, out []byte) {
-	for n, input := range in {
-		l := input & 0xf
-		h := input >> 4
-		out[n] = low[l] ^ high[h]
-	}
-}
-
-func galMulSSSE3Xor(low, high, in, out []byte) {
-	for n, input := range in {
-		l := input & 0xf
-		h := input >> 4
-		out[n] ^= low[l] ^ high[h]
-	}
-}
-*/
-
-func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
-	var done int
-	if avx2 {
-		galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
-	} else if ssse3 {
-		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
-	}
-	remain := len(in) - done
-	if remain > 0 {
-		mt := mulTable[c]
-		for i := done; i < len(in); i++ {
-			out[i] = mt[in[i]]
-		}
-	}
-}
-
-func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
-	var done int
-	if avx2 {
-		galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
-	} else if ssse3 {
-		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
-	}
-	remain := len(in) - done
-	if remain > 0 {
-		mt := mulTable[c]
-		for i := done; i < len(in); i++ {
-			out[i] ^= mt[in[i]]
-		}
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@ -1,164 +0,0 @@
-//+build !noasm !appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
-// and http://jerasure.org/jerasure/gf-complete/tree/master
-
-// func galMulSSSE3Xor(low, high, in, out []byte)
-TEXT ·galMulSSSE3Xor(SB), 7, $0
-	MOVQ   low+0(FP), SI     // SI: &low
-	MOVQ   high+24(FP), DX   // DX: &high
-	MOVOU  (SI), X6          // X6 low
-	MOVOU  (DX), X7          // X7: high
-	MOVQ   $15, BX           // BX: low mask
-	MOVQ   BX, X8
-	PXOR   X5, X5
-	MOVQ   in+48(FP), SI     // R11: &in
-	MOVQ   in_len+56(FP), R9 // R9: len(in)
-	MOVQ   out+72(FP), DX    // DX: &out
-	PSHUFB X5, X8            // X8: lomask (unpacked)
-	SHRQ   $4, R9            // len(in) / 16
-	CMPQ   R9, $0
-	JEQ    done_xor
-
-loopback_xor:
-	MOVOU  (SI), X0     // in[x]
-	MOVOU  (DX), X4     // out[x]
-	MOVOU  X0, X1       // in[x]
-	MOVOU  X6, X2       // low copy
-	MOVOU  X7, X3       // high copy
-	PSRLQ  $4, X1       // X1: high input
-	PAND   X8, X0       // X0: low input
-	PAND   X8, X1       // X0: high input
-	PSHUFB X0, X2       // X2: mul low part
-	PSHUFB X1, X3       // X3: mul high part
-	PXOR   X2, X3       // X3: Result
-	PXOR   X4, X3       // X3: Result xor existing out
-	MOVOU  X3, (DX)     // Store
-	ADDQ   $16, SI      // in+=16
-	ADDQ   $16, DX      // out+=16
-	SUBQ   $1, R9
-	JNZ    loopback_xor
-
-done_xor:
-	RET
-
-// func galMulSSSE3(low, high, in, out []byte)
-TEXT ·galMulSSSE3(SB), 7, $0
-	MOVQ   low+0(FP), SI     // SI: &low
-	MOVQ   high+24(FP), DX   // DX: &high
-	MOVOU  (SI), X6          // X6 low
-	MOVOU  (DX), X7          // X7: high
-	MOVQ   $15, BX           // BX: low mask
-	MOVQ   BX, X8
-	PXOR   X5, X5
-	MOVQ   in+48(FP), SI     // R11: &in
-	MOVQ   in_len+56(FP), R9 // R9: len(in)
-	MOVQ   out+72(FP), DX    // DX: &out
-	PSHUFB X5, X8            // X8: lomask (unpacked)
-	SHRQ   $4, R9            // len(in) / 16
-	CMPQ   R9, $0
-	JEQ    done
-
-loopback:
-	MOVOU  (SI), X0 // in[x]
-	MOVOU  X0, X1   // in[x]
-	MOVOU  X6, X2   // low copy
-	MOVOU  X7, X3   // high copy
-	PSRLQ  $4, X1   // X1: high input
-	PAND   X8, X0   // X0: low input
-	PAND   X8, X1   // X0: high input
-	PSHUFB X0, X2   // X2: mul low part
-	PSHUFB X1, X3   // X3: mul high part
-	PXOR   X2, X3   // X3: Result
-	MOVOU  X3, (DX) // Store
-	ADDQ   $16, SI  // in+=16
-	ADDQ   $16, DX  // out+=16
-	SUBQ   $1, R9
-	JNZ    loopback
-
-done:
-	RET
-
-// func galMulAVX2Xor(low, high, in, out []byte)
-TEXT ·galMulAVX2Xor(SB), 7, $0
-	MOVQ  low+0(FP), SI     // SI: &low
-	MOVQ  high+24(FP), DX   // DX: &high
-	MOVQ  $15, BX           // BX: low mask
-	MOVQ  BX, X5
-	MOVOU (SI), X6          // X6 low
-	MOVOU (DX), X7          // X7: high
-	MOVQ  in_len+56(FP), R9 // R9: len(in)
-
-	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
-	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
-	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)
-
-	SHRQ  $5, R9         // len(in) /32
-	MOVQ  out+72(FP), DX // DX: &out
-	MOVQ  in+48(FP), SI  // R11: &in
-	TESTQ R9, R9
-	JZ    done_xor_avx2
-
-loopback_xor_avx2:
-	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
-	LONG $0x226ffec5             // VMOVDQU YMM4, [rdx]
-	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
-	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
-	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
-	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
-	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X2: mul high part
-	LONG $0xdbefedc5             // VPXOR   YMM3, YMM2, YMM3    ; X3: Result
-	LONG $0xe4efe5c5             // VPXOR   YMM4, YMM3, YMM4    ; X4: Result
-	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4
-
-	ADDQ $32, SI           // in+=32
-	ADDQ $32, DX           // out+=32
-	SUBQ $1, R9
-	JNZ  loopback_xor_avx2
-
-done_xor_avx2:
-	// VZEROUPPER
-	BYTE $0xc5; BYTE $0xf8; BYTE $0x77
-	RET
-
-// func galMulAVX2(low, high, in, out []byte)
-TEXT ·galMulAVX2(SB), 7, $0
-	MOVQ  low+0(FP), SI     // SI: &low
-	MOVQ  high+24(FP), DX   // DX: &high
-	MOVQ  $15, BX           // BX: low mask
-	MOVQ  BX, X5
-	MOVOU (SI), X6          // X6 low
-	MOVOU (DX), X7          // X7: high
-	MOVQ  in_len+56(FP), R9 // R9: len(in)
-
-	LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
-	LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
-	LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; X8: lomask (unpacked)
-
-	SHRQ  $5, R9         // len(in) /32
-	MOVQ  out+72(FP), DX // DX: &out
-	MOVQ  in+48(FP), SI  // R11: &in
-	TESTQ R9, R9
-	JZ    done_avx2
-
-loopback_avx2:
-	LONG $0x066ffec5             // VMOVDQU YMM0, [rsi]
-	LONG $0xd073f5c5; BYTE $0x04 // VPSRLQ  YMM1, YMM0, 4   ; X1: high input
-	LONG $0xdb7dc1c4; BYTE $0xc0 // VPAND   YMM0, YMM0, YMM8      ; X0: low input
-	LONG $0xdb75c1c4; BYTE $0xc8 // VPAND   YMM1, YMM1, YMM8      ; X1: high input
-	LONG $0x004de2c4; BYTE $0xd0 // VPSHUFB  YMM2, YMM6, YMM0   ; X2: mul low part
-	LONG $0x0045e2c4; BYTE $0xd9 // VPSHUFB  YMM3, YMM7, YMM1   ; X2: mul high part
-	LONG $0xe3efedc5             // VPXOR   YMM4, YMM2, YMM3    ; X4: Result
-	LONG $0x227ffec5             // VMOVDQU [rdx], YMM4
-
-	ADDQ $32, SI       // in+=32
-	ADDQ $32, DX       // out+=32
-	SUBQ $1, R9
-	JNZ  loopback_avx2
-
-done_avx2:
-
-	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
-	RET
--- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@ -1,19 +0,0 @@
-//+build !amd64 noasm appengine
-
-// Copyright 2015, Klaus Post, see LICENSE for details.
-
-package reedsolomon
-
-func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
-	mt := mulTable[c]
-	for n, input := range in {
-		out[n] = mt[input]
-	}
-}
-
-func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
-	mt := mulTable[c]
-	for n, input := range in {
-		out[n] ^= mt[input]
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/gentables.go
+++ b/vendor/github.com/klauspost/reedsolomon/gentables.go
@ -1,132 +0,0 @@
-//+build ignore
-
-package main
-
-import (
-	"fmt"
-)
-
-var logTable = [fieldSize]int16{
-	-1, 0, 1, 25, 2, 50, 26, 198,
-	3, 223, 51, 238, 27, 104, 199, 75,
-	4, 100, 224, 14, 52, 141, 239, 129,
-	28, 193, 105, 248, 200, 8, 76, 113,
-	5, 138, 101, 47, 225, 36, 15, 33,
-	53, 147, 142, 218, 240, 18, 130, 69,
-	29, 181, 194, 125, 106, 39, 249, 185,
-	201, 154, 9, 120, 77, 228, 114, 166,
-	6, 191, 139, 98, 102, 221, 48, 253,
-	226, 152, 37, 179, 16, 145, 34, 136,
-	54, 208, 148, 206, 143, 150, 219, 189,
-	241, 210, 19, 92, 131, 56, 70, 64,
-	30, 66, 182, 163, 195, 72, 126, 110,
-	107, 58, 40, 84, 250, 133, 186, 61,
-	202, 94, 155, 159, 10, 21, 121, 43,
-	78, 212, 229, 172, 115, 243, 167, 87,
-	7, 112, 192, 247, 140, 128, 99, 13,
-	103, 74, 222, 237, 49, 197, 254, 24,
-	227, 165, 153, 119, 38, 184, 180, 124,
-	17, 68, 146, 217, 35, 32, 137, 46,
-	55, 63, 209, 91, 149, 188, 207, 205,
-	144, 135, 151, 178, 220, 252, 190, 97,
-	242, 86, 211, 171, 20, 42, 93, 158,
-	132, 60, 57, 83, 71, 109, 65, 162,
-	31, 45, 67, 216, 183, 123, 164, 118,
-	196, 23, 73, 236, 127, 12, 111, 246,
-	108, 161, 59, 82, 41, 157, 85, 170,
-	251, 96, 134, 177, 187, 204, 62, 90,
-	203, 89, 95, 176, 156, 169, 160, 81,
-	11, 245, 22, 235, 122, 117, 44, 215,
-	79, 174, 213, 233, 230, 231, 173, 232,
-	116, 214, 244, 234, 168, 80, 88, 175,
-}
-
-const (
-	// The number of elements in the field.
-	fieldSize = 256
-
-	// The polynomial used to generate the logarithm table.
-	//
-	// There are a number of polynomials that work to generate
-	// a Galois field of 256 elements.  The choice is arbitrary,
-	// and we just use the first one.
-	//
-	// The possibilities are: 29, 43, 45, 77, 95, 99, 101, 105,
-	//* 113, 135, 141, 169, 195, 207, 231, and 245.
-	generatingPolynomial = 29
-)
-
-func main() {
-	t := generateExpTable()
-	fmt.Printf("var expTable = %#v\n", t)
-	//t2 := generateMulTableSplit(t)
-	//fmt.Printf("var mulTable = %#v\n", t2)
-	low, high := generateMulTableHalf(t)
-	fmt.Printf("var mulTableLow = %#v\n", low)
-	fmt.Printf("var mulTableHigh = %#v\n", high)
-}
-
-/**
- * Generates the inverse log table.
- */
-func generateExpTable() []byte {
-	result := make([]byte, fieldSize*2-2)
-	for i := 1; i < fieldSize; i++ {
-		log := logTable[i]
-		result[log] = byte(i)
-		result[log+fieldSize-1] = byte(i)
-	}
-	return result
-}
-
-func generateMulTable(expTable []byte) []byte {
-	result := make([]byte, 256*256)
-	for v := range result {
-		a := byte(v & 0xff)
-		b := byte(v >> 8)
-		if a == 0 || b == 0 {
-			result[v] = 0
-			continue
-		}
-		logA := int(logTable[a])
-		logB := int(logTable[b])
-		result[v] = expTable[logA+logB]
-	}
-	return result
-}
-
-func generateMulTableSplit(expTable []byte) [256][256]byte {
-	var result [256][256]byte
-	for a := range result {
-		for b := range result[a] {
-			if a == 0 || b == 0 {
-				result[a][b] = 0
-				continue
-			}
-			logA := int(logTable[a])
-			logB := int(logTable[b])
-			result[a][b] = expTable[logA+logB]
-		}
-	}
-	return result
-}
-
-func generateMulTableHalf(expTable []byte) (low [256][16]byte, high [256][16]byte) {
-	for a := range low {
-		for b := range low {
-			result := 0
-			if !(a == 0 || b == 0) {
-				logA := int(logTable[a])
-				logB := int(logTable[b])
-				result = int(expTable[logA+logB])
-			}
-			if (b & 0xf) == b {
-				low[a][b] = byte(result)
-			}
-			if (b & 0xf0) == b {
-				high[a][b>>4] = byte(result)
-			}
-		}
-	}
-	return
-}
--- a/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
+++ b/vendor/github.com/klauspost/reedsolomon/inversion_tree.go
@ -1,160 +0,0 @@
-/**
- * A thread-safe tree which caches inverted matrices.
- *
- * Copyright 2016, Peter Collins
- */
-
-package reedsolomon
-
-import (
-	"errors"
-	"sync"
-)
-
-// The tree uses a Reader-Writer mutex to make it thread-safe
-// when accessing cached matrices and inserting new ones.
-type inversionTree struct {
-	mutex *sync.RWMutex
-	root  inversionNode
-}
-
-type inversionNode struct {
-	matrix   matrix
-	children []*inversionNode
-}
-
-// newInversionTree initializes a tree for storing inverted matrices.
-// Note that the root node is the identity matrix as it implies
-// there were no errors with the original data.
-func newInversionTree(dataShards, parityShards int) inversionTree {
-	identity, _ := identityMatrix(dataShards)
-	root := inversionNode{
-		matrix:   identity,
-		children: make([]*inversionNode, dataShards+parityShards),
-	}
-	return inversionTree{
-		mutex: &sync.RWMutex{},
-		root:  root,
-	}
-}
-
-// GetInvertedMatrix returns the cached inverted matrix or nil if it
-// is not found in the tree keyed on the indices of invalid rows.
-func (t inversionTree) GetInvertedMatrix(invalidIndices []int) matrix {
-	// Lock the tree for reading before accessing the tree.
-	t.mutex.RLock()
-	defer t.mutex.RUnlock()
-
-	// If no invalid indices were give we should return the root
-	// identity matrix.
-	if len(invalidIndices) == 0 {
-		return t.root.matrix
-	}
-
-	// Recursively search for the inverted matrix in the tree, passing in
-	// 0 as the parent index as we start at the root of the tree.
-	return t.root.getInvertedMatrix(invalidIndices, 0)
-}
-
-// errAlreadySet is returned if the root node matrix is overwritten
-var errAlreadySet = errors.New("the root node identity matrix is already set")
-
-// InsertInvertedMatrix inserts a new inverted matrix into the tree
-// keyed by the indices of invalid rows.  The total number of shards
-// is required for creating the proper length lists of child nodes for
-// each node.
-func (t inversionTree) InsertInvertedMatrix(invalidIndices []int, matrix matrix, shards int) error {
-	// If no invalid indices were given then we are done because the
-	// root node is already set with the identity matrix.
-	if len(invalidIndices) == 0 {
-		return errAlreadySet
-	}
-
-	if !matrix.IsSquare() {
-		return errNotSquare
-	}
-
-	// Lock the tree for writing and reading before accessing the tree.
-	t.mutex.Lock()
-	defer t.mutex.Unlock()
-
-	// Recursively create nodes for the inverted matrix in the tree until
-	// we reach the node to insert the matrix to.  We start by passing in
-	// 0 as the parent index as we start at the root of the tree.
-	t.root.insertInvertedMatrix(invalidIndices, matrix, shards, 0)
-
-	return nil
-}
-
-func (n inversionNode) getInvertedMatrix(invalidIndices []int, parent int) matrix {
-	// Get the child node to search next from the list of children.  The
-	// list of children starts relative to the parent index passed in
-	// because the indices of invalid rows is sorted (by default).  As we
-	// search recursively, the first invalid index gets popped off the list,
-	// so when searching through the list of children, use that first invalid
-	// index to find the child node.
-	firstIndex := invalidIndices[0]
-	node := n.children[firstIndex-parent]
-
-	// If the child node doesn't exist in the list yet, fail fast by
-	// returning, so we can construct and insert the proper inverted matrix.
-	if node == nil {
-		return nil
-	}
-
-	// If there's more than one invalid index left in the list we should
-	// keep searching recursively.
-	if len(invalidIndices) > 1 {
-		// Search recursively on the child node by passing in the invalid indices
-		// with the first index popped off the front.  Also the parent index to
-		// pass down is the first index plus one.
-		return node.getInvertedMatrix(invalidIndices[1:], firstIndex+1)
-	}
-	// If there aren't any more invalid indices to search, we've found our
-	// node.  Return it, however keep in mind that the matrix could still be
-	// nil because intermediary nodes in the tree are created sometimes with
-	// their inversion matrices uninitialized.
-	return node.matrix
-}
-
-func (n inversionNode) insertInvertedMatrix(invalidIndices []int, matrix matrix, shards, parent int) {
-	// As above, get the child node to search next from the list of children.
-	// The list of children starts relative to the parent index passed in
-	// because the indices of invalid rows is sorted (by default).  As we
-	// search recursively, the first invalid index gets popped off the list,
-	// so when searching through the list of children, use that first invalid
-	// index to find the child node.
-	firstIndex := invalidIndices[0]
-	node := n.children[firstIndex-parent]
-
-	// If the child node doesn't exist in the list yet, create a new
-	// node because we have the writer lock and add it to the list
-	// of children.
-	if node == nil {
-		// Make the length of the list of children equal to the number
-		// of shards minus the first invalid index because the list of
-		// invalid indices is sorted, so only this length of errors
-		// are possible in the tree.
-		node = &inversionNode{
-			children: make([]*inversionNode, shards-firstIndex),
-		}
-		// Insert the new node into the tree at the first index relative
-		// to the parent index that was given in this recursive call.
-		n.children[firstIndex-parent] = node
-	}
-
-	// If there's more than one invalid index left in the list we should
-	// keep searching recursively in order to find the node to add our
-	// matrix.
-	if len(invalidIndices) > 1 {
-		// As above, search recursively on the child node by passing in
-		// the invalid indices with the first index popped off the front.
-		// Also the total number of shards and parent index are passed down
-		// which is equal to the first index plus one.
-		node.insertInvertedMatrix(invalidIndices[1:], matrix, shards, firstIndex+1)
-	} else {
-		// If there aren't any more invalid indices to search, we've found our
-		// node.  Cache the inverted matrix in this node.
-		node.matrix = matrix
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/matrix.go
+++ b/vendor/github.com/klauspost/reedsolomon/matrix.go
@ -1,279 +0,0 @@
-/**
- * Matrix Algebra over an 8-bit Galois Field
- *
- * Copyright 2015, Klaus Post
- * Copyright 2015, Backblaze, Inc.
- */
-
-package reedsolomon
-
-import (
-	"errors"
-	"fmt"
-	"strconv"
-	"strings"
-)
-
-// byte[row][col]
-type matrix [][]byte
-
-// newMatrix returns a matrix of zeros.
-func newMatrix(rows, cols int) (matrix, error) {
-	if rows <= 0 {
-		return nil, errInvalidRowSize
-	}
-	if cols <= 0 {
-		return nil, errInvalidColSize
-	}
-
-	m := matrix(make([][]byte, rows))
-	for i := range m {
-		m[i] = make([]byte, cols)
-	}
-	return m, nil
-}
-
-// NewMatrixData initializes a matrix with the given row-major data.
-// Note that data is not copied from input.
-func newMatrixData(data [][]byte) (matrix, error) {
-	m := matrix(data)
-	err := m.Check()
-	if err != nil {
-		return nil, err
-	}
-	return m, nil
-}
-
-// IdentityMatrix returns an identity matrix of the given size.
-func identityMatrix(size int) (matrix, error) {
-	m, err := newMatrix(size, size)
-	if err != nil {
-		return nil, err
-	}
-	for i := range m {
-		m[i][i] = 1
-	}
-	return m, nil
-}
-
-// errInvalidRowSize will be returned if attempting to create a matrix with negative or zero row number.
-var errInvalidRowSize = errors.New("invalid row size")
-
-// errInvalidColSize will be returned if attempting to create a matrix with negative or zero column number.
-var errInvalidColSize = errors.New("invalid column size")
-
-// errColSizeMismatch is returned if the size of matrix columns mismatch.
-var errColSizeMismatch = errors.New("column size is not the same for all rows")
-
-func (m matrix) Check() error {
-	rows := len(m)
-	if rows <= 0 {
-		return errInvalidRowSize
-	}
-	cols := len(m[0])
-	if cols <= 0 {
-		return errInvalidColSize
-	}
-
-	for _, col := range m {
-		if len(col) != cols {
-			return errColSizeMismatch
-		}
-	}
-	return nil
-}
-
-// String returns a human-readable string of the matrix contents.
-//
-// Example: [[1, 2], [3, 4]]
-func (m matrix) String() string {
-	rowOut := make([]string, 0, len(m))
-	for _, row := range m {
-		colOut := make([]string, 0, len(row))
-		for _, col := range row {
-			colOut = append(colOut, strconv.Itoa(int(col)))
-		}
-		rowOut = append(rowOut, "["+strings.Join(colOut, ", ")+"]")
-	}
-	return "[" + strings.Join(rowOut, ", ") + "]"
-}
-
-// Multiply multiplies this matrix (the one on the left) by another
-// matrix (the one on the right) and returns a new matrix with the result.
-func (m matrix) Multiply(right matrix) (matrix, error) {
-	if len(m[0]) != len(right) {
-		return nil, fmt.Errorf("columns on left (%d) is different than rows on right (%d)", len(m[0]), len(right))
-	}
-	result, _ := newMatrix(len(m), len(right[0]))
-	for r, row := range result {
-		for c := range row {
-			var value byte
-			for i := range m[0] {
-				value ^= galMultiply(m[r][i], right[i][c])
-			}
-			result[r][c] = value
-		}
-	}
-	return result, nil
-}
-
-// Augment returns the concatenation of this matrix and the matrix on the right.
-func (m matrix) Augment(right matrix) (matrix, error) {
-	if len(m) != len(right) {
-		return nil, errMatrixSize
-	}
-
-	result, _ := newMatrix(len(m), len(m[0])+len(right[0]))
-	for r, row := range m {
-		for c := range row {
-			result[r][c] = m[r][c]
-		}
-		cols := len(m[0])
-		for c := range right[0] {
-			result[r][cols+c] = right[r][c]
-		}
-	}
-	return result, nil
-}
-
-// errMatrixSize is returned if matrix dimensions are doesn't match.
-var errMatrixSize = errors.New("matrix sizes does not match")
-
-func (m matrix) SameSize(n matrix) error {
-	if len(m) != len(n) {
-		return errMatrixSize
-	}
-	for i := range m {
-		if len(m[i]) != len(n[i]) {
-			return errMatrixSize
-		}
-	}
-	return nil
-}
-
-// Returns a part of this matrix. Data is copied.
-func (m matrix) SubMatrix(rmin, cmin, rmax, cmax int) (matrix, error) {
-	result, err := newMatrix(rmax-rmin, cmax-cmin)
-	if err != nil {
-		return nil, err
-	}
-	// OPTME: If used heavily, use copy function to copy slice
-	for r := rmin; r < rmax; r++ {
-		for c := cmin; c < cmax; c++ {
-			result[r-rmin][c-cmin] = m[r][c]
-		}
-	}
-	return result, nil
-}
-
-// SwapRows Exchanges two rows in the matrix.
-func (m matrix) SwapRows(r1, r2 int) error {
-	if r1 < 0 || len(m) <= r1 || r2 < 0 || len(m) <= r2 {
-		return errInvalidRowSize
-	}
-	m[r2], m[r1] = m[r1], m[r2]
-	return nil
-}
-
-// IsSquare will return true if the matrix is square
-// and nil if the matrix is square
-func (m matrix) IsSquare() bool {
-	return len(m) == len(m[0])
-}
-
-// errSingular is returned if the matrix is singular and cannot be inversed
-var errSingular = errors.New("matrix is singular")
-
-// errNotSquare is returned if attempting to inverse a non-square matrix.
-var errNotSquare = errors.New("only square matrices can be inverted")
-
-// Invert returns the inverse of this matrix.
-// Returns ErrSingular when the matrix is singular and doesn't have an inverse.
-// The matrix must be square, otherwise ErrNotSquare is returned.
-func (m matrix) Invert() (matrix, error) {
-	if !m.IsSquare() {
-		return nil, errNotSquare
-	}
-
-	size := len(m)
-	work, _ := identityMatrix(size)
-	work, _ = m.Augment(work)
-
-	err := work.gaussianElimination()
-	if err != nil {
-		return nil, err
-	}
-
-	return work.SubMatrix(0, size, size, size*2)
-}
-
-func (m matrix) gaussianElimination() error {
-	rows := len(m)
-	columns := len(m[0])
-	// Clear out the part below the main diagonal and scale the main
-	// diagonal to be 1.
-	for r := 0; r < rows; r++ {
-		// If the element on the diagonal is 0, find a row below
-		// that has a non-zero and swap them.
-		if m[r][r] == 0 {
-			for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
-				if m[rowBelow][r] != 0 {
-					m.SwapRows(r, rowBelow)
-					break
-				}
-			}
-		}
-		// If we couldn't find one, the matrix is singular.
-		if m[r][r] == 0 {
-			return errSingular
-		}
-		// Scale to 1.
-		if m[r][r] != 1 {
-			scale := galDivide(1, m[r][r])
-			for c := 0; c < columns; c++ {
-				m[r][c] = galMultiply(m[r][c], scale)
-			}
-		}
-		// Make everything below the 1 be a 0 by subtracting
-		// a multiple of it.  (Subtraction and addition are
-		// both exclusive or in the Galois field.)
-		for rowBelow := r + 1; rowBelow < rows; rowBelow++ {
-			if m[rowBelow][r] != 0 {
-				scale := m[rowBelow][r]
-				for c := 0; c < columns; c++ {
-					m[rowBelow][c] ^= galMultiply(scale, m[r][c])
-				}
-			}
-		}
-	}
-
-	// Now clear the part above the main diagonal.
-	for d := 0; d < rows; d++ {
-		for rowAbove := 0; rowAbove < d; rowAbove++ {
-			if m[rowAbove][d] != 0 {
-				scale := m[rowAbove][d]
-				for c := 0; c < columns; c++ {
-					m[rowAbove][c] ^= galMultiply(scale, m[d][c])
-				}
-
-			}
-		}
-	}
-	return nil
-}
-
-// Create a Vandermonde matrix, which is guaranteed to have the
-// property that any subset of rows that forms a square matrix
-// is invertible.
-func vandermonde(rows, cols int) (matrix, error) {
-	result, err := newMatrix(rows, cols)
-	if err != nil {
-		return nil, err
-	}
-	for r, row := range result {
-		for c := range row {
-			result[r][c] = galExp(byte(r), c)
-		}
-	}
-	return result, nil
-}
--- a/vendor/github.com/klauspost/reedsolomon/options.go
+++ b/vendor/github.com/klauspost/reedsolomon/options.go
@ -1,67 +0,0 @@
-package reedsolomon
-
-import (
-	"runtime"
-
-	"github.com/klauspost/cpuid"
-)
-
-// Option allows to override processing parameters.
-type Option func(*options)
-
-type options struct {
-	maxGoroutines     int
-	minSplitSize      int
-	useAVX2, useSSSE3 bool
-}
-
-var defaultOptions = options{
-	maxGoroutines: 50,
-	minSplitSize:  512,
-}
-
-func init() {
-	if runtime.GOMAXPROCS(0) <= 1 {
-		defaultOptions.maxGoroutines = 1
-	}
-	// Detect CPU capabilities.
-	defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
-	defaultOptions.useAVX2 = cpuid.CPU.AVX2()
-}
-
-// WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding.
-// Jobs will be split into this many parts, unless each goroutine would have to process
-// less than minSplitSize bytes (set with WithMinSplitSize).
-// For the best speed, keep this well above the GOMAXPROCS number for more fine grained
-// scheduling.
-// If n <= 0, it is ignored.
-func WithMaxGoroutines(n int) Option {
-	return func(o *options) {
-		if n > 0 {
-			o.maxGoroutines = n
-		}
-	}
-}
-
-// MinSplitSize Is the minimum encoding size in bytes per goroutine.
-// See WithMaxGoroutines on how jobs are split.
-// If n <= 0, it is ignored.
-func WithMinSplitSize(n int) Option {
-	return func(o *options) {
-		if n > 0 {
-			o.maxGoroutines = n
-		}
-	}
-}
-
-func withSSE3(enabled bool) Option {
-	return func(o *options) {
-		o.useSSSE3 = enabled
-	}
-}
-
-func withAVX2(enabled bool) Option {
-	return func(o *options) {
-		o.useAVX2 = enabled
-	}
-}
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@ -1,596 +0,0 @@
-/**
- * Reed-Solomon Coding over 8-bit values.
- *
- * Copyright 2015, Klaus Post
- * Copyright 2015, Backblaze, Inc.
- */
-
-// Package reedsolomon enables Erasure Coding in Go
-//
-// For usage and examples, see https://github.com/klauspost/reedsolomon
-//
-package reedsolomon
-
-import (
-	"bytes"
-	"errors"
-	"io"
-	"sync"
-)
-
-// Encoder is an interface to encode Reed-Salomon parity sets for your data.
-type Encoder interface {
-	// Encodes parity for a set of data shards.
-	// Input is 'shards' containing data shards followed by parity shards.
-	// The number of shards must match the number given to New().
-	// Each shard is a byte array, and they must all be the same size.
-	// The parity shards will always be overwritten and the data shards
-	// will remain the same, so it is safe for you to read from the
-	// data shards while this is running.
-	Encode(shards [][]byte) error
-
-	// Verify returns true if the parity shards contain correct data.
-	// The data is the same format as Encode. No data is modified, so
-	// you are allowed to read from data while this is running.
-	Verify(shards [][]byte) (bool, error)
-
-	// Reconstruct will recreate the missing shards if possible.
-	//
-	// Given a list of shards, some of which contain data, fills in the
-	// ones that don't have data.
-	//
-	// The length of the array must be equal to the total number of shards.
-	// You indicate that a shard is missing by setting it to nil.
-	//
-	// If there are too few shards to reconstruct the missing
-	// ones, ErrTooFewShards will be returned.
-	//
-	// The reconstructed shard set is complete, but integrity is not verified.
-	// Use the Verify function to check if data set is ok.
-	Reconstruct(shards [][]byte) error
-
-	// Split a data slice into the number of shards given to the encoder,
-	// and create empty parity shards.
-	//
-	// The data will be split into equally sized shards.
-	// If the data size isn't dividable by the number of shards,
-	// the last shard will contain extra zeros.
-	//
-	// There must be at least 1 byte otherwise ErrShortData will be
-	// returned.
-	//
-	// The data will not be copied, except for the last shard, so you
-	// should not modify the data of the input slice afterwards.
-	Split(data []byte) ([][]byte, error)
-
-	// Join the shards and write the data segment to dst.
-	//
-	// Only the data shards are considered.
-	// You must supply the exact output size you want.
-	// If there are to few shards given, ErrTooFewShards will be returned.
-	// If the total data size is less than outSize, ErrShortData will be returned.
-	Join(dst io.Writer, shards [][]byte, outSize int) error
-}
-
-// reedSolomon contains a matrix for a specific
-// distribution of datashards and parity shards.
-// Construct if using New()
-type reedSolomon struct {
-	DataShards   int // Number of data shards, should not be modified.
-	ParityShards int // Number of parity shards, should not be modified.
-	Shards       int // Total number of shards. Calculated, and should not be modified.
-	m            matrix
-	tree         inversionTree
-	parity       [][]byte
-	o            options
-}
-
-// ErrInvShardNum will be returned by New, if you attempt to create
-// an Encoder where either data or parity shards is zero or less.
-var ErrInvShardNum = errors.New("cannot create Encoder with zero or less data/parity shards")
-
-// ErrMaxShardNum will be returned by New, if you attempt to create
-// an Encoder where data and parity shards cannot be bigger than
-// Galois field GF(2^8) - 1.
-var ErrMaxShardNum = errors.New("cannot create Encoder with 255 or more data+parity shards")
-
-// New creates a new encoder and initializes it to
-// the number of data shards and parity shards that
-// you want to use. You can reuse this encoder.
-// Note that the maximum number of data shards is 256.
-// If no options are supplied, default options are used.
-func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
-	r := reedSolomon{
-		DataShards:   dataShards,
-		ParityShards: parityShards,
-		Shards:       dataShards + parityShards,
-		o:            defaultOptions,
-	}
-
-	for _, opt := range opts {
-		opt(&r.o)
-	}
-	if dataShards <= 0 || parityShards <= 0 {
-		return nil, ErrInvShardNum
-	}
-
-	if dataShards+parityShards > 255 {
-		return nil, ErrMaxShardNum
-	}
-
-	// Start with a Vandermonde matrix.  This matrix would work,
-	// in theory, but doesn't have the property that the data
-	// shards are unchanged after encoding.
-	vm, err := vandermonde(r.Shards, dataShards)
-	if err != nil {
-		return nil, err
-	}
-
-	// Multiply by the inverse of the top square of the matrix.
-	// This will make the top square be the identity matrix, but
-	// preserve the property that any square subset of rows  is
-	// invertible.
-	top, _ := vm.SubMatrix(0, 0, dataShards, dataShards)
-	top, _ = top.Invert()
-	r.m, _ = vm.Multiply(top)
-
-	// Inverted matrices are cached in a tree keyed by the indices
-	// of the invalid rows of the data to reconstruct.
-	// The inversion root node will have the identity matrix as
-	// its inversion matrix because it implies there are no errors
-	// with the original data.
-	r.tree = newInversionTree(dataShards, parityShards)
-
-	r.parity = make([][]byte, parityShards)
-	for i := range r.parity {
-		r.parity[i] = r.m[dataShards+i]
-	}
-
-	return &r, err
-}
-
-// ErrTooFewShards is returned if too few shards where given to
-// Encode/Verify/Reconstruct. It will also be returned from Reconstruct
-// if there were too few shards to reconstruct the missing data.
-var ErrTooFewShards = errors.New("too few shards given")
-
-// Encodes parity for a set of data shards.
-// An array 'shards' containing data shards followed by parity shards.
-// The number of shards must match the number given to New.
-// Each shard is a byte array, and they must all be the same size.
-// The parity shards will always be overwritten and the data shards
-// will remain the same.
-func (r reedSolomon) Encode(shards [][]byte) error {
-	if len(shards) != r.Shards {
-		return ErrTooFewShards
-	}
-
-	err := checkShards(shards, false)
-	if err != nil {
-		return err
-	}
-
-	// Get the slice of output buffers.
-	output := shards[r.DataShards:]
-
-	// Do the coding.
-	r.codeSomeShards(r.parity, shards[0:r.DataShards], output, r.ParityShards, len(shards[0]))
-	return nil
-}
-
-// Verify returns true if the parity shards contain the right data.
-// The data is the same format as Encode. No data is modified.
-func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
-	if len(shards) != r.Shards {
-		return false, ErrTooFewShards
-	}
-	err := checkShards(shards, false)
-	if err != nil {
-		return false, err
-	}
-
-	// Slice of buffers being checked.
-	toCheck := shards[r.DataShards:]
-
-	// Do the checking.
-	return r.checkSomeShards(r.parity, shards[0:r.DataShards], toCheck, r.ParityShards, len(shards[0])), nil
-}
-
-// Multiplies a subset of rows from a coding matrix by a full set of
-// input shards to produce some output shards.
-// 'matrixRows' is The rows from the matrix to use.
-// 'inputs' An array of byte arrays, each of which is one input shard.
-// The number of inputs used is determined by the length of each matrix row.
-// outputs Byte arrays where the computed shards are stored.
-// The number of outputs computed, and the
-// number of matrix rows used, is determined by
-// outputCount, which is the number of outputs to compute.
-func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
-	if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
-		r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
-		return
-	}
-	for c := 0; c < r.DataShards; c++ {
-		in := inputs[c]
-		for iRow := 0; iRow < outputCount; iRow++ {
-			if c == 0 {
-				galMulSlice(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
-			} else {
-				galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
-			}
-		}
-	}
-}
-
-// Perform the same as codeSomeShards, but split the workload into
-// several goroutines.
-func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
-	var wg sync.WaitGroup
-	do := byteCount / r.o.maxGoroutines
-	if do < r.o.minSplitSize {
-		do = r.o.minSplitSize
-	}
-	start := 0
-	for start < byteCount {
-		if start+do > byteCount {
-			do = byteCount - start
-		}
-		wg.Add(1)
-		go func(start, stop int) {
-			for c := 0; c < r.DataShards; c++ {
-				in := inputs[c]
-				for iRow := 0; iRow < outputCount; iRow++ {
-					if c == 0 {
-						galMulSlice(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
-					} else {
-						galMulSliceXor(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
-					}
-				}
-			}
-			wg.Done()
-		}(start, start+do)
-		start += do
-	}
-	wg.Wait()
-}
-
-// checkSomeShards is mostly the same as codeSomeShards,
-// except this will check values and return
-// as soon as a difference is found.
-func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
-	if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
-		return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
-	}
-	outputs := make([][]byte, len(toCheck))
-	for i := range outputs {
-		outputs[i] = make([]byte, byteCount)
-	}
-	for c := 0; c < r.DataShards; c++ {
-		in := inputs[c]
-		for iRow := 0; iRow < outputCount; iRow++ {
-			galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
-		}
-	}
-
-	for i, calc := range outputs {
-		if !bytes.Equal(calc, toCheck[i]) {
-			return false
-		}
-	}
-	return true
-}
-
-func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
-	same := true
-	var mu sync.RWMutex // For above
-
-	var wg sync.WaitGroup
-	do := byteCount / r.o.maxGoroutines
-	if do < r.o.minSplitSize {
-		do = r.o.minSplitSize
-	}
-	start := 0
-	for start < byteCount {
-		if start+do > byteCount {
-			do = byteCount - start
-		}
-		wg.Add(1)
-		go func(start, do int) {
-			defer wg.Done()
-			outputs := make([][]byte, len(toCheck))
-			for i := range outputs {
-				outputs[i] = make([]byte, do)
-			}
-			for c := 0; c < r.DataShards; c++ {
-				mu.RLock()
-				if !same {
-					mu.RUnlock()
-					return
-				}
-				mu.RUnlock()
-				in := inputs[c][start : start+do]
-				for iRow := 0; iRow < outputCount; iRow++ {
-					galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
-				}
-			}
-
-			for i, calc := range outputs {
-				if !bytes.Equal(calc, toCheck[i][start:start+do]) {
-					mu.Lock()
-					same = false
-					mu.Unlock()
-					return
-				}
-			}
-		}(start, do)
-		start += do
-	}
-	wg.Wait()
-	return same
-}
-
-// ErrShardNoData will be returned if there are no shards,
-// or if the length of all shards is zero.
-var ErrShardNoData = errors.New("no shard data")
-
-// ErrShardSize is returned if shard length isn't the same for all
-// shards.
-var ErrShardSize = errors.New("shard sizes does not match")
-
-// checkShards will check if shards are the same size
-// or 0, if allowed. An error is returned if this fails.
-// An error is also returned if all shards are size 0.
-func checkShards(shards [][]byte, nilok bool) error {
-	size := shardSize(shards)
-	if size == 0 {
-		return ErrShardNoData
-	}
-	for _, shard := range shards {
-		if len(shard) != size {
-			if len(shard) != 0 || !nilok {
-				return ErrShardSize
-			}
-		}
-	}
-	return nil
-}
-
-// shardSize return the size of a single shard.
-// The first non-zero size is returned,
-// or 0 if all shards are size 0.
-func shardSize(shards [][]byte) int {
-	for _, shard := range shards {
-		if len(shard) != 0 {
-			return len(shard)
-		}
-	}
-	return 0
-}
-
-// Reconstruct will recreate the missing shards, if possible.
-//
-// Given a list of shards, some of which contain data, fills in the
-// ones that don't have data.
-//
-// The length of the array must be equal to Shards.
-// You indicate that a shard is missing by setting it to nil.
-//
-// If there are too few shards to reconstruct the missing
-// ones, ErrTooFewShards will be returned.
-//
-// The reconstructed shard set is complete, but integrity is not verified.
-// Use the Verify function to check if data set is ok.
-func (r reedSolomon) Reconstruct(shards [][]byte) error {
-	if len(shards) != r.Shards {
-		return ErrTooFewShards
-	}
-	// Check arguments.
-	err := checkShards(shards, true)
-	if err != nil {
-		return err
-	}
-
-	shardSize := shardSize(shards)
-
-	// Quick check: are all of the shards present?  If so, there's
-	// nothing to do.
-	numberPresent := 0
-	for i := 0; i < r.Shards; i++ {
-		if len(shards[i]) != 0 {
-			numberPresent++
-		}
-	}
-	if numberPresent == r.Shards {
-		// Cool.  All of the shards data data.  We don't
-		// need to do anything.
-		return nil
-	}
-
-	// More complete sanity check
-	if numberPresent < r.DataShards {
-		return ErrTooFewShards
-	}
-
-	// Pull out an array holding just the shards that
-	// correspond to the rows of the submatrix.  These shards
-	// will be the input to the decoding process that re-creates
-	// the missing data shards.
-	//
-	// Also, create an array of indices of the valid rows we do have
-	// and the invalid rows we don't have up until we have enough valid rows.
-	subShards := make([][]byte, r.DataShards)
-	validIndices := make([]int, r.DataShards)
-	invalidIndices := make([]int, 0)
-	subMatrixRow := 0
-	for matrixRow := 0; matrixRow < r.Shards && subMatrixRow < r.DataShards; matrixRow++ {
-		if len(shards[matrixRow]) != 0 {
-			subShards[subMatrixRow] = shards[matrixRow]
-			validIndices[subMatrixRow] = matrixRow
-			subMatrixRow++
-		} else {
-			invalidIndices = append(invalidIndices, matrixRow)
-		}
-	}
-
-	// Attempt to get the cached inverted matrix out of the tree
-	// based on the indices of the invalid rows.
-	dataDecodeMatrix := r.tree.GetInvertedMatrix(invalidIndices)
-
-	// If the inverted matrix isn't cached in the tree yet we must
-	// construct it ourselves and insert it into the tree for the
-	// future.  In this way the inversion tree is lazily loaded.
-	if dataDecodeMatrix == nil {
-		// Pull out the rows of the matrix that correspond to the
-		// shards that we have and build a square matrix.  This
-		// matrix could be used to generate the shards that we have
-		// from the original data.
-		subMatrix, _ := newMatrix(r.DataShards, r.DataShards)
-		for subMatrixRow, validIndex := range validIndices {
-			for c := 0; c < r.DataShards; c++ {
-				subMatrix[subMatrixRow][c] = r.m[validIndex][c]
-			}
-		}
-		// Invert the matrix, so we can go from the encoded shards
-		// back to the original data.  Then pull out the row that
-		// generates the shard that we want to decode.  Note that
-		// since this matrix maps back to the original data, it can
-		// be used to create a data shard, but not a parity shard.
-		dataDecodeMatrix, err = subMatrix.Invert()
-		if err != nil {
-			return err
-		}
-
-		// Cache the inverted matrix in the tree for future use keyed on the
-		// indices of the invalid rows.
-		err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.Shards)
-		if err != nil {
-			return err
-		}
-	}
-
-	// Re-create any data shards that were missing.
-	//
-	// The input to the coding is all of the shards we actually
-	// have, and the output is the missing data shards.  The computation
-	// is done using the special decode matrix we just built.
-	outputs := make([][]byte, r.ParityShards)
-	matrixRows := make([][]byte, r.ParityShards)
-	outputCount := 0
-
-	for iShard := 0; iShard < r.DataShards; iShard++ {
-		if len(shards[iShard]) == 0 {
-			shards[iShard] = make([]byte, shardSize)
-			outputs[outputCount] = shards[iShard]
-			matrixRows[outputCount] = dataDecodeMatrix[iShard]
-			outputCount++
-		}
-	}
-	r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize)
-
-	// Now that we have all of the data shards intact, we can
-	// compute any of the parity that is missing.
-	//
-	// The input to the coding is ALL of the data shards, including
-	// any that we just calculated.  The output is whichever of the
-	// data shards were missing.
-	outputCount = 0
-	for iShard := r.DataShards; iShard < r.Shards; iShard++ {
-		if len(shards[iShard]) == 0 {
-			shards[iShard] = make([]byte, shardSize)
-			outputs[outputCount] = shards[iShard]
-			matrixRows[outputCount] = r.parity[iShard-r.DataShards]
-			outputCount++
-		}
-	}
-	r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize)
-	return nil
-}
-
-// ErrShortData will be returned by Split(), if there isn't enough data
-// to fill the number of shards.
-var ErrShortData = errors.New("not enough data to fill the number of requested shards")
-
-// Split a data slice into the number of shards given to the encoder,
-// and create empty parity shards.
-//
-// The data will be split into equally sized shards.
-// If the data size isn't divisible by the number of shards,
-// the last shard will contain extra zeros.
-//
-// There must be at least 1 byte otherwise ErrShortData will be
-// returned.
-//
-// The data will not be copied, except for the last shard, so you
-// should not modify the data of the input slice afterwards.
-func (r reedSolomon) Split(data []byte) ([][]byte, error) {
-	if len(data) == 0 {
-		return nil, ErrShortData
-	}
-	// Calculate number of bytes per shard.
-	perShard := (len(data) + r.DataShards - 1) / r.DataShards
-
-	// Pad data to r.Shards*perShard.
-	padding := make([]byte, (r.Shards*perShard)-len(data))
-	data = append(data, padding...)
-
-	// Split into equal-length shards.
-	dst := make([][]byte, r.Shards)
-	for i := range dst {
-		dst[i] = data[:perShard]
-		data = data[perShard:]
-	}
-
-	return dst, nil
-}
-
-// ErrReconstructRequired is returned if too few data shards are intact and a
-// reconstruction is required before you can successfully join the shards.
-var ErrReconstructRequired = errors.New("reconstruction required as one or more required data shards are nil")
-
-// Join the shards and write the data segment to dst.
-//
-// Only the data shards are considered.
-// You must supply the exact output size you want.
-//
-// If there are to few shards given, ErrTooFewShards will be returned.
-// If the total data size is less than outSize, ErrShortData will be returned.
-// If one or more required data shards are nil, ErrReconstructRequired will be returned.
-func (r reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error {
-	// Do we have enough shards?
-	if len(shards) < r.DataShards {
-		return ErrTooFewShards
-	}
-	shards = shards[:r.DataShards]
-
-	// Do we have enough data?
-	size := 0
-	for _, shard := range shards {
-		if shard == nil {
-			return ErrReconstructRequired
-		}
-		size += len(shard)
-
-		// Do we have enough data already?
-		if size >= outSize {
-			break
-		}
-	}
-	if size < outSize {
-		return ErrShortData
-	}
-
-	// Copy data to dst
-	write := outSize
-	for _, shard := range shards {
-		if write < len(shard) {
-			_, err := dst.Write(shard[:write])
-			return err
-		}
-		n, err := dst.Write(shard)
-		if err != nil {
-			return err
-		}
-		write -= n
-	}
-	return nil
-}
--- a/vendor/github.com/klauspost/reedsolomon/streaming.go
+++ b/vendor/github.com/klauspost/reedsolomon/streaming.go
@ -1,575 +0,0 @@
-/**
- * Reed-Solomon Coding over 8-bit values.
- *
- * Copyright 2015, Klaus Post
- * Copyright 2015, Backblaze, Inc.
- */
-
-package reedsolomon
-
-import (
-	"bytes"
-	"errors"
-	"fmt"
-	"io"
-	"sync"
-)
-
-// StreamEncoder is an interface to encode Reed-Salomon parity sets for your data.
-// It provides a fully streaming interface, and processes data in blocks of up to 4MB.
-//
-// For small shard sizes, 10MB and below, it is recommended to use the in-memory interface,
-// since the streaming interface has a start up overhead.
-//
-// For all operations, no readers and writers should not assume any order/size of
-// individual reads/writes.
-//
-// For usage examples, see "stream-encoder.go" and "streamdecoder.go" in the examples
-// folder.
-type StreamEncoder interface {
-	// Encodes parity shards for a set of data shards.
-	//
-	// Input is 'shards' containing readers for data shards followed by parity shards
-	// io.Writer.
-	//
-	// The number of shards must match the number given to NewStream().
-	//
-	// Each reader must supply the same number of bytes.
-	//
-	// The parity shards will be written to the writer.
-	// The number of bytes written will match the input size.
-	//
-	// If a data stream returns an error, a StreamReadError type error
-	// will be returned. If a parity writer returns an error, a
-	// StreamWriteError will be returned.
-	Encode(data []io.Reader, parity []io.Writer) error
-
-	// Verify returns true if the parity shards contain correct data.
-	//
-	// The number of shards must match the number total data+parity shards
-	// given to NewStream().
-	//
-	// Each reader must supply the same number of bytes.
-	// If a shard stream returns an error, a StreamReadError type error
-	// will be returned.
-	Verify(shards []io.Reader) (bool, error)
-
-	// Reconstruct will recreate the missing shards if possible.
-	//
-	// Given a list of valid shards (to read) and invalid shards (to write)
-	//
-	// You indicate that a shard is missing by setting it to nil in the 'valid'
-	// slice and at the same time setting a non-nil writer in "fill".
-	// An index cannot contain both non-nil 'valid' and 'fill' entry.
-	// If both are provided 'ErrReconstructMismatch' is returned.
-	//
-	// If there are too few shards to reconstruct the missing
-	// ones, ErrTooFewShards will be returned.
-	//
-	// The reconstructed shard set is complete, but integrity is not verified.
-	// Use the Verify function to check if data set is ok.
-	Reconstruct(valid []io.Reader, fill []io.Writer) error
-
-	// Split a an input stream into the number of shards given to the encoder.
-	//
-	// The data will be split into equally sized shards.
-	// If the data size isn't dividable by the number of shards,
-	// the last shard will contain extra zeros.
-	//
-	// You must supply the total size of your input.
-	// 'ErrShortData' will be returned if it is unable to retrieve the
-	// number of bytes indicated.
-	Split(data io.Reader, dst []io.Writer, size int64) (err error)
-
-	// Join the shards and write the data segment to dst.
-	//
-	// Only the data shards are considered.
-	//
-	// You must supply the exact output size you want.
-	// If there are to few shards given, ErrTooFewShards will be returned.
-	// If the total data size is less than outSize, ErrShortData will be returned.
-	Join(dst io.Writer, shards []io.Reader, outSize int64) error
-}
-
-// StreamReadError is returned when a read error is encountered
-// that relates to a supplied stream.
-// This will allow you to find out which reader has failed.
-type StreamReadError struct {
-	Err    error // The error
-	Stream int   // The stream number on which the error occurred
-}
-
-// Error returns the error as a string
-func (s StreamReadError) Error() string {
-	return fmt.Sprintf("error reading stream %d: %s", s.Stream, s.Err)
-}
-
-// String returns the error as a string
-func (s StreamReadError) String() string {
-	return s.Error()
-}
-
-// StreamWriteError is returned when a write error is encountered
-// that relates to a supplied stream. This will allow you to
-// find out which reader has failed.
-type StreamWriteError struct {
-	Err    error // The error
-	Stream int   // The stream number on which the error occurred
-}
-
-// Error returns the error as a string
-func (s StreamWriteError) Error() string {
-	return fmt.Sprintf("error writing stream %d: %s", s.Stream, s.Err)
-}
-
-// String returns the error as a string
-func (s StreamWriteError) String() string {
-	return s.Error()
-}
-
-// rsStream contains a matrix for a specific
-// distribution of datashards and parity shards.
-// Construct if using NewStream()
-type rsStream struct {
-	r  *reedSolomon
-	bs int // Block size
-	// Shard reader
-	readShards func(dst [][]byte, in []io.Reader) error
-	// Shard writer
-	writeShards func(out []io.Writer, in [][]byte) error
-	creads      bool
-	cwrites     bool
-}
-
-// NewStream creates a new encoder and initializes it to
-// the number of data shards and parity shards that
-// you want to use. You can reuse this encoder.
-// Note that the maximum number of data shards is 256.
-func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) {
-	enc, err := New(dataShards, parityShards, o...)
-	if err != nil {
-		return nil, err
-	}
-	rs := enc.(*reedSolomon)
-	r := rsStream{r: rs, bs: 4 << 20}
-	r.readShards = readShards
-	r.writeShards = writeShards
-	return &r, err
-}
-
-// NewStreamC creates a new encoder and initializes it to
-// the number of data shards and parity shards given.
-//
-// This functions as 'NewStream', but allows you to enable CONCURRENT reads and writes.
-func NewStreamC(dataShards, parityShards int, conReads, conWrites bool, o ...Option) (StreamEncoder, error) {
-	enc, err := New(dataShards, parityShards, o...)
-	if err != nil {
-		return nil, err
-	}
-	rs := enc.(*reedSolomon)
-	r := rsStream{r: rs, bs: 4 << 20}
-	r.readShards = readShards
-	r.writeShards = writeShards
-	if conReads {
-		r.readShards = cReadShards
-	}
-	if conWrites {
-		r.writeShards = cWriteShards
-	}
-	return &r, err
-}
-
-func createSlice(n, length int) [][]byte {
-	out := make([][]byte, n)
-	for i := range out {
-		out[i] = make([]byte, length)
-	}
-	return out
-}
-
-// Encodes parity shards for a set of data shards.
-//
-// Input is 'shards' containing readers for data shards followed by parity shards
-// io.Writer.
-//
-// The number of shards must match the number given to NewStream().
-//
-// Each reader must supply the same number of bytes.
-//
-// The parity shards will be written to the writer.
-// The number of bytes written will match the input size.
-//
-// If a data stream returns an error, a StreamReadError type error
-// will be returned. If a parity writer returns an error, a
-// StreamWriteError will be returned.
-func (r rsStream) Encode(data []io.Reader, parity []io.Writer) error {
-	if len(data) != r.r.DataShards {
-		return ErrTooFewShards
-	}
-
-	if len(parity) != r.r.ParityShards {
-		return ErrTooFewShards
-	}
-
-	all := createSlice(r.r.Shards, r.bs)
-	in := all[:r.r.DataShards]
-	out := all[r.r.DataShards:]
-	read := 0
-
-	for {
-		err := r.readShards(in, data)
-		switch err {
-		case nil:
-		case io.EOF:
-			if read == 0 {
-				return ErrShardNoData
-			}
-			return nil
-		default:
-			return err
-		}
-		out = trimShards(out, shardSize(in))
-		read += shardSize(in)
-		err = r.r.Encode(all)
-		if err != nil {
-			return err
-		}
-		err = r.writeShards(parity, out)
-		if err != nil {
-			return err
-		}
-	}
-}
-
-// Trim the shards so they are all the same size
-func trimShards(in [][]byte, size int) [][]byte {
-	for i := range in {
-		if in[i] != nil {
-			in[i] = in[i][0:size]
-		}
-		if len(in[i]) < size {
-			in[i] = nil
-		}
-	}
-	return in
-}
-
-func readShards(dst [][]byte, in []io.Reader) error {
-	if len(in) != len(dst) {
-		panic("internal error: in and dst size does not match")
-	}
-	size := -1
-	for i := range in {
-		if in[i] == nil {
-			dst[i] = nil
-			continue
-		}
-		n, err := io.ReadFull(in[i], dst[i])
-		// The error is EOF only if no bytes were read.
-		// If an EOF happens after reading some but not all the bytes,
-		// ReadFull returns ErrUnexpectedEOF.
-		switch err {
-		case io.ErrUnexpectedEOF, io.EOF:
-			if size < 0 {
-				size = n
-			} else if n != size {
-				// Shard sizes must match.
-				return ErrShardSize
-			}
-			dst[i] = dst[i][0:n]
-		case nil:
-			continue
-		default:
-			return StreamReadError{Err: err, Stream: i}
-		}
-	}
-	if size == 0 {
-		return io.EOF
-	}
-	return nil
-}
-
-func writeShards(out []io.Writer, in [][]byte) error {
-	if len(out) != len(in) {
-		panic("internal error: in and out size does not match")
-	}
-	for i := range in {
-		if out[i] == nil {
-			continue
-		}
-		n, err := out[i].Write(in[i])
-		if err != nil {
-			return StreamWriteError{Err: err, Stream: i}
-		}
-		//
-		if n != len(in[i]) {
-			return StreamWriteError{Err: io.ErrShortWrite, Stream: i}
-		}
-	}
-	return nil
-}
-
-type readResult struct {
-	n    int
-	size int
-	err  error
-}
-
-// cReadShards reads shards concurrently
-func cReadShards(dst [][]byte, in []io.Reader) error {
-	if len(in) != len(dst) {
-		panic("internal error: in and dst size does not match")
-	}
-	var wg sync.WaitGroup
-	wg.Add(len(in))
-	res := make(chan readResult, len(in))
-	for i := range in {
-		if in[i] == nil {
-			dst[i] = nil
-			wg.Done()
-			continue
-		}
-		go func(i int) {
-			defer wg.Done()
-			n, err := io.ReadFull(in[i], dst[i])
-			// The error is EOF only if no bytes were read.
-			// If an EOF happens after reading some but not all the bytes,
-			// ReadFull returns ErrUnexpectedEOF.
-			res <- readResult{size: n, err: err, n: i}
-
-		}(i)
-	}
-	wg.Wait()
-	close(res)
-	size := -1
-	for r := range res {
-		switch r.err {
-		case io.ErrUnexpectedEOF, io.EOF:
-			if size < 0 {
-				size = r.size
-			} else if r.size != size {
-				// Shard sizes must match.
-				return ErrShardSize
-			}
-			dst[r.n] = dst[r.n][0:r.size]
-		case nil:
-		default:
-			return StreamReadError{Err: r.err, Stream: r.n}
-		}
-	}
-	if size == 0 {
-		return io.EOF
-	}
-	return nil
-}
-
-// cWriteShards writes shards concurrently
-func cWriteShards(out []io.Writer, in [][]byte) error {
-	if len(out) != len(in) {
-		panic("internal error: in and out size does not match")
-	}
-	var errs = make(chan error, len(out))
-	var wg sync.WaitGroup
-	wg.Add(len(out))
-	for i := range in {
-		go func(i int) {
-			defer wg.Done()
-			if out[i] == nil {
-				errs <- nil
-				return
-			}
-			n, err := out[i].Write(in[i])
-			if err != nil {
-				errs <- StreamWriteError{Err: err, Stream: i}
-				return
-			}
-			if n != len(in[i]) {
-				errs <- StreamWriteError{Err: io.ErrShortWrite, Stream: i}
-			}
-		}(i)
-	}
-	wg.Wait()
-	close(errs)
-	for err := range errs {
-		if err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
-
-// Verify returns true if the parity shards contain correct data.
-//
-// The number of shards must match the number total data+parity shards
-// given to NewStream().
-//
-// Each reader must supply the same number of bytes.
-// If a shard stream returns an error, a StreamReadError type error
-// will be returned.
-func (r rsStream) Verify(shards []io.Reader) (bool, error) {
-	if len(shards) != r.r.Shards {
-		return false, ErrTooFewShards
-	}
-
-	read := 0
-	all := createSlice(r.r.Shards, r.bs)
-	for {
-		err := r.readShards(all, shards)
-		if err == io.EOF {
-			if read == 0 {
-				return false, ErrShardNoData
-			}
-			return true, nil
-		}
-		if err != nil {
-			return false, err
-		}
-		read += shardSize(all)
-		ok, err := r.r.Verify(all)
-		if !ok || err != nil {
-			return ok, err
-		}
-	}
-}
-
-// ErrReconstructMismatch is returned by the StreamEncoder, if you supply
-// "valid" and "fill" streams on the same index.
-// Therefore it is impossible to see if you consider the shard valid
-// or would like to have it reconstructed.
-var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutually exclusive")
-
-// Reconstruct will recreate the missing shards if possible.
-//
-// Given a list of valid shards (to read) and invalid shards (to write)
-//
-// You indicate that a shard is missing by setting it to nil in the 'valid'
-// slice and at the same time setting a non-nil writer in "fill".
-// An index cannot contain both non-nil 'valid' and 'fill' entry.
-//
-// If there are too few shards to reconstruct the missing
-// ones, ErrTooFewShards will be returned.
-//
-// The reconstructed shard set is complete, but integrity is not verified.
-// Use the Verify function to check if data set is ok.
-func (r rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error {
-	if len(valid) != r.r.Shards {
-		return ErrTooFewShards
-	}
-	if len(fill) != r.r.Shards {
-		return ErrTooFewShards
-	}
-
-	all := createSlice(r.r.Shards, r.bs)
-	for i := range valid {
-		if valid[i] != nil && fill[i] != nil {
-			return ErrReconstructMismatch
-		}
-	}
-
-	read := 0
-	for {
-		err := r.readShards(all, valid)
-		if err == io.EOF {
-			if read == 0 {
-				return ErrShardNoData
-			}
-			return nil
-		}
-		if err != nil {
-			return err
-		}
-		read += shardSize(all)
-		all = trimShards(all, shardSize(all))
-
-		err = r.r.Reconstruct(all)
-		if err != nil {
-			return err
-		}
-		err = r.writeShards(fill, all)
-		if err != nil {
-			return err
-		}
-	}
-}
-
-// Join the shards and write the data segment to dst.
-//
-// Only the data shards are considered.
-//
-// You must supply the exact output size you want.
-// If there are to few shards given, ErrTooFewShards will be returned.
-// If the total data size is less than outSize, ErrShortData will be returned.
-func (r rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error {
-	// Do we have enough shards?
-	if len(shards) < r.r.DataShards {
-		return ErrTooFewShards
-	}
-
-	// Trim off parity shards if any
-	shards = shards[:r.r.DataShards]
-	for i := range shards {
-		if shards[i] == nil {
-			return StreamReadError{Err: ErrShardNoData, Stream: i}
-		}
-	}
-	// Join all shards
-	src := io.MultiReader(shards...)
-
-	// Copy data to dst
-	n, err := io.CopyN(dst, src, outSize)
-	if err == io.EOF {
-		return ErrShortData
-	}
-	if err != nil {
-		return err
-	}
-	if n != outSize {
-		return ErrShortData
-	}
-	return nil
-}
-
-// Split a an input stream into the number of shards given to the encoder.
-//
-// The data will be split into equally sized shards.
-// If the data size isn't dividable by the number of shards,
-// the last shard will contain extra zeros.
-//
-// You must supply the total size of your input.
-// 'ErrShortData' will be returned if it is unable to retrieve the
-// number of bytes indicated.
-func (r rsStream) Split(data io.Reader, dst []io.Writer, size int64) error {
-	if size == 0 {
-		return ErrShortData
-	}
-	if len(dst) != r.r.DataShards {
-		return ErrInvShardNum
-	}
-
-	for i := range dst {
-		if dst[i] == nil {
-			return StreamWriteError{Err: ErrShardNoData, Stream: i}
-		}
-	}
-
-	// Calculate number of bytes per shard.
-	perShard := (size + int64(r.r.DataShards) - 1) / int64(r.r.DataShards)
-
-	// Pad data to r.Shards*perShard.
-	padding := make([]byte, (int64(r.r.Shards)*perShard)-size)
-	data = io.MultiReader(data, bytes.NewBuffer(padding))
-
-	// Split into equal-length shards and copy.
-	for i := range dst {
-		n, err := io.CopyN(dst[i], data, perShard)
-		if err != io.EOF && err != nil {
-			return err
-		}
-		if n != perShard {
-			return ErrShortData
-		}
-	}
-
-	return nil
-}
--- a/vendor/github.com/templexxx/cpufeat/LICENSE
+++ b/vendor/github.com/templexxx/cpufeat/LICENSE
@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/vendor/github.com/templexxx/cpufeat/cpu.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu.go
@ -0,0 +1,32 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cpufeat implements processor feature detection
+// used by the Go standard library.
+package cpufeat
+
+// X86 holds the detected x86 processor features. Its fields are filled in
+// by the init function in cpu_x86.go.
+var X86 x86
+
+// The booleans in x86 contain the correspondingly named cpuid feature bit.
+// HasAVX and HasAVX2 are only set if the OS does support XMM and YMM registers
+// in addition to the cpuid feature bit being set.
+// The struct is padded to avoid false sharing.
+type x86 struct {
+	_            [CacheLineSize]byte // leading padding
+	HasAES       bool
+	HasAVX       bool
+	HasAVX2      bool
+	HasBMI1      bool
+	HasBMI2      bool
+	HasERMS      bool
+	HasOSXSAVE   bool
+	HasPCLMULQDQ bool
+	HasPOPCNT    bool
+	HasSSE2      bool
+	HasSSE3      bool
+	HasSSSE3     bool
+	HasSSE41     bool
+	HasSSE42     bool
+	_            [CacheLineSize]byte // trailing padding
+}
--- a/vendor/github.com/templexxx/cpufeat/cpu_arm.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_arm.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 32
--- a/vendor/github.com/templexxx/cpufeat/cpu_arm64.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_arm64.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 32
--- a/vendor/github.com/templexxx/cpufeat/cpu_mips.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_mips.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 32
--- a/vendor/github.com/templexxx/cpufeat/cpu_mips64.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_mips64.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 32
--- a/vendor/github.com/templexxx/cpufeat/cpu_mips64le.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_mips64le.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 32
--- a/vendor/github.com/templexxx/cpufeat/cpu_mipsle.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_mipsle.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 32
--- a/vendor/github.com/templexxx/cpufeat/cpu_ppc64.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_ppc64.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 128
--- a/vendor/github.com/templexxx/cpufeat/cpu_ppc64le.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_ppc64le.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 128
--- a/vendor/github.com/templexxx/cpufeat/cpu_s390x.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_s390x.go
@ -0,0 +1,7 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cpufeat
+
+const CacheLineSize = 256
--- a/vendor/github.com/templexxx/cpufeat/cpu_x86.go
+++ b/vendor/github.com/templexxx/cpufeat/cpu_x86.go
@ -0,0 +1,59 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+
+package cpufeat
+
+const CacheLineSize = 64
+
+// cpuid is implemented in cpu_x86.s.
+func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+
+// xgetbv with ecx = 0 is implemented in cpu_x86.s.
+func xgetbv() (eax, edx uint32)
+
+// init queries the processor through cpuid (and xgetbv when OSXSAVE is
+// reported) and records the detected features in X86. Flags that are never
+// reached keep their zero value, false.
+func init() {
+	// The first result of leaf 0 is used as the highest leaf that may be queried.
+	maxId, _, _, _ := cpuid(0, 0)
+
+	if maxId < 1 {
+		return
+	}
+
+	// Leaf 1: feature bits are read from the ecx and edx results.
+	_, _, ecx1, edx1 := cpuid(1, 0)
+	X86.HasSSE2 = isSet(26, edx1)
+
+	X86.HasSSE3 = isSet(0, ecx1)
+	X86.HasPCLMULQDQ = isSet(1, ecx1)
+	X86.HasSSSE3 = isSet(9, ecx1)
+	X86.HasSSE41 = isSet(19, ecx1)
+	X86.HasSSE42 = isSet(20, ecx1)
+	X86.HasPOPCNT = isSet(23, ecx1)
+	X86.HasAES = isSet(25, ecx1)
+	X86.HasOSXSAVE = isSet(27, ecx1)
+
+	osSupportsAVX := false
+	// For XGETBV, OSXSAVE bit is required and sufficient.
+	if X86.HasOSXSAVE {
+		eax, _ := xgetbv()
+		// Check if XMM and YMM registers have OS support.
+		osSupportsAVX = isSet(1, eax) && isSet(2, eax)
+	}
+
+	// AVX needs both the cpuid bit and the OS support determined above.
+	X86.HasAVX = isSet(28, ecx1) && osSupportsAVX
+
+	// The remaining flags come from leaf 7; skip them if it cannot be queried.
+	if maxId < 7 {
+		return
+	}
+
+	_, ebx7, _, _ := cpuid(7, 0)
+	X86.HasBMI1 = isSet(3, ebx7)
+	X86.HasAVX2 = isSet(5, ebx7) && osSupportsAVX
+	X86.HasBMI2 = isSet(8, ebx7)
+	X86.HasERMS = isSet(9, ebx7)
+}
+
+// isSet reports whether bit number bitpos of value is set, counting from
+// the least significant bit (bit 0).
+func isSet(bitpos uint, value uint32) bool {
+	return (value>>bitpos)&1 == 1
+}
--- a/vendor/github.com/templexxx/cpufeat/cpu_x86.s
+++ b/vendor/github.com/templexxx/cpufeat/cpu_x86.s
@ -0,0 +1,32 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32
+
+#include "textflag.h"
+
+// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
+// cpuid loads AX and CX from the two arguments, executes CPUID and stores
+// the resulting AX, BX, CX and DX into the four results.
+TEXT ·cpuid(SB), NOSPLIT, $0-24
+	MOVL eaxArg+0(FP), AX
+	MOVL ecxArg+4(FP), CX
+	CPUID
+	MOVL AX, eax+8(FP)
+	MOVL BX, ebx+12(FP)
+	MOVL CX, ecx+16(FP)
+	MOVL DX, edx+20(FP)
+	RET
+
+// func xgetbv() (eax, edx uint32)
+// xgetbv executes XGETBV with CX = 0 and stores the resulting AX and DX
+// into the two results. When built for nacl it stores zero in both results
+// instead.
+TEXT ·xgetbv(SB),NOSPLIT,$0-8
+#ifdef GOOS_nacl
+	// nacl does not support XGETBV.
+	MOVL $0, eax+0(FP)
+	MOVL $0, edx+4(FP)
+#else
+	MOVL $0, CX
+	// The instruction is emitted as raw bytes rather than by mnemonic.
+	WORD $0x010f; BYTE $0xd0 //XGETBV
+	MOVL AX, eax+0(FP)
+	MOVL DX, edx+4(FP)
+#endif
+	RET
--- a/vendor/github.com/templexxx/reedsolomon/LICENSE
+++ b/vendor/github.com/templexxx/reedsolomon/LICENSE
@ -1,5 +1,6 @@
-The MIT License (MIT)
+MIT License

+Copyright (c) 2017 Templexxx
 Copyright (c) 2015 Klaus Post
 Copyright (c) 2015 Backblaze

@ -20,4 +21,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
-
--- a/vendor/github.com/templexxx/reedsolomon/mathtool/cntinverse.go
+++ b/vendor/github.com/templexxx/reedsolomon/mathtool/cntinverse.go
@ -0,0 +1,193 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+)
+
+var vects = flag.Uint64("vects", 20, "number of vects (data+parity)")
+var data = flag.Uint64("data", 0, "number of data vects; keep it empty if you want to "+
+	"get the max num of inverse matrix")
+
+// init installs a custom usage message for the flag package: a short
+// header followed by the defaults of every registered flag.
+func init() {
+	usage := func() {
+		fmt.Printf("Usage of %s:\n", os.Args[0])
+		fmt.Println("  cntinverse [-flags]")
+		fmt.Println("  Valid flags:")
+		flag.PrintDefaults()
+	}
+	flag.Usage = usage
+}
+
+// main parses the flags and prints a count of combinations: C(vects, data)
+// when -data is given, otherwise the largest value for the given -vects.
+// It exits with status 1 if vects exceeds 256 and with status 0 otherwise.
+func main() {
+	flag.Parse()
+	total, nData := *vects, *data
+
+	if total > 256 {
+		fmt.Println("Error: vects must <= 256")
+		os.Exit(1)
+	}
+
+	// -data left at its default of 0 selects the "maximum" mode.
+	if nData == 0 {
+		fmt.Println("max num of inverse matrix :", getMAXCCombination(total))
+		os.Exit(0)
+	}
+
+	fmt.Println("num of inverse matrix:", getCCombination(total, nData))
+	os.Exit(0)
+}
+
+// getMAXCCombination returns C(a, a/2), which the tool reports as the
+// maximum number of inverse matrices for a vects.
+func getMAXCCombination(a uint64) uint64 {
+	// Choosing half of the items gives the maximum; proved in
+	// mathtool/combination.jpg.
+	return getCCombination(a, a/2)
+}
+
+// getCCombination returns the binomial coefficient C(a, b), the number of
+// ways to choose b items out of a.
+//
+// C(a, b) is written as ((b+1)*...*a) / (2*...*(a-b)). Common factors are
+// cancelled between numerator and denominator to keep the numbers small,
+// then what is left is multiplied out and divided. The products use plain
+// uint64 arithmetic and are not checked for overflow.
+//
+// b > a yields 0 and b == a yields 1. Without these guards the slice sizes
+// below wrap around (unsigned underflow) and make panics.
+func getCCombination(a, b uint64) uint64 {
+	if b > a {
+		return 0
+	}
+	if b == a {
+		return 1
+	}
+	// top holds the numerator factors b+1..a, bottom the denominator
+	// factors 2..a-b.
+	top := make([]uint64, a-b)
+	bottom := make([]uint64, a-b-1)
+	for i := b + 1; i <= a; i++ {
+		top[i-b-1] = i
+	}
+	var i uint64
+	for i = 2; i <= a-b; i++ {
+		bottom[i-2] = i
+	}
+	// Each pass cancels at most one factor of 2, 3 and 5 per denominator
+	// entry, so the passes are repeated.
+	for j := 0; j <= 5; j++ {
+		cleanEven(top, bottom)
+		clean3(top, bottom)
+		clean5(top, bottom)
+	}
+	cleanCoffeRound1(top, bottom)
+	if maxBottomBigger5more1(bottom) {
+		// Several larger denominators remain: merge the small numerator
+		// factors into one and alternate the cancellation in both directions.
+		top = shuffTop(top)
+		cleanCoffeRound1(top, bottom)
+		cleanCoffeRound1(bottom, top)
+		cleanCoffeRound1(top, bottom)
+		cleanCoffeRound1(bottom, top)
+		cleanCoffeRound1(top, bottom)
+		cleanCoffeRound1(bottom, top)
+	}
+	var topV, bottomV uint64 = 1, 1
+	for _, t := range top {
+		topV = topV * t
+	}
+	for _, v := range bottom {
+		bottomV = bottomV * v
+	}
+	return topV / bottomV
+}
+
+// cleanEven cancels factors of two in place: every even entry of bottom is
+// paired with the first even entry of top and both are halved. An even
+// bottom entry is left unchanged when top has no even entry.
+func cleanEven(top, bottom []uint64) {
+	for i := range bottom {
+		den := bottom[i]
+		if !even(den) {
+			continue
+		}
+		for j := range top {
+			if num := top[j]; even(num) {
+				top[j] = num / 2
+				bottom[i] = den / 2
+				break
+			}
+		}
+	}
+}
+
+// even reports whether a is divisible by two.
+func even(a uint64) bool {
+	return a%2 == 0
+}
+
+// clean3 cancels factors of three in place: every entry of bottom that is
+// divisible by three is paired with the first such entry of top and both
+// are divided by three.
+func clean3(top, bottom []uint64) {
+	for i := range bottom {
+		den := bottom[i]
+		if !mod3(den) {
+			continue
+		}
+		for j := range top {
+			if num := top[j]; mod3(num) {
+				top[j] = num / 3
+				bottom[i] = den / 3
+				break
+			}
+		}
+	}
+}
+
+// mod3 reports whether a is divisible by three.
+func mod3(a uint64) bool {
+	return a%3 == 0
+}
+
+// clean5 cancels factors of five in place: every entry of bottom that is
+// divisible by five is paired with the first such entry of top and both
+// are divided by five.
+func clean5(top, bottom []uint64) {
+	for i := range bottom {
+		den := bottom[i]
+		if !mod5(den) {
+			continue
+		}
+		for j := range top {
+			if num := top[j]; mod5(num) {
+				top[j] = num / 5
+				bottom[i] = den / 5
+				break
+			}
+		}
+	}
+}
+
+// mod5 reports whether a is divisible by five.
+func mod5(a uint64) bool {
+	return a%5 == 0
+}
+
+// maxBottomBigger5more1 reports whether at least two entries of bottom are
+// greater than or equal to five.
+func maxBottomBigger5more1(bottom []uint64) bool {
+	seenOne := false
+	for _, v := range bottom {
+		if v < 5 {
+			continue
+		}
+		if seenOne {
+			return true
+		}
+		seenOne = true
+	}
+	return false
+}
+
+// cleanCoffeRound1 cancels whole entries in place: for each entry of
+// bottom, the first entry of top that it divides evenly is divided by it
+// and the bottom entry becomes 1.
+func cleanCoffeRound1(top, bottom []uint64) {
+	for i := range bottom {
+		div := bottom[i]
+		for j := range top {
+			if val := top[j]; isCoffe(div, val) {
+				top[j] = val / div
+				bottom[i] = 1
+				break
+			}
+		}
+	}
+}
+
+// isCoffe reports whether b divides t without remainder. b must be
+// non-zero; a zero b causes a division-by-zero panic.
+func isCoffe(b, t uint64) bool {
+	return t%b == 0
+}
+
+// shuffTop folds every entry of top that is at most 5 into a single
+// product. The result starts with that product, followed by the remaining
+// entries in their original order.
+//
+// top itself is modified: each folded entry is overwritten with 1.
+func shuffTop(top []uint64) []uint64 {
+	// Count the small entries first so the result is allocated at its exact size.
+	small := 0
+	for _, v := range top {
+		if v <= 5 {
+			small++
+		}
+	}
+
+	merged := uint64(1)
+	out := make([]uint64, 1, len(top)+1-small)
+	for i, v := range top {
+		if v <= 5 {
+			merged *= v
+			top[i] = 1
+			continue
+		}
+		out = append(out, v)
+	}
+	out[0] = merged
+	return out
+}
--- a/vendor/github.com/templexxx/reedsolomon/mathtool/gentbls.go
+++ b/vendor/github.com/templexxx/reedsolomon/mathtool/gentbls.go
@ -0,0 +1,270 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"os"
+	"strconv"
+	"strings"
+)
+
+// set deg here
+const deg = 8 // <= 8
+
+type polynomial [deg + 1]byte
+
+// main writes the generated data to a file named "tables" in the current
+// directory: the list of primitive polynomials of degree deg, followed by
+// the exp, log, multiplication, low/high half, combined low-high and
+// inverse tables for the primitive polynomial selected below.
+//
+// The process exits via log.Fatalln if the file cannot be opened or if
+// writing it fails.
+func main() {
+	// O_TRUNC empties an existing file first. Without it, a run that writes
+	// less than a previous one would leave the old trailing bytes in place.
+	f, err := os.OpenFile("tables", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666)
+	if err != nil {
+		log.Fatalln(err)
+	}
+	defer f.Close()
+	outputWriter := bufio.NewWriter(f)
+	ps := genPrimitivePolynomial()
+	title := strconv.FormatInt(int64(deg), 10) + " degree primitive polynomial：\n"
+	var pss string
+	for i, p := range ps {
+		pf := formatPolynomial(p)
+		pf = strconv.FormatInt(int64(i+1), 10) + ". " + pf + ";\n"
+		pss = pss + pf
+	}
+	body := fmt.Sprintf(title+"%v", pss)
+	outputWriter.WriteString(body)
+
+	//set primitive polynomial here to generator tables
+	//x^8+x^4+x^3+x^2+1
+	var primitivePolynomial polynomial
+	primitivePolynomial[0] = 1
+	primitivePolynomial[2] = 1
+	primitivePolynomial[3] = 1
+	primitivePolynomial[4] = 1
+	primitivePolynomial[8] = 1
+
+	lenExpTable := (1 << deg) - 1
+	expTable := genExpTable(primitivePolynomial, lenExpTable)
+	body = fmt.Sprintf("expTbl: %#v\n", expTable)
+	outputWriter.WriteString(body)
+
+	logTable := genLogTable(expTable)
+	body = fmt.Sprintf("logTbl: %#v\n", logTable)
+	outputWriter.WriteString(body)
+
+	mulTable := genMulTable(expTable, logTable)
+	body = fmt.Sprintf("mulTbl: %#v\n", mulTable)
+	outputWriter.WriteString(body)
+
+	lowTable, highTable := genMulTableHalf(mulTable)
+	body = fmt.Sprintf("lowTbl: %#v\n", lowTable)
+	outputWriter.WriteString(body)
+	body = fmt.Sprintf("highTbl: %#v\n", highTable)
+	outputWriter.WriteString(body)
+
+	// Each combined row is the 16 low-table bytes followed by the 16
+	// high-table bytes of the same row.
+	var combTable [256][32]byte
+	for i := range combTable {
+		l := lowTable[i]
+		for j := 0; j < 16; j++ {
+			combTable[i][j] = l[j]
+		}
+		h := highTable[i][:]
+		for k := 16; k < 32; k++ {
+			combTable[i][k] = h[k-16]
+		}
+	}
+	body = fmt.Sprintf("lowhighTbl: %#v\n", combTable)
+	outputWriter.WriteString(body)
+
+	inverseTable := genInverseTable(mulTable)
+	body = fmt.Sprintf("inverseTbl: %#v\n", inverseTable)
+	outputWriter.WriteString(body)
+	// A bufio.Writer remembers the first write error and returns it from
+	// Flush, so this one check also covers the WriteString calls above.
+	if err := outputWriter.Flush(); err != nil {
+		log.Fatalln(err)
+	}
+}
+
+// genPrimitivePolynomial enumerates the polynomials of degree deg whose
+// coefficients are 0 or 1 and returns the ones that pass the two filters
+// below, in enumeration order.
+func genPrimitivePolynomial() []polynomial {
+	// A polynomial without a constant term is divisible by x, so the
+	// constant term is fixed to 1. The leading term is fixed as well, which
+	// leaves 2^(deg-1) candidates.
+	cnt := 1 << (deg - 1)
+	var polynomials []polynomial
+	var p polynomial
+	p[0] = 1
+	p[deg] = 1
+	// Generate all candidates by stepping the middle coefficients
+	// (positions 1..deg-1) like a binary counter.
+	for i := 0; i < cnt; i++ {
+		p = genPolynomial(p, 1)
+		polynomials = append(polynomials, p)
+	}
+	// Keep only candidates with an odd number of non-zero coefficients; per
+	// the original note this drops those divisible by x+1.
+	var psRaw []polynomial
+	for _, p := range polynomials {
+		var n int
+		for _, v := range p {
+			if v == 1 {
+				n++
+			}
+		}
+		if n&1 != 0 {
+			psRaw = append(psRaw, p)
+		}
+	}
+	// Order check: a candidate is kept when the value 1 occurs exactly once
+	// among the 2^deg-1 entries of its exp table.
+	var ps []polynomial
+	for _, p := range psRaw {
+		lenTable := (1 << deg) - 1
+		table := genExpTable(p, lenTable)
+		var numOf1 int
+		for _, v := range table {
+			// Count the entries equal to 1 in the exp table.
+			if int(v) == 1 {
+				numOf1++
+			}
+		}
+		if numOf1 == 1 {
+			ps = append(ps, p)
+		}
+	}
+	return ps
+}
+
+// genPolynomial returns p with its coefficients from position i upwards
+// advanced by one step of a binary counter: coefficients equal to 1 are
+// cleared until the first 0 is found, which is set to 1. Position deg is
+// never touched; if the carry reaches it the counter wraps to all zeros.
+func genPolynomial(p polynomial, i int) polynomial {
+	for {
+		if p[i] == 0 {
+			p[i] = 1
+			return p
+		}
+		// Carry into the next coefficient.
+		p[i] = 0
+		i++
+		if i == deg {
+			return p
+		}
+	}
+}
+
+// genExpTable returns a table of length exp in which entry i is the byte
+// value of x^i reduced by primitivePolynomial. Entries 0 and 1 are always
+// 1 and 2, so exp must be at least 2.
+func genExpTable(primitivePolynomial polynomial, exp int) []byte {
+	table := make([]byte, exp)
+	table[0], table[1] = 1, 2
+
+	// cur starts as the polynomial x and is multiplied by x at every step.
+	var cur polynomial
+	cur[1] = 1
+	for i := 2; i < exp; i++ {
+		cur = expGrowPolynomial(cur, primitivePolynomial)
+		table[i] = getValueOfPolynomial(cur)
+	}
+	return table
+}
+
+// expGrowPolynomial multiplies raw by x and reduces the result with
+// primitivePolynomial: every coefficient moves one position up, and if
+// that produces a term of degree deg, the lower coefficients of
+// primitivePolynomial are added (XOR) and the degree-deg term is dropped.
+func expGrowPolynomial(raw, primitivePolynomial polynomial) polynomial {
+	var shifted polynomial
+	for i := 0; i < deg; i++ {
+		if raw[i] == 1 {
+			shifted[i+1] = 1
+		}
+	}
+	if shifted[deg] == 1 {
+		for i := 0; i < deg; i++ {
+			if primitivePolynomial[i] == 1 {
+				// Coefficients are 0 or 1, so XOR with 1 flips them.
+				shifted[i] ^= 1
+			}
+		}
+	}
+	shifted[deg] = 0
+	return shifted
+}
+
+// getValueOfPolynomial packs the coefficients at positions 0..deg-1 of p
+// into a byte: bit i is set when coefficient i is non-zero.
+func getValueOfPolynomial(p polynomial) uint8 {
+	var packed uint8
+	for i := 0; i < deg; i++ {
+		if p[i] != 0 {
+			packed |= 1 << uint8(i)
+		}
+	}
+	return packed
+}
+
+// genLogTable inverts expTable: for every index i, the returned table maps
+// the value expTable[i] back to i. The result has 2^deg entries.
+func genLogTable(expTable []byte) []byte {
+	logs := make([]byte, 1<<deg)
+	// logs[0] stays 0: zero cannot be obtained as a power of the primitive
+	// element.
+	for exponent := range expTable {
+		logs[expTable[exponent]] = byte(exponent)
+	}
+	return logs
+}
+
+// genMulTable builds the full 256x256 multiplication table from the exp
+// and log tables: for non-zero a and b the product is
+// expTable[(logTable[a]+logTable[b]) mod 255]; any product with a zero
+// operand is 0.
+func genMulTable(expTable, logTable []byte) [256][256]byte {
+	var product [256][256]byte
+	// Row 0 and column 0 keep their zero value.
+	for a := 1; a < 256; a++ {
+		for b := 1; b < 256; b++ {
+			sum := int(logTable[a]) + int(logTable[b])
+			product[a][b] = expTable[sum%255]
+		}
+	}
+	return product
+}
+
+// genMulTableHalf splits the multiplication table into two 16-entry
+// tables per multiplier a:
+//
+//	low[a][n]  = mulTable[a][n]     for n in 0..15 (low nibble values)
+//	high[a][n] = mulTable[a][n<<4]  for n in 0..15 (high nibble values)
+//
+// Entries with a == 0 or n == 0 are always 0, whatever mulTable contains.
+func genMulTableHalf(mulTable [256][256]byte) (low [256][16]byte, high [256][16]byte) {
+	// Row 0 and column 0 of both results keep their zero value.
+	for a := 1; a < 256; a++ {
+		for n := 1; n < 16; n++ {
+			low[a][n] = mulTable[a][n]
+			high[a][n] = mulTable[a][n<<4]
+		}
+	}
+	return
+}
+
+// genInverseTable returns, for every i, the j for which mulTable[i][j] is
+// 1. If a row contains several such j the largest one is used; if it
+// contains none the entry is 0.
+func genInverseTable(mulTable [256][256]byte) [256]byte {
+	var inverse [256]byte
+	for i := range mulTable {
+		// Scanning from the top and stopping at the first hit selects the
+		// largest matching j.
+		for j := 255; j >= 0; j-- {
+			if mulTable[i][j] == 1 {
+				inverse[i] = byte(j)
+				break
+			}
+		}
+	}
+	return inverse
+}
+
+// formatPolynomial renders p as text with the highest power first, for
+// example "x^8+x^4+x^3+x^2+1". Only coefficients equal to 1 produce a
+// term. A polynomial without any such coefficient yields "".
+func formatPolynomial(p polynomial) string {
+	var ps string
+	for i := deg; i > 1; i-- {
+		if p[i] == 1 {
+			ps = ps + "x^" + strconv.FormatInt(int64(i), 10) + "+"
+		}
+	}
+	if p[1] == 1 {
+		ps = ps + "x+"
+	}
+	if p[0] == 1 {
+		ps = ps + "1"
+	} else {
+		// Every term above appends a "+" separator. TrimSuffix returns a
+		// new string, so its result must be assigned to take effect.
+		ps = strings.TrimSuffix(ps, "+")
+	}
+	return ps
+}
--- a/vendor/github.com/templexxx/reedsolomon/matrix.go
+++ b/vendor/github.com/templexxx/reedsolomon/matrix.go
@ -0,0 +1,156 @@
+package reedsolomon
+
+import "errors"
+
+// matrix is a matrix of GF(2^8) elements stored as a flat byte slice in
+// row-major order. The dimensions are not stored; callers pass them in.
+type matrix []byte
+
+// genEncMatrixCauchy builds a (d+p) x d encoding matrix: the top d rows are
+// the identity, and element (i, j) of the remaining p rows is inverseTbl[i^j].
+func genEncMatrixCauchy(d, p int) matrix {
+	rows := d + p
+	enc := make([]byte, rows*d)
+	// Identity block for the data rows.
+	for k := 0; k < d; k++ {
+		enc[k*d+k] = 1
+	}
+	// Parity rows.
+	for i := d; i < rows; i++ {
+		row := enc[i*d : i*d+d]
+		for j := range row {
+			row[j] = byte(inverseTbl[i^j])
+		}
+	}
+	return enc
+}
+
+// gfExp returns b raised to the n-th power, computed through the log/exp
+// tables. Anything to the power 0 is 1 (including 0); 0 to any other power
+// is 0.
+func gfExp(b byte, n int) byte {
+	switch {
+	case n == 0:
+		return 1
+	case b == 0:
+		return 0
+	}
+	exponent := int(logTbl[b]) * n
+	// Reduce the exponent into [0, 255) before the table lookup.
+	if exponent >= 255 {
+		exponent %= 255
+	}
+	return byte(expTbl[exponent])
+}
+
+// genVandMatrix fills vm (t rows, d columns, row-major) with a Vandermonde
+// matrix: element (row, col) is gfExp(row, col).
+func genVandMatrix(vm []byte, t, d int) {
+	for row := 0; row < t; row++ {
+		base := byte(row)
+		offset := row * d
+		for col := 0; col < d; col++ {
+			vm[offset+col] = gfExp(base, col)
+		}
+	}
+}
+
+// mul multiplies m (rows x cols) by right (cols x cols) and writes the
+// rows x cols product into r. Addition is XOR, multiplication is gfMul.
+func (m matrix) mul(right matrix, rows, cols int, r []byte) {
+	for i := 0; i < rows; i++ {
+		rowOff := i * cols
+		for j := 0; j < cols; j++ {
+			var acc byte
+			for k := 0; k < cols; k++ {
+				acc ^= gfMul(m[rowOff+k], right[k*cols+j])
+			}
+			r[rowOff+j] = acc
+		}
+	}
+}
+
+// genEncMatrixVand builds a (d+p) x d encoding matrix from a Vandermonde
+// matrix: the Vandermonde matrix is multiplied by the inverse of its own top
+// d x d block. It returns an error if that block cannot be inverted.
+//
+// All intermediate matrices are carved out of one allocation.
+func genEncMatrixVand(d, p int) (matrix, error) {
+	total := d + p
+	vmLen, sq := total*d, d*d
+	buf := make([]byte, 2*vmLen+4*sq)
+
+	vm := buf[:vmLen]
+	genVandMatrix(vm, total, d)
+
+	top := buf[vmLen : vmLen+sq]
+	copy(top, vm[:sq])
+
+	raw := buf[vmLen+sq : vmLen+3*sq] // scratch for [top|I], d x 2d
+	inv := buf[vmLen+3*sq : vmLen+4*sq]
+	if err := matrix(top).invert(raw, d, inv); err != nil {
+		return nil, err
+	}
+
+	out := buf[vmLen+4*sq:]
+	matrix(vm).mul(inv, total, d, out)
+	return matrix(out), nil
+}
+
+// subMatrix extracts the right half of an n x 2n matrix: [I|m'] -> [m'].
+// The n x n result is written into r.
+func (m matrix) subMatrix(n int, r []byte) {
+	for row := 0; row < n; row++ {
+		// Row `row` of m starts at 2*row*n; its right half starts n later.
+		src := (2*row + 1) * n
+		dst := row * n
+		copy(r[dst:dst+n], m[src:src+n])
+	}
+}
+
+// invert computes the inverse of the n x n matrix m and writes it into im.
+// raw is scratch space for the n x 2n augmented matrix [m|I]; only the
+// diagonal of its right half is written here, so it is presumably expected to
+// arrive zeroed. Returns the error from gauss when m is singular.
+func (m matrix) invert(raw matrix, n int, im []byte) error {
+	// Build [m|I] in raw.
+	for row := 0; row < n; row++ {
+		src := row * n
+		dst := 2 * src
+		copy(raw[dst:dst+n], m[src:src+n])
+		raw[dst+n+row] = 1
+	}
+	if err := gauss(raw, n); err != nil {
+		return err
+	}
+	raw.subMatrix(n, im)
+	return nil
+}
+
+// swap exchanges rows i and j of m, where every row is n elements wide.
+func (m matrix) swap(i, j, n int) {
+	rowI, rowJ := i*n, j*n
+	for col := 0; col < n; col++ {
+		m[rowI+col], m[rowJ+col] = m[rowJ+col], m[rowI+col]
+	}
+}
+
+// gfMul returns the product of a and b by looking it up in mulTbl.
+func gfMul(a, b byte) byte {
+	return mulTbl[a][b]
+}
+
+// errSingular is returned by gauss when no non-zero pivot can be found.
+var errSingular = errors.New("rs.invert: matrix is singular")
+
+// gauss runs Gauss-Jordan elimination in place on the n x 2n augmented
+// matrix m, turning [m|I] into [I|m']. It returns errSingular when a column
+// has no usable pivot.
+func gauss(m matrix, n int) error {
+	// Width of one row of the augmented matrix.
+	n2 := 2 * n
+	// Forward pass: make the left half upper triangular with a unit diagonal.
+	for i := 0; i < n; i++ {
+		// Zero pivot: swap in the first lower row with a non-zero entry in
+		// this column.
+		if m[i*n2+i] == 0 {
+			for j := i + 1; j < n; j++ {
+				if m[j*n2+i] != 0 {
+					m.swap(i, j, n2)
+					break
+				}
+			}
+		}
+		// Still zero after the search: the matrix cannot be inverted.
+		if m[i*n2+i] == 0 {
+			return errSingular
+		}
+		// Scale the pivot row so the pivot becomes 1.
+		if m[i*n2+i] != 1 {
+			d := m[i*n2+i]
+			scale := inverseTbl[d]
+			for c := 0; c < n2; c++ {
+				m[i*n2+c] = gfMul(m[i*n2+c], scale)
+			}
+		}
+		// Clear this column in every row below the pivot (addition is XOR).
+		for j := i + 1; j < n; j++ {
+			if m[j*n2+i] != 0 {
+				scale := m[j*n2+i]
+				for c := 0; c < n2; c++ {
+					m[j*n2+c] ^= gfMul(scale, m[i*n2+c])
+				}
+			}
+		}
+	}
+	// Backward pass: clear the entries above each pivot.
+	for k := 0; k < n; k++ {
+		for j := 0; j < k; j++ {
+			if m[j*n2+k] != 0 {
+				scale := m[j*n2+k]
+				for c := 0; c < n2; c++ {
+					m[j*n2+c] ^= gfMul(scale, m[k*n2+c])
+				}
+			}
+		}
+	}
+	return nil
+}
--- a/vendor/github.com/templexxx/reedsolomon/rs.go
+++ b/vendor/github.com/templexxx/reedsolomon/rs.go
@ -0,0 +1,280 @@
+/*
+	Reed-Solomon Codes over GF(2^8)
+	Primitive Polynomial:  x^8+x^4+x^3+x^2+1
+	Galois Field arithmetic using Intel SIMD instructions (AVX2 or SSSE3)
+*/
+
+package reedsolomon
+
+import "errors"
+
+// Encoder is the interface for Reed-Solomon encoding and reconstruction.
+type Encoder interface {
+	// Encode multiplies the generator matrix with the data vects and writes
+	// the result into the parity vects.
+	// len(vects) must equal the number of data+parity vects.
+	Encode(vects [][]byte) error
+	// All reconstruct methods put their result back into the original
+	// position in vects: if vects[0] was lost, the repaired data ends up in
+	// vects[0].
+
+	// Reconstruct repairs lost data and parity vects.
+	// Mark a lost vect by setting it to nil.
+	Reconstruct(vects [][]byte) error
+	// ReconstructData repairs lost data vects only.
+	// Mark a lost vect by setting it to nil.
+	ReconstructData(vects [][]byte) error
+	// ReconstWithPos repairs lost data and parity vects, given the positions
+	// of the surviving ("has") and lost vects.
+	// Compared with Reconstruct it saves bandwidth and disk I/O when fewer
+	// vects are lost than there are parity vects.
+	// With erasure codes the caller must know which vects are broken, so an
+	// API that takes explicit positions is necessary.
+	// len(has) must equal the number of data vects.
+	// Example:
+	// in a 3+2 setup the positions are [0,1,2,3,4].
+	// If vects[0] is lost,
+	// "has" could be [1,2,3] or [1,2,4] or ...;
+	// the caller must make sure the vects listed in "has" hold correct data
+	// (vects[1], vects[2], vects[3] if "has" is [1,2,3]),
+	// and "dLost" will be [0].
+	// Notes:
+	// 1. the lists above must be in increasing order (TODO: support out-of-order)
+	// 2. every vect has the same length; do not set lost vects to nil,
+	// so no slices have to be allocated
+	ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error
+	// ReconstDataWithPos repairs lost data vects only, given the positions of
+	// the surviving and lost vects.
+	// Positions of lost parity vects do not need to be supplied.
+	ReconstDataWithPos(vects [][]byte, has, dLost []int) error
+}
+
+// checkCfg validates the shard counts: both d (data) and p (parity) must be
+// positive and their sum must stay below 256.
+func checkCfg(d, p int) error {
+	switch {
+	case d <= 0 || p <= 0:
+		return errors.New("rs.New: data or parity <= 0")
+	case d+p >= 256:
+		return errors.New("rs.New: data+parity >= 256")
+	default:
+		return nil
+	}
+}
+
+// New creates an Encoder that uses a Vandermonde-derived matrix as its
+// encoding matrix. It fails when the data/parity counts are invalid or the
+// matrix cannot be generated.
+func New(data, parity int) (enc Encoder, err error) {
+	if err = checkCfg(data, parity); err != nil {
+		return nil, err
+	}
+	em, err := genEncMatrixVand(data, parity)
+	if err != nil {
+		return nil, err
+	}
+	return newRS(data, parity, em), nil
+}
+
+// NewCauchy creates an Encoder that uses a Cauchy matrix as its generator
+// matrix. It fails when the data/parity counts are invalid.
+func NewCauchy(data, parity int) (enc Encoder, err error) {
+	if err = checkCfg(data, parity); err != nil {
+		return nil, err
+	}
+	return newRS(data, parity, genEncMatrixCauchy(data, parity)), nil
+}
+
+// encBase is the pure-Go Encoder implementation that works through the
+// multiplication table only.
+type encBase struct {
+	data   int    // number of data vects
+	parity int    // number of parity vects
+	encode []byte // encoding matrix, one row of `data` coefficients per vect
+	gen    []byte // rows of coefficients used to compute the parity vects
+}
+
+// checkEnc validates the vects handed to an encode call: there must be
+// exactly d+p of them, the first must be non-empty, and all must have the
+// same length. On success it returns that common length.
+func checkEnc(d, p int, vs [][]byte) (size int, err error) {
+	if len(vs) != d+p {
+		return 0, errors.New("rs.checkER: vects not match rs args")
+	}
+	size = len(vs[0])
+	if size == 0 {
+		return 0, errors.New("rs.checkER: vects size = 0")
+	}
+	for _, v := range vs[1:] {
+		if len(v) != size {
+			return size, errors.New("rs.checkER: vects size mismatch")
+		}
+	}
+	return size, nil
+}
+
+// Encode computes the parity vects from the data vects using the rows of
+// e.gen. vects must hold e.data data vects followed by e.parity parity
+// vects, all non-empty and of equal length (validated by checkEnc).
+func (e *encBase) Encode(vects [][]byte) (err error) {
+	d, p := e.data, e.parity
+	if _, err = checkEnc(d, p, vects); err != nil {
+		return err
+	}
+	dv, pv := vects[:d], vects[d:]
+	gen := e.gen
+	for i := 0; i < d; i++ {
+		// The first data vect overwrites the parity vects; every later one
+		// is accumulated on top.
+		apply := mulVectAdd
+		if i == 0 {
+			apply = mulVect
+		}
+		for j := 0; j < p; j++ {
+			apply(gen[j*d+i], dv[i], pv[j])
+		}
+	}
+	return nil
+}
+
+// mulVect sets b[i] = c * a[i] for every byte of a, using the mulTbl row
+// for c. b must be at least as long as a.
+func mulVect(c byte, a, b []byte) {
+	row := mulTbl[c]
+	for i, v := range a {
+		b[i] = row[v]
+	}
+}
+
+// mulVectAdd XORs c * a[i] into b[i] for every byte of a, using the mulTbl
+// row for c. b must be at least as long as a.
+func mulVectAdd(c byte, a, b []byte) {
+	row := mulTbl[c]
+	for i, v := range a {
+		b[i] ^= row[v]
+	}
+}
+
+// Reconstruct repairs lost data and parity vects; lost vects are the nil
+// entries of vects.
+func (e *encBase) Reconstruct(vects [][]byte) (err error) {
+	return e.reconstruct(vects, false)
+}
+
+// ReconstructData repairs lost data vects only; lost vects are the nil
+// entries of vects.
+func (e *encBase) ReconstructData(vects [][]byte) (err error) {
+	return e.reconstruct(vects, true)
+}
+
+// ReconstWithPos repairs the data vects listed in dLost and the parity vects
+// listed in pLost from the vects listed in has.
+func (e *encBase) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
+	return e.reconstWithPos(vects, has, dLost, pLost, false)
+}
+
+// ReconstDataWithPos repairs the data vects listed in dLost from the vects
+// listed in has; parity is not rebuilt.
+func (e *encBase) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
+	return e.reconstWithPos(vects, has, dLost, nil, true)
+}
+
+// reconst rebuilds the lost vects in place.
+//
+// has lists the positions of d surviving vects, dLost the lost data
+// positions and pLost the lost parity positions. Lost vects with zero length
+// are allocated here with the size of vects[has[0]]. When dataOnly is set,
+// parity is not rebuilt. Returns an error if the matrix built from the rows
+// in has cannot be inverted or if the inner Encode rejects the vects.
+func (e *encBase) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
+	d := e.data
+	em := e.encode
+	dCnt := len(dLost)
+	size := len(vects[has[0]])
+	if dCnt != 0 {
+		// Treat the surviving vects as "data" and the lost data vects as
+		// "parity" of a temporary encoder.
+		vtmp := make([][]byte, d+dCnt)
+		for i, p := range has {
+			vtmp[i] = vects[p]
+		}
+		for i, p := range dLost {
+			if len(vects[p]) == 0 {
+				vects[p] = make([]byte, size)
+			}
+			vtmp[i+d] = vects[p]
+		}
+		// One allocation holds: m (d*d) | raw (2*d*d) | im (d*d) | g (dCnt*d).
+		matrixbuf := make([]byte, 4*d*d+dCnt*d)
+		// m: the encoding-matrix rows of the surviving vects.
+		m := matrixbuf[:d*d]
+		for i, l := range has {
+			copy(m[i*d:i*d+d], em[l*d:l*d+d])
+		}
+		raw := matrixbuf[d*d : 3*d*d]
+		im := matrixbuf[3*d*d : 4*d*d]
+		err2 := matrix(m).invert(raw, d, im)
+		if err2 != nil {
+			return err2
+		}
+		// g: the rows of the inverse that correspond to the lost data vects.
+		g := matrixbuf[4*d*d:]
+		for i, l := range dLost {
+			copy(g[i*d:i*d+d], im[l*d:l*d+d])
+		}
+		etmp := &encBase{data: d, parity: dCnt, gen: g}
+		err2 = etmp.Encode(vtmp[:d+dCnt])
+		if err2 != nil {
+			return err2
+		}
+	}
+	if dataOnly {
+		return
+	}
+	pCnt := len(pLost)
+	if pCnt != 0 {
+		// All data vects are available at this point, so lost parity is
+		// re-encoded from vects[0:d] with the matching encoding-matrix rows.
+		vtmp := make([][]byte, d+pCnt)
+		g := make([]byte, pCnt*d)
+		for i, l := range pLost {
+			copy(g[i*d:i*d+d], em[l*d:l*d+d])
+		}
+		for i := 0; i < d; i++ {
+			vtmp[i] = vects[i]
+		}
+		for i, p := range pLost {
+			if len(vects[p]) == 0 {
+				vects[p] = make([]byte, size)
+			}
+			vtmp[i+d] = vects[p]
+		}
+		etmp := &encBase{data: d, parity: pCnt, gen: g}
+		err2 := etmp.Encode(vtmp[:d+pCnt])
+		if err2 != nil {
+			return err2
+		}
+	}
+	return
+}
+
+// reconstWithPos validates the caller-supplied position lists and then
+// delegates to reconst. has must contain exactly e.data positions; neither
+// lost list may be longer than e.parity.
+func (e *encBase) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
+	// TODO check more: an element of has may also appear in a lost list, and
+	// len(has) > e.data is not handled.
+	if len(has) != e.data || len(dLost) > e.parity || len(pLost) > e.parity {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	return e.reconst(vects, has, dLost, pLost, dataOnly)
+}
+
+// reconstruct derives the has/dLost/pLost position lists from vects (nil
+// entries are lost) and delegates to reconst. When dataOnly is set, lost
+// parity vects are not rebuilt.
+//
+// It returns an error when vects is shorter than data+parity, when more than
+// `parity` data or parity vects are lost, or when fewer than `data` vects
+// survive.
+func (e *encBase) reconstruct(vects [][]byte, dataOnly bool) (err error) {
+	d := e.data
+	p := e.parity
+	t := d + p
+	// The scan below indexes vects[0:t]; reject short input instead of
+	// panicking with an index out of range.
+	if len(vects) < t {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	// One allocation backs all three lists: has (d) | dLost (p) | pLost (p).
+	listBuf := make([]int, t+p)
+	has := listBuf[:d]
+	dLost := listBuf[d:t]
+	pLost := listBuf[t : t+p]
+	hasCnt, dCnt, pCnt := 0, 0, 0
+	for i := 0; i < t; i++ {
+		if vects[i] != nil {
+			// Only the first d survivors are needed.
+			if hasCnt < d {
+				has[hasCnt] = i
+				hasCnt++
+			}
+			continue
+		}
+		if i < d {
+			if dCnt >= p {
+				return errors.New("rs.Reconst: not enough vects")
+			}
+			dLost[dCnt] = i
+			dCnt++
+		} else {
+			if pCnt >= p {
+				return errors.New("rs.Reconst: not enough vects")
+			}
+			pLost[pCnt] = i
+			pCnt++
+		}
+	}
+	if hasCnt != d {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	return e.reconst(vects, has, dLost[:dCnt], pLost[:pCnt], dataOnly)
+}
--- a/vendor/github.com/templexxx/reedsolomon/rs_amd64.go
+++ b/vendor/github.com/templexxx/reedsolomon/rs_amd64.go
@ -0,0 +1,868 @@
+package reedsolomon
+
+import (
+	"errors"
+	"sync"
+
+	"github.com/templexxx/cpufeat"
+)
+
+// SIMD instruction extensions that select the Encoder implementation.
+const (
+	none = iota // no usable extension: fall back to encBase
+	avx2
+	ssse3
+)
+
+// extension is the SIMD extension chosen by getEXT.
+var extension = none
+
+// init selects the SIMD extension once at package load.
+func init() {
+	getEXT()
+}
+
+// getEXT sets extension from the CPU features reported by cpufeat,
+// preferring AVX2 over SSSE3.
+func getEXT() {
+	switch {
+	case cpufeat.X86.HasAVX2:
+		extension = avx2
+	case cpufeat.X86.HasSSSE3:
+		extension = ssse3
+	default:
+		extension = none
+	}
+}
+
+// copy32B is implemented in assembly (body not in this file); presumably it
+// copies 32 bytes from src to dst, as initTbl calls it with 32-byte slices.
+//
+//go:noescape
+func copy32B(dst, src []byte) // Need SSE2(introduced in 2001)
+
+// initTbl fills tbl with one 32-byte lowhighTbl entry per coefficient of g
+// (rows x cols, row-major). Entries are laid out column by column, so the
+// table for g[row][col] lives at offset (col*rows+row)*32.
+func initTbl(g matrix, rows, cols int, tbl []byte) {
+	for col := 0; col < cols; col++ {
+		for row := 0; row < rows; row++ {
+			dst := (col*rows + row) * 32
+			coeff := g[row*cols+col]
+			copy32B(tbl[dst:dst+32], lowhighTbl[coeff][:])
+		}
+	}
+}
+
+// okCache reports whether the inverse-matrix cache should be enabled for the
+// given configuration.
+//
+// With these limits there are at most 3060 inverse matrices (data=14,
+// parity=4, calculated by mathtool/cntinverse). In practice data is usually
+// below 12 and parity below 5. The limits can be changed, but data+parity
+// must not exceed 32 (see the code that builds the inverse-matrix cache key).
+func okCache(data, parity int) bool {
+	return data < 15 && parity < 5
+}
+
+type (
+	// encSSSE3 and encAVX2 share the encSIMD layout; the distinct types
+	// carry the SSSE3 and AVX2 method sets.
+	encSSSE3 encSIMD
+	encAVX2  encSIMD
+	// encSIMD holds the state of a SIMD-accelerated encoder.
+	encSIMD  struct {
+		data   int    // number of data vects
+		parity int    // number of parity vects
+		encode matrix // encoding matrix, one row of `data` coefficients per vect
+		gen    matrix // rows of coefficients used to compute the parity vects
+		tbl    []byte // 32-byte lookup table per coefficient of gen (see initTbl)
+		// The inverse-matrix cache is designed for small vect sizes ( < 4KB ):
+		// it saves the time spent calculating the inverse matrix,
+		// which matters little for big vect sizes.
+		enableCache  bool
+		inverseCache iCache
+	}
+	// iCache maps a bitmask of surviving vect positions to the matching
+	// inverse matrix; the embedded RWMutex guards data.
+	iCache struct {
+		sync.RWMutex
+		data map[uint32][]byte
+	}
+)
+
+// newRS builds the Encoder for d data and p parity vects from the encoding
+// matrix em, picking the implementation that matches the detected SIMD
+// extension. The rows of em after the first d*d elements are used as the
+// parity generator.
+func newRS(d, p int, em matrix) (enc Encoder) {
+	gen := em[d*d:]
+	if extension == none {
+		return &encBase{data: d, parity: p, encode: em, gen: gen}
+	}
+	tbl := make([]byte, d*p*32)
+	initTbl(gen, p, d, tbl)
+	simd := &encSIMD{
+		data:         d,
+		parity:       p,
+		encode:       em,
+		gen:          gen,
+		tbl:          tbl,
+		enableCache:  okCache(d, p),
+		inverseCache: iCache{data: make(map[uint32][]byte)},
+	}
+	// encAVX2 and encSSSE3 are both defined on encSIMD, so the pointer can
+	// be converted directly.
+	if extension == avx2 {
+		return (*encAVX2)(simd)
+	}
+	return (*encSSSE3)(simd)
+}
+
+// Size of sub-vector
+const unit int = 16 * 1024
+
+// getDo returns the chunk size used to walk a vect of n bytes: unit for
+// vects of at least unit bytes, otherwise n rounded down to a multiple of
+// 16. When that rounds down to zero, unit is returned instead.
+func getDo(n int) int {
+	if n >= unit {
+		return unit
+	}
+	if aligned := (n >> 4) << 4; aligned != 0 {
+		return aligned
+	}
+	return unit
+}
+
+// Encode computes the parity vects from the data vects with the AVX2
+// kernels, walking the vects in chunks of getDo(size) bytes. vects must pass
+// checkEnc.
+func (e *encAVX2) Encode(vects [][]byte) (err error) {
+	size, err := checkEnc(e.data, e.parity, vects)
+	if err != nil {
+		return err
+	}
+	dv, pv := vects[:e.data], vects[e.data:]
+	step := getDo(size)
+	off := 0
+	// Full chunks first ...
+	for ; off+step <= size; off += step {
+		e.matrixMul(off, off+step, dv, pv)
+	}
+	// ... then whatever is left over.
+	if off < size {
+		e.matrixMulRemain(off, size, dv, pv)
+	}
+	return nil
+}
+
+// mulVectAVX2 is implemented in assembly. Callers use it for the first data
+// vect, so it presumably overwrites p with tbl applied to d.
+//
+//go:noescape
+func mulVectAVX2(tbl, d, p []byte)
+
+// mulVectAddAVX2 is implemented in assembly (body not in this file). Callers
+// use it for every data vect after the first, so it presumably accumulates
+// tbl applied to d into p.
+//
+//go:noescape
+func mulVectAddAVX2(tbl, d, p []byte)
+
+// matrixMul encodes the byte range [start, end) of every vect with the
+// precomputed tables in e.tbl. The table for data vect i and parity vect j
+// sits at offset (i*parity+j)*32.
+func (e *encAVX2) matrixMul(start, end int, dv, pv [][]byte) {
+	nData, nParity := e.data, e.parity
+	for i := 0; i < nData; i++ {
+		for j := 0; j < nParity; j++ {
+			base := (i*nParity + j) * 32
+			t := e.tbl[base : base+32]
+			if i == 0 {
+				// First data vect initialises the parity range.
+				mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
+				continue
+			}
+			mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
+		}
+	}
+}
+
+// matrixMulRemain encodes the byte range [start, end) when it is shorter
+// than a full chunk. The 16-byte-aligned part goes through the AVX2 kernels;
+// an unaligned tail is covered by re-encoding the last 16 bytes of the vect,
+// or by the scalar routines when the vect is shorter than 16 bytes.
+func (e *encAVX2) matrixMulRemain(start, end int, dv, pv [][]byte) {
+	undone := end - start
+	// Largest multiple of 16 that fits into the remaining range.
+	do := (undone >> 4) << 4
+	d := e.data
+	p := e.parity
+	tbl := e.tbl
+	if do >= 16 {
+		end2 := start + do
+		off := 0
+		for i := 0; i < d; i++ {
+			for j := 0; j < p; j++ {
+				t := tbl[off : off+32]
+				if i != 0 {
+					mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
+				} else {
+					mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
+				}
+				off += 32
+			}
+		}
+		// start is only read again by the scalar fallback below, which needs
+		// end < 16 and so should not be reachable after this branch.
+		start = end
+	}
+	if undone > do {
+		// Re-encode the final 16 bytes; this may recalculate some bytes that
+		// were already encoded, but it is still much faster than going scalar.
+		start2 := end - 16
+		if start2 >= 0 {
+			off := 0
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					t := tbl[off : off+32]
+					if i != 0 {
+						mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
+					} else {
+						mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
+					}
+					off += 32
+				}
+			}
+		} else {
+			// Fewer than 16 bytes up to end: use the scalar routines.
+			g := e.gen
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					if i != 0 {
+						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
+					} else {
+						mulVect(g[j*d], dv[0][start:], pv[j][start:])
+					}
+				}
+			}
+		}
+	}
+}
+
+// encodeGen encodes with the generator matrix e.gen directly instead of the
+// precomputed tables in e.tbl. It is designed for reconstruction:
+// for small vects, building the tables with initTbl costs too much time, and
+// for big vects the tables make little difference because the CPU cache is
+// filled with the vects' data anyway.
+func (e *encAVX2) encodeGen(vects [][]byte) (err error) {
+	size, err := checkEnc(e.data, e.parity, vects)
+	if err != nil {
+		return err
+	}
+	dv, pv := vects[:e.data], vects[e.data:]
+	step := getDo(size)
+	off := 0
+	// Full chunks first ...
+	for ; off+step <= size; off += step {
+		e.matrixMulGen(off, off+step, dv, pv)
+	}
+	// ... then whatever is left over.
+	if off < size {
+		e.matrixMulRemainGen(off, size, dv, pv)
+	}
+	return nil
+}
+
+// matrixMulGen encodes the byte range [start, end) of every vect, looking up
+// the table for each coefficient of e.gen in lowhighTbl on the fly.
+func (e *encAVX2) matrixMulGen(start, end int, dv, pv [][]byte) {
+	nData, nParity := e.data, e.parity
+	gen := e.gen
+	for i := 0; i < nData; i++ {
+		for j := 0; j < nParity; j++ {
+			t := lowhighTbl[gen[j*nData+i]][:]
+			if i == 0 {
+				// First data vect initialises the parity range.
+				mulVectAVX2(t, dv[0][start:end], pv[j][start:end])
+				continue
+			}
+			mulVectAddAVX2(t, dv[i][start:end], pv[j][start:end])
+		}
+	}
+}
+
+// matrixMulRemainGen is the generator-matrix counterpart of matrixMulRemain:
+// it encodes the byte range [start, end) when it is shorter than a full
+// chunk, looking up each coefficient's table in lowhighTbl. The 16-byte
+// aligned part goes through the AVX2 kernels; an unaligned tail is covered
+// by re-encoding the last 16 bytes, or by the scalar routines when the vect
+// is shorter than 16 bytes.
+func (e *encAVX2) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
+	undone := end - start
+	// Largest multiple of 16 that fits into the remaining range.
+	do := (undone >> 4) << 4
+	d := e.data
+	p := e.parity
+	g := e.gen
+	if do >= 16 {
+		end2 := start + do
+		for i := 0; i < d; i++ {
+			for j := 0; j < p; j++ {
+				t := lowhighTbl[g[j*d+i]][:]
+				if i != 0 {
+					mulVectAddAVX2(t, dv[i][start:end2], pv[j][start:end2])
+				} else {
+					mulVectAVX2(t, dv[0][start:end2], pv[j][start:end2])
+				}
+			}
+		}
+		// start is only read again by the scalar fallback below, which needs
+		// end < 16 and so should not be reachable after this branch.
+		start = end
+	}
+	if undone > do {
+		// Re-encode the final 16 bytes; some of them may already have been
+		// encoded above.
+		start2 := end - 16
+		if start2 >= 0 {
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					t := lowhighTbl[g[j*d+i]][:]
+					if i != 0 {
+						mulVectAddAVX2(t, dv[i][start2:end], pv[j][start2:end])
+					} else {
+						mulVectAVX2(t, dv[0][start2:end], pv[j][start2:end])
+					}
+				}
+			}
+		} else {
+			// Fewer than 16 bytes up to end: use the scalar routines.
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					if i != 0 {
+						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
+					} else {
+						mulVect(g[j*d], dv[0][start:], pv[j][start:])
+					}
+				}
+			}
+		}
+	}
+}
+
+// Reconstruct repairs lost data and parity vects; lost vects are the nil
+// entries of vects.
+func (e *encAVX2) Reconstruct(vects [][]byte) (err error) {
+	return e.reconstruct(vects, false)
+}
+
+// ReconstructData repairs lost data vects only; lost vects are the nil
+// entries of vects.
+func (e *encAVX2) ReconstructData(vects [][]byte) (err error) {
+	return e.reconstruct(vects, true)
+}
+
+// ReconstWithPos repairs the data vects listed in dLost and the parity vects
+// listed in pLost from the vects listed in has.
+func (e *encAVX2) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
+	return e.reconstWithPos(vects, has, dLost, pLost, false)
+}
+
+// ReconstDataWithPos repairs the data vects listed in dLost from the vects
+// listed in has; parity is not rebuilt.
+func (e *encAVX2) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
+	return e.reconstWithPos(vects, has, dLost, nil, true)
+}
+
+// makeGen returns the generator rows needed to rebuild the data vects listed
+// in dLost from the vects listed in has: the encoding-matrix rows of has are
+// inverted and the rows of the inverse that belong to dLost are returned
+// (len(dLost) rows of e.data coefficients).
+//
+// When the cache is enabled, the inverse is looked up and stored under a
+// bitmask of the positions in has. An error is returned when the matrix
+// cannot be inverted.
+func (e *encAVX2) makeGen(has, dLost []int) (gen []byte, err error) {
+	d := e.data
+	em := e.encode
+	cnt := len(dLost)
+	useCache := e.enableCache
+
+	var ikey uint32
+	if useCache {
+		for _, pos := range has {
+			ikey += 1 << uint8(pos)
+		}
+		e.inverseCache.RLock()
+		cached, hit := e.inverseCache.data[ikey]
+		if hit {
+			rows := make([]byte, cnt*d)
+			for i, l := range dLost {
+				copy(rows[i*d:i*d+d], cached[l*d:l*d+d])
+			}
+			e.inverseCache.RUnlock()
+			return rows, nil
+		}
+		e.inverseCache.RUnlock()
+	}
+
+	// One allocation holds: m (d*d) | raw (2*d*d) | inv (d*d) | rows (cnt*d).
+	sq := d * d
+	buf := make([]byte, 4*sq+cnt*d)
+	m := buf[:sq]
+	for i, l := range has {
+		copy(m[i*d:i*d+d], em[l*d:l*d+d])
+	}
+	raw := buf[sq : 3*sq]
+	inv := buf[3*sq : 4*sq]
+	if ierr := matrix(m).invert(raw, d, inv); ierr != nil {
+		return nil, ierr
+	}
+	if useCache {
+		e.inverseCache.Lock()
+		e.inverseCache.data[ikey] = inv
+		e.inverseCache.Unlock()
+	}
+	rows := buf[4*sq:]
+	for i, l := range dLost {
+		copy(rows[i*d:i*d+d], inv[l*d:l*d+d])
+	}
+	return rows, nil
+}
+
+// reconst rebuilds the lost vects in place.
+//
+// has lists the positions of d surviving vects, dLost the lost data
+// positions and pLost the lost parity positions. Lost vects with zero length
+// are allocated here with the size of vects[has[0]]. When dataOnly is set,
+// parity is not rebuilt. Returns an error if the generator rows cannot be
+// built (singular matrix) or if encoding rejects the vects.
+func (e *encAVX2) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
+	d := e.data
+	em := e.encode
+	dCnt := len(dLost)
+	size := len(vects[has[0]])
+	if dCnt != 0 {
+		// Treat the surviving vects as "data" and the lost data vects as
+		// "parity" of a temporary encoder.
+		vtmp := make([][]byte, d+dCnt)
+		for i, p := range has {
+			vtmp[i] = vects[p]
+		}
+		for i, p := range dLost {
+			if len(vects[p]) == 0 {
+				vects[p] = make([]byte, size)
+			}
+			vtmp[i+d] = vects[p]
+		}
+		g, err2 := e.makeGen(has, dLost)
+		if err2 != nil {
+			// Propagate the failure. A bare return here would hand back the
+			// still-nil named result and report success without rebuilding
+			// anything.
+			return err2
+		}
+		etmp := &encAVX2{data: d, parity: dCnt, gen: g}
+		err2 = etmp.encodeGen(vtmp)
+		if err2 != nil {
+			return err2
+		}
+	}
+	if dataOnly {
+		return nil
+	}
+	pCnt := len(pLost)
+	if pCnt != 0 {
+		// All data vects are available at this point, so lost parity is
+		// re-encoded from vects[0:d] with the matching encoding-matrix rows.
+		g := make([]byte, pCnt*d)
+		for i, l := range pLost {
+			copy(g[i*d:i*d+d], em[l*d:l*d+d])
+		}
+		vtmp := make([][]byte, d+pCnt)
+		for i := 0; i < d; i++ {
+			vtmp[i] = vects[i]
+		}
+		for i, p := range pLost {
+			if len(vects[p]) == 0 {
+				vects[p] = make([]byte, size)
+			}
+			vtmp[i+d] = vects[p]
+		}
+		etmp := &encAVX2{data: d, parity: pCnt, gen: g}
+		err2 := etmp.encodeGen(vtmp)
+		if err2 != nil {
+			return err2
+		}
+	}
+	return nil
+}
+
+// reconstWithPos validates the caller-supplied position lists and then
+// delegates to reconst. has must contain exactly e.data positions; neither
+// lost list may be longer than e.parity.
+func (e *encAVX2) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
+	if len(has) != e.data || len(dLost) > e.parity || len(pLost) > e.parity {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	return e.reconst(vects, has, dLost, pLost, dataOnly)
+}
+
+// reconstruct derives the has/dLost/pLost position lists from vects (nil
+// entries are lost) and delegates to reconst. When dataOnly is set, lost
+// parity vects are not rebuilt.
+//
+// It returns an error when vects is shorter than data+parity, when more than
+// `parity` data or parity vects are lost, or when fewer than `data` vects
+// survive.
+func (e *encAVX2) reconstruct(vects [][]byte, dataOnly bool) (err error) {
+	d := e.data
+	p := e.parity
+	t := d + p
+	// The scan below indexes vects[0:t]; reject short input instead of
+	// panicking with an index out of range.
+	if len(vects) < t {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	// One allocation backs all three lists: has (d) | dLost (p) | pLost (p).
+	listBuf := make([]int, t+p)
+	has := listBuf[:d]
+	dLost := listBuf[d:t]
+	pLost := listBuf[t : t+p]
+	hasCnt, dCnt, pCnt := 0, 0, 0
+	for i := 0; i < t; i++ {
+		if vects[i] != nil {
+			// Only the first d survivors are needed.
+			if hasCnt < d {
+				has[hasCnt] = i
+				hasCnt++
+			}
+			continue
+		}
+		if i < d {
+			if dCnt >= p {
+				return errors.New("rs.Reconst: not enough vects")
+			}
+			dLost[dCnt] = i
+			dCnt++
+		} else {
+			if pCnt >= p {
+				return errors.New("rs.Reconst: not enough vects")
+			}
+			pLost[pCnt] = i
+			pCnt++
+		}
+	}
+	if hasCnt != d {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	return e.reconst(vects, has, dLost[:dCnt], pLost[:pCnt], dataOnly)
+}
+
+// Encode computes the parity vects from the data vects with the SSSE3
+// kernels, walking the vects in chunks of getDo(size) bytes. vects must pass
+// checkEnc.
+func (e *encSSSE3) Encode(vects [][]byte) (err error) {
+	size, err := checkEnc(e.data, e.parity, vects)
+	if err != nil {
+		return err
+	}
+	dv, pv := vects[:e.data], vects[e.data:]
+	step := getDo(size)
+	off := 0
+	// Full chunks first ...
+	for ; off+step <= size; off += step {
+		e.matrixMul(off, off+step, dv, pv)
+	}
+	// ... then whatever is left over.
+	if off < size {
+		e.matrixMulRemain(off, size, dv, pv)
+	}
+	return nil
+}
+
+// mulVectSSSE3 is implemented in assembly (body not in this file). Callers
+// use it for the first data vect, so it presumably overwrites p with tbl
+// applied to d.
+//
+//go:noescape
+func mulVectSSSE3(tbl, d, p []byte)
+
+// mulVectAddSSSE3 is implemented in assembly (body not in this file).
+// Callers use it for every data vect after the first, so it presumably
+// accumulates tbl applied to d into p.
+//
+//go:noescape
+func mulVectAddSSSE3(tbl, d, p []byte)
+
+// matrixMul encodes the byte range [start, end) of every vect with the
+// precomputed tables in e.tbl. The table for data vect i and parity vect j
+// sits at offset (i*parity+j)*32.
+func (e *encSSSE3) matrixMul(start, end int, dv, pv [][]byte) {
+	nData, nParity := e.data, e.parity
+	for i := 0; i < nData; i++ {
+		for j := 0; j < nParity; j++ {
+			base := (i*nParity + j) * 32
+			t := e.tbl[base : base+32]
+			if i == 0 {
+				// First data vect initialises the parity range.
+				mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
+				continue
+			}
+			mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
+		}
+	}
+}
+
+// matrixMulRemain encodes the byte range [start, end) when it is shorter
+// than a full chunk. The 16-byte-aligned part goes through the SSSE3
+// kernels; an unaligned tail is covered by re-encoding the last 16 bytes of
+// the vect, or by the scalar routines when the vect is shorter than 16 bytes.
+func (e *encSSSE3) matrixMulRemain(start, end int, dv, pv [][]byte) {
+	undone := end - start
+	// Largest multiple of 16 that fits into the remaining range.
+	do := (undone >> 4) << 4
+	d := e.data
+	p := e.parity
+	tbl := e.tbl
+	if do >= 16 {
+		end2 := start + do
+		off := 0
+		for i := 0; i < d; i++ {
+			for j := 0; j < p; j++ {
+				t := tbl[off : off+32]
+				if i != 0 {
+					mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
+				} else {
+					mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
+				}
+				off += 32
+			}
+		}
+		// start is only read again by the scalar fallback below, which needs
+		// end < 16 and so should not be reachable after this branch.
+		start = end
+	}
+	if undone > do {
+		// Re-encode the final 16 bytes; some of them may already have been
+		// encoded above.
+		start2 := end - 16
+		if start2 >= 0 {
+			off := 0
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					t := tbl[off : off+32]
+					if i != 0 {
+						mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
+					} else {
+						mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
+					}
+					off += 32
+				}
+			}
+		} else {
+			// Fewer than 16 bytes up to end: use the scalar routines.
+			g := e.gen
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					if i != 0 {
+						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
+					} else {
+						mulVect(g[j*d], dv[0][start:], pv[j][start:])
+					}
+				}
+			}
+		}
+	}
+}
+
+// encodeGen encodes with the generator matrix e.gen directly instead of the
+// precomputed tables in e.tbl. It is designed for reconstruction:
+// for small vects, building the tables with initTbl costs too much time, and
+// for big vects the tables make little difference because the CPU cache is
+// filled with the vects' data anyway.
+func (e *encSSSE3) encodeGen(vects [][]byte) (err error) {
+	size, err := checkEnc(e.data, e.parity, vects)
+	if err != nil {
+		return err
+	}
+	dv, pv := vects[:e.data], vects[e.data:]
+	step := getDo(size)
+	off := 0
+	// Full chunks first ...
+	for ; off+step <= size; off += step {
+		e.matrixMulGen(off, off+step, dv, pv)
+	}
+	// ... then whatever is left over.
+	if off < size {
+		e.matrixMulRemainGen(off, size, dv, pv)
+	}
+	return nil
+}
+
+// matrixMulGen encodes the byte range [start, end) of every vect, looking up
+// the table for each coefficient of e.gen in lowhighTbl on the fly.
+func (e *encSSSE3) matrixMulGen(start, end int, dv, pv [][]byte) {
+	nData, nParity := e.data, e.parity
+	gen := e.gen
+	for i := 0; i < nData; i++ {
+		for j := 0; j < nParity; j++ {
+			t := lowhighTbl[gen[j*nData+i]][:]
+			if i == 0 {
+				// First data vect initialises the parity range.
+				mulVectSSSE3(t, dv[0][start:end], pv[j][start:end])
+				continue
+			}
+			mulVectAddSSSE3(t, dv[i][start:end], pv[j][start:end])
+		}
+	}
+}
+
+// matrixMulRemainGen is the generator-matrix counterpart of matrixMulRemain:
+// it encodes the byte range [start, end) when it is shorter than a full
+// chunk, looking up each coefficient's table in lowhighTbl. The 16-byte
+// aligned part goes through the SSSE3 kernels; an unaligned tail is covered
+// by re-encoding the last 16 bytes, or by the scalar routines when the vect
+// is shorter than 16 bytes.
+func (e *encSSSE3) matrixMulRemainGen(start, end int, dv, pv [][]byte) {
+	undone := end - start
+	// Largest multiple of 16 that fits into the remaining range.
+	do := (undone >> 4) << 4
+	d := e.data
+	p := e.parity
+	g := e.gen
+	if do >= 16 {
+		end2 := start + do
+		for i := 0; i < d; i++ {
+			for j := 0; j < p; j++ {
+				t := lowhighTbl[g[j*d+i]][:]
+				if i != 0 {
+					mulVectAddSSSE3(t, dv[i][start:end2], pv[j][start:end2])
+				} else {
+					mulVectSSSE3(t, dv[0][start:end2], pv[j][start:end2])
+				}
+			}
+		}
+		// start is only read again by the scalar fallback below, which needs
+		// end < 16 and so should not be reachable after this branch.
+		start = end
+	}
+	if undone > do {
+		// Re-encode the final 16 bytes; some of them may already have been
+		// encoded above.
+		start2 := end - 16
+		if start2 >= 0 {
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					t := lowhighTbl[g[j*d+i]][:]
+					if i != 0 {
+						mulVectAddSSSE3(t, dv[i][start2:end], pv[j][start2:end])
+					} else {
+						mulVectSSSE3(t, dv[0][start2:end], pv[j][start2:end])
+					}
+				}
+			}
+		} else {
+			// Fewer than 16 bytes up to end: use the scalar routines.
+			for i := 0; i < d; i++ {
+				for j := 0; j < p; j++ {
+					if i != 0 {
+						mulVectAdd(g[j*d+i], dv[i][start:], pv[j][start:])
+					} else {
+						mulVect(g[j*d], dv[0][start:], pv[j][start:])
+					}
+				}
+			}
+		}
+	}
+}
+
+// Reconstruct repairs lost data and parity vects; lost vects are the nil
+// entries of vects.
+func (e *encSSSE3) Reconstruct(vects [][]byte) (err error) {
+	return e.reconstruct(vects, false)
+}
+
+// ReconstructData repairs lost data vects only; lost vects are the nil
+// entries of vects.
+func (e *encSSSE3) ReconstructData(vects [][]byte) (err error) {
+	return e.reconstruct(vects, true)
+}
+
+// ReconstWithPos repairs the data vects listed in dLost and the parity vects
+// listed in pLost from the vects listed in has.
+func (e *encSSSE3) ReconstWithPos(vects [][]byte, has, dLost, pLost []int) error {
+	return e.reconstWithPos(vects, has, dLost, pLost, false)
+}
+
+// ReconstDataWithPos repairs the data vects listed in dLost from the vects
+// listed in has; parity is not rebuilt.
+func (e *encSSSE3) ReconstDataWithPos(vects [][]byte, has, dLost []int) error {
+	return e.reconstWithPos(vects, has, dLost, nil, true)
+}
+
+// makeGen returns the generator rows needed to rebuild the data vects listed
+// in dLost from the vects listed in has: the encoding-matrix rows of has are
+// inverted and the rows of the inverse that belong to dLost are returned
+// (len(dLost) rows of e.data coefficients).
+//
+// When the cache is enabled, the inverse is looked up and stored under a
+// bitmask of the positions in has. An error is returned when the matrix
+// cannot be inverted.
+func (e *encSSSE3) makeGen(has, dLost []int) (gen []byte, err error) {
+	d := e.data
+	em := e.encode
+	cnt := len(dLost)
+	useCache := e.enableCache
+
+	var ikey uint32
+	if useCache {
+		for _, pos := range has {
+			ikey += 1 << uint8(pos)
+		}
+		e.inverseCache.RLock()
+		cached, hit := e.inverseCache.data[ikey]
+		if hit {
+			rows := make([]byte, cnt*d)
+			for i, l := range dLost {
+				copy(rows[i*d:i*d+d], cached[l*d:l*d+d])
+			}
+			e.inverseCache.RUnlock()
+			return rows, nil
+		}
+		e.inverseCache.RUnlock()
+	}
+
+	// One allocation holds: m (d*d) | raw (2*d*d) | inv (d*d) | rows (cnt*d).
+	sq := d * d
+	buf := make([]byte, 4*sq+cnt*d)
+	m := buf[:sq]
+	for i, l := range has {
+		copy(m[i*d:i*d+d], em[l*d:l*d+d])
+	}
+	raw := buf[sq : 3*sq]
+	inv := buf[3*sq : 4*sq]
+	if ierr := matrix(m).invert(raw, d, inv); ierr != nil {
+		return nil, ierr
+	}
+	if useCache {
+		e.inverseCache.Lock()
+		e.inverseCache.data[ikey] = inv
+		e.inverseCache.Unlock()
+	}
+	rows := buf[4*sq:]
+	for i, l := range dLost {
+		copy(rows[i*d:i*d+d], inv[l*d:l*d+d])
+	}
+	return rows, nil
+}
+
+// reconst rebuilds the lost vects in place.
+//
+// has lists the positions of d surviving vects, dLost the lost data
+// positions and pLost the lost parity positions. Lost vects with zero length
+// are allocated here with the size of vects[has[0]]. When dataOnly is set,
+// parity is not rebuilt. Returns an error if the generator rows cannot be
+// built (singular matrix) or if encoding rejects the vects.
+func (e *encSSSE3) reconst(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
+	d := e.data
+	em := e.encode
+	dCnt := len(dLost)
+	size := len(vects[has[0]])
+	if dCnt != 0 {
+		// Treat the surviving vects as "data" and the lost data vects as
+		// "parity" of a temporary encoder.
+		vtmp := make([][]byte, d+dCnt)
+		for i, p := range has {
+			vtmp[i] = vects[p]
+		}
+		for i, p := range dLost {
+			if len(vects[p]) == 0 {
+				vects[p] = make([]byte, size)
+			}
+			vtmp[i+d] = vects[p]
+		}
+		g, err2 := e.makeGen(has, dLost)
+		if err2 != nil {
+			// Propagate the failure. A bare return here would hand back the
+			// still-nil named result and report success without rebuilding
+			// anything.
+			return err2
+		}
+		etmp := &encSSSE3{data: d, parity: dCnt, gen: g}
+		err2 = etmp.encodeGen(vtmp)
+		if err2 != nil {
+			return err2
+		}
+	}
+	if dataOnly {
+		return nil
+	}
+	pCnt := len(pLost)
+	if pCnt != 0 {
+		// All data vects are available at this point, so lost parity is
+		// re-encoded from vects[0:d] with the matching encoding-matrix rows.
+		g := make([]byte, pCnt*d)
+		for i, l := range pLost {
+			copy(g[i*d:i*d+d], em[l*d:l*d+d])
+		}
+		vtmp := make([][]byte, d+pCnt)
+		for i := 0; i < d; i++ {
+			vtmp[i] = vects[i]
+		}
+		for i, p := range pLost {
+			if len(vects[p]) == 0 {
+				vects[p] = make([]byte, size)
+			}
+			vtmp[i+d] = vects[p]
+		}
+		etmp := &encSSSE3{data: d, parity: pCnt, gen: g}
+		err2 := etmp.encodeGen(vtmp)
+		if err2 != nil {
+			return err2
+		}
+	}
+	return nil
+}
+
+// reconstWithPos validates the caller-supplied position lists and then
+// delegates to reconst. has must contain exactly e.data positions; neither
+// lost list may be longer than e.parity.
+func (e *encSSSE3) reconstWithPos(vects [][]byte, has, dLost, pLost []int, dataOnly bool) (err error) {
+	if len(has) != e.data || len(dLost) > e.parity || len(pLost) > e.parity {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	return e.reconst(vects, has, dLost, pLost, dataOnly)
+}
+
+// reconstruct derives the has/dLost/pLost position lists from vects (nil
+// entries are lost) and delegates to reconst. When dataOnly is set, lost
+// parity vects are not rebuilt.
+//
+// It returns an error when vects is shorter than data+parity, when more than
+// `parity` data or parity vects are lost, or when fewer than `data` vects
+// survive.
+func (e *encSSSE3) reconstruct(vects [][]byte, dataOnly bool) (err error) {
+	d := e.data
+	p := e.parity
+	t := d + p
+	// The scan below indexes vects[0:t]; reject short input instead of
+	// panicking with an index out of range.
+	if len(vects) < t {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	// One allocation backs all three lists: has (d) | dLost (p) | pLost (p).
+	listBuf := make([]int, t+p)
+	has := listBuf[:d]
+	dLost := listBuf[d:t]
+	pLost := listBuf[t : t+p]
+	hasCnt, dCnt, pCnt := 0, 0, 0
+	for i := 0; i < t; i++ {
+		if vects[i] != nil {
+			// Only the first d survivors are needed.
+			if hasCnt < d {
+				has[hasCnt] = i
+				hasCnt++
+			}
+			continue
+		}
+		if i < d {
+			if dCnt >= p {
+				return errors.New("rs.Reconst: not enough vects")
+			}
+			dLost[dCnt] = i
+			dCnt++
+		} else {
+			if pCnt >= p {
+				return errors.New("rs.Reconst: not enough vects")
+			}
+			pLost[pCnt] = i
+			pCnt++
+		}
+	}
+	if hasCnt != d {
+		return errors.New("rs.Reconst: not enough vects")
+	}
+	return e.reconst(vects, has, dLost[:dCnt], pLost[:pCnt], dataOnly)
+}
--- a/vendor/github.com/templexxx/reedsolomon/rs_amd64.s
+++ b/vendor/github.com/templexxx/reedsolomon/rs_amd64.s
@ -0,0 +1,401 @@
+// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
+
+#include "textflag.h"
+
+#define low_tbl Y0
+#define high_tbl Y1
+#define mask Y2
+#define in0  Y3
+#define in1  Y4
+#define in2  Y5
+#define in3  Y6
+#define in4  Y7
+#define in5  Y8
+#define in0_h  Y10
+#define in1_h  Y11
+#define in2_h  Y12
+#define in3_h  Y13
+#define in4_h  Y14
+#define in5_h  Y15
+
+#define in  BX
+#define out DI
+#define len R8
+#define pos R9
+
+#define tmp0 R10
+
+#define low_tblx X0
+#define high_tblx X1
+#define maskx X2
+#define in0x X3
+#define in0_hx X10
+#define tmp0x  X9
+#define tmp1x  X11
+#define tmp2x  X12
+#define tmp3x  X13
+
+
+// func mulVectAVX2(tbl, d, p []byte)
+//
+// For every byte b of the input slice (base pointer i+24(FP), length
+// in_len+32(FP)) this writes tbl[b&0x0f] ^ tbl[16+(b>>4)] to the same offset
+// of the output slice (base pointer o+48(FP)): a split-nibble table lookup
+// done with VPSHUFB, see the reference at the top of this file.
+//
+// NOTE(review): the length handling appears to assume a non-zero length that
+// is a multiple of 16 (a zero length falls straight into loop256b) -- confirm
+// against the Go callers.
+TEXT ·mulVectAVX2(SB), NOSPLIT, $0
+    MOVQ         i+24(FP), in
+	MOVQ         o+48(FP), out
+	MOVQ         tbl+0(FP), tmp0
+	// First 16 table bytes serve the low nibble, the next 16 the high nibble.
+	VMOVDQU      (tmp0), low_tblx
+	VMOVDQU      16(tmp0), high_tblx
+	// Build a vector of 0x0f bytes used to isolate nibbles.
+	MOVB         $0x0f, DX
+	LONG         $0x2069e3c4; WORD $0x00d2   // VPINSRB $0x00, EDX, XMM2, XMM2
+	VPBROADCASTB maskx, maskx
+	MOVQ         in_len+32(FP), len
+	// Length not a multiple of 32: peel one 16-byte tail off first.
+	TESTQ        $31, len
+	JNZ          one16b
+
+ymm:
+    // Replicate the 128-bit tables and mask into the upper YMM lanes.
+    VINSERTI128  $1, low_tblx, low_tbl, low_tbl
+    VINSERTI128  $1, high_tblx, high_tbl, high_tbl
+    VINSERTI128  $1, maskx, mask, mask
+    TESTQ        $255, len
+    JNZ          not_aligned
+
+// 256bytes/loop
+aligned:
+    MOVQ         $0, pos
+
+// Eight unrolled 32-byte steps; each step splits the bytes into nibbles,
+// looks both up and XORs the two results together.
+loop256b:
+	VMOVDQU (in)(pos*1), in0
+	VPSRLQ  $4, in0, in0_h
+	VPAND   mask, in0_h, in0_h
+	VPAND   mask, in0, in0
+	VPSHUFB in0_h, high_tbl, in0_h
+	VPSHUFB in0, low_tbl, in0
+	VPXOR   in0, in0_h, in0
+	VMOVDQU in0, (out)(pos*1)
+
+    VMOVDQU 32(in)(pos*1), in1
+	VPSRLQ  $4, in1, in1_h
+	VPAND   mask, in1_h, in1_h
+	VPAND   mask, in1, in1
+	VPSHUFB in1_h, high_tbl, in1_h
+	VPSHUFB in1, low_tbl, in1
+	VPXOR   in1, in1_h, in1
+	VMOVDQU in1, 32(out)(pos*1)
+
+    VMOVDQU 64(in)(pos*1), in2
+	VPSRLQ  $4, in2, in2_h
+	VPAND   mask, in2_h, in2_h
+	VPAND   mask, in2, in2
+	VPSHUFB in2_h, high_tbl, in2_h
+	VPSHUFB in2, low_tbl, in2
+	VPXOR   in2, in2_h, in2
+	VMOVDQU in2, 64(out)(pos*1)
+
+    VMOVDQU 96(in)(pos*1), in3
+	VPSRLQ  $4, in3, in3_h
+	VPAND   mask, in3_h, in3_h
+	VPAND   mask, in3, in3
+	VPSHUFB in3_h, high_tbl, in3_h
+	VPSHUFB in3, low_tbl, in3
+	VPXOR   in3, in3_h, in3
+	VMOVDQU in3, 96(out)(pos*1)
+
+    VMOVDQU 128(in)(pos*1), in4
+	VPSRLQ  $4, in4, in4_h
+	VPAND   mask, in4_h, in4_h
+	VPAND   mask, in4, in4
+	VPSHUFB in4_h, high_tbl, in4_h
+	VPSHUFB in4, low_tbl, in4
+	VPXOR   in4, in4_h, in4
+	VMOVDQU in4, 128(out)(pos*1)
+
+    VMOVDQU 160(in)(pos*1), in5
+	VPSRLQ  $4, in5, in5_h
+	VPAND   mask, in5_h, in5_h
+	VPAND   mask, in5, in5
+	VPSHUFB in5_h, high_tbl, in5_h
+	VPSHUFB in5, low_tbl, in5
+	VPXOR   in5, in5_h, in5
+	VMOVDQU in5, 160(out)(pos*1)
+
+    VMOVDQU 192(in)(pos*1), in0
+	VPSRLQ  $4, in0, in0_h
+	VPAND   mask, in0_h, in0_h
+	VPAND   mask, in0, in0
+	VPSHUFB in0_h, high_tbl, in0_h
+	VPSHUFB in0, low_tbl, in0
+	VPXOR   in0, in0_h, in0
+	VMOVDQU in0, 192(out)(pos*1)
+
+    VMOVDQU 224(in)(pos*1), in1
+	VPSRLQ  $4, in1, in1_h
+	VPAND   mask, in1_h, in1_h
+	VPAND   mask, in1, in1
+	VPSHUFB in1_h, high_tbl, in1_h
+	VPSHUFB in1, low_tbl, in1
+	VPXOR   in1, in1_h, in1
+	VMOVDQU in1, 224(out)(pos*1)
+
+	ADDQ    $256, pos
+	CMPQ    len, pos
+	JNE     loop256b
+	VZEROUPPER
+	RET
+
+not_aligned:
+    // tmp0 = len % 256: the part that does not fit the unrolled loop.
+    MOVQ    len, tmp0
+    ANDQ    $255, tmp0
+
+// Works backwards from the end of the buffers, 32 bytes per iteration, until
+// the remainder in tmp0 is used up.
+loop32b:
+    VMOVDQU -32(in)(len*1), in0
+	VPSRLQ  $4, in0, in0_h
+	VPAND   mask, in0_h, in0_h
+	VPAND   mask, in0, in0
+	VPSHUFB in0_h, high_tbl, in0_h
+	VPSHUFB in0, low_tbl, in0
+	VPXOR   in0, in0_h, in0
+	VMOVDQU in0, -32(out)(len*1)
+	SUBQ    $32, len
+	SUBQ    $32, tmp0
+	JG      loop32b
+	// Whatever is left at the front goes through the 256-byte loop.
+	CMPQ    len, $256
+	JGE     aligned
+	VZEROUPPER
+	RET
+
+// Handles the last 16 bytes with XMM registers, then continues with the YMM
+// path if any bytes remain.
+one16b:
+    VMOVDQU  -16(in)(len*1), in0x
+    VPSRLQ   $4, in0x, in0_hx
+    VPAND    maskx, in0x, in0x
+    VPAND    maskx, in0_hx, in0_hx
+    VPSHUFB  in0_hx, high_tblx, in0_hx
+    VPSHUFB  in0x, low_tblx, in0x
+    VPXOR    in0x, in0_hx, in0x
+	VMOVDQU  in0x, -16(out)(len*1)
+	SUBQ     $16, len
+	CMPQ     len, $0
+	JNE      ymm
+	RET
+
+// func mulVectAddAVX2(tbl, d, p []byte)
+//
+// Same split-nibble table lookup as mulVectAVX2, but the looked-up value is
+// XORed into the byte already present in the output slice instead of
+// replacing it: out[j] ^= tbl[b&0x0f] ^ tbl[16+(b>>4)] for input byte b at
+// offset j. Arguments are read from i+24(FP), in_len+32(FP) and o+48(FP).
+//
+// NOTE(review): the length handling appears to assume a non-zero length that
+// is a multiple of 16 -- confirm against the Go callers.
+TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
+    MOVQ         i+24(FP), in
+	MOVQ         o+48(FP), out
+	MOVQ         tbl+0(FP), tmp0
+	// First 16 table bytes serve the low nibble, the next 16 the high nibble.
+	VMOVDQU      (tmp0), low_tblx
+	VMOVDQU      16(tmp0), high_tblx
+	// Build a vector of 0x0f bytes used to isolate nibbles.
+	MOVB         $0x0f, DX
+	LONG         $0x2069e3c4; WORD $0x00d2   // same bytes as the VPINSRB $0x00, EDX, XMM2, XMM2 in mulVectAVX2
+	VPBROADCASTB maskx, maskx
+	MOVQ         in_len+32(FP), len
+	// Length not a multiple of 32: peel one 16-byte tail off first.
+	TESTQ        $31, len
+	JNZ          one16b
+
+ymm:
+    // Replicate the 128-bit tables and mask into the upper YMM lanes.
+    VINSERTI128  $1, low_tblx, low_tbl, low_tbl
+    VINSERTI128  $1, high_tblx, high_tbl, high_tbl
+    VINSERTI128  $1, maskx, mask, mask
+    TESTQ        $255, len
+    JNZ          not_aligned
+
+aligned:
+    MOVQ         $0, pos
+
+// Eight unrolled 32-byte steps; each step looks up both nibbles, XORs the
+// results together and then XORs that into the existing output bytes.
+loop256b:
+    VMOVDQU (in)(pos*1), in0
+	VPSRLQ  $4, in0, in0_h
+	VPAND   mask, in0_h, in0_h
+	VPAND   mask, in0, in0
+	VPSHUFB in0_h, high_tbl, in0_h
+	VPSHUFB in0, low_tbl, in0
+	VPXOR   in0, in0_h, in0
+	VPXOR   (out)(pos*1), in0, in0
+	VMOVDQU in0, (out)(pos*1)
+
+    VMOVDQU 32(in)(pos*1), in1
+	VPSRLQ  $4, in1, in1_h
+	VPAND   mask, in1_h, in1_h
+	VPAND   mask, in1, in1
+	VPSHUFB in1_h, high_tbl, in1_h
+	VPSHUFB in1, low_tbl, in1
+	VPXOR   in1, in1_h, in1
+	VPXOR   32(out)(pos*1), in1, in1
+	VMOVDQU in1, 32(out)(pos*1)
+
+    VMOVDQU 64(in)(pos*1), in2
+	VPSRLQ  $4, in2, in2_h
+	VPAND   mask, in2_h, in2_h
+	VPAND   mask, in2, in2
+	VPSHUFB in2_h, high_tbl, in2_h
+	VPSHUFB in2, low_tbl, in2
+	VPXOR   in2, in2_h, in2
+	VPXOR   64(out)(pos*1), in2, in2
+	VMOVDQU in2, 64(out)(pos*1)
+
+    VMOVDQU 96(in)(pos*1), in3
+	VPSRLQ  $4, in3, in3_h
+	VPAND   mask, in3_h, in3_h
+	VPAND   mask, in3, in3
+	VPSHUFB in3_h, high_tbl, in3_h
+	VPSHUFB in3, low_tbl, in3
+	VPXOR   in3, in3_h, in3
+	VPXOR   96(out)(pos*1), in3, in3
+	VMOVDQU in3, 96(out)(pos*1)
+
+    VMOVDQU 128(in)(pos*1), in4
+	VPSRLQ  $4, in4, in4_h
+	VPAND   mask, in4_h, in4_h
+	VPAND   mask, in4, in4
+	VPSHUFB in4_h, high_tbl, in4_h
+	VPSHUFB in4, low_tbl, in4
+	VPXOR   in4, in4_h, in4
+	VPXOR   128(out)(pos*1), in4, in4
+	VMOVDQU in4, 128(out)(pos*1)
+
+    VMOVDQU 160(in)(pos*1), in5
+	VPSRLQ  $4, in5, in5_h
+	VPAND   mask, in5_h, in5_h
+	VPAND   mask, in5, in5
+	VPSHUFB in5_h, high_tbl, in5_h
+	VPSHUFB in5, low_tbl, in5
+	VPXOR   in5, in5_h, in5
+	VPXOR   160(out)(pos*1), in5, in5
+	VMOVDQU in5, 160(out)(pos*1)
+
+    VMOVDQU 192(in)(pos*1), in0
+	VPSRLQ  $4, in0, in0_h
+	VPAND   mask, in0_h, in0_h
+	VPAND   mask, in0, in0
+	VPSHUFB in0_h, high_tbl, in0_h
+	VPSHUFB in0, low_tbl, in0
+	VPXOR   in0, in0_h, in0
+	VPXOR   192(out)(pos*1), in0, in0
+	VMOVDQU in0, 192(out)(pos*1)
+
+    VMOVDQU 224(in)(pos*1), in1
+	VPSRLQ  $4, in1, in1_h
+	VPAND   mask, in1_h, in1_h
+	VPAND   mask, in1, in1
+	VPSHUFB in1_h, high_tbl, in1_h
+	VPSHUFB in1, low_tbl, in1
+	VPXOR   in1, in1_h, in1
+	VPXOR   224(out)(pos*1), in1, in1
+	VMOVDQU in1, 224(out)(pos*1)
+
+	ADDQ    $256, pos
+	CMPQ    len, pos
+	JNE     loop256b
+	VZEROUPPER
+	RET
+
+not_aligned:
+    // tmp0 = len % 256: the part that does not fit the unrolled loop.
+    MOVQ    len, tmp0
+    ANDQ    $255, tmp0
+
+// Works backwards from the end of the buffers, 32 bytes per iteration, until
+// the remainder in tmp0 is used up.
+loop32b:
+    VMOVDQU -32(in)(len*1), in0
+	VPSRLQ  $4, in0, in0_h
+	VPAND   mask, in0_h, in0_h
+	VPAND   mask, in0, in0
+	VPSHUFB in0_h, high_tbl, in0_h
+	VPSHUFB in0, low_tbl, in0
+	VPXOR   in0, in0_h, in0
+	VPXOR   -32(out)(len*1), in0, in0
+	VMOVDQU in0, -32(out)(len*1)
+	SUBQ    $32, len
+	SUBQ    $32, tmp0
+	JG      loop32b
+	// Whatever is left at the front goes through the 256-byte loop.
+	CMPQ    len, $256
+	JGE     aligned
+	VZEROUPPER
+	RET
+
+// Handles the last 16 bytes with XMM registers, then continues with the YMM
+// path if any bytes remain.
+one16b:
+    VMOVDQU  -16(in)(len*1), in0x
+    VPSRLQ   $4, in0x, in0_hx
+    VPAND    maskx, in0x, in0x
+    VPAND    maskx, in0_hx, in0_hx
+    VPSHUFB  in0_hx, high_tblx, in0_hx
+    VPSHUFB  in0x, low_tblx, in0x
+    VPXOR    in0x, in0_hx, in0x
+    VPXOR    -16(out)(len*1), in0x, in0x
+	VMOVDQU  in0x, -16(out)(len*1)
+	SUBQ     $16, len
+	CMPQ     len, $0
+	JNE      ymm
+	RET
+
+// func mulVectSSSE3(tbl, d, p []byte)
+//
+// SSE variant of the split-nibble table lookup: for each input byte b it
+// stores tbl[b&0x0f] ^ tbl[16+(b>>4)] to the output, 16 bytes per iteration
+// for in_len/16 iterations.
+//
+// NOTE(review): the loop body runs before the counter is tested, so a length
+// below 16 is not handled here -- presumably callers guarantee at least one
+// full 16-byte block; confirm.
+TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
+    MOVQ    i+24(FP), in
+	MOVQ    o+48(FP), out
+	MOVQ    tbl+0(FP), tmp0
+	MOVOU   (tmp0), low_tblx
+	MOVOU   16(tmp0), high_tblx
+    // Put 0x0f in the lowest byte of maskx, then broadcast it to all 16 bytes
+    // with PSHUFB using an all-zero index vector.
+    MOVB    $15, tmp0
+    MOVQ    tmp0, maskx
+    PXOR    tmp0x, tmp0x
+   	PSHUFB  tmp0x, maskx
+	MOVQ    in_len+32(FP), len
+	SHRQ    $4, len                  // number of 16-byte blocks
+
+loop:
+	MOVOU  (in), in0x
+	MOVOU  in0x, in0_hx
+	PSRLQ  $4, in0_hx
+	PAND   maskx, in0x               // low nibbles
+	PAND   maskx, in0_hx             // high nibbles
+	// PSHUFB overwrites its destination, so look up in copies of the tables.
+	MOVOU  low_tblx, tmp1x
+	MOVOU  high_tblx, tmp2x
+	PSHUFB in0x, tmp1x
+	PSHUFB in0_hx, tmp2x
+	PXOR   tmp1x, tmp2x
+	MOVOU  tmp2x, (out)
+	ADDQ   $16, in
+	ADDQ   $16, out
+	SUBQ   $1, len
+	JNZ    loop
+	RET
+
+// func mulVectAddSSSE3(tbl, d, p []byte)
+//
+// Same lookup as mulVectSSSE3, but the result is XORed into the 16 bytes
+// already present in the output instead of replacing them.
+//
+// NOTE(review): the loop body runs before the counter is tested, so a length
+// below 16 is not handled here -- presumably callers guarantee at least one
+// full 16-byte block; confirm.
+TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
+    MOVQ    i+24(FP), in
+	MOVQ    o+48(FP), out
+	MOVQ    tbl+0(FP), tmp0
+	MOVOU   (tmp0), low_tblx
+	MOVOU   16(tmp0), high_tblx
+    // Put 0x0f in the lowest byte of maskx, then broadcast it to all 16 bytes
+    // with PSHUFB using an all-zero index vector.
+    MOVB    $15, tmp0
+    MOVQ    tmp0, maskx
+    PXOR    tmp0x, tmp0x
+   	PSHUFB  tmp0x, maskx
+	MOVQ    in_len+32(FP), len
+	SHRQ    $4, len                  // number of 16-byte blocks
+
+loop:
+	MOVOU  (in), in0x
+	MOVOU  in0x, in0_hx
+	PSRLQ  $4, in0_hx
+	PAND   maskx, in0x               // low nibbles
+	PAND   maskx, in0_hx             // high nibbles
+	// PSHUFB overwrites its destination, so look up in copies of the tables.
+	MOVOU  low_tblx, tmp1x
+	MOVOU  high_tblx, tmp2x
+	PSHUFB in0x, tmp1x
+	PSHUFB in0_hx, tmp2x
+	PXOR   tmp1x, tmp2x
+	MOVOU  (out), tmp3x              // current output bytes
+	PXOR   tmp3x, tmp2x              // accumulate into them
+	MOVOU  tmp2x, (out)
+	ADDQ   $16, in
+	ADDQ   $16, out
+	SUBQ   $1, len
+	JNZ    loop
+	RET
+
+// func copy32B(dst, src []byte)
+//
+// Copies exactly 32 bytes from the start of src to the start of dst using two
+// unaligned 16-byte loads and stores. The slice lengths are never read, so
+// both slices must hold at least 32 bytes.
+TEXT ·copy32B(SB), NOSPLIT, $0
+    MOVQ dst+0(FP), SI
+    MOVQ src+24(FP), DX
+    MOVOU (DX), X0
+    MOVOU 16(DX), X1
+    MOVOU X0, (SI)
+    MOVOU X1, 16(SI)
+    RET
+	
--- a/vendor/github.com/templexxx/reedsolomon/rs_other.go
+++ b/vendor/github.com/templexxx/reedsolomon/rs_other.go
@ -0,0 +1,8 @@
+// +build !amd64
+
+package reedsolomon
+
+// newRS builds the portable encoder used on non-amd64 targets. The generator
+// matrix is the part of the encoding matrix em that follows its first d*d
+// entries.
+func newRS(d, p int, em matrix) (enc Encoder) {
+	enc = &encBase{
+		data:   d,
+		parity: p,
+		encode: em,
+		gen:    em[d*d:],
+	}
+	return enc
+}
--- a/vendor/github.com/templexxx/reedsolomon/tbl.go
+++ b/vendor/github.com/templexxx/reedsolomon/tbl.go
--- a/vendor/github.com/tjfoc/gmsm/sm4/LICENSE
+++ b/vendor/github.com/tjfoc/gmsm/sm4/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/vendor/github.com/tjfoc/gmsm/sm4/sm4.go
+++ b/vendor/github.com/tjfoc/gmsm/sm4/sm4.go
@ -0,0 +1,291 @@
+/*
+Copyright Suzhou Tongji Fintech Research Institute 2017 All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+                 http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package sm4
+
+import (
+	"crypto/cipher"
+	"crypto/rand"
+	"crypto/x509"
+	"encoding/pem"
+	"errors"
+	"io/ioutil"
+	"os"
+	"strconv"
+)
+
+// BlockSize is the SM4 block size in bytes. NewCipher also requires keys of
+// exactly this length.
+const BlockSize = 16
+
+// SM4Key holds the raw bytes of an SM4 key.
+type SM4Key []byte
+
+// KeySizeError is the invalid key length reported by NewCipher.
+type KeySizeError int
+
+// Sm4Cipher is an instance of SM4 encryption.
+//
+// Encrypt and Decrypt write to block1 and block2 on every call, so a single
+// Sm4Cipher should not be used from several goroutines at once.
+type Sm4Cipher struct {
+	subkeys []uint32 // 32 round keys produced by generateSubKeys
+	block1  []uint32 // 4-word scratch state used by cryptBlock
+	block2  []byte   // 16-byte scratch output used by cryptBlock
+}
+
+// fk holds the four SM4 constants XORed into the key words at the start of the key schedule.
+var fk = [4]uint32{
+	0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc,
+}
+
+// ck holds the 32 SM4 constants consumed one per round by the key schedule.
+var ck = [32]uint32{
+	0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
+	0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
+	0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
+	0x50575e65, 0x6c737a81, 0x888f969d, 0xa4abb2b9,
+	0xc0c7ced5, 0xdce3eaf1, 0xf8ff060d, 0x141b2229,
+	0x30373e45, 0x4c535a61, 0x686f767d, 0x848b9299,
+	0xa0a7aeb5, 0xbcc3cad1, 0xd8dfe6ed, 0xf4fb0209,
+	0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279,
+}
+
+// sbox is the SM4 8-bit substitution table applied byte-wise by p.
+var sbox = [256]uint8{
+	0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
+	0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
+	0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
+	0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
+	0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
+	0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
+	0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
+	0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
+	0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
+	0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
+	0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
+	0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
+	0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
+	0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
+	0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
+	0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48,
+}
+
+// rl rotates x left by i bits; i is reduced modulo 32 first.
+func rl(x uint32, i uint8) uint32 {
+	n := i % 32
+	return x<<n | x>>(32-n)
+}
+
+// l0 is the linear transform used by the key schedule: b XORed with its
+// left rotations by 13 and 23 bits.
+func l0(b uint32) uint32 {
+	out := b
+	for _, n := range [...]uint8{13, 23} {
+		out ^= rl(b, n)
+	}
+	return out
+}
+
+// l1 is the linear transform used by the block rounds: b XORed with its
+// left rotations by 2, 10, 18 and 24 bits.
+func l1(b uint32) uint32 {
+	out := b
+	for _, n := range [...]uint8{2, 10, 18, 24} {
+		out ^= rl(b, n)
+	}
+	return out
+}
+
+// feistel0 computes one key-schedule round: x0 XORed with l0 applied to the
+// substituted mix of x1, x2, x3 and the round constant rk.
+func feistel0(x0, x1, x2, x3, rk uint32) uint32 {
+	mixed := x1 ^ x2 ^ x3 ^ rk
+	return x0 ^ l0(p(mixed))
+}
+
+// feistel1 computes one block round: x0 XORed with l1 applied to the
+// substituted mix of x1, x2, x3 and the round key rk.
+func feistel1(x0, x1, x2, x3, rk uint32) uint32 {
+	mixed := x1 ^ x2 ^ x3 ^ rk
+	return x0 ^ l1(p(mixed))
+}
+
+//非线性变换τ(.)
+func p(a uint32) uint32 {
+	return (uint32(sbox[a>>24]) << 24) ^ (uint32(sbox[(a>>16)&0xff]) << 16) ^ (uint32(sbox[(a>>8)&0xff]) << 8) ^ uint32(sbox[(a)&0xff])
+}
+
+/*
+func permuteInitialBlock(block []byte) []uint32 {
+	b := make([]uint32, 4, 4)
+	for i := 0; i < 4; i++ {
+		b[i] = (uint32(block[i*4]) << 24) | (uint32(block[i*4+1]) << 16) |
+			(uint32(block[i*4+2]) << 8) | (uint32(block[i*4+3]))
+	}
+	return b
+}
+
+func permuteFinalBlock(block []uint32) []byte {
+	b := make([]byte, 16, 16)
+	for i := 0; i < 4; i++ {
+		b[i*4] = uint8(block[i] >> 24)
+		b[i*4+1] = uint8(block[i] >> 16)
+		b[i*4+2] = uint8(block[i] >> 8)
+		b[i*4+3] = uint8(block[i])
+	}
+	return b
+}
+
+func cryptBlock(subkeys []uint32, dst, src []byte, decrypt bool) {
+	var tm uint32
+	b := permuteInitialBlock(src)
+	for i := 0; i < 32; i++ {
+		if decrypt {
+			tm = feistel1(b[0], b[1], b[2], b[3], subkeys[31-i])
+		} else {
+			tm = feistel1(b[0], b[1], b[2], b[3], subkeys[i])
+		}
+		b[0], b[1], b[2], b[3] = b[1], b[2], b[3], tm
+	}
+	b[0], b[1], b[2], b[3] = b[3], b[2], b[1], b[0]
+	copy(dst, permuteFinalBlock(b))
+}
+*/
+
+// permuteInitialBlock loads the first 16 bytes of block into b as four
+// big-endian 32-bit words.
+func permuteInitialBlock(b []uint32, block []byte) {
+	for i := 0; i < 4; i++ {
+		off := i * 4
+		word := uint32(block[off]) << 24
+		word |= uint32(block[off+1]) << 16
+		word |= uint32(block[off+2]) << 8
+		word |= uint32(block[off+3])
+		b[i] = word
+	}
+}
+
+// permuteFinalBlock stores the four words of block into the first 16 bytes of
+// b in big-endian order.
+func permuteFinalBlock(b []byte, block []uint32) {
+	for i := 0; i < 4; i++ {
+		off, word := i*4, block[i]
+		b[off] = uint8(word >> 24)
+		b[off+1] = uint8(word >> 16)
+		b[off+2] = uint8(word >> 8)
+		b[off+3] = uint8(word)
+	}
+}
+func cryptBlock(subkeys []uint32, b []uint32, r []byte, dst, src []byte, decrypt bool) {
+	var tm uint32
+
+	permuteInitialBlock(b, src)
+	for i := 0; i < 32; i++ {
+		if decrypt {
+			tm = b[0] ^ l1(p(b[1]^b[2]^b[3]^subkeys[i]))
+			//			tm = feistel1(b[0], b[1], b[2], b[3], subkeys[31-i])
+		} else {
+			tm = b[0] ^ l1(p(b[1]^b[2]^b[3]^subkeys[i]))
+			//	tm = feistel1(b[0], b[1], b[2], b[3], subkeys[i])
+		}
+		b[0], b[1], b[2], b[3] = b[1], b[2], b[3], tm
+	}
+	b[0], b[1], b[2], b[3] = b[3], b[2], b[1], b[0]
+	permuteFinalBlock(r, b)
+	copy(dst, r)
+}
+
+// generateSubKeys expands key into the 32 round keys. The key is loaded as
+// four big-endian words, XORed with fk, and then run through 32 feistel0
+// rounds, each consuming one ck constant and yielding one round key.
+func generateSubKeys(key []byte) []uint32 {
+	state := make([]uint32, 4)
+	permuteInitialBlock(state, key)
+	for i := range state {
+		state[i] ^= fk[i]
+	}
+
+	roundKeys := make([]uint32, 32)
+	for i := range roundKeys {
+		rk := feistel0(state[0], state[1], state[2], state[3], ck[i])
+		roundKeys[i] = rk
+		state[0], state[1], state[2], state[3] = state[1], state[2], state[3], rk
+	}
+	return roundKeys
+}
+
+func EncryptBlock(key SM4Key, dst, src []byte) {
+	subkeys := generateSubKeys(key)
+	cryptBlock(subkeys, make([]uint32, 4), make([]byte, 16), dst, src, false)
+}
+
+func DecryptBlock(key SM4Key, dst, src []byte) {
+	subkeys := generateSubKeys(key)
+	cryptBlock(subkeys, make([]uint32, 4), make([]byte, 16), dst, src, true)
+}
+
+// ReadKeyFromMem extracts an SM4 key from PEM-encoded data.
+//
+// An encrypted block must have type "SM4 ENCRYPTED KEY" and is decrypted with
+// pwd, which must be non-nil. A plain block must have type "SM4 KEY". An
+// error is returned when data contains no PEM block, when the block type is
+// not one of the two above, when a password is required but missing, or when
+// decryption fails.
+func ReadKeyFromMem(data []byte, pwd []byte) (SM4Key, error) {
+	block, _ := pem.Decode(data)
+	if block == nil {
+		// pem.Decode yields a nil block when no PEM data is found; using it
+		// below would panic.
+		return nil, errors.New("SM4: no PEM data found")
+	}
+	if x509.IsEncryptedPEMBlock(block) {
+		if block.Type != "SM4 ENCRYPTED KEY" {
+			return nil, errors.New("SM4: unknown type")
+		}
+		if pwd == nil {
+			return nil, errors.New("SM4: need passwd")
+		}
+		data, err := x509.DecryptPEMBlock(block, pwd)
+		if err != nil {
+			return nil, err
+		}
+		return data, nil
+	}
+	if block.Type != "SM4 KEY" {
+		return nil, errors.New("SM4: unknown type")
+	}
+	return block.Bytes, nil
+}
+
+// ReadKeyFromPem reads the file FileName and parses its contents with
+// ReadKeyFromMem, using pwd for encrypted keys. A read failure is returned
+// unchanged.
+func ReadKeyFromPem(FileName string, pwd []byte) (SM4Key, error) {
+	contents, readErr := ioutil.ReadFile(FileName)
+	if readErr == nil {
+		return ReadKeyFromMem(contents, pwd)
+	}
+	return nil, readErr
+}
+
+// WriteKeytoMem returns key encoded as PEM. With a nil pwd the key is stored
+// in the clear in a block of type "SM4 KEY"; otherwise it is encrypted with
+// pwd using AES-256 and stored in a block of type "SM4 ENCRYPTED KEY".
+func WriteKeytoMem(key SM4Key, pwd []byte) ([]byte, error) {
+	if pwd == nil {
+		plain := pem.Block{
+			Type:  "SM4 KEY",
+			Bytes: key,
+		}
+		return pem.EncodeToMemory(&plain), nil
+	}
+
+	sealed, err := x509.EncryptPEMBlock(rand.Reader,
+		"SM4 ENCRYPTED KEY", key, pwd, x509.PEMCipherAES256)
+	if err != nil {
+		return nil, err
+	}
+	return pem.EncodeToMemory(sealed), nil
+}
+
+// WriteKeyToPem writes key to the file FileName in PEM form, creating or
+// truncating the file. With a nil pwd the key is stored in the clear in a
+// block of type "SM4 KEY"; otherwise it is encrypted with pwd using AES-256
+// and stored in a block of type "SM4 ENCRYPTED KEY".
+//
+// It returns true and a nil error on success. On any failure, including a
+// failure to encode or to close the file, it returns false together with the
+// error.
+func WriteKeyToPem(FileName string, key SM4Key, pwd []byte) (bool, error) {
+	var block *pem.Block
+
+	if pwd != nil {
+		var err error
+		block, err = x509.EncryptPEMBlock(rand.Reader,
+			"SM4 ENCRYPTED KEY", key, pwd, x509.PEMCipherAES256)
+		if err != nil {
+			return false, err
+		}
+	} else {
+		block = &pem.Block{
+			Type:  "SM4 KEY",
+			Bytes: key,
+		}
+	}
+	file, err := os.Create(FileName)
+	if err != nil {
+		return false, err
+	}
+	if err := pem.Encode(file, block); err != nil {
+		// Best-effort close; the encode error is the one worth reporting.
+		file.Close()
+		return false, err
+	}
+	// A failed close can mean the data never reached the disk.
+	if err := file.Close(); err != nil {
+		return false, err
+	}
+	return true, nil
+}
+
+func (k KeySizeError) Error() string {
+	return "SM4: invalid key size " + strconv.Itoa(int(k))
+}
+
+// NewCipher returns a cipher.Block that performs SM4 with the given key. The
+// key must be exactly BlockSize (16) bytes long; any other length yields a
+// KeySizeError.
+func NewCipher(key []byte) (cipher.Block, error) {
+	if n := len(key); n != BlockSize {
+		return nil, KeySizeError(n)
+	}
+	return &Sm4Cipher{
+		subkeys: generateSubKeys(key),
+		block1:  make([]uint32, 4),
+		block2:  make([]byte, 16),
+	}, nil
+}
+
+// BlockSize returns the SM4 block size in bytes (the BlockSize constant).
+func (c *Sm4Cipher) BlockSize() int {
+	return BlockSize
+}
+
+// Encrypt runs cryptBlock in encrypt mode on the block in src and writes the
+// result to dst. It uses the cipher's own scratch buffers (block1, block2),
+// so concurrent calls on the same Sm4Cipher would interfere with each other.
+func (c *Sm4Cipher) Encrypt(dst, src []byte) {
+	cryptBlock(c.subkeys, c.block1, c.block2, dst, src, false)
+}
+
+// Decrypt runs cryptBlock in decrypt mode on the block in src and writes the
+// result to dst. It uses the cipher's own scratch buffers (block1, block2),
+// so concurrent calls on the same Sm4Cipher would interfere with each other.
+func (c *Sm4Cipher) Decrypt(dst, src []byte) {
+	cryptBlock(c.subkeys, c.block1, c.block2, dst, src, true)
+}
--- a/vendor/github.com/AudriusButkevicius/kcp-go/LICENSE
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/LICENSE
--- a/vendor/github.com/AudriusButkevicius/kcp-go/crypt.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/crypt.go
@ -7,6 +7,7 @@ import (
 	"crypto/sha1"

 	"github.com/templexxx/xor"
+	"github.com/tjfoc/gmsm/sm4"

 	"golang.org/x/crypto/blowfish"
 	"golang.org/x/crypto/cast5"
@ -55,6 +56,28 @@ func (c *salsa20BlockCrypt) Decrypt(dst, src []byte) {
 	copy(dst[:8], src[:8])
 }

+// sm4BlockCrypt is a BlockCrypt backed by an SM4 cipher.Block. encbuf and
+// decbuf are scratch buffers handed to the package-level encrypt and decrypt
+// helpers by the Encrypt and Decrypt methods.
+type sm4BlockCrypt struct {
+	encbuf []byte
+	decbuf []byte
+	block  cipher.Block
+}
+
+// NewSM4BlockCrypt returns a BlockCrypt that uses the SM4 cipher from
+// https://github.com/tjfoc/gmsm/tree/master/sm4 with the given key. Any error
+// from sm4.NewCipher is returned unchanged.
+func NewSM4BlockCrypt(key []byte) (BlockCrypt, error) {
+	block, err := sm4.NewCipher(key)
+	if err != nil {
+		return nil, err
+	}
+	return &sm4BlockCrypt{
+		encbuf: make([]byte, sm4.BlockSize),
+		decbuf: make([]byte, 2*sm4.BlockSize),
+		block:  block,
+	}, nil
+}
+
+// Encrypt passes src and dst to the package-level encrypt helper together
+// with the SM4 block cipher and the encbuf scratch buffer.
+func (c *sm4BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+
+// Decrypt passes src and dst to the package-level decrypt helper together
+// with the SM4 block cipher and the decbuf scratch buffer.
+func (c *sm4BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+
 type twofishBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
--- a/vendor/github.com/AudriusButkevicius/kcp-go/fec.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/fec.go
@ -4,7 +4,7 @@ import (
 	"encoding/binary"
 	"sync/atomic"

-	"github.com/klauspost/reedsolomon"
+	"github.com/templexxx/reedsolomon"
 )

 const (
@ -52,7 +52,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder {
 	fec.dataShards = dataShards
 	fec.parityShards = parityShards
 	fec.shardSize = dataShards + parityShards
-	enc, err := reedsolomon.New(dataShards, parityShards, reedsolomon.WithMaxGoroutines(1))
+	enc, err := reedsolomon.New(dataShards, parityShards)
 	if err != nil {
 		return nil
 	}
@ -157,7 +157,7 @@ func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
 					xorBytes(shards[k][dlen:], shards[k][dlen:], shards[k][dlen:])
 				}
 			}
-			if err := dec.codec.Reconstruct(shards); err == nil {
+			if err := dec.codec.ReconstructData(shards); err == nil {
 				for k := range shards[:dec.dataShards] {
 					if !shardsflag[k] {
 						recovered = append(recovered, shards[k])
@ -226,7 +226,7 @@ func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder {
 	fec.headerOffset = offset
 	fec.payloadOffset = fec.headerOffset + fecHeaderSize

-	enc, err := reedsolomon.New(dataShards, parityShards, reedsolomon.WithMaxGoroutines(1))
+	enc, err := reedsolomon.New(dataShards, parityShards)
 	if err != nil {
 		return nil
 	}
--- a/vendor/github.com/AudriusButkevicius/kcp-go/kcp.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/kcp.go
--- a/vendor/github.com/AudriusButkevicius/kcp-go/sess.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/sess.go
@ -3,7 +3,6 @@ package kcp
 import (
 	"crypto/rand"
 	"encoding/binary"
-	"fmt"
 	"hash/crc32"
 	"io"
 	"net"
@ -306,8 +305,10 @@ func (s *UDPSession) Close() error {
 	// remove this session from updater & listener(if necessary)
 	updater.removeSession(s)
 	if s.l != nil { // notify listener
-		key := fmt.Sprintf("%s/%d", s.remote.String(), s.kcp.conv)
-		s.l.closeSession(key)
+		s.l.closeSession(sessionKey{
+			addr:   s.remote.String(),
+			convID: s.kcp.conv,
+		})
 	}

 	s.mu.Lock()
@ -660,6 +661,11 @@ func (s *UDPSession) readLoop() {
 }

 type (
+	sessionKey struct {
+		addr   string
+		convID uint32
+	}
+
 	// Listener defines a server listening for connections
 	Listener struct {
 		block        BlockCrypt     // block encryption
@ -668,12 +674,12 @@ type (
 		fecDecoder   *fecDecoder    // FEC mock initialization
 		conn         net.PacketConn // the underlying packet connection

-		sessions        map[string]*UDPSession // all sessions accepted by this Listener
-		chAccepts       chan *UDPSession       // Listen() backlog
-		chSessionClosed chan string            // session close queue
-		headerSize      int                    // the overall header size added before KCP frame
-		die             chan struct{}          // notify the listener has closed
-		rd              atomic.Value           // read deadline for Accept()
+		sessions        map[sessionKey]*UDPSession // all sessions accepted by this Listener
+		chAccepts       chan *UDPSession           // Listen() backlog
+		chSessionClosed chan sessionKey            // session close queue
+		headerSize      int                        // the overall header size added before KCP frame
+		die             chan struct{}              // notify the listener has closed
+		rd              atomic.Value               // read deadline for Accept()
 		wd              atomic.Value
 	}

@ -687,7 +693,7 @@ type (
 // monitor incoming data for all connections of server
 func (l *Listener) monitor() {
 	// cache last session
-	var lastKey string
+	var lastKey sessionKey
 	var lastSession *UDPSession

 	chPacket := make(chan inPacket, qlen)
@ -728,8 +734,10 @@ func (l *Listener) monitor() {
 				}

 				if convValid {
-					addr := from.String()
-					key := fmt.Sprintf("%s/%d", addr, conv)
+					key := sessionKey{
+						addr:   from.String(),
+						convID: conv,
+					}
 					var s *UDPSession
 					var ok bool

@ -739,7 +747,7 @@ func (l *Listener) monitor() {
 						s, ok = lastSession, true
 					} else if s, ok = l.sessions[key]; ok {
 						lastSession = s
-						lastKey = addr
+						lastKey = key
 					}

 					if !ok { // new session
@ -758,7 +766,7 @@ func (l *Listener) monitor() {
 			xmitBuf.Put(raw)
 		case key := <-l.chSessionClosed:
 			if key == lastKey {
-				lastKey = ""
+				lastKey = sessionKey{}
 			}
 			delete(l.sessions, key)
 		case <-l.die:
@ -856,7 +864,7 @@ func (l *Listener) Close() error {
 }

 // closeSession notify the listener that a session has closed
-func (l *Listener) closeSession(key string) bool {
+func (l *Listener) closeSession(key sessionKey) bool {
 	select {
 	case l.chSessionClosed <- key:
 		return true
@ -890,9 +898,9 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
 func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*Listener, error) {
 	l := new(Listener)
 	l.conn = conn
-	l.sessions = make(map[string]*UDPSession)
+	l.sessions = make(map[sessionKey]*UDPSession)
 	l.chAccepts = make(chan *UDPSession, acceptBacklog)
-	l.chSessionClosed = make(chan string)
+	l.chSessionClosed = make(chan sessionKey)
 	l.die = make(chan struct{})
 	l.dataShards = dataShards
 	l.parityShards = parityShards
--- a/vendor/github.com/AudriusButkevicius/kcp-go/snmp.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/snmp.go
--- a/vendor/github.com/AudriusButkevicius/kcp-go/updater.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/updater.go
--- a/vendor/github.com/AudriusButkevicius/kcp-go/xor.go
+++ b/vendor/github.com/AudriusButkevicius/kcp-go/xor.go
--- a/vendor/manifest
+++ b/vendor/manifest
@ -17,14 +17,6 @@
 			"branch": "master",
 			"notests": true
 		},
-		{
-			"importpath": "github.com/AudriusButkevicius/kcp-go",
-			"repository": "https://github.com/AudriusButkevicius/kcp-go",
-			"vcs": "git",
-			"revision": "d17218ba2121268b854dd84f2bb54679541c4048",
-			"branch": "master",
-			"notests": true
-		},
 		{
 			"importpath": "github.com/AudriusButkevicius/pfilter",
 			"repository": "https://github.com/AudriusButkevicius/pfilter",
@ -265,14 +257,6 @@
 			"branch": "master",
 			"notests": true
 		},
-		{
-			"importpath": "github.com/klauspost/reedsolomon",
-			"repository": "https://github.com/klauspost/reedsolomon",
-			"vcs": "git",
-			"revision": "5abf0ee302ccf4834e84f63ff74eca3e8b88e4e2",
-			"branch": "master",
-			"notests": true
-		},
 		{
 			"importpath": "github.com/lib/pq",
 			"repository": "https://github.com/lib/pq",
@ -378,6 +362,22 @@
 			"path": "/leveldb",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/templexxx/cpufeat",
+			"repository": "https://github.com/templexxx/cpufeat",
+			"vcs": "git",
+			"revision": "3794dfbfb04749f896b521032f69383f24c3687e",
+			"branch": "master",
+			"notests": true
+		},
+		{
+			"importpath": "github.com/templexxx/reedsolomon",
+			"repository": "https://github.com/templexxx/reedsolomon",
+			"vcs": "git",
+			"revision": "7092926d7d05c415fabb892b1464a03f8228ab80",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/templexxx/xor",
 			"repository": "https://github.com/templexxx/xor",
@ -394,6 +394,15 @@
 			"branch": "master",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/tjfoc/gmsm/sm4",
+			"repository": "https://github.com/tjfoc/gmsm",
+			"vcs": "git",
+			"revision": "0f4904804c0f24f1784e10195a4144fcffa86a85",
+			"branch": "master",
+			"path": "/sm4",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/vitrun/qart/coding",
 			"repository": "https://github.com/vitrun/qart",
@ -421,6 +430,14 @@
 			"path": "/qr",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/xtaci/kcp-go",
+			"repository": "https://github.com/xtaci/kcp-go",
+			"vcs": "git",
+			"revision": "21da33a6696d67c1bffb3c954366499d613097a6",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/xtaci/smux",
 			"repository": "https://github.com/xtaci/smux",