Add support for snappy and lz4 compression

2024-01-05 21:30:08 -06:00 · 2024-01-05 21:30:08 -06:00 · 965d3deb8d
parent 2b6aa85302
commit 965d3deb8d
47 changed files with 7774 additions and 6 deletions
--- a/cmd/errors.go
+++ b/cmd/errors.go
@ -16,7 +16,7 @@ import (

 var (
 	ErrInvalidAdminPrefix    = errors.New("admin path must match the pattern " + AllowedCharacters)
-	ErrInvalidCompression    = errors.New("supported compression formats: flate, gzip, lzw, none, zlib, zstd")
+	ErrInvalidCompression    = errors.New("supported compression formats: flate, gzip, lz4, lzw, none, snappy, zlib, zstd")
 	ErrInvalidConcurrency    = errors.New("concurrency limit must be a positive integer")
 	ErrInvalidFileCountRange = errors.New("maximum file count limit must be greater than or equal to minimum file count limit")
 	ErrInvalidFileCountValue = errors.New("file count limits must be non-negative integers no greater than 2147483647")
--- a/cmd/index.go
+++ b/cmd/index.go
@ -17,8 +17,10 @@ import (
 	"sync"
 	"time"

+	"github.com/golang/snappy"
 	"github.com/julienschmidt/httprouter"
 	"github.com/klauspost/compress/zstd"
+	lz4 "github.com/pierrec/lz4/v4"
 	"seedno.de/seednode/roulette/types"
 )

@ -91,16 +93,20 @@ func (index *fileIndex) isEmpty() bool {
 	return length == 0
 }

-func getReader(format string, file io.Reader) (io.ReadCloser, error) {
+func getReader(format string, file io.Reader) (io.Reader, error) {
 	switch format {
 	case "flate":
 		return flate.NewReader(file), nil
 	case "gzip":
 		return gzip.NewReader(file)
+	case "lz4":
+		return lz4.NewReader(file), nil
 	case "lzw":
 		return lzw.NewReader(file, lzw.LSB, 8), nil
 	case "none":
 		return io.NopCloser(file), nil
+	case "snappy":
+		return snappy.NewReader(file), nil
 	case "zlib":
 		return zlib.NewReader(file)
 	case "zstd":
@ -117,11 +123,22 @@ func getWriter(format string, file io.WriteCloser) (io.WriteCloser, error) {
 	case "flate":
 		return flate.NewWriter(file, flate.DefaultCompression)
 	case "gzip":
-		return gzip.NewWriter(file), nil
+		return gzip.NewWriterLevel(file, gzip.BestCompression)
+	case "lz4":
+		encoder := lz4.NewWriter(file)
+
+		err := encoder.Apply(lz4.CompressionLevelOption(lz4.Level9))
+		if err != nil {
+			return file, err
+		}
+
+		return encoder, nil
 	case "lzw":
 		return lzw.NewWriter(file, lzw.LSB, 8), nil
 	case "none":
 		return file, nil
+	case "snappy":
+		return snappy.NewBufferedWriter(file), nil
 	case "zlib":
 		return zlib.NewWriter(file), nil
 	case "zstd":
@ -202,7 +219,7 @@ func (index *fileIndex) Import(path string) error {
 	if err != nil {
 		return err
 	}
-	defer reader.Close()
+	//defer reader.Close()

 	dec := gob.NewDecoder(reader)

--- a/cmd/root.go
+++ b/cmd/root.go
@ -15,7 +15,7 @@ import (

 const (
 	AllowedCharacters string = `^[A-z0-9.\-_]+$`
-	ReleaseVersion    string = "3.10.0"
+	ReleaseVersion    string = "3.11.0"
 )

 var (
@ -62,8 +62,10 @@ var (
 	CompressionFormats = []string{
 		"flate",
 		"gzip",
+		"lz4",
 		"lzw",
 		"none",
+		"snappy",
 		"zlib",
 		"zstd",
 	}
@ -133,7 +135,7 @@ func init() {
 	rootCmd.Flags().BoolVar(&CaseSensitive, "case-sensitive", false, "use case-sensitive matching for filters")
 	rootCmd.Flags().BoolVar(&Code, "code", false, "enable support for source code files")
 	rootCmd.Flags().StringVar(&CodeTheme, "code-theme", "solarized-dark256", "theme for source code syntax highlighting")
-	rootCmd.Flags().StringVar(&Compression, "compression", "zstd", "compression format to use for index (flate, gzip, lzw, none, zlib, zstd)")
+	rootCmd.Flags().StringVar(&Compression, "compression", "zstd", "compression format to use for index (flate, gzip, lz4, lzw, none, snappy, zlib, zstd)")
 	rootCmd.Flags().IntVar(&Concurrency, "concurrency", 8192, "maximum concurrency for scan threads")
 	rootCmd.Flags().BoolVar(&DisableButtons, "disable-buttons", false, "disable first/prev/next/last buttons")
 	rootCmd.Flags().BoolVar(&ExitOnError, "exit-on-error", false, "shut down webserver on error, instead of just printing error")
--- a/go.mod
+++ b/go.mod
@ -4,8 +4,10 @@ go 1.21

 require (
 	github.com/alecthomas/chroma/v2 v2.12.0
+	github.com/golang/snappy v0.0.4
 	github.com/julienschmidt/httprouter v1.3.0
 	github.com/klauspost/compress v1.17.4
+	github.com/pierrec/lz4/v4 v4.1.19
 	github.com/spf13/cobra v1.8.0
 	github.com/yosssi/gohtml v0.0.0-20201013000340-ee4748c638f4
 	golang.org/x/image v0.14.0
--- a/go.sum
+++ b/go.sum
@ -7,6 +7,8 @@ github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW5
 github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
 github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
 github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
 github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
 github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
@ -15,6 +17,8 @@ github.com/julienschmidt/httprouter v1.3.0 h1:U0609e9tgbseu3rBINet9P48AI/D3oJs4d
 github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM=
 github.com/klauspost/compress v1.17.4 h1:Ej5ixsIri7BrIjBkRZLTo6ghwrEtHFk7ijlczPW4fZ4=
 github.com/klauspost/compress v1.17.4/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
+github.com/pierrec/lz4/v4 v4.1.19 h1:tYLzDnjDXh9qIxSTKHwXwOYmm9d887Y7Y1ZkyXYHAN4=
+github.com/pierrec/lz4/v4 v4.1.19/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
 github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0=
 github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho=
--- a/vendor/github.com/golang/snappy/.gitignore
+++ b/vendor/github.com/golang/snappy/.gitignore
@ -0,0 +1,16 @@
+cmd/snappytool/snappytool
+testdata/bench
+
+# These explicitly listed benchmark data files are for an obsolete version of
+# snappy_test.go.
+testdata/alice29.txt
+testdata/asyoulik.txt
+testdata/fireworks.jpeg
+testdata/geo.protodata
+testdata/html
+testdata/html_x_4
+testdata/kppkn.gtb
+testdata/lcet10.txt
+testdata/paper-100k.pdf
+testdata/plrabn12.txt
+testdata/urls.10K
--- a/vendor/github.com/golang/snappy/AUTHORS
+++ b/vendor/github.com/golang/snappy/AUTHORS
@ -0,0 +1,18 @@
+# This is the official list of Snappy-Go authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Amazon.com, Inc
+Damian Gryski <dgryski@gmail.com>
+Eric Buth <eric@topos.com>
+Google Inc.
+Jan Mercl <0xjnml@gmail.com>
+Klaus Post <klauspost@gmail.com>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Sebastien Binet <seb.binet@gmail.com>
--- a/vendor/github.com/golang/snappy/CONTRIBUTORS
+++ b/vendor/github.com/golang/snappy/CONTRIBUTORS
@ -0,0 +1,41 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the Snappy-Go repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+#     http://code.google.com/legal/individual-cla-v1.0.html
+#     http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+#     Name <email address>
+
+# Please keep the list sorted.
+
+Alex Legg <alexlegg@google.com>
+Damian Gryski <dgryski@gmail.com>
+Eric Buth <eric@topos.com>
+Jan Mercl <0xjnml@gmail.com>
+Jonathan Swinney <jswinney@amazon.com>
+Kai Backman <kaib@golang.org>
+Klaus Post <klauspost@gmail.com>
+Marc-Antoine Ruel <maruel@chromium.org>
+Nigel Tao <nigeltao@golang.org>
+Rob Pike <r@golang.org>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Russ Cox <rsc@golang.org>
+Sebastien Binet <seb.binet@gmail.com>
--- a/vendor/github.com/golang/snappy/LICENSE
+++ b/vendor/github.com/golang/snappy/LICENSE
@ -0,0 +1,27 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/vendor/github.com/golang/snappy/README
+++ b/vendor/github.com/golang/snappy/README
@ -0,0 +1,107 @@
+The Snappy compression format in the Go programming language.
+
+To download and install from source:
+$ go get github.com/golang/snappy
+
+Unless otherwise noted, the Snappy-Go source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+
+
+Benchmarks.
+
+The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
+or so files, the same set used by the C++ Snappy code (github.com/google/snappy
+and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
+3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
+
+"go test -test.bench=."
+
+_UFlat0-8         2.19GB/s ± 0%  html
+_UFlat1-8         1.41GB/s ± 0%  urls
+_UFlat2-8         23.5GB/s ± 2%  jpg
+_UFlat3-8         1.91GB/s ± 0%  jpg_200
+_UFlat4-8         14.0GB/s ± 1%  pdf
+_UFlat5-8         1.97GB/s ± 0%  html4
+_UFlat6-8          814MB/s ± 0%  txt1
+_UFlat7-8          785MB/s ± 0%  txt2
+_UFlat8-8          857MB/s ± 0%  txt3
+_UFlat9-8          719MB/s ± 1%  txt4
+_UFlat10-8        2.84GB/s ± 0%  pb
+_UFlat11-8        1.05GB/s ± 0%  gaviota
+
+_ZFlat0-8         1.04GB/s ± 0%  html
+_ZFlat1-8          534MB/s ± 0%  urls
+_ZFlat2-8         15.7GB/s ± 1%  jpg
+_ZFlat3-8          740MB/s ± 3%  jpg_200
+_ZFlat4-8         9.20GB/s ± 1%  pdf
+_ZFlat5-8          991MB/s ± 0%  html4
+_ZFlat6-8          379MB/s ± 0%  txt1
+_ZFlat7-8          352MB/s ± 0%  txt2
+_ZFlat8-8          396MB/s ± 1%  txt3
+_ZFlat9-8          327MB/s ± 1%  txt4
+_ZFlat10-8        1.33GB/s ± 1%  pb
+_ZFlat11-8         605MB/s ± 1%  gaviota
+
+
+
+"go test -test.bench=. -tags=noasm"
+
+_UFlat0-8          621MB/s ± 2%  html
+_UFlat1-8          494MB/s ± 1%  urls
+_UFlat2-8         23.2GB/s ± 1%  jpg
+_UFlat3-8         1.12GB/s ± 1%  jpg_200
+_UFlat4-8         4.35GB/s ± 1%  pdf
+_UFlat5-8          609MB/s ± 0%  html4
+_UFlat6-8          296MB/s ± 0%  txt1
+_UFlat7-8          288MB/s ± 0%  txt2
+_UFlat8-8          309MB/s ± 1%  txt3
+_UFlat9-8          280MB/s ± 1%  txt4
+_UFlat10-8         753MB/s ± 0%  pb
+_UFlat11-8         400MB/s ± 0%  gaviota
+
+_ZFlat0-8          409MB/s ± 1%  html
+_ZFlat1-8          250MB/s ± 1%  urls
+_ZFlat2-8         12.3GB/s ± 1%  jpg
+_ZFlat3-8          132MB/s ± 0%  jpg_200
+_ZFlat4-8         2.92GB/s ± 0%  pdf
+_ZFlat5-8          405MB/s ± 1%  html4
+_ZFlat6-8          179MB/s ± 1%  txt1
+_ZFlat7-8          170MB/s ± 1%  txt2
+_ZFlat8-8          189MB/s ± 1%  txt3
+_ZFlat9-8          164MB/s ± 1%  txt4
+_ZFlat10-8         479MB/s ± 1%  pb
+_ZFlat11-8         270MB/s ± 1%  gaviota
+
+
+
+For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
+are the numbers from C++ Snappy's
+
+make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
+
+BM_UFlat/0     2.4GB/s  html
+BM_UFlat/1     1.4GB/s  urls
+BM_UFlat/2    21.8GB/s  jpg
+BM_UFlat/3     1.5GB/s  jpg_200
+BM_UFlat/4    13.3GB/s  pdf
+BM_UFlat/5     2.1GB/s  html4
+BM_UFlat/6     1.0GB/s  txt1
+BM_UFlat/7   959.4MB/s  txt2
+BM_UFlat/8     1.0GB/s  txt3
+BM_UFlat/9   864.5MB/s  txt4
+BM_UFlat/10    2.9GB/s  pb
+BM_UFlat/11    1.2GB/s  gaviota
+
+BM_ZFlat/0   944.3MB/s  html (22.31 %)
+BM_ZFlat/1   501.6MB/s  urls (47.78 %)
+BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
+BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
+BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
+BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
+BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
+BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
+BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
+BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
+BM_ZFlat/10    1.2GB/s  pb (19.68 %)
+BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)
--- a/vendor/github.com/golang/snappy/decode.go
+++ b/vendor/github.com/golang/snappy/decode.go
@ -0,0 +1,264 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid.
+	ErrCorrupt = errors.New("snappy: corrupt input")
+	// ErrTooLarge reports that the uncompressed length is too large.
+	ErrTooLarge = errors.New("snappy: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported.
+	ErrUnsupported = errors.New("snappy: unsupported input")
+
+	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
+)
+
+// DecodedLen returns the length of the decoded block.
+func DecodedLen(src []byte) (int, error) {
+	v, _, err := decodedLen(src)
+	return v, err
+}
+
+// decodedLen returns the length of the decoded block and the number of bytes
+// that the length header occupied.
+func decodedLen(src []byte) (blockLen, headerLen int, err error) {
+	v, n := binary.Uvarint(src)
+	if n <= 0 || v > 0xffffffff {
+		return 0, 0, ErrCorrupt
+	}
+
+	const wordSize = 32 << (^uint(0) >> 32 & 1)
+	if wordSize == 32 && v > 0x7fffffff {
+		return 0, 0, ErrTooLarge
+	}
+	return int(v), n, nil
+}
+
+const (
+	decodeErrCodeCorrupt                  = 1
+	decodeErrCodeUnsupportedLiteralLength = 2
+)
+
+// Decode returns the decoded form of src. The returned slice may be a sub-
+// slice of dst if dst was large enough to hold the entire decoded block.
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Decode handles the Snappy block format, not the Snappy stream format.
+func Decode(dst, src []byte) ([]byte, error) {
+	dLen, s, err := decodedLen(src)
+	if err != nil {
+		return nil, err
+	}
+	if dLen <= len(dst) {
+		dst = dst[:dLen]
+	} else {
+		dst = make([]byte, dLen)
+	}
+	switch decode(dst, src[s:]) {
+	case 0:
+		return dst, nil
+	case decodeErrCodeUnsupportedLiteralLength:
+		return nil, errUnsupportedLiteralLength
+	}
+	return nil, ErrCorrupt
+}
+
+// NewReader returns a new Reader that decompresses from r, using the framing
+// format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func NewReader(r io.Reader) *Reader {
+	return &Reader{
+		r:       r,
+		decoded: make([]byte, maxBlockSize),
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
+	}
+}
+
+// Reader is an io.Reader that can read Snappy-compressed bytes.
+//
+// Reader handles the Snappy stream format, not the Snappy block format.
+type Reader struct {
+	r       io.Reader
+	err     error
+	decoded []byte
+	buf     []byte
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j       int
+	readHeader bool
+}
+
+// Reset discards any buffered data, resets all state, and switches the Snappy
+// reader to read from r. This permits reusing a Reader rather than allocating
+// a new one.
+func (r *Reader) Reset(reader io.Reader) {
+	r.r = reader
+	r.err = nil
+	r.i = 0
+	r.j = 0
+	r.readHeader = false
+}
+
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
+	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt
+		}
+		return false
+	}
+	return true
+}
+
+func (r *Reader) fill() error {
+	for r.i >= r.j {
+		if !r.readFull(r.buf[:4], true) {
+			return r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader {
+			if chunkType != chunkTypeStreamIdentifier {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+		if chunkLen > len(r.buf) {
+			r.err = ErrUnsupported
+			return r.err
+		}
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			buf = buf[checksumSize:]
+
+			n, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return r.err
+			}
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if _, err := Decode(r.decoded, buf); err != nil {
+				r.err = err
+				return r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			buf := r.buf[:checksumSize]
+			if !r.readFull(buf, false) {
+				return r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n := chunkLen - checksumSize
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return r.err
+			}
+			for i := 0; i < len(magicBody); i++ {
+				if r.buf[i] != magicBody[i] {
+					r.err = ErrCorrupt
+					return r.err
+				}
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.readFull(r.buf[:chunkLen], false) {
+			return r.err
+		}
+	}
+
+	return nil
+}
+
+// Read satisfies the io.Reader interface.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+
+	if err := r.fill(); err != nil {
+		return 0, err
+	}
+
+	n := copy(p, r.decoded[r.i:r.j])
+	r.i += n
+	return n, nil
+}
+
+// ReadByte satisfies the io.ByteReader interface.
+func (r *Reader) ReadByte() (byte, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+
+	if err := r.fill(); err != nil {
+		return 0, err
+	}
+
+	c := r.decoded[r.i]
+	r.i++
+	return c, nil
+}
--- a/vendor/github.com/golang/snappy/decode_amd64.s
+++ b/vendor/github.com/golang/snappy/decode_amd64.s
@ -0,0 +1,490 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- AX	scratch
+//	- BX	scratch
+//	- CX	length or x
+//	- DX	offset
+//	- SI	&src[s]
+//	- DI	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	used by doCopy
+//	- R15	used by doCopy
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
+// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
+TEXT ·decode(SB), NOSPLIT, $48-56
+	// Initialize SI, DI and R8-R13.
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, DI
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, SI
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+loop:
+	// for s < len(src)
+	CMPQ SI, R13
+	JEQ  end
+
+	// CX = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBLZX (SI), CX
+	MOVL    CX, BX
+	ANDL    $3, BX
+	CMPL    BX, $1
+	JAE     tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	SHRL $2, CX
+	CMPL CX, $60
+	JAE  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	INCQ SI
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that CX == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// CX can hold 64 bits, so the increment cannot overflow.
+	INCQ CX
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// AX = len(dst) - d
+	// BX = len(src) - s
+	MOVQ R10, AX
+	SUBQ DI, AX
+	MOVQ R13, BX
+	SUBQ SI, BX
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT  callMemmove
+	CMPQ AX, $16
+	JLT  callMemmove
+	CMPQ BX, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(SI), X0
+	MOVOU X0, 0(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMPQ CX, AX
+	JGT  errCorrupt
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADDQ CX, SI
+	SUBQ $58, SI
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// case x == 60:
+	CMPL CX, $61
+	JEQ  tagLit61
+	JA   tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBLZX -1(SI), CX
+	JMP     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVWLZX -2(SI), CX
+	JMP     doLit
+
+tagLit62Plus:
+	CMPL CX, $62
+	JA   tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVWLZX -3(SI), CX
+	MOVBLZX -1(SI), BX
+	SHLL    $16, BX
+	ORL     BX, CX
+	JMP     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVL -4(SI), CX
+	JMP  doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADDQ $5, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVLQZX -4(SI), DX
+	JMP     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADDQ $3, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVWQZX -2(SI), DX
+	JMP     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- BX == src[s] & 0x03
+	//	- CX == src[s]
+	CMPQ BX, $2
+	JEQ  tagCopy2
+	JA   tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADDQ $2, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVQ SI, BX
+	SUBQ R11, BX
+	CMPQ BX, R12
+	JA   errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	MOVQ    CX, DX
+	ANDQ    $0xe0, DX
+	SHLQ    $3, DX
+	MOVBQZX -1(SI), BX
+	ORQ     BX, DX
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	SHRQ $2, CX
+	ANDQ $7, CX
+	ADDQ $4, CX
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- CX == length && CX > 0
+	//	- DX == offset
+
+	// if offset <= 0 { etc }
+	CMPQ DX, $0
+	JLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVQ DI, BX
+	SUBQ R8, BX
+	CMPQ BX, DX
+	JLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVQ R10, BX
+	SUBQ DI, BX
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R14 = len(dst)-d
+	//	- R15 = &dst[d-offset]
+	MOVQ R10, R14
+	SUBQ DI, R14
+	MOVQ DI, R15
+	SUBQ DX, R15
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ CX, $16
+	JGT  slowForwardCopy
+	CMPQ DX, $8
+	JLT  slowForwardCopy
+	CMPQ R14, $16
+	JLT  slowForwardCopy
+	MOVQ 0(R15), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(R15), BX
+	MOVQ BX, 8(DI)
+	ADDQ CX, DI
+	JMP  loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R14
+	CMPQ CX, R14
+	JGT  verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMPQ DX, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R15), BX
+	MOVQ BX, (DI)
+	SUBQ DX, CX
+	ADDQ DX, DI
+	ADDQ DX, DX
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by DI being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ DI, AX
+	ADDQ CX, DI
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ CX, $0
+	JLE  loop
+	MOVQ (R15), BX
+	MOVQ BX, (AX)
+	ADDQ $8, R15
+	ADDQ $8, AX
+	SUBQ $8, CX
+	JMP  finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), BX
+	MOVB BX, (DI)
+	INCQ R15
+	INCQ DI
+	DECQ CX
+	JNZ  verySlowForwardCopy
+	JMP  loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMPQ DI, R10
+	JNE  errCorrupt
+
+	// return 0
+	MOVQ $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVQ $1, ret+48(FP)
+	RET
--- a/vendor/github.com/golang/snappy/decode_arm64.s
+++ b/vendor/github.com/golang/snappy/decode_arm64.s
@ -0,0 +1,494 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// Decodes the Snappy block in src into dst and stores 0 in ret on success, or
+// 1 (decodeErrCodeCorrupt) on failure.
+//
+// Frame layout ($56-56): the arguments and result are read at dst_base+0(FP),
+// dst_len+8(FP), src_base+24(FP), src_len+32(FP) and ret+48(FP). Within the
+// local frame, 8(RSP)..24(RSP) hold the outgoing arguments for the
+// runtime·memmove CALL and 32(RSP)..48(RSP) are the spill slots that are read
+// back after that CALL.
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- R1	scratch (holds immediates for comparisons and arithmetic)
+//	- R2	scratch
+//	- R3	scratch
+//	- R4	length or x
+//	- R5	offset
+//	- R6	&src[s]
+//	- R7	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	used by doCopy, and as half of the 16-byte temporary in doLit
+//	- R15	used by doCopy, and as half of the 16-byte temporary in doLit
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly R7 - R8,  and len(dst)-d is R10 - R7.
+// The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6.
+TEXT ·decode(SB), NOSPLIT, $56-56
+	// Initialize R6, R7 and R8-R13.
+	MOVD dst_base+0(FP), R8
+	MOVD dst_len+8(FP), R9
+	MOVD R8, R7
+	MOVD R8, R10
+	ADD  R9, R10, R10
+	MOVD src_base+24(FP), R11
+	MOVD src_len+32(FP), R12
+	MOVD R11, R6
+	MOVD R11, R13
+	ADD  R12, R13, R13
+
+loop:
+	// for s < len(src)
+	CMP R13, R6
+	BEQ end
+
+	// R4 = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	//
+	// R3 receives the two tag bits. A value of 1 or more is a copy tag and is
+	// dispatched at tagCopy; a value of 0 falls through to the literal code.
+	MOVBU (R6), R4
+	MOVW  R4, R3
+	ANDW  $3, R3
+	MOVW  $1, R1
+	CMPW  R1, R3
+	BGE   tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	//
+	// The branch is taken when x >= 60, i.e. when the literal length is stored
+	// in the bytes that follow the tag byte.
+	MOVW $60, R1
+	LSRW $2, R4, R4
+	CMPW R4, R1
+	BLS  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	ADD $1, R6, R6
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that R4 == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// R4 can hold 64 bits, so the increment cannot overflow.
+	ADD $1, R4, R4
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// R2 = len(dst) - d
+	// R3 = len(src) - s
+	MOVD R10, R2
+	SUB  R7, R2, R2
+	MOVD R13, R3
+	SUB  R6, R3, R3
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMP $16, R4
+	BGT callMemmove
+	CMP $16, R2
+	BLT callMemmove
+	CMP $16, R3
+	BLT callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	LDP 0(R6), (R14, R15)
+	STP (R14, R15), 0(R7)
+
+	// d += length
+	// s += length
+	ADD R4, R7, R7
+	ADD R4, R6, R6
+	B   loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMP R2, R4
+	BGT errCorrupt
+	CMP R3, R4
+	BGT errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// R7, R6 and R4 as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	//
+	// The first three stores (8, 16 and 24(RSP)) are the outgoing arguments;
+	// the last three (32, 40 and 48(RSP)) are the spill copies that are
+	// reloaded below.
+	MOVD R7, 8(RSP)
+	MOVD R6, 16(RSP)
+	MOVD R4, 24(RSP)
+	MOVD R7, 32(RSP)
+	MOVD R6, 40(RSP)
+	MOVD R4, 48(RSP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVD 32(RSP), R7
+	MOVD 40(RSP), R6
+	MOVD 48(RSP), R4
+	MOVD dst_base+0(FP), R8
+	MOVD dst_len+8(FP), R9
+	MOVD R8, R10
+	ADD  R9, R10, R10
+	MOVD src_base+24(FP), R11
+	MOVD src_len+32(FP), R12
+	MOVD R11, R13
+	ADD  R12, R13, R13
+
+	// d += length
+	// s += length
+	ADD R4, R7, R7
+	ADD R4, R6, R6
+	B   loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	//
+	// With x in 60..63 this advances s by 2..5: the tag byte plus the 1..4
+	// bytes that hold the literal length.
+	ADD  R4, R6, R6
+	SUB  $58, R6, R6
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// case x == 60:
+	//
+	// Compare x against 61: equal goes to tagLit61, greater goes to
+	// tagLit62Plus, and otherwise x == 60 and we fall through.
+	MOVW $61, R1
+	CMPW R1, R4
+	BEQ  tagLit61
+	BGT  tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBU -1(R6), R4
+	B     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVHU -2(R6), R4
+	B     doLit
+
+tagLit62Plus:
+	// x is 62 or 63 here; anything above 62 goes to tagLit63.
+	CMPW $62, R4
+	BHI  tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	//
+	// The three bytes are assembled from a 2-byte load and a 1-byte load.
+	MOVHU -3(R6), R4
+	MOVBU -1(R6), R3
+	ORR   R3<<16, R4
+	B     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVWU -4(R6), R4
+	B     doLit
+
+	// The code above handles literal tags.
+	// ----------------------------------------
+	// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADD $5, R6, R6
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	MOVD $1, R1
+	ADD  R4>>2, R1, R4
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVWU -4(R6), R5
+	B     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADD $3, R6, R6
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	MOVD $1, R1
+	ADD  R4>>2, R1, R4
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVHU -2(R6), R5
+	B     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- R3 == src[s] & 0x03
+	//	- R4 == src[s]
+	//
+	// R3 is 1, 2 or 3 here: 2 goes to tagCopy2, 3 goes to tagCopy4, and 1
+	// falls through to the tagCopy1 code below.
+	CMP $2, R3
+	BEQ tagCopy2
+	BGT tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADD $2, R6, R6
+
+	// if uint(s) > uint(len(src)) { etc }
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	MOVD  R4, R5
+	AND   $0xe0, R5
+	MOVBU -1(R6), R3
+	ORR   R5<<3, R3, R5
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	MOVD $7, R1
+	AND  R4>>2, R1, R4
+	ADD  $4, R4, R4
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- R4 == length && R4 > 0
+	//	- R5 == offset
+
+	// if offset <= 0 { etc }
+	MOVD $0, R1
+	CMP  R1, R5
+	BLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVD R7, R3
+	SUB  R8, R3, R3
+	CMP  R5, R3
+	BLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVD R10, R3
+	SUB  R7, R3, R3
+	CMP  R3, R4
+	BGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R14 = len(dst)-d
+	//	- R15 = &dst[d-offset]
+	MOVD R10, R14
+	SUB  R7, R14, R14
+	MOVD R7, R15
+	SUB  R5, R15, R15
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMP  $16, R4
+	BGT  slowForwardCopy
+	CMP  $8, R5
+	BLT  slowForwardCopy
+	CMP  $16, R14
+	BLT  slowForwardCopy
+	MOVD 0(R15), R2
+	MOVD R2, 0(R7)
+	MOVD 8(R15), R3
+	MOVD R3, 8(R7)
+	ADD  R4, R7, R7
+	B    loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUB $10, R14, R14
+	CMP R14, R4
+	BGT verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMP  $8, R5
+	BGE  fixUpSlowForwardCopy
+	MOVD (R15), R3
+	MOVD R3, (R7)
+	SUB  R5, R4, R4
+	ADD  R5, R7, R7
+	ADD  R5, R5, R5
+	B    makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by R7 being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save R7 to R2 so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVD R7, R2
+	ADD  R4, R7, R7
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	MOVD $0, R1
+	CMP  R1, R4
+	BLE  loop
+	MOVD (R15), R3
+	MOVD R3, (R2)
+	ADD  $8, R15, R15
+	ADD  $8, R2, R2
+	SUB  $8, R4, R4
+	B    finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), R3
+	MOVB R3, (R7)
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	SUB  $1, R4, R4
+	CBNZ R4, verySlowForwardCopy
+	B    loop
+
+	// The code above handles copy tags.
+	// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMP R10, R7
+	BNE errCorrupt
+
+	// return 0
+	MOVD $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVD $1, R2
+	MOVD R2, ret+48(FP)
+	RET
--- a/vendor/github.com/golang/snappy/decode_asm.go
+++ b/vendor/github.com/golang/snappy/decode_asm.go
@ -0,0 +1,15 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+// +build amd64 arm64
+
+package snappy
+
+// decode has the same semantics as in decode_other.go: it writes the decoding
+// of src to dst and returns 0 on success or a decodeErrCodeXxx error code on
+// failure.
+//
+// This is a declaration without a Go body. Per the build constraints at the
+// top of this file it is only compiled for amd64 and arm64 builds that use
+// the gc toolchain and do not set the appengine or noasm tags; the body is
+// supplied in assembly (see decode_arm64.s for the arm64 version).
+//
+//go:noescape
+func decode(dst, src []byte) int
--- a/vendor/github.com/golang/snappy/decode_other.go
+++ b/vendor/github.com/golang/snappy/decode_other.go
@ -0,0 +1,115 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64 appengine !gc noasm
+
+package snappy
+
+// decode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// src is consumed as a sequence of elements, each introduced by a tag byte
+// whose low two bits select either a literal (bytes copied from src) or one of
+// three copy forms (bytes copied from earlier in dst). Positions and lengths
+// are validated before they are used, so malformed input produces an error
+// code instead of an out-of-range access.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func decode(dst, src []byte) int {
+	// d and s are the current positions in dst and src. offset and length
+	// describe a pending copy and are assigned by the copy-tag cases below.
+	var d, s, offset, length int
+	for s < len(src) {
+		switch src[s] & 0x03 {
+		case tagLiteral:
+			// The upper six bits of the tag byte either hold length-1 directly
+			// (x < 60) or say how many following bytes hold it (60..63 mean
+			// 1..4 bytes, little-endian).
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			// s now points at the first literal byte.
+			length = int(x) + 1
+			// x is a uint32, so this can only be non-positive if the
+			// conversion or the increment overflows int (presumably only where
+			// int is 32 bits wide).
+			if length <= 0 {
+				return decodeErrCodeUnsupportedLiteralLength
+			}
+			if length > len(dst)-d || length > len(src)-s {
+				return decodeErrCodeCorrupt
+			}
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			// Literals are complete here; only copy tags reach the code after
+			// the switch.
+			continue
+
+		case tagCopy1:
+			// Two-byte element: length is 4..11 (three bits of the tag byte),
+			// and offset is 11 bits (the top three bits of the tag byte
+			// followed by the next byte).
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 4 + int(src[s-2])>>2&0x7
+			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+
+		case tagCopy2:
+			// Three-byte element: length is 1..64 (upper six bits of the tag
+			// byte), and offset is the following two bytes, little-endian.
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-3])>>2
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			// Five-byte element: length is 1..64 (upper six bits of the tag
+			// byte), and offset is the following four bytes, little-endian.
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-5])>>2
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		// Validate the copy: the offset must be positive, must not reach back
+		// before the start of dst, and the copy must fit in what is left of
+		// dst.
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If no overlap, use the built-in copy:
+		if offset >= length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
+		}
+		d += length
+	}
+	// All of src has been consumed; the output must have been filled exactly.
+	if d != len(dst) {
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}
--- a/vendor/github.com/golang/snappy/encode.go
+++ b/vendor/github.com/golang/snappy/encode.go
@ -0,0 +1,289 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+// Encode compresses src into the Snappy block format (not the Snappy stream
+// format) and returns the result.
+//
+// If dst is at least MaxEncodedLen(len(src)) bytes long, the result is a
+// sub-slice of dst; otherwise a new slice is allocated. A nil dst is valid.
+// The dst and src must not overlap.
+//
+// Encode panics with ErrTooLarge when src is too large to encode.
+func Encode(dst, src []byte) []byte {
+	maxLen := MaxEncodedLen(len(src))
+	if maxLen < 0 {
+		panic(ErrTooLarge)
+	}
+	if len(dst) < maxLen {
+		dst = make([]byte, maxLen)
+	}
+
+	// A block begins with the decompressed length, encoded as a varint.
+	written := binary.PutUvarint(dst, uint64(len(src)))
+
+	// Encode the input in pieces of at most maxBlockSize bytes.
+	for remaining := src; len(remaining) > 0; {
+		chunk := remaining
+		if len(chunk) > maxBlockSize {
+			chunk, remaining = remaining[:maxBlockSize], remaining[maxBlockSize:]
+		} else {
+			remaining = nil
+		}
+
+		out := dst[written:]
+		if len(chunk) >= minNonLiteralBlockSize {
+			written += encodeBlock(out, chunk)
+		} else {
+			// Too short for encodeBlock: emit the piece as one literal.
+			written += emitLiteral(out, chunk)
+		}
+	}
+	return dst[:written]
+}
+
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+//
+// The value is 15: the 16-byte copy width minus the 1-byte minimum literal.
+const inputMargin = 16 - 1
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// could be encoded with a copy tag. This is the minimum with respect to the
+// algorithm used by encodeBlock, not a minimum enforced by the file format.
+//
+// The encoded output must start with at least a 1 byte literal, as there are
+// no previous bytes to copy. A minimal (1 byte) copy after that, generated
+// from an emitCopy call in encodeBlock's main loop, would require at least
+// another inputMargin bytes, for the reason above: we want any emitLiteral
+// calls inside encodeBlock's main loop to use the fast path if possible, which
+// requires being able to overrun by inputMargin bytes. Thus,
+// minNonLiteralBlockSize equals 1 + 1 + inputMargin, which is 17.
+//
+// Encode emits any piece shorter than this as a single literal instead of
+// calling encodeBlock.
+//
+// The C++ code doesn't use this exact threshold, but it could, as discussed at
+// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
+// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
+// optimization. It should not affect the encoded form. This is tested by
+// TestSameEncodingAsCppShortCopies.
+const minNonLiteralBlockSize = 1 + 1 + inputMargin
+
+// MaxEncodedLen returns the maximum length of a snappy block, given its
+// uncompressed length.
+//
+// It will return a negative value if srcLen is too large to encode.
+func MaxEncodedLen(srcLen int) int {
+	n := uint64(srcLen)
+	if n > 0xffffffff {
+		return -1
+	}
+	// Compressed data can be defined as:
+	//    compressed := item* literal*
+	//    item       := literal* copy
+	//
+	// The trailing literal sequence has a space blowup of at most 62/60
+	// since a literal of length 60 needs one tag byte + one extra byte
+	// for length information.
+	//
+	// Item blowup is trickier to measure. Suppose the "copy" op copies
+	// 4 bytes of data. Because of a special check in the encoding code,
+	// we produce a 4-byte copy only if the offset is < 65536. Therefore
+	// the copy op takes 3 bytes to encode, and this type of item leads
+	// to at most the 62/60 blowup for representing literals.
+	//
+	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
+	// enough, it will take 5 bytes to encode the copy op. Therefore the
+	// worst case here is a one-byte literal followed by a five-byte copy.
+	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
+	//
+	// This last factor dominates the blowup, so the final estimate is:
+	n = 32 + n + n/6
+	if n > 0xffffffff {
+		return -1
+	}
+	return int(n)
+}
+
+// errClosed is the error that Close stores in a Writer that closed cleanly.
+// Because write and Flush return the stored error without doing any work, a
+// Writer keeps reporting errClosed after Close until Reset clears it.
+var errClosed = errors.New("snappy: Writer is closed")
+
+// NewWriter creates an unbuffered Writer that compresses to w.
+//
+// Each Write is encoded and forwarded to w immediately, so there is no need
+// to Flush or Close the returned Writer.
+//
+// Deprecated: the Writer returned is not suitable for many small writes, only
+// for few large writes. Use NewBufferedWriter instead, which is efficient
+// regardless of the frequency and shape of the writes, and remember to Close
+// that Writer when done.
+func NewWriter(w io.Writer) *Writer {
+	// ibuf is left nil: that is what marks a Writer as unbuffered.
+	unbuffered := new(Writer)
+	unbuffered.w = w
+	unbuffered.obuf = make([]byte, obufLen)
+	return unbuffered
+}
+
+// NewBufferedWriter returns a new Writer that compresses to w, using the
+// framing format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// The Writer returned buffers writes. Users must call Close to guarantee all
+// data has been forwarded to the underlying io.Writer. They may also call
+// Flush zero or more times before calling Close.
+func NewBufferedWriter(w io.Writer) *Writer {
+	return &Writer{
+		w:    w,
+		ibuf: make([]byte, 0, maxBlockSize),
+		obuf: make([]byte, obufLen),
+	}
+}
+
+// Writer is an io.Writer that can write Snappy-compressed bytes.
+//
+// Writer handles the Snappy stream format, not the Snappy block format.
+//
+// Use NewBufferedWriter (or the deprecated NewWriter) to create one; both
+// allocate the obuf that encoding relies on.
+type Writer struct {
+	// w is the destination for the encoded stream. err is the sticky error:
+	// it records the first failure of a write to w, or errClosed after a clean
+	// Close. While it is non-nil, write and Flush return it without doing any
+	// further work. Reset clears it.
+	w   io.Writer
+	err error
+
+	// ibuf is a buffer for the incoming (uncompressed) bytes.
+	//
+	// Its use is optional. For backwards compatibility, Writers created by the
+	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
+	// therefore do not need to be Flush'ed or Close'd.
+	ibuf []byte
+
+	// obuf is a buffer for the outgoing (compressed) bytes.
+	obuf []byte
+
+	// wroteStreamHeader is whether we have written the stream header. It is
+	// set on the first chunk written and cleared again by Reset.
+	wroteStreamHeader bool
+}
+
+// Reset drops any pending state and points the Snappy writer at a new
+// destination, so that a Writer can be reused instead of allocating another.
+//
+// Buffered input that has not been flushed is discarded, the stored error is
+// cleared, and the stream header will be written again on the next write.
+func (w *Writer) Reset(writer io.Writer) {
+	w.w, w.err, w.wroteStreamHeader = writer, nil, false
+	if w.ibuf == nil {
+		// Unbuffered Writer: there is no input buffer to empty.
+		return
+	}
+	w.ibuf = w.ibuf[:0]
+}
+
+// Write satisfies the io.Writer interface.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+	if w.ibuf == nil {
+		// Do not buffer incoming bytes. This does not perform or compress well
+		// if the caller of Writer.Write writes many small slices. This
+		// behavior is therefore deprecated, but still supported for backwards
+		// compatibility with code that doesn't explicitly Flush or Close.
+		return w.write(p)
+	}
+
+	// The remainder of this method is based on bufio.Writer.Write from the
+	// standard library.
+
+	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
+		var n int
+		if len(w.ibuf) == 0 {
+			// Large write, empty buffer.
+			// Write directly from p to avoid copy.
+			n, _ = w.write(p)
+		} else {
+			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+			w.ibuf = w.ibuf[:len(w.ibuf)+n]
+			w.Flush()
+		}
+		nRet += n
+		p = p[n:]
+	}
+	if w.err != nil {
+		return nRet, w.err
+	}
+	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+	w.ibuf = w.ibuf[:len(w.ibuf)+n]
+	nRet += n
+	return nRet, nil
+}
+
+func (w *Writer) write(p []byte) (nRet int, errRet error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+	for len(p) > 0 {
+		obufStart := len(magicChunk)
+		if !w.wroteStreamHeader {
+			w.wroteStreamHeader = true
+			copy(w.obuf, magicChunk)
+			obufStart = 0
+		}
+
+		var uncompressed []byte
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+		checksum := crc(uncompressed)
+
+		// Compress the buffer, discarding the result if the improvement
+		// isn't at least 12.5%.
+		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
+		chunkType := uint8(chunkTypeCompressedData)
+		chunkLen := 4 + len(compressed)
+		obufEnd := obufHeaderLen + len(compressed)
+		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
+			chunkType = chunkTypeUncompressedData
+			chunkLen = 4 + len(uncompressed)
+			obufEnd = obufHeaderLen
+		}
+
+		// Fill in the per-chunk header that comes before the body.
+		w.obuf[len(magicChunk)+0] = chunkType
+		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
+		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
+		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
+		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
+		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
+		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
+		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
+
+		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
+			w.err = err
+			return nRet, err
+		}
+		if chunkType == chunkTypeUncompressedData {
+			if _, err := w.w.Write(uncompressed); err != nil {
+				w.err = err
+				return nRet, err
+			}
+		}
+		nRet += len(uncompressed)
+	}
+	return nRet, nil
+}
+
+// Flush encodes any buffered input and sends it to the underlying io.Writer.
+//
+// It returns the Writer's stored error, if there is one, and does nothing
+// when no input is buffered.
+func (w *Writer) Flush() error {
+	switch {
+	case w.err != nil:
+		return w.err
+	case len(w.ibuf) == 0:
+		return nil
+	}
+
+	// write records any failure in w.err, which is what gets returned below,
+	// so its own results are not needed here.
+	_, _ = w.write(w.ibuf)
+	w.ibuf = w.ibuf[:0]
+	return w.err
+}
+
+// Close flushes any buffered input and then marks the Writer as closed.
+//
+// It returns the error left by the flush, or by an earlier failed write. A
+// Writer that closed cleanly stores errClosed, which later calls will report.
+func (w *Writer) Close() error {
+	// Flush leaves any failure in w.err, which is inspected right below.
+	w.Flush()
+	if w.err != nil {
+		return w.err
+	}
+	w.err = errClosed
+	return nil
+}
--- a/vendor/github.com/golang/snappy/encode_amd64.s
+++ b/vendor/github.com/golang/snappy/encode_amd64.s
@ -0,0 +1,730 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
+// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
+// https://github.com/golang/snappy/issues/29
+//
+// As a workaround, the package was built with a known good assembler, and
+// those instructions were disassembled by "objdump -d" to yield the
+//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+// style comments, in AT&T asm syntax. Note that rsp here is a physical
+// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
+// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
+// fine on Go 1.6.
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	len(lit)
+//	- BX	n, i.e. len(lit) - 1
+//	- DX	return value: len(lit) plus the 1, 2 or 3 header bytes
+//	- DI	&dst[i]
+//	- R10	&lit[0]
+//
+// The 24 bytes of stack space hold the three 8-byte arguments of runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ lit_base+24(FP), R10
+	MOVQ lit_len+32(FP), AX
+	MOVQ AX, DX
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  oneByte
+	CMPL BX, $256
+	JLT  twoBytes
+
+threeBytes:
+	MOVB $0xf4, 0(DI)  // 0xf4 = 61<<2: n follows in the next two bytes.
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	ADDQ $3, DX
+	JMP  memmove
+
+twoBytes:
+	MOVB $0xf0, 0(DI)  // 0xf0 = 60<<2: n follows in the next byte.
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	ADDQ $2, DX
+	JMP  memmove
+
+oneByte:
+	SHLB $2, BX  // n < 60, so n<<2 is the whole one-byte header.
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+	ADDQ $1, DX
+
+memmove:
+	MOVQ DX, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	length (the part of the copy not yet emitted)
+//	- SI	&dst[0]
+//	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, SI
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX
+
+loop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)  // 0xfe = (64-1)<<2 | 2.
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  loop0
+
+step1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)  // 0xee = (60-1)<<2 | 2.
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMPL AX, $12
+	JGE  step3
+	CMPL R11, $2048
+	JGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes: (offset>>8)<<5 | (length-4)<<2 | 1, then the low byte of offset.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes: (length-1)<<2 | 2, then the 16-bit offset.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// Returns the end index of the run at src[j:] that matches src[i:]. All locals fit into registers:
+//	- DX	&src[0]
+//	- SI	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVQ src_base+0(FP), DX
+	MOVQ src_len+8(FP), R14
+	MOVQ i+24(FP), R15
+	MOVQ j+32(FP), SI
+	ADDQ DX, R14
+	ADDQ DX, R15
+	ADDQ DX, SI
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   cmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  bsf
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+
+	// Convert from &src[ret] to ret.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail (fewer than 8 bytes left at &src[j]), compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  extendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  extendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- AX	.	.
+//	- BX	.	.
+//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
+//	- DX	64	&src[0], tableSize
+//	- SI	72	&src[s]
+//	- DI	80	&dst[d]
+//	- R9	88	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	96	prevHash, currHash, nextHash, offset
+//	- R12	104	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	112	candidate
+//
+// The second column (56, 64, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
+TEXT ·encodeBlock(SB), 0, $32888-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVQ $24, CX
+	MOVQ $256, DX
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	CMPQ DX, $16384
+	JGE  varTable
+	CMPQ DX, R14
+	JGE  varTable
+	SUBQ $1, CX
+	SHLQ $1, DX
+	JMP  calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
+	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
+	// 2048 writes that would zero-initialize all of table's 32768 bytes.
+	SHRQ $3, DX
+	LEAQ table-32768(SP), BX
+	PXOR X0, X0
+
+memclr:
+	MOVOU X0, 0(BX)
+	ADDQ  $16, BX
+	SUBQ  $1, DX
+	JNZ   memclr
+
+	// !!! DX = &src[0]
+	MOVQ SI, DX
+
+	// sLimit := len(src) - inputMargin (inputMargin is 15 here)
+	MOVQ R14, R9
+	SUBQ $15, R9
+
+	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVQ CX, 56(SP)
+	MOVQ DX, 64(SP)
+	MOVQ R9, 88(SP)
+
+	// nextEmit := 0
+	MOVQ DX, R10
+
+	// s := 1
+	ADDQ $1, SI
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVL  0(SI), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVQ $32, R12
+
+	// nextS := s
+	MOVQ SI, R13
+
+	// candidate := 0
+	MOVQ $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVQ R13, SI
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVQ R12, R14
+	SHRQ $5, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADDQ R14, R13
+
+	// skip += bytesBetweenHashLookups
+	ADDQ R14, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVQ R13, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JA   emitRemainder
+
+	// candidate = int(table[nextHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[nextHash] = uint16(s)
+	MOVQ SI, AX
+	SUBQ DX, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVL  0(R13), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVL 0(SI), AX
+	MOVL (DX)(R15*1), BX
+	CMPL AX, BX
+	JNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	MOVQ R15, 112(SP)
+	CALL runtime·memmove(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+	MOVQ 112(SP), R15
+	JMP  inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVQ SI, R12
+
+	// !!! offset := base - candidate
+	MOVQ R12, R11
+	SUBQ R15, R11
+	SUBQ DX, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVQ src_len+32(FP), R14
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADDQ $4, R15
+	ADDQ DX, R15
+
+	// !!! s += 4
+	ADDQ $4, SI
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVQ SI, AX
+	SUBQ R12, AX
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	CMPL AX, $12
+	JGE  inlineEmitCopyStep3
+	CMPL R11, $2048
+	JGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVQ SI, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVQ SI, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JAE  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVQ -1(SI), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	SUBQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// currHash := hash(uint32(x>>8), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// candidate = int(table[currHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[currHash] = uint16(s)
+	ADDQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVL (DX)(R15*1), BX
+	CMPL R14, BX
+	JEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// s++
+	ADDQ $1, SI
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	JMP outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVQ src_len+32(FP), AX
+	ADDQ DX, AX
+	CMPQ R10, AX
+	JEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ DI, 80(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 80(SP), DI
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)". 48(SP) is emitLiteral's result slot.
+	ADDQ 48(SP), DI
+
+encodeBlockEnd:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, DI
+	MOVQ DI, d+48(FP)
+	RET
--- a/vendor/github.com/golang/snappy/encode_arm64.s
+++ b/vendor/github.com/golang/snappy/encode_arm64.s
@ -0,0 +1,722 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R3	len(lit)
+//	- R4	n, i.e. len(lit) - 1
+//	- R6	return value: len(lit) plus the 1, 2 or 3 header bytes
+//	- R8	&dst[i]
+//	- R10	&lit[0]
+//
+// The 32 bytes of stack space is to call runtime·memmove; its arguments go at 8(RSP), 16(RSP) and 24(RSP).
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $32-56
+	MOVD dst_base+0(FP), R8
+	MOVD lit_base+24(FP), R10
+	MOVD lit_len+32(FP), R3
+	MOVD R3, R6
+	MOVW R3, R4
+	SUBW $1, R4, R4
+
+	CMPW $60, R4
+	BLT  oneByte
+	CMPW $256, R4
+	BLT  twoBytes
+
+threeBytes:
+	MOVD $0xf4, R2  // 0xf4 = 61<<2: n follows in the next two bytes.
+	MOVB R2, 0(R8)
+	MOVW R4, 1(R8)  // NOTE(review): MOVW is the 32-bit move here, so 4 bytes are stored; the 2 extra bytes are then overwritten by the memmove below.
+	ADD  $3, R8, R8
+	ADD  $3, R6, R6
+	B    memmove
+
+twoBytes:
+	MOVD $0xf0, R2  // 0xf0 = 60<<2: n follows in the next byte.
+	MOVB R2, 0(R8)
+	MOVB R4, 1(R8)
+	ADD  $2, R8, R8
+	ADD  $2, R6, R6
+	B    memmove
+
+oneByte:
+	LSLW $2, R4, R4  // n < 60, so n<<2 is the whole one-byte header.
+	MOVB R4, 0(R8)
+	ADD  $1, R8, R8
+	ADD  $1, R6, R6
+
+memmove:
+	MOVD R6, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// R8, R10 and R3 as arguments.
+	MOVD R8, 8(RSP)
+	MOVD R10, 16(RSP)
+	MOVD R3, 24(RSP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R3	length (the part of the copy not yet emitted)
+//	- R7	&dst[0]
+//	- R8	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVD dst_base+0(FP), R8
+	MOVD R8, R7
+	MOVD offset+24(FP), R11
+	MOVD length+32(FP), R3
+
+loop0:
+	// for length >= 68 { etc }
+	CMPW $68, R3
+	BLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVD $0xfe, R2  // 0xfe = (64-1)<<2 | 2.
+	MOVB R2, 0(R8)
+	MOVW R11, 1(R8)  // NOTE(review): MOVW is a 32-bit store here, so 2 bytes beyond the 3-byte encoding are also written; confirm dst always has that slack.
+	ADD  $3, R8, R8
+	SUB  $64, R3, R3
+	B    loop0
+
+step1:
+	// if length > 64 { etc }
+	CMP $64, R3
+	BLE step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVD $0xee, R2  // 0xee = (60-1)<<2 | 2.
+	MOVB R2, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUB  $60, R3, R3
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMP  $12, R3
+	BGE  step3
+	CMPW $2048, R11
+	BGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes. With offset < 2048, (offset>>3)&0xe0 equals (offset>>8)<<5.
+	MOVB R11, 1(R8)
+	LSRW $3, R11, R11
+	AND  $0xe0, R11, R11
+	SUB  $4, R3, R3
+	LSLW $2, R3
+	AND  $0xff, R3, R3
+	ORRW R3, R11, R11
+	ORRW $1, R11, R11
+	MOVB R11, 0(R8)
+	ADD  $2, R8, R8
+
+	// Return the number of bytes written.
+	SUB  R7, R8, R8
+	MOVD R8, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes: (length-1)<<2 | 2, then the offset.
+	SUB  $1, R3, R3
+	AND  $0xff, R3, R3
+	LSLW $2, R3, R3
+	ORRW $2, R3, R3
+	MOVB R3, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+
+	// Return the number of bytes written.
+	SUB  R7, R8, R8
+	MOVD R8, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// Returns the end index of the run at src[j:] that matches src[i:]. All locals fit into registers:
+//	- R6	&src[0]
+//	- R7	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVD src_base+0(FP), R6
+	MOVD src_len+8(FP), R14
+	MOVD i+24(FP), R15
+	MOVD j+32(FP), R7
+	ADD  R6, R14, R14
+	ADD  R6, R15, R15
+	ADD  R6, R7, R7
+	MOVD R14, R13
+	SUB  $8, R13, R13
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMP  R13, R7
+	BHI  cmp1
+	MOVD (R15), R3
+	MOVD (R7), R4
+	CMP  R4, R3
+	BNE  bsf
+	ADD  $8, R15, R15
+	ADD  $8, R7, R7
+	B    cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs.
+	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
+	// combination of which finds the least significant bit which is set.
+	// The arm64 architecture is little-endian, and the shift by 3 converts
+	// a bit index to a byte index.
+	EOR  R3, R4, R4
+	RBIT R4, R4
+	CLZ  R4, R4
+	ADD  R4>>3, R7, R7
+
+	// Convert from &src[ret] to ret.
+	SUB  R6, R7, R7
+	MOVD R7, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail (fewer than 8 bytes left at &src[j]), compare 1 byte at a time.
+	CMP  R7, R14
+	BLS  extendMatchEnd
+	MOVB (R15), R3
+	MOVB (R7), R4
+	CMP  R4, R3
+	BNE  extendMatchEnd
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	B    cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUB  R6, R7, R7
+	MOVD R7, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- R3	.	.
+//	- R4	.	.
+//	- R5	64	shift
+//	- R6	72	&src[0], tableSize
+//	- R7	80	&src[s]
+//	- R8	88	&dst[d]
+//	- R9	96	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	104	prevHash, currHash, nextHash, offset
+//	- R12	112	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	120	candidate
+//	- R16	.	hash constant, 0x1e35a7bd
+//	- R17	.	&table
+//	- .  	128	table
+//
+// The second column (64, 72, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
+TEXT ·encodeBlock(SB), 0, $32896-56
+	MOVD dst_base+0(FP), R8
+	MOVD src_base+24(FP), R7
+	MOVD src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVD  $24, R5
+	MOVD  $256, R6
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	MOVD $16384, R2
+	CMP  R2, R6
+	BGE  varTable
+	CMP  R14, R6
+	BGE  varTable
+	SUB  $1, R5, R5
+	LSL  $1, R6, R6
+	B    calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each
+	// iteration writes 64 bytes, so we can do only tableSize/32 writes
+	// instead of the 2048 writes that would zero-initialize all of table's
+	// 32768 bytes. This clear could overrun the first tableSize elements, but
+	// it won't overrun the allocated stack size.
+	ADD  $128, RSP, R17
+	MOVD R17, R4
+
+	// !!! R6 = &table[tableSize], i.e. the end of the region to clear
+	ADD R6<<1, R17, R6
+
+memclr:
+	STP.P (ZR, ZR), 64(R4)
+	STP   (ZR, ZR), -48(R4)
+	STP   (ZR, ZR), -32(R4)
+	STP   (ZR, ZR), -16(R4)
+	CMP   R4, R6
+	BHI   memclr
+
+	// !!! R6 = &src[0]
+	MOVD R7, R6
+
+	// sLimit := len(src) - inputMargin (inputMargin is 15 here)
+	MOVD R14, R9
+	SUB  $15, R9, R9
+
+	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVD R5, 64(RSP)
+	MOVD R6, 72(RSP)
+	MOVD R9, 96(RSP)
+
+	// nextEmit := 0
+	MOVD R6, R10
+
+	// s := 1
+	ADD $1, R7, R7
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVW 0(R7), R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVD $32, R12
+
+	// nextS := s
+	MOVD R7, R13
+
+	// candidate := 0
+	MOVD $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVD R13, R7
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVD R12, R14
+	LSR  $5, R14, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADD R14, R13, R13
+
+	// skip += bytesBetweenHashLookups
+	ADD R14, R12, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVD R13, R3
+	SUB  R6, R3, R3
+	CMP  R9, R3
+	BHI  emitRemainder
+
+	// candidate = int(table[nextHash])
+	MOVHU 0(R17)(R11<<1), R15
+
+	// table[nextHash] = uint16(s)
+	MOVD R7, R3
+	SUB  R6, R3, R3
+
+	MOVH R3, 0(R17)(R11<<1)
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVW 0(R13), R11
+	MULW R16, R11
+	LSRW R5, R11, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVW 0(R7), R3
+	MOVW (R6)(R15), R4
+	CMPW R4, R3
+	BNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVD R7, R3
+	SUB  R10, R3, R3
+	CMP  $16, R3
+	BLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVW R3, R4
+	SUBW $1, R4, R4
+
+	MOVW $60, R2
+	CMPW R2, R4
+	BLT  inlineEmitLiteralOneByte
+	MOVW $256, R2
+	CMPW R2, R4
+	BLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVD $0xf4, R1
+	MOVB R1, 0(R8)
+	MOVW R4, 1(R8)
+	ADD  $3, R8, R8
+	B    inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVD $0xf0, R1
+	MOVB R1, 0(R8)
+	MOVB R4, 1(R8)
+	ADD  $2, R8, R8
+	B    inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	LSLW $2, R4, R4
+	MOVB R4, 0(R8)
+	ADD  $1, R8, R8
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// R8, R10 and R3 as arguments.
+	MOVD R8, 8(RSP)
+	MOVD R10, 16(RSP)
+	MOVD R3, 24(RSP)
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADD   R3, R8, R8
+	MOVD  R7, 80(RSP)
+	MOVD  R8, 88(RSP)
+	MOVD  R15, 120(RSP)
+	CALL  runtime·memmove(SB)
+	MOVD  64(RSP), R5
+	MOVD  72(RSP), R6
+	MOVD  80(RSP), R7
+	MOVD  88(RSP), R8
+	MOVD  96(RSP), R9
+	MOVD  120(RSP), R15
+	ADD   $128, RSP, R17  // R17 (&table) and R16 (hash constant) have no spill slot; they are recomputed after the call.
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+	B     inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB R3, R4
+	SUBW $1, R4, R4
+	AND  $0xff, R4, R4
+	LSLW $2, R4, R4
+	MOVB R4, (R8)
+	ADD  $1, R8, R8
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	LDP 0(R10), (R0, R1)
+	STP (R0, R1), 0(R8)
+	ADD R3, R8, R8
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVD R7, R12
+
+	// !!! offset := base - candidate
+	MOVD R12, R11
+	SUB  R15, R11, R11
+	SUB  R6, R11, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVD src_len+32(FP), R14
+	ADD  R6, R14, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVD R14, R13
+	SUB  $8, R13, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADD $4, R15, R15
+	ADD R6, R15, R15
+
+	// !!! s += 4
+	ADD $4, R7, R7
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMP  R13, R7
+	BHI  inlineExtendMatchCmp1
+	MOVD (R15), R3
+	MOVD (R7), R4
+	CMP  R4, R3
+	BNE  inlineExtendMatchBSF
+	ADD  $8, R15, R15
+	ADD  $8, R7, R7
+	B    inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs.
+	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
+	// combination of which finds the least significant bit which is set.
+	// The arm64 architecture is little-endian, and the shift by 3 converts
+	// a bit index to a byte index.
+	EOR  R3, R4, R4
+	RBIT R4, R4
+	CLZ  R4, R4
+	ADD  R4>>3, R7, R7
+	B    inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMP  R7, R14
+	BLS  inlineExtendMatchEnd
+	MOVB (R15), R3
+	MOVB (R7), R4
+	CMP  R4, R3
+	BNE  inlineExtendMatchEnd
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	B    inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVD R7, R3
+	SUB  R12, R3, R3
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	MOVW $68, R2
+	CMPW R2, R3
+	BLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVD $0xfe, R1
+	MOVB R1, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUBW $64, R3, R3
+	B    inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	MOVW $64, R2
+	CMPW R2, R3
+	BLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVD $0xee, R1
+	MOVB R1, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUBW $60, R3, R3
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	MOVW $12, R2
+	CMPW R2, R3
+	BGE  inlineEmitCopyStep3
+	MOVW $2048, R2
+	CMPW R2, R11
+	BGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(R8)
+	LSRW $8, R11, R11
+	LSLW $5, R11, R11
+	SUBW $4, R3, R3
+	AND  $0xff, R3, R3
+	LSLW $2, R3, R3
+	ORRW R3, R11, R11
+	ORRW $1, R11, R11
+	MOVB R11, 0(R8)
+	ADD  $2, R8, R8
+	B    inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBW $1, R3, R3
+	LSLW $2, R3, R3
+	ORRW $2, R3, R3
+	MOVB R3, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVD R7, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVD R7, R3
+	SUB  R6, R3, R3
+	CMP  R3, R9
+	BLS  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVD -1(R7), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVD R7, R3
+	SUB  R6, R3, R3
+	SUB  $1, R3, R3
+
+	MOVHU R3, 0(R17)(R11<<1)
+
+	// currHash := hash(uint32(x>>8), shift)
+	LSR  $8, R14, R14
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// candidate = int(table[currHash])
+	MOVHU 0(R17)(R11<<1), R15
+
+	// table[currHash] = uint16(s)
+	ADD   $1, R3, R3
+	MOVHU R3, 0(R17)(R11<<1)
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVW (R6)(R15), R4
+	CMPW R4, R14
+	BEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	LSR  $8, R14, R14
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// s++
+	ADD $1, R7, R7
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	B outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVD src_len+32(FP), R3
+	ADD  R6, R3, R3
+	CMP  R3, R10
+	BEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVD R8, 8(RSP)
+	MOVD $0, 16(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVD $0, 24(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVD R10, 32(RSP)
+	SUB  R10, R3, R3
+	MOVD R3, 40(RSP)
+	MOVD R3, 48(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVD R8, 88(RSP)
+	CALL ·emitLiteral(SB)
+	MOVD 88(RSP), R8
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)". 56(RSP) is emitLiteral's result slot.
+	MOVD 56(RSP), R1
+	ADD  R1, R8, R8
+
+encodeBlockEnd:
+	MOVD dst_base+0(FP), R3
+	SUB  R3, R8, R8
+	MOVD R8, d+48(FP)
+	RET
--- a/vendor/github.com/golang/snappy/encode_asm.go
+++ b/vendor/github.com/golang/snappy/encode_asm.go
@ -0,0 +1,30 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+// +build amd64 arm64
+
+package snappy
+
+// emitLiteral has the same semantics as in encode_other.go: it writes lit to
+// dst as one literal chunk and returns the number of bytes written.
+//
+// The declaration has no Go body; per the build constraints at the top of this
+// file it is only compiled for gc builds on amd64 or arm64 without the
+// appengine and noasm tags.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
+// emitCopy has the same semantics as in encode_other.go: it writes one or
+// more copy chunks describing a copy of length bytes from offset bytes back
+// into dst, and returns the number of bytes written.
+//
+//go:noescape
+func emitCopy(dst []byte, offset, length int) int
+
+// extendMatch has the same semantics as in encode_other.go: it returns the
+// largest k such that k <= len(src) and src[i:i+k-j] equals src[j:k].
+//
+//go:noescape
+func extendMatch(src []byte, i, j int) int
+
+// encodeBlock has the same semantics as in encode_other.go: it encodes a
+// non-empty src into dst and returns d, the number of bytes written to dst.
+//
+//go:noescape
+func encodeBlock(dst, src []byte) (d int)
--- a/vendor/github.com/golang/snappy/encode_other.go
+++ b/vendor/github.com/golang/snappy/encode_other.go
@ -0,0 +1,238 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64 appengine !gc noasm
+
+package snappy
+
+// load32 returns the four bytes b[i:i+4] decoded as a little-endian uint32.
+//
+// The slice expression panics if b[i:i+4] is out of range.
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+// load64 returns the eight bytes b[i:i+8] decoded as a little-endian uint64.
+//
+// The slice expression panics if b[i:i+8] is out of range.
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+//
+// The chunk is a 1-, 2- or 3-byte header carrying len(lit)-1, followed by a
+// copy of lit.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+	i, n := 0, uint(len(lit)-1) // n is the stored length: one less than len(lit).
+	switch {
+	case n < 60:
+		// Short literal: the length fits in the upper 6 bits of the tag byte.
+		dst[0] = uint8(n)<<2 | tagLiteral
+		i = 1
+	case n < 1<<8:
+		// Tag value 60: the length follows in one extra byte.
+		dst[0] = 60<<2 | tagLiteral
+		dst[1] = uint8(n)
+		i = 2
+	default:
+		// Tag value 61: the length follows in two extra bytes, low byte first.
+		dst[0] = 61<<2 | tagLiteral
+		dst[1] = uint8(n)
+		dst[2] = uint8(n >> 8)
+		i = 3
+	}
+	return i + copy(dst[i:], lit) // Header length plus the literal bytes copied.
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// Lengths above 64 are split across several chunks that all share the same
+// offset; the final chunk uses the 2-byte tagCopy1 form when it can and the
+// 3-byte tagCopy2 form otherwise.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= 65535
+//	4 <= length && length <= 65535
+func emitCopy(dst []byte, offset, length int) int {
+	i := 0
+	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
+	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
+	// length emitted down below is a little lower (at 60 = 64 - 4), because
+	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
+	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
+	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
+	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
+	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
+	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
+	for length >= 68 {
+		// Emit a length 64 copy, encoded as 3 bytes.
+		dst[i+0] = 63<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 64
+	}
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		dst[i+0] = 59<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 60
+	}
+	// tagCopy1 only holds lengths below 12 and offsets below 2048 (1<<11);
+	// anything larger needs the 3-byte form.
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		return i + 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes: offset bits 8-10 and
+	// length-4 share the tag byte, and the low offset byte follows.
+	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	dst[i+1] = uint8(offset)
+	return i + 2
+}
+
+// extendMatch returns the largest k such that k <= len(src) and that
+// src[i:i+k-j] and src[j:k] have the same contents.
+//
+// In other words, it advances i and j in lockstep while the bytes at both
+// positions agree, and returns the final value of j.
+//
+// It assumes that:
+//	0 <= i && i < j && j <= len(src)
+func extendMatch(src []byte, i, j int) int {
+	// The loop body is empty: all the work happens in the condition and the
+	// post statement.
+	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
+	}
+	return j
+}
+
+// hash multiplies u by the constant 0x1e35a7bd (wrapping at 32 bits) and
+// returns the top 32-shift bits of the product. encodeBlock uses the result as
+// an index into its hash table.
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift
+}
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It returns d, the number of bytes written to dst.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	// shift starts at 24 so that hash yields 8 bits (a 1<<8 entry table); each
+	// doubling of tableSize below makes hash yield one more bit.
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Look up the previous position with this hash, then record s
+			// in its place before testing whether the candidate really
+			// matches.
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			// The copy's offset is base-candidate and its length is s-base.
+			d += emitCopy(dst[d:], base-candidate, s-base)
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if uint32(x>>8) != load32(src, candidate) {
+				// No match at s: resume the literal search at s+1.
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	// Whatever is left after the last copy is emitted as one final literal.
+	if nextEmit < len(src) {
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
--- a/vendor/github.com/golang/snappy/snappy.go
+++ b/vendor/github.com/golang/snappy/snappy.go
@ -0,0 +1,98 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package snappy implements the Snappy compression format. It aims for very
+// high speeds and reasonable compression.
+//
+// There are actually two Snappy formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a Snappy stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// The canonical, C++ implementation is at https://github.com/google/snappy and
+// it only implements the block format.
+package snappy // import "github.com/golang/snappy"
+
+import (
+	"hash/crc32"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, this tag is a legacy format that is no longer issued by most
+    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+// Chunk tags: the value of the 2 least significant bits of the first byte of
+// each chunk, as laid out in the format description above.
+const (
+	tagLiteral = 0x00 // Literal bytes follow.
+	tagCopy1   = 0x01 // Copy with an 11-bit offset and a length in [4, 12).
+	tagCopy2   = 0x02 // Copy with a 16-bit offset and a length in [1, 65).
+	tagCopy4   = 0x03 // Legacy copy with a 32-bit offset and a length in [1, 65).
+)
+
+const (
+	// NOTE(review): checksumSize and chunkHeaderSize are presumably the byte
+	// sizes of a chunk's CRC and of a chunk header in the framing format;
+	// confirm against framing_format.txt.
+	checksumSize    = 4
+	chunkHeaderSize = 4
+	// magicChunk is the 4 bytes \xff\x06\x00\x00 followed by magicBody.
+	// NOTE(review): this looks like the stream identifier chunk (0xff matches
+	// chunkTypeStreamIdentifier and 6 is len(magicBody)); confirm against
+	// framing_format.txt.
+	magicChunk      = "\xff\x06\x00\x00" + magicBody
+	magicBody       = "sNaPpY"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
+	// https://github.com/google/snappy/blob/master/framing_format.txt says
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
+
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
+
+	// obufHeaderLen is the combined length of magicChunk, one checksum and one
+	// chunk header; obufLen adds room for one maximally sized encoded block.
+	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
+)
+
+// Chunk type identifiers.
+// NOTE(review): presumably the chunk type byte values of the framing (stream)
+// format; confirm against framing_format.txt.
+const (
+	chunkTypeCompressedData   = 0x00
+	chunkTypeUncompressedData = 0x01
+	chunkTypePadding          = 0xfe
+	chunkTypeStreamIdentifier = 0xff
+)
+
+// crcTable is the CRC-32 table for the Castagnoli polynomial, used by crc.
+var crcTable = crc32.MakeTable(crc32.Castagnoli)
+
+// crc implements the checksum specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// It computes the CRC-32 (Castagnoli) of b, rotates it right by 15 bits and
+// adds the constant 0xa282ead8.
+func crc(b []byte) uint32 {
+	c := crc32.Update(0, crcTable, b)
+	return uint32(c>>15|c<<17) + 0xa282ead8 // c>>15|c<<17 is a 32-bit rotate right by 15.
+}
--- a/vendor/github.com/pierrec/lz4/v4/.gitignore
+++ b/vendor/github.com/pierrec/lz4/v4/.gitignore
@ -0,0 +1,36 @@
+# Created by https://www.gitignore.io/api/macos
+
+### macOS ###
+*.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# End of https://www.gitignore.io/api/macos
+
+cmd/*/*exe
+.idea
+
+fuzz/*.zip
--- a/vendor/github.com/pierrec/lz4/v4/LICENSE
+++ b/vendor/github.com/pierrec/lz4/v4/LICENSE
@ -0,0 +1,28 @@
+Copyright (c) 2015, Pierre Curto
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of xxHash nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
--- a/vendor/github.com/pierrec/lz4/v4/README.md
+++ b/vendor/github.com/pierrec/lz4/v4/README.md
@ -0,0 +1,92 @@
+# lz4 : LZ4 compression in pure Go
+
+[![Go Reference](https://pkg.go.dev/badge/github.com/pierrec/lz4/v4.svg)](https://pkg.go.dev/github.com/pierrec/lz4/v4)
+[![CI](https://github.com/pierrec/lz4/workflows/ci/badge.svg)](https://github.com/pierrec/lz4/actions)
+[![Go Report Card](https://goreportcard.com/badge/github.com/pierrec/lz4)](https://goreportcard.com/report/github.com/pierrec/lz4)
+[![GitHub tag (latest SemVer)](https://img.shields.io/github/tag/pierrec/lz4.svg?style=social)](https://github.com/pierrec/lz4/tags)
+
+## Overview
+
+This package provides a streaming interface to [LZ4 data streams](http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html) as well as low level compress and uncompress functions for LZ4 data blocks.
+The implementation is based on the reference C [one](https://github.com/lz4/lz4).
+
+## Install
+
+Assuming you have the go toolchain installed:
+
+```
+go get github.com/pierrec/lz4/v4
+```
+
+There is a command line interface tool to compress and decompress LZ4 files.
+
+```
+go install github.com/pierrec/lz4/v4/cmd/lz4c
+```
+
+Usage
+
+```
+Usage of lz4c:
+  -version
+        print the program version
+
+Subcommands:
+Compress the given files or from stdin to stdout.
+compress [arguments] [<file name> ...]
+  -bc
+        enable block checksum
+  -l int
+        compression level (0=fastest)
+  -sc
+        disable stream checksum
+  -size string
+        block max size [64K,256K,1M,4M] (default "4M")
+
+Uncompress the given files or from stdin to stdout.
+uncompress [arguments] [<file name> ...]
+
+```
+
+
+## Example
+
+```
+// Compress and uncompress an input string.
+s := "hello world"
+r := strings.NewReader(s)
+
+// The pipe will uncompress the data from the writer.
+pr, pw := io.Pipe()
+zw := lz4.NewWriter(pw)
+zr := lz4.NewReader(pr)
+
+go func() {
+	// Compress the input string.
+	_, _ = io.Copy(zw, r)
+	_ = zw.Close() // Make sure the writer is closed
+	_ = pw.Close() // Terminate the pipe
+}()
+
+_, _ = io.Copy(os.Stdout, zr)
+
+// Output:
+// hello world
+```
+
+## Contributing
+
+Contributions are very welcome, whether bug fixes, performance improvements or anything else!
+
+- Open an issue with a proper description
+- Send a pull request with appropriate test case(s)
+
+## Contributors
+
+Thanks to all [contributors](https://github.com/pierrec/lz4/graphs/contributors) so far!
+
+Special thanks to [@Zariel](https://github.com/Zariel) for his asm implementation of the decoder.
+
+Special thanks to [@greatroar](https://github.com/greatroar) for his work on the asm implementations of the decoder for amd64 and arm64.
+
+Special thanks to [@klauspost](https://github.com/klauspost) for his work on optimizing the code.
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go
@ -0,0 +1,481 @@
+package lz4block
+
+import (
+	"encoding/binary"
+	"math/bits"
+	"sync"
+
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+const (
+	// The following constants are used to setup the compression algorithm.
+	minMatch   = 4  // the minimum size of the match sequence size (4 bytes)
+	winSizeLog = 16 // LZ4 64Kb window size limit
+	winSize    = 1 << winSizeLog
+	winMask    = winSize - 1 // 64Kb window of previous data for dependent blocks
+
+	// hashLog determines the size of the hash table used to quickly find a previous match position.
+	// Its value influences the compression speed and memory usage, the lower the faster,
+	// but at the expense of the compression ratio.
+	// 16 seems to be the best compromise for fast compression.
+	hashLog = 16
+	htSize  = 1 << hashLog // number of entries in the compressors' hash tables
+
+	// mfLimit is subtracted from len(src) to get the position at which the
+	// compressors stop searching for matches.
+	mfLimit = 10 + minMatch // The last match cannot start within the last 14 bytes.
+)
+
+// recoverBlock recovers from a panic and, if *e is still nil, sets *e to
+// ErrInvalidSourceShortBuffer. When there is no panic, *e is left untouched.
+//
+// It must be deferred directly (defer recoverBlock(&err)), since recover only
+// has an effect when called directly by a deferred function.
+func recoverBlock(e *error) {
+	if r := recover(); r != nil && *e == nil {
+		*e = lz4errors.ErrInvalidSourceShortBuffer
+	}
+}
+
+// blockHash hashes the lower 6 bytes into a value < htSize.
+//
+// The left shift by 16 discards the upper 2 bytes of x, the multiplication
+// mixes the remaining bits, and the final right shift keeps the top hashLog
+// bits of the product.
+func blockHash(x uint64) uint32 {
+	const prime6bytes = 227718039650203
+	return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
+}
+
+// CompressBlockBound returns n + n/255 + 16.
+//
+// The CompressBlock methods in this package only report data as
+// incompressible (0, nil) when the destination is smaller than this value for
+// len(src).
+// NOTE(review): presumably this is an upper bound on the compressed size of n
+// input bytes; confirm against the LZ4 block format.
+func CompressBlockBound(n int) int {
+	return n + n/255 + 16
+}
+
+// UncompressBlock decodes src into dst by calling decodeBlock with dict.
+//
+// An empty src yields 0, nil without calling decodeBlock. Otherwise a
+// non-negative decodeBlock result is returned as is (presumably the number of
+// bytes written to dst; decodeBlock is defined elsewhere), and a negative
+// result is reported as ErrInvalidSourceShortBuffer.
+func UncompressBlock(src, dst, dict []byte) (int, error) {
+	if len(src) == 0 {
+		return 0, nil
+	}
+	if di := decodeBlock(dst, src, dict); di >= 0 {
+		return di, nil
+	}
+	return 0, lz4errors.ErrInvalidSourceShortBuffer
+}
+
+// Compressor holds the hash table state used by its CompressBlock method.
+// The state is reset at the start of every CompressBlock call.
+type Compressor struct {
+	// Offsets are at most 64kiB, so we can store only the lower 16 bits of
+	// match positions: effectively, an offset from some 64kiB block boundary.
+	//
+	// When we retrieve such an offset, we interpret it as relative to the last
+	// block boundary si &^ 0xffff, or the one before, (si &^ 0xffff) - 0x10000,
+	// depending on which of these is inside the current window. If a table
+	// entry was generated more than 64kiB back in the input, we find out by
+	// inspecting the input stream.
+	table [htSize]uint16
+
+	// Bitmap indicating which positions in the table are in use.
+	// This allows us to quickly reset the table for reuse,
+	// without having to zero everything.
+	inUse [htSize / 32]uint32
+}
+
+// get returns the position of a presumptive match for the hash h.
+// The match may be a false positive due to a hash collision or an old entry.
+// If si < winSize, the return value may be negative.
+func (c *Compressor) get(h uint32, si int) int {
+	h &= htSize - 1
+	i := 0
+	// Only trust the table entry if its in-use bit is set; otherwise the
+	// stored offset is treated as 0.
+	if c.inUse[h/32]&(1<<(h%32)) != 0 {
+		i = int(c.table[h])
+	}
+	// Rebase the 16-bit offset onto the 64kiB boundary at or below si.
+	i += si &^ winMask
+	if i >= si {
+		// Try previous 64kiB block (negative when in first block).
+		i -= winSize
+	}
+	return i
+}
+
+// put records the low 16 bits of position si under hash h and marks that
+// table slot as in use.
+func (c *Compressor) put(h uint32, si int) {
+	h &= htSize - 1
+	c.table[h] = uint16(si)
+	c.inUse[h/32] |= 1 << (h % 32)
+}
+
+// reset clears the in-use bitmap. Since get ignores table entries whose
+// in-use bit is clear, this invalidates the table without zeroing it.
+func (c *Compressor) reset() { c.inUse = [htSize / 32]uint32{} }
+
+// compressorPool recycles Compressor values for the package-level
+// CompressBlock function.
+var compressorPool = sync.Pool{New: func() interface{} { return new(Compressor) }}
+
+// CompressBlock compresses src into dst using a Compressor borrowed from
+// compressorPool, and returns the result of its CompressBlock method.
+// The Compressor is returned to the pool before CompressBlock returns.
+func CompressBlock(src, dst []byte) (int, error) {
+	c := compressorPool.Get().(*Compressor)
+	n, err := c.CompressBlock(src, dst)
+	compressorPool.Put(c)
+	return n, err
+}
+
+// CompressBlock compresses src into dst using c's hash table and returns the
+// number of bytes written to dst.
+//
+// It returns 0, nil when the data is deemed incompressible, which can only
+// happen when len(dst) < CompressBlockBound(len(src)). It returns 0 and
+// ErrInvalidSourceShortBuffer when dst runs out of room.
+func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
+	// Zero out reused table to avoid non-deterministic output (issue #65).
+	c.reset()
+
+	// Return 0, nil only if the destination buffer size is < CompressBlockBound.
+	isNotCompressible := len(dst) < CompressBlockBound(len(src))
+
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
+
+	// si: Current position of the search.
+	// di: Current write position in dst.
+	// anchor: Position of the current literals.
+	var si, di, anchor int
+	// sn is the position at which the match search stops.
+	sn := len(src) - mfLimit
+	if sn <= 0 {
+		// Input too short to search for matches: emit it all as literals.
+		goto lastLiterals
+	}
+
+	// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
+	for si < sn {
+		// Hash the next 6 bytes (sequence)...
+		match := binary.LittleEndian.Uint64(src[si:])
+		h := blockHash(match)
+		h2 := blockHash(match >> 8)
+
+		// We check a match at s, s+1 and s+2 and pick the first one we get.
+		// Checking 3 only requires us to load the source once.
+		ref := c.get(h, si)
+		ref2 := c.get(h2, si+1)
+		c.put(h, si)
+		c.put(h2, si+1)
+
+		offset := si - ref
+
+		if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) {
+			// No match. Start calculating another hash.
+			// The processor can usually do this out-of-order.
+			h = blockHash(match >> 16)
+			ref3 := c.get(h, si+2)
+
+			// Check the second match at si+1
+			si += 1
+			offset = si - ref2
+
+			if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
+				// No match. Check the third match at si+2
+				si += 1
+				offset = si - ref3
+				c.put(h, si)
+
+				if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) {
+					// Skip one extra byte (at si+3) before we check 3 matches again.
+					si += 2 + (si-anchor)>>adaptSkipLog
+					continue
+				}
+			}
+		}
+
+		// Match found.
+		lLen := si - anchor // Literal length.
+		// We already matched 4 bytes.
+		mLen := 4
+
+		// Extend backwards if we can, reducing literals.
+		tOff := si - offset - 1
+		for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
+			si--
+			tOff--
+			lLen--
+			mLen++
+		}
+
+		// Add the match length, so we continue search at the end.
+		// Use mLen to store the offset base.
+		si, mLen = si+mLen, si+minMatch
+
+		// Find the longest match by looking by batches of 8 bytes.
+		for si+8 <= sn {
+			x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
+			if x == 0 {
+				si += 8
+			} else {
+				// Stop is first non-zero byte.
+				si += bits.TrailingZeros64(x) >> 3
+				break
+			}
+		}
+
+		// mLen now becomes the match length minus minMatch.
+		mLen = si - mLen
+		if di >= len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		// Token byte: low nibble holds the match length (capped at 0xF).
+		if mLen < 0xF {
+			dst[di] = byte(mLen)
+		} else {
+			dst[di] = 0xF
+		}
+
+		// Encode literals length.
+		if lLen < 0xF {
+			dst[di] |= byte(lLen << 4)
+		} else {
+			dst[di] |= 0xF0
+			di++
+			l := lLen - 0xF
+			for ; l >= 0xFF && di < len(dst); l -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			if di >= len(dst) {
+				return 0, lz4errors.ErrInvalidSourceShortBuffer
+			}
+			dst[di] = byte(l)
+		}
+		di++
+
+		// Literals.
+		if di+lLen > len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		copy(dst[di:di+lLen], src[anchor:anchor+lLen])
+		di += lLen + 2 // Also reserves the 2 offset bytes written below.
+		anchor = si
+
+		// Encode offset.
+		if di > len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
+
+		// Encode match length part 2.
+		if mLen >= 0xF {
+			for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			if di >= len(dst) {
+				return 0, lz4errors.ErrInvalidSourceShortBuffer
+			}
+			dst[di] = byte(mLen)
+			di++
+		}
+		// Check if we can load next values.
+		if si >= sn {
+			break
+		}
+		// Hash match end-2
+		h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
+		c.put(h, si-2)
+	}
+
+lastLiterals:
+	// anchor == 0 means no match was ever emitted.
+	if isNotCompressible && anchor == 0 {
+		// Incompressible.
+		return 0, nil
+	}
+
+	// Last literals.
+	if di >= len(dst) {
+		return 0, lz4errors.ErrInvalidSourceShortBuffer
+	}
+	lLen := len(src) - anchor
+	if lLen < 0xF {
+		dst[di] = byte(lLen << 4)
+	} else {
+		dst[di] = 0xF0
+		di++
+		for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF {
+			dst[di] = 0xFF
+			di++
+		}
+		if di >= len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		dst[di] = byte(lLen)
+	}
+	di++
+
+	// Write the last literals.
+	// di >= anchor means the output so far is already at least as long as the
+	// input consumed.
+	if isNotCompressible && di >= anchor {
+		// Incompressible.
+		return 0, nil
+	}
+	if di+len(src)-anchor > len(dst) {
+		return 0, lz4errors.ErrInvalidSourceShortBuffer
+	}
+	di += copy(dst[di:di+len(src)-anchor], src[anchor:])
+	return di, nil
+}
+
+// blockHashHC hashes 4 bytes into a value < winSize.
+//
+// It keeps the top winSizeLog bits of the 32-bit product x * hasher.
+func blockHashHC(x uint32) uint32 {
+	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
+	return x * hasher >> (32 - winSizeLog) // * and >> share precedence: (x * hasher) >> 16.
+}
+
+// CompressorHC holds the hash and chain tables used by its CompressBlock
+// method, which follows chains of earlier positions to find longer matches.
+type CompressorHC struct {
+	// hashTable: stores the last position found for a given hash
+	// chainTable: stores previous positions for a given hash
+	hashTable, chainTable [htSize]int
+	// needsReset is false only until the first CompressBlock call; after
+	// that, both tables are zeroed at the start of every call.
+	needsReset            bool
+}
+
+// compressorHCPool recycles CompressorHC values for CompressBlockHC.
+var compressorHCPool = sync.Pool{New: func() interface{} { return new(CompressorHC) }}
+
+// CompressBlockHC compresses src into dst using a CompressorHC borrowed from
+// compressorHCPool, and returns the result of its CompressBlock method.
+// depth limits how many chained candidates are tried per position; a depth
+// of 0 is treated as winSize by CompressorHC.CompressBlock.
+func CompressBlockHC(src, dst []byte, depth CompressionLevel) (int, error) {
+	c := compressorHCPool.Get().(*CompressorHC)
+	n, err := c.CompressBlock(src, dst, depth)
+	compressorHCPool.Put(c)
+	return n, err
+}
+
+// CompressBlock compresses src into dst, trying up to depth chained
+// candidates per position, and returns the number of bytes written to dst.
+//
+// It returns 0, nil when the data is deemed incompressible, which can only
+// happen when len(dst) < CompressBlockBound(len(src)). This method does not
+// bounds-check its writes to dst explicitly: any panic raised while
+// compressing is converted by the deferred recoverBlock into
+// ErrInvalidSourceShortBuffer.
+func (c *CompressorHC) CompressBlock(src, dst []byte, depth CompressionLevel) (_ int, err error) {
+	if c.needsReset {
+		// Zero out reused table to avoid non-deterministic output (issue #65).
+		c.hashTable = [htSize]int{}
+		c.chainTable = [htSize]int{}
+	}
+	c.needsReset = true // Only false on first call.
+
+	defer recoverBlock(&err)
+
+	// Return 0, nil only if the destination buffer size is < CompressBlockBound.
+	isNotCompressible := len(dst) < CompressBlockBound(len(src))
+
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
+
+	// si: current search position in src; di: current write position in dst;
+	// anchor: start of the literals not yet emitted.
+	var si, di, anchor int
+	// sn is the position at which the match search stops.
+	sn := len(src) - mfLimit
+	if sn <= 0 {
+		// Input too short to search for matches: emit it all as literals.
+		goto lastLiterals
+	}
+
+	// A depth of 0 means "try up to winSize candidates".
+	if depth == 0 {
+		depth = winSize
+	}
+
+	for si < sn {
+		// Hash the next 4 bytes (sequence).
+		match := binary.LittleEndian.Uint32(src[si:])
+		h := blockHashHC(match)
+
+		// Follow the chain until out of window and give the longest match.
+		mLen := 0
+		offset := 0
+		for next, try := c.hashTable[h], depth; try > 0 && next > 0 && si-next < winSize; next, try = c.chainTable[next&winMask], try-1 {
+			// The first (mLen==0) or next byte (mLen>=minMatch) at current match length
+			// must match to improve on the match length.
+			if src[next+mLen] != src[si+mLen] {
+				continue
+			}
+			ml := 0
+			// Compare the current position with a previous with the same hash.
+			for ml < sn-si {
+				x := binary.LittleEndian.Uint64(src[next+ml:]) ^ binary.LittleEndian.Uint64(src[si+ml:])
+				if x == 0 {
+					ml += 8
+				} else {
+					// Stop is first non-zero byte.
+					ml += bits.TrailingZeros64(x) >> 3
+					break
+				}
+			}
+			if ml < minMatch || ml <= mLen {
+				// Match too small (<minMatch) or smaller than the current match.
+				continue
+			}
+			// Found a longer match, keep its position and length.
+			mLen = ml
+			offset = si - next
+			// Try another previous position with the same hash.
+		}
+		// Link si into the chain for h and make it the newest entry.
+		c.chainTable[si&winMask] = c.hashTable[h]
+		c.hashTable[h] = si
+
+		// No match found.
+		if mLen == 0 {
+			si += 1 + (si-anchor)>>adaptSkipLog
+			continue
+		}
+
+		// Match found.
+		// Update hash/chain tables with overlapping bytes:
+		// si already hashed, add everything from si+1 up to the match length.
+		winStart := si + 1
+		if ws := si + mLen - winSize; ws > winStart {
+			winStart = ws
+		}
+		// si and ml below are loop-local (declared with :=); the outer si is
+		// not modified by this loop.
+		for si, ml := winStart, si+mLen; si < ml; {
+			match >>= 8
+			match |= uint32(src[si+3]) << 24
+			h := blockHashHC(match)
+			c.chainTable[si&winMask] = c.hashTable[h]
+			c.hashTable[h] = si
+			si++
+		}
+
+		lLen := si - anchor
+		si += mLen
+		mLen -= minMatch // Match length does not include minMatch.
+
+		// Token byte: low nibble holds the match length (capped at 0xF).
+		if mLen < 0xF {
+			dst[di] = byte(mLen)
+		} else {
+			dst[di] = 0xF
+		}
+
+		// Encode literals length.
+		if lLen < 0xF {
+			dst[di] |= byte(lLen << 4)
+		} else {
+			dst[di] |= 0xF0
+			di++
+			l := lLen - 0xF
+			for ; l >= 0xFF; l -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(l)
+		}
+		di++
+
+		// Literals.
+		copy(dst[di:di+lLen], src[anchor:anchor+lLen])
+		di += lLen
+		anchor = si
+
+		// Encode offset.
+		di += 2
+		dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
+
+		// Encode match length part 2.
+		if mLen >= 0xF {
+			for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(mLen)
+			di++
+		}
+	}
+
+	// anchor == 0 means no match was ever emitted. This check is skipped when
+	// the short-input goto above jumps straight to lastLiterals.
+	if isNotCompressible && anchor == 0 {
+		// Incompressible.
+		return 0, nil
+	}
+
+	// Last literals.
+lastLiterals:
+	lLen := len(src) - anchor
+	if lLen < 0xF {
+		dst[di] = byte(lLen << 4)
+	} else {
+		dst[di] = 0xF0
+		di++
+		lLen -= 0xF
+		for ; lLen >= 0xFF; lLen -= 0xFF {
+			dst[di] = 0xFF
+			di++
+		}
+		dst[di] = byte(lLen)
+	}
+	di++
+
+	// Write the last literals.
+	// di >= anchor means the output so far is already at least as long as the
+	// input consumed.
+	if isNotCompressible && di >= anchor {
+		// Incompressible.
+		return 0, nil
+	}
+	di += copy(dst[di:di+len(src)-anchor], src[anchor:])
+	return di, nil
+}
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go
@ -0,0 +1,90 @@
+// Package lz4block provides LZ4 BlockSize types and pools of buffers.
+package lz4block
+
+import "sync"
+
+// Block sizes in bytes. The shift grows by two bits per constant, so each
+// size is four times the previous one: 64 KiB, 256 KiB, 1 MiB and 4 MiB.
+const (
+	Block64Kb uint32 = 1 << (16 + iota*2)
+	Block256Kb
+	Block1Mb
+	Block4Mb
+)
+
+// Block8Mb is the block buffer size used in legacy mode.
+// In legacy mode all blocks are compressed regardless
+// of the compressed size: use the bound size, i.e. whatever
+// CompressBlockBound reports for an 8 MiB input.
+var Block8Mb = uint32(CompressBlockBound(8 << 20))
+
+// Buffer pools, one per block size. Each pool's New function allocates a
+// byte slice whose length equals the corresponding block size.
+var (
+	BlockPool64K  = sync.Pool{New: func() interface{} { return make([]byte, Block64Kb) }}
+	BlockPool256K = sync.Pool{New: func() interface{} { return make([]byte, Block256Kb) }}
+	BlockPool1M   = sync.Pool{New: func() interface{} { return make([]byte, Block1Mb) }}
+	BlockPool4M   = sync.Pool{New: func() interface{} { return make([]byte, Block4Mb) }}
+	BlockPool8M   = sync.Pool{New: func() interface{} { return make([]byte, Block8Mb) }}
+)
+
+// Index maps the block size b (in bytes) to its BlockSizeIndex:
+// 4, 5, 6 and 7 for Block64Kb, Block256Kb, Block1Mb and Block4Mb,
+// and 3 for the legacy Block8Mb. Any other size yields 0.
+func Index(b uint32) BlockSizeIndex {
+	switch b {
+	case Block64Kb:
+		return 4
+	case Block256Kb:
+		return 5
+	case Block1Mb:
+		return 6
+	case Block4Mb:
+		return 7
+	case Block8Mb: // only valid in legacy mode
+		return 3
+	}
+	return 0
+}
+
+// IsValid reports whether b is a block size known to Index, i.e. Index(b)
+// is non-zero. Unlike BlockSizeIndex.IsValid, this accepts Block8Mb.
+func IsValid(b uint32) bool {
+	return Index(b) > 0
+}
+
+// BlockSizeIndex is the small integer code identifying a block size
+// (see Index for the mapping from sizes to codes).
+type BlockSizeIndex uint8
+
+// IsValid reports whether b is one of the codes 4 to 7.
+// The legacy code 3 is not considered valid here.
+func (b BlockSizeIndex) IsValid() bool {
+	switch b {
+	case 4, 5, 6, 7:
+		return true
+	}
+	return false
+}
+
+// Get fetches a buffer from the pool matching b (codes 3 to 7).
+//
+// For any other code no pool is selected, buf stays nil and the type
+// assertion below panics, so callers must only use codes produced by Index
+// for a known block size.
+func (b BlockSizeIndex) Get() []byte {
+	var buf interface{}
+	switch b {
+	case 4:
+		buf = BlockPool64K.Get()
+	case 5:
+		buf = BlockPool256K.Get()
+	case 6:
+		buf = BlockPool1M.Get()
+	case 7:
+		buf = BlockPool4M.Get()
+	case 3:
+		buf = BlockPool8M.Get()
+	}
+	return buf.([]byte)
+}
+
+// Put returns buf to the pool whose block size equals cap(buf).
+// The slice is re-extended to its full capacity before being pooled.
+// Buffers whose capacity matches no block size are silently dropped.
+func Put(buf []byte) {
+	// Safeguard: do not allow invalid buffers.
+	switch c := cap(buf); uint32(c) {
+	case Block64Kb:
+		BlockPool64K.Put(buf[:c])
+	case Block256Kb:
+		BlockPool256K.Put(buf[:c])
+	case Block1Mb:
+		BlockPool1M.Put(buf[:c])
+	case Block4Mb:
+		BlockPool4M.Put(buf[:c])
+	case Block8Mb:
+		BlockPool8M.Put(buf[:c])
+	}
+}
+
+// CompressionLevel identifies a compression level.
+type CompressionLevel uint32
+
+// Fast is the zero CompressionLevel.
+const Fast CompressionLevel = 0
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s
@ -0,0 +1,448 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// AX scratch
+// BX scratch
+// CX literal and match lengths
+// DX token, match offset
+//
+// DI &dst
+// SI &src
+// R8 &dst + len(dst)
+// R9 &src + len(src)
+// R11 &dst
+// R12 short output end
+// R13 short input end
+// R14 &dict
+// R15 len(dict)
+
+// decodeBlock decodes the LZ4 block in src into dst, resolving matches that
+// start before dst from the end of dict. It returns the number of bytes
+// written to dst, or a negative code: -1 (corrupt input, also used for an
+// empty src), -2 (src or dst too short) or -3 (dict too short).
+// The 48-byte frame holds the memmove arguments (0..23(SP)) and the
+// registers spilled across the call (24..47(SP)).
+//
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOSPLIT, $48-80
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, R11
+	MOVQ dst_len+8(FP), R8
+	ADDQ DI, R8
+
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R9
+	CMPQ R9, $0
+	JE   err_corrupt
+	ADDQ SI, R9
+
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+
+	// shortcut ends
+	// short output end
+	MOVQ R8, R12
+	SUBQ $32, R12
+	// short input end
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+	XORL CX, CX
+
+loop:
+	// token := uint32(src[si])
+	MOVBLZX (SI), DX
+	INCQ SI
+
+	// lit_len = token >> 4
+	// if lit_len > 0
+	// CX = lit_len
+	MOVL DX, CX
+	SHRL $4, CX
+
+	// if lit_len != 0xF
+	CMPL CX, $0xF
+	JEQ  lit_len_loop
+	CMPQ DI, R12
+	JAE  copy_literal
+	CMPQ SI, R13
+	JAE  copy_literal
+
+	// copy shortcut
+
+	// A two-stage shortcut for the most common case:
+	// 1) If the literal length is 0..14, and there is enough space,
+	// enter the shortcut and copy 16 bytes on behalf of the literals
+	// (in the fast mode, only 8 bytes can be safely copied this way).
+	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
+	// manner; but we ensure that there's enough space in the output for
+	// those 18 bytes earlier, upon entering the shortcut (in other words,
+	// there is a combined check for both stages).
+
+	// copy literal
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+	ADDQ CX, DI
+	ADDQ CX, SI
+
+	MOVL DX, CX
+	ANDL $0xF, CX
+
+	// The second stage: prepare for match copying, decode full info.
+	// If it doesn't work out, the info won't be wasted.
+	// offset := uint16(data[:2])
+	MOVWLZX (SI), DX
+	TESTL DX, DX
+	JE err_corrupt
+	ADDQ $2, SI
+	JC err_short_buf
+
+	// AX = match start = di - offset
+	MOVQ DI, AX
+	SUBQ DX, AX
+	JC err_corrupt
+	CMPQ AX, DI
+	JA err_short_buf
+
+	// if we can't do the second stage then jump straight to read the
+	// match length, we already have the offset.
+	CMPL CX, $0xF
+	JEQ match_len_loop_pre
+	CMPL DX, $8
+	JLT match_len_loop_pre
+	CMPQ AX, R11
+	JB match_len_loop_pre
+
+	// memcpy(op + 0, match + 0, 8);
+	MOVQ (AX), BX
+	MOVQ BX, (DI)
+	// memcpy(op + 8, match + 8, 8);
+	MOVQ 8(AX), BX
+	MOVQ BX, 8(DI)
+	// memcpy(op +16, match +16, 2);
+	MOVW 16(AX), BX
+	MOVW BX, 16(DI)
+
+	LEAQ const_minMatch(DI)(CX*1), DI
+
+	// shortcut complete, load next token
+	JMP loopcheck
+
+	// Read the rest of the literal length:
+	// do { BX = src[si++]; lit_len += BX } while (BX == 0xFF).
+lit_len_loop:
+	CMPQ SI, R9
+	JAE err_short_buf
+
+	MOVBLZX (SI), BX
+	INCQ SI
+	ADDQ BX, CX
+
+	CMPB BX, $0xFF
+	JE lit_len_loop
+
+copy_literal:
+	// bounds check src and dst
+	MOVQ SI, AX
+	ADDQ CX, AX
+	JC err_short_buf
+	CMPQ AX, R9
+	JA err_short_buf
+
+	MOVQ DI, BX
+	ADDQ CX, BX
+	JC err_short_buf
+	CMPQ BX, R8
+	JA err_short_buf
+
+	// Copy literals of <=48 bytes through the XMM registers.
+	CMPQ CX, $48
+	JGT memmove_lit
+
+	// if len(dst[di:]) < 48
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $48
+	JLT memmove_lit
+
+	// if len(src[si:]) < 48
+	MOVQ R9, BX
+	SUBQ SI, BX
+	CMPQ BX, $48
+	JLT memmove_lit
+
+	MOVOU (SI), X0
+	MOVOU 16(SI), X1
+	MOVOU 32(SI), X2
+	MOVOU X0, (DI)
+	MOVOU X1, 16(DI)
+	MOVOU X2, 32(DI)
+
+	ADDQ CX, SI
+	ADDQ CX, DI
+
+	JMP finish_lit_copy
+
+memmove_lit:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+
+	// Spill registers. Increment SI, DI now so we don't need to save CX.
+	ADDQ CX, DI
+	ADDQ CX, SI
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVL DX, 40(SP)
+
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVL 40(SP), DX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+finish_lit_copy:
+	// CX := mLen
+	// free up DX to use for offset
+	MOVL DX, CX
+	ANDL $0xF, CX
+
+	// The block may end right after the literals.
+	CMPQ SI, R9
+	JAE end
+
+	// offset
+	// si += 2
+	// DX := int(src[si-2]) | int(src[si-1])<<8
+	ADDQ $2, SI
+	JC err_short_buf
+	CMPQ SI, R9
+	JA err_short_buf
+	MOVWQZX -2(SI), DX
+
+	// 0 offset is invalid
+	TESTL DX, DX
+	JEQ   err_corrupt
+
+match_len_loop_pre:
+	// if mlen != 0xF
+	CMPB CX, $0xF
+	JNE copy_match
+
+	// do { BX = src[si++]; mlen += BX } while (BX == 0xFF).
+match_len_loop:
+	CMPQ SI, R9
+	JAE err_short_buf
+
+	MOVBLZX (SI), BX
+	INCQ SI
+	ADDQ BX, CX
+
+	CMPB BX, $0xFF
+	JE match_len_loop
+
+copy_match:
+	ADDQ $const_minMatch, CX
+
+	// check we have match_len bytes left in dst
+	// di+match_len <= len(dst)
+	MOVQ DI, AX
+	ADDQ CX, AX
+	JC err_short_buf
+	CMPQ AX, R8
+	JA err_short_buf
+
+	// DX = offset
+	// CX = match_len
+	// BX = &dst + (di - offset)
+	MOVQ DI, BX
+	SUBQ DX, BX
+
+	// check BX is within dst
+	// if BX <= &dst, take the dictionary path
+	JC copy_match_from_dict
+	CMPQ BX, R11
+	JBE copy_match_from_dict
+
+	// if (di - offset) + match_len < di, source and destination do not overlap
+	LEAQ (BX)(CX*1), AX
+	CMPQ DI, AX
+	JA copy_interior_match
+
+	// AX := len(dst[:di])
+	// MOVQ DI, AX
+	// SUBQ R11, AX
+
+	// copy 16 bytes at a time
+	// if di-offset < 16 copy 16-(di-offset) bytes to di
+	// then do the remaining
+
+copy_match_loop:
+	// do {
+	// dst[di] = dst[i]
+	// di++
+	// i++
+	// } while (--match_len != 0)
+	MOVB (BX), AX
+	MOVB AX, (DI)
+	INCQ DI
+	INCQ BX
+	DECQ CX
+	JNZ copy_match_loop
+
+	JMP loopcheck
+
+copy_interior_match:
+	CMPQ CX, $16
+	JGT memmove_match
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_match
+
+	MOVOU (BX), X0
+	MOVOU X0, (DI)
+
+	ADDQ CX, DI
+	XORL CX, CX
+	JMP  loopcheck
+
+copy_match_from_dict:
+	// CX = match_len
+	// BX = &dst + (di - offset)
+
+	// AX = offset - di = dict_bytes_available => count of bytes potentially covered by the dictionary
+	MOVQ R11, AX
+	SUBQ BX, AX
+
+	// BX = len(dict) - dict_bytes_available
+	MOVQ R15, BX
+	SUBQ AX, BX
+	JS err_short_dict
+
+	ADDQ R14, BX
+
+	// if match_len < dict_bytes_available, match fits entirely within external dictionary : just copy
+	CMPQ CX, AX
+	JLT memmove_match
+
+	// The match stretches over the dictionary and our block
+	// 1) copy what comes from the dictionary
+	// AX = dict_bytes_available = copy_size
+	// BX = &dict_end - copy_size
+	// CX = match_len
+
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ AX, 16(SP)
+	// store extra stuff we want to recover
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 16(SP), AX // copy_size
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX // match_len
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+	// di+=copy_size
+	ADDQ AX, DI
+
+	// 2) copy the rest from the current block
+	// CX = match_len - copy_size = rest_size
+	SUBQ AX, CX
+	MOVQ R11, BX
+
+	// check if we have a copy overlap
+	// AX = &dst + rest_size
+	MOVQ CX, AX
+	ADDQ BX, AX
+	// if &dst + rest_size > di, copy byte by byte
+	CMPQ AX, DI
+
+	JA copy_match_loop
+
+memmove_match:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ CX, 16(SP)
+
+	// Spill registers. Increment DI now so we don't need to save CX.
+	ADDQ CX, DI
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+	MOVQ dict_base+48(FP), R14
+	MOVQ dict_len+56(FP), R15
+	XORL CX, CX
+
+loopcheck:
+	// for si < len(src)
+	CMPQ SI, R9
+	JB   loop
+
+end:
+	// Remaining length must be zero.
+	TESTQ CX, CX
+	JNE   err_corrupt
+
+	// Return the number of bytes written: di - &dst.
+	SUBQ R11, DI
+	MOVQ DI, ret+72(FP)
+	RET
+
+err_corrupt:
+	MOVQ $-1, ret+72(FP)
+	RET
+
+err_short_buf:
+	MOVQ $-2, ret+72(FP)
+	RET
+
+err_short_dict:
+	MOVQ $-3, ret+72(FP)
+	RET
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s
@ -0,0 +1,231 @@
+// +build gc
+// +build !noasm
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Register allocation.
+#define dst	R0
+#define dstorig	R1
+#define src	R2
+#define dstend	R3
+#define srcend	R4
+#define match	R5	// Match address.
+#define dictend	R6
+#define token	R7
+#define len	R8	// Literal and match lengths.
+#define offset	R7	// Match offset; overlaps with token.
+#define tmp1	R9
+#define tmp2	R11
+#define tmp3	R12
+
+// decodeBlock decodes the LZ4 block in src into dst, resolving matches that
+// start before dst from the end of dict. It returns the number of bytes
+// written to dst, or -1 on any error (all error labels share one exit).
+//
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $-4-40
+	MOVW dst_base  +0(FP), dst
+	MOVW dst_len   +4(FP), dstend
+	MOVW src_base +12(FP), src
+	MOVW src_len  +16(FP), srcend
+
+	// An empty src is an error.
+	CMP $0, srcend
+	BEQ shortSrc
+
+	ADD dst, dstend
+	ADD src, srcend
+
+	MOVW dst, dstorig
+
+loop:
+	// Read token. Extract literal length.
+	MOVBU.P 1(src), token
+	MOVW    token >> 4, len
+	CMP     $15, len
+	BNE     readLitlenDone
+
+readLitlenLoop:
+	CMP     src, srcend
+	BEQ     shortSrc
+	MOVBU.P 1(src), tmp1
+	ADD.S   tmp1, len
+	BVS     shortDst
+	CMP     $255, tmp1
+	BEQ     readLitlenLoop
+
+readLitlenDone:
+	CMP $0, len
+	BEQ copyLiteralDone
+
+	// Bounds check dst+len and src+len.
+	// The src comparison only runs when the dst comparison passed (LS);
+	// otherwise its HI result is kept and the BHI below catches it.
+	ADD.S    dst, len, tmp1
+	ADD.CC.S src, len, tmp2
+	BCS      shortSrc
+	CMP      dstend, tmp1
+	//BHI    shortDst // Uncomment for distinct error codes.
+	CMP.LS   srcend, tmp2
+	BHI      shortSrc
+
+	// Copy literal.
+	CMP $4, len
+	BLO copyLiteralFinish
+
+	// Copy 0-3 bytes until src is aligned.
+	TST        $1, src
+	MOVBU.NE.P 1(src), tmp1
+	MOVB.NE.P  tmp1, 1(dst)
+	SUB.NE     $1, len
+
+	TST        $2, src
+	MOVHU.NE.P 2(src), tmp2
+	MOVB.NE.P  tmp2, 1(dst)
+	MOVW.NE    tmp2 >> 8, tmp1
+	MOVB.NE.P  tmp1, 1(dst)
+	SUB.NE     $2, len
+
+	B copyLiteralLoopCond
+
+copyLiteralLoop:
+	// Aligned load, unaligned write.
+	MOVW.P 4(src), tmp1
+	MOVW   tmp1 >>  8, tmp2
+	MOVB   tmp2, 1(dst)
+	MOVW   tmp1 >> 16, tmp3
+	MOVB   tmp3, 2(dst)
+	MOVW   tmp1 >> 24, tmp2
+	MOVB   tmp2, 3(dst)
+	MOVB.P tmp1, 4(dst)
+copyLiteralLoopCond:
+	// Loop until len-4 < 0.
+	SUB.S  $4, len
+	BPL    copyLiteralLoop
+
+copyLiteralFinish:
+	// Copy remaining 0-3 bytes.
+	// At this point, len may be < 0, but len&3 is still accurate.
+	TST       $1, len
+	MOVB.NE.P 1(src), tmp3
+	MOVB.NE.P tmp3, 1(dst)
+	TST       $2, len
+	MOVB.NE.P 2(src), tmp1
+	MOVB.NE.P tmp1, 2(dst)
+	MOVB.NE   -1(src), tmp2
+	MOVB.NE   tmp2, -1(dst)
+
+copyLiteralDone:
+	// Initial part of match length.
+	// This frees up the token register for reuse as offset.
+	AND $15, token, len
+
+	// The block may end right after the literals.
+	CMP src, srcend
+	BEQ end
+
+	// Read offset.
+	ADD.S $2, src
+	BCS   shortSrc
+	CMP   srcend, src
+	BHI   shortSrc
+	MOVBU -2(src), offset
+	MOVBU -1(src), tmp1
+	ORR.S tmp1 << 8, offset
+	BEQ   corrupt
+
+	// Read rest of match length.
+	CMP $15, len
+	BNE readMatchlenDone
+
+readMatchlenLoop:
+	CMP     src, srcend
+	BEQ     shortSrc
+	MOVBU.P 1(src), tmp1
+	ADD.S   tmp1, len
+	BVS     shortDst
+	CMP     $255, tmp1
+	BEQ     readMatchlenLoop
+
+readMatchlenDone:
+	// Bounds check dst+len+minMatch.
+	ADD.S    dst, len, tmp1
+	ADD.CC.S $const_minMatch, tmp1
+	BCS      shortDst
+	CMP      dstend, tmp1
+	BHI      shortDst
+
+	RSB dst, offset, match
+	CMP dstorig, match
+	BGE copyMatch4
+
+	// match < dstorig means the match starts in the dictionary,
+	// at len(dict) - offset + (dst - dstorig).
+	MOVW dict_base+24(FP), match
+	MOVW dict_len +28(FP), dictend
+
+	ADD $const_minMatch, len
+
+	RSB   dst, dstorig, tmp1
+	RSB   dictend, offset, tmp2
+	ADD.S tmp2, tmp1
+	BMI   shortDict
+	ADD   match, dictend
+	ADD   tmp1, match
+
+copyDict:
+	MOVBU.P 1(match), tmp1
+	MOVB.P  tmp1, 1(dst)
+	SUB.S   $1, len
+	CMP.NE  match, dictend
+	BNE     copyDict
+
+	// If the match extends beyond the dictionary, the rest is at dstorig.
+	CMP  $0, len
+	BEQ  copyMatchDone
+	MOVW dstorig, match
+	B    copyMatch
+
+	// Copy a regular match.
+	// Since len+minMatch is at least four, we can do a 4× unrolled
+	// byte copy loop. Using MOVW instead of four byte loads is faster,
+	// but to remain portable we'd have to align match first, which is
+	// too expensive. By alternating loads and stores, we also handle
+	// the case offset < 4.
+copyMatch4:
+	SUB.S   $4, len
+	MOVBU.P 4(match), tmp1
+	MOVB.P  tmp1, 4(dst)
+	MOVBU   -3(match), tmp2
+	MOVB    tmp2, -3(dst)
+	MOVBU   -2(match), tmp3
+	MOVB    tmp3, -2(dst)
+	MOVBU   -1(match), tmp1
+	MOVB    tmp1, -1(dst)
+	BPL     copyMatch4
+
+	// Restore len, which is now negative.
+	ADD.S $4, len
+	BEQ   copyMatchDone
+
+copyMatch:
+	// Finish with a byte-at-a-time copy.
+	SUB.S   $1, len
+	MOVBU.P 1(match), tmp2
+	MOVB.P  tmp2, 1(dst)
+	BNE     copyMatch
+
+copyMatchDone:
+	CMP src, srcend
+	BNE loop
+
+end:
+	// A non-zero pending match length at the end of src is an error.
+	CMP  $0, len
+	BNE  corrupt
+	SUB  dstorig, dst, tmp1
+	MOVW tmp1, ret+36(FP)
+	RET
+
+	// The error cases have distinct labels so we can put different
+	// return codes here when debugging, or if the error returns need to
+	// be changed.
+shortDict:
+shortDst:
+shortSrc:
+corrupt:
+	MOVW $-1, tmp1
+	MOVW tmp1, ret+36(FP)
+	RET
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm64.s
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm64.s
@ -0,0 +1,241 @@
+// +build gc
+// +build !noasm
+
+// This implementation assumes that strict alignment checking is turned off.
+// The Go compiler makes the same assumption.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Register allocation.
+#define dst		R0
+#define dstorig		R1
+#define src		R2
+#define dstend		R3
+#define dstend16	R4	// dstend - 16
+#define srcend		R5
+#define srcend16	R6	// srcend - 16
+#define match		R7	// Match address.
+#define dict		R8
+#define dictlen		R9
+#define dictend		R10
+#define token		R11
+#define len		R12	// Literal and match lengths.
+#define lenRem		R13
+#define offset		R14	// Match offset.
+#define tmp1		R15
+#define tmp2		R16
+#define tmp3		R17
+#define tmp4		R19
+
+// decodeBlock decodes the LZ4 block in src into dst, resolving matches that
+// start before dst from the end of dict. It returns the number of bytes
+// written to dst, or -1 on any error (all error labels share one exit).
+//
+// func decodeBlock(dst, src, dict []byte) int
+TEXT ·decodeBlock(SB), NOFRAME+NOSPLIT, $0-80
+	LDP  dst_base+0(FP), (dst, dstend)
+	ADD  dst, dstend
+	MOVD dst, dstorig
+
+	// An empty src is an error.
+	LDP src_base+24(FP), (src, srcend)
+	CBZ srcend, shortSrc
+	ADD src, srcend
+
+	// dstend16 = max(dstend-16, 0) and similarly for srcend16.
+	SUBS $16, dstend, dstend16
+	CSEL LO, ZR, dstend16, dstend16
+	SUBS $16, srcend, srcend16
+	CSEL LO, ZR, srcend16, srcend16
+
+	LDP dict_base+48(FP), (dict, dictlen)
+	ADD dict, dictlen, dictend
+
+loop:
+	// Read token. Extract literal length.
+	MOVBU.P 1(src), token
+	LSR     $4, token, len
+	CMP     $15, len
+	BNE     readLitlenDone
+
+readLitlenLoop:
+	CMP     src, srcend
+	BEQ     shortSrc
+	MOVBU.P 1(src), tmp1
+	ADDS    tmp1, len
+	BVS     shortDst
+	CMP     $255, tmp1
+	BEQ     readLitlenLoop
+
+readLitlenDone:
+	CBZ len, copyLiteralDone
+
+	// Bounds check dst+len and src+len.
+	ADDS dst, len, tmp1
+	BCS  shortSrc
+	ADDS src, len, tmp2
+	BCS  shortSrc
+	CMP  dstend, tmp1
+	BHI  shortDst
+	CMP  srcend, tmp2
+	BHI  shortSrc
+
+	// Copy literal.
+	SUBS $16, len
+	BLO  copyLiteralShort
+
+copyLiteralLoop:
+	LDP.P 16(src), (tmp1, tmp2)
+	STP.P (tmp1, tmp2), 16(dst)
+	SUBS  $16, len
+	BPL   copyLiteralLoop
+
+	// Copy (final part of) literal of length 0-15.
+	// If we have >=16 bytes left in src and dst, just copy 16 bytes.
+copyLiteralShort:
+	CMP  dstend16, dst
+	CCMP LO, src, srcend16, $0b0010 // 0010 = preserve carry (LO).
+	BHS  copyLiteralShortEnd
+
+	AND $15, len
+
+	LDP (src), (tmp1, tmp2)
+	ADD len, src
+	STP (tmp1, tmp2), (dst)
+	ADD len, dst
+
+	B copyLiteralDone
+
+	// Safe but slow copy near the end of src, dst.
+	// Each TBZ skips the following load/store pair when the tested bit of
+	// len is clear, so 8, 4, 2 and 1 bytes are copied according to len&15.
+copyLiteralShortEnd:
+	TBZ     $3, len, 3(PC)
+	MOVD.P  8(src), tmp1
+	MOVD.P  tmp1, 8(dst)
+	TBZ     $2, len, 3(PC)
+	MOVW.P  4(src), tmp2
+	MOVW.P  tmp2, 4(dst)
+	TBZ     $1, len, 3(PC)
+	MOVH.P  2(src), tmp3
+	MOVH.P  tmp3, 2(dst)
+	TBZ     $0, len, 3(PC)
+	MOVBU.P 1(src), tmp4
+	MOVB.P  tmp4, 1(dst)
+
+copyLiteralDone:
+	// Initial part of match length.
+	AND $15, token, len
+
+	// The block may end right after the literals.
+	CMP src, srcend
+	BEQ end
+
+	// Read offset.
+	ADDS  $2, src
+	BCS   shortSrc
+	CMP   srcend, src
+	BHI   shortSrc
+	MOVHU -2(src), offset
+	CBZ   offset, corrupt
+
+	// Read rest of match length.
+	CMP $15, len
+	BNE readMatchlenDone
+
+readMatchlenLoop:
+	CMP     src, srcend
+	BEQ     shortSrc
+	MOVBU.P 1(src), tmp1
+	ADDS    tmp1, len
+	BVS     shortDst
+	CMP     $255, tmp1
+	BEQ     readMatchlenLoop
+
+readMatchlenDone:
+	ADD $const_minMatch, len
+
+	// Bounds check dst+len.
+	ADDS dst, len, tmp2
+	BCS  shortDst
+	CMP  dstend, tmp2
+	BHI  shortDst
+
+	SUB offset, dst, match
+	CMP dstorig, match
+	BHS copyMatchTry8
+
+	// match < dstorig means the match starts in the dictionary,
+	// at len(dict) - offset + (dst - dstorig).
+	SUB  dstorig, dst, tmp1
+	SUB  offset, dictlen, tmp2
+	ADDS tmp2, tmp1
+	BMI  shortDict
+	ADD  dict, tmp1, match
+
+copyDict:
+	MOVBU.P 1(match), tmp3
+	MOVB.P  tmp3, 1(dst)
+	SUBS    $1, len
+	CCMP    NE, dictend, match, $0b0100 // 0100 sets the Z (EQ) flag.
+	BNE     copyDict
+
+	CBZ len, copyMatchDone
+
+	// If the match extends beyond the dictionary, the rest is at dstorig.
+	// Recompute the offset for the next check.
+	MOVD dstorig, match
+	SUB  dstorig, dst, offset
+
+copyMatchTry8:
+	// Copy doublewords if both len and offset are at least eight.
+	// A 16-at-a-time loop doesn't provide a further speedup.
+	CMP  $8, len
+	CCMP HS, offset, $8, $0
+	BLO  copyMatchTry4
+
+	AND    $7, len, lenRem
+	SUB    $8, len
+copyMatchLoop8:
+	MOVD.P 8(match), tmp1
+	MOVD.P tmp1, 8(dst)
+	SUBS   $8, len
+	BPL    copyMatchLoop8
+
+	MOVD (match)(len), tmp2 // match+len == match+lenRem-8.
+	ADD  lenRem, dst
+	MOVD $0, len
+	MOVD tmp2, -8(dst)
+	B    copyMatchDone
+
+copyMatchTry4:
+	// Copy words if both len and offset are at least four.
+	CMP  $4, len
+	CCMP HS, offset, $4, $0
+	BLO  copyMatchLoop1
+
+	MOVWU.P 4(match), tmp2
+	MOVWU.P tmp2, 4(dst)
+	SUBS    $4, len
+	BEQ     copyMatchDone
+
+copyMatchLoop1:
+	// Byte-at-a-time copy for small offsets <= 3.
+	MOVBU.P 1(match), tmp2
+	MOVB.P  tmp2, 1(dst)
+	SUBS    $1, len
+	BNE     copyMatchLoop1
+
+copyMatchDone:
+	CMP src, srcend
+	BNE loop
+
+end:
+	// A non-zero pending match length at the end of src is an error.
+	CBNZ len, corrupt
+	SUB  dstorig, dst, tmp1
+	MOVD tmp1, ret+72(FP)
+	RET
+
+	// The error cases have distinct labels so we can put different
+	// return codes here when debugging, or if the error returns need to
+	// be changed.
+shortDict:
+shortDst:
+shortSrc:
+corrupt:
+	MOVD $-1, tmp1
+	MOVD tmp1, ret+72(FP)
+	RET
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go
@ -0,0 +1,10 @@
+//go:build (amd64 || arm || arm64) && !appengine && gc && !noasm
+// +build amd64 arm arm64
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package lz4block
+
+// decodeBlock is implemented in assembly for amd64, arm and arm64.
+// It decodes the LZ4 block src into dst, using dict for matches that start
+// before dst, and returns the number of bytes written to dst or a negative
+// value on error.
+//
+//go:noescape
+func decodeBlock(dst, src, dict []byte) int
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go
@ -0,0 +1,139 @@
+//go:build (!amd64 && !arm && !arm64) || appengine || !gc || noasm
+// +build !amd64,!arm,!arm64 appengine !gc noasm
+
+package lz4block
+
+import (
+	"encoding/binary"
+)
+
+// decodeBlock is the pure Go LZ4 block decoder. It decompresses src into dst,
+// taking matches that start before dst from the tail of dict.
+//
+// It returns the number of bytes written to dst, or hasError (-2) when src is
+// empty or malformed. Index and slice panics raised while decoding are
+// recovered and also reported as hasError.
+func decodeBlock(dst, src, dict []byte) (ret int) {
+	// Restrict capacities so we don't read or write out of bounds.
+	dst = dst[:len(dst):len(dst)]
+	src = src[:len(src):len(src)]
+
+	const hasError = -2
+
+	if len(src) == 0 {
+		return hasError
+	}
+
+	// Turn any out-of-range access below into an error return.
+	defer func() {
+		if recover() != nil {
+			ret = hasError
+		}
+	}()
+
+	// si is the read position in src, di the write position in dst.
+	var si, di uint
+	for si < uint(len(src)) {
+		// Literals and match lengths (token).
+		b := uint(src[si])
+		si++
+
+		// Literals.
+		if lLen := b >> 4; lLen > 0 {
+			switch {
+			case lLen < 0xF && si+16 < uint(len(src)):
+				// Shortcut 1
+				// if we have enough room in src and dst, and the literals length
+				// is small enough (0..14) then copy all 16 bytes, even if not all
+				// are part of the literals.
+				copy(dst[di:], src[si:si+16])
+				si += lLen
+				di += lLen
+				if mLen := b & 0xF; mLen < 0xF {
+					// Shortcut 2
+					// if the match length (4..18) fits within the literals, then copy
+					// all 18 bytes, even if not all are part of the literals.
+					mLen += 4
+					if offset := u16(src[si:]); mLen <= offset && offset < di {
+						i := di - offset
+						// The remaining buffer may not hold 18 bytes.
+						// See https://github.com/pierrec/lz4/issues/51.
+						if end := i + 18; end <= uint(len(dst)) {
+							copy(dst[di:], dst[i:end])
+							si += 2
+							di += mLen
+							continue
+						}
+					}
+				}
+			case lLen == 0xF:
+				// Extended literal length: add bytes until one differs from 0xFF.
+				for {
+					x := uint(src[si])
+					if lLen += x; int(lLen) < 0 {
+						return hasError
+					}
+					si++
+					if x != 0xFF {
+						break
+					}
+				}
+				fallthrough
+			default:
+				copy(dst[di:di+lLen], src[si:si+lLen])
+				si += lLen
+				di += lLen
+			}
+		}
+
+		// The block may end right after the literals, but only if the token
+		// carries no match length.
+		mLen := b & 0xF
+		if si == uint(len(src)) && mLen == 0 {
+			break
+		} else if si >= uint(len(src)) {
+			return hasError
+		}
+
+		// A zero offset is invalid.
+		offset := u16(src[si:])
+		if offset == 0 {
+			return hasError
+		}
+		si += 2
+
+		// Match.
+		mLen += minMatch
+		if mLen == minMatch+0xF {
+			// Extended match length: add bytes until one differs from 0xFF.
+			for {
+				x := uint(src[si])
+				if mLen += x; int(mLen) < 0 {
+					return hasError
+				}
+				si++
+				if x != 0xFF {
+					break
+				}
+			}
+		}
+
+		// Copy the match.
+		if di < offset {
+			// The match is beyond our block, meaning the first part
+			// is in the dictionary.
+			fromDict := dict[uint(len(dict))+di-offset:]
+			n := uint(copy(dst[di:di+mLen], fromDict))
+			di += n
+			if mLen -= n; mLen == 0 {
+				continue
+			}
+			// We copied n = offset-di bytes from the dictionary,
+			// then set di = di+n = offset, so the following code
+			// copies from dst[di-offset:] = dst[0:].
+		}
+
+		expanded := dst[di-offset:]
+		if mLen > offset {
+			// Efficiently copy the match dst[di-offset:di] into the dst slice.
+			// Each pass doubles the length of the already expanded prefix.
+			bytesToCopy := offset * (mLen / offset)
+			for n := offset; n <= bytesToCopy+offset; n *= 2 {
+				copy(expanded[n:], expanded[:n])
+			}
+			di += bytesToCopy
+			mLen -= bytesToCopy
+		}
+		di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
+	}
+
+	return int(di)
+}
+
+// u16 decodes the first two bytes of p as a little-endian 16-bit value and
+// returns it widened to uint.
+func u16(p []byte) uint { return uint(binary.LittleEndian.Uint16(p)) }
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4errors/errors.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4errors/errors.go
@ -0,0 +1,19 @@
+package lz4errors
+
+// Error is a string-based error type; being a string, its values can be
+// declared as constants.
+type Error string
+
+// Error implements the error interface by returning e as a string.
+func (e Error) Error() string { return string(e) }
+
+// Errors reported by the lz4 packages, declared as constants of type Error.
+const (
+	ErrInvalidSourceShortBuffer      Error = "lz4: invalid source or destination buffer too short"
+	ErrInvalidFrame                  Error = "lz4: bad magic number"
+	ErrInternalUnhandledState        Error = "lz4: unhandled state"
+	ErrInvalidHeaderChecksum         Error = "lz4: invalid header checksum"
+	ErrInvalidBlockChecksum          Error = "lz4: invalid block checksum"
+	ErrInvalidFrameChecksum          Error = "lz4: invalid frame checksum"
+	ErrOptionInvalidCompressionLevel Error = "lz4: invalid compression level"
+	ErrOptionClosedOrError           Error = "lz4: cannot apply options on closed or in error object"
+	ErrOptionInvalidBlockSize        Error = "lz4: invalid block size"
+	ErrOptionNotApplicable           Error = "lz4: option not applicable"
+	ErrWriterNotClosed               Error = "lz4: writer not closed"
+)
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go
@ -0,0 +1,350 @@
+package lz4stream
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"sync"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/xxh32"
+)
+
+// Blocks holds the block processing state of a Frame.
+//
+// Block is the single reusable block used when the concurrency is 1.
+// Blocks is the queue of per-block result channels used by the concurrent
+// writer. err records the first error encountered; on the reading side it is
+// accessed under mu (see ErrorR and closeR).
+type Blocks struct {
+	Block  *FrameDataBlock
+	Blocks chan chan *FrameDataBlock
+	mu     sync.Mutex
+	err    error
+}
+
+// initW prepares b for writing compressed blocks of f to dst.
+//
+// With num == 1 it only allocates b.Block. Otherwise it makes sure b.Blocks
+// has capacity num and starts a goroutine that takes the queued channels in
+// order, waits for each compressed block, writes it to dst (unless an earlier
+// write failed) and closes the channel. Receiving a nil block stops the
+// goroutine.
+func (b *Blocks) initW(f *Frame, dst io.Writer, num int) {
+	if num == 1 {
+		b.Blocks = nil
+		b.Block = NewFrameDataBlock(f)
+		return
+	}
+	b.Block = nil
+	if cap(b.Blocks) != num {
+		b.Blocks = make(chan chan *FrameDataBlock, num)
+	}
+	// goroutine managing concurrent block compression goroutines.
+	go func() {
+		// Process next block compression item.
+		for c := range b.Blocks {
+			// Read the next compressed block result.
+			// Waiting here ensures that the blocks are output in the order they were sent.
+			// The incoming channel is always closed as it indicates to the caller that
+			// the block has been processed.
+			block := <-c
+			if block == nil {
+				// Notify the block compression routine that we are done with its result.
+				// This is used when a sentinel block is sent to terminate the compression.
+				close(c)
+				return
+			}
+			// Do not attempt to write the block upon any previous failure.
+			if b.err == nil {
+				// Write the block.
+				if err := block.Write(f, dst); err != nil {
+					// Keep the first error.
+					b.err = err
+					// All pending compression goroutines need to shut down, so we need to keep going.
+				}
+			}
+			close(c)
+		}
+	}()
+}
+
+// close ends block processing and returns the recorded error, which is
+// cleared in all cases.
+//
+// With num == 1 the single block, if any, is closed. In concurrent mode, when
+// the queue exists, a nil sentinel is sent through it and close waits for the
+// writing goroutine to acknowledge it by closing the sentinel channel.
+func (b *Blocks) close(f *Frame, num int) error {
+	if num == 1 {
+		if b.Block != nil {
+			b.Block.Close(f)
+		}
+		err := b.err
+		b.err = nil
+		return err
+	}
+	if b.Blocks == nil {
+		err := b.err
+		b.err = nil
+		return err
+	}
+	// Queue the sentinel channel, hand over the nil block, then wait for
+	// the channel to be closed by the writing goroutine.
+	c := make(chan *FrameDataBlock)
+	b.Blocks <- c
+	c <- nil
+	<-c
+	err := b.err
+	b.err = nil
+	return err
+}
+
+// ErrorR returns any error set while uncompressing a stream.
+// The error is read while holding b.mu.
+func (b *Blocks) ErrorR() error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return b.err
+}
+
+// initR returns a channel that streams the uncompressed blocks if in concurrent
+// mode and no error. When the channel is closed, check for any error with b.ErrorR.
+//
+// If not in concurrent mode, the uncompressed block is b.Block and the returned error
+// needs to be checked.
+//
+// In concurrent mode two goroutines are started: one reads blocks from src
+// sequentially and spawns a goroutine per block to uncompress it, the other
+// collects the results in reading order and forwards them on the returned
+// channel.
+func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) {
+	size := f.Descriptor.Flags.BlockSizeIndex()
+	if num == 1 {
+		b.Blocks = nil
+		b.Block = NewFrameDataBlock(f)
+		return nil, nil
+	}
+	b.Block = nil
+	blocks := make(chan chan []byte, num)
+	// data receives the uncompressed blocks.
+	data := make(chan []byte)
+	// Read blocks from the source sequentially
+	// and uncompress them concurrently.
+
+	// In legacy mode, accrue the uncompress sizes in cum.
+	var cum uint32
+	go func() {
+		var cumx uint32
+		var err error
+		for b.ErrorR() == nil {
+			block := NewFrameDataBlock(f)
+			cumx, err = block.Read(f, src, 0)
+			if err != nil {
+				block.Close(f)
+				break
+			}
+			// Recheck for an error as reading may be slow and uncompressing is expensive.
+			if b.ErrorR() != nil {
+				block.Close(f)
+				break
+			}
+			// Queue the result channel before uncompressing so that results
+			// are collected in the order the blocks were read.
+			c := make(chan []byte)
+			blocks <- c
+			go func() {
+				defer block.Close(f)
+				data, err := block.Uncompress(f, size.Get(), nil, false)
+				if err != nil {
+					b.closeR(err)
+					// Close the block channel to indicate an error.
+					close(c)
+				} else {
+					c <- data
+				}
+			}()
+		}
+		// End the collection loop and the data channel.
+		c := make(chan []byte)
+		blocks <- c
+		c <- nil // signal the collection loop that we are done
+		<-c      // wait for the collect loop to complete
+		// cum is only read here, after the collection loop has acknowledged
+		// the sentinel.
+		if f.isLegacy() && cum == cumx {
+			err = io.EOF
+		}
+		b.closeR(err)
+		close(data)
+	}()
+	// Collect the uncompressed blocks and make them available
+	// on the returned channel.
+	go func(leg bool) {
+		defer close(blocks)
+		skipBlocks := false
+		for c := range blocks {
+			buf, ok := <-c
+			if !ok {
+				// A closed channel indicates an error.
+				// All remaining channels should be discarded.
+				skipBlocks = true
+				continue
+			}
+			if buf == nil {
+				// Signal to end the loop.
+				close(c)
+				return
+			}
+			if skipBlocks {
+				// A previous error has occurred, skipping remaining channels.
+				continue
+			}
+			// Perform checksum now as the blocks are received in order.
+			if f.Descriptor.Flags.ContentChecksum() {
+				_, _ = f.checksum.Write(buf)
+			}
+			if leg {
+				cum += uint32(len(buf))
+			}
+			data <- buf
+			close(c)
+		}
+	}(f.isLegacy())
+	return data, nil
+}
+
+// closeR safely sets the error on b if not already set.
+// The first recorded error wins; later calls leave it untouched.
+func (b *Blocks) closeR(err error) {
+	b.mu.Lock()
+	if b.err == nil {
+		b.err = err
+	}
+	b.mu.Unlock()
+}
+
+// NewFrameDataBlock returns a block whose buffer is taken from the pool
+// matching the block size index of f's descriptor. Both Data and the internal
+// data buffer initially refer to that buffer.
+func NewFrameDataBlock(f *Frame) *FrameDataBlock {
+	buf := f.Descriptor.Flags.BlockSizeIndex().Get()
+	return &FrameDataBlock{Data: buf, data: buf}
+}
+
+// FrameDataBlock is one data block of a frame: its size word, its payload
+// and, when block checksums are enabled, its checksum.
+type FrameDataBlock struct {
+	Size     DataBlockSize
+	Data     []byte // compressed or uncompressed data (.data or .src)
+	Checksum uint32
+	data     []byte // buffer for compressed data
+	src      []byte // uncompressed data
+	err      error  // used in concurrent mode
+}
+
+// Close resets b and hands its buffer back to the lz4block pools.
+// Calling it again is harmless: the buffer is only released while b.data is
+// non-nil.
+func (b *FrameDataBlock) Close(f *Frame) {
+	b.Size = 0
+	b.Checksum = 0
+	b.err = nil
+	if b.data != nil {
+		// Block was not already closed.
+		lz4block.Put(b.data)
+		b.Data = nil
+		b.data = nil
+		b.src = nil
+	}
+}
+
+// Compress compresses src into the block's buffer and records the result in b.
+// Block compression errors are ignored since the buffer is sized appropriately.
+//
+// When the compressor reports 0 bytes the block is stored uncompressed and
+// b.Data aliases src; otherwise b.Data is the compressed prefix of the block
+// buffer. b.Size is updated to match, src is remembered for the content
+// checksum, and b.Checksum is computed over src when block checksums are
+// enabled. It returns b.
+func (b *FrameDataBlock) Compress(f *Frame, src []byte, level lz4block.CompressionLevel) *FrameDataBlock {
+	data := b.data
+	if f.isLegacy() {
+		// In legacy mode, the buffer is sized according to CompressBlockBound,
+		// but only 8Mb is buffered for compression.
+		// NOTE(review): this reslices src to exactly 8 MiB whatever its
+		// length and panics if cap(src) is smaller — confirm that callers
+		// always pass a buffer with at least that capacity.
+		src = src[:8<<20]
+	} else {
+		data = data[:len(src)] // trigger the incompressible flag in CompressBlock
+	}
+	var n int
+	switch level {
+	case lz4block.Fast:
+		n, _ = lz4block.CompressBlock(src, data)
+	default:
+		n, _ = lz4block.CompressBlockHC(src, data, level)
+	}
+	if n == 0 {
+		b.Size.UncompressedSet(true)
+		b.Data = src
+	} else {
+		b.Size.UncompressedSet(false)
+		b.Data = data[:n]
+	}
+	b.Size.sizeSet(len(b.Data))
+	b.src = src // keep track of the source for content checksum
+
+	if f.Descriptor.Flags.BlockChecksum() {
+		b.Checksum = xxh32.ChecksumZero(src)
+	}
+	return b
+}
+
+// Write emits b to dst: the 4-byte little-endian size word, the block data
+// and, when block checksums are enabled, the 4-byte little-endian checksum.
+// When content checksums are enabled the uncompressed source of the block is
+// first added to the frame checksum.
+//
+// It returns the first error reported by dst.
+func (b *FrameDataBlock) Write(f *Frame, dst io.Writer) error {
+	// Write is called in the same order as blocks are compressed,
+	// so content checksum must be done here.
+	if f.Descriptor.Flags.ContentChecksum() {
+		_, _ = f.checksum.Write(b.src)
+	}
+	buf := f.buf[:]
+	binary.LittleEndian.PutUint32(buf, uint32(b.Size))
+	if _, err := dst.Write(buf[:4]); err != nil {
+		return err
+	}
+
+	if _, err := dst.Write(b.Data); err != nil {
+		return err
+	}
+
+	// Read consumes a block checksum whenever the flag is set, so the decision
+	// must follow the flag rather than the checksum value: a block whose
+	// checksum happens to be 0 still needs its 4 checksum bytes.
+	if !f.Descriptor.Flags.BlockChecksum() {
+		return nil
+	}
+	binary.LittleEndian.PutUint32(buf, b.Checksum)
+	_, err := dst.Write(buf[:4])
+	return err
+}
+
+// Read updates b with the next block data, size and checksum if available.
+//
+// It returns the raw 32-bit block size word together with any error.
+// io.EOF (with 0) marks the end of the stream: a zero word in a regular
+// frame, or a word equal to cum in a legacy frame. In a legacy frame a word
+// equal to frameMagicLegacy is skipped and the next word is read instead.
+// ErrOptionInvalidBlockSize is returned when the announced size exceeds the
+// capacity of the block buffer.
+func (b *FrameDataBlock) Read(f *Frame, src io.Reader, cum uint32) (uint32, error) {
+	x, err := f.readUint32(src)
+	if err != nil {
+		return 0, err
+	}
+	if f.isLegacy() {
+		switch x {
+		case frameMagicLegacy:
+			// Concatenated legacy frame.
+			return b.Read(f, src, cum)
+		case cum:
+			// Only works in non concurrent mode, for concurrent mode
+			// it is handled separately.
+			// Linux kernel format appends the total uncompressed size at the end.
+			return 0, io.EOF
+		}
+	} else if x == 0 {
+		// Marker for end of stream.
+		return 0, io.EOF
+	}
+	b.Size = DataBlockSize(x)
+
+	// Refuse sizes that do not fit the block buffer before reading into it.
+	size := b.Size.size()
+	if size > cap(b.data) {
+		return x, lz4errors.ErrOptionInvalidBlockSize
+	}
+	b.data = b.data[:size]
+	if _, err := io.ReadFull(src, b.data); err != nil {
+		return x, err
+	}
+	if f.Descriptor.Flags.BlockChecksum() {
+		sum, err := f.readUint32(src)
+		if err != nil {
+			return 0, err
+		}
+		b.Checksum = sum
+	}
+	return x, nil
+}
+
+// Uncompress decodes the block payload into dst and returns the filled part
+// of dst.
+//
+// A block flagged as uncompressed is copied as is; otherwise it is decoded
+// with lz4block.UncompressBlock using dict. When block checksums are enabled
+// the result is verified against b.Checksum and a mismatch is reported as an
+// error wrapping ErrInvalidBlockChecksum. When sum is true and content
+// checksums are enabled, the result is added to the frame checksum.
+func (b *FrameDataBlock) Uncompress(f *Frame, dst, dict []byte, sum bool) ([]byte, error) {
+	if b.Size.Uncompressed() {
+		n := copy(dst, b.data)
+		dst = dst[:n]
+	} else {
+		n, err := lz4block.UncompressBlock(b.data, dst, dict)
+		if err != nil {
+			return nil, err
+		}
+		dst = dst[:n]
+	}
+	if f.Descriptor.Flags.BlockChecksum() {
+		if c := xxh32.ChecksumZero(dst); c != b.Checksum {
+			err := fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidBlockChecksum, c, b.Checksum)
+			return nil, err
+		}
+	}
+	if sum && f.Descriptor.Flags.ContentChecksum() {
+		_, _ = f.checksum.Write(dst)
+	}
+	return dst, nil
+}
+
+// readUint32 reads exactly 4 bytes from r into the frame's scratch buffer and
+// returns them decoded as a little-endian uint32. On a read error x is 0.
+func (f *Frame) readUint32(r io.Reader) (x uint32, err error) {
+	if _, err = io.ReadFull(r, f.buf[:4]); err != nil {
+		return
+	}
+	x = binary.LittleEndian.Uint32(f.buf[:4])
+	return
+}
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go
@ -0,0 +1,204 @@
+// Package lz4stream provides the types that support reading and writing LZ4 data streams.
+package lz4stream
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"io/ioutil"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/xxh32"
+)
+
+//go:generate go run gen.go
+
+// Magic numbers found at the start of a frame, read and written as
+// little-endian uint32 values.
+const (
+	// frameMagic starts a regular (non legacy) frame.
+	frameMagic       uint32 = 0x184D2204
+	// frameSkipMagic is the first magic number of the skippable frames,
+	// whose content is discarded by ParseHeaders.
+	frameSkipMagic   uint32 = 0x184D2A50
+	// frameMagicLegacy starts a frame in the legacy format.
+	frameMagicLegacy uint32 = 0x184C2102
+)
+
+// NewFrame returns a new, zero-valued Frame.
+func NewFrame() *Frame {
+	return new(Frame)
+}
+
+// Frame holds the state used to read or write one LZ4 frame.
+type Frame struct {
+	buf        [15]byte // scratch space; the frame header needs at most 4(magic)+2(flags)+8(content size)+1(checksum)=15 bytes
+	Magic      uint32
+	Descriptor FrameDescriptor
+	Blocks     Blocks
+	Checksum   uint32
+	checksum   xxh32.XXHZero
+}
+
+// Reset allows reusing the Frame.
+// The Descriptor configuration is not modified.
+//
+// The magic number, the descriptor checksum and content size, and the frame
+// checksum are zeroed, and the blocks are closed.
+func (f *Frame) Reset(num int) {
+	f.Magic = 0
+	f.Descriptor.Checksum = 0
+	f.Descriptor.ContentSize = 0
+	// The error returned when closing the blocks is deliberately ignored.
+	_ = f.Blocks.close(f, num)
+	f.Checksum = 0
+}
+
+// InitW prepares the frame for writing to dst.
+//
+// It selects the magic number matching the requested format, initializes
+// the descriptor flags and the blocks, and resets the content checksum.
+func (f *Frame) InitW(dst io.Writer, num int, legacy bool) {
+	switch {
+	case legacy:
+		// Legacy frames always use 8Mb blocks.
+		f.Magic = frameMagicLegacy
+		f.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(lz4block.Block8Mb))
+	default:
+		f.Magic = frameMagic
+		f.Descriptor.initW()
+	}
+	f.Blocks.initW(f, dst, num)
+	f.checksum.Reset()
+}
+
+// CloseW closes the blocks and terminates the frame written to dst.
+//
+// Nothing more is written for a legacy frame. Otherwise the end mark is
+// emitted, followed by the content checksum when that flag is set.
+func (f *Frame) CloseW(dst io.Writer, num int) error {
+	if err := f.Blocks.close(f, num); err != nil {
+		return err
+	}
+	if f.isLegacy() {
+		return nil
+	}
+	// End mark (data block size of uint32(0)), built in the scratch buffer.
+	trailer := append(f.buf[:0], 0, 0, 0, 0)
+	if f.Descriptor.Flags.ContentChecksum() {
+		trailer = f.checksum.Sum(trailer)
+	}
+	_, err := dst.Write(trailer)
+	return err
+}
+
+// isLegacy reports whether the frame magic number is the legacy one.
+func (f *Frame) isLegacy() bool {
+	return f.Magic == frameMagicLegacy
+}
+
+// ParseHeaders reads the frame magic number from src and, once a data frame
+// is found, its descriptor. It is a no-op when the magic number has already
+// been read.
+//
+// Skippable frames found ahead of the data frame are discarded. Any other
+// unknown magic number yields lz4errors.ErrInvalidFrame.
+func (f *Frame) ParseHeaders(src io.Reader) error {
+	if f.Magic > 0 {
+		// Header already read.
+		return nil
+	}
+
+	for {
+		var err error
+		if f.Magic, err = f.readUint32(src); err != nil {
+			return err
+		}
+		m := f.Magic
+		if m == frameMagic || m == frameMagicLegacy {
+			break
+		}
+		// All 16 values of frameSkipMagic are valid, i.e. only the low
+		// 4 bits of the magic number may differ from frameSkipMagic.
+		if m>>4 != frameSkipMagic>>4 {
+			return lz4errors.ErrInvalidFrame
+		}
+		// A skippable frame is its magic number, a uint32 size and that
+		// many bytes of user data, which are dropped.
+		skip, err := f.readUint32(src)
+		if err != nil {
+			return err
+		}
+		if _, err := io.CopyN(ioutil.Discard, src, int64(skip)); err != nil {
+			return err
+		}
+	}
+
+	if err := f.Descriptor.initR(f, src); err != nil {
+		return err
+	}
+	f.checksum.Reset()
+	return nil
+}
+
+// InitR prepares the frame blocks for reading from src by delegating to
+// Blocks.initR, and returns its results unchanged.
+func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
+	return f.Blocks.initR(f, num, src)
+}
+
+// CloseR finishes reading a frame from src.
+//
+// For a non legacy frame with the content checksum flag set, it reads the
+// stored checksum and compares it with the one computed while reading; a
+// mismatch is reported as lz4errors.ErrInvalidFrameChecksum.
+func (f *Frame) CloseR(src io.Reader) (err error) {
+	if f.isLegacy() || !f.Descriptor.Flags.ContentChecksum() {
+		// No content checksum to verify.
+		return nil
+	}
+	if f.Checksum, err = f.readUint32(src); err != nil {
+		return err
+	}
+	got := f.checksum.Sum32()
+	if got == f.Checksum {
+		return nil
+	}
+	return fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidFrameChecksum, got, f.Checksum)
+}
+
+// FrameDescriptor holds the fields of a frame header that follow the magic
+// number.
+type FrameDescriptor struct {
+	// Flags is the 2-byte flags field.
+	Flags       DescriptorFlags
+	// ContentSize is the 8-byte content size, only present on the wire
+	// when Flags.Size() is set.
+	ContentSize uint64
+	// Checksum is the 1-byte checksum of the descriptor.
+	Checksum    uint8
+}
+
+// initW sets the flags that are fixed when writing a frame: version 1 and
+// independent blocks.
+func (fd *FrameDescriptor) initW() {
+	fd.Flags.VersionSet(1).BlockIndependenceSet(true)
+}
+
+// Write writes the frame header to dst: the magic number and, for non
+// legacy frames, the flags, the optional content size and the descriptor
+// checksum. The header is assembled in the frame's scratch buffer and
+// emitted with a single call to dst.Write.
+//
+// NOTE(review): a non-zero fd.Checksum is used as the "already written"
+// marker, so a descriptor whose checksum byte is 0, or a legacy frame (which
+// never sets it), is not recognized as written by this check.
+func (fd *FrameDescriptor) Write(f *Frame, dst io.Writer) error {
+	if fd.Checksum > 0 {
+		// Header already written.
+		return nil
+	}
+
+	buf := f.buf[:4]
+	// Write the magic number here even though it belongs to the Frame.
+	binary.LittleEndian.PutUint32(buf, f.Magic)
+	if !f.isLegacy() {
+		buf = buf[:4+2]
+		binary.LittleEndian.PutUint16(buf[4:], uint16(fd.Flags))
+
+		if fd.Flags.Size() {
+			buf = buf[:4+2+8]
+			binary.LittleEndian.PutUint64(buf[4+2:], fd.ContentSize)
+		}
+		// The checksum covers the descriptor fields only, not the magic number.
+		fd.Checksum = descriptorChecksum(buf[4:])
+		buf = append(buf, fd.Checksum)
+	}
+
+	_, err := dst.Write(buf)
+	return err
+}
+
+// initR reads the frame descriptor from src, once the magic number has been
+// read into f.
+//
+// A legacy frame has no descriptor to read: its block size is set to 8Mb.
+// Otherwise the flags, the optional content size and the checksum byte are
+// read, then the checksum and the block size index are validated.
+func (fd *FrameDescriptor) initR(f *Frame, src io.Reader) error {
+	if f.isLegacy() {
+		idx := lz4block.Index(lz4block.Block8Mb)
+		f.Descriptor.Flags.BlockSizeIndexSet(idx)
+		return nil
+	}
+	// Read the flags and the checksum, hoping that there is not content size.
+	buf := f.buf[:3]
+	if _, err := io.ReadFull(src, buf); err != nil {
+		return err
+	}
+	descr := binary.LittleEndian.Uint16(buf)
+	fd.Flags = DescriptorFlags(descr)
+	if fd.Flags.Size() {
+		// Append the 8 missing bytes.
+		buf = buf[:3+8]
+		if _, err := io.ReadFull(src, buf[3:]); err != nil {
+			return err
+		}
+		// The content size follows the 2 bytes of flags.
+		fd.ContentSize = binary.LittleEndian.Uint64(buf[2:])
+	}
+	fd.Checksum = buf[len(buf)-1] // the checksum is the last byte
+	buf = buf[:len(buf)-1]        // all descriptor fields except checksum
+	if c := descriptorChecksum(buf); fd.Checksum != c {
+		return fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidHeaderChecksum, c, fd.Checksum)
+	}
+	// Validate the elements that can be.
+	if idx := fd.Flags.BlockSizeIndex(); !idx.IsValid() {
+		return lz4errors.ErrOptionInvalidBlockSize
+	}
+	return nil
+}
+
+// descriptorChecksum returns the checksum byte of the descriptor fields in
+// buf: bits 8 to 15 of their xxh32 hash.
+func descriptorChecksum(buf []byte) byte {
+	hash := xxh32.ChecksumZero(buf)
+	return byte(hash >> 8)
+}
--- a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame_gen.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame_gen.go
@ -0,0 +1,103 @@
+// Code generated by `gen.exe`. DO NOT EDIT.
+
+package lz4stream
+
+import "github.com/pierrec/lz4/v4/internal/lz4block"
+
+// DescriptorFlags is defined as follows, the fields being listed from the
+// least significant bit:
+//   field              bits
+//   -----              ----
+//   _                  2
+//   ContentChecksum    1
+//   Size               1
+//   BlockChecksum      1
+//   BlockIndependence  1
+//   Version            2
+//   _                  4
+//   BlockSizeIndex     3
+//   _                  1
+type DescriptorFlags uint16
+
+// Getters.
+func (x DescriptorFlags) ContentChecksum() bool   { return x>>2&1 != 0 }
+func (x DescriptorFlags) Size() bool              { return x>>3&1 != 0 }
+func (x DescriptorFlags) BlockChecksum() bool     { return x>>4&1 != 0 }
+func (x DescriptorFlags) BlockIndependence() bool { return x>>5&1 != 0 }
+func (x DescriptorFlags) Version() uint16         { return uint16(x >> 6 & 0x3) }
+func (x DescriptorFlags) BlockSizeIndex() lz4block.BlockSizeIndex {
+	return lz4block.BlockSizeIndex(x >> 12 & 0x7)
+}
+
+// Setters.
+// Each setter updates its field in place and returns x, so that calls can
+// be chained. Values wider than the field are truncated to its width.
+func (x *DescriptorFlags) ContentChecksumSet(v bool) *DescriptorFlags {
+	const b = 1 << 2
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) SizeSet(v bool) *DescriptorFlags {
+	const b = 1 << 3
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) BlockChecksumSet(v bool) *DescriptorFlags {
+	const b = 1 << 4
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) BlockIndependenceSet(v bool) *DescriptorFlags {
+	const b = 1 << 5
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) VersionSet(v uint16) *DescriptorFlags {
+	*x = *x&^(0x3<<6) | (DescriptorFlags(v) & 0x3 << 6)
+	return x
+}
+func (x *DescriptorFlags) BlockSizeIndexSet(v lz4block.BlockSizeIndex) *DescriptorFlags {
+	*x = *x&^(0x7<<12) | (DescriptorFlags(v) & 0x7 << 12)
+	return x
+}
+
+// Code generated by `gen.exe`. DO NOT EDIT.
+
+// DataBlockSize is defined as follows, the fields being listed from the
+// least significant bit:
+//   field         bits
+//   -----         ----
+//   size          31
+//   Uncompressed  1
+type DataBlockSize uint32
+
+// Getters.
+func (x DataBlockSize) size() int          { return int(x & 0x7FFFFFFF) }
+func (x DataBlockSize) Uncompressed() bool { return x>>31&1 != 0 }
+
+// Setters.
+// Each setter updates its field in place and returns x.
+func (x *DataBlockSize) sizeSet(v int) *DataBlockSize {
+	*x = *x&^0x7FFFFFFF | DataBlockSize(v)&0x7FFFFFFF
+	return x
+}
+func (x *DataBlockSize) UncompressedSet(v bool) *DataBlockSize {
+	const b = 1 << 31
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
--- a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go
@ -0,0 +1,212 @@
+// Package xxh32 implements the very fast XXH hashing algorithm (32 bits version).
+// (ported from the reference implementation https://github.com/Cyan4973/xxHash/)
+package xxh32
+
+import (
+	"encoding/binary"
+)
+
+const (
+	// Multipliers used by the hash rounds.
+	prime1 uint32 = 2654435761
+	prime2 uint32 = 2246822519
+	prime3 uint32 = 3266489917
+	prime4 uint32 = 668265263
+	prime5 uint32 = 374761393
+
+	// primeMask truncates the 64-bit intermediate results below to 32 bits.
+	// prime1plus2 and prime1minus are initial accumulator values (see Reset).
+	primeMask   = 0xFFFFFFFF
+	prime1plus2 = uint32((uint64(prime1) + uint64(prime2)) & primeMask) // 606290984
+	prime1minus = uint32((-int64(prime1)) & primeMask)                  // 1640531535
+)
+
+// XXHZero represents an xxhash32 object with seed 0.
+type XXHZero struct {
+	// v holds the four accumulators updated for every 16-byte block.
+	v        [4]uint32
+	// totalLen is the number of bytes written so far.
+	totalLen uint64
+	// buf holds input bytes that do not yet make up a full 16-byte block;
+	// its first bufused bytes are valid.
+	buf      [16]byte
+	bufused  int
+}
+
+// Sum appends the current hash to b and returns the resulting slice.
+// It does not change the underlying hash state.
+func (xxh XXHZero) Sum(b []byte) []byte {
+	h32 := xxh.Sum32()
+	return append(b, byte(h32), byte(h32>>8), byte(h32>>16), byte(h32>>24))
+}
+
+// Reset resets the Hash to its initial state: seed-0 accumulators, no byte
+// counted and nothing buffered.
+func (xxh *XXHZero) Reset() {
+	xxh.v = [4]uint32{prime1plus2, prime2, 0, prime1minus}
+	xxh.totalLen, xxh.bufused = 0, 0
+}
+
+// Size returns the number of bytes that Sum appends: 4.
+func (xxh *XXHZero) Size() int {
+	return 4
+}
+
+// BlockSize gives the minimum number of bytes accepted by Write().
+func (xxh *XXHZero) BlockSize() int {
+	return 1
+}
+
+// Write adds input bytes to the Hash.
+// It never returns an error and always reports len(input) bytes written,
+// as expected from an io.Writer that consumed all of its input.
+func (xxh *XXHZero) Write(input []byte) (int, error) {
+	if xxh.totalLen == 0 {
+		// Makes the zero value usable without an explicit Reset.
+		xxh.Reset()
+	}
+	written := len(input)
+	n := written
+	m := xxh.bufused
+
+	xxh.totalLen += uint64(n)
+
+	r := len(xxh.buf) - m
+	if n < r {
+		// Not enough data to complete a 16-byte block: buffer it.
+		copy(xxh.buf[m:], input)
+		xxh.bufused += n
+		return written, nil
+	}
+
+	var buf *[16]byte
+	if m != 0 {
+		// some data left from previous update
+		buf = &xxh.buf
+		c := copy(buf[m:], input)
+		// n now tracks the length of the remaining input only; it must
+		// not be used as the number of bytes written.
+		n -= c
+		input = input[c:]
+	}
+	update(&xxh.v, buf, input)
+	// Keep the trailing bytes that do not make up a full block.
+	xxh.bufused = copy(xxh.buf[:], input[n-n%16:])
+
+	return written, nil
+}
+
+// Portable version of update. It folds buf (when not nil) and then every
+// full 16-byte block of input into the four accumulators of v. Trailing
+// input bytes that do not make up a full block are ignored.
+func updateGo(v *[4]uint32, buf *[16]byte, input []byte) {
+	// Working on local copies lets the compiler keep them in registers.
+	a, b, c, d := v[0], v[1], v[2], v[3]
+
+	if buf != nil {
+		a = rol13(a+binary.LittleEndian.Uint32(buf[0:])*prime2) * prime1
+		b = rol13(b+binary.LittleEndian.Uint32(buf[4:])*prime2) * prime1
+		c = rol13(c+binary.LittleEndian.Uint32(buf[8:])*prime2) * prime1
+		d = rol13(d+binary.LittleEndian.Uint32(buf[12:])*prime2) * prime1
+	}
+
+	for len(input) >= 16 {
+		block := input[:16] // bounds check elimination hint
+		a = rol13(a+binary.LittleEndian.Uint32(block[0:])*prime2) * prime1
+		b = rol13(b+binary.LittleEndian.Uint32(block[4:])*prime2) * prime1
+		c = rol13(c+binary.LittleEndian.Uint32(block[8:])*prime2) * prime1
+		d = rol13(d+binary.LittleEndian.Uint32(block[12:])*prime2) * prime1
+		input = input[16:]
+	}
+
+	v[0], v[1], v[2], v[3] = a, b, c, d
+}
+
+// Sum32 returns the 32 bits Hash value of the bytes written so far.
+// It does not change the hash state.
+func (xxh *XXHZero) Sum32() uint32 {
+	h32 := uint32(xxh.totalLen)
+	// Test the full 64-bit length: its low 32 bits alone may be below 16
+	// for inputs of 4GiB or more, whose accumulators must still be used.
+	if xxh.totalLen >= 16 {
+		h32 += rol1(xxh.v[0]) + rol7(xxh.v[1]) + rol12(xxh.v[2]) + rol18(xxh.v[3])
+	} else {
+		h32 += prime5
+	}
+
+	// Fold in the buffered bytes: 4 at a time, then one by one.
+	tail := xxh.buf[:xxh.bufused]
+	for ; len(tail) >= 4; tail = tail[4:] {
+		h32 += binary.LittleEndian.Uint32(tail) * prime3
+		h32 = rol17(h32) * prime4
+	}
+	for _, c := range tail {
+		h32 += uint32(c) * prime5
+		h32 = rol11(h32) * prime1
+	}
+
+	// Final mixing.
+	h32 ^= h32 >> 15
+	h32 *= prime2
+	h32 ^= h32 >> 13
+	h32 *= prime3
+	h32 ^= h32 >> 16
+
+	return h32
+}
+
+// Portable version of ChecksumZero.
+// It hashes input in one pass: 16-byte blocks first (only when there are at
+// least 16 bytes), then 4-byte words, then single bytes, then final mixing.
+func checksumZeroGo(input []byte) uint32 {
+	n := len(input)
+	h32 := uint32(n)
+
+	if n < 16 {
+		h32 += prime5
+	} else {
+		v1 := prime1plus2
+		v2 := prime2
+		v3 := uint32(0)
+		v4 := prime1minus
+		p := 0
+		// The loop-local n is the offset of the last possible full block.
+		for n := n - 16; p <= n; p += 16 {
+			sub := input[p:][:16] //BCE hint for compiler
+			v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
+			v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
+			v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
+			v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
+		}
+		// Drop the processed blocks: input and n now describe the tail.
+		input = input[p:]
+		n -= p
+		h32 += rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
+	}
+
+	p := 0
+	// The loop-local n is the offset of the last possible full word.
+	for n := n - 4; p <= n; p += 4 {
+		h32 += binary.LittleEndian.Uint32(input[p:p+4]) * prime3
+		h32 = rol17(h32) * prime4
+	}
+	for p < n {
+		h32 += uint32(input[p]) * prime5
+		h32 = rol11(h32) * prime1
+		p++
+	}
+
+	h32 ^= h32 >> 15
+	h32 *= prime2
+	h32 ^= h32 >> 13
+	h32 *= prime3
+	h32 ^= h32 >> 16
+
+	return h32
+}
+
+// rol1 rotates u left by 1 bit.
+func rol1(u uint32) uint32 {
+	return (u << 1) | (u >> (32 - 1))
+}
+
+// rol7 rotates u left by 7 bits.
+func rol7(u uint32) uint32 {
+	return (u << 7) | (u >> (32 - 7))
+}
+
+// rol11 rotates u left by 11 bits.
+func rol11(u uint32) uint32 {
+	return (u << 11) | (u >> (32 - 11))
+}
+
+// rol12 rotates u left by 12 bits.
+func rol12(u uint32) uint32 {
+	return (u << 12) | (u >> (32 - 12))
+}
+
+// rol13 rotates u left by 13 bits.
+func rol13(u uint32) uint32 {
+	return (u << 13) | (u >> (32 - 13))
+}
+
+// rol17 rotates u left by 17 bits.
+func rol17(u uint32) uint32 {
+	return (u << 17) | (u >> (32 - 17))
+}
+
+// rol18 rotates u left by 18 bits.
+func rol18(u uint32) uint32 {
+	return (u << 18) | (u >> (32 - 18))
+}
--- a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.go
@ -0,0 +1,11 @@
+// +build !noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+// It is implemented in assembly.
+//
+//go:noescape
+func ChecksumZero(input []byte) uint32
+
+// update updates v with buf (when not nil) and input.
+// It is implemented in assembly.
+//
+//go:noescape
+func update(v *[4]uint32, buf *[16]byte, input []byte)
--- a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s
@ -0,0 +1,251 @@
+// +build !noasm
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Register allocation.
+#define p	R0
+#define n	R1
+#define h	R2
+#define v1	R2	// Alias for h.
+#define v2	R3
+#define v3	R4
+#define v4	R5
+#define x1	R6
+#define x2	R7
+#define x3	R8
+#define x4	R9
+
+// We need the primes in registers. The 16-byte loop only uses prime{1,2}.
+#define prime1r	R11
+#define prime2r	R12
+#define prime3r	R3	// The rest can alias v{2-4}.
+#define prime4r	R4
+#define prime5r	R5
+
+// Update round macros. These read from and increment p.
+
+// round16aligned loads four 32-bit words from p with a single load-multiple
+// and, for each accumulator vi and word xi, computes
+// vi = rotate-right-19(vi + xi*prime2) * prime1.
+#define round16aligned			\
+	MOVM.IA.W (p), [x1, x2, x3, x4]	\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MULA x2, prime2r, v2, v2	\
+	MULA x3, prime2r, v3, v3	\
+	MULA x4, prime2r, v4, v4	\
+					\
+	MOVW v1 @> 19, v1		\
+	MOVW v2 @> 19, v2		\
+	MOVW v3 @> 19, v3		\
+	MOVW v4 @> 19, v4		\
+					\
+	MUL prime1r, v1			\
+	MUL prime1r, v2			\
+	MUL prime1r, v3			\
+	MUL prime1r, v4			\
+
+// round16unaligned performs the same update as round16aligned, but builds
+// each 32-bit little-endian word from four single-byte loads. The callers
+// in this file use it when p is not a multiple of 4.
+#define round16unaligned 		\
+	MOVBU.P  16(p), x1		\
+	MOVBU   -15(p), x2		\
+	ORR     x2 <<  8, x1		\
+	MOVBU   -14(p), x3		\
+	MOVBU   -13(p), x4		\
+	ORR     x4 <<  8, x3		\
+	ORR     x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MOVW v1 @> 19, v1		\
+	MUL prime1r, v1			\
+					\
+	MOVBU -12(p), x1		\
+	MOVBU -11(p), x2		\
+	ORR   x2 <<  8, x1		\
+	MOVBU -10(p), x3		\
+	MOVBU  -9(p), x4		\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v2, v2	\
+	MOVW v2 @> 19, v2		\
+	MUL prime1r, v2			\
+					\
+	MOVBU -8(p), x1			\
+	MOVBU -7(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -6(p), x3			\
+	MOVBU -5(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v3, v3	\
+	MOVW v3 @> 19, v3		\
+	MUL prime1r, v3			\
+					\
+	MOVBU -4(p), x1			\
+	MOVBU -3(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -2(p), x3			\
+	MOVBU -1(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v4, v4	\
+	MOVW v4 @> 19, v4		\
+	MUL prime1r, v4			\
+
+
+// func ChecksumZero([]byte) uint32
+//
+// The input is consumed in three stages: 16-byte rounds (only when it holds
+// at least 16 bytes), then 4-byte words, then single bytes, followed by the
+// final mixing at the end label.
+TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
+	MOVW input_base+0(FP), p
+	MOVW input_len+4(FP),  n
+
+	MOVW $const_prime1, prime1r
+	MOVW $const_prime2, prime2r
+
+	// Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
+	// here, but that's a pseudo-op that generates a load through R11.
+	MOVW $const_prime5, prime5r
+	ADD  prime5r, n, h
+	CMP  $0, n
+	BEQ  end
+
+	// We let n go negative so we can do comparisons with SUB.S
+	// instead of separate CMP.
+	SUB.S $16, n
+	BMI   loop16done
+
+	// Initial accumulators: prime1+prime2, prime2, 0 and -prime1.
+	ADD  prime1r, prime2r, v1
+	MOVW prime2r, v2
+	MOVW $0, v3
+	RSB  $0, prime1r, v4
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   loop16finish
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+loop16finish:
+	MOVW v1 @> 31, h
+	ADD  v2 @> 25, h
+	ADD  v3 @> 20, h
+	ADD  v4 @> 14, h
+
+	// h += len(input) with v2 as temporary.
+	MOVW input_len+4(FP), v2
+	ADD  v2, h
+
+loop16done:
+	ADD $16, n	// Restore number of bytes left.
+
+	SUB.S $4, n
+	MOVW  $const_prime3, prime3r
+	BMI   loop4done
+	MOVW  $const_prime4, prime4r
+
+	TST $3, p
+	BNE loop4unaligned
+
+	// 4-byte words: h = rotate-right-15(h + x*prime3) * prime4.
+loop4aligned:
+	SUB.S $4, n
+
+	MOVW.P 4(p), x1
+	MULA   prime3r, x1, h, h
+	MOVW   h @> 15, h
+	MUL    prime4r, h
+
+	BPL loop4aligned
+	B   loop4done
+
+loop4unaligned:
+	SUB.S $4, n
+
+	MOVBU.P  4(p), x1
+	MOVBU   -3(p), x2
+	ORR     x2 <<  8, x1
+	MOVBU   -2(p), x3
+	ORR     x3 << 16, x1
+	MOVBU   -1(p), x4
+	ORR     x4 << 24, x1
+
+	MULA prime3r, x1, h, h
+	MOVW h @> 15, h
+	MUL  prime4r, h
+
+	BPL loop4unaligned
+
+loop4done:
+	ADD.S $4, n	// Restore number of bytes left.
+	BEQ   end
+
+	MOVW $const_prime5, prime5r
+
+	// Single bytes: h = rotate-right-21(h + x*prime5) * prime1.
+loop1:
+	SUB.S $1, n
+
+	MOVBU.P 1(p), x1
+	MULA    prime5r, x1, h, h
+	MOVW    h @> 21, h
+	MUL     prime1r, h
+
+	BNE loop1
+
+	// Final mixing.
+end:
+	MOVW $const_prime3, prime3r
+	EOR  h >> 15, h
+	MUL  prime2r, h
+	EOR  h >> 13, h
+	MUL  prime3r, h
+	EOR  h >> 16, h
+
+	MOVW h, ret+12(FP)
+	RET
+
+
+// func update(v *[4]uint32, buf *[16]byte, input []byte)
+//
+// Loads the four accumulators from v, runs one round over buf when it is
+// not nil, then one round per full 16-byte block of input, and stores the
+// accumulators back into v.
+TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
+	MOVW    v+0(FP), p
+	MOVM.IA (p), [v1, v2, v3, v4]
+
+	MOVW $const_prime1, prime1r
+	MOVW $const_prime2, prime2r
+
+	// Process buf, if not nil.
+	MOVW buf+4(FP), p
+	CMP  $0, p
+	BEQ  noBuffered
+
+	round16aligned
+
+noBuffered:
+	MOVW input_base +8(FP), p
+	MOVW input_len +12(FP), n
+
+	// Let n go negative so that SUB.S doubles as the loop test.
+	SUB.S $16, n
+	BMI   end
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   end
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+end:
+	MOVW    v+0(FP), p
+	MOVM.IA [v1, v2, v3, v4], (p)
+	RET
--- a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_other.go
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_other.go
@ -0,0 +1,10 @@
+// +build !arm noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+func ChecksumZero(input []byte) uint32 { return checksumZeroGo(input) }
+
+func update(v *[4]uint32, buf *[16]byte, input []byte) {
+	updateGo(v, buf, input)
+}
--- a/vendor/github.com/pierrec/lz4/v4/lz4.go
+++ b/vendor/github.com/pierrec/lz4/v4/lz4.go
@ -0,0 +1,157 @@
+// Package lz4 implements reading and writing lz4 compressed data.
+//
+// The package supports both the LZ4 stream format,
+// as specified in http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html,
+// and the LZ4 block format, defined at
+// http://fastcompression.blogspot.fr/2011/05/lz4-explained.html.
+//
+// See https://github.com/lz4/lz4 for the reference C implementation.
+package lz4
+
+import (
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+// Compile-time assertions that the constants exported by this package have
+// the same values as their lz4block counterparts: indexing the one-element
+// array with a non-zero constant does not compile.
+func _() {
+	// Safety checks for duplicated elements.
+	var x [1]struct{}
+	_ = x[lz4block.CompressionLevel(Fast)-lz4block.Fast]
+	_ = x[Block64Kb-BlockSize(lz4block.Block64Kb)]
+	_ = x[Block256Kb-BlockSize(lz4block.Block256Kb)]
+	_ = x[Block1Mb-BlockSize(lz4block.Block1Mb)]
+	_ = x[Block4Mb-BlockSize(lz4block.Block4Mb)]
+}
+
+// CompressBlockBound returns the maximum size of the compressed form of a
+// buffer of size n, which is reached when the data is not compressible.
+func CompressBlockBound(n int) int {
+	return lz4block.CompressBlockBound(n)
+}
+
+// UncompressBlock uncompresses the source buffer into the destination one,
+// and returns the uncompressed size. No dictionary is used.
+//
+// The destination buffer must be sized appropriately.
+//
+// An error is returned if the source data is invalid or the destination buffer is too small.
+func UncompressBlock(src, dst []byte) (int, error) {
+	return lz4block.UncompressBlock(src, dst, nil)
+}
+
+// UncompressBlockWithDict uncompresses the source buffer into the destination one using a
+// dictionary, and returns the uncompressed size.
+// It is UncompressBlock with the dictionary dict instead of a nil one.
+//
+// The destination buffer must be sized appropriately.
+//
+// An error is returned if the source data is invalid or the destination buffer is too small.
+func UncompressBlockWithDict(src, dst, dict []byte) (int, error) {
+	return lz4block.UncompressBlock(src, dst, dict)
+}
+
+// A Compressor compresses data into the LZ4 block format.
+// It uses a fast compression algorithm.
+//
+// A Compressor is not safe for concurrent use by multiple goroutines.
+//
+// Use a Writer to compress into the LZ4 stream format.
+//
+// It wraps an lz4block.Compressor.
+type Compressor struct{ c lz4block.Compressor }
+
+// CompressBlock compresses the source buffer src into the destination dst.
+//
+// If compression is successful, the first return value is the size of the
+// compressed data, which is always >0.
+//
+// If dst has length at least CompressBlockBound(len(src)), compression always
+// succeeds. Otherwise, the first return value is zero. The error return is
+// non-nil if the compressed data does not fit in dst, but it might fit in a
+// larger buffer that is still smaller than CompressBlockBound(len(src)). The
+// return value (0, nil) means the data is likely incompressible and a buffer
+// of length CompressBlockBound(len(src)) should be passed in.
+//
+// The work is delegated to the wrapped lz4block.Compressor.
+func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
+	return c.c.CompressBlock(src, dst)
+}
+
+// CompressBlock compresses the source buffer into the destination one.
+// This is the fast version of LZ4 compression and also the default one.
+//
+// CompressBlock is equivalent to Compressor.CompressBlock.
+// The final argument, formerly scratch space for a hash table, is ignored
+// and should be set to nil.
+//
+// The size of the compressed data is returned.
+//
+// If the destination buffer size is lower than CompressBlockBound and
+// the compressed size is 0 and no error, then the data is incompressible.
+//
+// An error is returned if the destination buffer is too small.
+//
+// Deprecated: Use a Compressor instead.
+func CompressBlock(src, dst []byte, _ []int) (int, error) {
+	return lz4block.CompressBlock(src, dst)
+}
+
+// A CompressorHC compresses data into the LZ4 block format.
+// Its compression ratio is potentially better than that of a Compressor,
+// but it is also slower and requires more memory.
+//
+// A CompressorHC is not safe for concurrent use by multiple goroutines.
+//
+// Use a Writer to compress into the LZ4 stream format.
+type CompressorHC struct {
+	// Level is the maximum search depth for compression.
+	// Values <= 0 mean no maximum.
+	Level CompressionLevel
+	c     lz4block.CompressorHC
+}
+
+// CompressBlock compresses the source buffer src into the destination dst,
+// using c.Level as the compression level.
+//
+// If compression is successful, the first return value is the size of the
+// compressed data, which is always >0.
+//
+// If dst has length at least CompressBlockBound(len(src)), compression always
+// succeeds. Otherwise, the first return value is zero. The error return is
+// non-nil if the compressed data does not fit in dst, but it might fit in a
+// larger buffer that is still smaller than CompressBlockBound(len(src)). The
+// return value (0, nil) means the data is likely incompressible and a buffer
+// of length CompressBlockBound(len(src)) should be passed in.
+func (c *CompressorHC) CompressBlock(src, dst []byte) (int, error) {
+	return c.c.CompressBlock(src, dst, lz4block.CompressionLevel(c.Level))
+}
+
+// CompressBlockHC is equivalent to CompressorHC.CompressBlock, with depth
+// used as the compression level.
+// The final two arguments are ignored and should be set to nil.
+//
+// Deprecated: Use a CompressorHC instead.
+func CompressBlockHC(src, dst []byte, depth CompressionLevel, _, _ []int) (int, error) {
+	return lz4block.CompressBlockHC(src, dst, lz4block.CompressionLevel(depth))
+}
+
+// Errors reported by this package. They are the lz4errors values, exported
+// under the same names.
+const (
+	// ErrInvalidSourceShortBuffer is returned by UncompressBlock or CompressBlock when a compressed
+	// block is corrupted or the destination buffer is not large enough for the uncompressed data.
+	ErrInvalidSourceShortBuffer = lz4errors.ErrInvalidSourceShortBuffer
+	// ErrInvalidFrame is returned when reading an invalid LZ4 archive.
+	ErrInvalidFrame = lz4errors.ErrInvalidFrame
+	// ErrInternalUnhandledState is an internal error.
+	ErrInternalUnhandledState = lz4errors.ErrInternalUnhandledState
+	// ErrInvalidHeaderChecksum is returned when reading a frame.
+	ErrInvalidHeaderChecksum = lz4errors.ErrInvalidHeaderChecksum
+	// ErrInvalidBlockChecksum is returned when reading a frame.
+	ErrInvalidBlockChecksum = lz4errors.ErrInvalidBlockChecksum
+	// ErrInvalidFrameChecksum is returned when reading a frame.
+	ErrInvalidFrameChecksum = lz4errors.ErrInvalidFrameChecksum
+	// ErrOptionInvalidCompressionLevel is returned when the supplied compression level is invalid.
+	ErrOptionInvalidCompressionLevel = lz4errors.ErrOptionInvalidCompressionLevel
+	// ErrOptionClosedOrError is returned when an option is applied to a closed or in error object.
+	ErrOptionClosedOrError = lz4errors.ErrOptionClosedOrError
+	// ErrOptionInvalidBlockSize is returned when the supplied block size is invalid.
+	ErrOptionInvalidBlockSize = lz4errors.ErrOptionInvalidBlockSize
+	// ErrOptionNotApplicable is returned when trying to apply an option to an object not supporting it.
+	ErrOptionNotApplicable = lz4errors.ErrOptionNotApplicable
+	// ErrWriterNotClosed is returned when attempting to reset an unclosed writer.
+	ErrWriterNotClosed = lz4errors.ErrWriterNotClosed
+)
--- a/vendor/github.com/pierrec/lz4/v4/options.go
+++ b/vendor/github.com/pierrec/lz4/v4/options.go
@ -0,0 +1,214 @@
+package lz4
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+//go:generate go run golang.org/x/tools/cmd/stringer -type=BlockSize,CompressionLevel -output options_gen.go
+
+type (
+	// applier is the target of an Option. Its unexported method prevents
+	// implementations outside of this package.
+	applier interface {
+		Apply(...Option) error
+		private()
+	}
+	// Option defines the parameters to setup an LZ4 Writer or Reader.
+	//
+	// The options of this package, when invoked with a nil applier, return
+	// an error whose message describes the option (see Option.String).
+	Option func(applier) error
+)
+
+// String returns a string representation of the option with its parameter(s).
+//
+// The representation is the message of the error that the option returns
+// when it is invoked with a nil applier.
+func (o Option) String() string {
+	description := o(nil)
+	return description.Error()
+}
+
+// Default options.
+var (
+	// DefaultBlockSizeOption selects 4Mb blocks.
+	DefaultBlockSizeOption = BlockSizeOption(Block4Mb)
+	// DefaultChecksumOption enables the content checksum.
+	DefaultChecksumOption  = ChecksumOption(true)
+	// DefaultConcurrency selects a concurrency of 1.
+	DefaultConcurrency     = ConcurrencyOption(1)
+	// defaultOnBlockDone installs the no-op handler.
+	defaultOnBlockDone     = OnBlockDoneOption(nil)
+)
+
+// Supported block sizes, in bytes. Each one is 4 times the previous one.
+const (
+	Block64Kb BlockSize = 1 << (16 + iota*2)
+	Block256Kb
+	Block1Mb
+	Block4Mb
+)
+
+// BlockSize defines the size of the blocks to be compressed.
+type BlockSize uint32
+
+// BlockSizeOption defines the maximum size of compressed blocks (default=Block4Mb).
+func BlockSizeOption(size BlockSize) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case nil:
+			s := fmt.Sprintf("BlockSizeOption(%s)", size)
+			return lz4errors.Error(s)
+		case *Writer:
+			size := uint32(size)
+			if !lz4block.IsValid(size) {
+				return fmt.Errorf("%w: %d", lz4errors.ErrOptionInvalidBlockSize, size)
+			}
+			w.frame.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(size))
+			return nil
+		}
+		return lz4errors.ErrOptionNotApplicable
+	}
+}
+
+// BlockChecksumOption enables or disables block checksum (default=false).
+func BlockChecksumOption(flag bool) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case nil:
+			s := fmt.Sprintf("BlockChecksumOption(%v)", flag)
+			return lz4errors.Error(s)
+		case *Writer:
+			w.frame.Descriptor.Flags.BlockChecksumSet(flag)
+			return nil
+		}
+		return lz4errors.ErrOptionNotApplicable
+	}
+}
+
+// ChecksumOption enables/disables all blocks or content checksum (default=true).
+func ChecksumOption(flag bool) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case nil:
+			s := fmt.Sprintf("ChecksumOption(%v)", flag)
+			return lz4errors.Error(s)
+		case *Writer:
+			w.frame.Descriptor.Flags.ContentChecksumSet(flag)
+			return nil
+		}
+		return lz4errors.ErrOptionNotApplicable
+	}
+}
+
+// SizeOption sets the size of the original uncompressed data (default=0). It is useful to know the size of the
+// whole uncompressed data stream.
+func SizeOption(size uint64) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case nil:
+			s := fmt.Sprintf("SizeOption(%d)", size)
+			return lz4errors.Error(s)
+		case *Writer:
+			w.frame.Descriptor.Flags.SizeSet(size > 0)
+			w.frame.Descriptor.ContentSize = size
+			return nil
+		}
+		return lz4errors.ErrOptionNotApplicable
+	}
+}
+
+// ConcurrencyOption sets the number of go routines used for compression.
+// If n <= 0, then the output of runtime.GOMAXPROCS(0) is used.
+//
+// It applies to both a Writer and a Reader.
+func ConcurrencyOption(n int) Option {
+	if n <= 0 {
+		n = runtime.GOMAXPROCS(0)
+	}
+	return func(a applier) error {
+		switch target := a.(type) {
+		case *Reader:
+			target.num = n
+		case *Writer:
+			target.num = n
+		case nil:
+			// Description request, see Option.String.
+			return lz4errors.Error(fmt.Sprintf("ConcurrencyOption(%d)", n))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+		return nil
+	}
+}
+
+// CompressionLevel defines the level of compression to use. The higher the level, the better
+// but also the slower the compression.
+type CompressionLevel uint32
+
+// Supported compression levels. Fast is 0, Level1 is 1<<9 (512) and each
+// following level is twice the previous one.
+const (
+	Fast   CompressionLevel = 0
+	Level1 CompressionLevel = 1 << (8 + iota)
+	Level2
+	Level3
+	Level4
+	Level5
+	Level6
+	Level7
+	Level8
+	Level9
+)
+
+// CompressionLevelOption defines the compression level (default=Fast).
+func CompressionLevelOption(level CompressionLevel) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case nil:
+			s := fmt.Sprintf("CompressionLevelOption(%s)", level)
+			return lz4errors.Error(s)
+		case *Writer:
+			switch level {
+			case Fast, Level1, Level2, Level3, Level4, Level5, Level6, Level7, Level8, Level9:
+			default:
+				return fmt.Errorf("%w: %d", lz4errors.ErrOptionInvalidCompressionLevel, level)
+			}
+			w.level = lz4block.CompressionLevel(level)
+			return nil
+		}
+		return lz4errors.ErrOptionNotApplicable
+	}
+}
+
+// onBlockDone is the no-op handler used by OnBlockDoneOption when it is
+// given a nil handler.
+func onBlockDone(int) {}
+
+// OnBlockDoneOption is triggered when a block has been processed. For a Writer, it is when is has been compressed,
+// for a Reader, it is when it has been uncompressed.
+//
+// A nil handler is replaced with a no-op one.
+func OnBlockDoneOption(handler func(size int)) Option {
+	if handler == nil {
+		handler = onBlockDone
+	}
+	return func(a applier) error {
+		switch target := a.(type) {
+		case *Reader:
+			target.handler = handler
+		case *Writer:
+			target.handler = handler
+		case nil:
+			// Description request, see Option.String.
+			return lz4errors.Error(fmt.Sprintf("OnBlockDoneOption(%s)", reflect.TypeOf(handler).String()))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+		return nil
+	}
+}
+
+// LegacyOption provides support for writing LZ4 frames in the legacy format.
+//
+// See https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md#legacy-frame.
+//
+// NB. compressed Linux kernel images use a tweaked LZ4 legacy format where
+// the compressed stream is followed by the original (uncompressed) size of
+// the kernel (https://events.static.linuxfound.org/sites/events/files/lcjpcojp13_klee.pdf).
+// This is also supported as a special case.
+func LegacyOption(legacy bool) Option {
+	return func(a applier) error {
+		switch rw := a.(type) {
+		case nil:
+			s := fmt.Sprintf("LegacyOption(%v)", legacy)
+			return lz4errors.Error(s)
+		case *Writer:
+			rw.legacy = legacy
+			return nil
+		}
+		return lz4errors.ErrOptionNotApplicable
+	}
+}
--- a/vendor/github.com/pierrec/lz4/v4/options_gen.go
+++ b/vendor/github.com/pierrec/lz4/v4/options_gen.go
@ -0,0 +1,92 @@
+// Code generated by "stringer -type=BlockSize,CompressionLevel -output options_gen.go"; DO NOT EDIT.
+
+package lz4
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[Block64Kb-65536]
+	_ = x[Block256Kb-262144]
+	_ = x[Block1Mb-1048576]
+	_ = x[Block4Mb-4194304]
+}
+
+const (
+	_BlockSize_name_0 = "Block64Kb"
+	_BlockSize_name_1 = "Block256Kb"
+	_BlockSize_name_2 = "Block1Mb"
+	_BlockSize_name_3 = "Block4Mb"
+)
+
+func (i BlockSize) String() string {
+	switch {
+	case i == 65536:
+		return _BlockSize_name_0
+	case i == 262144:
+		return _BlockSize_name_1
+	case i == 1048576:
+		return _BlockSize_name_2
+	case i == 4194304:
+		return _BlockSize_name_3
+	default:
+		return "BlockSize(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[Fast-0]
+	_ = x[Level1-512]
+	_ = x[Level2-1024]
+	_ = x[Level3-2048]
+	_ = x[Level4-4096]
+	_ = x[Level5-8192]
+	_ = x[Level6-16384]
+	_ = x[Level7-32768]
+	_ = x[Level8-65536]
+	_ = x[Level9-131072]
+}
+
+const (
+	_CompressionLevel_name_0 = "Fast"
+	_CompressionLevel_name_1 = "Level1"
+	_CompressionLevel_name_2 = "Level2"
+	_CompressionLevel_name_3 = "Level3"
+	_CompressionLevel_name_4 = "Level4"
+	_CompressionLevel_name_5 = "Level5"
+	_CompressionLevel_name_6 = "Level6"
+	_CompressionLevel_name_7 = "Level7"
+	_CompressionLevel_name_8 = "Level8"
+	_CompressionLevel_name_9 = "Level9"
+)
+
+func (i CompressionLevel) String() string {
+	switch {
+	case i == 0:
+		return _CompressionLevel_name_0
+	case i == 512:
+		return _CompressionLevel_name_1
+	case i == 1024:
+		return _CompressionLevel_name_2
+	case i == 2048:
+		return _CompressionLevel_name_3
+	case i == 4096:
+		return _CompressionLevel_name_4
+	case i == 8192:
+		return _CompressionLevel_name_5
+	case i == 16384:
+		return _CompressionLevel_name_6
+	case i == 32768:
+		return _CompressionLevel_name_7
+	case i == 65536:
+		return _CompressionLevel_name_8
+	case i == 131072:
+		return _CompressionLevel_name_9
+	default:
+		return "CompressionLevel(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
--- a/vendor/github.com/pierrec/lz4/v4/reader.go
+++ b/vendor/github.com/pierrec/lz4/v4/reader.go
@ -0,0 +1,275 @@
+package lz4
+
+import (
+	"bytes"
+	"io"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/lz4stream"
+)
+
+// readerStates is the Reader's transition table: indexing it with the current
+// state yields the state to move to after a successful step (see _State.next).
+// Its first entry is also the state a Reader starts in and returns to on Reset.
+var readerStates = []aState{
+	noState:     newState,
+	errorState:  newState,
+	newState:    readState,
+	readState:   closedState,
+	closedState: newState,
+}
+
+// NewReader returns a new LZ4 frame decoder reading from r.
+// The decoder starts with the package's default concurrency and default
+// block handler; use Apply before the first Read to change them.
+func NewReader(r io.Reader) *Reader {
+	return newReader(r, false)
+}
+
+// newReader builds a Reader over r with the default options applied.
+// The legacy argument is accepted but not used by this function.
+func newReader(r io.Reader, legacy bool) *Reader {
+	rd := new(Reader)
+	rd.frame = lz4stream.NewFrame()
+	rd.state.init(readerStates)
+	// Install the defaults; the error returned by Apply is discarded.
+	_ = rd.Apply(DefaultConcurrency, defaultOnBlockDone)
+	rd.Reset(r)
+	return rd
+}
+
+// Reader allows reading an LZ4 stream.
+type Reader struct {
+	state   _State           // current position in the reader state machine
+	src     io.Reader        // source reader
+	num     int              // concurrency level
+	frame   *lz4stream.Frame // frame being read
+	data    []byte           // block buffer allocated in non concurrent mode
+	reads   chan []byte      // pending data
+	idx     int              // size of pending data
+	handler func(int)        // called with a byte count as data is delivered (see Read and WriteTo)
+	cum     uint32           // running total of uncompressed bytes produced by read
+	dict    []byte           // recent output kept for decoding dependent blocks
+}
+
+// private is an empty marker method.
+// NOTE(review): presumably this is what lets *Reader satisfy the applier interface used by Option — confirm in options.go.
+func (*Reader) private() {}
+
+// Apply runs the given options against the Reader, stopping at the first one
+// that fails. Options are only accepted while the Reader is in its new state:
+// if it is in error the stored error is returned, and in any other state
+// ErrOptionClosedOrError is returned. A non-nil result is also recorded in the
+// Reader state by the deferred check.
+func (r *Reader) Apply(options ...Option) (err error) {
+	defer r.state.check(&err)
+	current := r.state.state
+	if current == errorState {
+		return r.state.err
+	}
+	if current != newState {
+		return lz4errors.ErrOptionClosedOrError
+	}
+	for i := range options {
+		if err = options[i](r); err != nil {
+			break
+		}
+	}
+	return err
+}
+
+// Size reports the uncompressed size recorded in the frame descriptor.
+// It returns 0 when the Reader is neither reading nor closed, or when the
+// descriptor's size flag is not set.
+func (r *Reader) Size() int {
+	st := r.state.state
+	if st != readState && st != closedState {
+		return 0
+	}
+	desc := &r.frame.Descriptor
+	if !desc.Flags.Size() {
+		return 0
+	}
+	return int(desc.ContentSize)
+}
+
+// isNotConcurrent reports whether the concurrency level is exactly 1, in which
+// case blocks are uncompressed by the calling goroutine through r.read rather
+// than received from the r.reads channel.
+func (r *Reader) isNotConcurrent() bool {
+	return r.num == 1
+}
+
+// init prepares the Reader for a new frame: it parses the frame headers from
+// r.src, calls InitR on the frame and keeps the channel it returns as r.reads,
+// then resets the per-frame bookkeeping (r.idx, r.data, r.cum).
+// It returns the first error reported by ParseHeaders or InitR.
+func (r *Reader) init() error {
+	err := r.frame.ParseHeaders(r.src)
+	if err != nil {
+		return err
+	}
+	if !r.frame.Descriptor.Flags.BlockIndependence() {
+		// We can't decompress dependent blocks concurrently.
+		// Instead of throwing an error to the user, silently drop concurrency
+		r.num = 1
+	}
+	data, err := r.frame.InitR(r.src, r.num)
+	if err != nil {
+		return err
+	}
+	r.reads = data
+	r.idx = 0
+	// Take a block buffer sized according to the frame's block size setting.
+	size := r.frame.Descriptor.Flags.BlockSizeIndex()
+	r.data = size.Get()
+	r.cum = 0
+	return nil
+}
+
+// Read uncompresses data from the underlying source into buf and returns the
+// number of bytes stored. It has the io.Reader method signature.
+//
+// The first call runs init, which parses the frame headers. When the block
+// source reports io.EOF the frame is closed with CloseR and io.EOF (or the
+// CloseR error, if any) is returned. Any returned error is also recorded in
+// the Reader state by the deferred check.
+func (r *Reader) Read(buf []byte) (n int, err error) {
+	defer r.state.check(&err)
+	switch r.state.state {
+	case readState:
+	case closedState, errorState:
+		return 0, r.state.err
+	case newState:
+		// First initialization.
+		if err = r.init(); r.state.next(err) {
+			return
+		}
+	default:
+		return 0, r.state.fail()
+	}
+	for len(buf) > 0 {
+		var bn int
+		if r.idx == 0 {
+			// Nothing is left over from a previous block: fetch the next one.
+			if r.isNotConcurrent() {
+				bn, err = r.read(buf)
+			} else {
+				// Hand the previous block back to the pool before taking the next one from the channel.
+				lz4block.Put(r.data)
+				r.data = <-r.reads
+				if len(r.data) == 0 {
+					// No uncompressed data: something went wrong or we are done.
+					err = r.frame.Blocks.ErrorR()
+				}
+			}
+			switch err {
+			case nil:
+			case io.EOF:
+				// End of frame: close it; a CloseR failure takes precedence over io.EOF.
+				if er := r.frame.CloseR(r.src); er != nil {
+					err = er
+				}
+				lz4block.Put(r.data)
+				r.data = nil
+				return
+			default:
+				return
+			}
+		}
+		if bn == 0 {
+			// The block was not written straight into buf, so it sits in r.data.
+			// Fill buf with buffered data.
+			bn = copy(buf, r.data[r.idx:])
+			r.idx += bn
+			if r.idx == len(r.data) {
+				// All data read, get ready for the next Read.
+				r.idx = 0
+			}
+		}
+		buf = buf[bn:]
+		n += bn
+		// Report the number of bytes just delivered to the caller.
+		r.handler(bn)
+	}
+	return
+}
+
+// read uncompresses the next block as follows:
+//   - if buf has enough room, the block is uncompressed into it directly
+//     and the length of the used space is returned
+//   - else, the uncompressed data is stored in r.data and 0 is returned
+//
+// It is only used in non-concurrent mode. An error from reading or
+// uncompressing the block is returned with a zero length.
+func (r *Reader) read(buf []byte) (int, error) {
+	block := r.frame.Blocks.Block
+	_, err := block.Read(r.frame, r.src, r.cum)
+	if err != nil {
+		return 0, err
+	}
+	var direct bool
+	// Default destination: the Reader's own buffer, extended to its full capacity.
+	dst := r.data[:cap(r.data)]
+	if len(buf) >= len(dst) {
+		// Uncompress directly into buf.
+		direct = true
+		dst = buf
+	}
+	dst, err = block.Uncompress(r.frame, dst, r.dict, true)
+	if err != nil {
+		return 0, err
+	}
+	if !r.frame.Descriptor.Flags.BlockIndependence() {
+		// Dependent blocks: keep recent output in r.dict for the next block.
+		if len(r.dict)+len(dst) > 128*1024 {
+			// Trim the dictionary so that, once dst is appended, only the most
+			// recent 64 KiB remain (nothing older is kept when dst alone is
+			// 64 KiB or more).
+			preserveSize := 64*1024 - len(dst)
+			if preserveSize < 0 {
+				preserveSize = 0
+			}
+			r.dict = r.dict[len(r.dict)-preserveSize:]
+		}
+		r.dict = append(r.dict, dst...)
+	}
+	r.cum += uint32(len(dst))
+	if direct {
+		return len(dst), nil
+	}
+	// The data is in the Reader's buffer; the caller copies it out of r.data.
+	r.data = dst
+	return 0, nil
+}
+
+// Reset returns the Reader to the state it had right after NewReader, except
+// that it will now read from reader. Any block buffer still held is handed
+// back to the pool. The reader argument itself is not accessed.
+func (r *Reader) Reset(reader io.Reader) {
+	if held := r.data; held != nil {
+		lz4block.Put(held)
+		r.data = nil
+	}
+	r.frame.Reset(r.num)
+	r.state.reset()
+	r.src, r.reads = reader, nil
+}
+
+// WriteTo uncompresses the data from the Reader's underlying source and writes
+// it to w, returning the number of bytes written. It has the io.WriterTo
+// method signature.
+//
+// It must be called on a Reader that has not started reading: in the new state
+// the frame is initialized first, in the closed or error state the stored
+// error is returned, and any other state is reported as an unhandled state.
+// When the block source reports io.EOF the frame is closed and the result of
+// CloseR is returned (nil on success, not io.EOF). The deferred nextd advances
+// the state on success or records the error otherwise.
+func (r *Reader) WriteTo(w io.Writer) (n int64, err error) {
+	switch r.state.state {
+	case closedState, errorState:
+		return 0, r.state.err
+	case newState:
+		if err = r.init(); r.state.next(err) {
+			return
+		}
+	default:
+		return 0, r.state.fail()
+	}
+	defer r.state.nextd(&err)
+
+	var data []byte
+	if r.isNotConcurrent() {
+		// A single scratch buffer is reused for every block.
+		size := r.frame.Descriptor.Flags.BlockSizeIndex()
+		data = size.Get()
+		defer lz4block.Put(data)
+	}
+	// dst lives outside the loop so that, in concurrent mode, the block received
+	// on the previous iteration is still known and can be returned to the pool
+	// once w.Write is done with it. Declaring it inside the loop would make the
+	// Put below always receive a nil slice.
+	var dst []byte
+	for {
+		var bn int
+		if r.isNotConcurrent() {
+			bn, err = r.read(data)
+			dst = data[:bn]
+		} else {
+			lz4block.Put(dst)
+			dst = <-r.reads
+			bn = len(dst)
+			if bn == 0 {
+				// No uncompressed data: something went wrong or we are done.
+				err = r.frame.Blocks.ErrorR()
+			}
+		}
+		switch err {
+		case nil:
+		case io.EOF:
+			err = r.frame.CloseR(r.src)
+			return
+		default:
+			return
+		}
+		r.handler(bn)
+		bn, err = w.Write(dst)
+		n += int64(bn)
+		if err != nil {
+			return
+		}
+	}
+}
+
+// ValidFrameHeader reports whether in starts with a valid LZ4 frame header.
+// It returns (true, nil) when the headers parse, (false, nil) when parsing
+// fails with ErrInvalidFrame, and (false, err) for any other parsing error.
+func ValidFrameHeader(in []byte) (bool, error) {
+	switch err := lz4stream.NewFrame().ParseHeaders(bytes.NewReader(in)); err {
+	case nil:
+		return true, nil
+	case lz4errors.ErrInvalidFrame:
+		return false, nil
+	default:
+		return false, err
+	}
+}
--- a/vendor/github.com/pierrec/lz4/v4/state.go
+++ b/vendor/github.com/pierrec/lz4/v4/state.go
@ -0,0 +1,75 @@
+package lz4
+
+import (
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+//go:generate go run golang.org/x/tools/cmd/stringer -type=aState -output state_gen.go
+
+// States of the Reader/Writer state machine. The numeric values are used as
+// indexes into the transition tables and are pinned by the generated
+// state_gen.go, so the order must not change without regenerating it.
+const (
+	noState     aState = iota // uninitialized reader
+	errorState                // unrecoverable error encountered
+	newState                  // instantiated object
+	readState                 // reading data
+	writeState                // writing data
+	closedState               // all done
+)
+
+type (
+	// aState identifies one step in the Reader/Writer life cycle.
+	aState uint8
+	// _State is a small state machine: states is the transition table
+	// (indexed by the current state), state is the current state and err
+	// is the error most recently recorded by next, check or fail.
+	_State struct {
+		states []aState
+		state  aState
+		err    error
+	}
+)
+
+// init installs the transition table and sets the current state to its first entry.
+// states must not be empty.
+func (s *_State) init(states []aState) {
+	s.states = states
+	s.state = states[0]
+}
+
+// reset returns to the first entry of the transition table and clears any recorded error.
+func (s *_State) reset() {
+	s.state = s.states[0]
+	s.err = nil
+}
+
+// next advances to the state that follows the current one in the transition
+// table, unless err is non-nil; in that case err is recorded, prefixed with
+// the current state, and the machine moves to errorState.
+// It reports whether the machine is now in error.
+func (s *_State) next(err error) bool {
+	if err == nil {
+		s.state = s.states[s.state]
+		return false
+	}
+	s.err = fmt.Errorf("%s: %w", s.state, err)
+	s.state = errorState
+	return true
+}
+
+// nextd is the defer-friendly form of next: it takes a pointer to the error so
+// that the value is read when the deferred call runs. A nil pointer is a no-op
+// that reports false.
+func (s *_State) nextd(errp *error) bool {
+	if errp == nil {
+		return false
+	}
+	return s.next(*errp)
+}
+
+// check records *errp, suffixed with the current state, when it is non-nil and
+// the machine is not already in error. The machine then moves to errorState
+// unless the error is io.EOF, which is recorded without changing the state.
+// A nil errp is ignored.
+func (s *_State) check(errp *error) {
+	if errp == nil || s.state == errorState {
+		return
+	}
+	err := *errp
+	if err == nil {
+		return
+	}
+	s.err = fmt.Errorf("%w[%s]", err, s.state)
+	if errors.Is(err, io.EOF) {
+		return
+	}
+	s.state = errorState
+}
+
+// fail puts the machine in errorState because an operation was attempted in a
+// state it does not handle. The recorded (and returned) error wraps
+// ErrInternalUnhandledState and names the state that was not handled.
+func (s *_State) fail() error {
+	// Capture the offending state before it is overwritten; otherwise the
+	// message would always report errorState.
+	unhandled := s.state
+	s.state = errorState
+	s.err = fmt.Errorf("%w[%s]", lz4errors.ErrInternalUnhandledState, unhandled)
+	return s.err
+}
--- a/vendor/github.com/pierrec/lz4/v4/state_gen.go
+++ b/vendor/github.com/pierrec/lz4/v4/state_gen.go
@ -0,0 +1,28 @@
+// Code generated by "stringer -type=aState -output state_gen.go"; DO NOT EDIT.
+
+package lz4
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[noState-0]
+	_ = x[errorState-1]
+	_ = x[newState-2]
+	_ = x[readState-3]
+	_ = x[writeState-4]
+	_ = x[closedState-5]
+}
+
+const _aState_name = "noStateerrorStatenewStatereadStatewriteStateclosedState"
+
+var _aState_index = [...]uint8{0, 7, 17, 25, 34, 44, 55}
+
+func (i aState) String() string {
+	if i >= aState(len(_aState_index)-1) {
+		return "aState(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _aState_name[_aState_index[i]:_aState_index[i+1]]
+}
--- a/vendor/github.com/pierrec/lz4/v4/writer.go
+++ b/vendor/github.com/pierrec/lz4/v4/writer.go
@ -0,0 +1,242 @@
+package lz4
+
+import (
+	"io"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/lz4stream"
+)
+
+// writerStates is the Writer's transition table: indexing it with the current
+// state yields the state to move to after a successful step (see _State.next).
+// Its first entry is also the state a Writer starts in and returns to on Reset.
+var writerStates = []aState{
+	noState:     newState,
+	newState:    writeState,
+	writeState:  closedState,
+	closedState: newState,
+	errorState:  newState,
+}
+
+// NewWriter returns a new LZ4 frame encoder writing to w, configured with the
+// package's default block size, checksum, concurrency and block handler.
+func NewWriter(w io.Writer) *Writer {
+	wr := new(Writer)
+	wr.frame = lz4stream.NewFrame()
+	wr.state.init(writerStates)
+	// Install the defaults; the error returned by Apply is discarded.
+	_ = wr.Apply(DefaultBlockSizeOption, DefaultChecksumOption, DefaultConcurrency, defaultOnBlockDone)
+	wr.Reset(w)
+	return wr
+}
+
+// Writer allows writing an LZ4 stream.
+type Writer struct {
+	state   _State                    // current position in the writer state machine
+	src     io.Writer                 // destination writer
+	level   lz4block.CompressionLevel // how hard to try
+	num     int                       // concurrency level
+	frame   *lz4stream.Frame          // frame being built
+	data    []byte                    // pending data
+	idx     int                       // size of pending data
+	handler func(int)                 // called after each block is processed (see write)
+	legacy  bool                      // passed to the frame's InitW; set by LegacyOption
+}
+
+// private is an empty marker method.
+// NOTE(review): presumably this is what lets *Writer satisfy the applier interface used by Option — confirm in options.go.
+func (*Writer) private() {}
+
+// Apply runs the given options against the Writer, stopping at the first one
+// that fails. Options are only accepted while the Writer is in its new state:
+// if it is in error the stored error is returned, and in any other state
+// ErrOptionClosedOrError is returned. The Writer is reset onto its current
+// destination before the options run. A non-nil result is also recorded in the
+// Writer state by the deferred check.
+func (w *Writer) Apply(options ...Option) (err error) {
+	defer w.state.check(&err)
+	current := w.state.state
+	if current == errorState {
+		return w.state.err
+	}
+	if current != newState {
+		return lz4errors.ErrOptionClosedOrError
+	}
+	w.Reset(w.src)
+	for i := range options {
+		if err = options[i](w); err != nil {
+			break
+		}
+	}
+	return err
+}
+
+// isNotConcurrent reports whether the concurrency level is exactly 1, in which
+// case blocks are compressed and written by the calling goroutine instead of
+// being dispatched to goroutines (see write).
+func (w *Writer) isNotConcurrent() bool {
+	return w.num == 1
+}
+
+// init sets up the Writer when in newState. It does not change the Writer state.
+// It initializes the frame for writing, takes a block buffer sized according
+// to the frame's block size setting, clears the pending count and writes the
+// frame descriptor to w.src, returning the error from that write.
+func (w *Writer) init() error {
+	w.frame.InitW(w.src, w.num, w.legacy)
+	size := w.frame.Descriptor.Flags.BlockSizeIndex()
+	w.data = size.Get()
+	w.idx = 0
+	return w.frame.Descriptor.Write(w.frame, w.src)
+}
+
+// Write compresses buf into the Writer's frame and returns the number of bytes
+// of buf that were consumed. It has the io.Writer method signature.
+//
+// Data is gathered into w.data and compressed one full block at a time; a
+// trailing partial block stays buffered until more data arrives or Flush is
+// called. The first call runs init. Any returned error is also recorded in the
+// Writer state by the deferred check.
+func (w *Writer) Write(buf []byte) (n int, err error) {
+	defer w.state.check(&err)
+	switch w.state.state {
+	case writeState:
+	case closedState, errorState:
+		return 0, w.state.err
+	case newState:
+		if err = w.init(); w.state.next(err) {
+			return
+		}
+	default:
+		return 0, w.state.fail()
+	}
+
+	// zn is the size of one full block.
+	zn := len(w.data)
+	for len(buf) > 0 {
+		if w.isNotConcurrent() && w.idx == 0 && len(buf) >= zn {
+			// Avoid a copy as there is enough data for a block.
+			// safe is false because buf belongs to the caller, not to the block pool.
+			if err = w.write(buf[:zn], false); err != nil {
+				return
+			}
+			n += zn
+			buf = buf[zn:]
+			continue
+		}
+		// Accumulate the data to be compressed.
+		m := copy(w.data[w.idx:], buf)
+		n += m
+		w.idx += m
+		buf = buf[m:]
+
+		if w.idx < len(w.data) {
+			// Buffer not filled.
+			return
+		}
+
+		// Buffer full.
+		if err = w.write(w.data, true); err != nil {
+			return
+		}
+		if !w.isNotConcurrent() {
+			// The goroutine started by write now owns the old buffer and returns
+			// it to the pool itself, so continue with a fresh one.
+			size := w.frame.Descriptor.Flags.BlockSizeIndex()
+			w.data = size.Get()
+		}
+		w.idx = 0
+	}
+	return
+}
+
+// write compresses data as one block and sends it towards w.src.
+//
+// In non-concurrent mode the block is compressed and written synchronously and
+// the write error is returned. In concurrent mode a result channel is queued on
+// w.frame.Blocks.Blocks and the compression runs in a new goroutine; write then
+// returns nil without waiting for the outcome.
+//
+// safe tells whether data may be returned to the block pool once the goroutine
+// is done with it; it is only consulted in concurrent mode.
+func (w *Writer) write(data []byte, safe bool) error {
+	if w.isNotConcurrent() {
+		block := w.frame.Blocks.Block
+		err := block.Compress(w.frame, data, w.level).Write(w.frame, w.src)
+		// The handler is invoked even when the write failed.
+		w.handler(len(block.Data))
+		return err
+	}
+	c := make(chan *lz4stream.FrameDataBlock)
+	// Queue the channel first so results are consumed in submission order.
+	// NOTE(review): ordering is presumed from the channel-of-channels shape — confirm in lz4stream.
+	w.frame.Blocks.Blocks <- c
+	go func(c chan *lz4stream.FrameDataBlock, data []byte, safe bool) {
+		b := lz4stream.NewFrameDataBlock(w.frame)
+		c <- b.Compress(w.frame, data, w.level)
+		// Wait until the consumer is done with the block.
+		<-c
+		w.handler(len(b.Data))
+		b.Close(w.frame)
+		if safe {
+			// safe to put it back as the last usage of it was FrameDataBlock.Write() called before c is closed
+			lz4block.Put(data)
+		}
+	}(c, data, safe)
+
+	return nil
+}
+
+// Flush submits any buffered data as a block without waiting for the buffer
+// to fill up. A Writer in its new state is initialized first; a Writer in
+// error returns its stored error; in any other non-writing state Flush does
+// nothing and returns nil.
+func (w *Writer) Flush() (err error) {
+	switch w.state.state {
+	case errorState:
+		return w.state.err
+	case newState:
+		if err = w.init(); w.state.next(err) {
+			return err
+		}
+	case writeState:
+		// Ready to flush.
+	default:
+		return nil
+	}
+
+	if w.idx == 0 {
+		return nil
+	}
+	// The pending bytes are passed with safe=false: w.data stays owned by the
+	// Writer and is only released later, in Close.
+	if err = w.write(w.data[:w.idx], false); err != nil {
+		return err
+	}
+	w.idx = 0
+	return nil
+}
+
+// Close flushes any unwritten data, finishes the frame on the underlying
+// writer and releases the Writer's block buffer. The underlying writer itself
+// is not closed. If the flush fails its error is returned and nothing else is
+// done; otherwise the error from finishing the frame is returned.
+func (w *Writer) Close() error {
+	if flushErr := w.Flush(); flushErr != nil {
+		return flushErr
+	}
+	closeErr := w.frame.CloseW(w.src, w.num)
+	// Only now can the buffer go back to the pool.
+	if held := w.data; held != nil {
+		lz4block.Put(held)
+		w.data = nil
+	}
+	return closeErr
+}
+
+// Reset clears the state of the Writer w such that it is equivalent to its
+// initial state from NewWriter, but instead writing to writer.
+// Reset takes no options: the Writer's own option fields (level, num, handler,
+// legacy) are left untouched; use Apply afterwards to change them.
+// No access to writer is performed.
+//
+// w.Close must be called before Reset or pending data may be dropped.
+func (w *Writer) Reset(writer io.Writer) {
+	w.frame.Reset(w.num)
+	w.state.reset()
+	w.src = writer
+}
+
+// ReadFrom reads from r until EOF, compresses the data into the Writer's
+// destination one block at a time and returns the number of bytes read from r.
+// It has the io.ReaderFrom method signature.
+//
+// It must be called on a Writer in its new state: the frame is initialized
+// first. In the closed or error state the stored error is returned, and any
+// other state (including one reached by a previous Write) is reported as an
+// unhandled state. Errors returned after initialization are recorded in the
+// Writer state by the deferred check.
+func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
+	switch w.state.state {
+	case closedState, errorState:
+		return 0, w.state.err
+	case newState:
+		if err = w.init(); w.state.next(err) {
+			return
+		}
+	default:
+		return 0, w.state.fail()
+	}
+	defer w.state.check(&err)
+
+	size := w.frame.Descriptor.Flags.BlockSizeIndex()
+	var done bool
+	var rn int
+	data := size.Get()
+	if w.isNotConcurrent() {
+		// Keep the same buffer for the whole process.
+		defer lz4block.Put(data)
+	}
+	for !done {
+		// Try to fill a whole block; a short or empty read marks the end of input.
+		rn, err = io.ReadFull(r, data)
+		switch err {
+		case nil:
+		case io.EOF, io.ErrUnexpectedEOF: // read may be partial
+			done = true
+		default:
+			return
+		}
+		n += int64(rn)
+		// This assignment also replaces the EOF marker, so a clean end of input returns a nil error.
+		err = w.write(data[:rn], true)
+		if err != nil {
+			return
+		}
+		// Reports the number of bytes read; write invokes the handler as well, with the block's data length.
+		w.handler(rn)
+		if !done && !w.isNotConcurrent() {
+			// The buffer will be returned automatically by go routines (safe=true)
+			// so get a new one for the next round.
+			data = size.Get()
+		}
+	}
+	return
+}
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -8,6 +8,9 @@ github.com/alecthomas/chroma/v2/styles
 ## explicit; go 1.13
 github.com/dlclark/regexp2
 github.com/dlclark/regexp2/syntax
+# github.com/golang/snappy v0.0.4
+## explicit
+github.com/golang/snappy
 # github.com/inconshreveable/mousetrap v1.1.0
 ## explicit; go 1.18
 github.com/inconshreveable/mousetrap
@ -23,6 +26,13 @@ github.com/klauspost/compress/internal/cpuinfo
 github.com/klauspost/compress/internal/snapref
 github.com/klauspost/compress/zstd
 github.com/klauspost/compress/zstd/internal/xxhash
+# github.com/pierrec/lz4/v4 v4.1.19
+## explicit; go 1.14
+github.com/pierrec/lz4/v4
+github.com/pierrec/lz4/v4/internal/lz4block
+github.com/pierrec/lz4/v4/internal/lz4errors
+github.com/pierrec/lz4/v4/internal/lz4stream
+github.com/pierrec/lz4/v4/internal/xxh32
 # github.com/spf13/cobra v1.8.0
 ## explicit; go 1.15
 github.com/spf13/cobra