Merge pull request #24841 from rafalpotempa/upgrade-crypto

Security: Upgrade version of golang.org/x/crypto to latest
2020-05-13 09:19:36 -04:00 · 2020-05-13 09:19:36 -04:00 · 9a5e5bb5fc
parent 6cf3f76623 2646e42ac9
commit 9a5e5bb5fc
33 changed files with 973 additions and 1844 deletions
--- a/go.mod
+++ b/go.mod
@ -126,7 +126,7 @@ require (
 	go.uber.org/atomic v1.3.2 // indirect
 	go.uber.org/multierr v1.1.0 // indirect
 	go.uber.org/zap v1.9.1 // indirect
-	golang.org/x/crypto v0.0.0-20191202143827-86a70503ff7e
+	golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37
 	golang.org/x/mod v0.2.0
 	golang.org/x/net v0.0.0-20200202094626-16171245cfb2
 	golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45
--- a/go.sum
+++ b/go.sum
@ -464,6 +464,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550 h1:ObdrDkeb4kJdCP557AjRjq
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20191202143827-86a70503ff7e h1:egKlR8l7Nu9vHGWbcUV8lqR4987UfUbBd7GbhqGzNYU=
 golang.org/x/crypto v0.0.0-20191202143827-86a70503ff7e/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37 h1:cg5LA/zNPRzIXIWSCxQW10Rvpy94aQh3LT/ShoCpkHw=
 golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
 golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
--- a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.go
@ -2,8 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build go1.11
+// +build go1.11,!gccgo,!purego
 // +build !gccgo,!appengine
 package chacha20
--- a/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_arm64.s
@ -2,8 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build go1.11
+// +build go1.11,!gccgo,!purego
 // +build !gccgo,!appengine
 #include "textflag.h"
--- a/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_generic.go
@ -42,10 +42,14 @@ type Cipher struct {
 	// The last len bytes of buf are leftover key stream bytes from the previous
 	// XORKeyStream invocation. The size of buf depends on how many blocks are
-	// computed at a time.
+	// computed at a time by xorKeyStreamBlocks.
 	buf [bufSize]byte
 	len int
 	// overflow is set when the counter overflowed, no more blocks can be
 	// generated, and the next XORKeyStream call should panic.
 	overflow bool
 	// The counter-independent results of the first round are cached after they
 	// are computed the first time.
 	precompDone      bool
@ -89,6 +93,7 @@ func newUnauthenticatedCipher(c *Cipher, key, nonce []byte) (*Cipher, error) {
 		return nil, errors.New("chacha20: wrong nonce size")
 	}
 	key, nonce = key[:KeySize], nonce[:NonceSize] // bounds check elimination hint
 	c.key = [8]uint32{
 		binary.LittleEndian.Uint32(key[0:4]),
 		binary.LittleEndian.Uint32(key[4:8]),
@ -136,6 +141,36 @@ func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
 	return a, b, c, d
 }
 // SetCounter sets the Cipher counter. The next invocation of XORKeyStream will
 // behave as if (64 * counter) bytes had been encrypted so far.
 //
 // To prevent accidental counter reuse, SetCounter panics if counter is less
 // than the current value.
 //
 // Note that the execution time of XORKeyStream is not independent of the
 // counter value.
 func (s *Cipher) SetCounter(counter uint32) {
 	// Internally, s may buffer multiple blocks, which complicates this
 	// implementation slightly. When checking whether the counter has rolled
 	// back, we must use both s.counter and s.len to determine how many blocks
 	// we have already output.
 	outputCounter := s.counter - uint32(s.len)/blockSize
 	if s.overflow || counter < outputCounter {
 		panic("chacha20: SetCounter attempted to rollback counter")
 	}
 	// In the general case, we set the new counter value and reset s.len to 0,
 	// causing the next call to XORKeyStream to refill the buffer. However, if
 	// we're advancing within the existing buffer, we can save work by simply
 	// setting s.len.
 	if counter < s.counter {
 		s.len = int(s.counter-counter) * blockSize
 	} else {
 		s.counter = counter
 		s.len = 0
 	}
 }
 // XORKeyStream XORs each byte in the given slice with a byte from the
 // cipher's key stream. Dst and src must overlap entirely or not at all.
 //
@ -169,34 +204,52 @@ func (s *Cipher) XORKeyStream(dst, src []byte) {
 			dst[i] = src[i] ^ b
 		}
 		s.len -= len(keyStream)
-		src = src[len(keyStream):]
+		dst, src = dst[len(keyStream):], src[len(keyStream):]
-		dst = dst[len(keyStream):]
+	}
 	if len(src) == 0 {
 		return
 	}
-	const blocksPerBuf = bufSize / blockSize
+	// If we'd need to let the counter overflow and keep generating output,
-	numBufs := (uint64(len(src)) + bufSize - 1) / bufSize
+	// panic immediately. If instead we'd only reach the last block, remember
-	if uint64(s.counter)+numBufs*blocksPerBuf >= 1<<32 {
+	// not to generate any more output after the buffer is drained.
 	numBlocks := (uint64(len(src)) + blockSize - 1) / blockSize
 	if s.overflow || uint64(s.counter)+numBlocks > 1<<32 {
 		panic("chacha20: counter overflow")
 	} else if uint64(s.counter)+numBlocks == 1<<32 {
 		s.overflow = true
 	}
 	// xorKeyStreamBlocks implementations expect input lengths that are a
 	// multiple of bufSize. Platform-specific ones process multiple blocks at a
 	// time, so have bufSizes that are a multiple of blockSize.
-	rem := len(src) % bufSize
+	full := len(src) - len(src)%bufSize
 	full := len(src) - rem
 	if full > 0 {
 		s.xorKeyStreamBlocks(dst[:full], src[:full])
 	}
 	dst, src = dst[full:], src[full:]
 	// If using a multi-block xorKeyStreamBlocks would overflow, use the generic
 	// one that does one block at a time.
 	const blocksPerBuf = bufSize / blockSize
 	if uint64(s.counter)+blocksPerBuf > 1<<32 {
 		s.buf = [bufSize]byte{}
 		numBlocks := (len(src) + blockSize - 1) / blockSize
 		buf := s.buf[bufSize-numBlocks*blockSize:]
 		copy(buf, src)
 		s.xorKeyStreamBlocksGeneric(buf, buf)
 		s.len = len(buf) - copy(dst, buf)
 		return
 	}
 	// If we have a partial (multi-)block, pad it for xorKeyStreamBlocks, and
 	// keep the leftover keystream for the next XORKeyStream invocation.
-	if rem > 0 {
+	if len(src) > 0 {
 		s.buf = [bufSize]byte{}
-		copy(s.buf[:], src[full:])
+		copy(s.buf[:], src)
 		s.xorKeyStreamBlocks(s.buf[:], s.buf[:])
-		s.len = bufSize - copy(dst[full:], s.buf[:])
+		s.len = bufSize - copy(dst, s.buf[:])
 	}
 }
@ -233,7 +286,9 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
 		s.precompDone = true
 	}
-	for i := 0; i < len(src); i += blockSize {
+	// A condition of len(src) > 0 would be sufficient, but this also
 	// acts as a bounds check elimination hint.
 	for len(src) >= 64 && len(dst) >= 64 {
 		// The remainder of the first column round.
 		fcr0, fcr4, fcr8, fcr12 := quarterRound(c0, c4, c8, s.counter)
@ -258,49 +313,28 @@ func (s *Cipher) xorKeyStreamBlocksGeneric(dst, src []byte) {
 			x3, x4, x9, x14 = quarterRound(x3, x4, x9, x14)
 		}
-		// Finally, add back the initial state to generate the key stream.
+		// Add back the initial state to generate the key stream, then
-		x0 += c0
+		// XOR the key stream with the source and write out the result.
-		x1 += c1
+		addXor(dst[0:4], src[0:4], x0, c0)
-		x2 += c2
+		addXor(dst[4:8], src[4:8], x1, c1)
-		x3 += c3
+		addXor(dst[8:12], src[8:12], x2, c2)
-		x4 += c4
+		addXor(dst[12:16], src[12:16], x3, c3)
-		x5 += c5
+		addXor(dst[16:20], src[16:20], x4, c4)
-		x6 += c6
+		addXor(dst[20:24], src[20:24], x5, c5)
-		x7 += c7
+		addXor(dst[24:28], src[24:28], x6, c6)
-		x8 += c8
+		addXor(dst[28:32], src[28:32], x7, c7)
-		x9 += c9
+		addXor(dst[32:36], src[32:36], x8, c8)
-		x10 += c10
+		addXor(dst[36:40], src[36:40], x9, c9)
-		x11 += c11
+		addXor(dst[40:44], src[40:44], x10, c10)
-		x12 += s.counter
+		addXor(dst[44:48], src[44:48], x11, c11)
-		x13 += c13
+		addXor(dst[48:52], src[48:52], x12, s.counter)
-		x14 += c14
+		addXor(dst[52:56], src[52:56], x13, c13)
-		x15 += c15
+		addXor(dst[56:60], src[56:60], x14, c14)
 		addXor(dst[60:64], src[60:64], x15, c15)
 		s.counter += 1
 		if s.counter == 0 {
 			panic("chacha20: internal error: counter overflow")
 		}
-		in, out := src[i:], dst[i:]
+		src, dst = src[blockSize:], dst[blockSize:]
 		in, out = in[:blockSize], out[:blockSize] // bounds check elimination hint
 		// XOR the key stream with the source and write out the result.
 		xor(out[0:], in[0:], x0)
 		xor(out[4:], in[4:], x1)
 		xor(out[8:], in[8:], x2)
 		xor(out[12:], in[12:], x3)
 		xor(out[16:], in[16:], x4)
 		xor(out[20:], in[20:], x5)
 		xor(out[24:], in[24:], x6)
 		xor(out[28:], in[28:], x7)
 		xor(out[32:], in[32:], x8)
 		xor(out[36:], in[36:], x9)
 		xor(out[40:], in[40:], x10)
 		xor(out[44:], in[44:], x11)
 		xor(out[48:], in[48:], x12)
 		xor(out[52:], in[52:], x13)
 		xor(out[56:], in[56:], x14)
 		xor(out[60:], in[60:], x15)
 	}
 }
--- a/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !arm64,!s390x,!ppc64le arm64,!go1.11 gccgo appengine
+// +build !arm64,!s390x,!ppc64le arm64,!go1.11 gccgo purego
 package chacha20
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !gccgo,!appengine
+// +build !gccgo,!purego
 package chacha20
--- a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
@ -19,7 +19,7 @@
 // The differences in this and the original implementation are
 // due to the calling conventions and initialization of constants.
-// +build !gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
--- a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !gccgo,!appengine
+// +build !gccgo,!purego
 package chacha20
--- a/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s
+++ b/vendor/golang.org/x/crypto/chacha20/chacha_s390x.s
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !gccgo,!appengine
+// +build !gccgo,!purego
 #include "go_asm.h"
 #include "textflag.h"
--- a/vendor/golang.org/x/crypto/chacha20/xor.go
+++ b/vendor/golang.org/x/crypto/chacha20/xor.go
@ -13,10 +13,10 @@ const unaligned = runtime.GOARCH == "386" ||
 	runtime.GOARCH == "ppc64le" ||
 	runtime.GOARCH == "s390x"
-// xor reads a little endian uint32 from src, XORs it with u and
+// addXor reads a little endian uint32 from src, XORs it with (a + b) and
 // places the result in little endian byte order in dst.
-func xor(dst, src []byte, u uint32) {
+func addXor(dst, src []byte, a, b uint32) {
-	_, _ = src[3], dst[3] // eliminate bounds checks
+	_, _ = src[3], dst[3] // bounds check elimination hint
 	if unaligned {
 		// The compiler should optimize this code into
 		// 32-bit unaligned little endian loads and stores.
@ -27,15 +27,16 @@ func xor(dst, src []byte, u uint32) {
 		v |= uint32(src[1]) << 8
 		v |= uint32(src[2]) << 16
 		v |= uint32(src[3]) << 24
-		v ^= u
+		v ^= a + b
 		dst[0] = byte(v)
 		dst[1] = byte(v >> 8)
 		dst[2] = byte(v >> 16)
 		dst[3] = byte(v >> 24)
 	} else {
-		dst[0] = src[0] ^ byte(u)
+		a += b
-		dst[1] = src[1] ^ byte(u>>8)
+		dst[0] = src[0] ^ byte(a)
-		dst[2] = src[2] ^ byte(u>>16)
+		dst[1] = src[1] ^ byte(a>>8)
-		dst[3] = src[3] ^ byte(u>>24)
+		dst[2] = src[2] ^ byte(a>>16)
 		dst[3] = src[3] ^ byte(a>>24)
 	}
 }
--- a/vendor/golang.org/x/crypto/openpgp/armor/armor.go
+++ b/vendor/golang.org/x/crypto/openpgp/armor/armor.go
@ -66,6 +66,7 @@ type lineReader struct {
 	buf    []byte
 	eof    bool
 	crc    uint32
 	crcSet bool
 }
 func (l *lineReader) Read(p []byte) (n int, err error) {
@ -87,6 +88,11 @@ func (l *lineReader) Read(p []byte) (n int, err error) {
 		return 0, ArmorCorrupt
 	}
 	if bytes.HasPrefix(line, armorEnd) {
 		l.eof = true
 		return 0, io.EOF
 	}
 	if len(line) == 5 && line[0] == '=' {
 		// This is the checksum line
 		var expectedBytes [3]byte
@ -108,6 +114,7 @@ func (l *lineReader) Read(p []byte) (n int, err error) {
 		}
 		l.eof = true
 		l.crcSet = true
 		return 0, io.EOF
 	}
@ -141,11 +148,9 @@ func (r *openpgpReader) Read(p []byte) (n int, err error) {
 	n, err = r.b64Reader.Read(p)
 	r.currentCRC = crc24(r.currentCRC, p[:n])
-	if err == io.EOF {
+	if err == io.EOF && r.lReader.crcSet && r.lReader.crc != uint32(r.currentCRC&crc24Mask) {
 		if r.lReader.crc != uint32(r.currentCRC&crc24Mask) {
 		return 0, ArmorCorrupt
 	}
 	}
 	return
 }
--- a/vendor/golang.org/x/crypto/openpgp/packet/packet.go
+++ b/vendor/golang.org/x/crypto/openpgp/packet/packet.go
@ -14,6 +14,7 @@ import (
 	"crypto/rsa"
 	"io"
 	"math/big"
 	"math/bits"
 	"golang.org/x/crypto/cast5"
 	"golang.org/x/crypto/openpgp/errors"
@ -100,33 +101,65 @@ func (r *partialLengthReader) Read(p []byte) (n int, err error) {
 type partialLengthWriter struct {
 	w          io.WriteCloser
 	lengthByte [1]byte
 	sentFirst  bool
 	buf        []byte
 }
 // RFC 4880 4.2.2.4: the first partial length MUST be at least 512 octets long.
 const minFirstPartialWrite = 512
 func (w *partialLengthWriter) Write(p []byte) (n int, err error) {
-	for len(p) > 0 {
+	off := 0
-		for power := uint(14); power < 32; power-- {
+	if !w.sentFirst {
-			l := 1 << power
+		if len(w.buf) > 0 || len(p) < minFirstPartialWrite {
-			if len(p) >= l {
+			off = len(w.buf)
-				w.lengthByte[0] = 224 + uint8(power)
+			w.buf = append(w.buf, p...)
-				_, err = w.w.Write(w.lengthByte[:])
+			if len(w.buf) < minFirstPartialWrite {
-				if err != nil {
+				return len(p), nil
 					return
 			}
 			p = w.buf
 			w.buf = nil
 		}
 		w.sentFirst = true
 	}
 	power := uint8(30)
 	for len(p) > 0 {
 		l := 1 << power
 		if len(p) < l {
 			power = uint8(bits.Len32(uint32(len(p)))) - 1
 			l = 1 << power
 		}
 		w.lengthByte[0] = 224 + power
 		_, err = w.w.Write(w.lengthByte[:])
 		if err == nil {
 			var m int
 			m, err = w.w.Write(p[:l])
 			n += m
 		}
 		if err != nil {
-					return
+			if n < off {
 				return 0, err
 			}
 			return n - off, err
 		}
 		p = p[l:]
 				break
 	}
-		}
+	return n - off, nil
 	}
 	return
 }
 func (w *partialLengthWriter) Close() error {
 	if len(w.buf) > 0 {
 		// In this case we can't send a 512 byte packet.
 		// Just send what we have.
 		p := w.buf
 		w.sentFirst = true
 		w.buf = nil
 		if _, err := w.Write(p); err != nil {
 			return err
 		}
 	}
 	w.lengthByte[0] = 0
 	_, err := w.w.Write(w.lengthByte[:])
 	if err != nil {
--- a/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
+++ b/vendor/golang.org/x/crypto/poly1305/mac_noasm.go
@ -2,10 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build !amd64,!ppc64le gccgo appengine
+// +build !amd64,!ppc64le,!s390x gccgo purego
 package poly1305
 type mac struct{ macGeneric }
 func newMAC(key *[32]byte) mac { return mac{newMACGeneric(key)} }
--- a/vendor/golang.org/x/crypto/poly1305/poly1305.go
+++ b/vendor/golang.org/x/crypto/poly1305/poly1305.go
@ -26,7 +26,9 @@ const TagSize = 16
 // 16-byte result into out. Authenticating two different messages with the same
 // key allows an attacker to forge messages at will.
 func Sum(out *[16]byte, m []byte, key *[32]byte) {
-	sum(out, m, key)
+	h := New(key)
 	h.Write(m)
 	h.Sum(out[:0])
 }
 // Verify returns true if mac is a valid authenticator for m with the given key.
@ -46,10 +48,9 @@ func Verify(mac *[16]byte, m []byte, key *[32]byte) bool {
 // two different messages with the same key allows an attacker
 // to forge messages at will.
 func New(key *[32]byte) *MAC {
-	return &MAC{
+	m := &MAC{}
-		mac:       newMAC(key),
+	initialize(key, &m.macState)
-		finalized: false,
+	return m
 	}
 }
 // MAC is an io.Writer computing an authentication tag
@ -58,7 +59,7 @@ func New(key *[32]byte) *MAC {
 // MAC cannot be used like common hash.Hash implementations,
 // because using a poly1305 key twice breaks its security.
 // Therefore writing data to a running MAC after calling
-// Sum causes it to panic.
+// Sum or Verify causes it to panic.
 type MAC struct {
 	mac // platform-dependent implementation
@ -71,10 +72,10 @@ func (h *MAC) Size() int { return TagSize }
 // Write adds more data to the running message authentication code.
 // It never returns an error.
 //
-// It must not be called after the first call of Sum.
+// It must not be called after the first call of Sum or Verify.
 func (h *MAC) Write(p []byte) (n int, err error) {
 	if h.finalized {
-		panic("poly1305: write to MAC after Sum")
+		panic("poly1305: write to MAC after Sum or Verify")
 	}
 	return h.mac.Write(p)
 }
@ -87,3 +88,12 @@ func (h *MAC) Sum(b []byte) []byte {
 	h.finalized = true
 	return append(b, mac[:]...)
 }
 // Verify returns whether the authenticator of all data written to
 // the message authentication code matches the expected value.
 func (h *MAC) Verify(expected []byte) bool {
 	var mac [TagSize]byte
 	h.mac.Sum(&mac)
 	h.finalized = true
 	return subtle.ConstantTimeCompare(expected, mac[:]) == 1
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.go
@ -2,24 +2,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build amd64,!gccgo,!appengine
+// +build !gccgo,!purego
 package poly1305
 //go:noescape
 func update(state *macState, msg []byte)
 func sum(out *[16]byte, m []byte, key *[32]byte) {
 	h := newMAC(key)
 	h.Write(m)
 	h.Sum(out)
 }
 func newMAC(key *[32]byte) (h mac) {
 	initialize(key, &h.r, &h.s)
 	return
 }
 // mac is a wrapper for macGeneric that redirects calls that would have gone to
 // updateGeneric to update.
 //
--- a/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_amd64.s
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build amd64,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.go
@ -1,19 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build arm,!gccgo,!appengine,!nacl
 package poly1305
 // poly1305_auth_armv6 is implemented in sum_arm.s
 //go:noescape
 func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]byte)
 func sum(out *[16]byte, m []byte, key *[32]byte) {
 	var mPtr *byte
 	if len(m) > 0 {
 		mPtr = &m[0]
 	}
 	poly1305_auth_armv6(out, mPtr, uint32(len(m)), key)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_arm.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_arm.s
@ -1,427 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build arm,!gccgo,!appengine,!nacl
 #include "textflag.h"
 // This code was translated into a form compatible with 5a from the public
 // domain source by Andrew Moon: github.com/floodyberry/poly1305-opt/blob/master/app/extensions/poly1305.
 DATA ·poly1305_init_constants_armv6<>+0x00(SB)/4, $0x3ffffff
 DATA ·poly1305_init_constants_armv6<>+0x04(SB)/4, $0x3ffff03
 DATA ·poly1305_init_constants_armv6<>+0x08(SB)/4, $0x3ffc0ff
 DATA ·poly1305_init_constants_armv6<>+0x0c(SB)/4, $0x3f03fff
 DATA ·poly1305_init_constants_armv6<>+0x10(SB)/4, $0x00fffff
 GLOBL ·poly1305_init_constants_armv6<>(SB), 8, $20
 // Warning: the linker may use R11 to synthesize certain instructions. Please
 // take care and verify that no synthetic instructions use it.
 TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0
 	// Needs 16 bytes of stack and 64 bytes of space pointed to by R0.  (It
 	// might look like it's only 60 bytes of space but the final four bytes
 	// will be written by another function.) We need to skip over four
 	// bytes of stack because that's saving the value of 'g'.
 	ADD       $4, R13, R8
 	MOVM.IB   [R4-R7], (R8)
 	MOVM.IA.W (R1), [R2-R5]
 	MOVW      $·poly1305_init_constants_armv6<>(SB), R7
 	MOVW      R2, R8
 	MOVW      R2>>26, R9
 	MOVW      R3>>20, g
 	MOVW      R4>>14, R11
 	MOVW      R5>>8, R12
 	ORR       R3<<6, R9, R9
 	ORR       R4<<12, g, g
 	ORR       R5<<18, R11, R11
 	MOVM.IA   (R7), [R2-R6]
 	AND       R8, R2, R2
 	AND       R9, R3, R3
 	AND       g, R4, R4
 	AND       R11, R5, R5
 	AND       R12, R6, R6
 	MOVM.IA.W [R2-R6], (R0)
 	EOR       R2, R2, R2
 	EOR       R3, R3, R3
 	EOR       R4, R4, R4
 	EOR       R5, R5, R5
 	EOR       R6, R6, R6
 	MOVM.IA.W [R2-R6], (R0)
 	MOVM.IA.W (R1), [R2-R5]
 	MOVM.IA   [R2-R6], (R0)
 	ADD       $20, R13, R0
 	MOVM.DA   (R0), [R4-R7]
 	RET
 #define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \
 	MOVBU (offset+0)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+0)(Rdst); \
 	MOVBU (offset+1)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+1)(Rdst); \
 	MOVBU (offset+2)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+2)(Rdst); \
 	MOVBU (offset+3)(Rsrc), Rtmp; \
 	MOVBU Rtmp, (offset+3)(Rdst)
 TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0
 	// Needs 24 bytes of stack for saved registers and then 88 bytes of
 	// scratch space after that. We assume that 24 bytes at (R13) have
 	// already been used: four bytes for the link register saved in the
 	// prelude of poly1305_auth_armv6, four bytes for saving the value of g
 	// in that function and 16 bytes of scratch space used around
 	// poly1305_finish_ext_armv6_skip1.
 	ADD     $24, R13, R12
 	MOVM.IB [R4-R8, R14], (R12)
 	MOVW    R0, 88(R13)
 	MOVW    R1, 92(R13)
 	MOVW    R2, 96(R13)
 	MOVW    R1, R14
 	MOVW    R2, R12
 	MOVW    56(R0), R8
 	WORD    $0xe1180008                // TST R8, R8 not working see issue 5921
 	EOR     R6, R6, R6
 	MOVW.EQ $(1<<24), R6
 	MOVW    R6, 84(R13)
 	ADD     $116, R13, g
 	MOVM.IA (R0), [R0-R9]
 	MOVM.IA [R0-R4], (g)
 	CMP     $16, R12
 	BLO     poly1305_blocks_armv6_done
 poly1305_blocks_armv6_mainloop:
 	WORD    $0xe31e0003                            // TST R14, #3 not working see issue 5921
 	BEQ     poly1305_blocks_armv6_mainloop_aligned
 	ADD     $100, R13, g
 	MOVW_UNALIGNED(R14, g, R0, 0)
 	MOVW_UNALIGNED(R14, g, R0, 4)
 	MOVW_UNALIGNED(R14, g, R0, 8)
 	MOVW_UNALIGNED(R14, g, R0, 12)
 	MOVM.IA (g), [R0-R3]
 	ADD     $16, R14
 	B       poly1305_blocks_armv6_mainloop_loaded
 poly1305_blocks_armv6_mainloop_aligned:
 	MOVM.IA.W (R14), [R0-R3]
 poly1305_blocks_armv6_mainloop_loaded:
 	MOVW    R0>>26, g
 	MOVW    R1>>20, R11
 	MOVW    R2>>14, R12
 	MOVW    R14, 92(R13)
 	MOVW    R3>>8, R4
 	ORR     R1<<6, g, g
 	ORR     R2<<12, R11, R11
 	ORR     R3<<18, R12, R12
 	BIC     $0xfc000000, R0, R0
 	BIC     $0xfc000000, g, g
 	MOVW    84(R13), R3
 	BIC     $0xfc000000, R11, R11
 	BIC     $0xfc000000, R12, R12
 	ADD     R0, R5, R5
 	ADD     g, R6, R6
 	ORR     R3, R4, R4
 	ADD     R11, R7, R7
 	ADD     $116, R13, R14
 	ADD     R12, R8, R8
 	ADD     R4, R9, R9
 	MOVM.IA (R14), [R0-R4]
 	MULLU   R4, R5, (R11, g)
 	MULLU   R3, R5, (R14, R12)
 	MULALU  R3, R6, (R11, g)
 	MULALU  R2, R6, (R14, R12)
 	MULALU  R2, R7, (R11, g)
 	MULALU  R1, R7, (R14, R12)
 	ADD     R4<<2, R4, R4
 	ADD     R3<<2, R3, R3
 	MULALU  R1, R8, (R11, g)
 	MULALU  R0, R8, (R14, R12)
 	MULALU  R0, R9, (R11, g)
 	MULALU  R4, R9, (R14, R12)
 	MOVW    g, 76(R13)
 	MOVW    R11, 80(R13)
 	MOVW    R12, 68(R13)
 	MOVW    R14, 72(R13)
 	MULLU   R2, R5, (R11, g)
 	MULLU   R1, R5, (R14, R12)
 	MULALU  R1, R6, (R11, g)
 	MULALU  R0, R6, (R14, R12)
 	MULALU  R0, R7, (R11, g)
 	MULALU  R4, R7, (R14, R12)
 	ADD     R2<<2, R2, R2
 	ADD     R1<<2, R1, R1
 	MULALU  R4, R8, (R11, g)
 	MULALU  R3, R8, (R14, R12)
 	MULALU  R3, R9, (R11, g)
 	MULALU  R2, R9, (R14, R12)
 	MOVW    g, 60(R13)
 	MOVW    R11, 64(R13)
 	MOVW    R12, 52(R13)
 	MOVW    R14, 56(R13)
 	MULLU   R0, R5, (R11, g)
 	MULALU  R4, R6, (R11, g)
 	MULALU  R3, R7, (R11, g)
 	MULALU  R2, R8, (R11, g)
 	MULALU  R1, R9, (R11, g)
 	ADD     $52, R13, R0
 	MOVM.IA (R0), [R0-R7]
 	MOVW    g>>26, R12
 	MOVW    R4>>26, R14
 	ORR     R11<<6, R12, R12
 	ORR     R5<<6, R14, R14
 	BIC     $0xfc000000, g, g
 	BIC     $0xfc000000, R4, R4
 	ADD.S   R12, R0, R0
 	ADC     $0, R1, R1
 	ADD.S   R14, R6, R6
 	ADC     $0, R7, R7
 	MOVW    R0>>26, R12
 	MOVW    R6>>26, R14
 	ORR     R1<<6, R12, R12
 	ORR     R7<<6, R14, R14
 	BIC     $0xfc000000, R0, R0
 	BIC     $0xfc000000, R6, R6
 	ADD     R14<<2, R14, R14
 	ADD.S   R12, R2, R2
 	ADC     $0, R3, R3
 	ADD     R14, g, g
 	MOVW    R2>>26, R12
 	MOVW    g>>26, R14
 	ORR     R3<<6, R12, R12
 	BIC     $0xfc000000, g, R5
 	BIC     $0xfc000000, R2, R7
 	ADD     R12, R4, R4
 	ADD     R14, R0, R0
 	MOVW    R4>>26, R12
 	BIC     $0xfc000000, R4, R8
 	ADD     R12, R6, R9
 	MOVW    96(R13), R12
 	MOVW    92(R13), R14
 	MOVW    R0, R6
 	CMP     $32, R12
 	SUB     $16, R12, R12
 	MOVW    R12, 96(R13)
 	BHS     poly1305_blocks_armv6_mainloop
 poly1305_blocks_armv6_done:
 	MOVW    88(R13), R12
 	MOVW    R5, 20(R12)
 	MOVW    R6, 24(R12)
 	MOVW    R7, 28(R12)
 	MOVW    R8, 32(R12)
 	MOVW    R9, 36(R12)
 	ADD     $48, R13, R0
 	MOVM.DA (R0), [R4-R8, R14]
 	RET
 #define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \
 	MOVBU.P 1(Rsrc), Rtmp; \
 	MOVBU.P Rtmp, 1(Rdst); \
 	MOVBU.P 1(Rsrc), Rtmp; \
 	MOVBU.P Rtmp, 1(Rdst)
 #define MOVWP_UNALIGNED(Rsrc, Rdst, Rtmp) \
 	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \
 	MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp)
 // func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key)
 TEXT ·poly1305_auth_armv6(SB), $196-16
 	// The value 196, just above, is the sum of 64 (the size of the context
 	// structure) and 132 (the amount of stack needed).
 	//
 	// At this point, the stack pointer (R13) has been moved down. It
 	// points to the saved link register and there's 196 bytes of free
 	// space above it.
 	//
 	// The stack for this function looks like:
 	//
 	// +---------------------
 	// |
 	// | 64 bytes of context structure
 	// |
 	// +---------------------
 	// |
 	// | 112 bytes for poly1305_blocks_armv6
 	// |
 	// +---------------------
 	// | 16 bytes of final block, constructed at
 	// | poly1305_finish_ext_armv6_skip8
 	// +---------------------
 	// | four bytes of saved 'g'
 	// +---------------------
 	// | lr, saved by prelude    <- R13 points here
 	// +---------------------
 	MOVW g, 4(R13)
 	MOVW out+0(FP), R4
 	MOVW m+4(FP), R5
 	MOVW mlen+8(FP), R6
 	MOVW key+12(FP), R7
 	ADD  $136, R13, R0 // 136 = 4 + 4 + 16 + 112
 	MOVW R7, R1
 	// poly1305_init_ext_armv6 will write to the stack from R13+4, but
 	// that's ok because none of the other values have been written yet.
 	BL    poly1305_init_ext_armv6<>(SB)
 	BIC.S $15, R6, R2
 	BEQ   poly1305_auth_armv6_noblocks
 	ADD   $136, R13, R0
 	MOVW  R5, R1
 	ADD   R2, R5, R5
 	SUB   R2, R6, R6
 	BL    poly1305_blocks_armv6<>(SB)
 poly1305_auth_armv6_noblocks:
 	ADD  $136, R13, R0
 	MOVW R5, R1
 	MOVW R6, R2
 	MOVW R4, R3
 	MOVW  R0, R5
 	MOVW  R1, R6
 	MOVW  R2, R7
 	MOVW  R3, R8
 	AND.S R2, R2, R2
 	BEQ   poly1305_finish_ext_armv6_noremaining
 	EOR   R0, R0
 	ADD   $8, R13, R9                           // 8 = offset to 16 byte scratch space
 	MOVW  R0, (R9)
 	MOVW  R0, 4(R9)
 	MOVW  R0, 8(R9)
 	MOVW  R0, 12(R9)
 	WORD  $0xe3110003                           // TST R1, #3 not working see issue 5921
 	BEQ   poly1305_finish_ext_armv6_aligned
 	WORD  $0xe3120008                           // TST R2, #8 not working see issue 5921
 	BEQ   poly1305_finish_ext_armv6_skip8
 	MOVWP_UNALIGNED(R1, R9, g)
 	MOVWP_UNALIGNED(R1, R9, g)
 poly1305_finish_ext_armv6_skip8:
 	WORD $0xe3120004                     // TST $4, R2 not working see issue 5921
 	BEQ  poly1305_finish_ext_armv6_skip4
 	MOVWP_UNALIGNED(R1, R9, g)
 poly1305_finish_ext_armv6_skip4:
 	WORD $0xe3120002                     // TST $2, R2 not working see issue 5921
 	BEQ  poly1305_finish_ext_armv6_skip2
 	MOVHUP_UNALIGNED(R1, R9, g)
 	B    poly1305_finish_ext_armv6_skip2
 poly1305_finish_ext_armv6_aligned:
 	WORD      $0xe3120008                             // TST R2, #8 not working see issue 5921
 	BEQ       poly1305_finish_ext_armv6_skip8_aligned
 	MOVM.IA.W (R1), [g-R11]
 	MOVM.IA.W [g-R11], (R9)
 poly1305_finish_ext_armv6_skip8_aligned:
 	WORD   $0xe3120004                             // TST $4, R2 not working see issue 5921
 	BEQ    poly1305_finish_ext_armv6_skip4_aligned
 	MOVW.P 4(R1), g
 	MOVW.P g, 4(R9)
 poly1305_finish_ext_armv6_skip4_aligned:
 	WORD    $0xe3120002                     // TST $2, R2 not working see issue 5921
 	BEQ     poly1305_finish_ext_armv6_skip2
 	MOVHU.P 2(R1), g
 	MOVH.P  g, 2(R9)
 poly1305_finish_ext_armv6_skip2:
 	WORD    $0xe3120001                     // TST $1, R2 not working see issue 5921
 	BEQ     poly1305_finish_ext_armv6_skip1
 	MOVBU.P 1(R1), g
 	MOVBU.P g, 1(R9)
 poly1305_finish_ext_armv6_skip1:
 	MOVW  $1, R11
 	MOVBU R11, 0(R9)
 	MOVW  R11, 56(R5)
 	MOVW  R5, R0
 	ADD   $8, R13, R1
 	MOVW  $16, R2
 	BL    poly1305_blocks_armv6<>(SB)
 poly1305_finish_ext_armv6_noremaining:
 	MOVW      20(R5), R0
 	MOVW      24(R5), R1
 	MOVW      28(R5), R2
 	MOVW      32(R5), R3
 	MOVW      36(R5), R4
 	MOVW      R4>>26, R12
 	BIC       $0xfc000000, R4, R4
 	ADD       R12<<2, R12, R12
 	ADD       R12, R0, R0
 	MOVW      R0>>26, R12
 	BIC       $0xfc000000, R0, R0
 	ADD       R12, R1, R1
 	MOVW      R1>>26, R12
 	BIC       $0xfc000000, R1, R1
 	ADD       R12, R2, R2
 	MOVW      R2>>26, R12
 	BIC       $0xfc000000, R2, R2
 	ADD       R12, R3, R3
 	MOVW      R3>>26, R12
 	BIC       $0xfc000000, R3, R3
 	ADD       R12, R4, R4
 	ADD       $5, R0, R6
 	MOVW      R6>>26, R12
 	BIC       $0xfc000000, R6, R6
 	ADD       R12, R1, R7
 	MOVW      R7>>26, R12
 	BIC       $0xfc000000, R7, R7
 	ADD       R12, R2, g
 	MOVW      g>>26, R12
 	BIC       $0xfc000000, g, g
 	ADD       R12, R3, R11
 	MOVW      $-(1<<26), R12
 	ADD       R11>>26, R12, R12
 	BIC       $0xfc000000, R11, R11
 	ADD       R12, R4, R9
 	MOVW      R9>>31, R12
 	SUB       $1, R12
 	AND       R12, R6, R6
 	AND       R12, R7, R7
 	AND       R12, g, g
 	AND       R12, R11, R11
 	AND       R12, R9, R9
 	MVN       R12, R12
 	AND       R12, R0, R0
 	AND       R12, R1, R1
 	AND       R12, R2, R2
 	AND       R12, R3, R3
 	AND       R12, R4, R4
 	ORR       R6, R0, R0
 	ORR       R7, R1, R1
 	ORR       g, R2, R2
 	ORR       R11, R3, R3
 	ORR       R9, R4, R4
 	ORR       R1<<26, R0, R0
 	MOVW      R1>>6, R1
 	ORR       R2<<20, R1, R1
 	MOVW      R2>>12, R2
 	ORR       R3<<14, R2, R2
 	MOVW      R3>>18, R3
 	ORR       R4<<8, R3, R3
 	MOVW      40(R5), R6
 	MOVW      44(R5), R7
 	MOVW      48(R5), g
 	MOVW      52(R5), R11
 	ADD.S     R6, R0, R0
 	ADC.S     R7, R1, R1
 	ADC.S     g, R2, R2
 	ADC.S     R11, R3, R3
 	MOVM.IA   [R0-R3], (R8)
 	MOVW      R5, R12
 	EOR       R0, R0, R0
 	EOR       R1, R1, R1
 	EOR       R2, R2, R2
 	EOR       R3, R3, R3
 	EOR       R4, R4, R4
 	EOR       R5, R5, R5
 	EOR       R6, R6, R6
 	EOR       R7, R7, R7
 	MOVM.IA.W [R0-R7], (R12)
 	MOVM.IA   [R0-R7], (R12)
 	MOVW      4(R13), g
 	RET
--- a/vendor/golang.org/x/crypto/poly1305/sum_generic.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_generic.go
@ -31,16 +31,18 @@ func sumGeneric(out *[TagSize]byte, msg []byte, key *[32]byte) {
 	h.Sum(out)
 }
-func newMACGeneric(key *[32]byte) (h macGeneric) {
+func newMACGeneric(key *[32]byte) macGeneric {
-	initialize(key, &h.r, &h.s)
+	m := macGeneric{}
-	return
+	initialize(key, &m.macState)
 	return m
 }
 // macState holds numbers in saturated 64-bit little-endian limbs. That is,
 // the value of [x0, x1, x2] is x[0] + x[1] * 2⁶⁴ + x[2] * 2¹²⁸.
 type macState struct {
 	// h is the main accumulator. It is to be interpreted modulo 2¹³⁰ - 5, but
-	// can grow larger during and after rounds.
+	// can grow larger during and after rounds. It must, however, remain below
 	// 2 * (2¹³⁰ - 5).
 	h [3]uint64
 	// r and s are the private key components.
 	r [2]uint64
@ -97,11 +99,12 @@ const (
 	rMask1 = 0x0FFFFFFC0FFFFFFC
 )
-func initialize(key *[32]byte, r, s *[2]uint64) {
+// initialize loads the 256-bit key into the two 128-bit secret values r and s.
-	r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
+func initialize(key *[32]byte, m *macState) {
-	r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
+	m.r[0] = binary.LittleEndian.Uint64(key[0:8]) & rMask0
-	s[0] = binary.LittleEndian.Uint64(key[16:24])
+	m.r[1] = binary.LittleEndian.Uint64(key[8:16]) & rMask1
-	s[1] = binary.LittleEndian.Uint64(key[24:32])
+	m.s[0] = binary.LittleEndian.Uint64(key[16:24])
 	m.s[1] = binary.LittleEndian.Uint64(key[24:32])
 }
 // uint128 holds a 128-bit number as two 64-bit limbs, for use with the
--- a/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_noasm.go
@ -1,13 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build s390x,!go1.11 !arm,!amd64,!s390x,!ppc64le gccgo appengine nacl
 package poly1305
 func sum(out *[TagSize]byte, msg []byte, key *[32]byte) {
 	h := newMAC(key)
 	h.Write(msg)
 	h.Sum(out)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.go
@ -2,24 +2,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!purego
 package poly1305
 //go:noescape
 func update(state *macState, msg []byte)
 func sum(out *[16]byte, m []byte, key *[32]byte) {
 	h := newMAC(key)
 	h.Write(m)
 	h.Sum(out)
 }
 func newMAC(key *[32]byte) (h mac) {
 	initialize(key, &h.r, &h.s)
 	return
 }
 // mac is a wrapper for macGeneric that redirects calls that would have gone to
 // updateGeneric to update.
 //
--- a/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_ppc64le.s
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build ppc64le,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build s390x,go1.11,!gccgo,!appengine
+// +build !gccgo,!purego
 package poly1305
@ -10,30 +10,66 @@ import (
 	"golang.org/x/sys/cpu"
 )
-// poly1305vx is an assembly implementation of Poly1305 that uses vector
+// updateVX is an assembly implementation of Poly1305 that uses vector
 // instructions. It must only be called if the vector facility (vx) is
 // available.
 //go:noescape
-func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+func updateVX(state *macState, msg []byte)
-// poly1305vmsl is an assembly implementation of Poly1305 that uses vector
+// mac is a replacement for macGeneric that uses a larger buffer and redirects
-// instructions, including VMSL. It must only be called if the vector facility (vx) is
+// calls that would have gone to updateGeneric to updateVX if the vector
-// available and if VMSL is supported.
+// facility is installed.
-//go:noescape
+//
-func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
+// A larger buffer is required for good performance because the vector
 // implementation has a higher fixed cost per call than the generic
 // implementation.
 type mac struct {
 	macState
-func sum(out *[16]byte, m []byte, key *[32]byte) {
+	buffer [16 * TagSize]byte // size must be a multiple of block size (16)
 	offset int
 }
 func (h *mac) Write(p []byte) (int, error) {
 	nn := len(p)
 	if h.offset > 0 {
 		n := copy(h.buffer[h.offset:], p)
 		if h.offset+n < len(h.buffer) {
 			h.offset += n
 			return nn, nil
 		}
 		p = p[n:]
 		h.offset = 0
 		if cpu.S390X.HasVX {
-		var mPtr *byte
+			updateVX(&h.macState, h.buffer[:])
 		if len(m) > 0 {
 			mPtr = &m[0]
 		}
 		if cpu.S390X.HasVXE && len(m) > 256 {
 			poly1305vmsl(out, mPtr, uint64(len(m)), key)
 		} else {
-			poly1305vx(out, mPtr, uint64(len(m)), key)
+			updateGeneric(&h.macState, h.buffer[:])
 		}
 	}
 	tail := len(p) % len(h.buffer) // number of bytes to copy into buffer
 	body := len(p) - tail          // number of bytes to process now
 	if body > 0 {
 		if cpu.S390X.HasVX {
 			updateVX(&h.macState, p[:body])
 		} else {
-		sumGeneric(out, m, key)
+			updateGeneric(&h.macState, p[:body])
 		}
 	}
 	h.offset = copy(h.buffer[:], p[body:]) // copy tail bytes - can be 0
 	return nn, nil
 }
 // Sum computes the tag for the data written so far and passes out to
 // finalize to receive it. It operates on a copy of the embedded macState,
 // so h itself is not modified by the call.
 func (h *mac) Sum(out *[TagSize]byte) {
 	state := h.macState
 	remainder := h.buffer[:h.offset] // bytes held in the buffer that have not been processed yet
 	// Use the generic implementation if we have 2 or fewer blocks left
 	// to sum. The vector implementation has a higher startup time.
 	if cpu.S390X.HasVX && len(remainder) > 2*TagSize {
 		updateVX(&state, remainder)
 	} else if len(remainder) > 0 {
 		updateGeneric(&state, remainder)
 	}
 	finalize(out, &state.h, &state.s)
 }
--- a/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_s390x.s
@ -2,115 +2,187 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-// +build s390x,go1.11,!gccgo,!appengine
+// +build !gccgo,!purego
 #include "textflag.h"
-// Implementation of Poly1305 using the vector facility (vx).
+// This implementation of Poly1305 uses the vector facility (vx)
 // to process up to 2 blocks (32 bytes) per iteration using an
 // algorithm based on the one described in:
 //
 // NEON crypto, Daniel J. Bernstein & Peter Schwabe
 // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
 //
 // This algorithm uses 5 26-bit limbs to represent a 130-bit
 // value. These limbs are, for the most part, zero extended and
 // placed into 64-bit vector register elements. Each vector
 // register is 128-bits wide and so holds 2 of these elements.
 // Using 26-bit limbs allows us plenty of headroom to accommodate
 // accumulations before and after multiplication without
 // overflowing either 32-bits (before multiplication) or 64-bits
 // (after multiplication).
 //
 // In order to parallelise the operations required to calculate
 // the sum we use two separate accumulators and then sum those
 // in an extra final step. For compatibility with the generic
 // implementation we perform this summation at the end of every
 // updateVX call.
 //
 // To use two accumulators we must multiply the message blocks
 // by r² rather than r. Only the final message block should be
 // multiplied by r.
 //
 // Example:
 //
 // We want to calculate the sum (h) for a 64 byte message (m):
 //
 //   h = m[0:16]r⁴ + m[16:32]r³ + m[32:48]r² + m[48:64]r
 //
 // To do this we split the calculation into the even indices
 // and odd indices of the message. These form our SIMD 'lanes':
 //
 //   h = m[ 0:16]r⁴ + m[32:48]r² +   <- lane 0
 //       m[16:32]r³ + m[48:64]r      <- lane 1
 //
 // To calculate this iteratively we refactor so that both lanes
 // are written in terms of r² and r:
 //
 //   h = (m[ 0:16]r² + m[32:48])r² + <- lane 0
 //       (m[16:32]r² + m[48:64])r    <- lane 1
 //                ^             ^
 //                |             coefficients for second iteration
 //                coefficients for first iteration
 //
 // So in this case we would have two iterations. In the first
 // both lanes are multiplied by r². In the second only the
 // first lane is multiplied by r² and the second lane is
 // instead multiplied by r. This gives us the odd and even
 // powers of r that we need from the original equation.
 //
 // Notation:
 //
 //   h - accumulator
 //   r - key
 //   m - message
 //
 //   [a, b]       - SIMD register holding two 64-bit values
 //   [a, b, c, d] - SIMD register holding four 32-bit values
 //   xᵢ[n]        - limb n of variable x with bit width i
 //
 // Limbs are expressed in little endian order, so for 26-bit
 // limbs x₂₆[4] will be the most significant limb and x₂₆[0]
 // will be the least significant limb.
-// constants
+// masking constants
-#define MOD26 V0
+#define MOD24 V0 // [0x0000000000ffffff, 0x0000000000ffffff] - mask low 24-bits
-#define EX0   V1
+#define MOD26 V1 // [0x0000000003ffffff, 0x0000000003ffffff] - mask low 26-bits
 #define EX1   V2
 #define EX2   V3
-// temporaries
+// expansion constants (see EXPAND macro)
-#define T_0 V4
+#define EX0 V2
-#define T_1 V5
+#define EX1 V3
-#define T_2 V6
+#define EX2 V4
 #define T_3 V7
 #define T_4 V8
-// key (r)
+// key (r², r or 1 depending on context)
-#define R_0  V9
+#define R_0 V5
-#define R_1  V10
+#define R_1 V6
-#define R_2  V11
+#define R_2 V7
-#define R_3  V12
+#define R_3 V8
-#define R_4  V13
+#define R_4 V9
 #define R5_1 V14
 #define R5_2 V15
 #define R5_3 V16
 #define R5_4 V17
 #define RSAVE_0 R5
 #define RSAVE_1 R6
 #define RSAVE_2 R7
 #define RSAVE_3 R8
 #define RSAVE_4 R9
 #define R5SAVE_1 V28
 #define R5SAVE_2 V29
 #define R5SAVE_3 V30
 #define R5SAVE_4 V31
-// message block
+// precalculated coefficients (5r², 5r or 0 depending on context)
-#define F_0 V18
+#define R5_1 V10
-#define F_1 V19
+#define R5_2 V11
-#define F_2 V20
+#define R5_3 V12
-#define F_3 V21
+#define R5_4 V13
 #define F_4 V22
-// accumulator
+// message block (m)
-#define H_0 V23
+#define M_0 V14
-#define H_1 V24
+#define M_1 V15
-#define H_2 V25
+#define M_2 V16
-#define H_3 V26
+#define M_3 V17
-#define H_4 V27
+#define M_4 V18
-GLOBL ·keyMask<>(SB), RODATA, $16
+// accumulator (h)
-DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
+#define H_0 V19
-DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
+#define H_1 V20
 #define H_2 V21
 #define H_3 V22
 #define H_4 V23
-GLOBL ·bswapMask<>(SB), RODATA, $16
+// temporary registers (for short-lived values)
-DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
+#define T_0 V24
-DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
+#define T_1 V25
 #define T_2 V26
 #define T_3 V27
 #define T_4 V28
-GLOBL ·constants<>(SB), RODATA, $64
+GLOBL ·constants<>(SB), RODATA, $0x30
 // MOD26
 DATA ·constants<>+0(SB)/8, $0x3ffffff
 DATA ·constants<>+8(SB)/8, $0x3ffffff
 // EX0
-DATA ·constants<>+16(SB)/8, $0x0006050403020100
+DATA ·constants<>+0x00(SB)/8, $0x0006050403020100
-DATA ·constants<>+24(SB)/8, $0x1016151413121110
+DATA ·constants<>+0x08(SB)/8, $0x1016151413121110
 // EX1
-DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
+DATA ·constants<>+0x10(SB)/8, $0x060c0b0a09080706
-DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
+DATA ·constants<>+0x18(SB)/8, $0x161c1b1a19181716
 // EX2
-DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
+DATA ·constants<>+0x20(SB)/8, $0x0d0d0d0d0d0f0e0d
-DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
+DATA ·constants<>+0x28(SB)/8, $0x1d1d1d1d1d1f1e1d
-// h = (f*g) % (2**130-5) [partial reduction]
+// MULTIPLY multiplies each lane of f and g, partially reduced
 // modulo 2¹³⁰ - 5. The result, h, consists of partial products
 // in each lane that need to be reduced further to produce the
 // final result.
 //
 //   h₁₃₀ = (f₁₃₀g₁₃₀) % 2¹³⁰ + (5f₁₃₀g₁₃₀) / 2¹³⁰
 //
 // Note that the multiplication by 5 of the high bits is
 // achieved by precalculating the multiplication of four of the
 // g coefficients by 5. These are g51-g54.
 #define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
 	VMLOF  f0, g0, h0        \
 	VMLOF  f0, g1, h1        \
 	VMLOF  f0, g2, h2        \
 	VMLOF  f0, g3, h3        \
 	VMLOF  f0, g1, h1        \
 	VMLOF  f0, g4, h4        \
 	VMLOF  f0, g2, h2        \
 	VMLOF  f1, g54, T_0      \
 	VMLOF  f1, g0, T_1       \
 	VMLOF  f1, g1, T_2       \
 	VMLOF  f1, g2, T_3       \
 	VMLOF  f1, g0, T_1       \
 	VMLOF  f1, g3, T_4       \
 	VMLOF  f1, g1, T_2       \
 	VMALOF f2, g53, h0, h0   \
 	VMALOF f2, g54, h1, h1   \
 	VMALOF f2, g0, h2, h2    \
 	VMALOF f2, g1, h3, h3    \
 	VMALOF f2, g54, h1, h1   \
 	VMALOF f2, g2, h4, h4    \
 	VMALOF f2, g0, h2, h2    \
 	VMALOF f3, g52, T_0, T_0 \
 	VMALOF f3, g53, T_1, T_1 \
 	VMALOF f3, g54, T_2, T_2 \
 	VMALOF f3, g0, T_3, T_3  \
 	VMALOF f3, g53, T_1, T_1 \
 	VMALOF f3, g1, T_4, T_4  \
 	VMALOF f3, g54, T_2, T_2 \
 	VMALOF f4, g51, h0, h0   \
 	VMALOF f4, g52, h1, h1   \
 	VMALOF f4, g53, h2, h2   \
 	VMALOF f4, g54, h3, h3   \
 	VMALOF f4, g52, h1, h1   \
 	VMALOF f4, g0, h4, h4    \
 	VMALOF f4, g53, h2, h2   \
 	VAG    T_0, h0, h0       \
 	VAG    T_1, h1, h1       \
 	VAG    T_2, h2, h2       \
 	VAG    T_3, h3, h3       \
-	VAG    T_4, h4, h4
+	VAG    T_1, h1, h1       \
 	VAG    T_4, h4, h4       \
 	VAG    T_2, h2, h2
-// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
+// REDUCE performs the following carry operations in four
 // stages, as specified in Bernstein & Schwabe:
 //
 //   1: h₂₆[0]->h₂₆[1] h₂₆[3]->h₂₆[4]
 //   2: h₂₆[1]->h₂₆[2] h₂₆[4]->h₂₆[0]
 //   3: h₂₆[0]->h₂₆[1] h₂₆[2]->h₂₆[3]
 //   4: h₂₆[3]->h₂₆[4]
 //
 // The result is that all of the limbs are limited to 26-bits
 // except for h₂₆[1] and h₂₆[4] which are limited to 27-bits.
 //
 // Note that although each limb is aligned at 26-bit intervals
 // they may contain values that exceed 2²⁶ - 1, hence the need
 // to carry the excess bits in each limb.
 #define REDUCE(h0, h1, h2, h3, h4) \
 	VESRLG $26, h0, T_0  \
 	VESRLG $26, h3, T_1  \
@ -136,144 +208,155 @@ DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
 	VN     MOD26, h3, h3 \
 	VAG    T_2, h4, h4
-// expand in0 into d[0] and in1 into d[1]
+// EXPAND splits the 128-bit little-endian values in0 and in1
 // into 26-bit big-endian limbs and places the results into
 // the first and second lane of d₂₆[0:4] respectively.
 //
 // The EX0, EX1 and EX2 constants are arrays of byte indices
 // for permutation. The permutation both reverses the bytes
 // in the input and ensures the bytes are copied into the
 // destination limb ready to be shifted into their final
 // position.
 #define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
 	VGBM   $0x0707, d1       \ // d1=tmp
 	VPERM  in0, in1, EX2, d4 \
 	VPERM  in0, in1, EX0, d0 \
 	VPERM  in0, in1, EX1, d2 \
-	VN     d1, d4, d4        \
+	VPERM  in0, in1, EX2, d4 \
 	VESRLG $26, d0, d1       \
 	VESRLG $30, d2, d3       \
 	VESRLG $4, d2, d2        \
-	VN     MOD26, d0, d0     \
+	VN     MOD26, d0, d0     \ // [in0₂₆[0], in1₂₆[0]]
-	VN     MOD26, d1, d1     \
+	VN     MOD26, d3, d3     \ // [in0₂₆[3], in1₂₆[3]]
-	VN     MOD26, d2, d2     \
+	VN     MOD26, d1, d1     \ // [in0₂₆[1], in1₂₆[1]]
-	VN     MOD26, d3, d3
+	VN     MOD24, d4, d4     \ // [in0₂₆[4], in1₂₆[4]]
 	VN     MOD26, d2, d2     // [in0₂₆[2], in1₂₆[2]]
-// pack h4:h0 into h1:h0 (no carry)
+// func updateVX(state *macState, msg []byte)
-#define PACK(h0, h1, h2, h3, h4) \
+TEXT ·updateVX(SB), NOSPLIT, $0
-	VESLG $26, h1, h1  \
+	MOVD state+0(FP), R1
-	VESLG $26, h3, h3  \
+	LMG  msg+8(FP), R2, R3 // R2=msg_base, R3=msg_len
 	VO    h0, h1, h0   \
 	VO    h2, h3, h2   \
 	VESLG $4, h2, h2   \
 	VLEIB $7, $48, h1  \
 	VSLB  h1, h2, h2   \
 	VO    h0, h2, h0   \
 	VLEIB $7, $104, h1 \
 	VSLB  h1, h4, h3   \
 	VO    h3, h0, h0   \
 	VLEIB $7, $24, h1  \
 	VSRLB h1, h4, h1
-// if h > 2**130-5 then h -= 2**130-5
+	// load EX0, EX1 and EX2
 #define MOD(h0, h1, t0, t1, t2) \
 	VZERO t0          \
 	VLEIG $1, $5, t0  \
 	VACCQ h0, t0, t1  \
 	VAQ   h0, t0, t0  \
 	VONE  t2          \
 	VLEIG $1, $-4, t2 \
 	VAQ   t2, t1, t1  \
 	VACCQ h1, t1, t1  \
 	VONE  t2          \
 	VAQ   t2, t1, t1  \
 	VN    h0, t1, t2  \
 	VNC   t0, t1, t1  \
 	VO    t1, t2, h0
 // func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]key)
 TEXT ·poly1305vx(SB), $0-32
 	// This code processes up to 2 blocks (32 bytes) per iteration
 	// using the algorithm described in:
 	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
 	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
 	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
 	// load MOD26, EX0, EX1 and EX2
 	MOVD $·constants<>(SB), R5
-	VLM  (R5), MOD26, EX2
+	VLM  (R5), EX0, EX2
-	// setup r
+	// generate masks
-	VL   (R4), T_0
+	VGMG $(64-24), $63, MOD24 // [0x00ffffff, 0x00ffffff]
-	MOVD $·keyMask<>(SB), R6
+	VGMG $(64-26), $63, MOD26 // [0x03ffffff, 0x03ffffff]
 	VL   (R6), T_1
 	VN   T_0, T_1, T_0
 	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
-	// setup r*5
+	// load h (accumulator) and r (key) from state
-	VLEIG $0, $5, T_0
+	VZERO T_1               // [0, 0]
-	VLEIG $1, $5, T_0
+	VL    0(R1), T_0        // [h₆₄[0], h₆₄[1]]
 	VLEG  $0, 16(R1), T_1   // [h₆₄[2], 0]
 	VL    24(R1), T_2       // [r₆₄[0], r₆₄[1]]
 	VPDI  $0, T_0, T_2, T_3 // [h₆₄[0], r₆₄[0]]
 	VPDI  $5, T_0, T_2, T_4 // [h₆₄[1], r₆₄[1]]
-	// store r (for final block)
+	// unpack h and r into 26-bit limbs
-	VMLOF T_0, R_1, R5SAVE_1
+	// note: h₆₄[2] may have the low 3 bits set, so h₂₆[4] is a 27-bit value
-	VMLOF T_0, R_2, R5SAVE_2
+	VN     MOD26, T_3, H_0            // [h₂₆[0], r₂₆[0]]
-	VMLOF T_0, R_3, R5SAVE_3
+	VZERO  H_1                        // [0, 0]
-	VMLOF T_0, R_4, R5SAVE_4
+	VZERO  H_3                        // [0, 0]
-	VLGVG $0, R_0, RSAVE_0
+	VGMG   $(64-12-14), $(63-12), T_0 // [0x03fff000, 0x03fff000] - 26-bit mask with low 12 bits masked out
-	VLGVG $0, R_1, RSAVE_1
+	VESLG  $24, T_1, T_1              // [h₆₄[2]<<24, 0]
-	VLGVG $0, R_2, RSAVE_2
+	VERIMG $-26&63, T_3, MOD26, H_1   // [h₂₆[1], r₂₆[1]]
-	VLGVG $0, R_3, RSAVE_3
+	VESRLG $+52&63, T_3, H_2          // [h₂₆[2], r₂₆[2]] - low 12 bits only
-	VLGVG $0, R_4, RSAVE_4
+	VERIMG $-14&63, T_4, MOD26, H_3   // [h₂₆[3], r₂₆[3]]
 	VESRLG $40, T_4, H_4              // [h₂₆[4], r₂₆[4]] - low 24 bits only
 	VERIMG $+12&63, T_4, T_0, H_2     // [h₂₆[2], r₂₆[2]] - complete
 	VO     T_1, H_4, H_4              // [h₂₆[4], r₂₆[4]] - complete
-	// skip r**2 calculation
+	// replicate r across all 4 vector elements
 	VREPF $3, H_0, R_0 // [r₂₆[0], r₂₆[0], r₂₆[0], r₂₆[0]]
 	VREPF $3, H_1, R_1 // [r₂₆[1], r₂₆[1], r₂₆[1], r₂₆[1]]
 	VREPF $3, H_2, R_2 // [r₂₆[2], r₂₆[2], r₂₆[2], r₂₆[2]]
 	VREPF $3, H_3, R_3 // [r₂₆[3], r₂₆[3], r₂₆[3], r₂₆[3]]
 	VREPF $3, H_4, R_4 // [r₂₆[4], r₂₆[4], r₂₆[4], r₂₆[4]]
 	// zero out lane 1 of h
 	VLEIG $1, $0, H_0 // [h₂₆[0], 0]
 	VLEIG $1, $0, H_1 // [h₂₆[1], 0]
 	VLEIG $1, $0, H_2 // [h₂₆[2], 0]
 	VLEIG $1, $0, H_3 // [h₂₆[3], 0]
 	VLEIG $1, $0, H_4 // [h₂₆[4], 0]
 	// calculate 5r (ignore least significant limb)
 	VREPIF $5, T_0
 	VMLF   T_0, R_1, R5_1 // [5r₂₆[1], 5r₂₆[1], 5r₂₆[1], 5r₂₆[1]]
 	VMLF   T_0, R_2, R5_2 // [5r₂₆[2], 5r₂₆[2], 5r₂₆[2], 5r₂₆[2]]
 	VMLF   T_0, R_3, R5_3 // [5r₂₆[3], 5r₂₆[3], 5r₂₆[3], 5r₂₆[3]]
 	VMLF   T_0, R_4, R5_4 // [5r₂₆[4], 5r₂₆[4], 5r₂₆[4], 5r₂₆[4]]
 	// skip r² calculation if we are only calculating one block
 	CMPBLE R3, $16, skip
-	// calculate r**2
+	// calculate r²
-	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
+	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, M_0, M_1, M_2, M_3, M_4)
-	REDUCE(H_0, H_1, H_2, H_3, H_4)
+	REDUCE(M_0, M_1, M_2, M_3, M_4)
-	VLEIG $0, $5, T_0
+	VGBM   $0x0f0f, T_0
-	VLEIG $1, $5, T_0
+	VERIMG $0, M_0, T_0, R_0 // [r₂₆[0], r²₂₆[0], r₂₆[0], r²₂₆[0]]
-	VMLOF T_0, H_1, R5_1
+	VERIMG $0, M_1, T_0, R_1 // [r₂₆[1], r²₂₆[1], r₂₆[1], r²₂₆[1]]
-	VMLOF T_0, H_2, R5_2
+	VERIMG $0, M_2, T_0, R_2 // [r₂₆[2], r²₂₆[2], r₂₆[2], r²₂₆[2]]
-	VMLOF T_0, H_3, R5_3
+	VERIMG $0, M_3, T_0, R_3 // [r₂₆[3], r²₂₆[3], r₂₆[3], r²₂₆[3]]
-	VMLOF T_0, H_4, R5_4
+	VERIMG $0, M_4, T_0, R_4 // [r₂₆[4], r²₂₆[4], r₂₆[4], r²₂₆[4]]
 	VLR   H_0, R_0
 	VLR   H_1, R_1
 	VLR   H_2, R_2
 	VLR   H_3, R_3
 	VLR   H_4, R_4
-	// initialize h
+	// calculate 5r² (ignore least significant limb)
-	VZERO H_0
+	VREPIF $5, T_0
-	VZERO H_1
+	VMLF   T_0, R_1, R5_1 // [5r₂₆[1], 5r²₂₆[1], 5r₂₆[1], 5r²₂₆[1]]
-	VZERO H_2
+	VMLF   T_0, R_2, R5_2 // [5r₂₆[2], 5r²₂₆[2], 5r₂₆[2], 5r²₂₆[2]]
-	VZERO H_3
+	VMLF   T_0, R_3, R5_3 // [5r₂₆[3], 5r²₂₆[3], 5r₂₆[3], 5r²₂₆[3]]
-	VZERO H_4
+	VMLF   T_0, R_4, R5_4 // [5r₂₆[4], 5r²₂₆[4], 5r₂₆[4], 5r²₂₆[4]]
 loop:
-	CMPBLE R3, $32, b2
+	CMPBLE R3, $32, b2 // 2 or fewer blocks remaining, need to change key coefficients
 	// load next 2 blocks from message
 	VLM (R2), T_0, T_1
 	// update message slice
 	SUB  $32, R3
 	MOVD $32(R2), R2
-	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
+
-	VLEIB  $4, $1, F_4
+	// unpack message blocks into 26-bit big-endian limbs
-	VLEIB  $12, $1, F_4
+	EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
 	// add 2¹²⁸ to each message block value
 	VLEIB $4, $1, M_4
 	VLEIB $12, $1, M_4
 multiply:
-	VAG    H_0, F_0, F_0
+	// accumulate the incoming message
-	VAG    H_1, F_1, F_1
+	VAG H_0, M_0, M_0
-	VAG    H_2, F_2, F_2
+	VAG H_3, M_3, M_3
-	VAG    H_3, F_3, F_3
+	VAG H_1, M_1, M_1
-	VAG    H_4, F_4, F_4
+	VAG H_4, M_4, M_4
-	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
+	VAG H_2, M_2, M_2
 	// multiply the accumulator by the key coefficient
 	MULTIPLY(M_0, M_1, M_2, M_3, M_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
 	// carry and partially reduce the partial products
 	REDUCE(H_0, H_1, H_2, H_3, H_4)
 	CMPBNE R3, $0, loop
 finish:
-	// sum vectors
+	// sum lane 0 and lane 1 and put the result in lane 1
 	VZERO  T_0
 	VSUMQG H_0, T_0, H_0
 	VSUMQG H_1, T_0, H_1
 	VSUMQG H_2, T_0, H_2
 	VSUMQG H_3, T_0, H_3
 	VSUMQG H_1, T_0, H_1
 	VSUMQG H_4, T_0, H_4
 	VSUMQG H_2, T_0, H_2
-	// h may be >= 2*(2**130-5) so we need to reduce it again
+	// reduce again after summation
 	// TODO(mundaym): there might be a more efficient way to do this
 	// now that we only have 1 active lane. For example, we could
 	// simultaneously pack the values as we reduce them.
 	REDUCE(H_0, H_1, H_2, H_3, H_4)
-	// carry h1->h4
+	// carry h[1] through to h[4] so that only h[4] can exceed 2²⁶ - 1
 	// TODO(mundaym): in testing this final carry was unnecessary.
 	// Needs a proof before it can be removed though.
 	VESRLG $26, H_1, T_1
 	VN     MOD26, H_1, H_1
 	VAQ    T_1, H_2, H_2
@ -284,95 +367,137 @@ finish:
 	VN     MOD26, H_3, H_3
 	VAQ    T_3, H_4, H_4
-	// h is now < 2*(2**130-5)
+	// h is now < 2(2¹³⁰ - 5)
-	// pack h into h1 (hi) and h0 (lo)
+	// Pack each lane in h₂₆[0:4] into h₁₂₈[0:1].
-	PACK(H_0, H_1, H_2, H_3, H_4)
+	VESLG $26, H_1, H_1
-
+	VESLG $26, H_3, H_3
-	// if h > 2**130-5 then h -= 2**130-5
+	VO    H_0, H_1, H_0
-	MOD(H_0, H_1, T_0, T_1, T_2)
+	VO    H_2, H_3, H_2
-
+	VESLG $4, H_2, H_2
-	// h += s
+	VLEIB $7, $48, H_1
-	MOVD  $·bswapMask<>(SB), R5
+	VSLB  H_1, H_2, H_2
-	VL    (R5), T_1
+	VO    H_0, H_2, H_0
-	VL    16(R4), T_0
+	VLEIB $7, $104, H_1
-	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
+	VSLB  H_1, H_4, H_3
-	VAQ   T_0, H_0, H_0
+	VO    H_3, H_0, H_0
-	VPERM H_0, H_0, T_1, H_0    // reverse bytes (to little)
+	VLEIB $7, $24, H_1
-	VST   H_0, (R1)
+	VSRLB H_1, H_4, H_1
 	// update state
 	VSTEG $1, H_0, 0(R1)
 	VSTEG $0, H_0, 8(R1)
 	VSTEG $1, H_1, 16(R1)
 	RET
-b2:
+b2:  // 2 or fewer blocks remaining
 	CMPBLE R3, $16, b1
-	// 2 blocks remaining
+	// Load the 2 remaining blocks (17-32 bytes remaining).
-	SUB    $17, R3
+	MOVD $-17(R3), R0    // index of final byte to load modulo 16
-	VL     (R2), T_0
+	VL   (R2), T_0       // load full 16 byte block
-	VLL    R3, 16(R2), T_1
+	VLL  R0, 16(R2), T_1 // load final (possibly partial) block and pad with zeros to 16 bytes
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, T_1
 	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $12, $1, F_4
 	VLEIB  $4, $1, F_4
-	// setup [r²,r]
+	// The Poly1305 algorithm requires that a 1 bit be appended to
-	VLVGG $1, RSAVE_0, R_0
+	// each message block. If the final block is less than 16 bytes
-	VLVGG $1, RSAVE_1, R_1
+	// long then it is easiest to insert the 1 before the message
-	VLVGG $1, RSAVE_2, R_2
+	// block is split into 26-bit limbs. If, on the other hand, the
-	VLVGG $1, RSAVE_3, R_3
+	// final message block is 16 bytes long then we append the 1 bit
-	VLVGG $1, RSAVE_4, R_4
+	// after expansion as normal.
-	VPDI  $0, R5_1, R5SAVE_1, R5_1
+	MOVBZ  $1, R0
-	VPDI  $0, R5_2, R5SAVE_2, R5_2
+	MOVD   $-16(R3), R3   // index of byte in last block to insert 1 at (could be 16)
-	VPDI  $0, R5_3, R5SAVE_3, R5_3
+	CMPBEQ R3, $16, 2(PC) // skip the insertion if the final block is 16 bytes long
-	VPDI  $0, R5_4, R5SAVE_4, R5_4
+	VLVGB  R3, R0, T_1    // insert 1 into the byte at index R3
 	// Split both blocks into 26-bit limbs in the appropriate lanes.
 	EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
 	// Append a 1 byte to the end of the second to last block.
 	VLEIB $4, $1, M_4
 	// Append a 1 byte to the end of the last block only if it is a
 	// full 16 byte block.
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $12, $1, M_4
 	// Finally, set up the coefficients for the final multiplication.
 	// We have previously saved r and 5r in the 32-bit even indexes
 	// of the R_[0-4] and R5_[1-4] coefficient registers.
 	//
 	// We want lane 0 to be multiplied by r² so that can be kept the
 	// same. We want lane 1 to be multiplied by r so we need to move
 	// the saved r value into the 32-bit odd index in lane 1 by
 	// rotating the 64-bit lane by 32.
 	VGBM   $0x00ff, T_0         // [0, 0xffffffffffffffff] - mask lane 1 only
 	VERIMG $32, R_0, T_0, R_0   // [_,  r²₂₆[0], _,  r₂₆[0]]
 	VERIMG $32, R_1, T_0, R_1   // [_,  r²₂₆[1], _,  r₂₆[1]]
 	VERIMG $32, R_2, T_0, R_2   // [_,  r²₂₆[2], _,  r₂₆[2]]
 	VERIMG $32, R_3, T_0, R_3   // [_,  r²₂₆[3], _,  r₂₆[3]]
 	VERIMG $32, R_4, T_0, R_4   // [_,  r²₂₆[4], _,  r₂₆[4]]
 	VERIMG $32, R5_1, T_0, R5_1 // [_, 5r²₂₆[1], _, 5r₂₆[1]]
 	VERIMG $32, R5_2, T_0, R5_2 // [_, 5r²₂₆[2], _, 5r₂₆[2]]
 	VERIMG $32, R5_3, T_0, R5_3 // [_, 5r²₂₆[3], _, 5r₂₆[3]]
 	VERIMG $32, R5_4, T_0, R5_4 // [_, 5r²₂₆[4], _, 5r₂₆[4]]
 	MOVD $0, R3
 	BR   multiply
 skip:
 	VZERO H_0
 	VZERO H_1
 	VZERO H_2
 	VZERO H_3
 	VZERO H_4
 	CMPBEQ R3, $0, finish
-b1:
+b1:  // 1 block remaining
-	// 1 block remaining
+
-	SUB    $1, R3
+	// Load the final block (1-16 bytes). This will be placed into
-	VLL    R3, (R2), T_0
+	// lane 0.
-	ADD    $1, R3
+	MOVD $-1(R3), R0
 	VLL  R0, (R2), T_0 // pad to 16 bytes with zeros
 	// The Poly1305 algorithm requires that a 1 bit be appended to
 	// each message block. If the final block is less than 16 bytes
 	// long then it is easiest to insert the 1 before the message
 	// block is split into 26-bit limbs. If, on the other hand, the
 	// final message block is 16 bytes long then we append the 1 bit
 	// after expansion as normal.
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, T_0
 	VZERO  T_1
 	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $4, $1, F_4
 	VLEIG  $1, $1, R_0
 	VZERO  R_1
 	VZERO  R_2
 	VZERO  R_3
 	VZERO  R_4
 	VZERO  R5_1
 	VZERO  R5_2
 	VZERO  R5_3
 	VZERO  R5_4
-	// setup [r, 1]
+	// Set the message block in lane 1 to the value 0 so that it
-	VLVGG $0, RSAVE_0, R_0
+	// can be accumulated without affecting the final result.
-	VLVGG $0, RSAVE_1, R_1
+	VZERO T_1
-	VLVGG $0, RSAVE_2, R_2
+
-	VLVGG $0, RSAVE_3, R_3
+	// Split the final message block into 26-bit limbs in lane 0.
-	VLVGG $0, RSAVE_4, R_4
+	// Lane 1 will contain 0.
-	VPDI  $0, R5SAVE_1, R5_1, R5_1
+	EXPAND(T_0, T_1, M_0, M_1, M_2, M_3, M_4)
-	VPDI  $0, R5SAVE_2, R5_2, R5_2
+
-	VPDI  $0, R5SAVE_3, R5_3, R5_3
+	// Append a 1 byte to the end of the last block only if it is a
-	VPDI  $0, R5SAVE_4, R5_4, R5_4
+	// full 16 byte block.
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $4, $1, M_4
 	// We have previously saved r and 5r in the 32-bit even indexes
 	// of the R_[0-4] and R5_[1-4] coefficient registers.
 	//
 	// We want lane 0 to be multiplied by r so we need to move the
 	// saved r value into the 32-bit odd index in lane 0. We want
 	// lane 1 to be set to the value 1. This makes multiplication
 	// a no-op. We do this by setting lane 1 in every register to 0
 	// and then just setting the 32-bit index 3 in R_0 to 1.
 	VZERO T_0
 	MOVD  $0, R0
 	MOVD  $0x10111213, R12
 	VLVGP R12, R0, T_1         // [_, 0x10111213, _, 0x00000000]
 	VPERM T_0, R_0, T_1, R_0   // [_,  r₂₆[0], _, 0]
 	VPERM T_0, R_1, T_1, R_1   // [_,  r₂₆[1], _, 0]
 	VPERM T_0, R_2, T_1, R_2   // [_,  r₂₆[2], _, 0]
 	VPERM T_0, R_3, T_1, R_3   // [_,  r₂₆[3], _, 0]
 	VPERM T_0, R_4, T_1, R_4   // [_,  r₂₆[4], _, 0]
 	VPERM T_0, R5_1, T_1, R5_1 // [_, 5r₂₆[1], _, 0]
 	VPERM T_0, R5_2, T_1, R5_2 // [_, 5r₂₆[2], _, 0]
 	VPERM T_0, R5_3, T_1, R5_3 // [_, 5r₂₆[3], _, 0]
 	VPERM T_0, R5_4, T_1, R5_4 // [_, 5r₂₆[4], _, 0]
 	// Set the value of lane 1 to be 1.
 	VLEIF $3, $1, R_0 // [_,  r₂₆[0], _, 1]
 	MOVD $0, R3
 	BR   multiply
--- a/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
+++ b/vendor/golang.org/x/crypto/poly1305/sum_vmsl_s390x.s
@ -1,909 +0,0 @@
 // Copyright 2018 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // +build s390x,go1.11,!gccgo,!appengine
 #include "textflag.h"
 // Implementation of Poly1305 using the vector facility (vx) and the VMSL instruction.
 // Register aliases used throughout this file.
 //
 // constants: byte-permute masks (loaded from ·constants<> or ·c<>) that are
 // passed to VPERM when message bytes are split into limbs.
 #define EX0   V1
 #define EX1   V2
 #define EX2   V3
 // temporaries
 #define T_0 V4
 #define T_1 V5
 #define T_2 V6
 #define T_3 V7
 #define T_4 V8
 #define T_5 V9
 #define T_6 V10
 #define T_7 V11
 #define T_8 V12
 #define T_9 V13
 #define T_10 V14
 // r**2 & r**4 (three limbs each), plus the limb-1 and limb-2 multiples
 // (R5_1, R5_2) that poly1305vmsl produces by multiplying with 20.
 #define R_0  V15
 #define R_1  V16
 #define R_2  V17
 #define R5_1 V18
 #define R5_2 V19
 // key (r): general purpose registers holding the limbs of r and their
 // multiples so they can be re-inserted into vector registers later.
 #define RSAVE_0 R7
 #define RSAVE_1 R8
 #define RSAVE_2 R9
 #define R5SAVE_1 R10
 #define R5SAVE_2 R11
 // message block
 #define M0 V20
 #define M1 V21
 #define M2 V22
 #define M3 V23
 #define M4 V24
 #define M5 V25
 // accumulator: two sets of three limbs (H*_0 and H*_1)
 #define H0_0 V26
 #define H1_0 V27
 #define H2_0 V28
 #define H0_1 V29
 #define H1_1 V30
 #define H2_1 V31
 // keyMask is ANDed with the first 16 bytes of the key to obtain r
 // (presumably the standard Poly1305 clamp of r — confirm against the spec).
 GLOBL ·keyMask<>(SB), RODATA, $16
 DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
 DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
 // bswapMask is a VPERM selector that reverses the 16 bytes of a vector.
 GLOBL ·bswapMask<>(SB), RODATA, $16
 DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
 DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
 // constants holds the EX0, EX1 and EX2 permute masks used by the EXPACC and
 // EXPACC2 macros.
 GLOBL ·constants<>(SB), RODATA, $48
 // EX0
 DATA ·constants<>+0(SB)/8, $0x18191a1b1c1d1e1f
 DATA ·constants<>+8(SB)/8, $0x0000050403020100
 // EX1
 DATA ·constants<>+16(SB)/8, $0x18191a1b1c1d1e1f
 DATA ·constants<>+24(SB)/8, $0x00000a0908070605
 // EX2
 DATA ·constants<>+32(SB)/8, $0x18191a1b1c1d1e1f
 DATA ·constants<>+40(SB)/8, $0x0000000f0e0d0c0b
 // c holds the alternative EX0, EX1 and EX2 permute masks that are loaded
 // before the main loop, where two message blocks are permuted at once.
 GLOBL ·c<>(SB), RODATA, $48
 // EX0
 DATA ·c<>+0(SB)/8, $0x0000050403020100
 DATA ·c<>+8(SB)/8, $0x0000151413121110
 // EX1
 DATA ·c<>+16(SB)/8, $0x00000a0908070605
 DATA ·c<>+24(SB)/8, $0x00001a1918171615
 // EX2
 DATA ·c<>+32(SB)/8, $0x0000000f0e0d0c0b
 DATA ·c<>+40(SB)/8, $0x0000001f1e1d1c1b
 // reduce holds the two limb masks loaded (through R12) by the REDUCE macro.
 GLOBL ·reduce<>(SB), RODATA, $32
 // 44 bit
 DATA ·reduce<>+0(SB)/8, $0x0
 DATA ·reduce<>+8(SB)/8, $0xfffffffffff
 // 42 bit
 DATA ·reduce<>+16(SB)/8, $0x0
 DATA ·reduce<>+24(SB)/8, $0x3ffffffffff
 // MULTIPLY multiplies the two accumulator limb sets by the matching limbs
 // of r and adds the products onto the m4_*/m5_* inputs.
 // h = (f*g) % (2**130-5) [partial reduction]
 // uses T_0...T_9 temporary registers
 // input: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2
 // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
 // output: m02_0, m02_1, m02_2, m13_0, m13_1, m13_2
 //
 // NOTE: the body names T_0...T_9 and V0 directly; the t0...t9 parameters are
 // never substituted, so the temporaries are fixed regardless of what the
 // caller passes. V0 is used as the addend of the products that start a new
 // sum, so callers are expected to keep it zeroed. The m4_* and m5_*
 // registers are overwritten with partial sums.
 #define MULTIPLY(m02_0, m02_1, m02_2, m13_0, m13_1, m13_2, r_0, r_1, r_2, r5_1, r5_2, m4_0, m4_1, m4_2, m5_0, m5_1, m5_2, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9) \
 	\ // Eliminate the dependency for the last 2 VMSLs
 	VMSLG m02_0, r_2, m4_2, m4_2                       \
 	VMSLG m13_0, r_2, m5_2, m5_2                       \ // 8 VMSLs pipelined
 	VMSLG m02_0, r_0, m4_0, m4_0                       \
 	VMSLG m02_1, r5_2, V0, T_0                         \
 	VMSLG m02_0, r_1, m4_1, m4_1                       \
 	VMSLG m02_1, r_0, V0, T_1                          \
 	VMSLG m02_1, r_1, V0, T_2                          \
 	VMSLG m02_2, r5_1, V0, T_3                         \
 	VMSLG m02_2, r5_2, V0, T_4                         \
 	VMSLG m13_0, r_0, m5_0, m5_0                       \
 	VMSLG m13_1, r5_2, V0, T_5                         \
 	VMSLG m13_0, r_1, m5_1, m5_1                       \
 	VMSLG m13_1, r_0, V0, T_6                          \
 	VMSLG m13_1, r_1, V0, T_7                          \
 	VMSLG m13_2, r5_1, V0, T_8                         \
 	VMSLG m13_2, r5_2, V0, T_9                         \
 	VMSLG m02_2, r_0, m4_2, m4_2                       \
 	VMSLG m13_2, r_0, m5_2, m5_2                       \
 	VAQ   m4_0, T_0, m02_0                             \
 	VAQ   m4_1, T_1, m02_1                             \
 	VAQ   m5_0, T_5, m13_0                             \
 	VAQ   m5_1, T_6, m13_1                             \
 	VAQ   m02_0, T_3, m02_0                            \
 	VAQ   m02_1, T_4, m02_1                            \
 	VAQ   m13_0, T_8, m13_0                            \
 	VAQ   m13_1, T_9, m13_1                            \
 	VAQ   m4_2, T_2, m02_2                             \
 	VAQ   m5_2, T_7, m13_2                             \
 // SQUARE uses three limbs of r and r_2*5 to output square of r
 // uses T_1, T_5 and T_7 temporary registers
 // input: r_0, r_1, r_2, r5_2
 // temp: TEMP0, TEMP1, TEMP2
 // output: p0, p1, p2
 //
 // Each TEMP holds a cross product that is added to its output twice, so
 // the sums formed are:
 //   p0 += r_0*r_0  + 2*(r_1*r5_2)
 //   p1 += r_2*r5_2 + 2*(r_0*r_1)
 //   p2 += r_1*r_1  + 2*(r_0*r_2)
 // p0, p1 and p2 are also inputs (they are the VMSLG addends); the callers in
 // this file zero them first. V0 is used as a zero addend.
 // NOTE(review): the callers in this file pass a value obtained by
 // multiplying limb 2 by 20 as r5_2, not by 5 — confirm the header wording.
 #define SQUARE(r_0, r_1, r_2, r5_2, p0, p1, p2, TEMP0, TEMP1, TEMP2) \
 	VMSLG r_0, r_0, p0, p0     \
 	VMSLG r_1, r5_2, V0, TEMP0 \
 	VMSLG r_2, r5_2, p1, p1    \
 	VMSLG r_0, r_1, V0, TEMP1  \
 	VMSLG r_1, r_1, p2, p2     \
 	VMSLG r_0, r_2, V0, TEMP2  \
 	VAQ   TEMP0, p0, p0        \
 	VAQ   TEMP1, p1, p1        \
 	VAQ   TEMP2, p2, p2        \
 	VAQ   TEMP0, p0, p0        \
 	VAQ   TEMP1, p1, p1        \
 	VAQ   TEMP2, p2, p2        \
 // REDUCE propagates carries through two independent sets of three limbs.
 // carry h0->h1->h2->h0 || h3->h4->h5->h3
 // uses T_2, T_4, T_5, T_7, T_8, T_9
 //       t6,  t7,  t8,  t9, t10, t11
 // input: h0, h1, h2, h3, h4, h5
 // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11
 // output: h0, h1, h2, h3, h4, h5
 //
 // R12 must point at ·reduce<> on entry: the 44 and 42 bit masks are loaded
 // from (R12). After the first carry pass the two limb sets are merged into
 // the two halves of h3, h4 and h5 and a second carry pass is applied to
 // both halves at once; the carry leaving the top limb is multiplied by 5
 // before it is added to the bottom limb.
 #define REDUCE(h0, h1, h2, h3, h4, h5, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
 	VLM    (R12), t6, t7  \ // 44 and 42 bit clear mask
 	VLEIB  $7, $0x28, t10 \ // 5 byte shift mask
 	VREPIB $4, t8         \ // 4 bit shift mask
 	VREPIB $2, t11        \ // 2 bit shift mask
 	VSRLB  t10, h0, t0    \ // h0 byte shift
 	VSRLB  t10, h1, t1    \ // h1 byte shift
 	VSRLB  t10, h2, t2    \ // h2 byte shift
 	VSRLB  t10, h3, t3    \ // h3 byte shift
 	VSRLB  t10, h4, t4    \ // h4 byte shift
 	VSRLB  t10, h5, t5    \ // h5 byte shift
 	VSRL   t8, t0, t0     \ // h0 bit shift
 	VSRL   t8, t1, t1     \ // h1 bit shift
 	VSRL   t11, t2, t2    \ // h2 bit shift
 	VSRL   t8, t3, t3     \ // h3 bit shift
 	VSRL   t8, t4, t4     \ // h4 bit shift
 	VESLG  $2, t2, t9     \ // h2 carry x5
 	VSRL   t11, t5, t5    \ // h5 bit shift
 	VN     t6, h0, h0     \ // h0 clear carry
 	VAQ    t2, t9, t2     \ // h2 carry x5
 	VESLG  $2, t5, t9     \ // h5 carry x5
 	VN     t6, h1, h1     \ // h1 clear carry
 	VN     t7, h2, h2     \ // h2 clear carry
 	VAQ    t5, t9, t5     \ // h5 carry x5
 	VN     t6, h3, h3     \ // h3 clear carry
 	VN     t6, h4, h4     \ // h4 clear carry
 	VN     t7, h5, h5     \ // h5 clear carry
 	VAQ    t0, h1, h1     \ // h0->h1
 	VAQ    t3, h4, h4     \ // h3->h4
 	VAQ    t1, h2, h2     \ // h1->h2
 	VAQ    t4, h5, h5     \ // h4->h5
 	VAQ    t2, h0, h0     \ // h2->h0
 	VAQ    t5, h3, h3     \ // h5->h3
 	VREPG  $1, t6, t6     \ // 44 and 42 bit masks across both halves
 	VREPG  $1, t7, t7     \
 	VSLDB  $8, h0, h0, h0 \ // set up [h0/1/2, h3/4/5]
 	VSLDB  $8, h1, h1, h1 \
 	VSLDB  $8, h2, h2, h2 \
 	VO     h0, h3, h3     \
 	VO     h1, h4, h4     \
 	VO     h2, h5, h5     \
 	VESRLG $44, h3, t0    \ // 44 bit shift right
 	VESRLG $44, h4, t1    \
 	VESRLG $42, h5, t2    \
 	VN     t6, h3, h3     \ // clear carry bits
 	VN     t6, h4, h4     \
 	VN     t7, h5, h5     \
 	VESLG  $2, t2, t9     \ // multiply carry by 5
 	VAQ    t9, t2, t2     \
 	VAQ    t0, h4, h4     \
 	VAQ    t1, h5, h5     \
 	VAQ    t2, h3, h3     \
 // REDUCE2 propagates carries through a single set of three limbs. The
 // carry sequence is applied twice in a row; the carry leaving h2 is
 // multiplied by 5 before it is added to h0. Unlike REDUCE, the masks are
 // generated in registers rather than loaded from memory.
 // carry h0->h1->h2->h0
 // input: h0, h1, h2
 // temp: t0, t1, t2, t3, t4, t5, t6, t7, t8
 // output: h0, h1, h2
 #define REDUCE2(h0, h1, h2, t0, t1, t2, t3, t4, t5, t6, t7, t8) \
 	VLEIB  $7, $0x28, t3 \ // 5 byte shift mask
 	VREPIB $4, t4        \ // 4 bit shift mask
 	VREPIB $2, t7        \ // 2 bit shift mask
 	VGBM   $0x003F, t5   \ // mask to clear carry bits
 	VSRLB  t3, h0, t0    \
 	VSRLB  t3, h1, t1    \
 	VSRLB  t3, h2, t2    \
 	VESRLG $4, t5, t5    \ // 44 bit clear mask
 	VSRL   t4, t0, t0    \
 	VSRL   t4, t1, t1    \
 	VSRL   t7, t2, t2    \
 	VESRLG $2, t5, t6    \ // 42 bit clear mask
 	VESLG  $2, t2, t8    \
 	VAQ    t8, t2, t2    \
 	VN     t5, h0, h0    \
 	VN     t5, h1, h1    \
 	VN     t6, h2, h2    \
 	VAQ    t0, h1, h1    \
 	VAQ    t1, h2, h2    \
 	VAQ    t2, h0, h0    \
 	VSRLB  t3, h0, t0    \
 	VSRLB  t3, h1, t1    \
 	VSRLB  t3, h2, t2    \
 	VSRL   t4, t0, t0    \
 	VSRL   t4, t1, t1    \
 	VSRL   t7, t2, t2    \
 	VN     t5, h0, h0    \
 	VN     t5, h1, h1    \
 	VESLG  $2, t2, t8    \
 	VN     t6, h2, h2    \
 	VAQ    t0, h1, h1    \
 	VAQ    t8, t2, t2    \
 	VAQ    t1, h2, h2    \
 	VAQ    t2, h0, h0    \
 // expands two message blocks into the lower halves of the d registers
 // moves the contents of the d registers into upper halves
 // input: in1, in2, d0, d1, d2, d3, d4, d5
 // temp: TEMP0, TEMP1, TEMP2, TEMP3
 // output: d0, d1, d2, d3, d4, d5
 //
 // in1 is expanded into d0, d1, d2 and in2 into d3, d4, d5. The EX0, EX1 and
 // EX2 registers are used implicitly as the VPERM selectors and must already
 // hold the permute masks.
 #define EXPACC(in1, in2, d0, d1, d2, d3, d4, d5, TEMP0, TEMP1, TEMP2, TEMP3) \
 	VGBM   $0xff3f, TEMP0      \
 	VGBM   $0xff1f, TEMP1      \
 	VESLG  $4, d1, TEMP2       \
 	VESLG  $4, d4, TEMP3       \
 	VESRLG $4, TEMP0, TEMP0    \
 	VPERM  in1, d0, EX0, d0    \
 	VPERM  in2, d3, EX0, d3    \
 	VPERM  in1, d2, EX2, d2    \
 	VPERM  in2, d5, EX2, d5    \
 	VPERM  in1, TEMP2, EX1, d1 \
 	VPERM  in2, TEMP3, EX1, d4 \
 	VN     TEMP0, d0, d0       \
 	VN     TEMP0, d3, d3       \
 	VESRLG $4, d1, d1          \
 	VESRLG $4, d4, d4          \
 	VN     TEMP1, d2, d2       \
 	VN     TEMP1, d5, d5       \
 	VN     TEMP0, d1, d1       \
 	VN     TEMP0, d4, d4       \
 // expands one message block into the lower halves of the d registers
 // moves the contents of the d registers into upper halves
 // input: in, d0, d1, d2
 // temp: TEMP0, TEMP1, TEMP2
 // output: d0, d1, d2
 //
 // This is the single-block form of EXPACC. The EX0, EX1 and EX2 registers
 // are used implicitly as the VPERM selectors and must already hold the
 // permute masks.
 #define EXPACC2(in, d0, d1, d2, TEMP0, TEMP1, TEMP2) \
 	VGBM   $0xff3f, TEMP0     \
 	VESLG  $4, d1, TEMP2      \
 	VGBM   $0xff1f, TEMP1     \
 	VPERM  in, d0, EX0, d0    \
 	VESRLG $4, TEMP0, TEMP0   \
 	VPERM  in, d2, EX2, d2    \
 	VPERM  in, TEMP2, EX1, d1 \
 	VN     TEMP0, d0, d0      \
 	VN     TEMP1, d2, d2      \
 	VESRLG $4, d1, d1         \
 	VN     TEMP0, d1, d1      \
 // pack h2:h0 into h1:h0 (no carry)
 // input: h0, h1, h2
 // output: h0, h1, h2
 //
 // On exit h0 holds the low 128 bits of the packed value and the lower half
 // of h1 holds the remaining upper bits; h2 is clobbered. h1 is also used as
 // scratch space for the byte-shift count.
 #define PACK(h0, h1, h2) \
 	VMRLG  h1, h2, h2  \ // copy h1 to upper half h2
 	VESLG  $44, h1, h1 \ // shift limb 1 44 bits, leaving 20
 	VO     h0, h1, h0  \ // combine h0 with 20 bits from limb 1
 	VESRLG $20, h2, h1 \ // put top 24 bits of limb 1 into h1
 	VLEIG  $1, $0, h1  \ // clear h2 stuff from lower half of h1
 	VO     h0, h1, h0  \ // h0 now has 88 bits (limb 0 and 1)
 	VLEIG  $0, $0, h2  \ // clear upper half of h2
 	VESRLG $40, h2, h1 \ // h1 now has upper two bits of result
 	VLEIB  $7, $88, h1 \ // for byte shift (11 bytes)
 	VSLB   h1, h2, h2  \ // shift h2 11 bytes to the left
 	VO     h0, h2, h0  \ // combine h0 with the shifted bits of limb 2
 	VLEIG  $0, $0, h1  \ // clear upper half of h1
 // if h > 2**130-5 then h -= 2**130-5
 // input: h0, h1
 // temp: t0, t1, t2
 // output: h0
 //
 // t0 is set to h0 + 5 and t1 is derived from the carry out of that addition
 // combined with h1. The last three instructions then pick, bit by bit,
 // between h0 and t0 with t1 acting as the select mask.
 // NOTE(review): the exact polarity of the mask depends on the VNC operand
 // order — confirm before relying on this description.
 #define MOD(h0, h1, t0, t1, t2) \
 	VZERO t0          \
 	VLEIG $1, $5, t0  \
 	VACCQ h0, t0, t1  \
 	VAQ   h0, t0, t0  \
 	VONE  t2          \
 	VLEIG $1, $-4, t2 \
 	VAQ   t2, t1, t1  \
 	VACCQ h1, t1, t1  \
 	VONE  t2          \
 	VAQ   t2, t1, t1  \
 	VN    h0, t1, t2  \
 	VNC   t0, t1, t1  \
 	VO    t1, t2, h0  \
 // func poly1305vmsl(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
 //
 // poly1305vmsl computes the Poly1305 tag of the mlen bytes at m under key
 // and stores the 16 byte result at out.
 //
 // Register use (see the LMG below): R1=out, R2=m (advanced as blocks are
 // consumed), R3=number of message bytes still to process, R4=key, R5 and R6
 // are scratch pointers to constant tables, R12 points at ·reduce<> for the
 // REDUCE macro, and V0 is kept at zero for use as a VMSLG addend.
 TEXT ·poly1305vmsl(SB), $0-32
 	// This code processes 6 + up to 4 blocks (32 bytes) per iteration
 	// using the algorithm described in:
 	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
 	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
 	// And as modified for VMSL as described in
 	// Accelerating Poly1305 Cryptographic Message Authentication on the z14
 	// O'Farrell et al, CASCON 2017, p48-55
 	// https://ibm.ent.box.com/s/jf9gedj0e9d2vjctfyh186shaztavnht
 	LMG   out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
 	VZERO V0                // c
 	// load EX0, EX1 and EX2
 	MOVD $·constants<>(SB), R5
 	VLM  (R5), EX0, EX2        // c
 	// setup r
 	VL    (R4), T_0
 	MOVD  $·keyMask<>(SB), R6
 	VL    (R6), T_1
 	VN    T_0, T_1, T_0
 	VZERO T_2                 // limbs for r
 	VZERO T_3
 	VZERO T_4
 	EXPACC2(T_0, T_2, T_3, T_4, T_1, T_5, T_7)
 	// T_2, T_3, T_4: [0, r]
 	// setup r*20
 	VLEIG $0, $0, T_0
 	VLEIG $1, $20, T_0       // T_0: [0, 20]
 	VZERO T_5
 	VZERO T_6
 	VMSLG T_0, T_3, T_5, T_5
 	VMSLG T_0, T_4, T_6, T_6
 	// store r for final block in GR
 	VLGVG $1, T_2, RSAVE_0  // c
 	VLGVG $1, T_3, RSAVE_1  // c
 	VLGVG $1, T_4, RSAVE_2  // c
 	VLGVG $1, T_5, R5SAVE_1 // c
 	VLGVG $1, T_6, R5SAVE_2 // c
 	// initialize h
 	VZERO H0_0
 	VZERO H1_0
 	VZERO H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	// initialize pointer for reduce constants
 	MOVD $·reduce<>(SB), R12
 	// calculate r**2 and 20*(r**2)
 	VZERO R_0
 	VZERO R_1
 	VZERO R_2
 	SQUARE(T_2, T_3, T_4, T_6, R_0, R_1, R_2, T_1, T_5, T_7)
 	REDUCE2(R_0, R_1, R_2, M0, M1, M2, M3, M4, R5_1, R5_2, M5, T_1)
 	VZERO R5_1
 	VZERO R5_2
 	VMSLG T_0, R_1, R5_1, R5_1
 	VMSLG T_0, R_2, R5_2, R5_2
 	// skip r**4 calculation if 3 blocks or less
 	CMPBLE R3, $48, b4
 	// calculate r**4 and 20*(r**4)
 	VZERO T_8
 	VZERO T_9
 	VZERO T_10
 	SQUARE(R_0, R_1, R_2, R5_2, T_8, T_9, T_10, T_1, T_5, T_7)
 	REDUCE2(T_8, T_9, T_10, M0, M1, M2, M3, M4, T_2, T_3, M5, T_1)
 	VZERO T_2
 	VZERO T_3
 	VMSLG T_0, T_9, T_2, T_2
 	VMSLG T_0, T_10, T_3, T_3
 	// put r**2 to the right and r**4 to the left of R_0, R_1, R_2
 	VSLDB $8, T_8, T_8, T_8
 	VSLDB $8, T_9, T_9, T_9
 	VSLDB $8, T_10, T_10, T_10
 	VSLDB $8, T_2, T_2, T_2
 	VSLDB $8, T_3, T_3, T_3
 	VO T_8, R_0, R_0
 	VO T_9, R_1, R_1
 	VO T_10, R_2, R_2
 	VO T_2, R5_1, R5_1
 	VO T_3, R5_2, R5_2
 	CMPBLE R3, $80, load // less than or equal to 5 blocks in message
 	// 6(or 5+1) blocks
 	SUB    $81, R3
 	VLM    (R2), M0, M4
 	VLL    R3, 80(R2), M5
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBGE R3, $16, 2(PC)
 	VLVGB  R3, R0, M5
 	MOVD   $96(R2), R2
 	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
 	EXPACC(M2, M3, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
 	VLEIB  $2, $1, H2_0
 	VLEIB  $2, $1, H2_1
 	VLEIB  $10, $1, H2_0
 	VLEIB  $10, $1, H2_1
 	VZERO  M0
 	VZERO  M1
 	VZERO  M2
 	VZERO  M3
 	VZERO  T_4
 	VZERO  T_10
 	EXPACC(M4, M5, M0, M1, M2, M3, T_4, T_10, T_0, T_1, T_2, T_3)
 	VLR    T_4, M4
 	VLEIB  $10, $1, M2
 	CMPBLT R3, $16, 2(PC)
 	VLEIB  $10, $1, T_10
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	SUB    $16, R3
 	CMPBLE R3, $0, square
load:
 	// load EX0, EX1 and EX2
 	// (the ·c<> variants, used by the two-block VPERMs in the loop below)
 	MOVD $·c<>(SB), R5
 	VLM  (R5), EX0, EX2
loop:
 	// Each iteration consumes 64 bytes (four blocks) and is only entered
 	// while more than 64 bytes remain.
 	CMPBLE R3, $64, add // b4	// last 4 or less blocks left
 	// next 4 full blocks
 	VLM  (R2), M2, M5
 	SUB  $64, R3
 	MOVD $64(R2), R2
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, T_0, T_1, T_3, T_4, T_5, T_2, T_7, T_8, T_9)
 	// expacc in-lined to create [m2, m3] limbs
 	VGBM   $0x3f3f, T_0     // 44 bit clear mask
 	VGBM   $0x1f1f, T_1     // 40 bit clear mask
 	VPERM  M2, M3, EX0, T_3
 	VESRLG $4, T_0, T_0     // 44 bit clear mask ready
 	VPERM  M2, M3, EX1, T_4
 	VPERM  M2, M3, EX2, T_5
 	VN     T_0, T_3, T_3
 	VESRLG $4, T_4, T_4
 	VN     T_1, T_5, T_5
 	VN     T_0, T_4, T_4
 	VMRHG  H0_1, T_3, H0_0
 	VMRHG  H1_1, T_4, H1_0
 	VMRHG  H2_1, T_5, H2_0
 	VMRLG  H0_1, T_3, H0_1
 	VMRLG  H1_1, T_4, H1_1
 	VMRLG  H2_1, T_5, H2_1
 	VLEIB  $10, $1, H2_0
 	VLEIB  $10, $1, H2_1
 	VPERM  M4, M5, EX0, T_3
 	VPERM  M4, M5, EX1, T_4
 	VPERM  M4, M5, EX2, T_5
 	VN     T_0, T_3, T_3
 	VESRLG $4, T_4, T_4
 	VN     T_1, T_5, T_5
 	VN     T_0, T_4, T_4
 	VMRHG  V0, T_3, M0
 	VMRHG  V0, T_4, M1
 	VMRHG  V0, T_5, M2
 	VMRLG  V0, T_3, M3
 	VMRLG  V0, T_4, M4
 	VMRLG  V0, T_5, M5
 	VLEIB  $10, $1, M2
 	VLEIB  $10, $1, M5
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	// NOTE(review): R3 was greater than 64 before the SUB above, so it is
 	// non-zero here and this branch appears to always be taken, which would
 	// make the fall-through below unreachable — confirm.
 	CMPBNE R3, $0, loop
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	// load EX0, EX1, EX2
 	MOVD $·constants<>(SB), R5
 	VLM  (R5), EX0, EX2
 	// sum vectors
 	VAQ H0_0, H0_1, H0_0
 	VAQ H1_0, H1_1, H1_0
 	VAQ H2_0, H2_1, H2_0
 	// h may be >= 2*(2**130-5) so we need to reduce it again
 	// M0...M4 are used as temps here
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
next:  // carry h1->h2
 	// Finalization: every path ends up here with the accumulator in
 	// H0_0, H1_0 and H2_0.
 	VLEIB  $7, $0x28, T_1
 	VREPIB $4, T_2
 	VGBM   $0x003F, T_3
 	VESRLG $4, T_3
 	// byte shift
 	VSRLB T_1, H1_0, T_4
 	// bit shift
 	VSRL T_2, T_4, T_4
 	// clear h1 carry bits
 	VN T_3, H1_0, H1_0
 	// add carry
 	VAQ T_4, H2_0, H2_0
 	// h is now < 2*(2**130-5)
 	// pack h into h1 (hi) and h0 (lo)
 	PACK(H0_0, H1_0, H2_0)
 	// if h > 2**130-5 then h -= 2**130-5
 	MOD(H0_0, H1_0, T_0, T_1, T_2)
 	// h += s
 	// (s is the second 16 bytes of the key, loaded from 16(R4))
 	MOVD  $·bswapMask<>(SB), R5
 	VL    (R5), T_1
 	VL    16(R4), T_0
 	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
 	VAQ   T_0, H0_0, H0_0
 	VPERM H0_0, H0_0, T_1, H0_0 // reverse bytes (to little)
 	VST   H0_0, (R1)
 	RET
add:
 	// Reached from the loop once 64 bytes or fewer remain.
 	// load EX0, EX1, EX2
 	MOVD $·constants<>(SB), R5
 	VLM  (R5), EX0, EX2
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	// NOTE(review): b4 is the next instruction, so execution continues at
 	// b4 whether or not this branch is taken.
 	CMPBLE R3, $64, b4
b4:
 	CMPBLE R3, $48, b3 // 3 blocks or less
 	// 4(3+1) blocks remaining
 	SUB    $49, R3
 	VLM    (R2), M0, M2
 	VLL    R3, 48(R2), M3
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M3
 	MOVD   $64(R2), R2
 	EXPACC(M0, M1, H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_0, T_1, T_2, T_3)
 	VLEIB  $10, $1, H2_0
 	VLEIB  $10, $1, H2_1
 	VZERO  M0
 	VZERO  M1
 	VZERO  M4
 	VZERO  M5
 	VZERO  T_4
 	VZERO  T_10
 	EXPACC(M2, M3, M0, M1, M4, M5, T_4, T_10, T_0, T_1, T_2, T_3)
 	VLR    T_4, M2
 	VLEIB  $10, $1, M4
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, T_10
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M4, M5, M2, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M3, M4, M5, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG  V0, H0_1, H0_0
 	VMRHG  V0, H1_1, H1_0
 	VMRHG  V0, H2_1, H2_0
 	VMRLG  V0, H0_1, H0_1
 	VMRLG  V0, H1_1, H1_1
 	VMRLG  V0, H2_1, H2_1
 	SUB    $16, R3
 	CMPBLE R3, $0, square // this condition must always hold true!
b3:
 	CMPBLE R3, $32, b2
 	// 3 blocks remaining
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// H*[r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, T_10, M5)
 	SUB    $33, R3
 	VLM    (R2), M0, M1
 	VLL    R3, 32(R2), M2
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M2
 	// H += m0
 	VZERO T_1
 	VZERO T_2
 	VZERO T_3
 	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)
 	VLEIB $10, $1, T_3
 	VAG   H0_0, T_1, H0_0
 	VAG   H1_0, T_2, H1_0
 	VAG   H2_0, T_3, H2_0
 	VZERO M0
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	VZERO T_10
 	// (H+m0)*r
 	// V0 is passed as one of the MULTIPLY accumulator operands here, so it
 	// is overwritten; it is zeroed again right after the reduction.
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M3, M4, M5, V0, T_10, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_10, H0_1, H1_1, H2_1, T_9)
 	// H += m1
 	VZERO V0
 	VZERO T_1
 	VZERO T_2
 	VZERO T_3
 	EXPACC2(M1, T_1, T_2, T_3, T_4, T_5, T_6)
 	VLEIB $10, $1, T_3
 	VAQ   H0_0, T_1, H0_0
 	VAQ   H1_0, T_2, H1_0
 	VAQ   H2_0, T_3, H2_0
 	REDUCE2(H0_0, H1_0, H2_0, M0, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
 	// [H, m2] * [r**2, r]
 	EXPACC2(M2, H0_0, H1_0, H2_0, T_1, T_2, T_3)
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, H2_0
 	VZERO  M0
 	VZERO  M1
 	VZERO  M2
 	VZERO  M3
 	VZERO  M4
 	VZERO  M5
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, H0_1, H1_1, M5, T_10)
 	SUB    $16, R3
 	CMPBLE R3, $0, next   // this condition must always hold true!
b2:
 	CMPBLE R3, $16, b1
 	// 2 blocks remaining
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// H*[r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, T_10, M0, M1, M2, M3, M4, T_4, T_5, T_2, T_7, T_8, T_9)
 	VMRHG V0, H0_1, H0_0
 	VMRHG V0, H1_1, H1_0
 	VMRHG V0, H2_1, H2_0
 	VMRLG V0, H0_1, H0_1
 	VMRLG V0, H1_1, H1_1
 	VMRLG V0, H2_1, H2_1
 	// move h to the left and 0s at the right
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	// get message blocks and append 1 to start
 	SUB    $17, R3
 	VL     (R2), M0
 	VLL    R3, 16(R2), M1
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M1
 	VZERO  T_6
 	VZERO  T_7
 	VZERO  T_8
 	EXPACC2(M0, T_6, T_7, T_8, T_1, T_2, T_3)
 	EXPACC2(M1, T_6, T_7, T_8, T_1, T_2, T_3)
 	VLEIB  $2, $1, T_8
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, T_8
 	// add [m0, m1] to h
 	VAG H0_0, T_6, H0_0
 	VAG H1_0, T_7, H1_0
 	VAG H2_0, T_8, H2_0
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	VZERO T_10
 	VZERO M0
 	// at this point R_0 .. R5_2 look like [r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M2, M3, M4, M5, T_10, M0, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M2, M3, M4, M5, T_9, H0_1, H1_1, H2_1, T_10)
 	SUB    $16, R3, R3
 	CMPBLE R3, $0, next
b1:
 	CMPBLE R3, $0, next
 	// 1 block remaining
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// H*[r**2, r]
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 	// set up [0, m0] limbs
 	SUB    $1, R3
 	VLL    R3, (R2), M0
 	ADD    $1, R3
 	MOVBZ  $1, R0
 	CMPBEQ R3, $16, 2(PC)
 	VLVGB  R3, R0, M0
 	VZERO  T_1
 	VZERO  T_2
 	VZERO  T_3
 	EXPACC2(M0, T_1, T_2, T_3, T_4, T_5, T_6)// limbs: [0, m]
 	CMPBNE R3, $16, 2(PC)
 	VLEIB  $10, $1, T_3
 	// h+m0
 	VAQ H0_0, T_1, H0_0
 	VAQ H1_0, T_2, H1_0
 	VAQ H2_0, T_3, H2_0
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 	BR next
square:
 	// Reached when no message bytes remain: fold the two accumulator
 	// halves into one value before finalization.
 	// setup [r²,r]
 	VSLDB $8, R_0, R_0, R_0
 	VSLDB $8, R_1, R_1, R_1
 	VSLDB $8, R_2, R_2, R_2
 	VSLDB $8, R5_1, R5_1, R5_1
 	VSLDB $8, R5_2, R5_2, R5_2
 	VLVGG $1, RSAVE_0, R_0
 	VLVGG $1, RSAVE_1, R_1
 	VLVGG $1, RSAVE_2, R_2
 	VLVGG $1, R5SAVE_1, R5_1
 	VLVGG $1, R5SAVE_2, R5_2
 	// setup [h0, h1]
 	VSLDB $8, H0_0, H0_0, H0_0
 	VSLDB $8, H1_0, H1_0, H1_0
 	VSLDB $8, H2_0, H2_0, H2_0
 	VO    H0_1, H0_0, H0_0
 	VO    H1_1, H1_0, H1_0
 	VO    H2_1, H2_0, H2_0
 	VZERO H0_1
 	VZERO H1_1
 	VZERO H2_1
 	VZERO M0
 	VZERO M1
 	VZERO M2
 	VZERO M3
 	VZERO M4
 	VZERO M5
 	// (h0*r**2) + (h1*r)
 	MULTIPLY(H0_0, H1_0, H2_0, H0_1, H1_1, H2_1, R_0, R_1, R_2, R5_1, R5_2, M0, M1, M2, M3, M4, M5, T_0, T_1, T_2, T_3, T_4, T_5, T_6, T_7, T_8, T_9)
 	REDUCE2(H0_0, H1_0, H2_0, M0, M1, M2, M3, M4, T_9, T_10, H0_1, M5)
 	BR next
--- a/vendor/golang.org/x/crypto/ssh/agent/client.go
+++ b/vendor/golang.org/x/crypto/ssh/agent/client.go
@ -102,8 +102,9 @@ type ConstraintExtension struct {
 // AddedKey describes an SSH key to be added to an Agent.
 type AddedKey struct {
-	// PrivateKey must be a *rsa.PrivateKey, *dsa.PrivateKey or
+	// PrivateKey must be a *rsa.PrivateKey, *dsa.PrivateKey,
-	// *ecdsa.PrivateKey, which will be inserted into the agent.
+	// ed25519.PrivateKey or *ecdsa.PrivateKey, which will be inserted into the
 	// agent.
 	PrivateKey interface{}
 	// Certificate, if not nil, is communicated to the agent and will be
 	// stored with the key.
@ -566,6 +567,17 @@ func (c *client) insertKey(s interface{}, comment string, constraints []byte) er
 			Comments:    comment,
 			Constraints: constraints,
 		})
 	case ed25519.PrivateKey:
 		req = ssh.Marshal(ed25519KeyMsg{
 			Type:        ssh.KeyAlgoED25519,
 			Pub:         []byte(k)[32:],
 			Priv:        []byte(k),
 			Comments:    comment,
 			Constraints: constraints,
 		})
 	// This function originally supported only *ed25519.PrivateKey, however the
 	// general idiom is to pass ed25519.PrivateKey by value, not by pointer.
 	// We still support the pointer variant for backwards compatibility.
 	case *ed25519.PrivateKey:
 		req = ssh.Marshal(ed25519KeyMsg{
 			Type:        ssh.KeyAlgoED25519,
@ -683,6 +695,18 @@ func (c *client) insertCert(s interface{}, cert *ssh.Certificate, comment string
 			Comments:    comment,
 			Constraints: constraints,
 		})
 	case ed25519.PrivateKey:
 		req = ssh.Marshal(ed25519CertMsg{
 			Type:        cert.Type(),
 			CertBytes:   cert.Marshal(),
 			Pub:         []byte(k)[32:],
 			Priv:        []byte(k),
 			Comments:    comment,
 			Constraints: constraints,
 		})
 	// This function originally supported only *ed25519.PrivateKey, however the
 	// general idiom is to pass ed25519.PrivateKey by value, not by pointer.
 	// We still support the pointer variant for backwards compatibility.
 	case *ed25519.PrivateKey:
 		req = ssh.Marshal(ed25519CertMsg{
 			Type:        cert.Type(),
--- a/vendor/golang.org/x/crypto/ssh/certs.go
+++ b/vendor/golang.org/x/crypto/ssh/certs.go
@ -414,8 +414,8 @@ func (c *CertChecker) CheckCert(principal string, cert *Certificate) error {
 	return nil
 }
-// SignCert sets c.SignatureKey to the authority's public key and stores a
+// SignCert signs the certificate with an authority, setting the Nonce,
-// Signature, by authority, in the certificate.
+// SignatureKey, and Signature fields.
 func (c *Certificate) SignCert(rand io.Reader, authority Signer) error {
 	c.Nonce = make([]byte, 32)
 	if _, err := io.ReadFull(rand, c.Nonce); err != nil {
--- a/vendor/golang.org/x/crypto/ssh/cipher.go
+++ b/vendor/golang.org/x/crypto/ssh/cipher.go
@ -119,7 +119,7 @@ var cipherModes = map[string]*cipherMode{
 	chacha20Poly1305ID: {64, 0, newChaCha20Cipher},
 	// CBC mode is insecure and so is not included in the default config.
-	// (See http://www.isg.rhul.ac.uk/~kp/SandPfinal.pdf). If absolutely
+	// (See https://www.ieee-security.org/TC/SP2013/papers/4977a526.pdf). If absolutely
 	// needed, it's possible to specify a custom Config to enable it.
 	// You should expect that an active attacker can recover plaintext if
 	// you do.
--- a/vendor/golang.org/x/crypto/ssh/internal/bcrypt_pbkdf/bcrypt_pbkdf.go
+++ b/vendor/golang.org/x/crypto/ssh/internal/bcrypt_pbkdf/bcrypt_pbkdf.go
@ -0,0 +1,93 @@
 // Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 // Package bcrypt_pbkdf implements bcrypt_pbkdf(3) from OpenBSD.
 //
 // See https://flak.tedunangst.com/post/bcrypt-pbkdf and
 // https://cvsweb.openbsd.org/cgi-bin/cvsweb/src/lib/libutil/bcrypt_pbkdf.c.
 package bcrypt_pbkdf
 import (
 	"crypto/sha512"
 	"errors"
 	"golang.org/x/crypto/blowfish"
 )
 const blockSize = 32
 // Key derives a key from the password, salt and rounds count, returning a
 // []byte of length keyLen that can be used as cryptographic key.
 //
 // It returns an error when rounds is smaller than 1, when password is empty,
 // when salt is empty or longer than 1<<20 bytes, or when keyLen is negative
 // or greater than 1024.
 func Key(password, salt []byte, rounds, keyLen int) ([]byte, error) {
 	if rounds < 1 {
 		return nil, errors.New("bcrypt_pbkdf: number of rounds is too small")
 	}
 	if len(password) == 0 {
 		return nil, errors.New("bcrypt_pbkdf: empty password")
 	}
 	if len(salt) == 0 || len(salt) > 1<<20 {
 		return nil, errors.New("bcrypt_pbkdf: bad salt length")
 	}
 	// A negative keyLen would otherwise panic below, either in make or when
 	// the result is sliced; report it like every other bad argument.
 	if keyLen < 0 {
 		return nil, errors.New("bcrypt_pbkdf: keyLen is negative")
 	}
 	if keyLen > 1024 {
 		return nil, errors.New("bcrypt_pbkdf: keyLen is too large")
 	}
 	// The output is produced in whole blocks and truncated at the end.
 	numBlocks := (keyLen + blockSize - 1) / blockSize
 	key := make([]byte, numBlocks*blockSize)
 	h := sha512.New()
 	h.Write(password)
 	shapass := h.Sum(nil)
 	// shasalt only provides backing storage: h.Sum appends the digest to it,
 	// so no new digest slice is allocated per hash.
 	shasalt := make([]byte, 0, sha512.Size)
 	cnt, tmp := make([]byte, 4), make([]byte, blockSize)
 	// out is fully overwritten by copy at the start of every block, so a
 	// single buffer can be shared by all blocks.
 	out := make([]byte, blockSize)
 	for block := 1; block <= numBlocks; block++ {
 		// The first round hashes salt || big-endian block counter.
 		h.Reset()
 		h.Write(salt)
 		cnt[0] = byte(block >> 24)
 		cnt[1] = byte(block >> 16)
 		cnt[2] = byte(block >> 8)
 		cnt[3] = byte(block)
 		h.Write(cnt)
 		bcryptHash(tmp, shapass, h.Sum(shasalt))
 		copy(out, tmp)
 		// Every further round hashes the previous round's output and is
 		// XORed into the block.
 		for i := 2; i <= rounds; i++ {
 			h.Reset()
 			h.Write(tmp)
 			bcryptHash(tmp, shapass, h.Sum(shasalt))
 			for j := range out {
 				out[j] ^= tmp[j]
 			}
 		}
 		// The blocks are interleaved: byte i of this block is stored at
 		// offset i*numBlocks + (block-1) of the key.
 		for i, v := range out {
 			key[i*numBlocks+(block-1)] = v
 		}
 	}
 	return key[:keyLen], nil
 }
 // magic is the 32-byte plaintext that bcryptHash encrypts repeatedly.
 var magic = []byte("OxychromaticBlowfishSwatDynamite")

 // bcryptHash writes a 32-byte digest into out, derived from shapass and
 // shasalt. A Blowfish cipher is set up from both inputs, its key schedule is
 // expanded with each of them 64 more times, and magic is then encrypted 64
 // times per 8-byte block. It panics if the cipher cannot be created.
 func bcryptHash(out, shapass, shasalt []byte) {
 	bf, err := blowfish.NewSaltedCipher(shapass, shasalt)
 	if err != nil {
 		panic(err)
 	}
 	for round := 0; round < 64; round++ {
 		blowfish.ExpandKey(shasalt, bf)
 		blowfish.ExpandKey(shapass, bf)
 	}
 	copy(out, magic)
 	for off := 0; off < 32; off += 8 {
 		blk := out[off : off+8]
 		for n := 0; n < 64; n++ {
 			bf.Encrypt(blk, blk)
 		}
 	}
 	// Swap bytes due to different endianness: reverse every 4-byte word.
 	for off := 0; off < 32; off += 4 {
 		out[off], out[off+3] = out[off+3], out[off]
 		out[off+1], out[off+2] = out[off+2], out[off+1]
 	}
 }
--- a/vendor/golang.org/x/crypto/ssh/kex.go
+++ b/vendor/golang.org/x/crypto/ssh/kex.go
@ -572,7 +572,7 @@ func (gex *dhGEXSHA) diffieHellman(theirPublic, myPrivate *big.Int) (*big.Int, e
 	return new(big.Int).Exp(theirPublic, myPrivate, gex.p), nil
 }
-func (gex *dhGEXSHA) Client(c packetConn, randSource io.Reader, magics *handshakeMagics) (*kexResult, error) {
+func (gex dhGEXSHA) Client(c packetConn, randSource io.Reader, magics *handshakeMagics) (*kexResult, error) {
 	// Send GexRequest
 	kexDHGexRequest := kexDHGexRequestMsg{
 		MinBits:      dhGroupExchangeMinimumBits,
@ -677,7 +677,7 @@ func (gex *dhGEXSHA) Client(c packetConn, randSource io.Reader, magics *handshak
 // Server half implementation of the Diffie Hellman Key Exchange with SHA1 and SHA256.
 //
 // This is a minimal implementation to satisfy the automated tests.
-func (gex *dhGEXSHA) Server(c packetConn, randSource io.Reader, magics *handshakeMagics, priv Signer) (result *kexResult, err error) {
+func (gex dhGEXSHA) Server(c packetConn, randSource io.Reader, magics *handshakeMagics, priv Signer) (result *kexResult, err error) {
 	// Receive GexRequest
 	packet, err := c.readPacket()
 	if err != nil {
--- a/vendor/golang.org/x/crypto/ssh/keys.go
+++ b/vendor/golang.org/x/crypto/ssh/keys.go
@ -7,6 +7,8 @@ package ssh
 import (
 	"bytes"
 	"crypto"
 	"crypto/aes"
 	"crypto/cipher"
 	"crypto/dsa"
 	"crypto/ecdsa"
 	"crypto/elliptic"
@ -25,6 +27,7 @@ import (
 	"strings"
 	"golang.org/x/crypto/ed25519"
 	"golang.org/x/crypto/ssh/internal/bcrypt_pbkdf"
 )
 // These constants represent the algorithm names for key types supported by this
@ -559,9 +562,11 @@ func parseED25519(in []byte) (out PublicKey, rest []byte, err error) {
 		return nil, nil, err
 	}
-	key := ed25519.PublicKey(w.KeyBytes)
+	if l := len(w.KeyBytes); l != ed25519.PublicKeySize {
 		return nil, nil, fmt.Errorf("invalid size %d for Ed25519 public key", l)
 	}
-	return (ed25519PublicKey)(key), w.Rest, nil
+	return ed25519PublicKey(w.KeyBytes), w.Rest, nil
 }
 func (k ed25519PublicKey) Marshal() []byte {
@ -579,9 +584,11 @@ func (k ed25519PublicKey) Verify(b []byte, sig *Signature) error {
 	if sig.Format != k.Type() {
 		return fmt.Errorf("ssh: signature type %s for key type %s", sig.Format, k.Type())
 	}
 	if l := len(k); l != ed25519.PublicKeySize {
 		return fmt.Errorf("ssh: invalid size %d for Ed25519 public key", l)
 	}
-	edKey := (ed25519.PublicKey)(k)
+	if ok := ed25519.Verify(ed25519.PublicKey(k), b, sig.Blob); !ok {
 	if ok := ed25519.Verify(edKey, b, sig.Blob); !ok {
 		return errors.New("ssh: signature did not verify")
 	}
@ -835,6 +842,10 @@ func parseSKEd25519(in []byte) (out PublicKey, rest []byte, err error) {
 		return nil, nil, err
 	}
 	if l := len(w.KeyBytes); l != ed25519.PublicKeySize {
 		return nil, nil, fmt.Errorf("invalid size %d for Ed25519 public key", l)
 	}
 	key := new(skEd25519PublicKey)
 	key.application = w.Application
 	key.PublicKey = ed25519.PublicKey(w.KeyBytes)
@ -859,6 +870,9 @@ func (k *skEd25519PublicKey) Verify(data []byte, sig *Signature) error {
 	if sig.Format != k.Type() {
 		return fmt.Errorf("ssh: signature type %s for key type %s", sig.Format, k.Type())
 	}
 	if l := len(k.PublicKey); l != ed25519.PublicKeySize {
 		return fmt.Errorf("invalid size %d for Ed25519 public key", l)
 	}
 	h := sha256.New()
 	h.Write([]byte(k.application))
@ -895,8 +909,7 @@ func (k *skEd25519PublicKey) Verify(data []byte, sig *Signature) error {
 	original := Marshal(blob)
-	edKey := (ed25519.PublicKey)(k.PublicKey)
+	if ok := ed25519.Verify(k.PublicKey, original, edSig.Signature); !ok {
 	if ok := ed25519.Verify(edKey, original, edSig.Signature); !ok {
 		return errors.New("ssh: signature did not verify")
 	}
@ -1048,14 +1061,18 @@ func NewPublicKey(key interface{}) (PublicKey, error) {
 	case *dsa.PublicKey:
 		return (*dsaPublicKey)(key), nil
 	case ed25519.PublicKey:
-		return (ed25519PublicKey)(key), nil
+		if l := len(key); l != ed25519.PublicKeySize {
 			return nil, fmt.Errorf("ssh: invalid size %d for Ed25519 public key", l)
 		}
 		return ed25519PublicKey(key), nil
 	default:
 		return nil, fmt.Errorf("ssh: unsupported key type %T", key)
 	}
 }
 // ParsePrivateKey returns a Signer from a PEM encoded private key. It supports
-// the same keys as ParseRawPrivateKey.
+// the same keys as ParseRawPrivateKey. If the private key is encrypted, it
 // will return a PassphraseMissingError.
 func ParsePrivateKey(pemBytes []byte) (Signer, error) {
 	key, err := ParseRawPrivateKey(pemBytes)
 	if err != nil {
@ -1068,8 +1085,8 @@ func ParsePrivateKey(pemBytes []byte) (Signer, error) {
 // ParsePrivateKeyWithPassphrase returns a Signer from a PEM encoded private
 // key and passphrase. It supports the same keys as
 // ParseRawPrivateKeyWithPassphrase.
-func ParsePrivateKeyWithPassphrase(pemBytes, passPhrase []byte) (Signer, error) {
+func ParsePrivateKeyWithPassphrase(pemBytes, passphrase []byte) (Signer, error) {
-	key, err := ParseRawPrivateKeyWithPassphrase(pemBytes, passPhrase)
+	key, err := ParseRawPrivateKeyWithPassphrase(pemBytes, passphrase)
 	if err != nil {
 		return nil, err
 	}
@ -1085,8 +1102,21 @@ func encryptedBlock(block *pem.Block) bool {
 	return strings.Contains(block.Headers["Proc-Type"], "ENCRYPTED")
 }
 // A PassphraseMissingError indicates that parsing this private key requires a
 // passphrase. Use ParsePrivateKeyWithPassphrase.
 type PassphraseMissingError struct {
 	// PublicKey will be set if the private key format includes an unencrypted
 	// public key along with the encrypted private key.
 	PublicKey PublicKey
 }
 // Error implements the error interface. The message is fixed and does not
 // depend on whether PublicKey is set.
 func (*PassphraseMissingError) Error() string {
 	return "ssh: this private key is passphrase protected"
 }
 // ParseRawPrivateKey returns a private key from a PEM encoded private key. It
-// supports RSA (PKCS#1), PKCS#8, DSA (OpenSSL), and ECDSA private keys.
+// supports RSA (PKCS#1), PKCS#8, DSA (OpenSSL), and ECDSA private keys. If the
 // private key is encrypted, it will return a PassphraseMissingError.
 func ParseRawPrivateKey(pemBytes []byte) (interface{}, error) {
 	block, _ := pem.Decode(pemBytes)
 	if block == nil {
@ -1094,7 +1124,7 @@ func ParseRawPrivateKey(pemBytes []byte) (interface{}, error) {
 	}
 	if encryptedBlock(block) {
-		return nil, errors.New("ssh: cannot decode encrypted private keys")
+		return nil, &PassphraseMissingError{}
 	}
 	switch block.Type {
@ -1108,34 +1138,36 @@ func ParseRawPrivateKey(pemBytes []byte) (interface{}, error) {
 	case "DSA PRIVATE KEY":
 		return ParseDSAPrivateKey(block.Bytes)
 	case "OPENSSH PRIVATE KEY":
-		return parseOpenSSHPrivateKey(block.Bytes)
+		return parseOpenSSHPrivateKey(block.Bytes, unencryptedOpenSSHKey)
 	default:
 		return nil, fmt.Errorf("ssh: unsupported key type %q", block.Type)
 	}
 }
 // ParseRawPrivateKeyWithPassphrase returns a private key decrypted with
-// passphrase from a PEM encoded private key. If wrong passphrase, return
+// passphrase from a PEM encoded private key. If the passphrase is wrong, it
-// x509.IncorrectPasswordError.
+// will return x509.IncorrectPasswordError.
-func ParseRawPrivateKeyWithPassphrase(pemBytes, passPhrase []byte) (interface{}, error) {
+func ParseRawPrivateKeyWithPassphrase(pemBytes, passphrase []byte) (interface{}, error) {
 	block, _ := pem.Decode(pemBytes)
 	if block == nil {
 		return nil, errors.New("ssh: no key found")
 	}
 	buf := block.Bytes
-	if encryptedBlock(block) {
+	if block.Type == "OPENSSH PRIVATE KEY" {
-		if x509.IsEncryptedPEMBlock(block) {
+		return parseOpenSSHPrivateKey(block.Bytes, passphraseProtectedOpenSSHKey(passphrase))
-			var err error
+	}
-			buf, err = x509.DecryptPEMBlock(block, passPhrase)
+
 	if !encryptedBlock(block) || !x509.IsEncryptedPEMBlock(block) {
 		return nil, errors.New("ssh: not an encrypted key")
 	}
 	buf, err := x509.DecryptPEMBlock(block, passphrase)
 	if err != nil {
 		if err == x509.IncorrectPasswordError {
 			return nil, err
 		}
 		return nil, fmt.Errorf("ssh: cannot decode encrypted private keys: %v", err)
 	}
 		}
 	}
 	switch block.Type {
 	case "RSA PRIVATE KEY":
@ -1144,8 +1176,6 @@ func ParseRawPrivateKeyWithPassphrase(pemBytes, passPhrase []byte) (interface{},
 		return x509.ParseECPrivateKey(buf)
 	case "DSA PRIVATE KEY":
 		return ParseDSAPrivateKey(buf)
 	case "OPENSSH PRIVATE KEY":
 		return parseOpenSSHPrivateKey(buf)
 	default:
 		return nil, fmt.Errorf("ssh: unsupported key type %q", block.Type)
 	}
@ -1183,9 +1213,68 @@ func ParseDSAPrivateKey(der []byte) (*dsa.PrivateKey, error) {
 	}, nil
 }
-// Implemented based on the documentation at
+func unencryptedOpenSSHKey(cipherName, kdfName, kdfOpts string, privKeyBlock []byte) ([]byte, error) {
-// https://github.com/openssh/openssh-portable/blob/master/PROTOCOL.key
+	if kdfName != "none" || cipherName != "none" {
-func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
+		return nil, &PassphraseMissingError{}
 	}
 	if kdfOpts != "" {
 		return nil, errors.New("ssh: invalid openssh private key")
 	}
 	return privKeyBlock, nil
 }
 // passphraseProtectedOpenSSHKey returns a decrypt function that unwraps the
 // private section of an OpenSSH key using passphrase.
 //
 // The returned function rejects keys whose KDF or cipher is "none", and any
 // KDF other than "bcrypt". It decodes the salt and round count from kdfOpts,
 // derives 48 bytes with bcrypt_pbkdf.Key (the first 32 are the AES key, the
 // remaining 16 the IV), and decrypts privKeyBlock in place with either
 // aes256-ctr or aes256-cbc. The same slice is returned on success.
 func passphraseProtectedOpenSSHKey(passphrase []byte) openSSHDecryptFunc {
 	const (
 		aesKeyLen = 32
 		ivLen     = 16
 	)
 	return func(cipherName, kdfName, kdfOpts string, privKeyBlock []byte) ([]byte, error) {
 		switch {
 		case kdfName == "none" || cipherName == "none":
 			return nil, errors.New("ssh: key is not password protected")
 		case kdfName != "bcrypt":
 			return nil, fmt.Errorf("ssh: unknown KDF %q, only supports %q", kdfName, "bcrypt")
 		}
 		var kdfParams struct {
 			Salt   string
 			Rounds uint32
 		}
 		if err := Unmarshal([]byte(kdfOpts), &kdfParams); err != nil {
 			return nil, err
 		}
 		derived, err := bcrypt_pbkdf.Key(passphrase, []byte(kdfParams.Salt), int(kdfParams.Rounds), aesKeyLen+ivLen)
 		if err != nil {
 			return nil, err
 		}
 		block, err := aes.NewCipher(derived[:aesKeyLen])
 		if err != nil {
 			return nil, err
 		}
 		iv := derived[aesKeyLen:]
 		switch cipherName {
 		case "aes256-cbc":
 			// CBC works on whole blocks only; reject anything else up front.
 			if len(privKeyBlock)%block.BlockSize() != 0 {
 				return nil, errors.New("ssh: invalid encrypted private key length, not a multiple of the block size")
 			}
 			cipher.NewCBCDecrypter(block, iv).CryptBlocks(privKeyBlock, privKeyBlock)
 		case "aes256-ctr":
 			cipher.NewCTR(block, iv).XORKeyStream(privKeyBlock, privKeyBlock)
 		default:
 			return nil, fmt.Errorf("ssh: unknown cipher %q, only supports %q or %q", cipherName, "aes256-ctr", "aes256-cbc")
 		}
 		return privKeyBlock, nil
 	}
 }
 // openSSHDecryptFunc turns the private-key block of an OpenSSH key file into
 // its plaintext form. It receives the cipher name, KDF name and KDF options
 // recorded in the key file together with the raw private-key block, and
 // returns the bytes to be parsed or an error.
 type openSSHDecryptFunc func(CipherName, KdfName, KdfOpts string, PrivKeyBlock []byte) ([]byte, error)
 // parseOpenSSHPrivateKey parses an OpenSSH private key, using the decrypt
 // function to unwrap the encrypted portion. unencryptedOpenSSHKey can be used
 // as the decrypt function to parse an unencrypted private key. See
 // https://github.com/openssh/openssh-portable/blob/master/PROTOCOL.key.
 func parseOpenSSHPrivateKey(key []byte, decrypt openSSHDecryptFunc) (crypto.PrivateKey, error) {
 	const magic = "openssh-key-v1\x00"
 	if len(key) < len(magic) || string(key[:len(magic)]) != magic {
 		return nil, errors.New("ssh: invalid openssh private key format")
@ -1204,9 +1293,22 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 	if err := Unmarshal(remaining, &w); err != nil {
 		return nil, err
 	}
 	if w.NumKeys != 1 {
 		// We only support single key files, and so does OpenSSH.
 		// https://github.com/openssh/openssh-portable/blob/4103a3ec7/sshkey.c#L4171
 		return nil, errors.New("ssh: multi-key files are not supported")
 	}
-	if w.KdfName != "none" || w.CipherName != "none" {
+	privKeyBlock, err := decrypt(w.CipherName, w.KdfName, w.KdfOpts, w.PrivKeyBlock)
-		return nil, errors.New("ssh: cannot decode encrypted private keys")
+	if err != nil {
 		if err, ok := err.(*PassphraseMissingError); ok {
 			pub, errPub := ParsePublicKey(w.PubKey)
 			if errPub != nil {
 				return nil, fmt.Errorf("ssh: failed to parse embedded public key: %v", errPub)
 			}
 			err.PublicKey = pub
 		}
 		return nil, err
 	}
 	pk1 := struct {
@ -1216,15 +1318,13 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 		Rest    []byte `ssh:"rest"`
 	}{}
-	if err := Unmarshal(w.PrivKeyBlock, &pk1); err != nil {
+	if err := Unmarshal(privKeyBlock, &pk1); err != nil || pk1.Check1 != pk1.Check2 {
-		return nil, err
+		if w.CipherName != "none" {
 			return nil, x509.IncorrectPasswordError
 		}
 		return nil, errors.New("ssh: malformed OpenSSH key")
 	}
 	if pk1.Check1 != pk1.Check2 {
 		return nil, errors.New("ssh: checkint mismatch")
 	}
 	// we only handle ed25519 and rsa keys currently
 	switch pk1.Keytype {
 	case KeyAlgoRSA:
 		// https://github.com/openssh/openssh-portable/blob/master/sshkey.c#L2760-L2773
@ -1243,10 +1343,8 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 			return nil, err
 		}
-		for i, b := range key.Pad {
+		if err := checkOpenSSHKeyPadding(key.Pad); err != nil {
-			if int(b) != i+1 {
+			return nil, err
 				return nil, errors.New("ssh: padding not as expected")
 			}
 		}
 		pk := &rsa.PrivateKey{
@ -1281,20 +1379,78 @@ func parseOpenSSHPrivateKey(key []byte) (crypto.PrivateKey, error) {
 			return nil, errors.New("ssh: private key unexpected length")
 		}
-		for i, b := range key.Pad {
+		if err := checkOpenSSHKeyPadding(key.Pad); err != nil {
-			if int(b) != i+1 {
+			return nil, err
 				return nil, errors.New("ssh: padding not as expected")
 			}
 		}
 		pk := ed25519.PrivateKey(make([]byte, ed25519.PrivateKeySize))
 		copy(pk, key.Priv)
 		return &pk, nil
 	case KeyAlgoECDSA256, KeyAlgoECDSA384, KeyAlgoECDSA521:
 		key := struct {
 			Curve   string
 			Pub     []byte
 			D       *big.Int
 			Comment string
 			Pad     []byte `ssh:"rest"`
 		}{}
 		if err := Unmarshal(pk1.Rest, &key); err != nil {
 			return nil, err
 		}
 		if err := checkOpenSSHKeyPadding(key.Pad); err != nil {
 			return nil, err
 		}
 		var curve elliptic.Curve
 		switch key.Curve {
 		case "nistp256":
 			curve = elliptic.P256()
 		case "nistp384":
 			curve = elliptic.P384()
 		case "nistp521":
 			curve = elliptic.P521()
 		default:
 			return nil, errors.New("ssh: unhandled elliptic curve: " + key.Curve)
 		}
 		X, Y := elliptic.Unmarshal(curve, key.Pub)
 		if X == nil || Y == nil {
 			return nil, errors.New("ssh: failed to unmarshal public key")
 		}
 		if key.D.Cmp(curve.Params().N) >= 0 {
 			return nil, errors.New("ssh: scalar is out of range")
 		}
 		x, y := curve.ScalarBaseMult(key.D.Bytes())
 		if x.Cmp(X) != 0 || y.Cmp(Y) != 0 {
 			return nil, errors.New("ssh: public key does not match private key")
 		}
 		return &ecdsa.PrivateKey{
 			PublicKey: ecdsa.PublicKey{
 				Curve: curve,
 				X:     X,
 				Y:     Y,
 			},
 			D: key.D,
 		}, nil
 	default:
 		return nil, errors.New("ssh: unhandled key type")
 	}
 }
 // checkOpenSSHKeyPadding verifies that pad holds the byte sequence 1, 2, 3, …
 // (the byte at index i must equal i+1). An empty or nil pad is accepted.
 // It returns an error at the first byte that breaks the sequence.
 func checkOpenSSHKeyPadding(pad []byte) error {
 	for idx := 0; idx < len(pad); idx++ {
 		want := idx + 1
 		if got := int(pad[idx]); got != want {
 			return errors.New("ssh: padding not as expected")
 		}
 	}
 	return nil
 }
 // FingerprintLegacyMD5 returns the user presentation of the key's
 // fingerprint as described by RFC 4716 section 4.
 func FingerprintLegacyMD5(pubKey PublicKey) string {
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -618,7 +618,7 @@ go.opencensus.io/trace/tracestate
 ## explicit
 # go.uber.org/zap v1.9.1
 ## explicit
-# golang.org/x/crypto v0.0.0-20191202143827-86a70503ff7e
+# golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37
 ## explicit
 golang.org/x/crypto/bcrypt
 golang.org/x/crypto/blowfish
@ -640,6 +640,7 @@ golang.org/x/crypto/pkcs12/internal/rc2
 golang.org/x/crypto/poly1305
 golang.org/x/crypto/ssh
 golang.org/x/crypto/ssh/agent
 golang.org/x/crypto/ssh/internal/bcrypt_pbkdf
 golang.org/x/crypto/ssh/knownhosts
 # golang.org/x/mod v0.2.0
 ## explicit