1. update clientset, deepcopy using code-generator

2. add a dummy file tools.go to force "go mod vendor" to see code-generator as a dependency; 3. add a script to update the CRD; 4. add a README to document the CRD updating steps; run go mod tidy; update README
2019-12-03 01:22:21 -08:00
parent 90533183e4
commit 728e29aa7e
1128 changed files with 167705 additions and 5135 deletions
--- a/vendor/gonum.org/v1/gonum/blas/README.md
+++ b/vendor/gonum.org/v1/gonum/blas/README.md
@@ -0,0 +1,47 @@
+# Gonum BLAS [![GoDoc](https://godoc.org/gonum.org/v1/gonum/blas?status.svg)](https://godoc.org/gonum.org/v1/gonum/blas)
+
+A collection of packages to provide BLAS functionality for the [Go programming
+language](http://golang.org)
+
+## Installation
+```sh
+  go get gonum.org/v1/gonum/blas/...
+```
+
+## Packages
+
+### blas
+
+Defines [BLAS API](http://www.netlib.org/blas/blast-forum/cinterface.pdf) split in several
+interfaces.
+
+### blas/gonum
+
+Go implementation of the BLAS API (incomplete, implements the `float32` and `float64` API).
+
+### blas/blas64 and blas/blas32
+
+Wrappers for an implementation of the double (i.e., `float64`) and single (`float32`)
+precision real parts of the BLAS API.
+
+```Go
+package main
+
+import (
+	"fmt"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+func main() {
+	v := blas64.Vector{N: 3, Inc: 1, Data: []float64{1, 1, 1}}
+	fmt.Println("v has length:", blas64.Nrm2(v))
+}
+```
+
+### blas/cblas128 and blas/cblas64
+
+Wrappers for an implementation of the double (i.e., `complex128`) and single (`complex64`)
+precision complex parts of the BLAS API.
+
+Currently blas/cblas64 and blas/cblas128 require gonum.org/v1/netlib/blas.
--- a/vendor/gonum.org/v1/gonum/blas/blas.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas.go
@@ -0,0 +1,283 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate ./conversions.bash
+
+package blas
+
+// Flag constants indicate Givens transformation H matrix state.
+type Flag int
+
+const (
+	Identity    Flag = -2 // H is the identity matrix; no rotation is needed.
+	Rescaling   Flag = -1 // H specifies rescaling.
+	OffDiagonal Flag = 0  // Off-diagonal elements of H are non-unit.
+	Diagonal    Flag = 1  // Diagonal elements of H are non-unit.
+)
+
+// SrotmParams contains Givens transformation parameters returned
+// by the Float32 Srotm method.
+type SrotmParams struct {
+	Flag
+	H [4]float32 // Column-major 2 by 2 matrix.
+}
+
+// DrotmParams contains Givens transformation parameters returned
+// by the Float64 Drotm method.
+type DrotmParams struct {
+	Flag
+	H [4]float64 // Column-major 2 by 2 matrix.
+}
+
+// Transpose specifies the transposition operation of a matrix.
+type Transpose byte
+
+const (
+	NoTrans   Transpose = 'N'
+	Trans     Transpose = 'T'
+	ConjTrans Transpose = 'C'
+)
+
+// Uplo specifies whether a matrix is upper or lower triangular.
+type Uplo byte
+
+const (
+	Upper Uplo = 'U'
+	Lower Uplo = 'L'
+	All   Uplo = 'A'
+)
+
+// Diag specifies whether a matrix is unit triangular.
+type Diag byte
+
+const (
+	NonUnit Diag = 'N'
+	Unit    Diag = 'U'
+)
+
+// Side specifies from which side a multiplication operation is performed.
+type Side byte
+
+const (
+	Left  Side = 'L'
+	Right Side = 'R'
+)
+
+// Float32 implements the single precision real BLAS routines.
+type Float32 interface {
+	Float32Level1
+	Float32Level2
+	Float32Level3
+}
+
+// Float32Level1 implements the single precision real BLAS Level 1 routines.
+type Float32Level1 interface {
+	Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32
+	Dsdot(n int, x []float32, incX int, y []float32, incY int) float64
+	Sdot(n int, x []float32, incX int, y []float32, incY int) float32
+	Snrm2(n int, x []float32, incX int) float32
+	Sasum(n int, x []float32, incX int) float32
+	Isamax(n int, x []float32, incX int) int
+	Sswap(n int, x []float32, incX int, y []float32, incY int)
+	Scopy(n int, x []float32, incX int, y []float32, incY int)
+	Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int)
+	Srotg(a, b float32) (c, s, r, z float32)
+	Srotmg(d1, d2, b1, b2 float32) (p SrotmParams, rd1, rd2, rb1 float32)
+	Srot(n int, x []float32, incX int, y []float32, incY int, c, s float32)
+	Srotm(n int, x []float32, incX int, y []float32, incY int, p SrotmParams)
+	Sscal(n int, alpha float32, x []float32, incX int)
+}
+
+// Float32Level2 implements the single precision real BLAS Level 2 routines.
+type Float32Level2 interface {
+	Sgemv(tA Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Sgbmv(tA Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Strmv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
+	Stbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
+	Stpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
+	Strsv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
+	Stbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
+	Stpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
+	Ssymv(ul Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Ssbmv(ul Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Sspmv(ul Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int)
+	Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
+	Ssyr(ul Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int)
+	Sspr(ul Uplo, n int, alpha float32, x []float32, incX int, ap []float32)
+	Ssyr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
+	Sspr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32)
+}
+
+// Float32Level3 implements the single precision real BLAS Level 3 routines.
+type Float32Level3 interface {
+	Sgemm(tA, tB Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Ssymm(s Side, ul Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Ssyrk(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int)
+	Ssyr2k(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Strmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
+	Strsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
+}
+
+// Float64 implements the double precision real BLAS routines.
+// It is the union of the Float64Level1, Float64Level2 and Float64Level3
+// interfaces.
+type Float64 interface {
+	Float64Level1
+	Float64Level2
+	Float64Level3
+}
+
+// Float64Level1 implements the double precision real BLAS Level 1 routines.
+type Float64Level1 interface {
+	Ddot(n int, x []float64, incX int, y []float64, incY int) float64
+	Dnrm2(n int, x []float64, incX int) float64
+	Dasum(n int, x []float64, incX int) float64
+	Idamax(n int, x []float64, incX int) int
+	Dswap(n int, x []float64, incX int, y []float64, incY int)
+	Dcopy(n int, x []float64, incX int, y []float64, incY int)
+	Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int)
+	Drotg(a, b float64) (c, s, r, z float64)
+	Drotmg(d1, d2, b1, b2 float64) (p DrotmParams, rd1, rd2, rb1 float64)
+	Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64)
+	Drotm(n int, x []float64, incX int, y []float64, incY int, p DrotmParams)
+	Dscal(n int, alpha float64, x []float64, incX int)
+}
+
+// Float64Level2 implements the double precision real BLAS Level 2 routines.
+type Float64Level2 interface {
+	Dgemv(tA Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dgbmv(tA Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dtrmv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
+	Dtbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
+	Dtpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
+	Dtrsv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
+	Dtbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
+	Dtpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
+	Dsymv(ul Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dsbmv(ul Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dspmv(ul Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int)
+	Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
+	Dsyr(ul Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int)
+	Dspr(ul Uplo, n int, alpha float64, x []float64, incX int, ap []float64)
+	Dsyr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
+	Dspr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64)
+}
+
+// Float64Level3 implements the double precision real BLAS Level 3 routines.
+type Float64Level3 interface {
+	Dgemm(tA, tB Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dsymm(s Side, ul Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dsyrk(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int)
+	Dsyr2k(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dtrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
+	Dtrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
+}
+
+// Complex64 implements the single precision complex BLAS routines.
+type Complex64 interface {
+	Complex64Level1
+	Complex64Level2
+	Complex64Level3
+}
+
+// Complex64Level1 implements the single precision complex BLAS Level 1 routines.
+type Complex64Level1 interface {
+	Cdotu(n int, x []complex64, incX int, y []complex64, incY int) (dotu complex64)
+	Cdotc(n int, x []complex64, incX int, y []complex64, incY int) (dotc complex64)
+	Scnrm2(n int, x []complex64, incX int) float32
+	Scasum(n int, x []complex64, incX int) float32
+	Icamax(n int, x []complex64, incX int) int
+	Cswap(n int, x []complex64, incX int, y []complex64, incY int)
+	Ccopy(n int, x []complex64, incX int, y []complex64, incY int)
+	Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int)
+	Cscal(n int, alpha complex64, x []complex64, incX int)
+	Csscal(n int, alpha float32, x []complex64, incX int)
+}
+
+// Complex64Level2 implements the single precision complex BLAS Level 2 routines.
+type Complex64Level2 interface {
+	Cgemv(tA Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Cgbmv(tA Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Ctrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
+	Ctbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
+	Ctpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
+	Ctrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
+	Ctbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
+	Ctpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
+	Chemv(ul Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Chbmv(ul Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Chpmv(ul Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Cher(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int)
+	Chpr(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64)
+	Cher2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Chpr2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64)
+}
+
+// Complex64Level3 implements the single precision complex BLAS Level 3 routines.
+type Complex64Level3 interface {
+	Cgemm(tA, tB Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Csymm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Csyrk(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int)
+	Csyr2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Ctrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
+	Ctrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
+	Chemm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Cherk(ul Uplo, t Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int)
+	Cher2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int)
+}
+
+// Complex128 implements the double precision complex BLAS routines.
+type Complex128 interface {
+	Complex128Level1
+	Complex128Level2
+	Complex128Level3
+}
+
+// Complex128Level1 implements the double precision complex BLAS Level 1 routines.
+type Complex128Level1 interface {
+	Zdotu(n int, x []complex128, incX int, y []complex128, incY int) (dotu complex128)
+	Zdotc(n int, x []complex128, incX int, y []complex128, incY int) (dotc complex128)
+	Dznrm2(n int, x []complex128, incX int) float64
+	Dzasum(n int, x []complex128, incX int) float64
+	Izamax(n int, x []complex128, incX int) int
+	Zswap(n int, x []complex128, incX int, y []complex128, incY int)
+	Zcopy(n int, x []complex128, incX int, y []complex128, incY int)
+	Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int)
+	Zscal(n int, alpha complex128, x []complex128, incX int)
+	Zdscal(n int, alpha float64, x []complex128, incX int)
+}
+
+// Complex128Level2 implements the double precision complex BLAS Level 2 routines.
+type Complex128Level2 interface {
+	Zgemv(tA Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zgbmv(tA Transpose, m, n int, kL int, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Ztrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
+	Ztbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
+	Ztpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
+	Ztrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
+	Ztbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
+	Ztpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
+	Zhemv(ul Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zhbmv(ul Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zhpmv(ul Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zher(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int)
+	Zhpr(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128)
+	Zher2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zhpr2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128)
+}
+
+// Complex128Level3 implements the double precision complex BLAS Level 3 routines.
+type Complex128Level3 interface {
+	Zgemm(tA, tB Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zsymm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zsyrk(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int)
+	Zsyr2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Ztrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
+	Ztrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
+	Zhemm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zherk(ul Uplo, t Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int)
+	Zher2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int)
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go
@@ -0,0 +1,469 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/gonum"
+)
+
+var blas64 blas.Float64 = gonum.Implementation{}
+
+// Use sets the BLAS float64 implementation to be used by subsequent BLAS calls.
+// The default implementation is
+// gonum.org/v1/gonum/blas/gonum.Implementation.
+func Use(b blas.Float64) {
+	blas64 = b
+}
+
+// Implementation returns the current BLAS float64 implementation.
+//
+// Implementation allows direct calls to the current BLAS float64 implementation
+// giving finer control of parameters.
+func Implementation() blas.Float64 {
+	return blas64
+}
+
+// Vector represents a vector with an associated element increment.
+type Vector struct {
+	N    int       // N is the vector length; the wrappers pass it as n to the BLAS routines.
+	Data []float64 // Data holds the vector elements.
+	Inc  int       // Inc is the element increment; the wrappers pass it as incX/incY.
+}
+
+// General represents a matrix using the conventional storage scheme.
+type General struct {
+	Rows, Cols int
+	Data       []float64
+	Stride     int
+}
+
+// Band represents a band matrix using the band storage scheme.
+type Band struct {
+	Rows, Cols int
+	KL, KU     int
+	Data       []float64
+	Stride     int
+}
+
+// Triangular represents a triangular matrix using the conventional storage scheme.
+type Triangular struct {
+	Uplo   blas.Uplo
+	Diag   blas.Diag
+	N      int
+	Data   []float64
+	Stride int
+}
+
+// TriangularBand represents a triangular matrix using the band storage scheme.
+type TriangularBand struct {
+	Uplo   blas.Uplo
+	Diag   blas.Diag
+	N, K   int
+	Data   []float64
+	Stride int
+}
+
+// TriangularPacked represents a triangular matrix using the packed storage scheme.
+type TriangularPacked struct {
+	Uplo blas.Uplo
+	Diag blas.Diag
+	N    int
+	Data []float64
+}
+
+// Symmetric represents a symmetric matrix using the conventional storage scheme.
+type Symmetric struct {
+	Uplo   blas.Uplo
+	N      int
+	Data   []float64
+	Stride int
+}
+
+// SymmetricBand represents a symmetric matrix using the band storage scheme.
+type SymmetricBand struct {
+	Uplo   blas.Uplo
+	N, K   int
+	Data   []float64
+	Stride int
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+type SymmetricPacked struct {
+	Uplo blas.Uplo
+	N    int
+	Data []float64
+}
+
+// Level 1
+
+const (
+	negInc    = "blas64: negative vector increment"
+	badLength = "blas64: vector length mismatch"
+)
+
+// Dot computes the dot product of the two vectors:
+//  \sum_i x[i]*y[i].
+//
+// Dot will panic if the lengths of x and y do not match.
+func Dot(x, y Vector) float64 {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	return blas64.Ddot(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 computes the Euclidean norm of the vector x:
+//  sqrt(\sum_i x[i]*x[i]).
+//
+// Nrm2 will panic if the vector increment is negative.
+func Nrm2(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Dnrm2(x.N, x.Data, x.Inc)
+}
+
+// Asum computes the sum of the absolute values of the elements of x:
+//  \sum_i |x[i]|.
+//
+// Asum will panic if the vector increment is negative.
+func Asum(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Dasum(x.N, x.Data, x.Inc)
+}
+
+// Iamax returns the index of an element of x with the largest absolute value.
+// If there are multiple such indices the earliest is returned.
+// Iamax returns -1 if x.N == 0.
+//
+// Iamax will panic if the vector increment is negative.
+func Iamax(x Vector) int {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Idamax(x.N, x.Data, x.Inc)
+}
+
+// Swap exchanges the elements of the two vectors:
+//  x[i], y[i] = y[i], x[i] for all i.
+//
+// Swap will panic if the lengths of x and y do not match.
+func Swap(x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Dswap(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Copy copies the elements of x into the elements of y:
+//  y[i] = x[i] for all i.
+// Copy requires that the lengths of x and y match and will panic otherwise.
+func Copy(x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Dcopy(x.N, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy adds x scaled by alpha to y:
+//  y[i] += alpha*x[i] for all i.
+//
+// Axpy will panic if the lengths of x and y do not match.
+func Axpy(alpha float64, x, y Vector) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Daxpy(x.N, alpha, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Rotg computes the parameters of a Givens plane rotation so that
+//  ⎡ c s⎤   ⎡a⎤   ⎡r⎤
+//  ⎣-s c⎦ * ⎣b⎦ = ⎣0⎦
+// where a and b are the Cartesian coordinates of a given point.
+// c, s, and r are defined as
+//  r = ±Sqrt(a^2 + b^2),
+//  c = a/r, the cosine of the rotation angle,
+//  s = b/r, the sine of the rotation angle,
+// and z is defined such that
+//  if |a| > |b|,        z = s,
+//  otherwise if c != 0, z = 1/c,
+//  otherwise            z = 1.
+func Rotg(a, b float64) (c, s, r, z float64) {
+	return blas64.Drotg(a, b)
+}
+
+// Rotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details.
+func Rotmg(d1, d2, b1, b2 float64) (p blas.DrotmParams, rd1, rd2, rb1 float64) {
+	return blas64.Drotmg(d1, d2, b1, b2)
+}
+
+// Rot applies a plane transformation to n points represented by the vectors x
+// and y:
+//  x[i] =  c*x[i] + s*y[i],
+//  y[i] = -s*x[i] + c*y[i], for all i.
+//
+// Rot will panic if the lengths of x and y do not match.
+func Rot(x, y Vector, c, s float64) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Drot(x.N, x.Data, x.Inc, y.Data, y.Inc, c, s)
+}
+
+// Rotm applies the modified Givens rotation to n points represented by the
+// vectors x and y.
+//
+// Rotm will panic if the lengths of x and y do not match.
+func Rotm(x, y Vector, p blas.DrotmParams) {
+	if x.N != y.N {
+		panic(badLength)
+	}
+	blas64.Drotm(x.N, x.Data, x.Inc, y.Data, y.Inc, p)
+}
+
+// Scal scales the vector x by alpha:
+//  x[i] *= alpha for all i.
+//
+// Scal will panic if the vector increment is negative.
+func Scal(alpha float64, x Vector) {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	blas64.Dscal(x.N, alpha, x.Data, x.Inc)
+}
+
+// Level 2
+
+// Gemv computes
+//  y = alpha * A * x + beta * y,   if t == blas.NoTrans,
+//  y = alpha * A^T * x + beta * y, if t == blas.Trans or blas.ConjTrans,
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
+func Gemv(t blas.Transpose, alpha float64, a General, x Vector, beta float64, y Vector) {
+	blas64.Dgemv(t, a.Rows, a.Cols, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Gbmv computes
+//  y = alpha * A * x + beta * y,   if t == blas.NoTrans,
+//  y = alpha * A^T * x + beta * y, if t == blas.Trans or blas.ConjTrans,
+// where A is an m×n band matrix, x and y are vectors, and alpha and beta are scalars.
+func Gbmv(t blas.Transpose, alpha float64, a Band, x Vector, beta float64, y Vector) {
+	blas64.Dgbmv(t, a.Rows, a.Cols, a.KL, a.KU, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Trmv computes
+//  x = A * x,   if t == blas.NoTrans,
+//  x = A^T * x, if t == blas.Trans or blas.ConjTrans,
+// where A is an n×n triangular matrix, and x is a vector.
+func Trmv(t blas.Transpose, a Triangular, x Vector) {
+	blas64.Dtrmv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbmv computes
+//  x = A * x,   if t == blas.NoTrans,
+//  x = A^T * x, if t == blas.Trans or blas.ConjTrans,
+// where A is an n×n triangular band matrix, and x is a vector.
+func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
+	blas64.Dtbmv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpmv computes
+//  x = A * x,   if t == blas.NoTrans,
+//  x = A^T * x, if t == blas.Trans or blas.ConjTrans,
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
+	blas64.Dtpmv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Trsv solves
+//  A * x = b,   if t == blas.NoTrans,
+//  A^T * x = b, if t == blas.Trans or blas.ConjTrans,
+// where A is an n×n triangular matrix, and x and b are vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+	blas64.Dtrsv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbsv solves
+//  A * x = b,   if t == blas.NoTrans,
+//  A^T * x = b, if t == blas.Trans or blas.ConjTrans,
+// where A is an n×n triangular band matrix, and x and b are vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+	blas64.Dtbsv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpsv solves
+//  A * x = b,   if t == blas.NoTrans,
+//  A^T * x = b, if t == blas.Trans or blas.ConjTrans,
+// where A is an n×n triangular matrix in packed format, and x and b are
+// vectors.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in place into x.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+	blas64.Dtpsv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Symv computes
+//    y = alpha * A * x + beta * y,
+// where A is an n×n symmetric matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func Symv(alpha float64, a Symmetric, x Vector, beta float64, y Vector) {
+	blas64.Dsymv(a.Uplo, a.N, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Sbmv performs
+//  y = alpha * A * x + beta * y,
+// where A is an n×n symmetric band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+func Sbmv(alpha float64, a SymmetricBand, x Vector, beta float64, y Vector) {
+	blas64.Dsbmv(a.Uplo, a.N, a.K, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Spmv performs
+//    y = alpha * A * x + beta * y,
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+func Spmv(alpha float64, a SymmetricPacked, x Vector, beta float64, y Vector) {
+	blas64.Dspmv(a.Uplo, a.N, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Ger performs a rank-1 update
+//  A += alpha * x * y^T,
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Ger(alpha float64, x, y Vector, a General) {
+	blas64.Dger(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Syr performs a rank-1 update
+//  A += alpha * x * x^T,
+// where A is an n×n symmetric matrix, x is a vector, and alpha is a scalar.
+func Syr(alpha float64, x Vector, a Symmetric) {
+	blas64.Dsyr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride)
+}
+
+// Spr performs the rank-1 update
+//  A += alpha * x * x^T,
+// where A is an n×n symmetric matrix in packed format, x is a vector, and
+// alpha is a scalar.
+func Spr(alpha float64, x Vector, a SymmetricPacked) {
+	blas64.Dspr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data)
+}
+
+// Syr2 performs a rank-2 update
+//  A += alpha * x * y^T + alpha * y * x^T,
+// where A is a symmetric n×n matrix, x and y are vectors, and alpha is a scalar.
+func Syr2(alpha float64, x, y Vector, a Symmetric) {
+	blas64.Dsyr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Spr2 performs a rank-2 update
+//  A += alpha * x * y^T + alpha * y * x^T,
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+func Spr2(alpha float64, x, y Vector, a SymmetricPacked) {
+	blas64.Dspr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data)
+}
+
+// Level 3
+
+// Gemm computes
+//  C = alpha * A * B + beta * C,
+// where A, B, and C are dense matrices, and alpha and beta are scalars.
+// tA and tB specify whether A or B are transposed.
+//
+// The dimensions m, n and k passed to the implementation are derived from
+// a and b only; the Rows and Cols fields of c are not consulted here.
+func Gemm(tA, tB blas.Transpose, alpha float64, a, b General, beta float64, c General) {
+	var m, n, k int
+	// The (possibly transposed) A is m×k: Rows and Cols swap roles when
+	// tA is anything other than blas.NoTrans.
+	if tA == blas.NoTrans {
+		m, k = a.Rows, a.Cols
+	} else {
+		m, k = a.Cols, a.Rows
+	}
+	// k has already been taken from a, so only n is read from b.
+	if tB == blas.NoTrans {
+		n = b.Cols
+	} else {
+		n = b.Rows
+	}
+	blas64.Dgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Symm performs
+//  C = alpha * A * B + beta * C, if s == blas.Left,
+//  C = alpha * B * A + beta * C, if s == blas.Right,
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Symm(s blas.Side, alpha float64, a Symmetric, b General, beta float64, c General) {
+	var m, n int
+	if s == blas.Left {
+		// A multiplies from the left, so m is taken from a and n from b.
+		m, n = a.N, b.Cols
+	} else {
+		// Any other side value is handled as a right multiplication:
+		// m is taken from b and n from a.
+		m, n = b.Rows, a.N
+	}
+	blas64.Dsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Syrk performs a symmetric rank-k update
+//  C = alpha * A * A^T + beta * C, if t == blas.NoTrans,
+//  C = alpha * A^T * A + beta * C, if t == blas.Trans or blas.ConjTrans,
+// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans and
+// a k×n matrix otherwise, and alpha and beta are scalars.
+func Syrk(t blas.Transpose, alpha float64, a General, beta float64, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	blas64.Dsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Syr2k performs a symmetric rank-2k update
+//  C = alpha * A * B^T + alpha * B * A^T + beta * C, if t == blas.NoTrans,
+//  C = alpha * A^T * B + alpha * B^T * A + beta * C, if t == blas.Trans or blas.ConjTrans,
+// where C is an n×n symmetric matrix, A and B are n×k matrices if t == NoTrans
+// and k×n matrices otherwise, and alpha and beta are scalars.
+func Syr2k(t blas.Transpose, alpha float64, a, b General, beta float64, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	blas64.Dsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Trmm performs
+//  B = alpha * A * B,   if tA == blas.NoTrans and s == blas.Left,
+//  B = alpha * A^T * B, if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
+//  B = alpha * B * A,   if tA == blas.NoTrans and s == blas.Right,
+//  B = alpha * B * A^T, if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is
+// a scalar.
+func Trmm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
+	blas64.Dtrmm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Trsm solves
+//  A * X = alpha * B,   if tA == blas.NoTrans and s == blas.Left,
+//  A^T * X = alpha * B, if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
+//  X * A = alpha * B,   if tA == blas.NoTrans and s == blas.Right,
+//  X * A^T = alpha * B, if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
+// alpha is a scalar.
+//
+// At entry to the function, X contains the values of B, and the result is
+// stored in-place into X.
+//
+// No check is made that A is invertible.
+func Trsm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
+	blas64.Dtrsm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/conv.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv.go
@@ -0,0 +1,277 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import "gonum.org/v1/gonum/blas"
+
+// GeneralCols represents a matrix using the conventional column-major storage scheme.
+// Element (i, j) is located at Data[i+j*Stride].
+type GeneralCols General
+
+// From copies the elements of the row-major matrix a into the
+// column-major receiver. It panics if the dimensions of t and a
+// differ or if t.Data is too short to hold the result.
+func (t GeneralCols) From(a General) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if need := (t.Cols-1)*t.Stride + t.Rows; len(t.Data) < need {
+		panic("blas64: short data slice")
+	}
+	for r := 0; r < a.Rows; r++ {
+		row := a.Data[r*a.Stride : r*a.Stride+a.Cols]
+		for c, v := range row {
+			t.Data[r+c*t.Stride] = v
+		}
+	}
+}
+
+// From copies the elements of the column-major matrix a into the
+// row-major receiver. It panics if the dimensions of t and a differ
+// or if t.Data is too short to hold the result.
+func (t General) From(a GeneralCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if need := (t.Rows-1)*t.Stride + t.Cols; len(t.Data) < need {
+		panic("blas64: short data slice")
+	}
+	for c := 0; c < a.Cols; c++ {
+		col := a.Data[c*a.Stride : c*a.Stride+a.Rows]
+		for r, v := range col {
+			t.Data[r*t.Stride+c] = v
+		}
+	}
+}
+
+// TriangularCols represents a triangular matrix using the conventional
+// column-major storage scheme. Element (i, j) is located at Data[i+j*Stride].
+type TriangularCols Triangular
+
+// From copies the part of a selected by a.Uplo into the receiver,
+// converting from row-major to column-major layout. It panics if N,
+// Uplo or Diag differ between t and a, or if a.Uplo is not one of
+// blas.Upper, blas.Lower and blas.All. The length of t.Data is not
+// checked up front.
+func (t TriangularCols) From(a Triangular) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("blas64: mismatched BLAS diag")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.All:
+		for r := 0; r < n; r++ {
+			for c := 0; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	default:
+		panic("blas64: bad BLAS uplo")
+	}
+}
+
+// From copies the part of a selected by a.Uplo into the receiver,
+// converting from column-major to row-major layout. It panics if N,
+// Uplo or Diag differ between t and a, or if a.Uplo is not one of
+// blas.Upper, blas.Lower and blas.All. The length of t.Data is not
+// checked up front.
+func (t Triangular) From(a TriangularCols) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("blas64: mismatched BLAS diag")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.All:
+		for r := 0; r < n; r++ {
+			for c := 0; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	default:
+		panic("blas64: bad BLAS uplo")
+	}
+}
+
+// BandCols represents a matrix using the band column-major storage scheme.
+// Element (i, j) within the band is located at Data[i+KU-j+j*Stride].
+type BandCols Band
+
+// From copies the band of the row-major band matrix a into the
+// column-major receiver. It panics if the dimensions or bandwidths
+// of t and a differ, or if either stride is less than KL+KU+1.
+// The lengths of the data slices are not checked up front.
+func (t BandCols) From(a Band) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("blas64: mismatched dimension")
+	case t.KL != a.KL || t.KU != a.KU:
+		panic("blas64: mismatched bandwidth")
+	case a.Stride < a.KL+a.KU+1:
+		panic("blas64: short stride for source")
+	case t.Stride < t.KL+t.KU+1:
+		panic("blas64: short stride for destination")
+	}
+	for r := 0; r < a.Rows; r++ {
+		// Columns of row r that lie inside the band.
+		first, end := max(0, r-a.KL), min(r+a.KU+1, a.Cols)
+		for c := first; c < end; c++ {
+			t.Data[r+t.KU-c+c*t.Stride] = a.Data[c+a.KL-r+r*a.Stride]
+		}
+	}
+}
+
+// From fills the receiver with elements from a, converting from band
+// column-major to band row-major storage. The receiver must have the
+// same dimensions and bandwidth as a and have adequate backing data
+// storage.
+//
+// From panics if the dimensions or bandwidths of t and a differ, or if
+// either stride is less than KL+KU+1. The strides of t and a may differ.
+func (t Band) From(a BandCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("blas64: short stride for destination")
+	}
+	for j := 0; j < a.Cols; j++ {
+		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+			// Index each matrix with its own stride: t is row-major band
+			// storage and a is column-major band storage. Using the other
+			// matrix's stride corrupts the copy when the strides differ.
+			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+		}
+	}
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBandCols) From(a TriangularBand) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBand) From(a TriangularBandCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go
@@ -0,0 +1,153 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import "gonum.org/v1/gonum/blas"
+
+// SymmetricCols represents a symmetric matrix using the conventional
+// column-major storage scheme. Element (i, j) of the stored triangle is
+// located at Data[i+j*Stride].
+type SymmetricCols Symmetric
+
+// From copies the triangle of a selected by a.Uplo into the receiver,
+// converting from row-major to column-major layout. It panics if N or
+// Uplo differ between t and a, or if a.Uplo is neither blas.Upper nor
+// blas.Lower. The length of t.Data is not checked up front.
+func (t SymmetricCols) From(a Symmetric) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	default:
+		panic("blas64: bad BLAS uplo")
+	}
+}
+
+// From copies the triangle of a selected by a.Uplo into the receiver,
+// converting from column-major to row-major layout. It panics if N or
+// Uplo differ between t and a, or if a.Uplo is neither blas.Upper nor
+// blas.Lower. The length of t.Data is not checked up front.
+func (t Symmetric) From(a SymmetricCols) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	default:
+		panic("blas64: bad BLAS uplo")
+	}
+}
+
+// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme.
+// It is converted to and from SymmetricBand by viewing the stored triangle
+// as a BandCols with KU = K for blas.Upper or KL = K for blas.Lower.
+type SymmetricBandCols SymmetricBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t SymmetricBandCols) From(a SymmetricBand) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package blas64 provides a simple interface to the float64 BLAS API.
+package blas64 // import "gonum.org/v1/gonum/blas/blas64"
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
@@ -0,0 +1,508 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/gonum"
+)
+
+// cblas128 is the implementation that the package-level functions forward
+// to. It defaults to gonum.Implementation and is replaced by Use.
+var cblas128 blas.Complex128 = gonum.Implementation{}
+
+// Use sets the BLAS complex128 implementation to be used by subsequent BLAS calls.
+// The default implementation is
+// gonum.org/v1/gonum/blas/gonum.Implementation.
+//
+// Use performs a plain assignment to a package-level variable with no
+// locking.
+func Use(b blas.Complex128) {
+	cblas128 = b
+}
+
+// Implementation returns the current BLAS complex128 implementation.
+//
+// Implementation allows direct calls to the current BLAS complex128 implementation
+// giving finer control of parameters.
+func Implementation() blas.Complex128 {
+	return cblas128
+}
+
+// Vector represents a vector with an associated element increment.
+// The functions in this package pass Data and Inc to the underlying
+// implementation together.
+type Vector struct {
+	Inc  int
+	Data []complex128
+}
+
+// General represents a matrix using the conventional storage scheme.
+// Element (i, j) is located at Data[i*Stride+j].
+type General struct {
+	Rows, Cols int
+	Stride     int
+	Data       []complex128
+}
+
+// Band represents a band matrix using the band storage scheme.
+// Gbmv passes Rows, Cols, KL, KU, Data and Stride to the underlying
+// implementation.
+type Band struct {
+	Rows, Cols int
+	KL, KU     int
+	Stride     int
+	Data       []complex128
+}
+
+// Triangular represents a triangular matrix using the conventional storage scheme.
+// It is the matrix argument of Trmv, Trsv, Trmm and Trsm, which pass Uplo and
+// Diag to the underlying implementation.
+type Triangular struct {
+	N      int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+	Diag   blas.Diag
+}
+
+// TriangularBand represents a triangular matrix using the band storage scheme.
+// It is the matrix argument of Tbmv and Tbsv, which pass N and K to the
+// underlying implementation.
+type TriangularBand struct {
+	N, K   int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+	Diag   blas.Diag
+}
+
+// TriangularPacked represents a triangular matrix using the packed storage scheme.
+// It is the matrix argument of Tpmv and Tpsv. It has no Stride field.
+type TriangularPacked struct {
+	N    int
+	Data []complex128
+	Uplo blas.Uplo
+	Diag blas.Diag
+}
+
+// Symmetric represents a symmetric matrix using the conventional storage scheme.
+// It is the symmetric matrix argument of Symm, Syrk and Syr2k, and the
+// underlying type of Hermitian.
+type Symmetric struct {
+	N      int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+}
+
+// SymmetricBand represents a symmetric matrix using the band storage scheme.
+// It is the underlying type of HermitianBand.
+type SymmetricBand struct {
+	N, K   int
+	Stride int
+	Data   []complex128
+	Uplo   blas.Uplo
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+// It is the underlying type of HermitianPacked. It has no Stride field.
+type SymmetricPacked struct {
+	N    int
+	Data []complex128
+	Uplo blas.Uplo
+}
+
+// Hermitian represents an Hermitian matrix using the conventional storage scheme.
+// It has the same fields as Symmetric and is the Hermitian matrix argument
+// of Hemv, Her, Her2, Hemm, Herk and Her2k.
+type Hermitian Symmetric
+
+// HermitianBand represents an Hermitian matrix using the band storage scheme.
+// It has the same fields as SymmetricBand and is the matrix argument of Hbmv.
+type HermitianBand SymmetricBand
+
+// HermitianPacked represents an Hermitian matrix using the packed storage scheme.
+// It has the same fields as SymmetricPacked and is the matrix argument of
+// Hpmv, Hpr and Hpr2.
+type HermitianPacked SymmetricPacked
+
+// Level 1
+
+// negInc is the panic message used by Nrm2, Asum, Iamax, Scal and Dscal
+// when they are given a vector with a negative increment.
+const negInc = "cblas128: negative vector increment"
+
+// Dotu computes the dot product of the two vectors without
+// complex conjugation:
+//  x^T * y.
+//
+// n and the data and increments of x and y are passed unchanged to the
+// Zdotu method of the current implementation.
+func Dotu(n int, x, y Vector) complex128 {
+	return cblas128.Zdotu(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Dotc computes the dot product of the two vectors with
+// complex conjugation:
+//  x^H * y.
+//
+// n and the data and increments of x and y are passed unchanged to the
+// Zdotc method of the current implementation.
+func Dotc(n int, x, y Vector) complex128 {
+	return cblas128.Zdotc(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 computes the Euclidean norm of the vector x:
+//  sqrt(\sum_i conj(x[i]) * x[i]).
+//
+// Nrm2 will panic if the vector increment is negative.
+func Nrm2(n int, x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Dznrm2(n, x.Data, x.Inc)
+}
+
+// Asum computes the sum of magnitudes of the real and imaginary parts of
+// elements of the vector x:
+//  \sum_i (|Re x[i]| + |Im x[i]|).
+//
+// Asum will panic if the vector increment is negative. Otherwise the call
+// is forwarded to the Dzasum method of the current implementation.
+func Asum(n int, x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Dzasum(n, x.Data, x.Inc)
+}
+
+// Iamax returns the index of an element of x with the largest sum of
+// magnitudes of the real and imaginary parts (|Re x[i]|+|Im x[i]|).
+// If there are multiple such indices, the earliest is returned.
+//
+// Iamax returns -1 if n == 0.
+//
+// Iamax will panic if the vector increment is negative. Otherwise the call
+// is forwarded to the Izamax method of the current implementation.
+func Iamax(n int, x Vector) int {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Izamax(n, x.Data, x.Inc)
+}
+
+// Swap exchanges the elements of two vectors:
+//  x[i], y[i] = y[i], x[i] for all i.
+//
+// n and the data and increments of x and y are passed unchanged to the
+// Zswap method of the current implementation.
+func Swap(n int, x, y Vector) {
+	cblas128.Zswap(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Copy copies the elements of x into the elements of y:
+//  y[i] = x[i] for all i.
+//
+// n and the data and increments of x and y are passed unchanged to the
+// Zcopy method of the current implementation.
+func Copy(n int, x, y Vector) {
+	cblas128.Zcopy(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy computes
+//  y = alpha * x + y,
+// where x and y are vectors, and alpha is a scalar.
+//
+// n, alpha and the data and increments of x and y are passed unchanged to
+// the Zaxpy method of the current implementation.
+func Axpy(n int, alpha complex128, x, y Vector) {
+	cblas128.Zaxpy(n, alpha, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Scal computes
+//  x = alpha * x,
+// where x is a vector, and alpha is a scalar.
+//
+// Scal will panic if the vector increment is negative. Otherwise the call
+// is forwarded to the Zscal method of the current implementation.
+func Scal(n int, alpha complex128, x Vector) {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	cblas128.Zscal(n, alpha, x.Data, x.Inc)
+}
+
+// Dscal computes
+//  x = alpha * x,
+// where x is a vector, and alpha is a real scalar.
+//
+// Dscal will panic if the vector increment is negative. Otherwise the call
+// is forwarded to the Zdscal method of the current implementation.
+func Dscal(n int, alpha float64, x Vector) {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	cblas128.Zdscal(n, alpha, x.Data, x.Inc)
+}
+
+// Level 2
+
+// Gemv computes
+//  y = alpha * A * x + beta * y,   if t == blas.NoTrans,
+//  y = alpha * A^T * x + beta * y, if t == blas.Trans,
+//  y = alpha * A^H * x + beta * y, if t == blas.ConjTrans,
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are
+// scalars.
+//
+// The dimensions m and n are taken from a.Rows and a.Cols.
+func Gemv(t blas.Transpose, alpha complex128, a General, x Vector, beta complex128, y Vector) {
+	cblas128.Zgemv(t, a.Rows, a.Cols, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Gbmv computes
+//  y = alpha * A * x + beta * y,   if t == blas.NoTrans,
+//  y = alpha * A^T * x + beta * y, if t == blas.Trans,
+//  y = alpha * A^H * x + beta * y, if t == blas.ConjTrans,
+// where A is an m×n band matrix, x and y are vectors, and alpha and beta are
+// scalars.
+//
+// The dimensions m and n are taken from a.Rows and a.Cols, and a.KL and
+// a.KU are passed through to the Zgbmv method of the current implementation.
+func Gbmv(t blas.Transpose, alpha complex128, a Band, x Vector, beta complex128, y Vector) {
+	cblas128.Zgbmv(t, a.Rows, a.Cols, a.KL, a.KU, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Trmv computes
+//  x = A * x,   if t == blas.NoTrans,
+//  x = A^T * x, if t == blas.Trans,
+//  x = A^H * x, if t == blas.ConjTrans,
+// where A is an n×n triangular matrix, and x is a vector.
+//
+// The order n, triangle and diagonal kind are taken from a.N, a.Uplo and
+// a.Diag.
+func Trmv(t blas.Transpose, a Triangular, x Vector) {
+	cblas128.Ztrmv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbmv computes
+//  x = A * x,   if t == blas.NoTrans,
+//  x = A^T * x, if t == blas.Trans,
+//  x = A^H * x, if t == blas.ConjTrans,
+// where A is an n×n triangular band matrix, and x is a vector.
+//
+// The order n is taken from a.N, and a.K, a.Uplo and a.Diag are passed
+// through to the Ztbmv method of the current implementation.
+func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
+	cblas128.Ztbmv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpmv computes
+//  x = A * x,   if t == blas.NoTrans,
+//  x = A^T * x, if t == blas.Trans,
+//  x = A^H * x, if t == blas.ConjTrans,
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+//
+// The order n, triangle and diagonal kind are taken from a.N, a.Uplo and
+// a.Diag.
+func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
+	cblas128.Ztpmv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Trsv solves
+//  A * x = b,   if t == blas.NoTrans,
+//  A^T * x = b, if t == blas.Trans,
+//  A^H * x = b, if t == blas.ConjTrans,
+// where A is an n×n triangular matrix and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// The order n, triangle and diagonal kind are taken from a.N, a.Uplo and
+// a.Diag.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+	cblas128.Ztrsv(a.Uplo, t, a.Diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbsv solves
+//  A * x = b,   if t == blas.NoTrans,
+//  A^T * x = b, if t == blas.Trans,
+//  A^H * x = b, if t == blas.ConjTrans,
+// where A is an n×n triangular band matrix, and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// The order n is taken from a.N, and a.K, a.Uplo and a.Diag are passed
+// through to the Ztbsv method of the current implementation.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+	cblas128.Ztbsv(a.Uplo, t, a.Diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpsv solves
+//  A * x = b,   if t == blas.NoTrans,
+//  A^T * x = b, if t == blas.Trans,
+//  A^H * x = b, if t == blas.ConjTrans,
+// where A is an n×n triangular matrix in packed format and x is a vector.
+//
+// At entry to the function, x contains the values of b, and the result is
+// stored in-place into x.
+//
+// The order n, triangle and diagonal kind are taken from a.N, a.Uplo and
+// a.Diag.
+//
+// No test for singularity or near-singularity is included in this
+// routine. Such tests must be performed before calling this routine.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+	cblas128.Ztpsv(a.Uplo, t, a.Diag, a.N, a.Data, x.Data, x.Inc)
+}
+
+// Hemv computes
+//  y = alpha * A * x + beta * y,
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha and
+// beta are scalars.
+//
+// The order n and the stored triangle are taken from a.N and a.Uplo.
+func Hemv(alpha complex128, a Hermitian, x Vector, beta complex128, y Vector) {
+	cblas128.Zhemv(a.Uplo, a.N, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Hbmv performs
+//  y = alpha * A * x + beta * y,
+// where A is an n×n Hermitian band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+//
+// The order n is taken from a.N, and a.K and a.Uplo are passed through to
+// the Zhbmv method of the current implementation.
+func Hbmv(alpha complex128, a HermitianBand, x Vector, beta complex128, y Vector) {
+	cblas128.Zhbmv(a.Uplo, a.N, a.K, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Hpmv performs
+//  y = alpha * A * x + beta * y,
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+//
+// The order n and the stored triangle are taken from a.N and a.Uplo.
+func Hpmv(alpha complex128, a HermitianPacked, x Vector, beta complex128, y Vector) {
+	cblas128.Zhpmv(a.Uplo, a.N, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Geru performs a rank-1 update
+//  A += alpha * x * y^T,
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+//
+// The dimensions m and n are taken from a.Rows and a.Cols.
+func Geru(alpha complex128, x, y Vector, a General) {
+	cblas128.Zgeru(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Gerc performs a rank-1 update
+//  A += alpha * x * y^H,
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+//
+// The dimensions m and n are taken from a.Rows and a.Cols.
+func Gerc(alpha complex128, x, y Vector, a General) {
+	cblas128.Zgerc(a.Rows, a.Cols, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Her performs a rank-1 update
+//  A += alpha * x * x^H,
+// where A is an n×n Hermitian matrix, x is a vector, and alpha is a real scalar.
+func Her(alpha float64, x Vector, a Hermitian) {
+	cblas128.Zher(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride)
+}
+
+// Hpr performs a rank-1 update
+//  A += alpha * x * x^H,
+// where A is an n×n Hermitian matrix in packed format, x is a vector, and
+// alpha is a real scalar.
+//
+// The order n and the stored triangle are taken from a.N and a.Uplo.
+func Hpr(alpha float64, x Vector, a HermitianPacked) {
+	cblas128.Zhpr(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data)
+}
+
+// Her2 performs a rank-2 update
+//  A += alpha * x * y^H + conj(alpha) * y * x^H,
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha is a scalar.
+//
+// The order n and the stored triangle are taken from a.N and a.Uplo.
+func Her2(alpha complex128, x, y Vector, a Hermitian) {
+	cblas128.Zher2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Hpr2 performs a rank-2 update
+//  A += alpha * x * y^H + conj(alpha) * y * x^H,
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+//
+// The order n and the stored triangle are taken from a.N and a.Uplo.
+func Hpr2(alpha complex128, x, y Vector, a HermitianPacked) {
+	cblas128.Zhpr2(a.Uplo, a.N, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data)
+}
+
+// Level 3
+
+// Gemm computes
+//  C = alpha * A * B + beta * C,
+// where A, B, and C are dense matrices, and alpha and beta are scalars.
+// tA and tB specify whether A or B are transposed or conjugated.
+func Gemm(tA, tB blas.Transpose, alpha complex128, a, b General, beta complex128, c General) {
+	var m, n, k int
+	if tA == blas.NoTrans {
+		m, k = a.Rows, a.Cols
+	} else {
+		m, k = a.Cols, a.Rows
+	}
+	if tB == blas.NoTrans {
+		n = b.Cols
+	} else {
+		n = b.Rows
+	}
+	cblas128.Zgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Symm performs
+//  C = alpha * A * B + beta * C, if s == blas.Left,
+//  C = alpha * B * A + beta * C, if s == blas.Right,
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Symm(s blas.Side, alpha complex128, a Symmetric, b General, beta complex128, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	cblas128.Zsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Syrk performs a symmetric rank-k update
+//  C = alpha * A * A^T + beta * C, if t == blas.NoTrans,
+//  C = alpha * A^T * A + beta * C, if t == blas.Trans,
+// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans
+// and a k×n matrix otherwise, and alpha and beta are scalars.
+func Syrk(t blas.Transpose, alpha complex128, a General, beta complex128, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Syr2k performs a symmetric rank-2k update
+//  C = alpha * A * B^T + alpha * B * A^T + beta * C, if t == blas.NoTrans,
+//  C = alpha * A^T * B + alpha * B^T * A + beta * C, if t == blas.Trans,
+// where C is an n×n symmetric matrix, A and B are n×k matrices if
+// t == blas.NoTrans and k×n otherwise, and alpha and beta are scalars.
+func Syr2k(t blas.Transpose, alpha complex128, a, b General, beta complex128, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Trmm performs
+//  B = alpha * A * B,   if tA == blas.NoTrans and s == blas.Left,
+//  B = alpha * A^T * B, if tA == blas.Trans and s == blas.Left,
+//  B = alpha * A^H * B, if tA == blas.ConjTrans and s == blas.Left,
+//  B = alpha * B * A,   if tA == blas.NoTrans and s == blas.Right,
+//  B = alpha * B * A^T, if tA == blas.Trans and s == blas.Right,
+//  B = alpha * B * A^H, if tA == blas.ConjTrans and s == blas.Right,
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is
+// a scalar.
+//
+// The dimensions m and n are taken from b.Rows and b.Cols, and the triangle
+// and diagonal kind from a.Uplo and a.Diag.
+func Trmm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
+	cblas128.Ztrmm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Trsm solves
+//  A * X = alpha * B,   if tA == blas.NoTrans and s == blas.Left,
+//  A^T * X = alpha * B, if tA == blas.Trans and s == blas.Left,
+//  A^H * X = alpha * B, if tA == blas.ConjTrans and s == blas.Left,
+//  X * A = alpha * B,   if tA == blas.NoTrans and s == blas.Right,
+//  X * A^T = alpha * B, if tA == blas.Trans and s == blas.Right,
+//  X * A^H = alpha * B, if tA == blas.ConjTrans and s == blas.Right,
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
+// alpha is a scalar.
+//
+// At entry to the function, b contains the values of B, and the result is
+// stored in-place into b.
+//
+// The dimensions m and n are taken from b.Rows and b.Cols, and the triangle
+// and diagonal kind from a.Uplo and a.Diag.
+//
+// No check is made that A is invertible.
+func Trsm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
+	cblas128.Ztrsm(s, a.Uplo, tA, a.Diag, b.Rows, b.Cols, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Hemm performs
+//  C = alpha * A * B + beta * C, if s == blas.Left,
+//  C = alpha * B * A + beta * C, if s == blas.Right,
+// where A is an n×n or m×m Hermitian matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Hemm(s blas.Side, alpha complex128, a Hermitian, b General, beta complex128, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	cblas128.Zhemm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Herk performs the Hermitian rank-k update
+//  C = alpha * A * A^H + beta*C, if t == blas.NoTrans,
+//  C = alpha * A^H * A + beta*C, if t == blas.ConjTrans,
+// where C is an n×n Hermitian matrix, A is an n×k matrix if t == blas.NoTrans
+// and a k×n matrix otherwise, and alpha and beta are scalars.
+func Herk(t blas.Transpose, alpha float64, a General, beta float64, c Hermitian) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zherk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Her2k performs the Hermitian rank-2k update
+//  C = alpha * A * B^H + conj(alpha) * B * A^H + beta * C, if t == blas.NoTrans,
+//  C = alpha * A^H * B + conj(alpha) * B^H * A + beta * C, if t == blas.ConjTrans,
+// where C is an n×n Hermitian matrix, A and B are n×k matrices if t == NoTrans
+// and k×n matrices otherwise, and alpha and beta are scalars.
+func Her2k(t blas.Transpose, alpha complex128, a, b General, beta float64, c Hermitian) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zher2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go
@@ -0,0 +1,279 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// GeneralCols is a General matrix whose elements are laid out in the
+// conventional column-major order.
+type GeneralCols General
+
+// From copies the elements of the row-major matrix a into the receiver.
+// It panics if the receiver and a have different dimensions or if the
+// receiver's data slice is too short for its dimensions and stride.
+func (t GeneralCols) From(a General) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("cblas128: mismatched dimension")
+	case len(t.Data) < (t.Cols-1)*t.Stride+t.Rows:
+		panic("cblas128: short data slice")
+	}
+	for r := 0; r < a.Rows; r++ {
+		// One source row at a time; each element lands in a different destination column.
+		row := a.Data[r*a.Stride : r*a.Stride+a.Cols]
+		for c := range row {
+			t.Data[r+c*t.Stride] = row[c]
+		}
+	}
+}
+
+// From copies the elements of the column-major matrix a into the receiver.
+// It panics if the receiver and a have different dimensions or if the
+// receiver's data slice is too short for its dimensions and stride.
+func (t General) From(a GeneralCols) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("cblas128: mismatched dimension")
+	case len(t.Data) < (t.Rows-1)*t.Stride+t.Cols:
+		panic("cblas128: short data slice")
+	}
+	for c := 0; c < a.Cols; c++ {
+		// One source column at a time; each element lands in a different destination row.
+		col := a.Data[c*a.Stride : c*a.Stride+a.Rows]
+		for r := range col {
+			t.Data[r*t.Stride+c] = col[r]
+		}
+	}
+}
+
+// TriangularCols is a Triangular matrix whose elements are laid out in the
+// conventional column-major order.
+type TriangularCols Triangular
+
+// From copies the elements of the row-major matrix a into the receiver.
+// The receiver must agree with a in dimension, uplo and diag, otherwise
+// From panics, and it must have adequate backing data storage. Only the
+// triangle selected by a.Uplo is copied; blas.All copies every element.
+func (t TriangularCols) From(a Triangular) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.All:
+		for r := 0; r < n; r++ {
+			for c := 0; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+}
+
+// From copies the elements of the column-major matrix a into the receiver.
+// The receiver must agree with a in dimension, uplo and diag, otherwise
+// From panics, and it must have adequate backing data storage. Only the
+// triangle selected by a.Uplo is copied; blas.All copies every element.
+func (t Triangular) From(a TriangularCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.All:
+		for r := 0; r < n; r++ {
+			for c := 0; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+}
+
+// BandCols is a Band matrix whose elements are laid out in the band
+// column-major storage scheme.
+type BandCols Band
+
+// From copies the in-band elements of the row-major band matrix a into the
+// receiver. It panics if the receiver and a differ in dimensions or
+// bandwidth, or if either stride is smaller than KL+KU+1. The receiver must
+// have adequate backing data storage.
+func (t BandCols) From(a Band) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("cblas128: mismatched dimension")
+	case t.KL != a.KL || t.KU != a.KU:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.KL+a.KU+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.KL+t.KU+1:
+		panic("cblas128: short stride for destination")
+	}
+	for i := 0; i < a.Rows; i++ {
+		// Columns of row i that fall inside the band.
+		first := max(0, i-a.KL)
+		last := min(i+a.KU+1, a.Cols)
+		for j := first; j < last; j++ {
+			src := i*a.Stride + a.KL + j - i
+			dst := j*t.Stride + t.KU + i - j
+			t.Data[dst] = a.Data[src]
+		}
+	}
+}
+
+// From fills the receiver with the in-band elements of the column-major
+// band matrix a. The receiver must have the same dimensions and bandwidth
+// as a and have adequate backing data storage.
+//
+// From panics if the dimensions or bandwidths of the receiver and a differ,
+// or if either stride is smaller than KL+KU+1. The strides of the receiver
+// and a are independent and need not be equal.
+func (t Band) From(a BandCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("cblas128: short stride for destination")
+	}
+	for j := 0; j < a.Cols; j++ {
+		// Rows of column j that fall inside the band.
+		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+			// Each side is indexed with its own stride and bandwidth:
+			// the destination is row-major band storage, the source is
+			// column-major band storage.
+			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+		}
+	}
+}
+
+// TriangularBandCols is a TriangularBand matrix whose elements are laid out
+// in the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From copies the elements of the row-major triangular band matrix a into
+// the receiver. It panics if the receiver and a differ in dimension,
+// bandwidth, uplo or diag, or if either stride is smaller than K+1. The
+// receiver must have adequate backing data storage. The copy itself is
+// delegated to BandCols.From using a one-sided band of width K.
+func (t TriangularBandCols) From(a TriangularBand) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	src := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
+	dst := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
+	// The stored triangle decides whether the band lies above or below the diagonal.
+	switch a.Uplo {
+	case blas.Upper:
+		src.KU, dst.KU = a.K, t.K
+	case blas.Lower:
+		src.KL, dst.KL = a.K, t.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst.From(src)
+}
+
+// From copies the elements of the column-major triangular band matrix a
+// into the receiver. It panics if the receiver and a differ in dimension,
+// bandwidth, uplo or diag, or if either stride is smaller than K+1. The
+// receiver must have adequate backing data storage. The copy itself is
+// delegated to Band.From using a one-sided band of width K.
+func (t TriangularBand) From(a TriangularBandCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	src := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
+	dst := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
+	// The stored triangle decides whether the band lies above or below the diagonal.
+	switch a.Uplo {
+	case blas.Upper:
+		src.KU, dst.KU = a.K, t.K
+	case blas.Lower:
+		src.KL, dst.KL = a.K, t.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst.From(src)
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b > a {
+		return b
+	}
+	return a
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go
@@ -0,0 +1,155 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// HermitianCols is a Hermitian matrix whose elements are laid out in the
+// conventional column-major order.
+type HermitianCols Hermitian
+
+// From copies the stored triangle of the row-major matrix a into the
+// receiver. It panics if the receiver and a differ in dimension or uplo,
+// or if a.Uplo is neither blas.Upper nor blas.Lower. The receiver must have
+// adequate backing data storage.
+func (t HermitianCols) From(a Hermitian) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+}
+
+// From copies the stored triangle of the column-major matrix a into the
+// receiver. It panics if the receiver and a differ in dimension or uplo,
+// or if a.Uplo is neither blas.Upper nor blas.Lower. The receiver must have
+// adequate backing data storage.
+func (t Hermitian) From(a HermitianCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+}
+
+// HermitianBandCols is a HermitianBand matrix whose elements are laid out
+// in the band column-major storage scheme.
+type HermitianBandCols HermitianBand
+
+// From copies the elements of the row-major Hermitian band matrix a into
+// the receiver. It panics if the receiver and a differ in dimension,
+// bandwidth or uplo, or if either stride is smaller than K+1. The receiver
+// must have adequate backing data storage. The copy itself is delegated to
+// BandCols.From using a one-sided band of width K.
+func (t HermitianBandCols) From(a HermitianBand) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	src := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
+	dst := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
+	// The stored triangle decides whether the band lies above or below the diagonal.
+	switch a.Uplo {
+	case blas.Upper:
+		src.KU, dst.KU = a.K, t.K
+	case blas.Lower:
+		src.KL, dst.KL = a.K, t.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst.From(src)
+}
+
+// From copies the elements of the column-major Hermitian band matrix a
+// into the receiver. It panics if the receiver and a differ in dimension,
+// bandwidth or uplo, or if either stride is smaller than K+1. The receiver
+// must have adequate backing data storage. The copy itself is delegated to
+// Band.From using a one-sided band of width K.
+func (t HermitianBand) From(a HermitianBandCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	src := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
+	dst := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
+	// The stored triangle decides whether the band lies above or below the diagonal.
+	switch a.Uplo {
+	case blas.Upper:
+		src.KU, dst.KU = a.K, t.K
+	case blas.Lower:
+		src.KL, dst.KL = a.K, t.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst.From(src)
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go
@@ -0,0 +1,155 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// SymmetricCols is a Symmetric matrix whose elements are laid out in the
+// conventional column-major order.
+type SymmetricCols Symmetric
+
+// From copies the stored triangle of the row-major matrix a into the
+// receiver. It panics if the receiver and a differ in dimension or uplo,
+// or if a.Uplo is neither blas.Upper nor blas.Lower. The receiver must have
+// adequate backing data storage.
+func (t SymmetricCols) From(a Symmetric) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
+			}
+		}
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+}
+
+// From copies the stored triangle of the column-major matrix a into the
+// receiver. It panics if the receiver and a differ in dimension or uplo,
+// or if a.Uplo is neither blas.Upper nor blas.Lower. The receiver must have
+// adequate backing data storage.
+func (t Symmetric) From(a SymmetricCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		for r := 0; r < n; r++ {
+			for c := r; c < n; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	case blas.Lower:
+		for r := 0; r < n; r++ {
+			for c := 0; c <= r; c++ {
+				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
+			}
+		}
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+}
+
+// SymmetricBandCols is a SymmetricBand matrix whose elements are laid out
+// in the band column-major storage scheme.
+type SymmetricBandCols SymmetricBand
+
+// From copies the elements of the row-major symmetric band matrix a into
+// the receiver. It panics if the receiver and a differ in dimension,
+// bandwidth or uplo, or if either stride is smaller than K+1. The receiver
+// must have adequate backing data storage. The copy itself is delegated to
+// BandCols.From using a one-sided band of width K.
+func (t SymmetricBandCols) From(a SymmetricBand) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	src := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
+	dst := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
+	// The stored triangle decides whether the band lies above or below the diagonal.
+	switch a.Uplo {
+	case blas.Upper:
+		src.KU, dst.KU = a.K, t.K
+	case blas.Lower:
+		src.KL, dst.KL = a.K, t.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst.From(src)
+}
+
+// From copies the elements of the column-major symmetric band matrix a
+// into the receiver. It panics if the receiver and a differ in dimension,
+// bandwidth or uplo, or if either stride is smaller than K+1. The receiver
+// must have adequate backing data storage. The copy itself is delegated to
+// Band.From using a one-sided band of width K.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	src := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
+	dst := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
+	// The stored triangle decides whether the band lies above or below the diagonal.
+	switch a.Uplo {
+	case blas.Upper:
+		src.KU, dst.KU = a.K, t.K
+	case blas.Lower:
+		src.KL, dst.KL = a.K, t.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst.From(src)
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cblas128 provides a simple interface to the complex128 BLAS API.
+package cblas128 // import "gonum.org/v1/gonum/blas/cblas128"
--- a/vendor/gonum.org/v1/gonum/blas/conversions.bash
+++ b/vendor/gonum.org/v1/gonum/blas/conversions.bash
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+
+# Copyright ©2017 The Gonum Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Generate code for blas32.
+echo Generating blas32/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+\
+>> blas32/conv.go
+
+echo Generating blas32/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+\
+>> blas32/conv_test.go
+
+echo Generating blas32/conv_symmetric.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+\
+>> blas32/conv_symmetric.go
+
+echo Generating blas32/conv_symmetric_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+\
+>> blas32/conv_symmetric_test.go
+
+
+# Generate code for cblas128.
+echo Generating cblas128/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+\
+>> cblas128/conv.go
+
+echo Generating cblas128/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_test.go
+
+echo Generating cblas128/conv_symmetric.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+\
+>> cblas128/conv_symmetric.go
+
+echo Generating cblas128/conv_symmetric_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_symmetric_test.go
+
+echo Generating cblas128/conv_hermitian.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+\
+>> cblas128/conv_hermitian.go
+
+echo Generating cblas128/conv_hermitian_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_hermitian_test.go
+
+
+# Generate code for cblas64.
+echo Generating cblas64/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+\
+>> cblas64/conv.go
+
+echo Generating cblas64/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
+\
+>> cblas64/conv_test.go
+
+echo Generating cblas64/conv_hermitian.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+\
+>> cblas64/conv_hermitian.go
+
+echo Generating cblas64/conv_hermitian_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
+\
+>> cblas64/conv_hermitian_test.go
--- a/vendor/gonum.org/v1/gonum/blas/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/doc.go
@@ -0,0 +1,108 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package blas provides interfaces for the BLAS linear algebra standard.
+
+All methods must perform appropriate parameter checking and panic if
+provided parameters that do not conform to the requirements specified
+by the BLAS standard.
+
+Quick Reference Guide to the BLAS from http://www.netlib.org/lapack/lug/node145.html
+
+This version is modified to remove the "order" option. All matrix operations are
+on row-order matrices.
+
+Level 1 BLAS
+
+	        dim scalar vector   vector   scalars              5-element prefixes
+	                                                          struct
+
+	_rotg (                                      a, b )                S, D
+	_rotmg(                              d1, d2, a, b )                S, D
+	_rot  ( n,         x, incX, y, incY,               c, s )          S, D
+	_rotm ( n,         x, incX, y, incY,                      param )  S, D
+	_swap ( n,         x, incX, y, incY )                              S, D, C, Z
+	_scal ( n,  alpha, x, incX )                                       S, D, C, Z, Cs, Zd
+	_copy ( n,         x, incX, y, incY )                              S, D, C, Z
+	_axpy ( n,  alpha, x, incX, y, incY )                              S, D, C, Z
+	_dot  ( n,         x, incX, y, incY )                              S, D, Ds
+	_dotu ( n,         x, incX, y, incY )                              C, Z
+	_dotc ( n,         x, incX, y, incY )                              C, Z
+	__dot ( n,  alpha, x, incX, y, incY )                              Sds
+	_nrm2 ( n,         x, incX )                                       S, D, Sc, Dz
+	_asum ( n,         x, incX )                                       S, D, Sc, Dz
+	I_amax( n,         x, incX )                                       s, d, c, z
+
+Level 2 BLAS
+
+	        options                   dim   b-width scalar matrix  vector   scalar vector   prefixes
+
+	_gemv (        trans,      m, n,         alpha, a, lda, x, incX, beta,  y, incY ) S, D, C, Z
+	_gbmv (        trans,      m, n, kL, kU, alpha, a, lda, x, incX, beta,  y, incY ) S, D, C, Z
+	_hemv ( uplo,                 n,         alpha, a, lda, x, incX, beta,  y, incY ) C, Z
+	_hbmv ( uplo,                 n, k,      alpha, a, lda, x, incX, beta,  y, incY ) C, Z
+	_hpmv ( uplo,                 n,         alpha, ap,     x, incX, beta,  y, incY ) C, Z
+	_symv ( uplo,                 n,         alpha, a, lda, x, incX, beta,  y, incY ) S, D
+	_sbmv ( uplo,                 n, k,      alpha, a, lda, x, incX, beta,  y, incY ) S, D
+	_spmv ( uplo,                 n,         alpha, ap,     x, incX, beta,  y, incY ) S, D
+	_trmv ( uplo, trans, diag,    n,                a, lda, x, incX )                 S, D, C, Z
+	_tbmv ( uplo, trans, diag,    n, k,             a, lda, x, incX )                 S, D, C, Z
+	_tpmv ( uplo, trans, diag,    n,                ap,     x, incX )                 S, D, C, Z
+	_trsv ( uplo, trans, diag,    n,                a, lda, x, incX )                 S, D, C, Z
+	_tbsv ( uplo, trans, diag,    n, k,             a, lda, x, incX )                 S, D, C, Z
+	_tpsv ( uplo, trans, diag,    n,                ap,     x, incX )                 S, D, C, Z
+
+	        options                   dim   scalar vector   vector   matrix  prefixes
+
+	_ger  (                    m, n, alpha, x, incX, y, incY, a, lda ) S, D
+	_geru (                    m, n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_gerc (                    m, n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_her  ( uplo,                 n, alpha, x, incX,          a, lda ) C, Z
+	_hpr  ( uplo,                 n, alpha, x, incX,          ap )     C, Z
+	_her2 ( uplo,                 n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_hpr2 ( uplo,                 n, alpha, x, incX, y, incY, ap )     C, Z
+	_syr  ( uplo,                 n, alpha, x, incX,          a, lda ) S, D
+	_spr  ( uplo,                 n, alpha, x, incX,          ap )     S, D
+	_syr2 ( uplo,                 n, alpha, x, incX, y, incY, a, lda ) S, D
+	_spr2 ( uplo,                 n, alpha, x, incX, y, incY, ap )     S, D
+
+Level 3 BLAS
+
+	        options                                 dim      scalar matrix  matrix  scalar matrix  prefixes
+
+	_gemm (             transA, transB,      m, n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_symm ( side, uplo,                      m, n,    alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_hemm ( side, uplo,                      m, n,    alpha, a, lda, b, ldb, beta,  c, ldc ) C, Z
+	_syrk (       uplo, trans,                  n, k, alpha, a, lda,         beta,  c, ldc ) S, D, C, Z
+	_herk (       uplo, trans,                  n, k, alpha, a, lda,         beta,  c, ldc ) C, Z
+	_syr2k(       uplo, trans,                  n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_her2k(       uplo, trans,                  n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) C, Z
+	_trmm ( side, uplo, transA,        diag, m, n,    alpha, a, lda, b, ldb )                S, D, C, Z
+	_trsm ( side, uplo, transA,        diag, m, n,    alpha, a, lda, b, ldb )                S, D, C, Z
+
+Meaning of prefixes
+
+	S - float32	C - complex64
+	D - float64	Z - complex128
+
+Matrix types
+
+	GE - GEneral 		GB - General Band
+	SY - SYmmetric 		SB - Symmetric Band 	SP - Symmetric Packed
+	HE - HErmitian 		HB - Hermitian Band 	HP - Hermitian Packed
+	TR - TRiangular 	TB - Triangular Band 	TP - Triangular Packed
+
+Options
+
+	trans 	= NoTrans, Trans, ConjTrans
+	uplo 	= Upper, Lower
+	diag 	= Nonunit, Unit
+	side 	= Left, Right (A or op(A) on the left, or A or op(A) on the right)
+
+For real matrices, Trans and ConjTrans have the same meaning.
+For Hermitian matrices, trans = Trans is not allowed.
+For complex symmetric matrices, trans = ConjTrans is not allowed.
+*/
+package blas // import "gonum.org/v1/gonum/blas"
--- a/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go
@@ -0,0 +1,314 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"runtime"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Dgemm computes one of the matrix-matrix products
+//  C = alpha * A * B + beta * C
+//  C = alpha * A^T * B + beta * C
+//  C = alpha * A * B^T + beta * C
+//  C = alpha * A^T * B^T + beta * C
+// where C is an m×n matrix, A is an m×k matrix (k×m when transposed), B is a
+// k×n matrix (n×k when transposed), and alpha and beta are scalars. tA and tB
+// select whether A and B are transposed; blas.ConjTrans is treated the same
+// as blas.Trans.
+//
+// Dgemm panics if a transpose flag is invalid, a dimension is negative, a
+// leading dimension is too small, or a slice is too short for its matrix.
+func (Implementation) Dgemm(tA, tB blas.Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if tB != blas.NoTrans && tB != blas.Trans && tB != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	}
+
+	// Dimensions of A and B as they are stored, independent of transposition.
+	aTrans := tA == blas.Trans || tA == blas.ConjTrans
+	rowsA, colsA := m, k
+	if aTrans {
+		rowsA, colsA = k, m
+	}
+	bTrans := tB == blas.Trans || tB == blas.ConjTrans
+	rowsB, colsB := k, n
+	if bTrans {
+		rowsB, colsB = n, k
+	}
+
+	switch {
+	case lda < max(1, colsA):
+		panic(badLdA)
+	case ldb < max(1, colsB):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// Nothing to do for an empty result.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	switch {
+	case len(a) < (rowsA-1)*lda+colsA:
+		panic(shortA)
+	case len(b) < (rowsB-1)*ldb+colsB:
+		panic(shortB)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	// The product term vanishes and C is left unscaled: nothing to do.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// Apply beta to C up front so the multiplication only has to accumulate.
+	if beta != 1 {
+		for i := 0; i < m; i++ {
+			row := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				for j := range row {
+					row[j] = 0
+				}
+			} else {
+				for j := range row {
+					row[j] *= beta
+				}
+			}
+		}
+	}
+
+	dgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+// dgemmParallel adds alpha times the product of a and b (each optionally
+// transposed, as selected by aTrans and bTrans) into c, splitting the work
+// into blocks of c that are processed by a pool of worker goroutines. It
+// falls back to dgemmSerial when there are fewer than minParBlock blocks.
+// It performs no argument validation and does not apply any scaling to the
+// existing contents of c.
+func dgemmParallel(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// dgemmParallel computes a parallel matrix multiplication by partitioning
+	// a and b into sub-blocks, and updating c with the multiplication of the sub-block
+	// In all cases,
+	// A = [ 	A_11	A_12 ... 	A_1j
+	//			A_21	A_22 ...	A_2j
+	//				...
+	//			A_i1	A_i2 ...	A_ij]
+	//
+	// and same for B. All of the submatrix sizes are blockSize×blockSize except
+	// at the edges.
+	//
+	// In all cases, there is one dimension for each matrix along which
+	// C must be updated sequentially.
+	// Cij = \sum_k Aik Bkj,	(A * B)
+	// Cij = \sum_k Aki Bkj,	(A^T * B)
+	// Cij = \sum_k Aik Bjk,	(A * B^T)
+	// Cij = \sum_k Aki Bjk,	(A^T * B^T)
+	//
+	// This code computes one {i, j} block sequentially along the k dimension,
+	// and computes all of the {i, j} blocks concurrently. This
+	// partitioning allows Cij to be updated in-place without race-conditions.
+	// Instead of launching a goroutine for each possible concurrent computation,
+	// a number of worker goroutines are created and channels are used to pass
+	// available and completed cases.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	// Keep the full inner dimension under another name: k is shadowed by the
+	// block loop variable inside the workers.
+	maxKLen := k
+	// Number of independent {i, j} blocks of c.
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// The matrix multiplication is small in the dimensions where it can be
+		// computed concurrently. Just do it in serial.
+		dgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// Never start more workers than there are blocks to process.
+	nWorkers := runtime.GOMAXPROCS(0)
+	if parBlocks < nWorkers {
+		nWorkers = parBlocks
+	}
+	// There is a tradeoff between the workers having to wait for work
+	// and a large buffer making operations slow.
+	buf := buffMul * nWorkers
+	if buf > parBlocks {
+		buf = parBlocks
+	}
+
+	sendChan := make(chan subMul, buf)
+
+	// Launch workers. A worker receives an {i, j} submatrix of c, and computes
+	// A_ik B_ki (or the transposed version) storing the result in c_ij. When the
+	// channel is finally closed, it signals to the waitgroup that it has finished
+	// computing.
+	var wg sync.WaitGroup
+	for i := 0; i < nWorkers; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for sub := range sendChan {
+				// i and j are the top-left coordinates of this block of c;
+				// this i shadows the worker-launch loop counter.
+				i := sub.i
+				j := sub.j
+				// Clip the block at the bottom and right edges of c.
+				leni := blockSize
+				if i+leni > m {
+					leni = m - i
+				}
+				lenj := blockSize
+				if j+lenj > n {
+					lenj = n - j
+				}
+
+				cSub := sliceView64(c, ldc, i, j, leni, lenj)
+
+				// Compute A_ik B_kj for all k
+				for k := 0; k < maxKLen; k += blockSize {
+					lenk := blockSize
+					if k+lenk > maxKLen {
+						lenk = maxKLen - k
+					}
+					// The row/column offsets into a and b swap when the
+					// corresponding matrix is transposed.
+					var aSub, bSub []float64
+					if aTrans {
+						aSub = sliceView64(a, lda, k, i, lenk, leni)
+					} else {
+						aSub = sliceView64(a, lda, i, k, leni, lenk)
+					}
+					if bTrans {
+						bSub = sliceView64(b, ldb, j, k, lenj, lenk)
+					} else {
+						bSub = sliceView64(b, ldb, k, j, lenk, lenj)
+					}
+					dgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+				}
+			}
+		}()
+	}
+
+	// Send out all of the {i, j} subblocks for computation.
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			sendChan <- subMul{
+				i: i,
+				j: j,
+			}
+		}
+	}
+	// Closing the channel ends each worker's range loop; then wait for all of them.
+	close(sendChan)
+	wg.Wait()
+}
+
+// dgemmSerial is the serial matrix multiply. It selects the kernel matching
+// the transpose flags of a and b and forwards every other argument unchanged.
+func dgemmSerial(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	switch {
+	case aTrans && bTrans:
+		dgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+	case aTrans && !bTrans:
+		dgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+	case !aTrans && bTrans:
+		dgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+	case !aTrans && !bTrans:
+		dgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+	default:
+		// The four cases above cover every combination of two booleans.
+		panic("unreachable")
+	}
+}
+
+// dgemmSerialNotNot is the dgemmSerial kernel used when neither a nor b is
+// transposed. For each row i of c it passes alpha*a[i][l], row l of b and
+// row i of c to f64.AxpyUnitary.
+func dgemmSerialNotNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are re-sliced once instead of indexing with the literal
+	// [i*stride+j] form because this style was measured as approximately
+	// 5 times faster as of go 1.3.
+	for row := 0; row < m; row++ {
+		cRow := c[row*ldc : row*ldc+n]
+		aRow := a[row*lda : row*lda+k]
+		for l, aVal := range aRow {
+			scaled := alpha * aVal
+			if scaled == 0 {
+				// Skip the update when the scaled coefficient is zero.
+				continue
+			}
+			f64.AxpyUnitary(scaled, b[l*ldb:l*ldb+n], cRow)
+		}
+	}
+}
+
+// dgemmSerialTransNot is the dgemmSerial kernel used when a is transposed and
+// b is not. Row l of a holds the coefficients applied to row l of b.
+func dgemmSerialTransNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are re-sliced once instead of indexing with the literal
+	// [i*stride+j] form because this style was measured as approximately
+	// 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		bRow := b[l*ldb : l*ldb+n]
+		aRow := a[l*lda : l*lda+m]
+		for i, aVal := range aRow {
+			scaled := alpha * aVal
+			if scaled == 0 {
+				// Skip the update when the scaled coefficient is zero.
+				continue
+			}
+			f64.AxpyUnitary(scaled, bRow, c[i*ldc:i*ldc+n])
+		}
+	}
+}
+
+// dgemmSerialNotTrans is the dgemmSerial kernel used when a is not transposed
+// and b is. Element (i, j) of c is incremented by alpha times the value
+// f64.DotUnitary returns for row i of a and row j of b.
+func dgemmSerialNotTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are re-sliced once instead of indexing with the literal
+	// [i*stride+j] form because this style was measured as approximately
+	// 5 times faster as of go 1.3.
+	for row := 0; row < m; row++ {
+		aRow := a[row*lda : row*lda+k]
+		cRow := c[row*ldc : row*ldc+n]
+		for col := range cRow {
+			bRow := b[col*ldb : col*ldb+k]
+			cRow[col] += alpha * f64.DotUnitary(aRow, bRow)
+		}
+	}
+}
+
+// dgemmSerialTransTrans is the dgemmSerial kernel used when both a and b are
+// transposed. Column l of b is read with stride ldb starting at b[l].
+func dgemmSerialTransTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are re-sliced once instead of indexing with the literal
+	// [i*stride+j] form because this style was measured as approximately
+	// 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		aRow := a[l*lda : l*lda+m]
+		for i, aVal := range aRow {
+			scaled := alpha * aVal
+			if scaled == 0 {
+				// Skip the update when the scaled coefficient is zero.
+				continue
+			}
+			cRow := c[i*ldc : i*ldc+n]
+			f64.AxpyInc(scaled, b[l:], cRow, uintptr(n), uintptr(ldb), 1, 0, 0)
+		}
+	}
+}
+
+// sliceView64 returns the sub-slice of a that spans the r×c block whose
+// top-left element is at row i, column j, where a is stored row-major with
+// stride lda. The slice runs from the block's first element to its last.
+func sliceView64(a []float64, lda, i, j, r, c int) []float64 {
+	first := i*lda + j
+	end := (i+r-1)*lda + j + c
+	return a[first:end]
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/doc.go
@@ -0,0 +1,88 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Ensure changes made to blas/native are reflected in blas/cgo where relevant.
+
+/*
+Package gonum is a Go implementation of the BLAS API. This implementation
+panics when the input arguments are invalid as per the standard, for example
+if a vector increment is zero. Note that the treatment of NaN values
+is not specified, and differs among the BLAS implementations.
+gonum.org/v1/gonum/blas/blas64 provides helpful wrapper functions to the BLAS
+interface. The rest of this text describes the layout of the data for the input types.
+
+Note that in the function documentation, x[i] refers to the i^th element
+of the vector, which will be different from the i^th element of the slice if
+incX != 1.
+
+See http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html
+for more license information.
+
+Vector arguments are effectively strided slices. They have two input arguments,
+a number of elements, n, and an increment, incX. The increment specifies the
+distance between elements of the vector. The actual Go slice may be longer
+than necessary.
+The increment may be positive or negative, except in functions with only
+a single vector argument where the increment may only be positive. If the increment
+is negative, s[0] is the last element in the slice. Note that this is not the same
+as counting backward from the end of the slice, as len(s) may be longer than
+necessary. So, for example, if n = 5 and incX = 3, the elements of s are
+	[0 * * 1 * * 2 * * 3 * * 4 * * * ...]
+where ∗ elements are never accessed. If incX = -3, the same elements are
+accessed, just in reverse order (4, 3, 2, 1, 0).
+
+Dense matrices are specified by a number of rows, a number of columns, and a stride.
+The stride specifies the number of entries in the slice between the first element
+of successive rows. The stride must be at least as large as the number of columns
+but may be longer.
+	[a00 ... a0n a0* ... a1stride-1 a21 ... amn am* ... amstride-1]
+Thus, dense[i*ld + j] refers to the {i, j}th element of the matrix.
+
+Symmetric and triangular matrices (non-packed) are stored identically to Dense,
+except that only elements in one triangle of the matrix are accessed.
+
+Packed symmetric and packed triangular matrices are laid out with the entries
+condensed such that all of the unreferenced elements are removed. So, the upper triangular
+matrix
+  [
+    1  2  3
+    0  4  5
+    0  0  6
+  ]
+and the lower-triangular matrix
+  [
+    1  0  0
+    2  3  0
+    4  5  6
+  ]
+will both be compacted as [1 2 3 4 5 6]. The (i, j) element of the original
+dense matrix can be found at element i*n - (i-1)*i/2 + j - i for upper triangular,
+and at element i * (i+1) /2 + j for lower triangular.
+
+Banded matrices are laid out in a compact format, constructed by removing the
+zeros in the rows and aligning the diagonals. For example, the matrix
+  [
+    1  2  3  0  0  0
+    4  5  6  7  0  0
+    0  8  9 10 11  0
+    0  0 12 13 14 15
+    0  0  0 16 17 18
+    0  0  0  0 19 20
+  ]
+
+implicitly becomes (∗ entries are never accessed)
+  [
+     *  1  2  3
+     4  5  6  7
+     8  9 10 11
+    12 13 14 15
+    16 17 18  *
+    19 20  *  *
+  ]
+which is given to the BLAS routine as [∗ 1 2 3 4 ...].
+
+See http://www.crest.iu.edu/research/mtl/reference/html/banded.html
+for more information
+*/
+package gonum // import "gonum.org/v1/gonum/blas/gonum"
--- a/vendor/gonum.org/v1/gonum/blas/gonum/errors.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/errors.go
@@ -0,0 +1,35 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Panic strings used during parameter checks.
+// This list is duplicated in netlib/blas/netlib. Keep in sync.
+const (
+	// Vector increment arguments that are zero.
+	zeroIncX = "blas: zero x index increment"
+	zeroIncY = "blas: zero y index increment"
+
+	// Size arguments that are negative.
+	mLT0  = "blas: m < 0"
+	nLT0  = "blas: n < 0"
+	kLT0  = "blas: k < 0"
+	kLLT0 = "blas: kL < 0"
+	kULT0 = "blas: kU < 0"
+
+	// Flag arguments holding a value outside the accepted set.
+	badUplo      = "blas: illegal triangle"
+	badTranspose = "blas: illegal transpose"
+	badDiag      = "blas: illegal diagonal"
+	badSide      = "blas: illegal side"
+	badFlag      = "blas: illegal rotm flag"
+
+	// Leading dimensions that are too small for the matrix they describe.
+	badLdA = "blas: bad leading dimension of A"
+	badLdB = "blas: bad leading dimension of B"
+	badLdC = "blas: bad leading dimension of C"
+
+	// Slices that are too short for the sizes and increments supplied.
+	shortX  = "blas: insufficient length of x"
+	shortY  = "blas: insufficient length of y"
+	shortAP = "blas: insufficient length of ap"
+	shortA  = "blas: insufficient length of a"
+	shortB  = "blas: insufficient length of b"
+	shortC  = "blas: insufficient length of c"
+)
--- a/vendor/gonum.org/v1/gonum/blas/gonum/gemv.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/gemv.go
@@ -0,0 +1,190 @@
+// Copyright ©2018 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// TODO(Kunde21):  Merge these methods back into level2double/level2single when Sgemv assembly kernels are merged into f32.
+
+// Dgemv computes
+//  y = alpha * A * x + beta * y    if tA = blas.NoTrans
+//  y = alpha * A^T * x + beta * y  if tA = blas.Trans or blas.ConjTrans
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
+//
+// Dgemv panics if tA is not one of the three accepted values, if m or n is
+// negative, if lda < max(1, n), if either increment is zero, or if x, y or a
+// is too short for the requested operation.
+func (Implementation) Dgemv(tA blas.Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int) {
+	switch tA {
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	default:
+		panic(badTranspose)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// Nothing to do for an empty matrix.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// x has one element per column of op(A) and y one per row of op(A).
+	lenX, lenY := m, n
+	if tA == blas.NoTrans {
+		lenX, lenY = n, m
+	}
+
+	switch {
+	case (incX > 0 && (lenX-1)*incX >= len(x)) || (incX < 0 && (1-lenX)*incX >= len(x)):
+		panic(shortX)
+	case (incY > 0 && (lenY-1)*incY >= len(y)) || (incY < 0 && (1-lenY)*incY >= len(y)):
+		panic(shortY)
+	case len(a) < lda*(m-1)+n:
+		panic(shortA)
+	}
+
+	if alpha == 0 {
+		if beta == 1 {
+			// y is left unchanged.
+			return
+		}
+		// Only the beta * y term remains; Dscal is given a positive stride.
+		stride := incY
+		if stride < 0 {
+			stride = -incY
+		}
+		Implementation{}.Dscal(lenY, beta, y, stride)
+		return
+	}
+
+	// The f64 kernels receive alpha and beta and perform the full update.
+	if tA != blas.NoTrans {
+		f64.GemvT(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY))
+		return
+	}
+	f64.GemvN(uintptr(m), uintptr(n), alpha, a, uintptr(lda), x, uintptr(incX), beta, y, uintptr(incY))
+}
+
+// Sgemv computes
+//  y = alpha * A * x + beta * y    if tA = blas.NoTrans
+//  y = alpha * A^T * x + beta * y  if tA = blas.Trans or blas.ConjTrans
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
+//
+// Sgemv panics if tA is not one of the three accepted values, if m or n is
+// negative, if lda < max(1, n), if either increment is zero, or if x, y or a
+// is too short for the requested operation.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sgemv(tA blas.Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int) {
+	switch tA {
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	default:
+		panic(badTranspose)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case lda < max(1, n):
+		panic(badLdA)
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	}
+
+	// Nothing to do for an empty matrix.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// x has one element per column of op(A) and y one per row of op(A).
+	lenX, lenY := m, n
+	if tA == blas.NoTrans {
+		lenX, lenY = n, m
+	}
+	switch {
+	case (incX > 0 && (lenX-1)*incX >= len(x)) || (incX < 0 && (1-lenX)*incX >= len(x)):
+		panic(shortX)
+	case (incY > 0 && (lenY-1)*incY >= len(y)) || (incY < 0 && (1-lenY)*incY >= len(y)):
+		panic(shortY)
+	case len(a) < lda*(m-1)+n:
+		panic(shortA)
+	}
+
+	// y is left unchanged.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Scale y by beta first; Sscal is given a positive stride.
+	stride := incY
+	if stride < 0 {
+		stride = -incY
+	}
+	Implementation{}.Sscal(lenY, beta, y, stride)
+
+	if alpha == 0 {
+		return
+	}
+
+	// For a negative increment the walk starts at the far end of the slice.
+	var kx, ky int
+	if incX < 0 {
+		kx = -(lenX - 1) * incX
+	}
+	if incY < 0 {
+		ky = -(lenY - 1) * incY
+	}
+
+	unitInc := incX == 1 && incY == 1
+	switch {
+	case tA == blas.NoTrans && unitInc:
+		// y[i] += alpha * (row i of A) · x
+		for i := 0; i < m; i++ {
+			y[i] += alpha * f32.DotUnitary(a[lda*i:lda*i+n], x[:n])
+		}
+	case tA == blas.NoTrans:
+		iy := ky
+		for i := 0; i < m; i++ {
+			y[iy] += alpha * f32.DotInc(x, a[lda*i:lda*i+n], uintptr(n), uintptr(incX), 1, uintptr(kx), 0)
+			iy += incY
+		}
+	case unitInc:
+		// A is transposed: y += alpha * x[i] * (row i of A).
+		for i := 0; i < m; i++ {
+			coef := alpha * x[i]
+			if coef == 0 {
+				continue
+			}
+			f32.AxpyUnitaryTo(y, coef, a[lda*i:lda*i+n], y[:n])
+		}
+	default:
+		ix := kx
+		for i := 0; i < m; i++ {
+			coef := alpha * x[ix]
+			if coef != 0 {
+				f32.AxpyInc(coef, a[lda*i:lda*i+n], y, uintptr(n), 1, uintptr(incY), 0, uintptr(ky))
+			}
+			ix += incX
+		}
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go
@@ -0,0 +1,58 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate ./single_precision.bash
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/internal/math32"
+)
+
+// Implementation is the empty struct type on which this package's BLAS
+// routines are defined as methods.
+type Implementation struct{}
+
+// [SD]gemm behavior constants. These are kept here to keep them out of the
+// way during single precision code generation.
+const (
+	blockSize   = 64 // b x b matrix
+	minParBlock = 4  // minimum number of blocks needed to go parallel
+	buffMul     = 4  // how big is the buffer relative to the number of workers
+)
+
+// subMul is a common type shared by [SD]gemm. It names one block of the
+// output matrix by the position of the block's first element.
+type subMul struct {
+	i int // row index of block
+	j int // column index of block
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// blocks returns how many blocks of size bsize are needed to cover a
+// dimension of length dim, counting a final partial block as one.
+func blocks(dim, bsize int) int {
+	padded := dim + bsize - 1
+	return padded / bsize
+}
+
+// dcabs1 returns |real(z)|+|imag(z)|, the sum of the absolute values of the
+// real and imaginary parts of z.
+func dcabs1(z complex128) float64 {
+	re := math.Abs(real(z))
+	im := math.Abs(imag(z))
+	return re + im
+}
+
+// scabs1 returns |real(z)|+|imag(z)|, the sum of the absolute values of the
+// real and imaginary parts of z.
+func scabs1(z complex64) float32 {
+	re := math32.Abs(real(z))
+	im := math32.Abs(imag(z))
+	return re + im
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go
@@ -0,0 +1,445 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c128"
+)
+
+// Compile-time check that Implementation satisfies blas.Complex128Level1.
+var _ blas.Complex128Level1 = Implementation{}
+
+// Dzasum returns the sum of the absolute values of the elements of x
+//  \sum_i |Re(x[i])| + |Im(x[i])|
+// Dzasum returns 0 if incX is negative.
+//
+// Dzasum panics if n is negative, if incX is zero, or, for positive incX, if
+// x is too short to hold n elements at that increment.
+func (Implementation) Dzasum(n int, x []complex128, incX int) float64 {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	var total float64
+	if incX != 1 {
+		if (n-1)*incX >= len(x) {
+			panic(shortX)
+		}
+		for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+			total += dcabs1(x[ix])
+		}
+		return total
+	}
+	if n > len(x) {
+		panic(shortX)
+	}
+	for i := 0; i < n; i++ {
+		total += dcabs1(x[i])
+	}
+	return total
+}
+
+// Dznrm2 computes the Euclidean norm of the complex vector x,
+//  ‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
+// This function returns 0 if incX is negative.
+//
+// Dznrm2 panics if incX is zero, if n is negative, or if x is too short to
+// hold n elements at increment incX.
+func (Implementation) Dznrm2(n int, x []complex128, incX int) float64 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	case n == 0:
+		return 0
+	case n < 0:
+		panic(nLT0)
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	}
+	// scale holds the largest absolute component seen so far and ssq the
+	// running sum of squares divided by scale*scale, so the squares
+	// themselves are never formed directly.
+	var scale float64
+	ssq := 1.0
+	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+		v := x[ix]
+		// The real part is folded in before the imaginary part.
+		for _, part := range [2]float64{math.Abs(real(v)), math.Abs(imag(v))} {
+			if part == 0 {
+				continue
+			}
+			if part > scale {
+				ssq = 1 + ssq*(scale/part)*(scale/part)
+				scale = part
+			} else {
+				ssq += (part / scale) * (part / scale)
+			}
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(ssq)
+}
+
+// Izamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|.
+// Izamax returns -1 if n is 0 or incX is negative.
+//
+// Izamax panics if incX is zero, if n is negative, or if x is too short to
+// hold n elements at increment incX.
+func (Implementation) Izamax(n int, x []complex128, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		// Return invalid index.
+		return -1
+	case n == 0:
+		// Return invalid index.
+		return -1
+	case n < 0:
+		panic(nLT0)
+	case len(x) <= (n-1)*incX:
+		panic(shortX)
+	}
+	best := dcabs1(x[0])
+	bestIdx := 0
+	// The strict comparison keeps the first of any equal maxima.
+	for i, ix := 1, incX; i < n; i, ix = i+1, ix+incX {
+		if cur := dcabs1(x[ix]); cur > best {
+			best = cur
+			bestIdx = i
+		}
+	}
+	return bestIdx
+}
+
+// Zaxpy adds alpha times x to y:
+//  y[i] += alpha * x[i] for all i
+//
+// Zaxpy panics if either increment is zero, if n is negative, or if x or y is
+// too short to hold n elements at its increment.
+func (Implementation) Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	switch {
+	case (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)):
+		panic(shortX)
+	case (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)):
+		panic(shortY)
+	case alpha == 0:
+		// y is left unchanged.
+		return
+	}
+	if incX != 1 || incY != 1 {
+		// For a negative increment the walk starts at the far end.
+		var startX, startY int
+		if incX < 0 {
+			startX = (1 - n) * incX
+		}
+		if incY < 0 {
+			startY = (1 - n) * incY
+		}
+		c128.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+		return
+	}
+	c128.AxpyUnitary(alpha, x[:n], y[:n])
+}
+
+// Zcopy copies the vector x to vector y.
+//
+// Zcopy panics if either increment is zero, if n is negative, or if x or y is
+// too short to hold n elements at its increment.
+func (Implementation) Zcopy(n int, x []complex128, incX int, y []complex128, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	switch {
+	case (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)):
+		panic(shortX)
+	case (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)):
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	// For a negative increment the walk starts at the far end.
+	var srcIdx, dstIdx int
+	if incX < 0 {
+		srcIdx = (-n + 1) * incX
+	}
+	if incY < 0 {
+		dstIdx = (-n + 1) * incY
+	}
+	for left := n; left > 0; left-- {
+		y[dstIdx] = x[srcIdx]
+		srcIdx += incX
+		dstIdx += incY
+	}
+}
+
+// Zdotc computes the dot product
+//  x^H · y
+// of two complex vectors x and y.
+//
+// Zdotc returns 0 if n is 0. It panics if either increment is zero, if n is
+// negative, or if x or y is too short to hold n elements at its increment.
+func (Implementation) Zdotc(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return 0
+	case n < 0:
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		switch {
+		case len(x) < n:
+			panic(shortX)
+		case len(y) < n:
+			panic(shortY)
+		}
+		return c128.DotcUnitary(x[:n], y[:n])
+	}
+	// For a negative increment the walk starts at the far end.
+	var startX, startY int
+	if incX < 0 {
+		startX = (-n + 1) * incX
+	}
+	if incY < 0 {
+		startY = (-n + 1) * incY
+	}
+	switch {
+	case startX >= len(x) || (n-1)*incX >= len(x):
+		panic(shortX)
+	case startY >= len(y) || (n-1)*incY >= len(y):
+		panic(shortY)
+	}
+	return c128.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
+
+// Zdotu computes the dot product
+//  x^T · y
+// of two complex vectors x and y.
+//
+// Zdotu returns 0 if n is 0. It panics if either increment is zero, if n is
+// negative, or if x or y is too short to hold n elements at its increment.
+func (Implementation) Zdotu(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return 0
+	case n < 0:
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		switch {
+		case len(x) < n:
+			panic(shortX)
+		case len(y) < n:
+			panic(shortY)
+		}
+		return c128.DotuUnitary(x[:n], y[:n])
+	}
+	// For a negative increment the walk starts at the far end.
+	var startX, startY int
+	if incX < 0 {
+		startX = (-n + 1) * incX
+	}
+	if incY < 0 {
+		startY = (-n + 1) * incY
+	}
+	switch {
+	case startX >= len(x) || (n-1)*incX >= len(x):
+		panic(shortX)
+	case startY >= len(y) || (n-1)*incY >= len(y):
+		panic(shortY)
+	}
+	return c128.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
+
+// Zdscal scales the vector x by a real scalar alpha.
+// Zdscal has no effect if incX < 0.
+//
+// Zdscal panics if incX is zero, if x is too short to hold n elements at
+// increment incX, or if n is negative.
+func (Implementation) Zdscal(n int, alpha float64, x []complex128, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+	limit := n * incX
+	if alpha == 0 {
+		// Elements are overwritten with zero rather than multiplied.
+		for ix := 0; ix < limit; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	for ix := 0; ix < limit; ix += incX {
+		elem := x[ix]
+		x[ix] = complex(alpha*real(elem), alpha*imag(elem))
+	}
+}
+
+// Zscal scales the vector x by a complex scalar alpha.
+// Zscal has no effect if incX < 0.
+//
+// Zscal panics if incX is zero, if x is too short to hold n elements at
+// increment incX, or if n is negative.
+func (Implementation) Zscal(n int, alpha complex128, x []complex128, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+	if alpha == 0 {
+		// Elements are overwritten with zero rather than multiplied.
+		for ix, limit := 0, n*incX; ix < limit; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX != 1 {
+		c128.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+		return
+	}
+	c128.ScalUnitary(alpha, x[:n])
+}
+
+// Zswap exchanges the elements of two complex vectors x and y.
+//
+// Zswap panics if either increment is zero, if n is negative, or if x or y is
+// too short to hold n elements at its increment.
+func (Implementation) Zswap(n int, x []complex128, incX int, y []complex128, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	switch {
+	case (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)):
+		panic(shortX)
+	case (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)):
+		panic(shortY)
+	}
+	// For a negative increment the walk starts at the far end; with unit
+	// increments both indices start at zero and advance by one.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for left := n; left > 0; left-- {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go
@@ -0,0 +1,467 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	math "gonum.org/v1/gonum/internal/math32"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c64"
+)
+
+// Compile-time check that Implementation satisfies blas.Complex64Level1.
+var _ blas.Complex64Level1 = Implementation{}
+
+// Scasum returns the sum of the absolute values of the elements of x
+//  \sum_i |Re(x[i])| + |Im(x[i])|
+// Scasum returns 0 if incX is negative.
+//
+// Scasum panics if n is negative, if incX is zero, or, for positive incX, if
+// x is too short to hold n elements at that increment.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Scasum(n int, x []complex64, incX int) float32 {
+	if n < 0 {
+		panic(nLT0)
+	}
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	var sum float32
+	if incX == 1 {
+		// Contiguous case: the first n elements are summed directly.
+		if len(x) < n {
+			panic(shortX)
+		}
+		for _, v := range x[:n] {
+			sum += scabs1(v)
+		}
+		return sum
+	}
+	// Strided case: the last element read is x[(n-1)*incX].
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	for i := 0; i < n; i++ {
+		v := x[i*incX]
+		sum += scabs1(v)
+	}
+	return sum
+}
+
+// Scnrm2 computes the Euclidean norm of the complex vector x,
+//  ‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
+// This function returns 0 if incX is negative.
+//
+// Scnrm2 panics if incX is zero, if n is negative, or if x is too short to
+// hold n elements at increment incX.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Scnrm2(n int, x []complex64, incX int) float32 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if n < 1 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	// scale holds the largest absolute component seen so far and ssq the
+	// running sum of squares divided by scale*scale, so the squares
+	// themselves are never formed directly.
+	var (
+		scale float32
+		ssq   float32 = 1
+	)
+	if incX == 1 {
+		for _, v := range x[:n] {
+			re, im := math.Abs(real(v)), math.Abs(imag(v))
+			if re != 0 {
+				if re > scale {
+					// New largest component: rescale the running sum.
+					ssq = 1 + ssq*(scale/re)*(scale/re)
+					scale = re
+				} else {
+					ssq += (re / scale) * (re / scale)
+				}
+			}
+			if im != 0 {
+				if im > scale {
+					ssq = 1 + ssq*(scale/im)*(scale/im)
+					scale = im
+				} else {
+					ssq += (im / scale) * (im / scale)
+				}
+			}
+		}
+		if math.IsInf(scale, 1) {
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(ssq)
+	}
+	// Strided case: same accumulation over every incX-th element.
+	for ix := 0; ix < n*incX; ix += incX {
+		re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
+		if re != 0 {
+			if re > scale {
+				ssq = 1 + ssq*(scale/re)*(scale/re)
+				scale = re
+			} else {
+				ssq += (re / scale) * (re / scale)
+			}
+		}
+		if im != 0 {
+			if im > scale {
+				ssq = 1 + ssq*(scale/im)*(scale/im)
+				scale = im
+			} else {
+				ssq += (im / scale) * (im / scale)
+			}
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(ssq)
+}
+
+// Icamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|.
+// Icamax returns -1 if n is 0 or incX is negative.
+//
+// Icamax panics if incX is zero, if n is negative, or if x is too short to
+// hold n elements at increment incX.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Icamax(n int, x []complex64, incX int) int {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		// Return invalid index.
+		return -1
+	}
+	if n < 1 {
+		if n == 0 {
+			// Return invalid index.
+			return -1
+		}
+		panic(nLT0)
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	idx := 0
+	max := scabs1(x[0])
+	if incX == 1 {
+		// The range starts at x[1], so i is one less than the element index.
+		for i, v := range x[1:n] {
+			absV := scabs1(v)
+			// The strict comparison keeps the first of any equal maxima.
+			if absV > max {
+				max = absV
+				idx = i + 1
+			}
+		}
+		return idx
+	}
+	ix := incX
+	for i := 1; i < n; i++ {
+		absV := scabs1(x[ix])
+		if absV > max {
+			max = absV
+			idx = i
+		}
+		ix += incX
+	}
+	return idx
+}
+
+// Caxpy adds alpha times x to y:
+//  y[i] += alpha * x[i] for all i
+//
+// Caxpy panics if either increment is zero, if n is negative, or if x or y is
+// too short to hold n elements at its increment.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	// y is left unchanged when alpha is zero.
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		c64.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	// For a negative increment the walk starts at the far end.
+	var ix, iy int
+	if incX < 0 {
+		ix = (1 - n) * incX
+	}
+	if incY < 0 {
+		iy = (1 - n) * incY
+	}
+	c64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Ccopy copies the vector x to vector y.
+//
+// Ccopy panics if either increment is zero, if n is negative, or if x or y is
+// too short to hold n elements at its increment.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ccopy(n int, x []complex64, incX int, y []complex64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	// For a negative increment the walk starts at the far end.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Cdotc computes the dot product
+//  x^H · y
+// of two complex vectors x and y.
+//
+// Cdotc returns 0 if n is 0. It panics if either increment is zero, if n is
+// negative, or if x or y is too short to hold n elements at its increment.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cdotc(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return c64.DotcUnitary(x[:n], y[:n])
+	}
+	// For a negative increment the walk starts at the far end.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	// The first operand covers negative increments, the second positive ones.
+	if ix >= len(x) || (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || (n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return c64.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Cdotu computes the dot product
+//  x^T · y
+// of two complex vectors x and y.
+//
+// Cdotu returns 0 if n is 0. It panics if either increment is zero, if n is
+// negative, or if x or y is too short to hold n elements at its increment.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cdotu(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return c64.DotuUnitary(x[:n], y[:n])
+	}
+	// For a negative increment the walk starts at the far end.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	// The first operand covers negative increments, the second positive ones.
+	if ix >= len(x) || (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || (n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return c64.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Csscal scales the vector x by a real scalar alpha.
+// Csscal has no effect if incX < 0.
+//
+// Csscal panics if incX is zero, if x is too short to hold n elements at
+// increment incX, or if n is negative.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Csscal(n int, alpha float32, x []complex64, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if alpha == 0 {
+		// Elements are overwritten with zero rather than multiplied.
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i] = complex(alpha*real(v), alpha*imag(v))
+		}
+		return
+	}
+	for ix := 0; ix < n*incX; ix += incX {
+		v := x[ix]
+		x[ix] = complex(alpha*real(v), alpha*imag(v))
+	}
+}
+
+// Cscal scales the vector x by a complex scalar alpha.
+// Cscal has no effect if incX < 0.
+//
+// Cscal panics if incX is zero, if x is too short to hold n elements at
+// increment incX, or if n is negative.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cscal(n int, alpha complex64, x []complex64, incX int) {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if alpha == 0 {
+		// Elements are overwritten with zero rather than multiplied.
+		if incX == 1 {
+			x = x[:n]
+			for i := range x {
+				x[i] = 0
+			}
+			return
+		}
+		for ix := 0; ix < n*incX; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if incX == 1 {
+		c64.ScalUnitary(alpha, x[:n])
+		return
+	}
+	c64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+}
+
+// Cswap exchanges the elements of two complex vectors x and y.
+//
+// Cswap panics if either increment is zero, if n is negative, or if x or y is
+// too short to hold n elements at its increment.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cswap(n int, x []complex64, incX int, y []complex64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	// For a negative increment the walk starts at the far end.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go
@@ -0,0 +1,644 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	math "gonum.org/v1/gonum/internal/math32"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+var _ blas.Float32Level1 = Implementation{}
+
+// Snrm2 computes the Euclidean norm of a vector,
+//  sqrt(\sum_i x[i] * x[i]).
+// This function returns 0 if incX is negative.
+//
+// Snrm2 panics if incX is zero. For positive incX it panics if x is too short
+// to hold n elements spaced incX apart or if n is negative. It returns 0 for
+// n == 0 and |x[0]| for n == 1. For larger n it returns NaN if a NaN element
+// is encountered and +Inf if the largest magnitude seen is +Inf.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Snrm2(n int, x []float32, incX int) float32 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	if n < 2 {
+		if n == 1 {
+			return math.Abs(x[0])
+		}
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	// The sum of squares accumulated so far is represented as
+	// scale*scale*sumSquares, where scale is the largest magnitude seen so
+	// far. The raw elements are therefore never squared directly.
+	var (
+		scale      float32 = 0
+		sumSquares float32 = 1
+	)
+	if incX == 1 {
+		// Unit stride: range over the first n elements.
+		x = x[:n]
+		for _, v := range x {
+			if v == 0 {
+				// Zero elements contribute nothing.
+				continue
+			}
+			absxi := math.Abs(v)
+			if math.IsNaN(absxi) {
+				return math.NaN()
+			}
+			if scale < absxi {
+				// New largest magnitude: re-express the accumulated
+				// sum relative to it before adopting it as scale.
+				sumSquares = 1 + sumSquares*(scale/absxi)*(scale/absxi)
+				scale = absxi
+			} else {
+				sumSquares = sumSquares + (absxi/scale)*(absxi/scale)
+			}
+		}
+		if math.IsInf(scale, 1) {
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(sumSquares)
+	}
+	// General positive stride: same recurrence, visiting x[0], x[incX], ...
+	for ix := 0; ix < n*incX; ix += incX {
+		val := x[ix]
+		if val == 0 {
+			continue
+		}
+		absxi := math.Abs(val)
+		if math.IsNaN(absxi) {
+			return math.NaN()
+		}
+		if scale < absxi {
+			sumSquares = 1 + sumSquares*(scale/absxi)*(scale/absxi)
+			scale = absxi
+		} else {
+			sumSquares = sumSquares + (absxi/scale)*(absxi/scale)
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(sumSquares)
+}
+
+// Sasum returns the sum of the absolute values of n elements of x taken with
+// stride incX,
+//  \sum_i |x[i]|
+// Sasum returns 0 if incX is negative.
+//
+// Sasum panics if n is negative, if incX is zero, or if x is too short to
+// hold n elements spaced incX apart.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sasum(n int, x []float32, incX int) float32 {
+	if n < 0 {
+		panic(nLT0)
+	}
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+
+	var total float32
+	if incX != 1 {
+		for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+			total += math.Abs(x[ix])
+		}
+		return total
+	}
+	for i := 0; i < n; i++ {
+		total += math.Abs(x[i])
+	}
+	return total
+}
+
+// Isamax returns the position, counted in elements rather than in slice
+// offsets, of the element of x with the largest absolute value among the n
+// elements taken with stride incX. The earliest position wins on ties.
+// Isamax returns -1 if n == 0 or if incX is negative.
+//
+// Isamax panics if incX is zero. For positive incX it panics if x is too
+// short to hold n elements spaced incX apart or if n is negative.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Isamax(n int, x []float32, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return -1
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	switch {
+	case n == 1:
+		return 0
+	case n == 0:
+		return -1 // Netlib returns invalid index when n == 0.
+	case n < 0:
+		panic(nLT0)
+	}
+
+	// Element 0 is the initial candidate; only a strictly larger magnitude
+	// replaces it, which keeps the earliest index on ties.
+	best, bestAbs := 0, math.Abs(x[0])
+	if incX == 1 {
+		for i := 1; i < n; i++ {
+			if a := math.Abs(x[i]); a > bestAbs {
+				best, bestAbs = i, a
+			}
+		}
+		return best
+	}
+	for i := 1; i < n; i++ {
+		if a := math.Abs(x[i*incX]); a > bestAbs {
+			best, bestAbs = i, a
+		}
+	}
+	return best
+}
+
+// Sswap exchanges the elements of two vectors.
+//  x[i], y[i] = y[i], x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sswap(n int, x []float32, incX int, y []float32, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Scopy copies the elements of x into the elements of y.
+//  y[i] = x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Scopy(n int, x []float32, incX int, y []float32, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Saxpy adds alpha times x to y
+//  y[i] += alpha * x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		f32.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	f32.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Srotg computes the plane rotation
+//   _    _      _ _       _ _
+//  |  c s |    | a |     | r |
+//  | -s c |  * | b |   = | 0 |
+//   ‾    ‾      ‾ ‾       ‾ ‾
+// where
+//  r = ±√(a^2 + b^2)
+//  c = a/r, the cosine of the plane rotation
+//  s = b/r, the sine of the plane rotation
+//
+// The sign of r is taken from a when |a| > |b| and from b otherwise. The
+// fourth result z is s when |a| > |b|; otherwise it is 1/c when c is
+// non-zero and 1 when c is zero. When both a and b are zero, Srotg returns
+// c = 1, s = 0, r = a and z = 0.
+//
+// NOTE: There is a discrepancy between the reference implementation and the BLAS
+// technical manual regarding the sign for r when a or b are zero.
+// Srotg agrees with the definition in the manual and other
+// common BLAS implementations.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotg(a, b float32) (c, s, r, z float32) {
+	// Nothing to rotate; also avoids dividing by r == 0 below.
+	if b == 0 && a == 0 {
+		return 1, 0, a, 0
+	}
+	absA := math.Abs(a)
+	absB := math.Abs(b)
+	aGTb := absA > absB
+	r = math.Hypot(a, b)
+	// Give r the sign of the input with the larger magnitude.
+	if aGTb {
+		r = math.Copysign(r, a)
+	} else {
+		r = math.Copysign(r, b)
+	}
+	c = a / r
+	s = b / r
+	// z is a single value derived from c and s, chosen by which input dominates.
+	if aGTb {
+		z = s
+	} else if c != 0 { // r == 0 case handled above
+		z = 1 / c
+	} else {
+		z = 1
+	}
+	return
+}
+
+// Srotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details.
+//
+// It returns the rotation parameters p together with the updated values of
+// d1, d2 and x1. If d1 is negative, or if one of the intermediate sign checks
+// (u <= 0 or q2 < 0) fails, p.Flag is set to blas.Rescaling, p.H is left
+// zero and the three returned scalars are zero. If d2 or y1 is zero, p.Flag
+// is blas.Identity and d1, d2 and x1 are returned unchanged.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotmg(d1, d2, x1, y1 float32) (p blas.SrotmParams, rd1, rd2, rx1 float32) {
+	// The implementation of Drotmg used here is taken from Hopkins 1997
+	// Appendix A: https://doi.org/10.1145/289251.289253
+	// with the exception of the gam constants below.
+
+	// gam is the rescaling factor. The loops below multiply or divide d1 and
+	// d2 by gam*gam until each is zero or has magnitude in (rgamsq, gamsq].
+	const (
+		gam    = 4096.0
+		gamsq  = gam * gam
+		rgamsq = 1.0 / gamsq
+	)
+
+	if d1 < 0 {
+		p.Flag = blas.Rescaling // Error state.
+		return p, 0, 0, 0
+	}
+
+	if d2 == 0 || y1 == 0 {
+		p.Flag = blas.Identity
+		return p, d1, d2, x1
+	}
+
+	var h11, h12, h21, h22 float32
+	if (d1 == 0 || x1 == 0) && d2 > 0 {
+		// h11 and h22 keep their zero values in this branch.
+		p.Flag = blas.Diagonal
+		h12 = 1
+		h21 = -1
+		x1 = y1
+		d1, d2 = d2, d1
+	} else {
+		p2 := d2 * y1
+		p1 := d1 * x1
+		q2 := p2 * y1
+		q1 := p1 * x1
+		if math.Abs(q1) > math.Abs(q2) {
+			p.Flag = blas.OffDiagonal
+			h11 = 1
+			h22 = 1
+			h21 = -y1 / x1
+			h12 = p2 / p1
+			u := 1 - h12*h21
+			if u <= 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			d1 /= u
+			d2 /= u
+			x1 *= u
+		} else {
+			if q2 < 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			p.Flag = blas.Diagonal
+			h21 = -1
+			h12 = 1
+			h11 = p1 / p2
+			h22 = x1 / y1
+			u := 1 + h11*h22
+			d1, d2 = d2/u, d1/u
+			x1 = y1 * u
+		}
+	}
+
+	// Rescale d1, compensating in x1 and in the first row of H (h11, h12).
+	// Any rescaling switches the flag to blas.Rescaling.
+	for d1 <= rgamsq && d1 != 0 {
+		p.Flag = blas.Rescaling
+		d1 = (d1 * gam) * gam
+		x1 /= gam
+		h11 /= gam
+		h12 /= gam
+	}
+	for d1 > gamsq {
+		p.Flag = blas.Rescaling
+		d1 = (d1 / gam) / gam
+		x1 *= gam
+		h11 *= gam
+		h12 *= gam
+	}
+
+	// Rescale d2 by magnitude, compensating in the second row of H (h21, h22).
+	for math.Abs(d2) <= rgamsq && d2 != 0 {
+		p.Flag = blas.Rescaling
+		d2 = (d2 * gam) * gam
+		h21 /= gam
+		h22 /= gam
+	}
+	for math.Abs(d2) > gamsq {
+		p.Flag = blas.Rescaling
+		d2 = (d2 / gam) / gam
+		h21 *= gam
+		h22 *= gam
+	}
+
+	// p.H is laid out as {h11, h21, h12, h22}. Diagonal stores only h11 and
+	// h22, OffDiagonal only h21 and h12, and Rescaling all four; entries not
+	// stored are left zero.
+	switch p.Flag {
+	case blas.Diagonal:
+		p.H = [4]float32{0: h11, 3: h22}
+	case blas.OffDiagonal:
+		p.H = [4]float32{1: h21, 2: h12}
+	case blas.Rescaling:
+		p.H = [4]float32{h11, h21, h12, h22}
+	default:
+		panic(badFlag)
+	}
+
+	return p, d1, d2, x1
+}
+
+// Srot applies a plane transformation.
+//  x[i] = c * x[i] + s * y[i]
+//  y[i] = c * y[i] - s * x[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srot(n int, x []float32, incX int, y []float32, incY int, c float32, s float32) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, vx := range x {
+			vy := y[i]
+			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		vx := x[ix]
+		vy := y[iy]
+		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
+		ix += incX
+		iy += incY
+	}
+}
+
+// Srotm applies the modified Givens rotation to the 2×n matrix.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotm(n int, x []float32, incX int, y []float32, incY int, p blas.SrotmParams) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	if p.Flag == blas.Identity {
+		return
+	}
+
+	switch p.Flag {
+	case blas.Rescaling:
+		h11 := p.H[0]
+		h12 := p.H[2]
+		h21 := p.H[1]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx*h11+vy*h12, vx*h21+vy*h22
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx*h11+vy*h12, vx*h21+vy*h22
+			ix += incX
+			iy += incY
+		}
+	case blas.OffDiagonal:
+		h12 := p.H[2]
+		h21 := p.H[1]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx+vy*h12, vx*h21+vy
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx+vy*h12, vx*h21+vy
+			ix += incX
+			iy += incY
+		}
+	case blas.Diagonal:
+		h11 := p.H[0]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx*h11+vy, -vx+vy*h22
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx*h11+vy, -vx+vy*h22
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Sscal multiplies n elements of x taken with stride incX by alpha, in place.
+//  x[i] *= alpha
+// Sscal has no effect if incX < 0.
+//
+// Sscal panics if incX is zero. For positive incX it panics if n is negative
+// or if x is too short to hold n elements spaced incX apart, and does
+// nothing if n is zero.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sscal(n int, alpha float32, x []float32, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+
+	unit := incX == 1
+	if alpha != 0 {
+		if unit {
+			f32.ScalUnitary(alpha, x[:n])
+		} else {
+			f32.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+		}
+		return
+	}
+
+	// A zero alpha is handled by storing zeros directly instead of multiplying.
+	if unit {
+		head := x[:n]
+		for i := range head {
+			head[i] = 0
+		}
+		return
+	}
+	for ix, end := 0, n*incX; ix < end; ix += incX {
+		x[ix] = 0
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go
@@ -0,0 +1,53 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Dsdot computes the dot product of the two vectors
+//  \sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Dsdot(n int, x []float32, incX int, y []float32, incY int) float64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f32.DdotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go
@@ -0,0 +1,53 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sdot computes the dot product of the two vectors
+//  \sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sdot(n int, x []float32, incX int, y []float32, incY int) float32 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f32.DotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f32.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go
@@ -0,0 +1,53 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sdsdot computes the dot product of the two vectors plus a constant
+//  alpha + \sum_i x[i]*y[i]
+// The dot product is obtained as a float64 and converted to float32 before
+// alpha is added.
+//
+// Sdsdot panics if either stride is zero, if n is negative, or if x or y is
+// too short for n elements at the given stride. If n is zero the sum is
+// empty and Sdsdot returns alpha.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			// An empty sum contributes nothing, so only the constant
+			// term remains; this matches the reference BLAS.
+			return alpha
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return alpha + float32(f32.DdotUnitary(x[:n], y[:n]))
+	}
+	// With a negative stride the walk starts at the far end of the vector.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return alpha + float32(f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go
@@ -0,0 +1,620 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level1 = Implementation{}
+
+// Dnrm2 computes the Euclidean norm of a vector,
+//  sqrt(\sum_i x[i] * x[i]).
+// This function returns 0 if incX is negative.
+//
+// Dnrm2 panics if incX is zero. For positive incX it panics if x is too short
+// to hold n elements spaced incX apart or if n is negative. It returns 0 for
+// n == 0 and |x[0]| for n == 1. For larger n it returns NaN if a NaN element
+// is encountered and +Inf if the largest magnitude seen is +Inf.
+func (Implementation) Dnrm2(n int, x []float64, incX int) float64 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	if n < 2 {
+		if n == 1 {
+			return math.Abs(x[0])
+		}
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	// The sum of squares accumulated so far is represented as
+	// scale*scale*sumSquares, where scale is the largest magnitude seen so
+	// far. The raw elements are therefore never squared directly.
+	var (
+		scale      float64 = 0
+		sumSquares float64 = 1
+	)
+	if incX == 1 {
+		// Unit stride: range over the first n elements.
+		x = x[:n]
+		for _, v := range x {
+			if v == 0 {
+				// Zero elements contribute nothing.
+				continue
+			}
+			absxi := math.Abs(v)
+			if math.IsNaN(absxi) {
+				return math.NaN()
+			}
+			if scale < absxi {
+				// New largest magnitude: re-express the accumulated
+				// sum relative to it before adopting it as scale.
+				sumSquares = 1 + sumSquares*(scale/absxi)*(scale/absxi)
+				scale = absxi
+			} else {
+				sumSquares = sumSquares + (absxi/scale)*(absxi/scale)
+			}
+		}
+		if math.IsInf(scale, 1) {
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(sumSquares)
+	}
+	// General positive stride: same recurrence, visiting x[0], x[incX], ...
+	for ix := 0; ix < n*incX; ix += incX {
+		val := x[ix]
+		if val == 0 {
+			continue
+		}
+		absxi := math.Abs(val)
+		if math.IsNaN(absxi) {
+			return math.NaN()
+		}
+		if scale < absxi {
+			sumSquares = 1 + sumSquares*(scale/absxi)*(scale/absxi)
+			scale = absxi
+		} else {
+			sumSquares = sumSquares + (absxi/scale)*(absxi/scale)
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(sumSquares)
+}
+
+// Dasum returns the sum of the absolute values of n elements of x taken with
+// stride incX,
+//  \sum_i |x[i]|
+// Dasum returns 0 if incX is negative.
+//
+// Dasum panics if n is negative, if incX is zero, or if x is too short to
+// hold n elements spaced incX apart.
+func (Implementation) Dasum(n int, x []float64, incX int) float64 {
+	if n < 0 {
+		panic(nLT0)
+	}
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+
+	var total float64
+	if incX != 1 {
+		for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+			total += math.Abs(x[ix])
+		}
+		return total
+	}
+	for i := 0; i < n; i++ {
+		total += math.Abs(x[i])
+	}
+	return total
+}
+
+// Idamax returns the position, counted in elements rather than in slice
+// offsets, of the element of x with the largest absolute value among the n
+// elements taken with stride incX. The earliest position wins on ties.
+// Idamax returns -1 if n == 0 or if incX is negative.
+//
+// Idamax panics if incX is zero. For positive incX it panics if x is too
+// short to hold n elements spaced incX apart or if n is negative.
+func (Implementation) Idamax(n int, x []float64, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return -1
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	switch {
+	case n == 1:
+		return 0
+	case n == 0:
+		return -1 // Netlib returns invalid index when n == 0.
+	case n < 0:
+		panic(nLT0)
+	}
+
+	// Element 0 is the initial candidate; only a strictly larger magnitude
+	// replaces it, which keeps the earliest index on ties.
+	best, bestAbs := 0, math.Abs(x[0])
+	if incX == 1 {
+		for i := 1; i < n; i++ {
+			if a := math.Abs(x[i]); a > bestAbs {
+				best, bestAbs = i, a
+			}
+		}
+		return best
+	}
+	for i := 1; i < n; i++ {
+		if a := math.Abs(x[i*incX]); a > bestAbs {
+			best, bestAbs = i, a
+		}
+	}
+	return best
+}
+
+// Dswap exchanges the elements of two vectors.
+//  x[i], y[i] = y[i], x[i] for all i
+func (Implementation) Dswap(n int, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Dcopy copies the elements of x into the elements of y.
+//  y[i] = x[i] for all i
+func (Implementation) Dcopy(n int, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Daxpy adds alpha times x to y
+//  y[i] += alpha * x[i] for all i
+func (Implementation) Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		f64.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	f64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Drotg computes the plane rotation
+//   _    _      _ _       _ _
+//  |  c s |    | a |     | r |
+//  | -s c |  * | b |   = | 0 |
+//   ‾    ‾      ‾ ‾       ‾ ‾
+// where
+//  r = ±√(a^2 + b^2)
+//  c = a/r, the cosine of the plane rotation
+//  s = b/r, the sine of the plane rotation
+//
+// The sign of r is taken from a when |a| > |b| and from b otherwise. The
+// fourth result z is s when |a| > |b|; otherwise it is 1/c when c is
+// non-zero and 1 when c is zero. When both a and b are zero, Drotg returns
+// c = 1, s = 0, r = a and z = 0.
+//
+// NOTE: There is a discrepancy between the reference implementation and the BLAS
+// technical manual regarding the sign for r when a or b are zero.
+// Drotg agrees with the definition in the manual and other
+// common BLAS implementations.
+func (Implementation) Drotg(a, b float64) (c, s, r, z float64) {
+	// Nothing to rotate; also avoids dividing by r == 0 below.
+	if b == 0 && a == 0 {
+		return 1, 0, a, 0
+	}
+	absA := math.Abs(a)
+	absB := math.Abs(b)
+	aGTb := absA > absB
+	r = math.Hypot(a, b)
+	// Give r the sign of the input with the larger magnitude.
+	if aGTb {
+		r = math.Copysign(r, a)
+	} else {
+		r = math.Copysign(r, b)
+	}
+	c = a / r
+	s = b / r
+	// z is a single value derived from c and s, chosen by which input dominates.
+	if aGTb {
+		z = s
+	} else if c != 0 { // r == 0 case handled above
+		z = 1 / c
+	} else {
+		z = 1
+	}
+	return
+}
+
+// Drotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details.
+//
+// It returns the rotation parameters p together with the updated values of
+// d1, d2 and x1. If d1 is negative, or if one of the intermediate sign checks
+// (u <= 0 or q2 < 0) fails, p.Flag is set to blas.Rescaling, p.H is left
+// zero and the three returned scalars are zero. If d2 or y1 is zero, p.Flag
+// is blas.Identity and d1, d2 and x1 are returned unchanged.
+func (Implementation) Drotmg(d1, d2, x1, y1 float64) (p blas.DrotmParams, rd1, rd2, rx1 float64) {
+	// The implementation of Drotmg used here is taken from Hopkins 1997
+	// Appendix A: https://doi.org/10.1145/289251.289253
+	// with the exception of the gam constants below.
+
+	// gam is the rescaling factor. The loops below multiply or divide d1 and
+	// d2 by gam*gam until each is zero or has magnitude in (rgamsq, gamsq].
+	const (
+		gam    = 4096.0
+		gamsq  = gam * gam
+		rgamsq = 1.0 / gamsq
+	)
+
+	if d1 < 0 {
+		p.Flag = blas.Rescaling // Error state.
+		return p, 0, 0, 0
+	}
+
+	if d2 == 0 || y1 == 0 {
+		p.Flag = blas.Identity
+		return p, d1, d2, x1
+	}
+
+	var h11, h12, h21, h22 float64
+	if (d1 == 0 || x1 == 0) && d2 > 0 {
+		// h11 and h22 keep their zero values in this branch.
+		p.Flag = blas.Diagonal
+		h12 = 1
+		h21 = -1
+		x1 = y1
+		d1, d2 = d2, d1
+	} else {
+		p2 := d2 * y1
+		p1 := d1 * x1
+		q2 := p2 * y1
+		q1 := p1 * x1
+		if math.Abs(q1) > math.Abs(q2) {
+			p.Flag = blas.OffDiagonal
+			h11 = 1
+			h22 = 1
+			h21 = -y1 / x1
+			h12 = p2 / p1
+			u := 1 - h12*h21
+			if u <= 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			d1 /= u
+			d2 /= u
+			x1 *= u
+		} else {
+			if q2 < 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			p.Flag = blas.Diagonal
+			h21 = -1
+			h12 = 1
+			h11 = p1 / p2
+			h22 = x1 / y1
+			u := 1 + h11*h22
+			d1, d2 = d2/u, d1/u
+			x1 = y1 * u
+		}
+	}
+
+	// Rescale d1, compensating in x1 and in the first row of H (h11, h12).
+	// Any rescaling switches the flag to blas.Rescaling.
+	for d1 <= rgamsq && d1 != 0 {
+		p.Flag = blas.Rescaling
+		d1 = (d1 * gam) * gam
+		x1 /= gam
+		h11 /= gam
+		h12 /= gam
+	}
+	for d1 > gamsq {
+		p.Flag = blas.Rescaling
+		d1 = (d1 / gam) / gam
+		x1 *= gam
+		h11 *= gam
+		h12 *= gam
+	}
+
+	// Rescale d2 by magnitude, compensating in the second row of H (h21, h22).
+	for math.Abs(d2) <= rgamsq && d2 != 0 {
+		p.Flag = blas.Rescaling
+		d2 = (d2 * gam) * gam
+		h21 /= gam
+		h22 /= gam
+	}
+	for math.Abs(d2) > gamsq {
+		p.Flag = blas.Rescaling
+		d2 = (d2 / gam) / gam
+		h21 *= gam
+		h22 *= gam
+	}
+
+	// p.H is laid out as {h11, h21, h12, h22}. Diagonal stores only h11 and
+	// h22, OffDiagonal only h21 and h12, and Rescaling all four; entries not
+	// stored are left zero.
+	switch p.Flag {
+	case blas.Diagonal:
+		p.H = [4]float64{0: h11, 3: h22}
+	case blas.OffDiagonal:
+		p.H = [4]float64{1: h21, 2: h12}
+	case blas.Rescaling:
+		p.H = [4]float64{h11, h21, h12, h22}
+	default:
+		panic(badFlag)
+	}
+
+	return p, d1, d2, x1
+}
+
+// Drot applies a plane transformation.
+//  x[i] = c * x[i] + s * y[i]
+//  y[i] = c * y[i] - s * x[i]
+func (Implementation) Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, vx := range x {
+			vy := y[i]
+			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		vx := x[ix]
+		vy := y[iy]
+		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
+		ix += incX
+		iy += incY
+	}
+}
+
+// Drotm applies the modified Givens rotation to the 2×n matrix.
+func (Implementation) Drotm(n int, x []float64, incX int, y []float64, incY int, p blas.DrotmParams) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	if p.Flag == blas.Identity {
+		return
+	}
+
+	switch p.Flag {
+	case blas.Rescaling:
+		h11 := p.H[0]
+		h12 := p.H[2]
+		h21 := p.H[1]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx*h11+vy*h12, vx*h21+vy*h22
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx*h11+vy*h12, vx*h21+vy*h22
+			ix += incX
+			iy += incY
+		}
+	case blas.OffDiagonal:
+		h12 := p.H[2]
+		h21 := p.H[1]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx+vy*h12, vx*h21+vy
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx+vy*h12, vx*h21+vy
+			ix += incX
+			iy += incY
+		}
+	case blas.Diagonal:
+		h11 := p.H[0]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx*h11+vy, -vx+vy*h22
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx*h11+vy, -vx+vy*h22
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Dscal multiplies n elements of x taken with stride incX by alpha, in place.
+//  x[i] *= alpha
+// Dscal has no effect if incX < 0.
+//
+// Dscal panics if incX is zero. For positive incX it panics if n is negative
+// or if x is too short to hold n elements spaced incX apart, and does
+// nothing if n is zero.
+func (Implementation) Dscal(n int, alpha float64, x []float64, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+
+	unit := incX == 1
+	if alpha != 0 {
+		if unit {
+			f64.ScalUnitary(alpha, x[:n])
+		} else {
+			f64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+		}
+		return
+	}
+
+	// A zero alpha is handled by storing zeros directly instead of multiplying.
+	if unit {
+		head := x[:n]
+		for i := range head {
+			head[i] = 0
+		}
+		return
+	}
+	for ix, end := 0, n*incX; ix < end; ix += incX {
+		x[ix] = 0
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go
@@ -0,0 +1,49 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Ddot computes the dot product of the two vectors
+//  \sum_i x[i]*y[i]
+func (Implementation) Ddot(n int, x []float64, incX int, y []float64, incY int) float64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f64.DotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f64.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go
@@ -0,0 +1,876 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+var _ blas.Float32Level3 = Implementation{} // compile-time check that Implementation satisfies blas.Float32Level3
+
+// Strsm solves one of the matrix equations
+//  A * X = alpha * B    if tA == blas.NoTrans and side == blas.Left
+//  A^T * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//  X * A = alpha * B    if tA == blas.NoTrans and side == blas.Right
+//  X * A^T = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) triangular matrix, X and B are m×n
+// matrices, and alpha is a scalar. Element (i,j) of A is a[i*lda+j]; ul selects the triangle that is read, and with d == blas.Unit the diagonal is not read.
+//
+// At entry to the function, b (element (i,j) at b[i*ldb+j]) contains the values of B, and the result X is
+// stored in-place into b. If alpha is zero, the m×n part of b is set to zero and the elements of a are not read.
+//
+// No check is made that A is invertible. Strsm panics if s, ul, tA or d is invalid, if m or n is negative, if lda or ldb is too small, or if a or b is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // k is the order of A: n when A is applied from the right,
+	if s == blas.Left {
+		k = m // and m when A is applied from the left.
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible: an empty B needs no work and no slice length checks.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Both dimensions are positive here, so the last element indexed in a and b must exist.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	if alpha == 0 { // the right-hand side is zero, so X is zero; clear b without reading a
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+	nonUnit := d == blas.NonUnit // when false the diagonal of a is never read
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				for i := m - 1; i >= 0; i-- { // back substitution: row i uses the already solved rows below it
+					btmp := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						f32.ScalUnitary(alpha, btmp)
+					}
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						if va != 0 {
+							k := ka + i + 1 // ka indexes the slice that starts at column i+1
+							f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+					if nonUnit {
+						tmp := 1 / a[i*lda+i]
+						f32.ScalUnitary(tmp, btmp)
+					}
+				}
+				return
+			}
+			for i := 0; i < m; i++ { // forward substitution: row i uses the already solved rows above it
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmp)
+				}
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+				if nonUnit {
+					tmp := 1 / a[i*lda+i]
+					f32.ScalUnitary(tmp, btmp)
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: row k of a is column k of A^T, so each solved row k is scattered into the rows that depend on it.
+		if ul == blas.Upper {
+			for k := 0; k < m; k++ {
+				btmpk := b[k*ldb : k*ldb+n]
+				if nonUnit {
+					tmp := 1 / a[k*lda+k]
+					f32.ScalUnitary(tmp, btmpk)
+				}
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					if va != 0 {
+						i := ia + k + 1 // ia indexes the slice that starts at column k+1
+						f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+					}
+				}
+				if alpha != 1 { // alpha is applied to row k last; the rows updated above are scaled on their own turn
+					f32.ScalUnitary(alpha, btmpk)
+				}
+			}
+			return
+		}
+		for k := m - 1; k >= 0; k-- {
+			btmpk := b[k*ldb : k*ldb+n]
+			if nonUnit {
+				tmp := 1 / a[k*lda+k]
+				f32.ScalUnitary(tmp, btmpk)
+			}
+			for i, va := range a[k*lda : k*lda+k] {
+				if va != 0 {
+					f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+				}
+			}
+			if alpha != 1 { // as above: scale row k only after it has been propagated
+				f32.ScalUnitary(alpha, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is to the right of X: every row of b is solved independently of the others.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmp)
+				}
+				for k, vb := range btmp { // vb is read at iteration time, so it includes updates from earlier columns
+					if vb == 0 { // a zero entry is its own solution and changes no later column
+						continue
+					}
+					if nonUnit {
+						btmp[k] /= a[k*lda+k]
+					}
+					f32.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			if alpha != 1 {
+				f32.ScalUnitary(alpha, btmp)
+			}
+			for k := n - 1; k >= 0; k-- { // lower triangle: solve from the last column backwards
+				if btmp[k] == 0 {
+					continue
+				}
+				if nonUnit {
+					btmp[k] /= a[k*lda+k]
+				}
+				f32.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: entry j is alpha*b[j] minus a dot product with the entries already solved.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
+				if nonUnit {
+					tmp /= a[j*lda+j]
+				}
+				btmp[j] = tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := 0; j < n; j++ {
+			tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			if nonUnit {
+				tmp /= a[j*lda+j]
+			}
+			btmp[j] = tmp
+		}
+	}
+}
+
+// Ssymm performs one of the matrix-matrix operations
+//  C = alpha * A * B + beta * C  if side == blas.Left
+//  C = alpha * B * A + beta * C  if side == blas.Right
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) symmetric
+// matrix, B and C are m×n matrices, and alpha and beta are scalars.
+//
+// Matrices are indexed row-major: element (i,j) of A is a[i*lda+j], and
+// likewise for b with ldb and c with ldc. Only the triangle of A selected by
+// ul is read. The result overwrites c.
+//
+// When beta is zero the previous contents of c are overwritten rather than
+// scaled, so NaN or Inf values already in c do not affect the result.
+//
+// Ssymm panics if s or ul is invalid, if m or n is negative, if lda, ldb or
+// ldc is too small, or if a, b or c is too short for the given dimensions.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	if s != blas.Right && s != blas.Left {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of A: m when A multiplies from the left, n otherwise.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Both dimensions are positive here, so the last indexed element of each
+	// slice must exist.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		// Only the beta*C term remains.
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			ctmp := c[i*ldc : i*ldc+n]
+			for j := 0; j < n; j++ {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+
+	isUpper := ul == blas.Upper
+	if s == blas.Left {
+		for i := 0; i < m; i++ {
+			// Start row i of C with beta*C plus the diagonal term of A.
+			atmp := alpha * a[i*lda+i]
+			btmp := b[i*ldb : i*ldb+n]
+			ctmp := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				// Overwrite instead of scaling so that NaN or Inf in c
+				// cannot leak into the result.
+				for j, v := range btmp {
+					ctmp[j] = atmp * v
+				}
+			} else {
+				for j, v := range btmp {
+					ctmp[j] *= beta
+					ctmp[j] += atmp * v
+				}
+			}
+
+			// Off-diagonal terms: element (i,k) of A is read from the stored
+			// triangle, mirroring the indices when it lies in the other one.
+			for k := 0; k < i; k++ {
+				var atmp float32
+				if isUpper {
+					atmp = a[k*lda+i]
+				} else {
+					atmp = a[i*lda+k]
+				}
+				atmp *= alpha
+				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+			for k := i + 1; k < m; k++ {
+				var atmp float32
+				if isUpper {
+					atmp = a[i*lda+k]
+				} else {
+					atmp = a[k*lda+i]
+				}
+				atmp *= alpha
+				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+		}
+		return
+	}
+	if isUpper {
+		for i := 0; i < m; i++ {
+			// j runs downwards so that the entries of c to the right of j
+			// have already been scaled by beta before they are accumulated into.
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha * b[i*ldb+j]
+				var tmp2 float32
+				atmp := a[j*lda+j+1 : j*lda+n]
+				btmp := b[i*ldb+j+1 : i*ldb+n]
+				ctmp := c[i*ldc+j+1 : i*ldc+n]
+				for k, v := range atmp {
+					ctmp[k] += tmp * v
+					tmp2 += btmp[k] * v
+				}
+				if beta == 0 {
+					c[i*ldc+j] = 0
+				} else {
+					c[i*ldc+j] *= beta
+				}
+				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		// j runs upwards so that the entries of c to the left of j have
+		// already been scaled by beta before they are accumulated into.
+		for j := 0; j < n; j++ {
+			tmp := alpha * b[i*ldb+j]
+			var tmp2 float32
+			atmp := a[j*lda : j*lda+j]
+			btmp := b[i*ldb : i*ldb+j]
+			ctmp := c[i*ldc : i*ldc+j]
+			for k, v := range atmp {
+				ctmp[k] += tmp * v
+				tmp2 += btmp[k] * v
+			}
+			if beta == 0 {
+				c[i*ldc+j] = 0
+			} else {
+				c[i*ldc+j] *= beta
+			}
+			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+		}
+	}
+}
+
+// Ssyrk performs one of the symmetric rank-k operations
+//  C = alpha * A * A^T + beta * C  if tA == blas.NoTrans
+//  C = alpha * A^T * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+// where A is an n×k (tA == blas.NoTrans) or k×n (otherwise) matrix, C is an
+// n×n symmetric matrix, and alpha and beta are scalars.
+//
+// Matrices are indexed row-major: element (i,j) of A is a[i*lda+j] and
+// element (i,j) of C is c[i*ldc+j]. Only the triangle of C selected by ul is
+// read and written.
+//
+// When beta is zero the previous contents of that triangle are overwritten
+// rather than scaled, so NaN or Inf values already in c do not affect the
+// result.
+//
+// Ssyrk panics if ul or tA is invalid, if n or k is negative, if lda or ldc
+// is too small, or if a or c is too short for the given dimensions.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col are the stored dimensions of a.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// Length checks on a and c for the dimensions validated above.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 {
+		// Only the beta*C term remains; a is not read.
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		// Element (i,j) of A*A^T is the dot product of rows i and j of a.
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				atmp := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ctmp {
+						j := jc + i
+						ctmp[jc] = alpha * f32.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, vc := range ctmp {
+						j := jc + i
+						ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			atmp := a[i*lda : i*lda+k]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = alpha * f32.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			} else {
+				for j, vc := range ctmp {
+					ctmp[j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: each row l of a adds a rank-one term to
+	// the stored part of row i of c.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			} else if beta != 1 {
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp := alpha * a[l*lda+i]
+				if tmp != 0 {
+					f32.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		// Overwrite instead of scaling when beta is zero, matching the upper
+		// triangle branch, so that NaN or Inf in c cannot leak into the result.
+		if beta == 0 {
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		} else if beta != 1 {
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp := alpha * a[l*lda+i]
+			if tmp != 0 {
+				f32.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
+			}
+		}
+	}
+}
+
+// Ssyr2k performs one of the symmetric rank 2k operations
+//  C = alpha * A * B^T + alpha * B * A^T + beta * C  if tA == blas.NoTrans
+//  C = alpha * A^T * B + alpha * B^T * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+// where A and B are n×k (tA == blas.NoTrans) or k×n (otherwise) matrices, C is an n×n symmetric matrix, and
+// alpha and beta are scalars. Element (i,j) of A is a[i*lda+j] (likewise b with ldb and c with ldc); only the triangle of C selected by ul is read and written.
+// Ssyr2k panics if ul or tA is invalid, if n or k is negative, if lda, ldb or ldc is too small, or if a, b or c is too short.
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	row, col := k, n // stored dimensions of a and b: k×n when transposed,
+	if tA == blas.NoTrans {
+		row, col = n, k // n×k otherwise
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldb < max(1, col) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// Length checks on a, b and c for the dimensions validated above.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(b) < ldb*(row-1)+col {
+		panic(shortB)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 { // only the beta*C term remains; the elements of a and b are not read
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda : i*lda+k]
+				btmp := b[i*ldb : i*ldb+k]
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for jc := range ctmp {
+					j := i + jc // column of C; ctmp starts at the diagonal
+					var tmp1, tmp2 float32
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]         // dot of row j of a with row i of b
+						tmp2 += atmp[l] * binner[l] // dot of row i of a with row j of b
+					}
+					ctmp[jc] *= beta // NOTE(review): with beta == 0 this multiplies rather than overwrites, so NaN/Inf in c would propagate; confirm whether that is intended
+					ctmp[jc] += alpha * (tmp1 + tmp2)
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda : i*lda+k]
+			btmp := b[i*ldb : i*ldb+k]
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := 0; j <= i; j++ {
+				var tmp1, tmp2 float32
+				binner := b[j*ldb : j*ldb+k]
+				for l, v := range a[j*lda : j*lda+k] {
+					tmp1 += v * btmp[l]
+					tmp2 += atmp[l] * binner[l]
+				}
+				ctmp[j] *= beta // NOTE(review): same beta == 0 question as in the upper branch
+				ctmp[j] += alpha * (tmp1 + tmp2)
+			}
+		}
+		return
+	}
+	if ul == blas.Upper { // a and b are transposed: each row l of a and b adds a rank-two term to row i of c
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta != 1 { // NOTE(review): beta == 0 also takes this scaling path
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp1 := alpha * b[l*ldb+i]
+				tmp2 := alpha * a[l*lda+i]
+				btmp := b[l*ldb+i : l*ldb+n]
+				if tmp1 != 0 || tmp2 != 0 { // skip rows that contribute nothing
+					for j, v := range a[l*lda+i : l*lda+n] {
+						ctmp[j] += v*tmp1 + btmp[j]*tmp2
+					}
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		if beta != 1 { // NOTE(review): beta == 0 also takes this scaling path
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp1 := alpha * b[l*ldb+i]
+			tmp2 := alpha * a[l*lda+i]
+			btmp := b[l*ldb : l*ldb+i+1]
+			if tmp1 != 0 || tmp2 != 0 {
+				for j, v := range a[l*lda : l*lda+i+1] {
+					ctmp[j] += v*tmp1 + btmp[j]*tmp2
+				}
+			}
+		}
+	}
+}
+
+// Strmm performs one of the matrix-matrix operations
+//  B = alpha * A * B    if tA == blas.NoTrans and side == blas.Left
+//  B = alpha * A^T * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//  B = alpha * B * A    if tA == blas.NoTrans and side == blas.Right
+//  B = alpha * B * A^T  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) triangular matrix, B is an m×n matrix, and alpha is a scalar. The product overwrites b; with d == blas.Unit the diagonal of a is not read.
+// Strmm panics if s, ul, tA or d is invalid, if m or n is negative, if lda or ldb is too small, or if a or b is too short.
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // k is the order of A: n when A is applied from the right,
+	if s == blas.Left {
+		k = m // and m when A is applied from the left.
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Both dimensions are positive here, so the last element indexed in a and b must exist.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	if alpha == 0 { // the product is zero; clear b without reading a
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+
+	nonUnit := d == blas.NonUnit // when false the diagonal of a is never read
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				for i := 0; i < m; i++ { // ascending i: the rows below i that are read still hold the original B
+					tmp := alpha
+					if nonUnit {
+						tmp *= a[i*lda+i]
+					}
+					btmp := b[i*ldb : i*ldb+n]
+					f32.ScalUnitary(tmp, btmp)
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						k := ka + i + 1 // ka indexes the slice that starts at column i+1
+						if va != 0 {
+							f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+				}
+				return
+			}
+			for i := m - 1; i >= 0; i-- { // descending i: the rows above i that are read still hold the original B
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[i*lda+i]
+				}
+				btmp := b[i*ldb : i*ldb+n]
+				f32.ScalUnitary(tmp, btmp)
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: row k of b, still unscaled, is scattered into the rows it contributes to and scaled afterwards.
+		if ul == blas.Upper {
+			for k := m - 1; k >= 0; k-- {
+				btmpk := b[k*ldb : k*ldb+n]
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					i := ia + k + 1 // ia indexes the slice that starts at column k+1
+					btmp := b[i*ldb : i*ldb+n]
+					if va != 0 {
+						f32.AxpyUnitary(alpha*va, btmpk, btmp)
+					}
+				}
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[k*lda+k]
+				}
+				if tmp != 1 {
+					f32.ScalUnitary(tmp, btmpk)
+				}
+			}
+			return
+		}
+		for k := 0; k < m; k++ {
+			btmpk := b[k*ldb : k*ldb+n]
+			for i, va := range a[k*lda : k*lda+k] {
+				btmp := b[i*ldb : i*ldb+n]
+				if va != 0 {
+					f32.AxpyUnitary(alpha*va, btmpk, btmp)
+				}
+			}
+			tmp := alpha
+			if nonUnit {
+				tmp *= a[k*lda+k]
+			}
+			if tmp != 1 {
+				f32.ScalUnitary(tmp, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is on the right: every row of b is transformed independently of the others.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				for k := n - 1; k >= 0; k-- { // descending k: btmp[k] is still original when it is read
+					tmp := alpha * btmp[k]
+					if tmp == 0 { // a zero entry stays zero and adds nothing to later columns
+						continue
+					}
+					btmp[k] = tmp
+					if nonUnit {
+						btmp[k] *= a[k*lda+k]
+					}
+					f32.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for k := 0; k < n; k++ { // ascending k: btmp[k] is still original when it is read
+				tmp := alpha * btmp[k]
+				if tmp == 0 {
+					continue
+				}
+				btmp[k] = tmp
+				if nonUnit {
+					btmp[k] *= a[k*lda+k]
+				}
+				f32.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: entry j is the diagonal term plus a dot product with entries not yet overwritten.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j, vb := range btmp {
+				tmp := vb
+				if nonUnit {
+					tmp *= a[j*lda+j]
+				}
+				tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
+				btmp[j] = alpha * tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := n - 1; j >= 0; j-- {
+			tmp := btmp[j]
+			if nonUnit {
+				tmp *= a[j*lda+j]
+			}
+			tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			btmp[j] = alpha * tmp
+		}
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go
@@ -0,0 +1,864 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level3 = Implementation{} // compile-time check that Implementation satisfies blas.Float64Level3
+
+// Dtrsm solves one of the matrix equations
+//  A * X = alpha * B    if tA == blas.NoTrans and side == blas.Left
+//  A^T * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//  X * A = alpha * B    if tA == blas.NoTrans and side == blas.Right
+//  X * A^T = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) triangular matrix, X and B are m×n
+// matrices, and alpha is a scalar. Element (i,j) of A is a[i*lda+j]; ul selects the triangle that is read, and with d == blas.Unit the diagonal is not read.
+//
+// At entry to the function, b (element (i,j) at b[i*ldb+j]) contains the values of B, and the result X is
+// stored in-place into b. If alpha is zero, the m×n part of b is set to zero and the elements of a are not read.
+//
+// No check is made that A is invertible. Dtrsm panics if s, ul, tA or d is invalid, if m or n is negative, if lda or ldb is too small, or if a or b is too short.
+func (Implementation) Dtrsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	k := n // k is the order of A: n when A is applied from the right,
+	if s == blas.Left {
+		k = m // and m when A is applied from the left.
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible: an empty B needs no work and no slice length checks.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Both dimensions are positive here, so the last element indexed in a and b must exist.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	if alpha == 0 { // the right-hand side is zero, so X is zero; clear b without reading a
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+	nonUnit := d == blas.NonUnit // when false the diagonal of a is never read
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				for i := m - 1; i >= 0; i-- { // back substitution: row i uses the already solved rows below it
+					btmp := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						f64.ScalUnitary(alpha, btmp)
+					}
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						if va != 0 {
+							k := ka + i + 1 // ka indexes the slice that starts at column i+1
+							f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+					if nonUnit {
+						tmp := 1 / a[i*lda+i]
+						f64.ScalUnitary(tmp, btmp)
+					}
+				}
+				return
+			}
+			for i := 0; i < m; i++ { // forward substitution: row i uses the already solved rows above it
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmp)
+				}
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+				if nonUnit {
+					tmp := 1 / a[i*lda+i]
+					f64.ScalUnitary(tmp, btmp)
+				}
+			}
+			return
+		}
+		// Cases where a is transposed: row k of a is column k of A^T, so each solved row k is scattered into the rows that depend on it.
+		if ul == blas.Upper {
+			for k := 0; k < m; k++ {
+				btmpk := b[k*ldb : k*ldb+n]
+				if nonUnit {
+					tmp := 1 / a[k*lda+k]
+					f64.ScalUnitary(tmp, btmpk)
+				}
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					if va != 0 {
+						i := ia + k + 1 // ia indexes the slice that starts at column k+1
+						f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+					}
+				}
+				if alpha != 1 { // alpha is applied to row k last; the rows updated above are scaled on their own turn
+					f64.ScalUnitary(alpha, btmpk)
+				}
+			}
+			return
+		}
+		for k := m - 1; k >= 0; k-- {
+			btmpk := b[k*ldb : k*ldb+n]
+			if nonUnit {
+				tmp := 1 / a[k*lda+k]
+				f64.ScalUnitary(tmp, btmpk)
+			}
+			for i, va := range a[k*lda : k*lda+k] {
+				if va != 0 {
+					f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+				}
+			}
+			if alpha != 1 { // as above: scale row k only after it has been propagated
+				f64.ScalUnitary(alpha, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is to the right of X: every row of b is solved independently of the others.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmp)
+				}
+				for k, vb := range btmp { // vb is read at iteration time, so it includes updates from earlier columns
+					if vb == 0 { // a zero entry is its own solution and changes no later column
+						continue
+					}
+					if nonUnit {
+						btmp[k] /= a[k*lda+k]
+					}
+					f64.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			if alpha != 1 {
+				f64.ScalUnitary(alpha, btmp)
+			}
+			for k := n - 1; k >= 0; k-- { // lower triangle: solve from the last column backwards
+				if btmp[k] == 0 {
+					continue
+				}
+				if nonUnit {
+					btmp[k] /= a[k*lda+k]
+				}
+				f64.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: entry j is alpha*b[j] minus a dot product with the entries already solved.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
+				if nonUnit {
+					tmp /= a[j*lda+j]
+				}
+				btmp[j] = tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := 0; j < n; j++ {
+			tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			if nonUnit {
+				tmp /= a[j*lda+j]
+			}
+			btmp[j] = tmp
+		}
+	}
+}
+
+// Dsymm performs one of the matrix-matrix operations
+//  C = alpha * A * B + beta * C  if side == blas.Left
+//  C = alpha * B * A + beta * C  if side == blas.Right
+// where A is an m×m (side == blas.Left) or n×n (side == blas.Right) symmetric
+// matrix, B and C are m×n matrices, and alpha and beta are scalars.
+//
+// Matrices are indexed row-major: element (i,j) of A is a[i*lda+j], and
+// likewise for b with ldb and c with ldc. Only the triangle of A selected by
+// ul is read. The result overwrites c.
+//
+// When beta is zero the previous contents of c are overwritten rather than
+// scaled, so NaN or Inf values already in c do not affect the result.
+//
+// Dsymm panics if s or ul is invalid, if m or n is negative, if lda, ldb or
+// ldc is too small, or if a, b or c is too short for the given dimensions.
+func (Implementation) Dsymm(s blas.Side, ul blas.Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	if s != blas.Right && s != blas.Left {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of A: m when A multiplies from the left, n otherwise.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Both dimensions are positive here, so the last indexed element of each
+	// slice must exist.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	if alpha == 0 {
+		// Only the beta*C term remains.
+		if beta == 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			ctmp := c[i*ldc : i*ldc+n]
+			for j := 0; j < n; j++ {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+
+	isUpper := ul == blas.Upper
+	if s == blas.Left {
+		for i := 0; i < m; i++ {
+			// Start row i of C with beta*C plus the diagonal term of A.
+			atmp := alpha * a[i*lda+i]
+			btmp := b[i*ldb : i*ldb+n]
+			ctmp := c[i*ldc : i*ldc+n]
+			if beta == 0 {
+				// Overwrite instead of scaling so that NaN or Inf in c
+				// cannot leak into the result.
+				for j, v := range btmp {
+					ctmp[j] = atmp * v
+				}
+			} else {
+				for j, v := range btmp {
+					ctmp[j] *= beta
+					ctmp[j] += atmp * v
+				}
+			}
+
+			// Off-diagonal terms: element (i,k) of A is read from the stored
+			// triangle, mirroring the indices when it lies in the other one.
+			for k := 0; k < i; k++ {
+				var atmp float64
+				if isUpper {
+					atmp = a[k*lda+i]
+				} else {
+					atmp = a[i*lda+k]
+				}
+				atmp *= alpha
+				f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+			for k := i + 1; k < m; k++ {
+				var atmp float64
+				if isUpper {
+					atmp = a[i*lda+k]
+				} else {
+					atmp = a[k*lda+i]
+				}
+				atmp *= alpha
+				f64.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+		}
+		return
+	}
+	if isUpper {
+		for i := 0; i < m; i++ {
+			// j runs downwards so that the entries of c to the right of j
+			// have already been scaled by beta before they are accumulated into.
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha * b[i*ldb+j]
+				var tmp2 float64
+				atmp := a[j*lda+j+1 : j*lda+n]
+				btmp := b[i*ldb+j+1 : i*ldb+n]
+				ctmp := c[i*ldc+j+1 : i*ldc+n]
+				for k, v := range atmp {
+					ctmp[k] += tmp * v
+					tmp2 += btmp[k] * v
+				}
+				if beta == 0 {
+					c[i*ldc+j] = 0
+				} else {
+					c[i*ldc+j] *= beta
+				}
+				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		// j runs upwards so that the entries of c to the left of j have
+		// already been scaled by beta before they are accumulated into.
+		for j := 0; j < n; j++ {
+			tmp := alpha * b[i*ldb+j]
+			var tmp2 float64
+			atmp := a[j*lda : j*lda+j]
+			btmp := b[i*ldb : i*ldb+j]
+			ctmp := c[i*ldc : i*ldc+j]
+			for k, v := range atmp {
+				ctmp[k] += tmp * v
+				tmp2 += btmp[k] * v
+			}
+			if beta == 0 {
+				c[i*ldc+j] = 0
+			} else {
+				c[i*ldc+j] *= beta
+			}
+			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+		}
+	}
+}
+
+// Dsyrk performs one of the symmetric rank-k operations
+//  C = alpha * A * A^T + beta * C  if tA == blas.NoTrans
+//  C = alpha * A^T * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+// where A is an n×k (tA == blas.NoTrans) or k×n (otherwise) matrix, C is an
+// n×n symmetric matrix, and alpha and beta are scalars.
+//
+// Matrices are indexed row-major: element (i,j) of A is a[i*lda+j] and
+// element (i,j) of C is c[i*ldc+j]. Only the triangle of C selected by ul is
+// read and written.
+//
+// When beta is zero the previous contents of that triangle are overwritten
+// rather than scaled, so NaN or Inf values already in c do not affect the
+// result.
+//
+// Dsyrk panics if ul or tA is invalid, if n or k is negative, if lda or ldc
+// is too small, or if a or c is too short for the given dimensions.
+func (Implementation) Dsyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col are the stored dimensions of a.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// Length checks on a and c for the dimensions validated above.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	if alpha == 0 {
+		// Only the beta*C term remains; a is not read.
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		// Element (i,j) of A*A^T is the dot product of rows i and j of a.
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				atmp := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ctmp {
+						j := jc + i
+						ctmp[jc] = alpha * f64.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, vc := range ctmp {
+						j := jc + i
+						ctmp[jc] = vc*beta + alpha*f64.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			atmp := a[i*lda : i*lda+k]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = alpha * f64.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			} else {
+				for j, vc := range ctmp {
+					ctmp[j] = vc*beta + alpha*f64.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: each row l of a adds a rank-one term to
+	// the stored part of row i of c.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			} else if beta != 1 {
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp := alpha * a[l*lda+i]
+				if tmp != 0 {
+					f64.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		// Overwrite instead of scaling when beta is zero, matching the upper
+		// triangle branch, so that NaN or Inf in c cannot leak into the result.
+		if beta == 0 {
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		} else if beta != 1 {
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp := alpha * a[l*lda+i]
+			if tmp != 0 {
+				f64.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
+			}
+		}
+	}
+}
+
+// Dsyr2k performs one of the symmetric rank 2k operations
+//  C = alpha * A * B^T + alpha * B * A^T + beta * C  if tA == blas.NoTrans
+//  C = alpha * A^T * B + alpha * B^T * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+// where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and
+// alpha and beta are scalars.
+//
+// Only the triangle of C selected by ul is read and written. If beta is zero
+// that triangle is overwritten without being read, so NaN or Inf values
+// already stored in C do not leak into the result.
+//
+// Dsyr2k panics if ul or tA is not a recognized value, if n or k is negative,
+// if lda, ldb or ldc is too small, or if a, b or c is too short.
+func (Implementation) Dsyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col describe a and b as they are stored: n×k when they are
+	// not transposed, k×n otherwise.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldb < max(1, col) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(b) < ldb*(row-1)+col {
+		panic(shortB)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	// span reports the half-open column range [lo, hi) of row i that lies in
+	// the triangle of C selected by ul.
+	span := func(i int) (lo, hi int) {
+		if ul == blas.Upper {
+			return i, n
+		}
+		return 0, i + 1
+	}
+	// scale applies beta to ctmp. A zero beta stores zeros instead of
+	// multiplying so that NaN and Inf values in ctmp are discarded.
+	scale := func(ctmp []float64) {
+		switch beta {
+		case 0:
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		case 1:
+			// Nothing to do.
+		default:
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+	}
+
+	if alpha == 0 {
+		// The result is just beta * C; a and b are not read.
+		for i := 0; i < n; i++ {
+			lo, hi := span(i)
+			scale(c[i*ldc+lo : i*ldc+hi])
+		}
+		return
+	}
+
+	if tA == blas.NoTrans {
+		// Each element is formed from two row dot products:
+		// C[i][j] = beta*C[i][j] + alpha*(A[j]·B[i] + A[i]·B[j]).
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda : i*lda+k]
+			btmp := b[i*ldb : i*ldb+k]
+			lo, hi := span(i)
+			ctmp := c[i*ldc+lo : i*ldc+hi]
+			for jc := range ctmp {
+				j := lo + jc
+				var tmp1, tmp2 float64
+				binner := b[j*ldb : j*ldb+k]
+				for l, v := range a[j*lda : j*lda+k] {
+					tmp1 += v * btmp[l]
+					tmp2 += atmp[l] * binner[l]
+				}
+				if beta == 0 {
+					// Do not read the old value of C.
+					ctmp[jc] = alpha * (tmp1 + tmp2)
+					continue
+				}
+				ctmp[jc] *= beta
+				ctmp[jc] += alpha * (tmp1 + tmp2)
+			}
+		}
+		return
+	}
+
+	// Cases where a and b are transposed. Row i of C is first scaled by beta
+	// and then updated once per stored row l of a and b.
+	for i := 0; i < n; i++ {
+		lo, hi := span(i)
+		ctmp := c[i*ldc+lo : i*ldc+hi]
+		scale(ctmp)
+		for l := 0; l < k; l++ {
+			tmp1 := alpha * b[l*ldb+i]
+			tmp2 := alpha * a[l*lda+i]
+			if tmp1 == 0 && tmp2 == 0 {
+				continue
+			}
+			btmp := b[l*ldb+lo : l*ldb+hi]
+			for j, v := range a[l*lda+lo : l*lda+hi] {
+				ctmp[j] += v*tmp1 + btmp[j]*tmp2
+			}
+		}
+	}
+}
+
+// Dtrmm performs one of the matrix-matrix operations
+//  B = alpha * A * B    if tA == blas.NoTrans and side == blas.Left
+//  B = alpha * A^T * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//  B = alpha * B * A    if tA == blas.NoTrans and side == blas.Right
+//  B = alpha * B * A^T  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
+//
+// B is overwritten with the result. Only the triangle of A selected by ul is
+// read, and if d == blas.Unit the diagonal elements of A are not read and are
+// treated as one.
+//
+// Dtrmm panics if s, ul, tA or d is not a recognized value, if m or n is
+// negative, if lda or ldb is too small, or if a or b is too short.
+func (Implementation) Dtrmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of the triangular matrix A: m when A multiplies from
+	// the left, n when it multiplies from the right.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	// With a zero alpha the result is the zero matrix; A and the old
+	// contents of B are not read.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+
+	// All remaining cases update B in place. The direction of each loop is
+	// chosen so that every element of B is read before it is overwritten.
+	nonUnit := d == blas.NonUnit
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				// Row i of the result combines rows k >= i of B. Walking
+				// i upwards leaves rows k > i unmodified until used.
+				for i := 0; i < m; i++ {
+					tmp := alpha
+					if nonUnit {
+						tmp *= a[i*lda+i]
+					}
+					btmp := b[i*ldb : i*ldb+n]
+					f64.ScalUnitary(tmp, btmp)
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						// ka indexes the strictly-upper part of row i;
+						// k is the matching row of B.
+						k := ka + i + 1
+						if va != 0 {
+							f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+				}
+				return
+			}
+			// Lower triangular: row i combines rows k <= i of B, so i
+			// walks downwards to keep rows k < i unmodified until used.
+			for i := m - 1; i >= 0; i-- {
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[i*lda+i]
+				}
+				btmp := b[i*ldb : i*ldb+n]
+				f64.ScalUnitary(tmp, btmp)
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+			}
+			return
+		}
+		// Cases where a is transposed.
+		if ul == blas.Upper {
+			// Row k of B is scattered into rows i > k using row k of A,
+			// and only then scaled by its own diagonal term. k walks
+			// downwards so row k still holds its original values when
+			// it is scattered.
+			for k := m - 1; k >= 0; k-- {
+				btmpk := b[k*ldb : k*ldb+n]
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					i := ia + k + 1
+					btmp := b[i*ldb : i*ldb+n]
+					if va != 0 {
+						f64.AxpyUnitary(alpha*va, btmpk, btmp)
+					}
+				}
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[k*lda+k]
+				}
+				if tmp != 1 {
+					f64.ScalUnitary(tmp, btmpk)
+				}
+			}
+			return
+		}
+		// Lower triangular, transposed: row k is scattered into rows
+		// i < k, so k walks upwards.
+		for k := 0; k < m; k++ {
+			btmpk := b[k*ldb : k*ldb+n]
+			for i, va := range a[k*lda : k*lda+k] {
+				btmp := b[i*ldb : i*ldb+n]
+				if va != 0 {
+					f64.AxpyUnitary(alpha*va, btmpk, btmp)
+				}
+			}
+			tmp := alpha
+			if nonUnit {
+				tmp *= a[k*lda+k]
+			}
+			if tmp != 1 {
+				f64.ScalUnitary(tmp, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is on the right
+	// Each row of B is transformed independently of the other rows.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				// Entry k contributes to entries j >= k. k walks
+				// downwards so btmp[k] is still original when read.
+				for k := n - 1; k >= 0; k-- {
+					tmp := alpha * btmp[k]
+					if tmp == 0 {
+						continue
+					}
+					btmp[k] = tmp
+					if nonUnit {
+						btmp[k] *= a[k*lda+k]
+					}
+					f64.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			// Lower triangular: entry k contributes to entries j <= k,
+			// so k walks upwards.
+			for k := 0; k < n; k++ {
+				tmp := alpha * btmp[k]
+				if tmp == 0 {
+					continue
+				}
+				btmp[k] = tmp
+				if nonUnit {
+					btmp[k] *= a[k*lda+k]
+				}
+				f64.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			// Entry j is a dot product of row j of A with entries
+			// j..n-1 of the row. j walks upwards so the later entries
+			// are still original when read.
+			for j, vb := range btmp {
+				tmp := vb
+				if nonUnit {
+					tmp *= a[j*lda+j]
+				}
+				tmp += f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
+				btmp[j] = alpha * tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		// Lower triangular, transposed: entry j uses entries 0..j of the
+		// row, so j walks downwards.
+		for j := n - 1; j >= 0; j-- {
+			tmp := btmp[j]
+			if nonUnit {
+				tmp *= a[j*lda+j]
+			}
+			tmp += f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			btmp[j] = alpha * tmp
+		}
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go
@@ -0,0 +1,318 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"runtime"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sgemm performs one of the matrix-matrix operations
+//  C = alpha * A * B + beta * C
+//  C = alpha * A^T * B + beta * C
+//  C = alpha * A * B^T + beta * C
+//  C = alpha * A^T * B^T + beta * C
+// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is
+// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or
+// B are transposed.
+//
+// Sgemm panics if tA or tB is not a recognized value, if m, n or k is
+// negative, if a leading dimension is too small, or if a, b or c is too short.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sgemm(tA, tB blas.Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if tB != blas.NoTrans && tB != blas.Trans && tB != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+
+	// rowsA×colsA is the shape of a as stored: m×k, or k×m when transposed.
+	aTrans := tA != blas.NoTrans
+	rowsA, colsA := m, k
+	if aTrans {
+		rowsA, colsA = k, m
+	}
+	if lda < max(1, colsA) {
+		panic(badLdA)
+	}
+	// rowsB×colsB is the shape of b as stored: k×n, or n×k when transposed.
+	bTrans := tB != blas.NoTrans
+	rowsB, colsB := k, n
+	if bTrans {
+		rowsB, colsB = n, k
+	}
+	if ldb < max(1, colsB) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// An empty C leaves nothing to compute.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The length checks are only meaningful for a non-empty C, so they come
+	// after the quick return above.
+	if len(a) < (rowsA-1)*lda+colsA {
+		panic(shortA)
+	}
+	if len(b) < (rowsB-1)*ldb+colsB {
+		panic(shortB)
+	}
+	if len(c) < (m-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// C is already the answer when the product term vanishes and beta is one.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// Apply beta to C first; the product is accumulated on top afterwards.
+	switch beta {
+	case 1:
+		// C is used as is.
+	case 0:
+		for i := 0; i < m; i++ {
+			crow := c[i*ldc : i*ldc+n]
+			for j := range crow {
+				crow[j] = 0
+			}
+		}
+	default:
+		for i := 0; i < m; i++ {
+			crow := c[i*ldc : i*ldc+n]
+			for j := range crow {
+				crow[j] *= beta
+			}
+		}
+	}
+
+	sgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+// sgemmParallel accumulates alpha * op(a) * op(b) into c, splitting c into
+// blocks that are processed by a pool of worker goroutines. Small problems
+// are handed directly to sgemmSerial.
+func sgemmParallel(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// a, b and c are treated as grids of sub-blocks,
+	// A = [ 	A_11	A_12 ... 	A_1j
+	//			A_21	A_22 ...	A_2j
+	//				...
+	//			A_i1	A_i2 ...	A_ij]
+	//
+	// and likewise for B and C. Every sub-block is blockSize×blockSize
+	// except possibly those on the last block row or block column.
+	//
+	// Each block of C is a sum over the shared dimension k:
+	// Cij = \sum_k Aik Bkj,	(A * B)
+	// Cij = \sum_k Aki Bkj,	(A^T * B)
+	// Cij = \sum_k Aik Bjk,	(A * B^T)
+	// Cij = \sum_k Aki Bjk,	(A^T * B^T)
+	//
+	// One worker owns a given {i, j} block of C and walks the k dimension
+	// sequentially, while different {i, j} blocks are handled concurrently.
+	// Because no two workers write the same block, C can be updated in place
+	// without races. A fixed number of workers pull block coordinates from a
+	// channel instead of one goroutine being started per block.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// Too few independent blocks of C to be worth the goroutine
+		// overhead; compute the whole product serially.
+		sgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// Never start more workers than there are blocks to hand out.
+	nWorkers := runtime.GOMAXPROCS(0)
+	if nWorkers > parBlocks {
+		nWorkers = parBlocks
+	}
+	// The channel buffer trades workers waiting for work against the cost
+	// of a large buffer.
+	buf := buffMul * nWorkers
+	if buf > parBlocks {
+		buf = parBlocks
+	}
+	sendChan := make(chan subMul, buf)
+
+	// multiplyBlock updates the block of c whose top-left element is at
+	// (i, j), stepping through the shared dimension one block at a time.
+	multiplyBlock := func(i, j int) {
+		leni := blockSize
+		if rem := m - i; rem < leni {
+			leni = rem
+		}
+		lenj := blockSize
+		if rem := n - j; rem < lenj {
+			lenj = rem
+		}
+		cSub := sliceView32(c, ldc, i, j, leni, lenj)
+
+		for l := 0; l < k; l += blockSize {
+			lenk := blockSize
+			if rem := k - l; rem < lenk {
+				lenk = rem
+			}
+			var aSub, bSub []float32
+			if aTrans {
+				aSub = sliceView32(a, lda, l, i, lenk, leni)
+			} else {
+				aSub = sliceView32(a, lda, i, l, leni, lenk)
+			}
+			if bTrans {
+				bSub = sliceView32(b, ldb, j, l, lenj, lenk)
+			} else {
+				bSub = sliceView32(b, ldb, l, j, lenk, lenj)
+			}
+			sgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+		}
+	}
+
+	// Start the workers. Each one keeps receiving block coordinates until
+	// the channel is closed and drained, then reports completion.
+	var wg sync.WaitGroup
+	wg.Add(nWorkers)
+	for w := 0; w < nWorkers; w++ {
+		go func() {
+			defer wg.Done()
+			for sub := range sendChan {
+				multiplyBlock(sub.i, sub.j)
+			}
+		}()
+	}
+
+	// Hand out the top-left coordinate of every block of c, then wait for
+	// the workers to finish.
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			sendChan <- subMul{
+				i: i,
+				j: j,
+			}
+		}
+	}
+	close(sendChan)
+	wg.Wait()
+}
+
+// sgemmSerial is the single-goroutine matrix multiply. It forwards to the
+// kernel that matches the transpose flags of a and b.
+func sgemmSerial(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	if aTrans {
+		if bTrans {
+			sgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+			return
+		}
+		sgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	if bTrans {
+		sgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	sgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+// sgemmSerialNotNot is the serial kernel for the case where neither a nor b
+// is transposed.
+func sgemmSerialNotNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Whole rows are sliced out instead of indexing single elements as
+	// [i*stride+j]; this was measured to be roughly 5 times faster as of
+	// go 1.3.
+	for i := 0; i < m; i++ {
+		crow := c[i*ldc : i*ldc+n]
+		arow := a[i*lda : i*lda+k]
+		for l, av := range arow {
+			scaled := alpha * av
+			if scaled == 0 {
+				continue
+			}
+			f32.AxpyUnitary(scaled, b[l*ldb:l*ldb+n], crow)
+		}
+	}
+}
+
+// sgemmSerialTransNot is the serial kernel for the case where a is transposed
+// and b is not.
+func sgemmSerialTransNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Whole rows are sliced out instead of indexing single elements as
+	// [i*stride+j]; this was measured to be roughly 5 times faster as of
+	// go 1.3.
+	for l := 0; l < k; l++ {
+		brow := b[l*ldb : l*ldb+n]
+		arow := a[l*lda : l*lda+m]
+		for i, av := range arow {
+			scaled := alpha * av
+			if scaled == 0 {
+				continue
+			}
+			crow := c[i*ldc : i*ldc+n]
+			f32.AxpyUnitary(scaled, brow, crow)
+		}
+	}
+}
+
+// sgemmSerialNotTrans is the serial kernel for the case where a is not
+// transposed and b is.
+func sgemmSerialNotTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Whole rows are sliced out instead of indexing single elements as
+	// [i*stride+j]; this was measured to be roughly 5 times faster as of
+	// go 1.3.
+	for i := 0; i < m; i++ {
+		arow := a[i*lda : i*lda+k]
+		crow := c[i*ldc : i*ldc+n]
+		for j := range crow {
+			brow := b[j*ldb : j*ldb+k]
+			crow[j] += alpha * f32.DotUnitary(arow, brow)
+		}
+	}
+}
+
+// sgemmSerialTransTrans is the serial kernel for the case where both a and b
+// are transposed.
+func sgemmSerialTransTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Whole rows are sliced out instead of indexing single elements as
+	// [i*stride+j]; this was measured to be roughly 5 times faster as of
+	// go 1.3.
+	for l := 0; l < k; l++ {
+		arow := a[l*lda : l*lda+m]
+		for i, av := range arow {
+			scaled := alpha * av
+			if scaled == 0 {
+				continue
+			}
+			crow := c[i*ldc : i*ldc+n]
+			// b[l:] is read with stride ldb while crow is read and
+			// written with stride 1.
+			f32.AxpyInc(scaled, b[l:], crow, uintptr(n), uintptr(ldb), 1, 0, 0)
+		}
+	}
+}
+
+// sliceView32 returns the sub-slice of a that starts at element (i, j) of a
+// row-major matrix with stride lda and ends just past the last element of the
+// r×c block that begins there. The result shares storage with a.
+func sliceView32(a []float32, lda, i, j, r, c int) []float32 {
+	first := i*lda + j
+	end := (i+r-1)*lda + j + c
+	return a[first:end]
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+
+# Copyright ©2015 The Gonum Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+WARNINGF32='//\
+// Float32 implementations are autogenerated and not directly tested.\
+'
+WARNINGC64='//\
+// Complex64 implementations are autogenerated and not directly tested.\
+'
+
+# Level1 routines.
+
+echo Generating level1float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32.go
+cat level1float64.go \
+| gofmt -r 'blas.Float64Level1 -> blas.Float32Level1' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'blas.DrotmParams -> blas.SrotmParams' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e "s_^\(func (Implementation) \)Id\(.*\)\$_$WARNINGF32\1Is\2_" \
+      -e 's_^// Id_// Is_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+>> level1float32.go
+
+echo Generating level1cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1cmplx64.go
+cat level1cmplx128.go \
+| gofmt -r 'blas.Complex128Level1 -> blas.Complex64Level1' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'complex128 -> complex64' \
+\
+| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotcInc -> c64.DotcInc' \
+| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
+| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+| gofmt -r 'dcabs1 -> scabs1' \
+\
+| sed -e "s_^\(func (Implementation) \)Zdot\(.*\)\$_$WARNINGC64\1Cdot\2_" \
+      -e 's_^// Zdot_// Cdot_' \
+      -e "s_^\(func (Implementation) \)Zdscal\(.*\)\$_$WARNINGC64\1Csscal\2_" \
+      -e 's_^// Zdscal_// Csscal_' \
+      -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e "s_^\(func (Implementation) \)Iz\(.*\)\$_$WARNINGC64\1Ic\2_" \
+      -e 's_^// Iz_// Ic_' \
+      -e "s_^\(func (Implementation) \)Dz\(.*\)\$_$WARNINGC64\1Sc\2_" \
+      -e 's_^// Dz_// Sc_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+>> level1cmplx64.go
+
+echo Generating level1float32_sdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdot.go
+cat level1float64_ddot.go \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.DotInc -> f32.DotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_sdot.go
+
+echo Generating level1float32_dsdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_dsdot.go
+cat level1float64_ddot.go \
+| gofmt -r '[]float64 -> []float32' \
+\
+| gofmt -r 'f64.DotInc -> f32.DdotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DdotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Ds\2_" \
+      -e 's_^// D_// Ds_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_dsdot.go
+
+echo Generating level1float32_sdsdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdsdot.go
+cat level1float64_ddot.go \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.DotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)) -> alpha + float32(f32.DdotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)))' \
+| gofmt -r 'f64.DotUnitary(a, b) -> alpha + float32(f32.DdotUnitary(a, b))' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Sds\2_" \
+      -e 's_^// D\(.*\)$_// Sds\1 plus a constant_' \
+      -e 's_\\sum_alpha + \\sum_' \
+      -e 's/n int/n int, alpha float32/' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_sdsdot.go
+
+
+# Level2 routines.
+
+echo Generating level2float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2float32.go
+cat level2float64.go \
+| gofmt -r 'blas.Float64Level2 -> blas.Float32Level2' \
+\
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyIncTo -> f32.AxpyIncTo' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
+| gofmt -r 'f64.DotInc -> f32.DotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+| gofmt -r 'f64.Ger -> f32.Ger' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level2float32.go
+
+echo Generating level2cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2cmplx64.go
+cat level2cmplx128.go \
+| gofmt -r 'blas.Complex128Level2 -> blas.Complex64Level2' \
+\
+| gofmt -r 'complex128 -> complex64' \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
+>> level2cmplx64.go
+
+# Level3 routines.
+
+echo Generating level3float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3float32.go
+cat level3float64.go \
+| gofmt -r 'blas.Float64Level3 -> blas.Float32Level3' \
+\
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level3float32.go
+
+echo Generating sgemm.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > sgemm.go
+cat dgemm.go \
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'sliceView64 -> sliceView32' \
+\
+| gofmt -r 'dgemmParallel -> sgemmParallel' \
+| gofmt -r 'computeNumBlocks64 -> computeNumBlocks32' \
+| gofmt -r 'dgemmSerial -> sgemmSerial' \
+| gofmt -r 'dgemmSerialNotNot -> sgemmSerialNotNot' \
+| gofmt -r 'dgemmSerialTransNot -> sgemmSerialTransNot' \
+| gofmt -r 'dgemmSerialNotTrans -> sgemmSerialNotTrans' \
+| gofmt -r 'dgemmSerialTransTrans -> sgemmSerialTransTrans' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_^// d_// s_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> sgemm.go
+
+echo Generating level3cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3cmplx64.go
+cat level3cmplx128.go \
+| gofmt -r 'blas.Complex128Level3 -> blas.Complex64Level3' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'complex128 -> complex64' \
+\
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+| gofmt -r 'c128.DscalUnitary -> c64.SscalUnitary' \
+| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
+>> level3cmplx64.go