1. update clientset, deepcopy using code-generator

2. add a dummy file tools.go to force "go mod vendor" to treat
code-generator as a dependency
3. add a script to update CRD
4. add a README to document CRD updating steps
5. run go mod tidy
6. update README
This commit is contained in:
xiangqian
2019-12-03 01:22:21 -08:00
parent 90533183e4
commit 728e29aa7e
1128 changed files with 167705 additions and 5135 deletions

View File

@@ -0,0 +1,73 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyInc computes y[iy+i*incY] += alpha * x[ix+i*incX] for i in [0, n).
// The main loop is unrolled 4x; a scalar loop handles the n%4 tail.
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ n+56(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JLE axpyi_end
MOVQ x_base+8(FP), SI // SI = &x
MOVQ y_base+32(FP), DI // DI = &y
MOVQ ix+80(FP), R8 // R8 = ix
MOVQ iy+88(FP), R9 // R9 = iy
LEAQ (SI)(R8*4), SI // SI = &(x[ix])
LEAQ (DI)(R9*4), DI // DI = &(y[iy])
MOVQ DI, DX // DX = DI; DX is the read pointer for y, DI the write pointer
MOVQ incX+64(FP), R8 // R8 = incX
SHLQ $2, R8 // R8 *= sizeof(float32)
MOVQ incY+72(FP), R9 // R9 = incY
SHLQ $2, R9 // R9 *= sizeof(float32)
MOVSS alpha+0(FP), X0 // X0 = alpha
MOVSS X0, X1 // X1 = X0 // copy of alpha, for pipelining
MOVQ CX, BX
ANDQ $3, BX // BX = n % 4
SHRQ $2, CX // CX = floor( n / 4 )
JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start }
axpyi_loop: // Loop unrolled 4x do {
MOVSS (SI), X2 // X_i = x[i]
MOVSS (SI)(R8*1), X3
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVSS (SI), X4
MOVSS (SI)(R8*1), X5
MULSS X1, X2 // X_i *= a
MULSS X0, X3
MULSS X1, X4
MULSS X0, X5
ADDSS (DX), X2 // X_i += y[i]
ADDSS (DX)(R9*1), X3
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDSS (DX), X4
ADDSS (DX)(R9*1), X5
MOVSS X2, (DI) // y[i] = X_i
MOVSS X3, (DI)(R9*1)
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
MOVSS X4, (DI)
MOVSS X5, (DI)(R9*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
LOOP axpyi_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpyi_end
axpyi_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
axpyi_tail: // do {
MOVSS (SI), X2 // X2 = x[i]
MULSS X1, X2 // X2 *= a
ADDSS (DI), X2 // X2 += y[i]
MOVSS X2, (DI) // y[i] = X2
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DI // DI = &(DI[incY])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,78 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo computes dst[idst+i*incDst] = alpha*x[ix+i*incX] + y[iy+i*incY]
// for i in [0, n). The main loop is unrolled 4x; a scalar tail handles n%4.
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ n+96(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JLE axpyi_end
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+48(FP), SI // SI = &x
MOVQ y_base+72(FP), DX // DX = &y
MOVQ ix+120(FP), R8 // R8 = ix // Load the first index
MOVQ iy+128(FP), R9 // R9 = iy
MOVQ idst+32(FP), R10 // R10 = idst
LEAQ (SI)(R8*4), SI // SI = &(x[ix])
LEAQ (DX)(R9*4), DX // DX = &(y[iy])
LEAQ (DI)(R10*4), DI // DI = &(dst[idst])
MOVQ incX+104(FP), R8 // R8 = incX
SHLQ $2, R8 // R8 *= sizeof(float32)
MOVQ incY+112(FP), R9 // R9 = incY
SHLQ $2, R9 // R9 *= sizeof(float32)
MOVQ incDst+24(FP), R10 // R10 = incDst
SHLQ $2, R10 // R10 *= sizeof(float32)
MOVSS alpha+40(FP), X0 // X0 = alpha
MOVSS X0, X1 // X1 = X0 // copy of alpha, for pipelining
MOVQ CX, BX
ANDQ $3, BX // BX = n % 4
SHRQ $2, CX // CX = floor( n / 4 )
JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start }
axpyi_loop: // Loop unrolled 4x do {
MOVSS (SI), X2 // X_i = x[i]
MOVSS (SI)(R8*1), X3
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVSS (SI), X4
MOVSS (SI)(R8*1), X5
MULSS X1, X2 // X_i *= a
MULSS X0, X3
MULSS X1, X4
MULSS X0, X5
ADDSS (DX), X2 // X_i += y[i]
ADDSS (DX)(R9*1), X3
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDSS (DX), X4
ADDSS (DX)(R9*1), X5
MOVSS X2, (DI) // dst[i] = X_i
MOVSS X3, (DI)(R10*1)
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
MOVSS X4, (DI)
MOVSS X5, (DI)(R10*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
LOOP axpyi_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpyi_end
axpyi_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
axpyi_tail: // do {
MOVSS (SI), X2 // X2 = x[i]
MULSS X1, X2 // X2 *= a
ADDSS (DX), X2 // X2 += y[i]
MOVSS X2, (DI) // dst[i] = X2
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DX // DX = &(DX[incY])
ADDQ R10, DI // DI = &(DI[incDst])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,97 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// func AxpyUnitary(alpha float32, x, y []float32)
//
// AxpyUnitary computes y[i] = alpha*x[i] + y[i] over the first
// min(len(x), len(y)) elements. y is first advanced to a 16-byte boundary
// with a scalar loop, then the main loop processes 16 elements per
// iteration with packed SSE; 4-wide and 1-wide tail loops finish up.
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI // SI = &x
MOVQ y_base+32(FP), DI // DI = &y
MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) )
CMPQ y_len+40(FP), BX
CMOVQLE y_len+40(FP), BX
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
MOVSS alpha+0(FP), X0
SHUFPS $0, X0, X0 // X0 = { a, a, a, a }
XORQ AX, AX // i = 0
PXOR X2, X2 // 2 NOP instructions (PXOR) to align
PXOR X3, X3 // loop to cache line
MOVQ DI, CX
ANDQ $0xF, CX // CX = DI % 16; align on 16-byte boundary for ADDPS
JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
XORQ $0xF, CX // CX = (16 - DI%16) / 4 after the
INCQ CX // XORQ/INCQ/SHRQ sequence: number of
SHRQ $2, CX // elements until y is 16-byte aligned
axpy_align: // Trim first value(s) in unaligned buffer do {
MOVSS (SI)(AX*4), X2 // X2 = x[i]
MULSS X0, X2 // X2 *= a
ADDSS (DI)(AX*4), X2 // X2 += y[i]
MOVSS X2, (DI)(AX*4) // y[i] = X2
INCQ AX // i++
DECQ BX
JZ axpy_end // if --BX == 0 { return }
LOOP axpy_align // } while --CX > 0
axpy_no_trim:
MOVUPS X0, X1 // Copy X0 to X1 for pipelining
MOVQ BX, CX
ANDQ $0xF, BX // BX = len % 16
SHRQ $4, CX // CX = floor( len / 16 )
JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
axpy_loop: // Loop unrolled 16x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MOVUPS 16(SI)(AX*4), X3
MOVUPS 32(SI)(AX*4), X4
MOVUPS 48(SI)(AX*4), X5
MULPS X0, X2 // X2 *= a
MULPS X1, X3
MULPS X0, X4
MULPS X1, X5
ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
ADDPS 16(DI)(AX*4), X3
ADDPS 32(DI)(AX*4), X4
ADDPS 48(DI)(AX*4), X5
MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
MOVUPS X3, 16(DI)(AX*4)
MOVUPS X4, 32(DI)(AX*4)
MOVUPS X5, 48(DI)(AX*4)
ADDQ $16, AX // i += 16
LOOP axpy_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
axpy_tail4_start: // Reset loop counter for 4-wide tail loop
MOVQ BX, CX // CX = floor( BX / 4 )
SHRQ $2, CX
JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
axpy_tail4: // Loop unrolled 4x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MULPS X0, X2 // X2 *= a
ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
ADDQ $4, AX // i += 4
LOOP axpy_tail4 // } while --CX > 0
axpy_tail_start: // Reset loop counter for 1-wide tail loop
MOVQ BX, CX // CX = BX % 4
ANDQ $3, CX
JZ axpy_end // if CX == 0 { return }
axpy_tail:
MOVSS (SI)(AX*4), X1 // X1 = x[i]
MULSS X0, X1 // X1 *= a
ADDSS (DI)(AX*4), X1 // X1 += y[i]
MOVSS X1, (DI)(AX*4) // y[i] = X1
INCQ AX // i++
LOOP axpy_tail // } while --CX > 0
axpy_end:
RET

View File

@@ -0,0 +1,98 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
//
// AxpyUnitaryTo computes dst[i] = alpha*x[i] + y[i] over the first
// min(len(x), len(y), len(dst)) elements. y is first advanced to a 16-byte
// boundary with a scalar loop, then the main loop processes 16 elements
// per iteration; 4-wide and 1-wide tail loops finish up.
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+32(FP), SI // SI = &x
MOVQ y_base+56(FP), DX // DX = &y
MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) )
CMPQ y_len+64(FP), BX
CMOVQLE y_len+64(FP), BX
CMPQ dst_len+8(FP), BX
CMOVQLE dst_len+8(FP), BX
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
MOVSS alpha+24(FP), X0
SHUFPS $0, X0, X0 // X0 = { a, a, a, a }
XORQ AX, AX // i = 0
MOVQ DX, CX
ANDQ $0xF, CX // CX = DX % 16; align on 16-byte boundary for ADDPS
JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
XORQ $0xF, CX // CX = (16 - DX%16) / 4 after the
INCQ CX // XORQ/INCQ/SHRQ sequence: number of
SHRQ $2, CX // elements until y is 16-byte aligned
axpy_align: // Trim first value(s) in unaligned buffer do {
MOVSS (SI)(AX*4), X2 // X2 = x[i]
MULSS X0, X2 // X2 *= a
ADDSS (DX)(AX*4), X2 // X2 += y[i]
MOVSS X2, (DI)(AX*4) // dst[i] = X2
INCQ AX // i++
DECQ BX
JZ axpy_end // if --BX == 0 { return }
LOOP axpy_align // } while --CX > 0
axpy_no_trim:
MOVUPS X0, X1 // Copy X0 to X1 for pipelining
MOVQ BX, CX
ANDQ $0xF, BX // BX = len % 16
SHRQ $4, CX // CX = floor( len / 16 )
JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
axpy_loop: // Loop unrolled 16x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MOVUPS 16(SI)(AX*4), X3
MOVUPS 32(SI)(AX*4), X4
MOVUPS 48(SI)(AX*4), X5
MULPS X0, X2 // X2 *= a
MULPS X1, X3
MULPS X0, X4
MULPS X1, X5
ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
ADDPS 16(DX)(AX*4), X3
ADDPS 32(DX)(AX*4), X4
ADDPS 48(DX)(AX*4), X5
MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
MOVUPS X3, 16(DI)(AX*4)
MOVUPS X4, 32(DI)(AX*4)
MOVUPS X5, 48(DI)(AX*4)
ADDQ $16, AX // i += 16
LOOP axpy_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
axpy_tail4_start: // Reset loop counter for 4-wide tail loop
MOVQ BX, CX // CX = floor( BX / 4 )
SHRQ $2, CX
JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
axpy_tail4: // Loop unrolled 4x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MULPS X0, X2 // X2 *= a
ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
ADDQ $4, AX // i += 4
LOOP axpy_tail4 // } while --CX > 0
axpy_tail_start: // Reset loop counter for 1-wide tail loop
MOVQ BX, CX // CX = BX % 4
ANDQ $3, CX
JZ axpy_end // if CX == 0 { return }
axpy_tail:
MOVSS (SI)(AX*4), X1 // X1 = x[i]
MULSS X0, X1 // X1 *= a
ADDSS (DX)(AX*4), X1 // X1 += y[i]
MOVSS X1, (DI)(AX*4) // dst[i] = X1
INCQ AX // i++
LOOP axpy_tail // } while --CX > 0
axpy_end:
RET

View File

@@ -0,0 +1,91 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Register aliases for DdotInc.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R10
#define INC_Y R9
#define INCx3_Y R11
#define SUM X0
#define P_SUM X1
// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
//
// DdotInc returns the float64 dot product of strided float32 vectors:
// sum_{i<n} float64(x[ix+i*incX]) * float64(y[iy+i*incY]).
// The main loop is unrolled 4x with two accumulators (SUM and P_SUM).
TEXT ·DdotInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ n+48(FP), LEN // LEN = n
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0
JE dot_end
MOVQ ix+72(FP), INC_X // INC_X = ix
MOVQ iy+80(FP), INC_Y // INC_Y = iy
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(float32)
SHLQ $2, INC_X
MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(float32)
SHLQ $2, INC_Y
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
dot_loop: // Loop unrolled 4x do {
CVTSS2SD (X_PTR), X2 // X_i = float64(x[i])
CVTSS2SD (X_PTR)(INC_X*1), X3
CVTSS2SD (X_PTR)(INC_X*2), X4
CVTSS2SD (X_PTR)(INCx3_X*1), X5
CVTSS2SD (Y_PTR), X6 // X_j = float64(y[i])
CVTSS2SD (Y_PTR)(INC_Y*1), X7
CVTSS2SD (Y_PTR)(INC_Y*2), X8
CVTSS2SD (Y_PTR)(INCx3_Y*1), X9
MULSD X6, X2 // X_i *= X_j
MULSD X7, X3
MULSD X8, X4
MULSD X9, X5
ADDSD X2, SUM // SUM += X_i
ADDSD X3, P_SUM
ADDSD X4, SUM
ADDSD X5, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDSD P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
CVTSS2SD (X_PTR), X2 // X2 = float64(x[i])
CVTSS2SD (Y_PTR), X3 // X3 = float64(y[i])
MULSD X3, X2 // X2 *= X3
ADDSD X2, SUM // SUM += X2
ADDQ INC_X, X_PTR // X_PTR += INC_X
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVSD SUM, sum+88(FP) // return SUM
RET

View File

@@ -0,0 +1,110 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// HADDPD encoded as raw bytes for assemblers lacking the mnemonic.
#define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0
// Register aliases for DdotUnitary.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define IDX AX
#define SUM X0
#define P_SUM X1
// func DdotUnitary(x, y []float32) (sum float64)
//
// DdotUnitary returns the float64 dot product of two contiguous float32
// vectors over the first min(len(x), len(y)) elements (result stored as
// 8 bytes at sum+48). y is first brought to 16-byte alignment, then the
// main loop handles 8 elements per iteration with two packed accumulators.
TEXT ·DdotUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0
JE dot_end
XORQ IDX, IDX
MOVQ Y_PTR, DX
ANDQ $0xF, DX // DX = Y_PTR % 16; align for ADDPS
JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
SUBQ $16, DX // DX = (Y_PTR % 16) - 16; counts up to 0 in steps of 4
dot_align: // Trim first value(s) in unaligned buffer do {
CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
MULSD X3, X2
ADDSD X2, SUM // SUM += X2
INCQ IDX // IDX++
DECQ LEN
JZ dot_end // if --LEN == 0 { return }
ADDQ $4, DX
JNZ dot_align // } while DX != 0
dot_no_trim:
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
MOVQ LEN, TAIL
ANDQ $0x7, TAIL // TAIL = LEN % 8
SHRQ $3, LEN // LEN = floor( LEN / 8 )
JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
dot_loop: // Loop unrolled 8x do {
CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = float64(x[i:i+2])
CVTPS2PD 8(X_PTR)(IDX*4), X3
CVTPS2PD 16(X_PTR)(IDX*4), X4
CVTPS2PD 24(X_PTR)(IDX*4), X5
CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = float64(y[i:i+2])
CVTPS2PD 8(Y_PTR)(IDX*4), X7
CVTPS2PD 16(Y_PTR)(IDX*4), X8
CVTPS2PD 24(Y_PTR)(IDX*4), X9
MULPD X6, X2 // X_i *= X_j
MULPD X7, X3
MULPD X8, X4
MULPD X9, X5
ADDPD X2, SUM // SUM += X_i
ADDPD X3, P_SUM
ADDPD X4, SUM
ADDPD X5, P_SUM
ADDQ $8, IDX // IDX += 8
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail_start: // Handle TAIL (1..7) two at a time, then one
MOVQ TAIL, LEN // LEN = floor( TAIL / 2 )
SHRQ $1, LEN
JZ dot_tail_one // if LEN == 0 { goto dot_tail_one }
dot_tail_two: // do {
CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = float64(x[i:i+2])
CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = float64(y[i:i+2])
MULPD X6, X2 // X_i *= X_j
ADDPD X2, SUM // SUM += X_i
ADDQ $2, IDX // IDX += 2
DECQ LEN
JNZ dot_tail_two // } while --LEN > 0
ANDQ $1, TAIL // if TAIL % 2 == 0 { return }
JZ dot_end
dot_tail_one:
CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
MULSD X3, X2 // X2 *= X3
ADDSD X2, SUM // SUM += X2
dot_end:
HADDPD_SUM_SUM // SUM = SUM[0] + SUM[1]
MOVSD SUM, sum+48(FP) // return SUM
RET

6
vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package f32 provides float32 vector primitives.
package f32 // import "gonum.org/v1/gonum/internal/asm/f32"

View File

@@ -0,0 +1,85 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Register aliases for DotInc.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R10
#define INC_Y R9
#define INCx3_Y R11
#define SUM X0
#define P_SUM X1
// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
//
// DotInc returns the float32 dot product of strided vectors:
// sum_{i<n} x[ix+i*incX] * y[iy+i*incY].
// The main loop is unrolled 4x with two accumulators (SUM and P_SUM).
TEXT ·DotInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
MOVQ n+48(FP), LEN // LEN = n
CMPQ LEN, $0
JE dot_end
MOVQ ix+72(FP), INC_X // INC_X = ix
MOVQ iy+80(FP), INC_Y // INC_Y = iy
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
MOVQ incX+56(FP), INC_X // INC_X := incX * sizeof(float32)
SHLQ $2, INC_X
MOVQ incY+64(FP), INC_Y // INC_Y := incY * sizeof(float32)
SHLQ $2, INC_Y
MOVQ LEN, TAIL
ANDQ $0x3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
dot_loop: // Loop unrolled 4x do {
MOVSS (X_PTR), X2 // X_i = x[i]
MOVSS (X_PTR)(INC_X*1), X3
MOVSS (X_PTR)(INC_X*2), X4
MOVSS (X_PTR)(INCx3_X*1), X5
MULSS (Y_PTR), X2 // X_i *= y[i]
MULSS (Y_PTR)(INC_Y*1), X3
MULSS (Y_PTR)(INC_Y*2), X4
MULSS (Y_PTR)(INCx3_Y*1), X5
ADDSS X2, SUM // SUM += X_i
ADDSS X3, P_SUM
ADDSS X4, SUM
ADDSS X5, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDSS P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVSS (X_PTR), X2 // X2 = x[i]
MULSS (Y_PTR), X2 // X2 *= y[i]
ADDSS X2, SUM // SUM += X2
ADDQ INC_X, X_PTR // X_PTR += INC_X
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVSS SUM, sum+88(FP) // return SUM
RET

View File

@@ -0,0 +1,106 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// HADDPS encoded as raw bytes for assemblers lacking the mnemonic.
#define HADDPS_SUM_SUM LONG $0xC07C0FF2 // @ HADDPS X0, X0
// Register aliases for DotUnitary.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define IDX AX
#define SUM X0
#define P_SUM X1
// func DotUnitary(x, y []float32) (sum float32)
//
// DotUnitary returns the float32 dot product of two contiguous vectors
// over the first min(len(x), len(y)) elements. y is first brought to
// 16-byte alignment, then the main loop handles 16 elements per iteration;
// 4-wide and 1-wide tail loops finish up.
TEXT ·DotUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
CMPQ LEN, $0
JE dot_end
XORQ IDX, IDX
MOVQ Y_PTR, DX
ANDQ $0xF, DX // DX = Y_PTR % 16; align for MULPS
JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
SUBQ $16, DX // DX = (Y_PTR % 16) - 16; counts up to 0 in steps of 4
dot_align: // Trim first value(s) in unaligned buffer do {
MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
ADDSS X2, SUM // SUM += X2
INCQ IDX // IDX++
DECQ LEN
JZ dot_end // if --LEN == 0 { return }
ADDQ $4, DX
JNZ dot_align // } while DX != 0
dot_no_trim:
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
MOVQ LEN, TAIL
ANDQ $0xF, TAIL // TAIL = LEN % 16
SHRQ $4, LEN // LEN = floor( LEN / 16 )
JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
dot_loop: // Loop unrolled 16x do {
MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4]
MOVUPS 16(X_PTR)(IDX*4), X3
MOVUPS 32(X_PTR)(IDX*4), X4
MOVUPS 48(X_PTR)(IDX*4), X5
MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
MULPS 16(Y_PTR)(IDX*4), X3
MULPS 32(Y_PTR)(IDX*4), X4
MULPS 48(Y_PTR)(IDX*4), X5
ADDPS X2, SUM // SUM += X_i
ADDPS X3, P_SUM
ADDPS X4, SUM
ADDPS X5, P_SUM
ADDQ $16, IDX // IDX += 16
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPS P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail4_start: // Reset loop counter for 4-wide tail loop
MOVQ TAIL, LEN // LEN = floor( TAIL / 4 )
SHRQ $2, LEN
JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
dot_tail4_loop: // Loop unrolled 4x do {
MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4]
MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
ADDPS X2, SUM // SUM += X_i
ADDQ $4, IDX // i += 4
DECQ LEN
JNZ dot_tail4_loop // } while --LEN > 0
dot_tail_start: // Reset loop counter for 1-wide tail loop
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ dot_end // if TAIL == 0 { return }
dot_tail: // do {
MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
ADDSS X2, SUM // SUM += X2
INCQ IDX // IDX++
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
HADDPS_SUM_SUM // two horizontal adds reduce the
HADDPS_SUM_SUM // four lanes of SUM to one scalar
MOVSS SUM, sum+48(FP) // return SUM
RET

15
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go generated vendored Normal file
View File

@@ -0,0 +1,15 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!appengine,!safe
package f32
// Ger performs the rank-one operation
// A += alpha * x * y^T
// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
//
// This declaration has no Go body; the implementation is provided in
// assembly (see the accompanying ge_amd64.s).
func Ger(m, n uintptr, alpha float32,
x []float32, incX uintptr,
y []float32, incY uintptr,
a []float32, lda uintptr)

757
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,757 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Constants and register aliases for Ger. SIZE is sizeof(float32),
// BITSIZE its log2, and KERNELSIZE the log2 of the 8-wide column kernel.
#define SIZE 4
#define BITSIZE 2
#define KERNELSIZE 3
#define M_DIM m+0(FP)
#define M CX
#define N_DIM n+8(FP)
#define N BX
#define TMP1 R14
#define TMP2 R15
#define X_PTR SI
#define Y y_base+56(FP)
#define Y_PTR DX
#define A_ROW AX
#define A_PTR DI
#define INC_X R8
#define INC3_X R9
#define INC_Y R10
#define INC3_Y R11
#define LDA R12
#define LDA3 R13
#define ALPHA X0
#define ALPHA_SPILL al-16(SP)
// LOAD_ALPHA broadcasts alpha into all four lanes of ALPHA.
#define LOAD_ALPHA \
MOVSS alpha+16(FP), ALPHA \
SHUFPS $0, ALPHA, ALPHA
// LOAD_SCALEDk: broadcast k consecutive x values, each pre-scaled by
// alpha, into X1..Xk (unit-stride variants).
#define LOAD_SCALED4 \
PREFETCHNTA 16*SIZE(X_PTR) \
MOVDDUP (X_PTR), X1 \
MOVDDUP 2*SIZE(X_PTR), X3 \
MOVSHDUP X1, X2 \
MOVSHDUP X3, X4 \
MOVSLDUP X1, X1 \
MOVSLDUP X3, X3 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2 \
MULPS ALPHA, X3 \
MULPS ALPHA, X4
#define LOAD_SCALED2 \
MOVDDUP (X_PTR), X1 \
MOVSHDUP X1, X2 \
MOVSLDUP X1, X1 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2
#define LOAD_SCALED1 \
MOVSS (X_PTR), X1 \
SHUFPS $0, X1, X1 \
MULPS ALPHA, X1
// LOAD_SCALEDk_INC: same as above for strided x (INC_X / INC3_X).
#define LOAD_SCALED4_INC \
PREFETCHNTA (X_PTR)(INC_X*8) \
MOVSS (X_PTR), X1 \
MOVSS (X_PTR)(INC_X*1), X2 \
MOVSS (X_PTR)(INC_X*2), X3 \
MOVSS (X_PTR)(INC3_X*1), X4 \
SHUFPS $0, X1, X1 \
SHUFPS $0, X2, X2 \
SHUFPS $0, X3, X3 \
SHUFPS $0, X4, X4 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2 \
MULPS ALPHA, X3 \
MULPS ALPHA, X4
#define LOAD_SCALED2_INC \
MOVSS (X_PTR), X1 \
MOVSS (X_PTR)(INC_X*1), X2 \
SHUFPS $0, X1, X1 \
SHUFPS $0, X2, X2 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2
// KERNEL_LOADk: load k y values into X5 (and X6 for k=8); the _INC
// variants gather strided y elements and pack them.
#define KERNEL_LOAD8 \
MOVUPS (Y_PTR), X5 \
MOVUPS 4*SIZE(Y_PTR), X6
#define KERNEL_LOAD8_INC \
MOVSS (Y_PTR), X5 \
MOVSS (Y_PTR)(INC_Y*1), X6 \
MOVSS (Y_PTR)(INC_Y*2), X7 \
MOVSS (Y_PTR)(INC3_Y*1), X8 \
UNPCKLPS X6, X5 \
UNPCKLPS X8, X7 \
MOVLHPS X7, X5 \
LEAQ (Y_PTR)(INC_Y*4), Y_PTR \
MOVSS (Y_PTR), X6 \
MOVSS (Y_PTR)(INC_Y*1), X7 \
MOVSS (Y_PTR)(INC_Y*2), X8 \
MOVSS (Y_PTR)(INC3_Y*1), X9 \
UNPCKLPS X7, X6 \
UNPCKLPS X9, X8 \
MOVLHPS X8, X6
#define KERNEL_LOAD4 \
MOVUPS (Y_PTR), X5
#define KERNEL_LOAD4_INC \
MOVSS (Y_PTR), X5 \
MOVSS (Y_PTR)(INC_Y*1), X6 \
MOVSS (Y_PTR)(INC_Y*2), X7 \
MOVSS (Y_PTR)(INC3_Y*1), X8 \
UNPCKLPS X6, X5 \
UNPCKLPS X8, X7 \
MOVLHPS X7, X5
#define KERNEL_LOAD2 \
MOVSD (Y_PTR), X5
#define KERNEL_LOAD2_INC \
MOVSS (Y_PTR), X5 \
MOVSS (Y_PTR)(INC_Y*1), X6 \
UNPCKLPS X6, X5
// KERNEL_RxC / STORE_RxC: compute and accumulate an R-row by C-column
// tile of alpha*x*y^T into A. STORE_4x8 spills ALPHA because all 16 XMM
// registers are in use.
#define KERNEL_4x8 \
MOVUPS X5, X7 \
MOVUPS X6, X8 \
MOVUPS X5, X9 \
MOVUPS X6, X10 \
MOVUPS X5, X11 \
MOVUPS X6, X12 \
MULPS X1, X5 \
MULPS X1, X6 \
MULPS X2, X7 \
MULPS X2, X8 \
MULPS X3, X9 \
MULPS X3, X10 \
MULPS X4, X11 \
MULPS X4, X12
#define STORE_4x8 \
MOVUPS ALPHA, ALPHA_SPILL \
MOVUPS (A_PTR), X13 \
ADDPS X13, X5 \
MOVUPS 4*SIZE(A_PTR), X14 \
ADDPS X14, X6 \
MOVUPS (A_PTR)(LDA*1), X15 \
ADDPS X15, X7 \
MOVUPS 4*SIZE(A_PTR)(LDA*1), X0 \
ADDPS X0, X8 \
MOVUPS (A_PTR)(LDA*2), X13 \
ADDPS X13, X9 \
MOVUPS 4*SIZE(A_PTR)(LDA*2), X14 \
ADDPS X14, X10 \
MOVUPS (A_PTR)(LDA3*1), X15 \
ADDPS X15, X11 \
MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0 \
ADDPS X0, X12 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, 4*SIZE(A_PTR) \
MOVUPS X7, (A_PTR)(LDA*1) \
MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
MOVUPS X9, (A_PTR)(LDA*2) \
MOVUPS X10, 4*SIZE(A_PTR)(LDA*2) \
MOVUPS X11, (A_PTR)(LDA3*1) \
MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \
MOVUPS ALPHA_SPILL, ALPHA \
ADDQ $8*SIZE, A_PTR
#define KERNEL_4x4 \
MOVUPS X5, X6 \
MOVUPS X5, X7 \
MOVUPS X5, X8 \
MULPS X1, X5 \
MULPS X2, X6 \
MULPS X3, X7 \
MULPS X4, X8
#define STORE_4x4 \
MOVUPS (A_PTR), X13 \
ADDPS X13, X5 \
MOVUPS (A_PTR)(LDA*1), X14 \
ADDPS X14, X6 \
MOVUPS (A_PTR)(LDA*2), X15 \
ADDPS X15, X7 \
MOVUPS (A_PTR)(LDA3*1), X13 \
ADDPS X13, X8 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, (A_PTR)(LDA*1) \
MOVUPS X7, (A_PTR)(LDA*2) \
MOVUPS X8, (A_PTR)(LDA3*1) \
ADDQ $4*SIZE, A_PTR
#define KERNEL_4x2 \
MOVUPS X5, X6 \
MOVUPS X5, X7 \
MOVUPS X5, X8 \
MULPS X1, X5 \
MULPS X2, X6 \
MULPS X3, X7 \
MULPS X4, X8
#define STORE_4x2 \
MOVSD (A_PTR), X9 \
ADDPS X9, X5 \
MOVSD (A_PTR)(LDA*1), X10 \
ADDPS X10, X6 \
MOVSD (A_PTR)(LDA*2), X11 \
ADDPS X11, X7 \
MOVSD (A_PTR)(LDA3*1), X12 \
ADDPS X12, X8 \
MOVSD X5, (A_PTR) \
MOVSD X6, (A_PTR)(LDA*1) \
MOVSD X7, (A_PTR)(LDA*2) \
MOVSD X8, (A_PTR)(LDA3*1) \
ADDQ $2*SIZE, A_PTR
#define KERNEL_4x1 \
MOVSS (Y_PTR), X5 \
MOVSS X5, X6 \
MOVSS X5, X7 \
MOVSS X5, X8 \
MULSS X1, X5 \
MULSS X2, X6 \
MULSS X3, X7 \
MULSS X4, X8
#define STORE_4x1 \
ADDSS (A_PTR), X5 \
ADDSS (A_PTR)(LDA*1), X6 \
ADDSS (A_PTR)(LDA*2), X7 \
ADDSS (A_PTR)(LDA3*1), X8 \
MOVSS X5, (A_PTR) \
MOVSS X6, (A_PTR)(LDA*1) \
MOVSS X7, (A_PTR)(LDA*2) \
MOVSS X8, (A_PTR)(LDA3*1) \
ADDQ $SIZE, A_PTR
#define KERNEL_2x8 \
MOVUPS X5, X7 \
MOVUPS X6, X8 \
MULPS X1, X5 \
MULPS X1, X6 \
MULPS X2, X7 \
MULPS X2, X8
#define STORE_2x8 \
MOVUPS (A_PTR), X9 \
ADDPS X9, X5 \
MOVUPS 4*SIZE(A_PTR), X10 \
ADDPS X10, X6 \
MOVUPS (A_PTR)(LDA*1), X11 \
ADDPS X11, X7 \
MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \
ADDPS X12, X8 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, 4*SIZE(A_PTR) \
MOVUPS X7, (A_PTR)(LDA*1) \
MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
ADDQ $8*SIZE, A_PTR
#define KERNEL_2x4 \
MOVUPS X5, X6 \
MULPS X1, X5 \
MULPS X2, X6
#define STORE_2x4 \
MOVUPS (A_PTR), X9 \
ADDPS X9, X5 \
MOVUPS (A_PTR)(LDA*1), X11 \
ADDPS X11, X6 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, (A_PTR)(LDA*1) \
ADDQ $4*SIZE, A_PTR
#define KERNEL_2x2 \
MOVSD X5, X6 \
MULPS X1, X5 \
MULPS X2, X6
#define STORE_2x2 \
MOVSD (A_PTR), X7 \
ADDPS X7, X5 \
MOVSD (A_PTR)(LDA*1), X8 \
ADDPS X8, X6 \
MOVSD X5, (A_PTR) \
MOVSD X6, (A_PTR)(LDA*1) \
ADDQ $2*SIZE, A_PTR
#define KERNEL_2x1 \
MOVSS (Y_PTR), X5 \
MOVSS X5, X6 \
MULSS X1, X5 \
MULSS X2, X6
#define STORE_2x1 \
ADDSS (A_PTR), X5 \
ADDSS (A_PTR)(LDA*1), X6 \
MOVSS X5, (A_PTR) \
MOVSS X6, (A_PTR)(LDA*1) \
ADDQ $SIZE, A_PTR
#define KERNEL_1x8 \
MULPS X1, X5 \
MULPS X1, X6
#define STORE_1x8 \
MOVUPS (A_PTR), X7 \
ADDPS X7, X5 \
MOVUPS 4*SIZE(A_PTR), X8 \
ADDPS X8, X6 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, 4*SIZE(A_PTR) \
ADDQ $8*SIZE, A_PTR
#define KERNEL_1x4 \
MULPS X1, X5 \
MULPS X1, X6
#define STORE_1x4 \
MOVUPS (A_PTR), X7 \
ADDPS X7, X5 \
MOVUPS X5, (A_PTR) \
ADDQ $4*SIZE, A_PTR
#define KERNEL_1x2 \
MULPS X1, X5
#define STORE_1x2 \
MOVSD (A_PTR), X6 \
ADDPS X6, X5 \
MOVSD X5, (A_PTR) \
ADDQ $2*SIZE, A_PTR
#define KERNEL_1x1 \
MOVSS (Y_PTR), X5 \
MULSS X1, X5
#define STORE_1x1 \
ADDSS (A_PTR), X5 \
MOVSS X5, (A_PTR) \
ADDQ $SIZE, A_PTR
// func Ger(m, n uintptr, alpha float32,
// x []float32, incX uintptr,
// y []float32, incY uintptr,
// a []float32, lda uintptr)
//
// Ger computes the rank-one update A += alpha * x * y^T, processing rows
// of A in blocks of 4/2/1 and columns in blocks of 8/4/2/1. A fast path
// with packed loads is used when incX == 1 and incY == 1; otherwise the
// "inc" path gathers strided elements in the kernels.
TEXT ·Ger(SB), 0, $16-120
MOVQ M_DIM, M
MOVQ N_DIM, N
CMPQ M, $0
JE end
CMPQ N, $0
JE end
LOAD_ALPHA
MOVQ x_base+24(FP), X_PTR
MOVQ y_base+56(FP), Y_PTR
MOVQ a_base+88(FP), A_ROW
MOVQ A_ROW, A_PTR
MOVQ lda+112(FP), LDA // LDA = LDA * sizeof(float32)
SHLQ $BITSIZE, LDA
LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3
CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
JNE inc
CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path)
JNE inc
SHRQ $2, M // M = floor( M_DIM / 4 ); 4 rows at a time
JZ r2
r4:
// LOAD 4
LOAD_SCALED4
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N // N = floor( N_DIM / 8 )
JZ r4c4
r4c8:
// 4x8 KERNEL
KERNEL_LOAD8
KERNEL_4x8
STORE_4x8
ADDQ $8*SIZE, Y_PTR
DECQ N
JNZ r4c8
r4c4:
TESTQ $4, N_DIM
JZ r4c2
// 4x4 KERNEL
KERNEL_LOAD4
KERNEL_4x4
STORE_4x4
ADDQ $4*SIZE, Y_PTR
r4c2:
TESTQ $2, N_DIM
JZ r4c1
// 4x2 KERNEL
KERNEL_LOAD2
KERNEL_4x2
STORE_4x2
ADDQ $2*SIZE, Y_PTR
r4c1:
TESTQ $1, N_DIM
JZ r4end
// 4x1 KERNEL
KERNEL_4x1
STORE_4x1
ADDQ $SIZE, Y_PTR
r4end:
ADDQ $4*SIZE, X_PTR
MOVQ Y, Y_PTR // Rewind Y_PTR to the start of y
LEAQ (A_ROW)(LDA*4), A_ROW // Advance A_ROW by four rows
MOVQ A_ROW, A_PTR
DECQ M
JNZ r4
r2:
TESTQ $2, M_DIM
JZ r1
// LOAD 2
LOAD_SCALED2
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ r2c4
r2c8:
// 2x8 KERNEL
KERNEL_LOAD8
KERNEL_2x8
STORE_2x8
ADDQ $8*SIZE, Y_PTR
DECQ N
JNZ r2c8
r2c4:
TESTQ $4, N_DIM
JZ r2c2
// 2x4 KERNEL
KERNEL_LOAD4
KERNEL_2x4
STORE_2x4
ADDQ $4*SIZE, Y_PTR
r2c2:
TESTQ $2, N_DIM
JZ r2c1
// 2x2 KERNEL
KERNEL_LOAD2
KERNEL_2x2
STORE_2x2
ADDQ $2*SIZE, Y_PTR
r2c1:
TESTQ $1, N_DIM
JZ r2end
// 2x1 KERNEL
KERNEL_2x1
STORE_2x1
ADDQ $SIZE, Y_PTR
r2end:
ADDQ $2*SIZE, X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*2), A_ROW
MOVQ A_ROW, A_PTR
r1:
TESTQ $1, M_DIM
JZ end
// LOAD 1
LOAD_SCALED1
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ r1c4
r1c8:
// 1x8 KERNEL
KERNEL_LOAD8
KERNEL_1x8
STORE_1x8
ADDQ $8*SIZE, Y_PTR
DECQ N
JNZ r1c8
r1c4:
TESTQ $4, N_DIM
JZ r1c2
// 1x4 KERNEL
KERNEL_LOAD4
KERNEL_1x4
STORE_1x4
ADDQ $4*SIZE, Y_PTR
r1c2:
TESTQ $2, N_DIM
JZ r1c1
// 1x2 KERNEL
KERNEL_LOAD2
KERNEL_1x2
STORE_1x2
ADDQ $2*SIZE, Y_PTR
r1c1:
TESTQ $1, N_DIM
JZ end
// 1x1 KERNEL
KERNEL_1x1
STORE_1x1
end:
RET
inc: // Algorithm for incX != 1 or incY != 1 ( split loads in kernel )
MOVQ incX+48(FP), INC_X // INC_X = incX * sizeof(float32)
SHLQ $BITSIZE, INC_X
MOVQ incY+80(FP), INC_Y // INC_Y = incY * sizeof(float32)
SHLQ $BITSIZE, INC_Y
LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3
// For a negative incX, start X_PTR at the far end of x.
XORQ TMP2, TMP2
MOVQ M, TMP1
SUBQ $1, TMP1
IMULQ INC_X, TMP1
NEGQ TMP1
CMPQ INC_X, $0
CMOVQLT TMP1, TMP2
LEAQ (X_PTR)(TMP2*SIZE), X_PTR
// For a negative incY, start Y_PTR at the far end of y.
XORQ TMP2, TMP2
MOVQ N, TMP1
SUBQ $1, TMP1
IMULQ INC_Y, TMP1
NEGQ TMP1
CMPQ INC_Y, $0
CMOVQLT TMP1, TMP2
LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR
SHRQ $2, M // M = floor( M_DIM / 4 ); 4 rows at a time
JZ inc_r2
inc_r4:
// LOAD 4
LOAD_SCALED4_INC
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ inc_r4c4
inc_r4c8:
// 4x8 KERNEL
KERNEL_LOAD8_INC
KERNEL_4x8
STORE_4x8
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ N
JNZ inc_r4c8
inc_r4c4:
TESTQ $4, N_DIM
JZ inc_r4c2
// 4x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_4x4
STORE_4x4
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r4c2:
TESTQ $2, N_DIM
JZ inc_r4c1
// 4x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_4x2
STORE_4x2
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r4c1:
TESTQ $1, N_DIM
JZ inc_r4end
// 4x1 KERNEL
KERNEL_4x1
STORE_4x1
ADDQ INC_Y, Y_PTR
inc_r4end:
LEAQ (X_PTR)(INC_X*4), X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*4), A_ROW
MOVQ A_ROW, A_PTR
DECQ M
JNZ inc_r4
inc_r2:
TESTQ $2, M_DIM
JZ inc_r1
// LOAD 2
LOAD_SCALED2_INC
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ inc_r2c4
inc_r2c8:
// 2x8 KERNEL
KERNEL_LOAD8_INC
KERNEL_2x8
STORE_2x8
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ N
JNZ inc_r2c8
inc_r2c4:
TESTQ $4, N_DIM
JZ inc_r2c2
// 2x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_2x4
STORE_2x4
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r2c2:
TESTQ $2, N_DIM
JZ inc_r2c1
// 2x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_2x2
STORE_2x2
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r2c1:
TESTQ $1, N_DIM
JZ inc_r2end
// 2x1 KERNEL
KERNEL_2x1
STORE_2x1
ADDQ INC_Y, Y_PTR
inc_r2end:
LEAQ (X_PTR)(INC_X*2), X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*2), A_ROW
MOVQ A_ROW, A_PTR
inc_r1:
TESTQ $1, M_DIM
JZ end
// LOAD 1
LOAD_SCALED1
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ inc_r1c4
inc_r1c8:
// 1x8 KERNEL
KERNEL_LOAD8_INC
KERNEL_1x8
STORE_1x8
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ N
JNZ inc_r1c8
inc_r1c4:
TESTQ $4, N_DIM
JZ inc_r1c2
// 1x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_1x4
STORE_1x4
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r1c2:
TESTQ $2, N_DIM
JZ inc_r1c1
// 1x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_1x2
STORE_1x2
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r1c1:
TESTQ $1, N_DIM
JZ inc_end
// 1x1 KERNEL
KERNEL_1x1
STORE_1x1
inc_end:
RET

36
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go generated vendored Normal file
View File

@@ -0,0 +1,36 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 noasm appengine safe
package f32
// Ger performs the rank-one operation
//  A += alpha * x * y^T
// where A is an m×n dense matrix with row stride lda, x and y are vectors,
// and alpha is a scalar.
func Ger(m, n uintptr, alpha float32, x []float32, incX uintptr, y []float32, incY uintptr, a []float32, lda uintptr) {
	if incX == 1 && incY == 1 {
		// Contiguous fast path: each row update is a unit-stride axpy.
		x = x[:m]
		y = y[:n]
		for i, xv := range x {
			AxpyUnitary(alpha*xv, y, a[uintptr(i)*lda:uintptr(i)*lda+n])
		}
		return
	}

	// Negative increments start from the far end of the vector,
	// following BLAS convention.
	var kx, ky uintptr
	if int(incX) < 0 {
		kx = uintptr(-int(m-1) * int(incX))
	}
	if int(incY) < 0 {
		ky = uintptr(-int(n-1) * int(incY))
	}
	ix := kx
	for i := 0; i < int(m); i++ {
		row := a[uintptr(i)*lda : uintptr(i)*lda+n]
		AxpyInc(alpha*x[ix], y, row, n, incY, 1, ky, 0)
		ix += incX
	}
}

55
vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go generated vendored Normal file
View File

@@ -0,0 +1,55 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package f32
// ScalUnitary scales each element of x by alpha in place:
//  for i := range x {
//  	x[i] *= alpha
//  }
func ScalUnitary(alpha float32, x []float32) {
	for i, v := range x {
		x[i] = v * alpha
	}
}
// ScalUnitaryTo writes alpha*x into dst, element-wise:
//  for i, v := range x {
//  	dst[i] = alpha * v
//  }
func ScalUnitaryTo(dst []float32, alpha float32, x []float32) {
	for i := range x {
		dst[i] = alpha * x[i]
	}
}
// ScalInc scales n elements of x by alpha in place, stepping through x
// with stride incX:
//  var ix uintptr
//  for i := 0; i < int(n); i++ {
//  	x[ix] *= alpha
//  	ix += incX
//  }
func ScalInc(alpha float32, x []float32, n, incX uintptr) {
	ix := uintptr(0)
	for i := int(n); i > 0; i-- {
		x[ix] *= alpha
		ix += incX
	}
}
// ScalIncTo writes alpha*x into dst, reading n elements of x with stride
// incX and writing with stride incDst:
//  var idst, ix uintptr
//  for i := 0; i < int(n); i++ {
//  	dst[idst] = alpha * x[ix]
//  	ix += incX
//  	idst += incDst
//  }
func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) {
	var ix, idst uintptr
	for i := int(n); i > 0; i-- {
		dst[idst] = alpha * x[ix]
		idst += incDst
		ix += incX
	}
}

View File

@@ -0,0 +1,68 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!appengine,!safe
package f32
// AxpyUnitary is
//  for i, v := range x {
//  	y[i] += alpha * v
//  }
// The implementation is hand-written assembly selected by this file's
// build tags.
func AxpyUnitary(alpha float32, x, y []float32)
// AxpyUnitaryTo is
//  for i, v := range x {
//  	dst[i] = alpha*v + y[i]
//  }
// The implementation is hand-written assembly selected by this file's
// build tags.
func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
// AxpyInc is
//  for i := 0; i < int(n); i++ {
//  	y[iy] += alpha * x[ix]
//  	ix += incX
//  	iy += incY
//  }
// The implementation is hand-written assembly selected by this file's
// build tags.
func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
// AxpyIncTo is
//  for i := 0; i < int(n); i++ {
//  	dst[idst] = alpha*x[ix] + y[iy]
//  	ix += incX
//  	iy += incY
//  	idst += incDst
//  }
// The implementation is hand-written assembly selected by this file's
// build tags.
func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
// DdotUnitary is
//  for i, v := range x {
//  	sum += float64(y[i]) * float64(v)
//  }
//  return
// The implementation is hand-written assembly selected by this file's
// build tags.
func DdotUnitary(x, y []float32) (sum float64)
// DdotInc is
//  for i := 0; i < int(n); i++ {
//  	sum += float64(y[iy]) * float64(x[ix])
//  	ix += incX
//  	iy += incY
//  }
//  return
// The implementation is hand-written assembly selected by this file's
// build tags.
func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
// DotUnitary is
//  for i, v := range x {
//  	sum += y[i] * v
//  }
//  return sum
// The implementation is hand-written assembly selected by this file's
// build tags.
func DotUnitary(x, y []float32) (sum float32)
// DotInc is
//  for i := 0; i < int(n); i++ {
//  	sum += y[iy] * x[ix]
//  	ix += incX
//  	iy += incY
//  }
//  return sum
// The implementation is hand-written assembly selected by this file's
// build tags.
func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)

View File

@@ -0,0 +1,113 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 noasm appengine safe
package f32
// AxpyUnitary adds alpha*x to y element-wise over the length of x:
//  for i, v := range x {
//  	y[i] += alpha * v
//  }
func AxpyUnitary(alpha float32, x, y []float32) {
	for i := range x {
		y[i] += alpha * x[i]
	}
}
// AxpyUnitaryTo stores alpha*x + y into dst, element-wise over the
// length of x:
//  for i, v := range x {
//  	dst[i] = alpha*v + y[i]
//  }
func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) {
	for i := range x {
		dst[i] = alpha*x[i] + y[i]
	}
}
// AxpyInc performs a strided axpy: n times it does
//  y[iy] += alpha * x[ix]
// advancing the read position by incX and the write position by incY
// after each step.
func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for i := int(n); i > 0; i-- {
		y[iy] += alpha * x[ix]
		ix += incX
		iy += incY
	}
}
// AxpyIncTo performs a strided axpy into dst: n times it does
//  dst[idst] = alpha*x[ix] + y[iy]
// advancing the positions by incX, incY and incDst after each step.
func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for i := int(n); i > 0; i-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix += incX
		iy += incY
		idst += incDst
	}
}
// DotUnitary returns the float32 dot product of x and y over the length
// of x:
//  for i, v := range x {
//  	sum += y[i] * v
//  }
//  return sum
func DotUnitary(x, y []float32) (sum float32) {
	for i := range x {
		sum += x[i] * y[i]
	}
	return sum
}
// DotInc returns the float32 dot product of n strided elements of x and
// y, starting at offsets ix and iy and stepping by incX and incY:
//  for i := 0; i < int(n); i++ {
//  	sum += y[iy] * x[ix]
//  	ix += incX
//  	iy += incY
//  }
//  return sum
func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) {
	for i := int(n); i > 0; i-- {
		sum += x[ix] * y[iy]
		ix += incX
		iy += incY
	}
	return sum
}
// DdotUnitary returns the dot product of x and y accumulated in float64
// for extra precision:
//  for i, v := range x {
//  	sum += float64(y[i]) * float64(v)
//  }
//  return
func DdotUnitary(x, y []float32) (sum float64) {
	for i := range x {
		sum += float64(x[i]) * float64(y[i])
	}
	return sum
}
// DdotInc returns the strided dot product of n elements of x and y,
// accumulated in float64 for extra precision:
//  for i := 0; i < int(n); i++ {
//  	sum += float64(y[iy]) * float64(x[ix])
//  	ix += incX
//  	iy += incY
//  }
//  return
func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) {
	for i := int(n); i > 0; i-- {
		sum += float64(x[ix]) * float64(y[iy])
		ix += incX
		iy += incY
	}
	return sum
}