1. update clientset, deepcopy using code-generator
2. add a dummy file tools.go to force "go mod vendor" to treat code-generator as a dependency 3. add a script to update the CRD 4. add a README documenting the CRD update steps; run "go mod tidy"; update README
This commit is contained in:
73
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s
generated
vendored
Normal file
73
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,73 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// AxpyInc computes y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
// The main loop handles four elements per iteration; the n % 4 leftover
// elements are handled one at a time in the tail loop.
//
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||
TEXT ·AxpyInc(SB), NOSPLIT, $0
|
||||
MOVQ n+56(FP), CX // CX = n
|
||||
CMPQ CX, $0 // if n <= 0 { return } (JLE below is a signed compare)
|
||||
JLE axpyi_end
|
||||
MOVQ x_base+8(FP), SI // SI = &x
|
||||
MOVQ y_base+32(FP), DI // DI = &y
|
||||
MOVQ ix+80(FP), R8 // R8 = ix
|
||||
MOVQ iy+88(FP), R9 // R9 = iy
|
||||
LEAQ (SI)(R8*4), SI // SI = &(x[ix])
|
||||
LEAQ (DI)(R9*4), DI // DI = &(y[iy])
|
||||
MOVQ DI, DX // DX = DI; DX is the read pointer for y, DI the write pointer
|
||||
MOVQ incX+64(FP), R8 // R8 = incX
|
||||
SHLQ $2, R8 // R8 *= sizeof(float32), i.e. the x stride in bytes
|
||||
MOVQ incY+72(FP), R9 // R9 = incY
|
||||
SHLQ $2, R9 // R9 *= sizeof(float32), i.e. the y stride in bytes
|
||||
MOVSS alpha+0(FP), X0 // X0 = alpha
|
||||
MOVSS X0, X1 // X1 = X0 // second copy of alpha for pipelining
|
||||
MOVQ CX, BX // BX = n (reduced to n % 4 below)
|
||||
ANDQ $3, BX // BX = n % 4
|
||||
SHRQ $2, CX // CX = floor( n / 4 )
|
||||
JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start }
|
||||
|
||||
axpyi_loop: // Loop unrolled 4x do {
|
||||
MOVSS (SI), X2 // X_i = x[i]
|
||||
MOVSS (SI)(R8*1), X3 // second element, one x stride further
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
|
||||
MOVSS (SI), X4 // third element
|
||||
MOVSS (SI)(R8*1), X5 // fourth element
|
||||
MULSS X1, X2 // X_i *= a
|
||||
MULSS X0, X3
|
||||
MULSS X1, X4
|
||||
MULSS X0, X5
|
||||
ADDSS (DX), X2 // X_i += y[i]
|
||||
ADDSS (DX)(R9*1), X3
|
||||
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
|
||||
ADDSS (DX), X4
|
||||
ADDSS (DX)(R9*1), X5
|
||||
MOVSS X2, (DI) // y[i] = X_i
|
||||
MOVSS X3, (DI)(R9*1)
|
||||
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
|
||||
MOVSS X4, (DI)
|
||||
MOVSS X5, (DI)(R9*1)
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses
|
||||
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
|
||||
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
|
||||
LOOP axpyi_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE axpyi_end
|
||||
|
||||
axpyi_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
axpyi_tail: // do {
|
||||
MOVSS (SI), X2 // X2 = x[i]
|
||||
MULSS X1, X2 // X2 *= a
|
||||
ADDSS (DI), X2 // X2 += y[i] (the tail reads and writes y through DI)
|
||||
MOVSS X2, (DI) // y[i] = X2
|
||||
ADDQ R8, SI // SI = &(SI[incX])
|
||||
ADDQ R9, DI // DI = &(DI[incY])
|
||||
LOOP axpyi_tail // } while --CX > 0
|
||||
|
||||
axpyi_end:
|
||||
RET
|
||||
|
78
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s
generated
vendored
Normal file
78
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,78 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// AxpyIncTo computes
//   dst[idst + i*incDst] = alpha * x[ix + i*incX] + y[iy + i*incY]
// for i in [0, n). The main loop handles four elements per iteration; the
// n % 4 leftover elements are handled one at a time in the tail loop.
//
// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
|
||||
MOVQ n+96(FP), CX // CX = n
|
||||
CMPQ CX, $0 // if n <= 0 { return } (JLE below is a signed compare)
|
||||
JLE axpyi_end
|
||||
MOVQ dst_base+0(FP), DI // DI = &dst
|
||||
MOVQ x_base+48(FP), SI // SI = &x
|
||||
MOVQ y_base+72(FP), DX // DX = &y
|
||||
MOVQ ix+120(FP), R8 // R8 = ix // Load the first index
|
||||
MOVQ iy+128(FP), R9 // R9 = iy
|
||||
MOVQ idst+32(FP), R10 // R10 = idst
|
||||
LEAQ (SI)(R8*4), SI // SI = &(x[ix])
|
||||
LEAQ (DX)(R9*4), DX // DX = &(y[iy])
|
||||
LEAQ (DI)(R10*4), DI // DI = &(dst[idst])
|
||||
MOVQ incX+104(FP), R8 // R8 = incX
|
||||
SHLQ $2, R8 // R8 *= sizeof(float32)
|
||||
MOVQ incY+112(FP), R9 // R9 = incY
|
||||
SHLQ $2, R9 // R9 *= sizeof(float32)
|
||||
MOVQ incDst+24(FP), R10 // R10 = incDst
|
||||
SHLQ $2, R10 // R10 *= sizeof(float32)
|
||||
MOVSS alpha+40(FP), X0 // X0 = alpha
|
||||
MOVSS X0, X1 // X1 = X0 // second copy of alpha for pipelining
|
||||
MOVQ CX, BX // BX = n (reduced to n % 4 below)
|
||||
ANDQ $3, BX // BX = n % 4
|
||||
SHRQ $2, CX // CX = floor( n / 4 )
|
||||
JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start }
|
||||
|
||||
axpyi_loop: // Loop unrolled 4x do {
|
||||
MOVSS (SI), X2 // X_i = x[i]
|
||||
MOVSS (SI)(R8*1), X3
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
|
||||
MOVSS (SI), X4
|
||||
MOVSS (SI)(R8*1), X5
|
||||
MULSS X1, X2 // X_i *= a
|
||||
MULSS X0, X3
|
||||
MULSS X1, X4
|
||||
MULSS X0, X5
|
||||
ADDSS (DX), X2 // X_i += y[i]
|
||||
ADDSS (DX)(R9*1), X3
|
||||
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
|
||||
ADDSS (DX), X4
|
||||
ADDSS (DX)(R9*1), X5
|
||||
MOVSS X2, (DI) // dst[i] = X_i
|
||||
MOVSS X3, (DI)(R10*1)
|
||||
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
|
||||
MOVSS X4, (DI)
|
||||
MOVSS X5, (DI)(R10*1)
|
||||
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses
|
||||
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
|
||||
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
|
||||
LOOP axpyi_loop // } while --CX > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE axpyi_end
|
||||
|
||||
axpyi_tail_start: // Reset loop registers
|
||||
MOVQ BX, CX // Loop counter: CX = BX
|
||||
|
||||
axpyi_tail: // do {
|
||||
MOVSS (SI), X2 // X2 = x[i]
|
||||
MULSS X1, X2 // X2 *= a
|
||||
ADDSS (DX), X2 // X2 += y[i]
|
||||
MOVSS X2, (DI) // dst[i] = X2
|
||||
ADDQ R8, SI // SI = &(SI[incX])
|
||||
ADDQ R9, DX // DX = &(DX[incY])
|
||||
ADDQ R10, DI // DI = &(DI[incDst])
|
||||
LOOP axpyi_tail // } while --CX > 0
|
||||
|
||||
axpyi_end:
|
||||
RET
|
||||
|
97
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
generated
vendored
Normal file
97
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,97 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// AxpyUnitary computes y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
// Elements are first processed one at a time until &y[i] is 16-byte aligned
// (ADDPS reads y as a memory operand), then 16 at a time, then 4 at a time,
// and finally one at a time for the last len % 4 elements.
//
// func AxpyUnitary(alpha float32, x, y []float32)
|
||||
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
|
||||
MOVQ x_base+8(FP), SI // SI = &x
|
||||
MOVQ y_base+32(FP), DI // DI = &y
|
||||
MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) )
|
||||
CMPQ y_len+40(FP), BX
|
||||
CMOVQLE y_len+40(FP), BX
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE axpy_end
|
||||
MOVSS alpha+0(FP), X0
|
||||
SHUFPS $0, X0, X0 // X0 = { a, a, a, a }
|
||||
XORQ AX, AX // i = 0
|
||||
PXOR X2, X2 // 2 NOP instructions (PXOR) to align
|
||||
PXOR X3, X3 // loop to cache line
|
||||
MOVQ DI, CX
|
||||
ANDQ $0xF, CX // CX = &y % 16; align on 16-byte boundary for ADDPS
|
||||
JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
|
||||
|
||||
XORQ $0xF, CX // CX = floor( (16 - &y % 16) / 4 ): elements before the boundary
|
||||
INCQ CX
|
||||
SHRQ $2, CX
|
||||
|
||||
axpy_align: // Trim first value(s) in unaligned buffer do {
|
||||
MOVSS (SI)(AX*4), X2 // X2 = x[i]
|
||||
MULSS X0, X2 // X2 *= a
|
||||
ADDSS (DI)(AX*4), X2 // X2 += y[i]
|
||||
MOVSS X2, (DI)(AX*4) // y[i] = X2
|
||||
INCQ AX // i++
|
||||
DECQ BX
|
||||
JZ axpy_end // if --BX == 0 { return }
|
||||
LOOP axpy_align // } while --CX > 0
|
||||
|
||||
axpy_no_trim:
|
||||
MOVUPS X0, X1 // Copy X0 to X1 for pipelining
|
||||
MOVQ BX, CX
|
||||
ANDQ $0xF, BX // BX = len % 16
|
||||
SHRQ $4, CX // CX = int( len / 16 )
|
||||
JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
|
||||
|
||||
axpy_loop: // Loop unrolled 16x do {
|
||||
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
|
||||
MOVUPS 16(SI)(AX*4), X3
|
||||
MOVUPS 32(SI)(AX*4), X4
|
||||
MOVUPS 48(SI)(AX*4), X5
|
||||
MULPS X0, X2 // X2 *= a
|
||||
MULPS X1, X3
|
||||
MULPS X0, X4
|
||||
MULPS X1, X5
|
||||
ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
|
||||
ADDPS 16(DI)(AX*4), X3
|
||||
ADDPS 32(DI)(AX*4), X4
|
||||
ADDPS 48(DI)(AX*4), X5
|
||||
MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
|
||||
MOVUPS X3, 16(DI)(AX*4)
|
||||
MOVUPS X4, 32(DI)(AX*4)
|
||||
MOVUPS X5, 48(DI)(AX*4)
|
||||
ADDQ $16, AX // i += 16
|
||||
LOOP axpy_loop // while (--CX) > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE axpy_end
|
||||
|
||||
axpy_tail4_start: // Reset loop counter for 4-wide tail loop
|
||||
MOVQ BX, CX // CX = floor( BX / 4 )
|
||||
SHRQ $2, CX
|
||||
JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
|
||||
|
||||
axpy_tail4: // Loop unrolled 4x do {
|
||||
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
|
||||
MULPS X0, X2 // X2 *= a
|
||||
ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
|
||||
MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
|
||||
ADDQ $4, AX // i += 4
|
||||
LOOP axpy_tail4 // } while --CX > 0
|
||||
|
||||
axpy_tail_start: // Reset loop counter for 1-wide tail loop
|
||||
MOVQ BX, CX // CX = BX % 4
|
||||
ANDQ $3, CX
|
||||
JZ axpy_end // if CX == 0 { return }
|
||||
|
||||
axpy_tail: // do {
|
||||
MOVSS (SI)(AX*4), X1 // X1 = x[i]
|
||||
MULSS X0, X1 // X1 *= a
|
||||
ADDSS (DI)(AX*4), X1 // X1 += y[i]
|
||||
MOVSS X1, (DI)(AX*4) // y[i] = X1
|
||||
INCQ AX // i++
|
||||
LOOP axpy_tail // } while --CX > 0
|
||||
|
||||
axpy_end:
|
||||
RET
|
98
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s
generated
vendored
Normal file
98
vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// AxpyUnitaryTo computes dst[i] = alpha * x[i] + y[i] for i in
// [0, min(len(x), len(y), len(dst))). Elements are first processed one at a
// time until &y[i] is 16-byte aligned (ADDPS reads y as a memory operand),
// then 16 at a time, then 4 at a time, and finally one at a time.
//
// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
|
||||
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
|
||||
MOVQ dst_base+0(FP), DI // DI = &dst
|
||||
MOVQ x_base+32(FP), SI // SI = &x
|
||||
MOVQ y_base+56(FP), DX // DX = &y
|
||||
MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) )
|
||||
CMPQ y_len+64(FP), BX
|
||||
CMOVQLE y_len+64(FP), BX
|
||||
CMPQ dst_len+8(FP), BX
|
||||
CMOVQLE dst_len+8(FP), BX
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE axpy_end
|
||||
MOVSS alpha+24(FP), X0
|
||||
SHUFPS $0, X0, X0 // X0 = { a, a, a, a }
|
||||
XORQ AX, AX // i = 0
|
||||
MOVQ DX, CX
|
||||
ANDQ $0xF, CX // CX = &y % 16; align on 16-byte boundary for ADDPS
|
||||
JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
|
||||
|
||||
XORQ $0xF, CX // CX = floor( (16 - &y % 16) / 4 ): elements before the boundary
|
||||
INCQ CX
|
||||
SHRQ $2, CX
|
||||
|
||||
axpy_align: // Trim first value(s) in unaligned buffer do {
|
||||
MOVSS (SI)(AX*4), X2 // X2 = x[i]
|
||||
MULSS X0, X2 // X2 *= a
|
||||
ADDSS (DX)(AX*4), X2 // X2 += y[i]
|
||||
MOVSS X2, (DI)(AX*4) // dst[i] = X2
|
||||
INCQ AX // i++
|
||||
DECQ BX
|
||||
JZ axpy_end // if --BX == 0 { return }
|
||||
LOOP axpy_align // } while --CX > 0
|
||||
|
||||
axpy_no_trim:
|
||||
MOVUPS X0, X1 // Copy X0 to X1 for pipelining
|
||||
MOVQ BX, CX
|
||||
ANDQ $0xF, BX // BX = len % 16
|
||||
SHRQ $4, CX // CX = floor( len / 16 )
|
||||
JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
|
||||
|
||||
axpy_loop: // Loop unrolled 16x do {
|
||||
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
|
||||
MOVUPS 16(SI)(AX*4), X3
|
||||
MOVUPS 32(SI)(AX*4), X4
|
||||
MOVUPS 48(SI)(AX*4), X5
|
||||
MULPS X0, X2 // X2 *= a
|
||||
MULPS X1, X3
|
||||
MULPS X0, X4
|
||||
MULPS X1, X5
|
||||
ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
|
||||
ADDPS 16(DX)(AX*4), X3
|
||||
ADDPS 32(DX)(AX*4), X4
|
||||
ADDPS 48(DX)(AX*4), X5
|
||||
MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
|
||||
MOVUPS X3, 16(DI)(AX*4)
|
||||
MOVUPS X4, 32(DI)(AX*4)
|
||||
MOVUPS X5, 48(DI)(AX*4)
|
||||
ADDQ $16, AX // i += 16
|
||||
LOOP axpy_loop // while (--CX) > 0
|
||||
CMPQ BX, $0 // if BX == 0 { return }
|
||||
JE axpy_end
|
||||
|
||||
axpy_tail4_start: // Reset loop counter for 4-wide tail loop
|
||||
MOVQ BX, CX // CX = floor( BX / 4 )
|
||||
SHRQ $2, CX
|
||||
JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
|
||||
|
||||
axpy_tail4: // Loop unrolled 4x do {
|
||||
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
|
||||
MULPS X0, X2 // X2 *= a
|
||||
ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
|
||||
MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
|
||||
ADDQ $4, AX // i += 4
|
||||
LOOP axpy_tail4 // } while --CX > 0
|
||||
|
||||
axpy_tail_start: // Reset loop counter for 1-wide tail loop
|
||||
MOVQ BX, CX // CX = BX % 4
|
||||
ANDQ $3, CX
|
||||
JZ axpy_end // if CX == 0 { return }
|
||||
|
||||
axpy_tail: // do {
|
||||
MOVSS (SI)(AX*4), X1 // X1 = x[i]
|
||||
MULSS X0, X1 // X1 *= a
|
||||
ADDSS (DX)(AX*4), X1 // X1 += y[i]
|
||||
MOVSS X1, (DI)(AX*4) // dst[i] = X1
|
||||
INCQ AX // i++
|
||||
LOOP axpy_tail // } while --CX > 0
|
||||
|
||||
axpy_end:
|
||||
RET
|
91
vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s
generated
vendored
Normal file
91
vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,91 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Register aliases used by DdotInc.
#define X_PTR SI
|
||||
#define Y_PTR DI
|
||||
#define LEN CX
|
||||
#define TAIL BX
|
||||
#define INC_X R8
|
||||
#define INCx3_X R10
|
||||
#define INC_Y R9
|
||||
#define INCx3_Y R11
|
||||
#define SUM X0
|
||||
#define P_SUM X1
|
||||
|
||||
// DdotInc returns the sum over i in [0, n) of
//   float64(x[ix + i*incX]) * float64(y[iy + i*incY])
// Each float32 element is converted to float64 before multiplying, and the
// sum is accumulated in float64. The main loop handles four elements per
// iteration using two accumulators (SUM and P_SUM).
//
// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
|
||||
TEXT ·DdotInc(SB), NOSPLIT, $0
|
||||
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
|
||||
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
|
||||
MOVQ n+48(FP), LEN // LEN = n
|
||||
PXOR SUM, SUM // SUM = 0
|
||||
CMPQ LEN, $0 // if LEN == 0 { return 0 }
|
||||
JE dot_end
|
||||
|
||||
MOVQ ix+72(FP), INC_X // INC_X = ix (register reused for the stride below)
|
||||
MOVQ iy+80(FP), INC_Y // INC_Y = iy (register reused for the stride below)
|
||||
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
|
||||
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
|
||||
|
||||
MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(float32)
|
||||
SHLQ $2, INC_X
|
||||
MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(float32)
|
||||
SHLQ $2, INC_Y
|
||||
|
||||
MOVQ LEN, TAIL
|
||||
ANDQ $3, TAIL // TAIL = LEN % 4
|
||||
SHRQ $2, LEN // LEN = floor( LEN / 4 )
|
||||
JZ dot_tail // if LEN == 0 { goto dot_tail }
|
||||
|
||||
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
|
||||
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
|
||||
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
|
||||
|
||||
dot_loop: // Loop unrolled 4x do {
|
||||
CVTSS2SD (X_PTR), X2 // X_i = float64(x[i]), one element per register
|
||||
CVTSS2SD (X_PTR)(INC_X*1), X3
|
||||
CVTSS2SD (X_PTR)(INC_X*2), X4
|
||||
CVTSS2SD (X_PTR)(INCx3_X*1), X5
|
||||
|
||||
CVTSS2SD (Y_PTR), X6 // X_j = float64(y[i]), one element per register
|
||||
CVTSS2SD (Y_PTR)(INC_Y*1), X7
|
||||
CVTSS2SD (Y_PTR)(INC_Y*2), X8
|
||||
CVTSS2SD (Y_PTR)(INCx3_Y*1), X9
|
||||
|
||||
MULSD X6, X2 // X_i *= X_j
|
||||
MULSD X7, X3
|
||||
MULSD X8, X4
|
||||
MULSD X9, X5
|
||||
|
||||
ADDSD X2, SUM // SUM += X_i
|
||||
ADDSD X3, P_SUM
|
||||
ADDSD X4, SUM
|
||||
ADDSD X5, P_SUM
|
||||
|
||||
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
|
||||
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
|
||||
|
||||
DECQ LEN
|
||||
JNZ dot_loop // } while --LEN > 0
|
||||
|
||||
ADDSD P_SUM, SUM // SUM += P_SUM
|
||||
CMPQ TAIL, $0 // if TAIL == 0 { return }
|
||||
JE dot_end
|
||||
|
||||
dot_tail: // do {
|
||||
CVTSS2SD (X_PTR), X2 // X2 = float64(x[i])
|
||||
CVTSS2SD (Y_PTR), X3 // X3 = float64(y[i])
|
||||
MULSD X3, X2 // X2 *= X3
|
||||
ADDSD X2, SUM // SUM += X2
|
||||
ADDQ INC_X, X_PTR // X_PTR += INC_X
|
||||
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
|
||||
DECQ TAIL
|
||||
JNZ dot_tail // } while --TAIL > 0
|
||||
|
||||
dot_end:
|
||||
MOVSD SUM, sum+88(FP) // return SUM
|
||||
RET
|
110
vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s
generated
vendored
Normal file
110
vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,110 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// HADDPD is emitted as raw opcode bytes; per the trailing note it encodes
// HADDPD X0, X0, so it only works while SUM is X0.
#define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0
|
||||
|
||||
// Register aliases used by DdotUnitary.
#define X_PTR SI
|
||||
#define Y_PTR DI
|
||||
#define LEN CX
|
||||
#define TAIL BX
|
||||
#define IDX AX
|
||||
#define SUM X0
|
||||
#define P_SUM X1
|
||||
|
||||
// DdotUnitary returns the sum over i in [0, min(len(x), len(y))) of
// float64(x[i]) * float64(y[i]). Elements are converted to float64 before
// multiplying and the result is stored as an 8-byte double (MOVSD below).
// Elements are first processed one at a time until &y[i] is 16-byte aligned,
// then 8 at a time, then 2 at a time, then a final single element.
//
// func DdotUnitary(x, y []float32) (sum float64)
|
||||
TEXT ·DdotUnitary(SB), NOSPLIT, $0
|
||||
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
|
||||
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
|
||||
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
|
||||
CMPQ y_len+32(FP), LEN
|
||||
CMOVQLE y_len+32(FP), LEN
|
||||
PXOR SUM, SUM // SUM = 0
|
||||
CMPQ LEN, $0 // if LEN == 0 { return 0 }
|
||||
JE dot_end
|
||||
|
||||
XORQ IDX, IDX // IDX = 0
|
||||
MOVQ Y_PTR, DX
|
||||
ANDQ $0xF, DX // DX = &y % 16; align on 16-byte boundary
|
||||
JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
|
||||
|
||||
SUBQ $16, DX // DX = (&y % 16) - 16: negative byte distance to the boundary
|
||||
|
||||
dot_align: // Trim first value(s) in unaligned buffer do {
|
||||
CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
|
||||
CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
|
||||
MULSD X3, X2 // X2 *= X3
|
||||
ADDSD X2, SUM // SUM += X2
|
||||
INCQ IDX // IDX++
|
||||
DECQ LEN
|
||||
JZ dot_end // if --LEN == 0 { return }
|
||||
ADDQ $4, DX
|
||||
JNZ dot_align // } while (DX += 4) != 0
|
||||
|
||||
dot_no_trim:
|
||||
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
|
||||
MOVQ LEN, TAIL
|
||||
ANDQ $0x7, TAIL // TAIL = LEN % 8
|
||||
SHRQ $3, LEN // LEN = floor( LEN / 8 )
|
||||
JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
|
||||
|
||||
dot_loop: // Loop unrolled 8x do {
|
||||
CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = float64(x[i:i+2]), two elements per register
|
||||
CVTPS2PD 8(X_PTR)(IDX*4), X3
|
||||
CVTPS2PD 16(X_PTR)(IDX*4), X4
|
||||
CVTPS2PD 24(X_PTR)(IDX*4), X5
|
||||
|
||||
CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = float64(y[i:i+2]), two elements per register
|
||||
CVTPS2PD 8(Y_PTR)(IDX*4), X7
|
||||
CVTPS2PD 16(Y_PTR)(IDX*4), X8
|
||||
CVTPS2PD 24(Y_PTR)(IDX*4), X9
|
||||
|
||||
MULPD X6, X2 // X_i *= X_j
|
||||
MULPD X7, X3
|
||||
MULPD X8, X4
|
||||
MULPD X9, X5
|
||||
|
||||
ADDPD X2, SUM // SUM += X_i
|
||||
ADDPD X3, P_SUM
|
||||
ADDPD X4, SUM
|
||||
ADDPD X5, P_SUM
|
||||
|
||||
ADDQ $8, IDX // IDX += 8
|
||||
DECQ LEN
|
||||
JNZ dot_loop // } while --LEN > 0
|
||||
|
||||
ADDPD P_SUM, SUM // SUM += P_SUM
|
||||
CMPQ TAIL, $0 // if TAIL == 0 { return }
|
||||
JE dot_end
|
||||
|
||||
dot_tail_start: // Reset loop counter for 2-wide tail loop
|
||||
MOVQ TAIL, LEN // LEN = floor( TAIL / 2 )
|
||||
SHRQ $1, LEN
|
||||
JZ dot_tail_one // if LEN == 0 { goto dot_tail_one }
|
||||
|
||||
dot_tail_two: // do {
|
||||
CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = float64(x[i:i+2])
|
||||
CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = float64(y[i:i+2])
|
||||
MULPD X6, X2 // X_i *= X_j
|
||||
ADDPD X2, SUM // SUM += X_i
|
||||
ADDQ $2, IDX // IDX += 2
|
||||
DECQ LEN
|
||||
JNZ dot_tail_two // } while --LEN > 0
|
||||
|
||||
ANDQ $1, TAIL // if TAIL % 2 == 0 { return }
|
||||
JZ dot_end
|
||||
|
||||
dot_tail_one: // One element left
|
||||
CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
|
||||
CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
|
||||
MULSD X3, X2 // X2 *= X3
|
||||
ADDSD X2, SUM // SUM += X2
|
||||
|
||||
dot_end:
|
||||
HADDPD_SUM_SUM // SUM = \sum{ SUM[i] }
|
||||
MOVSD SUM, sum+48(FP) // return SUM
|
||||
RET
|
6
vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go
generated
vendored
Normal file
6
vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go
generated
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package f32 provides float32 vector primitives.
|
||||
package f32 // import "gonum.org/v1/gonum/internal/asm/f32"
|
85
vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s
generated
vendored
Normal file
85
vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,85 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// Register aliases used by DotInc.
#define X_PTR SI
|
||||
#define Y_PTR DI
|
||||
#define LEN CX
|
||||
#define TAIL BX
|
||||
#define INC_X R8
|
||||
#define INCx3_X R10
|
||||
#define INC_Y R9
|
||||
#define INCx3_Y R11
|
||||
#define SUM X0
|
||||
#define P_SUM X1
|
||||
|
||||
// DotInc returns the sum over i in [0, n) of
//   x[ix + i*incX] * y[iy + i*incY]
// accumulated in float32. The main loop handles four elements per iteration
// using two accumulators (SUM and P_SUM); the n % 4 leftover elements are
// handled one at a time in the tail loop.
//
// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
|
||||
TEXT ·DotInc(SB), NOSPLIT, $0
|
||||
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
|
||||
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
|
||||
PXOR SUM, SUM // SUM = 0
|
||||
MOVQ n+48(FP), LEN // LEN = n
|
||||
CMPQ LEN, $0 // if LEN == 0 { return 0 }
|
||||
JE dot_end
|
||||
|
||||
MOVQ ix+72(FP), INC_X // INC_X = ix (register reused for the stride below)
|
||||
MOVQ iy+80(FP), INC_Y // INC_Y = iy (register reused for the stride below)
|
||||
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
|
||||
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
|
||||
|
||||
MOVQ incX+56(FP), INC_X // INC_X := incX * sizeof(float32)
|
||||
SHLQ $2, INC_X
|
||||
MOVQ incY+64(FP), INC_Y // INC_Y := incY * sizeof(float32)
|
||||
SHLQ $2, INC_Y
|
||||
|
||||
MOVQ LEN, TAIL
|
||||
ANDQ $0x3, TAIL // TAIL = LEN % 4
|
||||
SHRQ $2, LEN // LEN = floor( LEN / 4 )
|
||||
JZ dot_tail // if LEN == 0 { goto dot_tail }
|
||||
|
||||
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
|
||||
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
|
||||
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
|
||||
|
||||
dot_loop: // Loop unrolled 4x do {
|
||||
MOVSS (X_PTR), X2 // X_i = x[i], one element per register
|
||||
MOVSS (X_PTR)(INC_X*1), X3
|
||||
MOVSS (X_PTR)(INC_X*2), X4
|
||||
MOVSS (X_PTR)(INCx3_X*1), X5
|
||||
|
||||
MULSS (Y_PTR), X2 // X_i *= y[i]
|
||||
MULSS (Y_PTR)(INC_Y*1), X3
|
||||
MULSS (Y_PTR)(INC_Y*2), X4
|
||||
MULSS (Y_PTR)(INCx3_Y*1), X5
|
||||
|
||||
ADDSS X2, SUM // SUM += X_i
|
||||
ADDSS X3, P_SUM
|
||||
ADDSS X4, SUM
|
||||
ADDSS X5, P_SUM
|
||||
|
||||
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
|
||||
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
|
||||
|
||||
DECQ LEN
|
||||
JNZ dot_loop // } while --LEN > 0
|
||||
|
||||
ADDSS P_SUM, SUM // SUM += P_SUM
|
||||
CMPQ TAIL, $0 // if TAIL == 0 { return }
|
||||
JE dot_end
|
||||
|
||||
dot_tail: // do {
|
||||
MOVSS (X_PTR), X2 // X2 = x[i]
|
||||
MULSS (Y_PTR), X2 // X2 *= y[i]
|
||||
ADDSS X2, SUM // SUM += X2
|
||||
ADDQ INC_X, X_PTR // X_PTR += INC_X
|
||||
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
|
||||
DECQ TAIL
|
||||
JNZ dot_tail // } while --TAIL > 0
|
||||
|
||||
dot_end:
|
||||
MOVSS SUM, sum+88(FP) // return SUM
|
||||
RET
|
106
vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s
generated
vendored
Normal file
106
vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// HADDPS is emitted as raw opcode bytes; per the trailing note it encodes
// HADDPS X0, X0, so it only works while SUM is X0.
#define HADDPS_SUM_SUM LONG $0xC07C0FF2 // @ HADDPS X0, X0
|
||||
|
||||
// Register aliases used by DotUnitary.
#define X_PTR SI
|
||||
#define Y_PTR DI
|
||||
#define LEN CX
|
||||
#define TAIL BX
|
||||
#define IDX AX
|
||||
#define SUM X0
|
||||
#define P_SUM X1
|
||||
|
||||
// DotUnitary returns the sum over i in [0, min(len(x), len(y))) of
// x[i] * y[i], accumulated in float32. Elements are first processed one at a
// time until &y[i] is 16-byte aligned (MULPS reads y as a memory operand),
// then 16 at a time, then 4 at a time, and finally one at a time.
//
// func DotUnitary(x, y []float32) (sum float32)
|
||||
TEXT ·DotUnitary(SB), NOSPLIT, $0
|
||||
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
|
||||
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
|
||||
PXOR SUM, SUM // SUM = 0
|
||||
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
|
||||
CMPQ y_len+32(FP), LEN
|
||||
CMOVQLE y_len+32(FP), LEN
|
||||
CMPQ LEN, $0 // if LEN == 0 { return 0 }
|
||||
JE dot_end
|
||||
|
||||
XORQ IDX, IDX // IDX = 0
|
||||
MOVQ Y_PTR, DX
|
||||
ANDQ $0xF, DX // DX = &y % 16; align on 16-byte boundary for MULPS
|
||||
JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
|
||||
SUBQ $16, DX // DX = (&y % 16) - 16: negative byte distance to the boundary
|
||||
|
||||
dot_align: // Trim first value(s) in unaligned buffer do {
|
||||
MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
|
||||
MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
|
||||
ADDSS X2, SUM // SUM += X2
|
||||
INCQ IDX // IDX++
|
||||
DECQ LEN
|
||||
JZ dot_end // if --LEN == 0 { return }
|
||||
ADDQ $4, DX
|
||||
JNZ dot_align // } while (DX += 4) != 0
|
||||
|
||||
dot_no_trim:
|
||||
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
|
||||
MOVQ LEN, TAIL
|
||||
ANDQ $0xF, TAIL // TAIL = LEN % 16
|
||||
SHRQ $4, LEN // LEN = floor( LEN / 16 )
|
||||
JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
|
||||
|
||||
dot_loop: // Loop unrolled 16x do {
|
||||
MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4], four elements per register
|
||||
MOVUPS 16(X_PTR)(IDX*4), X3
|
||||
MOVUPS 32(X_PTR)(IDX*4), X4
|
||||
MOVUPS 48(X_PTR)(IDX*4), X5
|
||||
|
||||
MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
|
||||
MULPS 16(Y_PTR)(IDX*4), X3
|
||||
MULPS 32(Y_PTR)(IDX*4), X4
|
||||
MULPS 48(Y_PTR)(IDX*4), X5
|
||||
|
||||
ADDPS X2, SUM // SUM += X_i
|
||||
ADDPS X3, P_SUM
|
||||
ADDPS X4, SUM
|
||||
ADDPS X5, P_SUM
|
||||
|
||||
ADDQ $16, IDX // IDX += 16
|
||||
DECQ LEN
|
||||
JNZ dot_loop // } while --LEN > 0
|
||||
|
||||
ADDPS P_SUM, SUM // SUM += P_SUM
|
||||
CMPQ TAIL, $0 // if TAIL == 0 { return }
|
||||
JE dot_end
|
||||
|
||||
dot_tail4_start: // Reset loop counter for 4-wide tail loop
|
||||
MOVQ TAIL, LEN // LEN = floor( TAIL / 4 )
|
||||
SHRQ $2, LEN
|
||||
JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
|
||||
|
||||
dot_tail4_loop: // Loop unrolled 4x do {
|
||||
MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4]
|
||||
MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
|
||||
ADDPS X2, SUM // SUM += X_i
|
||||
ADDQ $4, IDX // i += 4
|
||||
DECQ LEN
|
||||
JNZ dot_tail4_loop // } while --LEN > 0
|
||||
|
||||
dot_tail_start: // Reset loop counter for 1-wide tail loop
|
||||
ANDQ $3, TAIL // TAIL = TAIL % 4
|
||||
JZ dot_end // if TAIL == 0 { return }
|
||||
|
||||
dot_tail: // do {
|
||||
MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
|
||||
MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
|
||||
ADDSS X2, SUM // SUM += X2
|
||||
INCQ IDX // IDX++
|
||||
DECQ TAIL
|
||||
JNZ dot_tail // } while --TAIL > 0
|
||||
|
||||
dot_end:
|
||||
HADDPS_SUM_SUM // SUM = \sum{ SUM[i] }; two horizontal adds fold all four lanes
|
||||
HADDPS_SUM_SUM
|
||||
MOVSS SUM, sum+48(FP) // return SUM
|
||||
RET
|
15
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go
generated
vendored
Normal file
15
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go
generated
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !noasm,!appengine,!safe
|
||||
|
||||
package f32
|
||||
|
||||
// Ger performs the rank-one operation
|
||||
// A += alpha * x * y^T
|
||||
// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
//
// This is a body-less declaration; the implementation is supplied outside
// this file (NOTE(review): presumably by ge_amd64.s — confirm).
// NOTE(review): incX and incY look like the element strides of x and y, and
// lda the row stride of a — confirm against the implementation.
|
||||
func Ger(m, n uintptr, alpha float32,
|
||||
x []float32, incX uintptr,
|
||||
y []float32, incY uintptr,
|
||||
a []float32, lda uintptr)
|
757
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s
generated
vendored
Normal file
757
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s
generated
vendored
Normal file
@@ -0,0 +1,757 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//+build !noasm,!appengine,!safe
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// SIZE scales element offsets into byte offsets in the macros of this file
// (for example 4*SIZE(Y_PTR)).
// NOTE(review): BITSIZE and KERNELSIZE are not referenced in the visible
// part of this file; presumably log2(SIZE) and a kernel-size shift — confirm.
#define SIZE 4
|
||||
#define BITSIZE 2
|
||||
#define KERNELSIZE 3
|
||||
|
||||
// Matrix dimensions: stack argument slots and the registers named for them.
#define M_DIM m+0(FP)
|
||||
#define M CX
|
||||
#define N_DIM n+8(FP)
|
||||
#define N BX
|
||||
|
||||
// Scratch general-purpose registers.
#define TMP1 R14
|
||||
#define TMP2 R15
|
||||
|
||||
// Pointers into x, y and A. Y names the y_base stack argument slot.
#define X_PTR SI
|
||||
#define Y y_base+56(FP)
|
||||
#define Y_PTR DX
|
||||
#define A_ROW AX
|
||||
#define A_PTR DI
|
||||
|
||||
// Offset registers for x: the *_INC macros index with INC_X*1, INC_X*2 and INC3_X*1.
#define INC_X R8
|
||||
#define INC3_X R9
|
||||
|
||||
// Offset registers for y, used the same way as the x pair.
#define INC_Y R10
|
||||
#define INC3_Y R11
|
||||
|
||||
// Row offset registers for A: the STORE_* macros index with LDA*1, LDA*2 and LDA3*1.
#define LDA R12
|
||||
#define LDA3 R13
|
||||
|
||||
// ALPHA lives in X0; ALPHA_SPILL is the stack slot STORE_4x8 saves it to
// while it uses X0 as a temporary.
#define ALPHA X0
|
||||
#define ALPHA_SPILL al-16(SP)
|
||||
|
||||
// LOAD_ALPHA loads the alpha argument and broadcasts it to all four lanes of ALPHA.
#define LOAD_ALPHA \
|
||||
MOVSS alpha+16(FP), ALPHA \
|
||||
SHUFPS $0, ALPHA, ALPHA
|
||||
|
||||
// LOAD_SCALED4 loads four contiguous x values from X_PTR and leaves
// X1..X4 = alpha*x[0] .. alpha*x[3], each broadcast across all four lanes.
// It also prefetches the data 16 elements ahead of X_PTR.
#define LOAD_SCALED4 \
|
||||
PREFETCHNTA 16*SIZE(X_PTR) \
|
||||
MOVDDUP (X_PTR), X1 \
|
||||
MOVDDUP 2*SIZE(X_PTR), X3 \
|
||||
MOVSHDUP X1, X2 \
|
||||
MOVSHDUP X3, X4 \
|
||||
MOVSLDUP X1, X1 \
|
||||
MOVSLDUP X3, X3 \
|
||||
MULPS ALPHA, X1 \
|
||||
MULPS ALPHA, X2 \
|
||||
MULPS ALPHA, X3 \
|
||||
MULPS ALPHA, X4
|
||||
|
||||
// LOAD_SCALED2 loads two contiguous x values from X_PTR and leaves
// X1 = alpha*x[0] and X2 = alpha*x[1], each broadcast across all four lanes.
#define LOAD_SCALED2 \
|
||||
MOVDDUP (X_PTR), X1 \
|
||||
MOVSHDUP X1, X2 \
|
||||
MOVSLDUP X1, X1 \
|
||||
MULPS ALPHA, X1 \
|
||||
MULPS ALPHA, X2
|
||||
|
||||
// LOAD_SCALED1 loads one x value from X_PTR and leaves X1 = alpha*x[0]
// broadcast across all four lanes.
#define LOAD_SCALED1 \
|
||||
MOVSS (X_PTR), X1 \
|
||||
SHUFPS $0, X1, X1 \
|
||||
MULPS ALPHA, X1
|
||||
|
||||
// LOAD_SCALED4_INC is the strided form of LOAD_SCALED4: it loads the x values
// at offsets 0, INC_X, 2*INC_X and INC3_X from X_PTR and leaves X1..X4 holding
// alpha times each value, broadcast across all four lanes. It also prefetches
// the data at X_PTR + 8*INC_X.
#define LOAD_SCALED4_INC \
|
||||
PREFETCHNTA (X_PTR)(INC_X*8) \
|
||||
MOVSS (X_PTR), X1 \
|
||||
MOVSS (X_PTR)(INC_X*1), X2 \
|
||||
MOVSS (X_PTR)(INC_X*2), X3 \
|
||||
MOVSS (X_PTR)(INC3_X*1), X4 \
|
||||
SHUFPS $0, X1, X1 \
|
||||
SHUFPS $0, X2, X2 \
|
||||
SHUFPS $0, X3, X3 \
|
||||
SHUFPS $0, X4, X4 \
|
||||
MULPS ALPHA, X1 \
|
||||
MULPS ALPHA, X2 \
|
||||
MULPS ALPHA, X3 \
|
||||
MULPS ALPHA, X4
|
||||
|
||||
// LOAD_SCALED2_INC is the strided form of LOAD_SCALED2: it loads the x values
// at offsets 0 and INC_X from X_PTR and leaves X1 and X2 holding alpha times
// each value, broadcast across all four lanes.
#define LOAD_SCALED2_INC \
|
||||
MOVSS (X_PTR), X1 \
|
||||
MOVSS (X_PTR)(INC_X*1), X2 \
|
||||
SHUFPS $0, X1, X1 \
|
||||
SHUFPS $0, X2, X2 \
|
||||
MULPS ALPHA, X1 \
|
||||
MULPS ALPHA, X2
|
||||
|
||||
// KERNEL_LOAD8 loads eight contiguous y values from Y_PTR: X5 = y[0:4], X6 = y[4:8].
#define KERNEL_LOAD8 \
|
||||
MOVUPS (Y_PTR), X5 \
|
||||
MOVUPS 4*SIZE(Y_PTR), X6
|
||||
|
||||
// KERNEL_LOAD8_INC gathers eight strided y values into X5 (first four) and
// X6 (last four), using X7..X9 as scratch. As a side effect it advances Y_PTR
// by 4*INC_Y between the two groups of four.
#define KERNEL_LOAD8_INC \
|
||||
MOVSS (Y_PTR), X5 \
|
||||
MOVSS (Y_PTR)(INC_Y*1), X6 \
|
||||
MOVSS (Y_PTR)(INC_Y*2), X7 \
|
||||
MOVSS (Y_PTR)(INC3_Y*1), X8 \
|
||||
UNPCKLPS X6, X5 \
|
||||
UNPCKLPS X8, X7 \
|
||||
MOVLHPS X7, X5 \
|
||||
LEAQ (Y_PTR)(INC_Y*4), Y_PTR \
|
||||
MOVSS (Y_PTR), X6 \
|
||||
MOVSS (Y_PTR)(INC_Y*1), X7 \
|
||||
MOVSS (Y_PTR)(INC_Y*2), X8 \
|
||||
MOVSS (Y_PTR)(INC3_Y*1), X9 \
|
||||
UNPCKLPS X7, X6 \
|
||||
UNPCKLPS X9, X8 \
|
||||
MOVLHPS X8, X6
|
||||
|
||||
// KERNEL_LOAD4 loads four contiguous y values from Y_PTR into X5.
#define KERNEL_LOAD4 \
|
||||
MOVUPS (Y_PTR), X5
|
||||
|
||||
// KERNEL_LOAD4_INC gathers the four strided y values at offsets 0, INC_Y,
// 2*INC_Y and INC3_Y from Y_PTR into X5, using X6..X8 as scratch.
// Y_PTR is left unchanged.
#define KERNEL_LOAD4_INC \
|
||||
MOVSS (Y_PTR), X5 \
|
||||
MOVSS (Y_PTR)(INC_Y*1), X6 \
|
||||
MOVSS (Y_PTR)(INC_Y*2), X7 \
|
||||
MOVSS (Y_PTR)(INC3_Y*1), X8 \
|
||||
UNPCKLPS X6, X5 \
|
||||
UNPCKLPS X8, X7 \
|
||||
MOVLHPS X7, X5
|
||||
|
||||
// KERNEL_LOAD2 loads two contiguous y values (8 bytes) from Y_PTR into the
// low half of X5.
#define KERNEL_LOAD2 \
|
||||
MOVSD (Y_PTR), X5
|
||||
|
||||
// KERNEL_LOAD2_INC gathers the two strided y values at offsets 0 and INC_Y
// from Y_PTR into the low two lanes of X5, using X6 as scratch.
#define KERNEL_LOAD2_INC \
|
||||
MOVSS (Y_PTR), X5 \
|
||||
MOVSS (Y_PTR)(INC_Y*1), X6 \
|
||||
UNPCKLPS X6, X5
|
||||
|
||||
// KERNEL_4x8 multiplies the eight y values in X5,X6 by each of the four
// scaled x broadcasts in X1..X4. Results: X5,X6 (by X1), X7,X8 (by X2),
// X9,X10 (by X3) and X11,X12 (by X4).
#define KERNEL_4x8 \
|
||||
MOVUPS X5, X7 \
|
||||
MOVUPS X6, X8 \
|
||||
MOVUPS X5, X9 \
|
||||
MOVUPS X6, X10 \
|
||||
MOVUPS X5, X11 \
|
||||
MOVUPS X6, X12 \
|
||||
MULPS X1, X5 \
|
||||
MULPS X1, X6 \
|
||||
MULPS X2, X7 \
|
||||
MULPS X2, X8 \
|
||||
MULPS X3, X9 \
|
||||
MULPS X3, X10 \
|
||||
MULPS X4, X11 \
|
||||
MULPS X4, X12
|
||||
|
||||
// STORE_4x8 adds the 4x8 block of A at A_PTR (rows at offsets 0, LDA, 2*LDA
// and LDA3) into X5..X12, writes the sums back to A, and advances A_PTR by
// eight elements. X0 (ALPHA) is used as a load temporary alongside X13..X15,
// so ALPHA is spilled to ALPHA_SPILL first and restored before the advance.
#define STORE_4x8 \
|
||||
MOVUPS ALPHA, ALPHA_SPILL \
|
||||
MOVUPS (A_PTR), X13 \
|
||||
ADDPS X13, X5 \
|
||||
MOVUPS 4*SIZE(A_PTR), X14 \
|
||||
ADDPS X14, X6 \
|
||||
MOVUPS (A_PTR)(LDA*1), X15 \
|
||||
ADDPS X15, X7 \
|
||||
MOVUPS 4*SIZE(A_PTR)(LDA*1), X0 \
|
||||
ADDPS X0, X8 \
|
||||
MOVUPS (A_PTR)(LDA*2), X13 \
|
||||
ADDPS X13, X9 \
|
||||
MOVUPS 4*SIZE(A_PTR)(LDA*2), X14 \
|
||||
ADDPS X14, X10 \
|
||||
MOVUPS (A_PTR)(LDA3*1), X15 \
|
||||
ADDPS X15, X11 \
|
||||
MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0 \
|
||||
ADDPS X0, X12 \
|
||||
MOVUPS X5, (A_PTR) \
|
||||
MOVUPS X6, 4*SIZE(A_PTR) \
|
||||
MOVUPS X7, (A_PTR)(LDA*1) \
|
||||
MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
|
||||
MOVUPS X9, (A_PTR)(LDA*2) \
|
||||
MOVUPS X10, 4*SIZE(A_PTR)(LDA*2) \
|
||||
MOVUPS X11, (A_PTR)(LDA3*1) \
|
||||
MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \
|
||||
MOVUPS ALPHA_SPILL, ALPHA \
|
||||
ADDQ $8*SIZE, A_PTR
|
||||
|
||||
// KERNEL_4x4 computes the products for a 4-row by 4-column tile.
// X5 (four y values) is copied into X6, X7 and X8; X5..X8 are then multiplied
// by X1..X4 respectively, giving one register per row.
#define KERNEL_4x4 \
	MOVUPS X5, X6 \
	MOVUPS X5, X7 \
	MOVUPS X5, X8 \
	MULPS X1, X5 \
	MULPS X2, X6 \
	MULPS X3, X7 \
	MULPS X4, X8
|
||||
|
||||
// STORE_4x4 accumulates the 4x4 tile held in X5..X8 into A.
// Four floats are loaded from each of the rows at A_PTR, A_PTR+LDA,
// A_PTR+2*LDA and A_PTR+LDA3, added to X5..X8 and stored back; A_PTR is then
// advanced by four elements. X13, X14 and X15 are used as scratch.
#define STORE_4x4 \
	MOVUPS (A_PTR), X13 \
	ADDPS X13, X5 \
	MOVUPS (A_PTR)(LDA*1), X14 \
	ADDPS X14, X6 \
	MOVUPS (A_PTR)(LDA*2), X15 \
	ADDPS X15, X7 \
	MOVUPS (A_PTR)(LDA3*1), X13 \
	ADDPS X13, X8 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, (A_PTR)(LDA*1) \
	MOVUPS X7, (A_PTR)(LDA*2) \
	MOVUPS X8, (A_PTR)(LDA3*1) \
	ADDQ $4*SIZE, A_PTR
|
||||
|
||||
// KERNEL_4x2 computes the products for a 4-row by 2-column tile.
// The instruction sequence is the same as KERNEL_4x4; only the low two lanes
// of X5..X8 are written back by STORE_4x2.
#define KERNEL_4x2 \
	MOVUPS X5, X6 \
	MOVUPS X5, X7 \
	MOVUPS X5, X8 \
	MULPS X1, X5 \
	MULPS X2, X6 \
	MULPS X3, X7 \
	MULPS X4, X8
|
||||
|
||||
// STORE_4x2 accumulates the 4x2 tile held in the low halves of X5..X8 into A.
// Two floats (64 bits) are loaded from each of the four rows into X9..X12,
// added to X5..X8, and the low 64 bits of each sum are stored back; A_PTR is
// then advanced by two elements.
#define STORE_4x2 \
	MOVSD (A_PTR), X9 \
	ADDPS X9, X5 \
	MOVSD (A_PTR)(LDA*1), X10 \
	ADDPS X10, X6 \
	MOVSD (A_PTR)(LDA*2), X11 \
	ADDPS X11, X7 \
	MOVSD (A_PTR)(LDA3*1), X12 \
	ADDPS X12, X8 \
	MOVSD X5, (A_PTR) \
	MOVSD X6, (A_PTR)(LDA*1) \
	MOVSD X7, (A_PTR)(LDA*2) \
	MOVSD X8, (A_PTR)(LDA3*1) \
	ADDQ $2*SIZE, A_PTR
|
||||
|
||||
// KERNEL_4x1 computes the products for a 4-row by 1-column tile.
// Unlike the wider kernels it loads its own operand: the scalar at (Y_PTR) is
// read into X5, copied to X6..X8, and X5..X8 are multiplied (scalar MULSS) by
// X1..X4 respectively.
#define KERNEL_4x1 \
	MOVSS (Y_PTR), X5 \
	MOVSS X5, X6 \
	MOVSS X5, X7 \
	MOVSS X5, X8 \
	MULSS X1, X5 \
	MULSS X2, X6 \
	MULSS X3, X7 \
	MULSS X4, X8
|
||||
|
||||
// STORE_4x1 accumulates the 4x1 tile held in the low lanes of X5..X8 into A.
// One float from each of the four rows is added directly from memory, the
// sums are stored back, and A_PTR is advanced by one element.
#define STORE_4x1 \
	ADDSS (A_PTR), X5 \
	ADDSS (A_PTR)(LDA*1), X6 \
	ADDSS (A_PTR)(LDA*2), X7 \
	ADDSS (A_PTR)(LDA3*1), X8 \
	MOVSS X5, (A_PTR) \
	MOVSS X6, (A_PTR)(LDA*1) \
	MOVSS X7, (A_PTR)(LDA*2) \
	MOVSS X8, (A_PTR)(LDA3*1) \
	ADDQ $SIZE, A_PTR
|
||||
|
||||
// KERNEL_2x8 computes the products for a 2-row by 8-column tile.
// X5/X6 (eight y values) are copied into X7/X8; the first pair is multiplied
// by X1 and the second pair by X2.
#define KERNEL_2x8 \
	MOVUPS X5, X7 \
	MOVUPS X6, X8 \
	MULPS X1, X5 \
	MULPS X1, X6 \
	MULPS X2, X7 \
	MULPS X2, X8
|
||||
|
||||
// STORE_2x8 accumulates the 2x8 tile held in X5..X8 into A.
// Both 4-float halves of the rows at A_PTR and A_PTR+LDA are loaded into
// X9..X12, added to X5..X8 and stored back; A_PTR is then advanced by eight
// elements.
#define STORE_2x8 \
	MOVUPS (A_PTR), X9 \
	ADDPS X9, X5 \
	MOVUPS 4*SIZE(A_PTR), X10 \
	ADDPS X10, X6 \
	MOVUPS (A_PTR)(LDA*1), X11 \
	ADDPS X11, X7 \
	MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \
	ADDPS X12, X8 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, 4*SIZE(A_PTR) \
	MOVUPS X7, (A_PTR)(LDA*1) \
	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
	ADDQ $8*SIZE, A_PTR
|
||||
|
||||
// KERNEL_2x4 computes the products for a 2-row by 4-column tile.
// X5 (four y values) is copied into X6; X5 is multiplied by X1 and X6 by X2.
#define KERNEL_2x4 \
	MOVUPS X5, X6 \
	MULPS X1, X5 \
	MULPS X2, X6
|
||||
|
||||
// STORE_2x4 accumulates the 2x4 tile held in X5 and X6 into A.
// Four floats from the rows at A_PTR and A_PTR+LDA are loaded into X9 and
// X11, added to X5 and X6 and stored back; A_PTR is then advanced by four
// elements.
#define STORE_2x4 \
	MOVUPS (A_PTR), X9 \
	ADDPS X9, X5 \
	MOVUPS (A_PTR)(LDA*1), X11 \
	ADDPS X11, X6 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, (A_PTR)(LDA*1) \
	ADDQ $4*SIZE, A_PTR
|
||||
|
||||
// KERNEL_2x2 computes the products for a 2-row by 2-column tile.
// The low 64 bits of X5 (two y values) are copied into X6; X5 is multiplied
// by X1 and X6 by X2. Only the low two lanes are written back by STORE_2x2.
#define KERNEL_2x2 \
	MOVSD X5, X6 \
	MULPS X1, X5 \
	MULPS X2, X6
|
||||
|
||||
// STORE_2x2 accumulates the 2x2 tile held in the low halves of X5 and X6
// into A. Two floats from the rows at A_PTR and A_PTR+LDA are loaded into X7
// and X8, added, and the low 64 bits of each sum are stored back; A_PTR is
// then advanced by two elements.
#define STORE_2x2 \
	MOVSD (A_PTR), X7 \
	ADDPS X7, X5 \
	MOVSD (A_PTR)(LDA*1), X8 \
	ADDPS X8, X6 \
	MOVSD X5, (A_PTR) \
	MOVSD X6, (A_PTR)(LDA*1) \
	ADDQ $2*SIZE, A_PTR
|
||||
|
||||
// KERNEL_2x1 computes the products for a 2-row by 1-column tile.
// It loads the scalar at (Y_PTR) into X5 itself, copies it to X6, and
// multiplies X5 by X1 and X6 by X2 (scalar MULSS).
#define KERNEL_2x1 \
	MOVSS (Y_PTR), X5 \
	MOVSS X5, X6 \
	MULSS X1, X5 \
	MULSS X2, X6
|
||||
|
||||
// STORE_2x1 accumulates the 2x1 tile held in the low lanes of X5 and X6 into
// A: one float from each of the rows at A_PTR and A_PTR+LDA is added from
// memory, the sums are stored back, and A_PTR is advanced by one element.
#define STORE_2x1 \
	ADDSS (A_PTR), X5 \
	ADDSS (A_PTR)(LDA*1), X6 \
	MOVSS X5, (A_PTR) \
	MOVSS X6, (A_PTR)(LDA*1) \
	ADDQ $SIZE, A_PTR
|
||||
|
||||
// KERNEL_1x8 computes the products for a 1-row by 8-column tile: X5 and X6
// (eight y values) are both multiplied in place by X1.
#define KERNEL_1x8 \
	MULPS X1, X5 \
	MULPS X1, X6
|
||||
|
||||
// STORE_1x8 accumulates the 1x8 tile held in X5 and X6 into A: both 4-float
// halves at A_PTR are loaded into X7 and X8, added and stored back, and A_PTR
// is advanced by eight elements.
#define STORE_1x8 \
	MOVUPS (A_PTR), X7 \
	ADDPS X7, X5 \
	MOVUPS 4*SIZE(A_PTR), X8 \
	ADDPS X8, X6 \
	MOVUPS X5, (A_PTR) \
	MOVUPS X6, 4*SIZE(A_PTR) \
	ADDQ $8*SIZE, A_PTR
|
||||
|
||||
// KERNEL_1x4 computes the products for a 1-row by 4-column tile: X5 (four y
// values) is multiplied in place by X1.
// NOTE(review): the second MULPS also scales X6, which KERNEL_LOAD4 does not
// load and STORE_1x4 does not write back; it looks redundant but is kept
// unchanged - confirm before removing.
#define KERNEL_1x4 \
	MULPS X1, X5 \
	MULPS X1, X6
|
||||
|
||||
// STORE_1x4 accumulates the 1x4 tile held in X5 into A: four floats at A_PTR
// are loaded into X7, added to X5 and stored back, and A_PTR is advanced by
// four elements.
#define STORE_1x4 \
	MOVUPS (A_PTR), X7 \
	ADDPS X7, X5 \
	MOVUPS X5, (A_PTR) \
	ADDQ $4*SIZE, A_PTR
|
||||
|
||||
// KERNEL_1x2 computes the products for a 1-row by 2-column tile: X5 is
// multiplied in place by X1. Only the low two lanes are written back by
// STORE_1x2.
#define KERNEL_1x2 \
	MULPS X1, X5
|
||||
|
||||
// STORE_1x2 accumulates the 1x2 tile held in the low half of X5 into A: two
// floats at A_PTR are loaded into X6, added to X5, the low 64 bits of the sum
// are stored back, and A_PTR is advanced by two elements.
#define STORE_1x2 \
	MOVSD (A_PTR), X6 \
	ADDPS X6, X5 \
	MOVSD X5, (A_PTR) \
	ADDQ $2*SIZE, A_PTR
|
||||
|
||||
// KERNEL_1x1 computes a single product: the scalar at (Y_PTR) is loaded into
// X5 and multiplied by X1 (scalar MULSS).
#define KERNEL_1x1 \
	MOVSS (Y_PTR), X5 \
	MULSS X1, X5
|
||||
|
||||
// STORE_1x1 accumulates the single product in the low lane of X5 into A: the
// float at A_PTR is added from memory, the sum is stored back, and A_PTR is
// advanced by one element.
#define STORE_1x1 \
	ADDSS (A_PTR), X5 \
	MOVSS X5, (A_PTR) \
	ADDQ $SIZE, A_PTR
|
||||
|
||||
// func Ger(m, n uintptr, alpha float32,
//	x []float32, incX uintptr,
//	y []float32, incY uintptr,
//	a []float32, lda uintptr)
//
// Ger is the amd64 assembly counterpart of the portable Ger, which documents
// the operation as the rank-one update A += alpha * x * y^T on an m×n matrix.
//
// Structure:
//   - Returns immediately when m == 0 or n == 0.
//   - When incX == 1 and incY == 1 the contiguous path (labels r4/r2/r1) is
//     used; otherwise control jumps to inc, which uses the strided *_INC loads.
//   - Rows are processed in groups of four (M>>2 iterations), then a group of
//     two and a single row, selected by bits 1 and 0 of M_DIM.
//   - Within each row group, columns are processed in chunks counted by
//     N>>KERNELSIZE (the 8-wide kernels), followed by optional 4-, 2- and
//     1-wide tails selected by bits 2, 1 and 0 of N_DIM.
//   - After each row group Y_PTR is rewound from Y and A_ROW/A_PTR move down by
//     the number of rows just handled.
TEXT ·Ger(SB), 0, $16-120
	MOVQ M_DIM, M
	MOVQ N_DIM, N
	CMPQ M, $0
	JE end
	CMPQ N, $0
	JE end

	LOAD_ALPHA

	MOVQ x_base+24(FP), X_PTR
	MOVQ y_base+56(FP), Y_PTR
	MOVQ a_base+88(FP), A_ROW
	MOVQ A_ROW, A_PTR
	MOVQ lda+112(FP), LDA // LDA = LDA * sizeof(float32)
	SHLQ $BITSIZE, LDA
	LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3

	CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
	JNE inc
	CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path)
	JNE inc

	SHRQ $2, M
	JZ r2

r4:

	// LOAD 4
	LOAD_SCALED4

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ r4c4

r4c8:
	// 4x8 KERNEL
	KERNEL_LOAD8
	KERNEL_4x8
	STORE_4x8

	ADDQ $8*SIZE, Y_PTR

	DECQ N
	JNZ r4c8

r4c4:
	TESTQ $4, N_DIM
	JZ r4c2

	// 4x4 KERNEL
	KERNEL_LOAD4
	KERNEL_4x4
	STORE_4x4

	ADDQ $4*SIZE, Y_PTR

r4c2:
	TESTQ $2, N_DIM
	JZ r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2
	KERNEL_4x2
	STORE_4x2

	ADDQ $2*SIZE, Y_PTR

r4c1:
	TESTQ $1, N_DIM
	JZ r4end

	// 4x1 KERNEL
	KERNEL_4x1
	STORE_4x1

	ADDQ $SIZE, Y_PTR

r4end:
	// Next group of four rows: advance x, rewind y, move A down four rows.
	ADDQ $4*SIZE, X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ r4

r2:
	TESTQ $2, M_DIM
	JZ r1

	// LOAD 2
	LOAD_SCALED2

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ r2c4

r2c8:
	// 2x8 KERNEL
	KERNEL_LOAD8
	KERNEL_2x8
	STORE_2x8

	ADDQ $8*SIZE, Y_PTR

	DECQ N
	JNZ r2c8

r2c4:
	TESTQ $4, N_DIM
	JZ r2c2

	// 2x4 KERNEL
	KERNEL_LOAD4
	KERNEL_2x4
	STORE_2x4

	ADDQ $4*SIZE, Y_PTR

r2c2:
	TESTQ $2, N_DIM
	JZ r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2
	KERNEL_2x2
	STORE_2x2

	ADDQ $2*SIZE, Y_PTR

r2c1:
	TESTQ $1, N_DIM
	JZ r2end

	// 2x1 KERNEL
	KERNEL_2x1
	STORE_2x1

	ADDQ $SIZE, Y_PTR

r2end:
	ADDQ $2*SIZE, X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

r1:
	TESTQ $1, M_DIM
	JZ end

	// LOAD 1
	LOAD_SCALED1

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ r1c4

r1c8:
	// 1x8 KERNEL
	KERNEL_LOAD8
	KERNEL_1x8
	STORE_1x8

	ADDQ $8*SIZE, Y_PTR

	DECQ N
	JNZ r1c8

r1c4:
	TESTQ $4, N_DIM
	JZ r1c2

	// 1x4 KERNEL
	KERNEL_LOAD4
	KERNEL_1x4
	STORE_1x4

	ADDQ $4*SIZE, Y_PTR

r1c2:
	TESTQ $2, N_DIM
	JZ r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2
	KERNEL_1x2
	STORE_1x2

	ADDQ $2*SIZE, Y_PTR

r1c1:
	TESTQ $1, N_DIM
	JZ end

	// 1x1 KERNEL
	KERNEL_1x1
	STORE_1x1

end:
	RET

inc: // Algorithm for incX != 1 or incY != 1 ( split loads in kernel )

	MOVQ incX+48(FP), INC_X // INC_X = incX * sizeof(float32)
	SHLQ $BITSIZE, INC_X
	MOVQ incY+80(FP), INC_Y // INC_Y = incY * sizeof(float32)
	SHLQ $BITSIZE, INC_Y
	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3

	// For a negative increment, move X_PTR by -(M-1)*INC_X so iteration
	// starts from the far end of x; for a non-negative one TMP2 stays zero.
	// NOTE(review): INC_X is already scaled to bytes above and the LEAQ
	// scales TMP2 by SIZE again - confirm the resulting start offset.
	XORQ TMP2, TMP2
	MOVQ M, TMP1
	SUBQ $1, TMP1
	IMULQ INC_X, TMP1
	NEGQ TMP1
	CMPQ INC_X, $0
	CMOVQLT TMP1, TMP2
	LEAQ (X_PTR)(TMP2*SIZE), X_PTR

	// Same adjustment for y, using N and INC_Y.
	XORQ TMP2, TMP2
	MOVQ N, TMP1
	SUBQ $1, TMP1
	IMULQ INC_Y, TMP1
	NEGQ TMP1
	CMPQ INC_Y, $0
	CMOVQLT TMP1, TMP2
	LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR

	SHRQ $2, M
	JZ inc_r2

inc_r4:
	// LOAD 4
	LOAD_SCALED4_INC

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ inc_r4c4

inc_r4c8:
	// 4x8 KERNEL
	KERNEL_LOAD8_INC
	KERNEL_4x8
	STORE_4x8

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ inc_r4c8

inc_r4c4:
	TESTQ $4, N_DIM
	JZ inc_r4c2

	// 4x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_4x4
	STORE_4x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

inc_r4c2:
	TESTQ $2, N_DIM
	JZ inc_r4c1

	// 4x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_4x2
	STORE_4x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r4c1:
	TESTQ $1, N_DIM
	JZ inc_r4end

	// 4x1 KERNEL
	KERNEL_4x1
	STORE_4x1

	ADDQ INC_Y, Y_PTR

inc_r4end:
	LEAQ (X_PTR)(INC_X*4), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*4), A_ROW
	MOVQ A_ROW, A_PTR

	DECQ M
	JNZ inc_r4

inc_r2:
	TESTQ $2, M_DIM
	JZ inc_r1

	// LOAD 2
	LOAD_SCALED2_INC

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ inc_r2c4

inc_r2c8:
	// 2x8 KERNEL
	KERNEL_LOAD8_INC
	KERNEL_2x8
	STORE_2x8

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ inc_r2c8

inc_r2c4:
	TESTQ $4, N_DIM
	JZ inc_r2c2

	// 2x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_2x4
	STORE_2x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

inc_r2c2:
	TESTQ $2, N_DIM
	JZ inc_r2c1

	// 2x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_2x2
	STORE_2x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r2c1:
	TESTQ $1, N_DIM
	JZ inc_r2end

	// 2x1 KERNEL
	KERNEL_2x1
	STORE_2x1

	ADDQ INC_Y, Y_PTR

inc_r2end:
	LEAQ (X_PTR)(INC_X*2), X_PTR
	MOVQ Y, Y_PTR
	LEAQ (A_ROW)(LDA*2), A_ROW
	MOVQ A_ROW, A_PTR

inc_r1:
	TESTQ $1, M_DIM
	JZ end

	// LOAD 1
	LOAD_SCALED1

	MOVQ N_DIM, N
	SHRQ $KERNELSIZE, N
	JZ inc_r1c4

inc_r1c8:
	// 1x8 KERNEL
	KERNEL_LOAD8_INC
	KERNEL_1x8
	STORE_1x8

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
	DECQ N
	JNZ inc_r1c8

inc_r1c4:
	TESTQ $4, N_DIM
	JZ inc_r1c2

	// 1x4 KERNEL
	KERNEL_LOAD4_INC
	KERNEL_1x4
	STORE_1x4

	LEAQ (Y_PTR)(INC_Y*4), Y_PTR

inc_r1c2:
	TESTQ $2, N_DIM
	JZ inc_r1c1

	// 1x2 KERNEL
	KERNEL_LOAD2_INC
	KERNEL_1x2
	STORE_1x2

	LEAQ (Y_PTR)(INC_Y*2), Y_PTR

inc_r1c1:
	TESTQ $1, N_DIM
	JZ inc_end

	// 1x1 KERNEL
	KERNEL_1x1
	STORE_1x1

inc_end:
	RET
|
36
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go
generated
vendored
Normal file
36
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go
generated
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
// Copyright ©2017 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 noasm appengine safe
|
||||
|
||||
package f32
|
||||
|
||||
// Ger performs the rank-one operation
|
||||
// A += alpha * x * y^T
|
||||
// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
|
||||
func Ger(m, n uintptr, alpha float32, x []float32, incX uintptr, y []float32, incY uintptr, a []float32, lda uintptr) {
|
||||
|
||||
if incX == 1 && incY == 1 {
|
||||
x = x[:m]
|
||||
y = y[:n]
|
||||
for i, xv := range x {
|
||||
AxpyUnitary(alpha*xv, y, a[uintptr(i)*lda:uintptr(i)*lda+n])
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
var ky, kx uintptr
|
||||
if int(incY) < 0 {
|
||||
ky = uintptr(-int(n-1) * int(incY))
|
||||
}
|
||||
if int(incX) < 0 {
|
||||
kx = uintptr(-int(m-1) * int(incX))
|
||||
}
|
||||
|
||||
ix := kx
|
||||
for i := 0; i < int(m); i++ {
|
||||
AxpyInc(alpha*x[ix], y, a[uintptr(i)*lda:uintptr(i)*lda+n], uintptr(n), uintptr(incY), 1, uintptr(ky), 0)
|
||||
ix += incX
|
||||
}
|
||||
}
|
55
vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go
generated
vendored
Normal file
55
vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go
generated
vendored
Normal file
@@ -0,0 +1,55 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package f32
|
||||
|
||||
// ScalUnitary multiplies every element of x by alpha in place. It is
//
//	for i := range x {
//		x[i] *= alpha
//	}
func ScalUnitary(alpha float32, x []float32) {
	for i, v := range x {
		x[i] = v * alpha
	}
}
|
||||
|
||||
// ScalUnitaryTo writes alpha times each element of x into the matching
// position of dst. It is
//
//	for i, v := range x {
//		dst[i] = alpha * v
//	}
func ScalUnitaryTo(dst []float32, alpha float32, x []float32) {
	for i := range x {
		dst[i] = alpha * x[i]
	}
}
|
||||
|
||||
// ScalInc multiplies n elements of x by alpha in place, starting at index 0
// and stepping by incX. It is
//
//	var ix uintptr
//	for i := 0; i < int(n); i++ {
//		x[ix] *= alpha
//		ix += incX
//	}
func ScalInc(alpha float32, x []float32, n, incX uintptr) {
	pos := uintptr(0)
	for left := int(n); left > 0; left-- {
		x[pos] *= alpha
		pos += incX
	}
}
|
||||
|
||||
// ScalIncTo writes alpha times n strided elements of x into strided
// positions of dst; both walks start at index 0. It is
//
//	var idst, ix uintptr
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha * x[ix]
//		ix += incX
//		idst += incDst
//	}
func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) {
	src, out := uintptr(0), uintptr(0)
	for left := int(n); left > 0; left-- {
		dst[out] = alpha * x[src]
		src += incX
		out += incDst
	}
}
|
68
vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go
generated
vendored
Normal file
68
vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go
generated
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !noasm,!appengine,!safe
|
||||
|
||||
package f32
|
||||
|
||||
// AxpyUnitary adds alpha*x[i] to y[i] for every index of x. It is
//
//	for i, v := range x {
//		y[i] += alpha * v
//	}
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func AxpyUnitary(alpha float32, x, y []float32)
|
||||
|
||||
// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i] for every index of x. It is
//
//	for i, v := range x {
//		dst[i] = alpha*v + y[i]
//	}
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
|
||||
|
||||
// AxpyInc adds alpha times n strided elements of x to strided elements of y.
// ix and iy are the starting indices and incX and incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		y[iy] += alpha * x[ix]
//		ix += incX
//		iy += incY
//	}
//
// The declaration has no Go body; the implementation is supplied by the
// package's amd64 assembly.
func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||
|
||||
// AxpyIncTo stores alpha*x[ix] + y[iy] into dst[idst] for n strided
// elements. idst, ix and iy are the starting indices and incDst, incX and
// incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
|
||||
|
||||
// DdotUnitary returns the dot product of x and y accumulated in float64. It is
//
//	for i, v := range x {
//		sum += float64(y[i]) * float64(v)
//	}
//	return
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func DdotUnitary(x, y []float32) (sum float64)
|
||||
|
||||
// DdotInc returns the dot product of n strided elements of x and y
// accumulated in float64. ix and iy are the starting indices and incX and
// incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += float64(y[iy]) * float64(x[ix])
//		ix += incX
//		iy += incY
//	}
//	return
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
|
||||
|
||||
// DotUnitary returns the float32 dot product of x and y. It is
//
//	for i, v := range x {
//		sum += y[i] * v
//	}
//	return sum
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func DotUnitary(x, y []float32) (sum float32)
|
||||
|
||||
// DotInc returns the float32 dot product of n strided elements of x and y.
// ix and iy are the starting indices and incX and incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * x[ix]
//		ix += incX
//		iy += incY
//	}
//	return sum
//
// The declaration has no Go body; the implementation is supplied outside
// this file for builds matching the file's build constraints.
func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
|
113
vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go
generated
vendored
Normal file
113
vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go
generated
vendored
Normal file
@@ -0,0 +1,113 @@
|
||||
// Copyright ©2016 The Gonum Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 noasm appengine safe
|
||||
|
||||
package f32
|
||||
|
||||
// AxpyUnitary adds alpha*x[i] to y[i] for every index of x. It is
//
//	for i, v := range x {
//		y[i] += alpha * v
//	}
func AxpyUnitary(alpha float32, x, y []float32) {
	for i := range x {
		y[i] += alpha * x[i]
	}
}
|
||||
|
||||
// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i] for every index of x. It is
//
//	for i, v := range x {
//		dst[i] = alpha*v + y[i]
//	}
func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) {
	for i := range x {
		dst[i] = alpha*x[i] + y[i]
	}
}
|
||||
|
||||
// AxpyInc adds alpha times n strided elements of x to strided elements of y.
// ix and iy are the starting indices and incX and incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		y[iy] += alpha * x[ix]
//		ix += incX
//		iy += incY
//	}
func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		y[iy] += alpha * x[ix]
		ix, iy = ix+incX, iy+incY
	}
}
|
||||
|
||||
// AxpyIncTo stores alpha*x[ix] + y[iy] into dst[idst] for n strided
// elements. idst, ix and iy are the starting indices and incDst, incX and
// incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix, iy, idst = ix+incX, iy+incY, idst+incDst
	}
}
|
||||
|
||||
// DotUnitary returns the float32 dot product of x and y. It is
//
//	for i, v := range x {
//		sum += y[i] * v
//	}
//	return sum
func DotUnitary(x, y []float32) (sum float32) {
	for i := range x {
		sum += y[i] * x[i]
	}
	return sum
}
|
||||
|
||||
// DotInc returns the float32 dot product of n strided elements of x and y.
// ix and iy are the starting indices and incX and incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * x[ix]
//		ix += incX
//		iy += incY
//	}
//	return sum
func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) {
	for left := int(n); left > 0; left-- {
		sum += y[iy] * x[ix]
		ix, iy = ix+incX, iy+incY
	}
	return sum
}
|
||||
|
||||
// DdotUnitary returns the dot product of x and y, with every product and
// the running sum computed in float64. It is
//
//	for i, v := range x {
//		sum += float64(y[i]) * float64(v)
//	}
//	return
func DdotUnitary(x, y []float32) (sum float64) {
	for i := range x {
		sum += float64(y[i]) * float64(x[i])
	}
	return sum
}
|
||||
|
||||
// DdotInc returns the dot product of n strided elements of x and y, with
// every product and the running sum computed in float64. ix and iy are the
// starting indices and incX and incY the steps. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += float64(y[iy]) * float64(x[ix])
//		ix += incX
//		iy += incY
//	}
//	return
func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) {
	for left := int(n); left > 0; left-- {
		sum += float64(y[iy]) * float64(x[ix])
		ix, iy = ix+incX, iy+incY
	}
	return sum
}
|
Reference in New Issue
Block a user