1. update clientset, deepcopy using code-generator

2. add a dummy file tools.go to force "go mod vendor" to treat
code-generator as a dependency
3. add a script to update CRD
4. add a README to document CRD updating steps
run go mod tidy
update README
This commit is contained in:
xiangqian
2019-12-03 01:22:21 -08:00
parent 90533183e4
commit 728e29aa7e
1128 changed files with 167705 additions and 5135 deletions

View File

@@ -0,0 +1,134 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw byte encodings of the SSE3 instructions used by the routine below.
// The comment above each macro names the instruction it encodes, written in
// Go assembler operand order (source, destination). The registers are baked
// into the bytes, so each macro works only with the registers in its name.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
//
// MOVDDUP src, dst: dst = { low(src), low(src) }
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }
//
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// AxpyInc performs the strided complex update
//   y[iy + i*incY] += alpha * x[ix + i*incX]   for i = 0 .. n-1.
// n, the strides and the start offsets are used exactly as passed; the slice
// lengths are never read, so the caller is responsible for staying in bounds.
// The main loop processes four elements per iteration and a scalar tail
// handles the remaining n % 4 elements.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SI // SI = &x
MOVQ y_base+40(FP), DI // DI = &y
MOVQ n+64(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+88(FP), R8 // R8 = ix // Load the first index
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ iy+96(FP), R9 // R9 = iy
SHLQ $4, R9 // R9 *= sizeof(complex128)
LEAQ (SI)(R8*1), SI // SI = &(x[ix])
LEAQ (DI)(R9*1), DI // DI = &(y[iy])
MOVQ DI, DX // DX = DI // Separate Read/Write pointers
MOVQ incX+72(FP), R8 // R8 = incX
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ incY+80(FP), R9 // R9 = incY
SHLQ $4, R9 // R9 *= sizeof(complex128)
MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SI)(R8*1), X4
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVUPS (SI), X6
MOVUPS (SI)(R8*1), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
ADDPD (DX)(R9*1), X5
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDPD (DX), X7
ADDPD (DX)(R9*1), X9
MOVUPS X3, (DI) // y[i] = X_(i+1)
MOVUPS X5, (DI)(R9*1)
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
MOVUPS X7, (DI)
MOVUPS X9, (DI)(R9*1)
// Each pointer was already advanced by two elements above; advance by two
// more so that the next iteration starts four elements further on.
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
DECQ BX
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
// CX is non-zero whenever the tail is entered: either n < 4 (and n != 0), or
// the check just above passed. The tail reads and writes y through DI only.
axpyi_tail: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI), X3
MOVUPS X3, (DI) // y[i] = X_(i+1)
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DI // DI = &(DI[incY])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,141 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw byte encodings of the SSE3 instructions used by the routine below.
// The comment above each macro names the instruction it encodes, written in
// Go assembler operand order (source, destination). The registers are baked
// into the bytes, so each macro works only with the registers in its name.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
//
// MOVDDUP src, dst: dst = { low(src), low(src) }
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }
//
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// AxpyIncTo performs the strided complex operation
//   dst[idst + i*incDst] = alpha * x[ix + i*incX] + y[iy + i*incY]
// for i = 0 .. n-1. y is only read; the result is written to dst.
// n, the strides and the start offsets are used exactly as passed; the slice
// lengths are never read, so the caller is responsible for staying in bounds.
// The main loop processes four elements per iteration and a scalar tail
// handles the remaining n % 4 elements.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+56(FP), SI // SI = &x
MOVQ y_base+80(FP), DX // DX = &y
MOVQ n+104(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+128(FP), R8 // R8 = ix // Load the first index
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ iy+136(FP), R9 // R9 = iy
SHLQ $4, R9 // R9 *= sizeof(complex128)
MOVQ idst+32(FP), R10 // R10 = idst
SHLQ $4, R10 // R10 *= sizeof(complex128)
LEAQ (SI)(R8*1), SI // SI = &(x[ix])
LEAQ (DX)(R9*1), DX // DX = &(y[iy])
LEAQ (DI)(R10*1), DI // DI = &(dst[idst])
MOVQ incX+112(FP), R8 // R8 = incX
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ incY+120(FP), R9 // R9 = incY
SHLQ $4, R9 // R9 *= sizeof(complex128)
MOVQ incDst+24(FP), R10 // R10 = incDst
SHLQ $4, R10 // R10 *= sizeof(complex128)
MOVUPS alpha+40(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SI)(R8*1), X4
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVUPS (SI), X6
MOVUPS (SI)(R8*1), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
ADDPD (DX)(R9*1), X5
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDPD (DX), X7
ADDPD (DX)(R9*1), X9
MOVUPS X3, (DI) // dst[i] = X_(i+1)
MOVUPS X5, (DI)(R10*1)
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
MOVUPS X7, (DI)
MOVUPS X9, (DI)(R10*1)
// Each pointer was already advanced by two elements above; advance by two
// more so that the next iteration starts four elements further on.
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
DECQ BX
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
// CX is non-zero whenever the tail is entered: either n < 4 (and n != 0), or
// the check just above passed.
axpyi_tail: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
MOVUPS X3, (DI) // dst[i] = X_(i+1)
ADDQ R8, SI // SI += incX
ADDQ R9, DX // DX += incY
ADDQ R10, DI // DI += incDst
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,122 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw byte encodings of the SSE3 instructions used by the routine below.
// The comment above each macro names the instruction it encodes, written in
// Go assembler operand order (source, destination). The registers are baked
// into the bytes, so each macro works only with the registers in its name.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
//
// MOVDDUP src, dst: dst = { low(src), low(src) }
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }
//
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// AxpyUnitary performs the contiguous complex update
//   y[i] += alpha * x[i]   for i = 0 .. min(len(x), len(y))-1.
// The main loop processes four elements per iteration and a scalar tail
// handles the remaining (count % 4) elements.
// AX is an index in float64 units (it is scaled by 8 in every address), so
// one complex128 element corresponds to a step of 2.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func AxpyUnitary(alpha complex128, x, y []complex128)
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SI // SI = &x
MOVQ y_base+40(FP), DI // DI = &y
MOVQ x_len+24(FP), CX // CX = min( len(x), len(y) )
CMPQ y_len+48(FP), CX
CMOVQLE y_len+48(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
PXOR X0, X0 // Clear work registers and cache-align loop (both are overwritten just below)
PXOR X1, X1
MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
XORQ AX, AX // i = 0
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SI)(AX*8), X4
MOVUPS 32(SI)(AX*8), X6
MOVUPS 48(SI)(AX*8), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI)(AX*8), X3
ADDPD 16(DI)(AX*8), X5
ADDPD 32(DI)(AX*8), X7
ADDPD 48(DI)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX // i += 8 (8 float64 = 4 complex128)
DECQ BX
JNZ caxy_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
// CX is non-zero whenever the tail is entered: either the count is below 4
// (and not 0), or the check just above passed.
caxy_tail: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI)(AX*8), X3
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
ADDQ $2, AX // i += 2 (2 float64 = 1 complex128)
LOOP caxy_tail // } while --CX > 0
caxy_end:
RET

View File

@@ -0,0 +1,123 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw byte encodings of the SSE3 instructions used by the routine below.
// The comment above each macro names the instruction it encodes, written in
// Go assembler operand order (source, destination). The registers are baked
// into the bytes, so each macro works only with the registers in its name.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
//
// MOVDDUP src, dst: dst = { low(src), low(src) }
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }
//
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// AxpyUnitaryTo performs the contiguous complex operation
//   dst[i] = alpha * x[i] + y[i]
// for i = 0 .. min(len(x), len(y), len(dst))-1. y is only read.
// alpha occupies 16 bytes of the argument frame (offsets 24..39, with x_base
// following at 40), i.e. it is a complex128.
// The main loop processes four elements per iteration and a scalar tail
// handles the remaining (count % 4) elements.
// AX is an index in float64 units (it is scaled by 8 in every address), so
// one complex128 element corresponds to a step of 2.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+40(FP), SI // SI = &x
MOVQ y_base+64(FP), DX // DX = &y
MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) )
CMPQ y_len+72(FP), CX
CMOVQLE y_len+72(FP), CX
CMPQ dst_len+8(FP), CX
CMOVQLE dst_len+8(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
XORQ AX, AX // i = 0
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SI)(AX*8), X4
MOVUPS 32(SI)(AX*8), X6
MOVUPS 48(SI)(AX*8), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // duplicate the low (real) element: (xr, xr)
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2 // duplicate the high (imag) element: (xi, xi)
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX)(AX*8), X3
ADDPD 16(DX)(AX*8), X5
ADDPD 32(DX)(AX*8), X7
ADDPD 48(DX)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX // i += 8 (8 float64 = 4 complex128)
DECQ BX
JNZ caxy_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
// Scalar tail: same calculation, one element per iteration. CX is non-zero
// whenever the tail is entered: either the count is below 4 (and not 0), or
// the check just above passed.
caxy_tail: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX)(AX*8), X3
MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
ADDQ $2, AX // i += 2 (2 float64 = 1 complex128)
LOOP caxy_tail // } while --CX > 0
caxy_end:
RET

6
vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package c128 provides complex128 vector primitives.
package c128 // import "gonum.org/v1/gonum/internal/asm/c128"

View File

@@ -0,0 +1,153 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw encodings of the SSE3 instructions used by the routine below; the
// trailing comment on each macro gives the instruction it encodes, in Go
// assembler operand order (source, destination).
// MOVDDUP mem, dst loads one float64 and copies it into both halves of dst.
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3
#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// Register aliases. X_PTR, INC_X and INCx3_X must stay bound to SI, R8 and
// R9 respectively: those registers are hard-coded in the MOVDDUP encodings
// above, so changing an alias alone would silently break the loads.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define INC_X R8
#define INCx3_X R9
#define INC_Y R10
#define INCx3_Y R11
// NEG1 and P_NEG1 both hold { -1, -1 } and are used to negate imag(x[i]).
#define NEG1 X15
#define P_NEG1 X14
// DotcInc returns the strided conjugated dot product
//   sum = conj(x[ix + i*incX]) * y[iy + i*incY], summed over i = 0 .. n-1.
// n, the strides and the start offsets are used exactly as passed; the slice
// lengths are never read, so the caller is responsible for staying in bounds.
// The main loop processes four elements per iteration into two accumulators
// (SUM and P_SUM), which are combined before the scalar tail handles the
// remaining n % 4 elements. For n == 0 the result is 0.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
TEXT ·DotcInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ n+48(FP), LEN // LEN = n
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
PXOR P_SUM, P_SUM // P_SUM = 0
MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128)
SHLQ $4, INC_X
MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128)
SHLQ $4, INC_Y
LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix])
LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX
SHLQ $4, INC_X // INC_X *= sizeof(complex128)
MOVQ incY+64(FP), INC_Y // INC_Y = incY
SHLQ $4, INC_Y // INC_Y *= sizeof(complex128)
MOVSD $(-1.0), NEG1
SHUFPD $0, NEG1, NEG1 // { -1, -1 }
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = n % 4
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ dot_tail // if n < 4 { goto dot_tail }
// The values below are only needed by the unrolled loop.
MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
dot_loop: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_XPTR_INCX__X5
MOVDDUP_XPTR_INCX_2__X7
MOVDDUP_XPTR_INCx3X__X9
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_8_XPTR_INCX__X4
MOVDDUP_8_XPTR_INCX_2__X6
MOVDDUP_8_XPTR_INCx3X__X8
// X_i = { -imag(x[i]), -imag(x[i]) }
MULPD NEG1, X2
MULPD P_NEG1, X4
MULPD NEG1, X6
MULPD P_NEG1, X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR), X10
MOVUPS (Y_PTR)(INC_Y*1), X11
MOVUPS (Y_PTR)(INC_Y*2), X12
MOVUPS (Y_PTR)(INCx3_Y*1), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// sum / psum += result[i] (alternating accumulators)
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // sum += psum
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) }
MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // sum += result[i]
ADDQ INC_X, X_PTR // X_PTR += incX
ADDQ INC_Y, Y_PTR // Y_PTR += incY
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVUPS SUM, sum+88(FP) // return SUM
RET

View File

@@ -0,0 +1,143 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw encodings of the SSE3 instructions used by the routine below; the
// trailing comment on each macro gives the instruction it encodes, in Go
// assembler operand order (source, destination).
// MOVDDUP mem, dst loads one float64 and copies it into both halves of dst.
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
#define MOVDDUP_XPTR_IDX_8__X3 LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
#define MOVDDUP_XPTR_IIDX_8__X2 LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// Register aliases. X_PTR, IDX and I_IDX must stay bound to SI, AX and DX
// respectively: those registers are hard-coded in the MOVDDUP encodings
// above, so changing an alias alone would silently break the loads.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define IDX AX
#define I_IDX DX
// NEG1 and P_NEG1 both hold { -1, -1 } and are used to negate imag(x[i]).
#define NEG1 X15
#define P_NEG1 X14
// DotcUnitary returns the contiguous conjugated dot product
//   sum = conj(x[i]) * y[i], summed over i = 0 .. min(len(x), len(y))-1.
// The main loop processes four elements per iteration into two accumulators
// (SUM and P_SUM), which are combined before the scalar tail handles the
// remaining (count % 4) elements. For an empty input the result is 0.
// IDX and I_IDX are indices in float64 units (scaled by 8 in every address):
// IDX addresses the real part of the current element and I_IDX = IDX + 1 its
// imaginary part, so one complex128 element corresponds to a step of 2.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func DotcUnitary(x, y []complex128) (sum complex128)
TEXT ·DotcUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
PXOR SUM, SUM // sum = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
XORPS P_SUM, P_SUM // psum = 0
MOVSD $(-1.0), NEG1
SHUFPD $0, NEG1, NEG1 // { -1, -1 }
XORQ IDX, IDX // i := 0
MOVQ $1, I_IDX // j := 1
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
dot_loop: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_16_XPTR_IDX_8__X5
MOVDDUP_32_XPTR_IDX_8__X7
MOVDDUP_48_XPTR_IDX_8__X9
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_16_XPTR_IIDX_8__X4
MOVDDUP_32_XPTR_IIDX_8__X6
MOVDDUP_48_XPTR_IIDX_8__X8
// X_i = { -imag(x[i]), -imag(x[i]) }
MULPD NEG1, X2
MULPD P_NEG1, X4
MULPD NEG1, X6
MULPD P_NEG1, X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR)(IDX*8), X10
MOVUPS 16(Y_PTR)(IDX*8), X11
MOVUPS 32(Y_PTR)(IDX*8), X12
MOVUPS 48(Y_PTR)(IDX*8), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// sum / psum += result[i] (alternating accumulators)
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
ADDQ $8, IDX // IDX += 8 (8 float64 = 4 complex128)
ADDQ $8, I_IDX // I_IDX += 8
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // sum += psum
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]) , real(x[i]) }
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) }
MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) }
MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // SUM += result[i]
ADDQ $2, IDX // IDX += 2
ADDQ $2, I_IDX // I_IDX += 2
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVUPS SUM, sum+48(FP) // return SUM
RET

View File

@@ -0,0 +1,141 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw encodings of the SSE3 instructions used by the routine below; the
// trailing comment on each macro gives the instruction it encodes, in Go
// assembler operand order (source, destination).
// MOVDDUP mem, dst loads one float64 and copies it into both halves of dst.
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3
#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// Register aliases. X_PTR, INC_X and INCx3_X must stay bound to SI, R8 and
// R9 respectively: those registers are hard-coded in the MOVDDUP encodings
// above, so changing an alias alone would silently break the loads.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define INC_X R8
#define INCx3_X R9
#define INC_Y R10
#define INCx3_Y R11
// DotuInc returns the strided unconjugated dot product
//   sum = x[ix + i*incX] * y[iy + i*incY], summed over i = 0 .. n-1.
// n, the strides and the start offsets are used exactly as passed; the slice
// lengths are never read, so the caller is responsible for staying in bounds.
// The main loop processes four elements per iteration into two accumulators
// (SUM and P_SUM), which are combined before the scalar tail handles the
// remaining n % 4 elements. For n == 0 the result is 0.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
TEXT ·DotuInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ n+48(FP), LEN // LEN = n
PXOR SUM, SUM // sum = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128)
SHLQ $4, INC_X
MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128)
SHLQ $4, INC_Y
LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix])
LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX
SHLQ $4, INC_X // INC_X *= sizeof(complex128)
MOVQ incY+64(FP), INC_Y // INC_Y = incY
SHLQ $4, INC_Y // INC_Y *= sizeof(complex128)
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 (fewer than 4 elements) { goto dot_tail }
// The values below are only needed by the unrolled loop; the tail never
// reads P_SUM.
PXOR P_SUM, P_SUM // psum = 0
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
dot_loop: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_XPTR_INCX__X5
MOVDDUP_XPTR_INCX_2__X7
MOVDDUP_XPTR_INCx3X__X9
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_8_XPTR_INCX__X4
MOVDDUP_8_XPTR_INCX_2__X6
MOVDDUP_8_XPTR_INCx3X__X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR), X10
MOVUPS (Y_PTR)(INC_Y*1), X11
MOVUPS (Y_PTR)(INC_Y*2), X12
MOVUPS (Y_PTR)(INCx3_Y*1), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// sum / psum += result[i] (alternating accumulators)
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // sum += psum
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // sum += result[i]
ADDQ INC_X, X_PTR // X_PTR += incX
ADDQ INC_Y, Y_PTR // Y_PTR += incY
DECQ TAIL // --TAIL
JNZ dot_tail // } while TAIL > 0
dot_end:
MOVUPS SUM, sum+88(FP) // return SUM
RET

View File

@@ -0,0 +1,130 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Raw encodings of the SSE3 instructions used by the routine below; the
// trailing comment on each macro gives the instruction it encodes, in Go
// assembler operand order (source, destination).
// MOVDDUP mem, dst loads one float64 and copies it into both halves of dst.
// ADDSUBPD src, dst: dst = { high(dst) + high(src), low(dst) - low(src) }.
// NOTE(review): presumably hand-encoded because the assembler did not accept
// these mnemonics when the file was written - confirm before replacing them.
#define MOVDDUP_XPTR_IDX_8__X3 LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
#define MOVDDUP_XPTR_IIDX_8__X2 LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// Register aliases. X_PTR, IDX and I_IDX must stay bound to SI, AX and DX
// respectively: those registers are hard-coded in the MOVDDUP encodings
// above, so changing an alias alone would silently break the loads.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define IDX AX
#define I_IDX DX
// DotuUnitary returns the contiguous unconjugated dot product
//   sum = x[i] * y[i], summed over i = 0 .. min(len(x), len(y))-1.
// The main loop processes four elements per iteration into two accumulators
// (SUM and P_SUM), which are combined before the scalar tail handles the
// remaining (count % 4) elements. For an empty input the result is 0.
// IDX and I_IDX are indices in float64 units (scaled by 8 in every address):
// IDX addresses the real part of the current element and I_IDX = IDX + 1 its
// imaginary part, so one complex128 element corresponds to a step of 2.
// In the lane comments below, { a, b } means { high 64 bits, low 64 bits }.
// func DotuUnitary(x, y []complex128) (sum complex128)
TEXT ·DotuUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
PXOR P_SUM, P_SUM // P_SUM = 0
XORQ IDX, IDX // IDX = 0
MOVQ $1, DX // j = 1 (DX is I_IDX)
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
dot_loop: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_16_XPTR_IDX_8__X5
MOVDDUP_32_XPTR_IDX_8__X7
MOVDDUP_48_XPTR_IDX_8__X9
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_16_XPTR_IIDX_8__X4
MOVDDUP_32_XPTR_IIDX_8__X6
MOVDDUP_48_XPTR_IIDX_8__X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR)(IDX*8), X10
MOVUPS 16(Y_PTR)(IDX*8), X11
MOVUPS 32(Y_PTR)(IDX*8), X12
MOVUPS 48(Y_PTR)(IDX*8), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// sum / psum += result[i] (alternating accumulators)
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
ADDQ $8, IDX // IDX += 8 (8 float64 = 4 complex128)
ADDQ $8, I_IDX // I_IDX += 8
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]) , real(x[i]) }
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) }
MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // SUM += result[i]
ADDQ $2, IDX // IDX += 2
ADDQ $2, I_IDX // I_IDX += 2
DECQ TAIL // --TAIL
JNZ dot_tail // } while TAIL > 0
dot_end:
MOVUPS SUM, sum+48(FP) // return SUM
RET

View File

@@ -0,0 +1,69 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Register aliases used by DscalInc. SRC and DST name the same register (SI):
// every element is loaded from and stored back to the same address.
#define SRC SI
#define DST SI
#define LEN CX
#define TAIL BX
#define INC R9
#define INC3 R10
#define ALPHA X0
#define ALPHA_2 X1
// Raw encoding of an SSE3 load, emitted with LONG/WORD directives; the
// trailing comment names the instruction. It hard-codes 8(SP) and X0, so
// ALPHA must remain X0.
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
// func DscalInc(alpha float64, x []complex128, n, inc uintptr)
//
// Multiplies both the real and imaginary parts of n elements of x by alpha,
// in place, starting at x[0] and advancing inc elements between them. The
// main loop handles four elements per iteration; the tail loop handles the
// remaining n % 4 one at a time.
TEXT ·DscalInc(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SRC // SRC = &x
MOVQ n+32(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return }
JE dscal_end
MOVDDUP_ALPHA // ALPHA = { alpha, alpha }
MOVQ inc+40(FP), INC // INC = inc
SHLQ $4, INC // INC = INC * sizeof(complex128)
LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ dscal_tail // if LEN == 0 { goto dscal_tail }
dscal_loop: // do {
MOVUPS (SRC), X2 // X_i = x[i]
MOVUPS (SRC)(INC*1), X3
MOVUPS (SRC)(INC*2), X4
MOVUPS (SRC)(INC3*1), X5
MULPD ALPHA, X2 // X_i *= ALPHA (both lanes scaled by alpha)
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
MOVUPS X2, (DST) // x[i] = X_i
MOVUPS X3, (DST)(INC*1)
MOVUPS X4, (DST)(INC*2)
MOVUPS X5, (DST)(INC3*1)
LEAQ (SRC)(INC*4), SRC // SRC += INC*4 (DST is the same register)
DECQ LEN
JNZ dscal_loop // } while --LEN > 0
dscal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JE dscal_end // if TAIL == 0 { return }
dscal_tail_loop: // do {
MOVUPS (SRC), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST) // x[i] = X_i
ADDQ INC, SRC // SRC += INC
DECQ TAIL
JNZ dscal_tail_loop // } while --TAIL > 0
dscal_end:
RET

View File

@@ -0,0 +1,66 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Register aliases used by DscalUnitary. SRC and DST name the same register
// (SI): every element is loaded from and stored back to the same address.
#define SRC SI
#define DST SI
#define LEN CX
#define IDX AX
#define TAIL BX
#define ALPHA X0
#define ALPHA_2 X1
// Raw encoding of an SSE3 load, emitted with LONG/WORD directives; the
// trailing comment names the instruction. It hard-codes 8(SP) and X0, so
// ALPHA must remain X0.
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
// func DscalUnitary(alpha float64, x []complex128)
//
// Multiplies both the real and imaginary parts of every element of x by
// alpha, in place. IDX counts in units of 8 bytes (one float64), so each
// complex128 element advances it by 2. The main loop handles four elements
// per iteration; the tail loop handles the remaining len(x) % 4.
TEXT ·DscalUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SRC // SRC = &x
MOVQ x_len+16(FP), LEN // LEN = len(x)
CMPQ LEN, $0 // if LEN == 0 { return }
JE dscal_end
MOVDDUP_ALPHA // ALPHA = { alpha, alpha }
XORQ IDX, IDX // IDX = 0
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( len(x) / 4 )
JZ dscal_tail // if LEN == 0 { goto dscal_tail }
dscal_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
MOVUPS 16(SRC)(IDX*8), X3
MOVUPS 32(SRC)(IDX*8), X4
MOVUPS 48(SRC)(IDX*8), X5
MULPD ALPHA, X2 // X_i *= ALPHA (both lanes scaled by alpha)
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
MOVUPS X3, 16(DST)(IDX*8)
MOVUPS X4, 32(DST)(IDX*8)
MOVUPS X5, 48(DST)(IDX*8)
ADDQ $8, IDX // IDX += 8 (four complex128 values = eight float64 values)
DECQ LEN
JNZ dscal_loop // } while --LEN > 0
dscal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ dscal_end // if TAIL == 0 { return }
dscal_tail_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
ADDQ $2, IDX // IDX += 2 (one complex128 = two float64 values)
DECQ TAIL
JNZ dscal_tail_loop // } while --TAIL > 0
dscal_end:
RET

31
vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go generated vendored Normal file
View File

@@ -0,0 +1,31 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package c128
// ScalUnitaryTo stores alpha times each element of x into the matching
// position of dst, leaving x untouched:
//
//	dst[i] = alpha * x[i]
//
// dst is indexed at every position of x.
func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) {
	for i := 0; i < len(x); i++ {
		dst[i] = alpha * x[i]
	}
}
// ScalIncTo writes n scaled elements of x into dst using independent strides.
// Both walks start at index 0; after each element the read position in x
// advances by incX and the write position in dst advances by incDst:
//
//	dst[k*incDst] = alpha * x[k*incX]   for k = 0 .. n-1
func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) {
	var (
		out uintptr // write position in dst
		in  uintptr // read position in x
	)
	for remaining := int(n); remaining > 0; remaining-- {
		dst[out] = alpha * x[in]
		in += incX
		out += incDst
	}
}

View File

@@ -0,0 +1,116 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Register aliases used by ScalUnitary. SRC and DST name the same register
// (SI): every element is loaded from and stored back to the same address.
#define SRC SI
#define DST SI
#define LEN CX
#define IDX AX
#define TAIL BX
// ALPHA holds alpha as loaded; ALPHA_C holds it with the two lanes swapped.
// ALPHA2 and ALPHA_C2 are copies of those two registers.
#define ALPHA X0
#define ALPHA_C X1
#define ALPHA2 X10
#define ALPHA_C2 X11
// Raw encodings of SSE3 instructions, emitted with LONG/BYTE directives; the
// trailing comment names the instruction each one encodes.
// MOVDDUP src, dst duplicates the low lane of src into both lanes of dst.
#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
// ADDSUBPD src, dst subtracts in the low lane and adds in the high lane.
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// func ScalUnitary(alpha complex128, x []complex128)
//
// Multiplies every element of x by the complex scalar alpha, in place. IDX
// counts in units of 8 bytes (one float64), so each complex128 element
// advances it by 2. The main loop handles four elements per iteration; the
// tail loop handles the remaining len(x) % 4. Register contents in the
// comments are written as { high lane, low lane }.
TEXT ·ScalUnitary(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SRC // SRC = &x
MOVQ x_len+24(FP), LEN // LEN = len(x)
CMPQ LEN, $0 // if LEN == 0 { return }
JE scal_end
MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
MOVAPS ALPHA, ALPHA_C
SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
XORQ IDX, IDX // IDX = 0
MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
MOVAPS ALPHA_C, ALPHA_C2
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( len(x) / 4 )
JZ scal_tail // if LEN == 0 { goto scal_tail }
scal_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SRC)(IDX*8), X4
MOVUPS 32(SRC)(IDX*8), X6
MOVUPS 48(SRC)(IDX*8), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
MULPD ALPHA_C, X2
MULPD ALPHA, X3
MULPD ALPHA_C2, X4
MULPD ALPHA2, X5
MULPD ALPHA_C, X6
MULPD ALPHA, X7
MULPD ALPHA_C2, X8
MULPD ALPHA2, X9
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
MOVUPS X5, 16(DST)(IDX*8)
MOVUPS X7, 32(DST)(IDX*8)
MOVUPS X9, 48(DST)(IDX*8)
ADDQ $8, IDX // IDX += 8 (four complex128 values = eight float64 values)
DECQ LEN
JNZ scal_loop // } while --LEN > 0
scal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ scal_end // if TAIL == 0 { return }
scal_tail_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
ADDQ $2, IDX // IDX += 2 (one complex128 = two float64 values)
DECQ TAIL
JNZ scal_tail_loop // } while --TAIL > 0
scal_end:
RET

View File

@@ -0,0 +1,121 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//+build !noasm,!appengine,!safe
#include "textflag.h"
// Register aliases used by ScalInc. SRC and DST name the same register (SI):
// every element is loaded from and stored back to the same address.
#define SRC SI
#define DST SI
#define LEN CX
#define TAIL BX
#define INC R9
#define INC3 R10
// ALPHA holds alpha as loaded; ALPHA_C holds it with the two lanes swapped.
// ALPHA2 and ALPHA_C2 are copies of those two registers.
#define ALPHA X0
#define ALPHA_C X1
#define ALPHA2 X10
#define ALPHA_C2 X11
// Raw encodings of SSE3 instructions, emitted with LONG/BYTE directives; the
// trailing comment names the instruction each one encodes.
// MOVDDUP src, dst duplicates the low lane of src into both lanes of dst.
#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
// ADDSUBPD src, dst subtracts in the low lane and adds in the high lane.
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
//
// Multiplies n elements of x by the complex scalar alpha, in place, starting
// at x[0] and advancing inc elements between them. The main loop handles four
// elements per iteration; the tail loop handles the remaining n % 4. Register
// contents in the comments are written as { high lane, low lane }.
TEXT ·ScalInc(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SRC // SRC = &x
MOVQ n+40(FP), LEN // LEN = n
CMPQ LEN, $0
JE scal_end // if LEN == 0 { return }
MOVQ inc+48(FP), INC // INC = inc
SHLQ $4, INC // INC = INC * sizeof(complex128)
LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
MOVAPS ALPHA, ALPHA_C
SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
MOVAPS ALPHA_C, ALPHA_C2
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ scal_tail // if LEN == 0 { goto scal_tail }
scal_loop: // do {
MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SRC)(INC*1), X4
MOVUPS (SRC)(INC*2), X6
MOVUPS (SRC)(INC3*1), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
MULPD ALPHA_C, X2
MULPD ALPHA, X3
MULPD ALPHA_C2, X4
MULPD ALPHA2, X5
MULPD ALPHA_C, X6
MULPD ALPHA, X7
MULPD ALPHA_C2, X8
MULPD ALPHA2, X9
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
MOVUPS X3, (DST) // x[i] = X_(i+1)
MOVUPS X5, (DST)(INC*1)
MOVUPS X7, (DST)(INC*2)
MOVUPS X9, (DST)(INC3*1)
LEAQ (SRC)(INC*4), SRC // SRC = &(SRC[inc*4]) (DST is the same register)
DECQ LEN
JNZ scal_loop // } while --LEN > 0
scal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JE scal_end // if TAIL == 0 { return }
scal_tail_loop: // do {
MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
MOVUPS X3, (DST) // x[i] = X_(i+1)
ADDQ INC, SRC // SRC = &(SRC[inc])
DECQ TAIL
JNZ scal_tail_loop // } while --TAIL > 0
scal_end:
RET

View File

@@ -0,0 +1,96 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!appengine,!safe
package c128
// AxpyUnitary adds alpha*x[i] to y[i] for every index i of x. It is
// equivalent to
//
//	for i, v := range x {
//		y[i] += alpha * v
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func AxpyUnitary(alpha complex128, x, y []complex128)
// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every index i of x.
// It is equivalent to
//
//	for i, v := range x {
//		dst[i] = alpha*v + y[i]
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
// AxpyInc adds alpha*x[ix] to y[iy] for n strided element pairs, starting at
// the given ix and iy and advancing them by incX and incY after each pair.
// It is equivalent to
//
//	for i := 0; i < int(n); i++ {
//		y[iy] += alpha * x[ix]
//		ix += incX
//		iy += incY
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
// AxpyIncTo stores alpha*x[ix] + y[iy] into dst[idst] for n strided elements,
// starting at the given ix, iy and idst and advancing them by incX, incY and
// incDst after each element. It is equivalent to
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
// DscalUnitary multiplies the real and imaginary parts of every element of x
// by the real scalar alpha, in place. It is equivalent to
//
//	for i, v := range x {
//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func DscalUnitary(alpha float64, x []complex128)
// DscalInc multiplies the real and imaginary parts of n elements of x by the
// real scalar alpha, in place, starting at x[0] and stepping inc elements
// between them. It is equivalent to
//
//	var ix uintptr
//	for i := 0; i < int(n); i++ {
//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
//		ix += inc
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func DscalInc(alpha float64, x []complex128, n, inc uintptr)
// ScalInc multiplies n elements of x by the complex scalar alpha, in place,
// starting at x[0] and stepping inc elements between them. It is equivalent to
//
//	var ix uintptr
//	for i := 0; i < int(n); i++ {
//		x[ix] *= alpha
//		ix += inc
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
// ScalUnitary multiplies every element of x by the complex scalar alpha, in
// place. It is equivalent to
//
//	for i := range x {
//		x[i] *= alpha
//	}
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func ScalUnitary(alpha complex128, x []complex128)
// DotcUnitary returns the conjugated dot product of x and y: the sum of
// y[i] times the complex conjugate of x[i]. It is equivalent to
//
//	for i, v := range x {
//		sum += y[i] * cmplx.Conj(v)
//	}
//	return sum
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func DotcUnitary(x, y []complex128) (sum complex128)
// DotcInc returns the conjugated dot product over n strided element pairs:
// the sum of y[iy] times the complex conjugate of x[ix], starting at the
// given ix and iy and advancing them by incX and incY after each pair. It is
// equivalent to
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * cmplx.Conj(x[ix])
//		ix += incX
//		iy += incY
//	}
//	return sum
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
// DotuUnitary returns the unconjugated dot product of x and y: the sum of
// y[i] * x[i]. It is equivalent to
//
//	for i, v := range x {
//		sum += y[i] * v
//	}
//	return sum
//
// This declaration has no Go body. The amd64 assembly for this symbol
// iterates over min(len(x), len(y)) elements, whereas the loop above indexes
// y at every index of x; callers should pass slices of equal length.
func DotuUnitary(x, y []complex128) (sum complex128)
// DotuInc returns the unconjugated dot product over n strided element pairs:
// the sum of y[iy] * x[ix], starting at the given ix and iy and advancing
// them by incX and incY after each pair. It is equivalent to
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * x[ix]
//		ix += incX
//		iy += incY
//	}
//	return sum
//
// This declaration has no Go body; the implementation is supplied outside
// this file.
func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)

View File

@@ -0,0 +1,163 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 noasm appengine safe
package c128
import "math/cmplx"
// AxpyUnitary adds alpha times each element of x to the element of y at the
// same position, updating y in place:
//
//	y[i] += alpha * x[i]
//
// y is indexed at every position of x.
func AxpyUnitary(alpha complex128, x, y []complex128) {
	for i := 0; i < len(x); i++ {
		y[i] += alpha * x[i]
	}
}
// AxpyUnitaryTo computes alpha*x[i] + y[i] for every index i of x and stores
// the result in dst[i]; x and y are only read:
//
//	dst[i] = alpha*x[i] + y[i]
//
// dst and y are indexed at every position of x.
func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) {
	for i := 0; i < len(x); i++ {
		dst[i] = alpha*x[i] + y[i]
	}
}
// AxpyInc performs the strided update y[iy] += alpha * x[ix] for n element
// pairs. The walk begins at the supplied ix and iy; after each pair ix grows
// by incX and iy grows by incY.
func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
	for remaining := int(n); remaining > 0; remaining-- {
		y[iy] += alpha * x[ix]
		ix, iy = ix+incX, iy+incY
	}
}
// AxpyIncTo performs the strided computation
//
//	dst[idst] = alpha*x[ix] + y[iy]
//
// for n elements. The walk begins at the supplied ix, iy and idst; after each
// element they grow by incX, incY and incDst respectively.
func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
	for remaining := int(n); remaining > 0; remaining-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix, iy, idst = ix+incX, iy+incY, idst+incDst
	}
}
// DscalUnitary scales every element of x by the real factor alpha, in place.
// The real and imaginary parts are each multiplied by alpha separately:
//
//	x[i] = complex(real(x[i])*alpha, imag(x[i])*alpha)
func DscalUnitary(alpha float64, x []complex128) {
	for i := range x {
		re := real(x[i]) * alpha
		im := imag(x[i]) * alpha
		x[i] = complex(re, im)
	}
}
// DscalInc scales n elements of x by the real factor alpha, in place. It
// starts at x[0] and moves inc elements forward after each one; the real and
// imaginary parts are each multiplied by alpha separately.
func DscalInc(alpha float64, x []complex128, n, inc uintptr) {
	pos := uintptr(0)
	for remaining := int(n); remaining > 0; remaining-- {
		re := real(x[pos]) * alpha
		im := imag(x[pos]) * alpha
		x[pos] = complex(re, im)
		pos += inc
	}
}
// ScalInc multiplies n elements of x by the complex factor alpha, in place.
// It starts at x[0] and moves inc elements forward after each one:
//
//	x[k*inc] *= alpha   for k = 0 .. n-1
func ScalInc(alpha complex128, x []complex128, n, inc uintptr) {
	pos := uintptr(0)
	for remaining := int(n); remaining > 0; remaining-- {
		x[pos] *= alpha
		pos += inc
	}
}
// ScalUnitary multiplies every element of x by the complex factor alpha, in
// place:
//
//	x[i] *= alpha
func ScalUnitary(alpha complex128, x []complex128) {
	for i := 0; i < len(x); i++ {
		x[i] *= alpha
	}
}
// DotcUnitary returns the conjugated dot product of x and y, accumulating
//
//	sum += y[i] * cmplx.Conj(x[i])
//
// over every index i of x. y is indexed at every position of x.
func DotcUnitary(x, y []complex128) (sum complex128) {
	for i := 0; i < len(x); i++ {
		sum += y[i] * cmplx.Conj(x[i])
	}
	return sum
}
// DotcInc returns the conjugated dot product over n strided element pairs,
// accumulating
//
//	sum += y[iy] * cmplx.Conj(x[ix])
//
// The walk begins at the supplied ix and iy; after each pair ix grows by incX
// and iy grows by incY.
func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
	for remaining := int(n); remaining > 0; remaining-- {
		sum += y[iy] * cmplx.Conj(x[ix])
		ix, iy = ix+incX, iy+incY
	}
	return sum
}
// DotuUnitary returns the unconjugated dot product of x and y, accumulating
//
//	sum += y[i] * x[i]
//
// over every index i of x. y is indexed at every position of x.
func DotuUnitary(x, y []complex128) (sum complex128) {
	for i := 0; i < len(x); i++ {
		sum += y[i] * x[i]
	}
	return sum
}
// DotuInc returns the unconjugated dot product over n strided element pairs,
// accumulating
//
//	sum += y[iy] * x[ix]
//
// The walk begins at the supplied ix and iy; after each pair ix grows by incX
// and iy grows by incY.
func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
	for remaining := int(n); remaining > 0; remaining-- {
		sum += y[iy] * x[ix]
		ix, iy = ix+incX, iy+incY
	}
	return sum
}