| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447 |
- // Copyright 2016 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // +build ignore
- package main
- import (
- "bytes"
- "io/ioutil"
- "log"
- "strings"
- "text/template"
- )
// copyright is the license header that the source template must begin with.
// main verifies its presence and strips it before further processing.
const copyright = "// Copyright 2016 The Go Authors. All rights reserved.\n" +
	"// Use of this source code is governed by a BSD-style\n" +
	"// license that can be found in the LICENSE file.\n"

// doNotEdit is the first line written to the generated file.
const doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n"

// dashDashDash marks the boundary in the source template between the
// preamble (copied once) and the body (executed once per instance).
const dashDashDash = "// --------"
- func main() {
- tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
- if err != nil {
- log.Fatalf("ReadFile: %v", err)
- }
- if !bytes.HasPrefix(tmpl, []byte(copyright)) {
- log.Fatal("source template did not start with the copyright header")
- }
- tmpl = tmpl[len(copyright):]
- preamble := []byte(nil)
- if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
- log.Fatalf("source template did not contain %q", dashDashDash)
- } else {
- preamble, tmpl = tmpl[:i], tmpl[i:]
- }
- t, err := template.New("").Parse(string(tmpl))
- if err != nil {
- log.Fatalf("Parse: %v", err)
- }
- out := bytes.NewBuffer(nil)
- out.WriteString(doNotEdit)
- out.Write(preamble)
- for i, v := range instances {
- if i != 0 {
- out.WriteString("\n")
- }
- if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
- v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
- }
- if err := t.Execute(out, v); err != nil {
- log.Fatalf("Execute(%q): %v", v.ShortName, err)
- }
- }
- if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
- log.Fatalf("WriteFile: %v", err)
- }
- }
- var instances = []struct {
- LongName string
- ShortName string
- FrameSize string
- ArgsSize string
- Args string
- DstElemSize1 int
- DstElemSize4 int
- XMM3 string
- XMM4 string
- XMM5 string
- XMM6 string
- XMM8 string
- XMM9 string
- XMM10 string
- LoadArgs string
- Setup string
- LoadXMMRegs string
- Add string
- ClampAndScale string
- ConvertToInt32 string
- Store4 string
- Store1 string
- }{{
- LongName: "fixedAccumulateOpOver",
- ShortName: "fxAccOpOver",
- FrameSize: fxFrameSize,
- ArgsSize: twoArgArgsSize,
- Args: "dst []uint8, src []uint32",
- DstElemSize1: 1 * sizeOfUint8,
- DstElemSize4: 4 * sizeOfUint8,
- XMM3: fxXMM3,
- XMM4: fxXMM4,
- XMM5: fxXMM5,
- XMM6: opOverXMM6,
- XMM8: opOverXMM8,
- XMM9: opOverXMM9,
- XMM10: opOverXMM10,
- LoadArgs: twoArgLoadArgs,
- Setup: fxSetup,
- LoadXMMRegs: fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
- Add: fxAdd,
- ClampAndScale: fxClampAndScale,
- ConvertToInt32: fxConvertToInt32,
- Store4: opOverStore4,
- Store1: opOverStore1,
- }, {
- LongName: "fixedAccumulateOpSrc",
- ShortName: "fxAccOpSrc",
- FrameSize: fxFrameSize,
- ArgsSize: twoArgArgsSize,
- Args: "dst []uint8, src []uint32",
- DstElemSize1: 1 * sizeOfUint8,
- DstElemSize4: 4 * sizeOfUint8,
- XMM3: fxXMM3,
- XMM4: fxXMM4,
- XMM5: fxXMM5,
- XMM6: opSrcXMM6,
- XMM8: opSrcXMM8,
- XMM9: opSrcXMM9,
- XMM10: opSrcXMM10,
- LoadArgs: twoArgLoadArgs,
- Setup: fxSetup,
- LoadXMMRegs: fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
- Add: fxAdd,
- ClampAndScale: fxClampAndScale,
- ConvertToInt32: fxConvertToInt32,
- Store4: opSrcStore4,
- Store1: opSrcStore1,
- }, {
- LongName: "fixedAccumulateMask",
- ShortName: "fxAccMask",
- FrameSize: fxFrameSize,
- ArgsSize: oneArgArgsSize,
- Args: "buf []uint32",
- DstElemSize1: 1 * sizeOfUint32,
- DstElemSize4: 4 * sizeOfUint32,
- XMM3: fxXMM3,
- XMM4: fxXMM4,
- XMM5: fxXMM5,
- XMM6: maskXMM6,
- XMM8: maskXMM8,
- XMM9: maskXMM9,
- XMM10: maskXMM10,
- LoadArgs: oneArgLoadArgs,
- Setup: fxSetup,
- LoadXMMRegs: fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
- Add: fxAdd,
- ClampAndScale: fxClampAndScale,
- ConvertToInt32: fxConvertToInt32,
- Store4: maskStore4,
- Store1: maskStore1,
- }, {
- LongName: "floatingAccumulateOpOver",
- ShortName: "flAccOpOver",
- FrameSize: flFrameSize,
- ArgsSize: twoArgArgsSize,
- Args: "dst []uint8, src []float32",
- DstElemSize1: 1 * sizeOfUint8,
- DstElemSize4: 4 * sizeOfUint8,
- XMM3: flXMM3,
- XMM4: flXMM4,
- XMM5: flXMM5,
- XMM6: opOverXMM6,
- XMM8: opOverXMM8,
- XMM9: opOverXMM9,
- XMM10: opOverXMM10,
- LoadArgs: twoArgLoadArgs,
- Setup: flSetup,
- LoadXMMRegs: flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
- Add: flAdd,
- ClampAndScale: flClampAndScale,
- ConvertToInt32: flConvertToInt32,
- Store4: opOverStore4,
- Store1: opOverStore1,
- }, {
- LongName: "floatingAccumulateOpSrc",
- ShortName: "flAccOpSrc",
- FrameSize: flFrameSize,
- ArgsSize: twoArgArgsSize,
- Args: "dst []uint8, src []float32",
- DstElemSize1: 1 * sizeOfUint8,
- DstElemSize4: 4 * sizeOfUint8,
- XMM3: flXMM3,
- XMM4: flXMM4,
- XMM5: flXMM5,
- XMM6: opSrcXMM6,
- XMM8: opSrcXMM8,
- XMM9: opSrcXMM9,
- XMM10: opSrcXMM10,
- LoadArgs: twoArgLoadArgs,
- Setup: flSetup,
- LoadXMMRegs: flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
- Add: flAdd,
- ClampAndScale: flClampAndScale,
- ConvertToInt32: flConvertToInt32,
- Store4: opSrcStore4,
- Store1: opSrcStore1,
- }, {
- LongName: "floatingAccumulateMask",
- ShortName: "flAccMask",
- FrameSize: flFrameSize,
- ArgsSize: twoArgArgsSize,
- Args: "dst []uint32, src []float32",
- DstElemSize1: 1 * sizeOfUint32,
- DstElemSize4: 4 * sizeOfUint32,
- XMM3: flXMM3,
- XMM4: flXMM4,
- XMM5: flXMM5,
- XMM6: maskXMM6,
- XMM8: maskXMM8,
- XMM9: maskXMM9,
- XMM10: maskXMM10,
- LoadArgs: twoArgLoadArgs,
- Setup: flSetup,
- LoadXMMRegs: flLoadXMMRegs + "\n" + maskLoadXMMRegs,
- Add: flAdd,
- ClampAndScale: flClampAndScale,
- ConvertToInt32: flConvertToInt32,
- Store4: maskStore4,
- Store1: maskStore1,
- }}
// The constants below are the building blocks referenced by the instances
// table. Names prefixed "fx" belong to the fixed-point variants, "fl" to the
// floating-point variants, and "opOver", "opSrc" and "mask" to the three ways
// of storing the result. The raw-string snippets are inserted into the
// generated assembly, so their contents (including the comments inside them)
// are part of the generator's output.
const (
	// Frame sizes in bytes. The floating-point variants need 8 bytes for the
	// two 4-byte MXCSR slots (mxcsrOrig, mxcsrNew) used by flSetup.
	fxFrameSize = `0`
	flFrameSize = `8`

	// Argument sizes in bytes: 24 for one slice argument, 48 for two.
	oneArgArgsSize = `24`
	twoArgArgsSize = `48`

	// Element sizes in bytes.
	sizeOfUint8  = 1
	sizeOfUint32 = 4

	// Names of the constants held in X3, X4 and X5, or "-" if the register
	// is not loaded (see fxLoadXMMRegs and flLoadXMMRegs).
	fxXMM3 = `-`
	flXMM3 = `flSignMask`

	fxXMM4 = `-`
	flXMM4 = `flOne`

	fxXMM5 = `fxAlmost65536`
	flXMM5 = `flAlmost65536`

	// oneArgLoadArgs loads the single buf argument into both the
	// destination registers (DI, BX) and the source registers (SI, R10).
	oneArgLoadArgs = `
MOVQ buf_base+0(FP), DI
MOVQ buf_len+8(FP), BX
MOVQ buf_base+0(FP), SI
MOVQ buf_len+8(FP), R10
`

	// twoArgLoadArgs loads dst into DI, BX and src into SI, R10, and jumps
	// to the function's End label if dst is shorter than src. The
	// {{.ShortName}} placeholder is expanded by main before the template is
	// executed.
	twoArgLoadArgs = `
MOVQ dst_base+0(FP), DI
MOVQ dst_len+8(FP), BX
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R10
// Sanity check that len(dst) >= len(src).
CMPQ BX, R10
JLT {{.ShortName}}End
`

	// Setup snippets. The fixed-point variants need none; the floating-point
	// variants save MXCSR and prepare a round-to-zero copy of it.
	fxSetup = ``
	flSetup = `
// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
// "Round To Zero".
STMXCSR mxcsrOrig-8(SP)
MOVL mxcsrOrig-8(SP), AX
ORL $0x6000, AX
MOVL AX, mxcsrNew-4(SP)
`

	// fxLoadXMMRegs loads the fixed-point clamp constant into X5.
	fxLoadXMMRegs = `
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
MOVOU fxAlmost65536<>(SB), X5
`

	// flLoadXMMRegs loads the floating-point constants into X3, X4 and X5.
	flLoadXMMRegs = `
// flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
// flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
MOVOU flSignMask<>(SB), X3
MOVOU flOne<>(SB), X4
MOVOU flAlmost65536<>(SB), X5
`

	// Mnemonics of the packed add instruction used for accumulation.
	fxAdd = `PADDD`
	flAdd = `ADDPS`

	// fxClampAndScale computes X2 = min(abs(X1)>>2, X5). The three
	// instructions are emitted as raw bytes; their mnemonics are listed in
	// the snippet's own comment.
	fxClampAndScale = `
// y = abs(x)
// y >>= 2 // Shift by 2*ϕ - 16.
// y = min(y, fxAlmost65536)
//
// pabsd %xmm1,%xmm2
// psrld $0x2,%xmm2
// pminud %xmm5,%xmm2
//
// Hopefully we'll get these opcode mnemonics into the assembler for Go
// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
// it's similar.
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
`

	// flClampAndScale computes X2 = min(X1 & X3, X4) * X5.
	flClampAndScale = `
// y = x & flSignMask
// y = min(y, flOne)
// y = mul(y, flAlmost65536)
MOVOU X3, X2
ANDPS X1, X2
MINPS X4, X2
MULPS X5, X2
`

	// fxConvertToInt32 emits no instructions: the fixed-point value in X2 is
	// used as is.
	fxConvertToInt32 = `
// z = convertToInt32(y)
// No-op.
`

	// flConvertToInt32 converts X2 to int32 under the MXCSR value prepared
	// by flSetup, then restores the original MXCSR.
	flConvertToInt32 = `
// z = convertToInt32(y)
LDMXCSR mxcsrNew-4(SP)
CVTPS2PL X2, X2
LDMXCSR mxcsrOrig-8(SP)
`

	// opOverStore4 blends four results in X2 over the four bytes at (DI).
	// It uses X0 and X11 as scratch and expects X6, X8, X9 and X10 to have
	// been loaded by opOverLoadXMMRegs.
	opOverStore4 = `
// Blend over the dst's prior value. SIMD for i in 0..3:
//
// dstA := uint32(dst[i]) * 0x101
// maskA := z@i
// outA := dstA*(0xffff-maskA)/0xffff + maskA
// dst[i] = uint8(outA >> 8)
//
// First, set X0 to dstA*(0xffff-maskA).
MOVL (DI), X0
PSHUFB X8, X0
MOVOU X9, X11
PSUBL X2, X11
PMULLD X11, X0
// We implement uint32 division by 0xffff as multiplication by a magic
// constant (0x80008001) and then a shift by a magic constant (47).
// See TestDivideByFFFF for a justification.
//
// That multiplication widens from uint32 to uint64, so we have to
// duplicate and shift our four uint32s from one XMM register (X0) to
// two XMM registers (X0 and X11).
//
// Move the second and fourth uint32s in X0 to be the first and third
// uint32s in X11.
MOVOU X0, X11
PSRLQ $32, X11
// Multiply by magic, shift by magic.
//
// pmuludq %xmm10,%xmm0
// pmuludq %xmm10,%xmm11
BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
PSRLQ $47, X0
PSRLQ $47, X11
// Merge the two registers back to one, X11, and add maskA.
PSLLQ $32, X11
XORPS X0, X11
PADDD X11, X2
// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
PSHUFB X6, X2
MOVL X2, (DI)
`

	// opSrcStore4 writes the second-lowest byte of each of the four uint32s
	// in X2 to the four bytes at (DI).
	opSrcStore4 = `
// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
// copy(dst[:4], low4BytesOf(z))
PSHUFB X6, X2
MOVL X2, (DI)
`

	// maskStore4 writes all of X2 (four uint32s) to (DI).
	maskStore4 = `
// copy(dst[:4], z)
MOVOU X2, (DI)
`

	// opOverStore1 is the scalar form of opOverStore4 for a single element.
	// It clobbers AX, BX, DX, R12 and R13.
	opOverStore1 = `
// Blend over the dst's prior value.
//
// dstA := uint32(dst[0]) * 0x101
// maskA := z
// outA := dstA*(0xffff-maskA)/0xffff + maskA
// dst[0] = uint8(outA >> 8)
MOVBLZX (DI), R12
IMULL $0x101, R12
MOVL X2, R13
MOVL $0xffff, AX
SUBL R13, AX
MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
ADDL DX, R13
SHRL $8, R13
MOVB R13, (DI)
`

	// opSrcStore1 is the scalar form of opSrcStore4. It clobbers BX.
	opSrcStore1 = `
// dst[0] = uint8(z>>8)
MOVL X2, BX
SHRL $8, BX
MOVB BX, (DI)
`

	// maskStore1 is the scalar form of maskStore4.
	maskStore1 = `
// dst[0] = uint32(z)
MOVL X2, (DI)
`

	// Names of the constants held in X6, X8, X9 and X10 for each store
	// flavour, or "-" if the register is not loaded (see the *LoadXMMRegs
	// snippets below).
	opOverXMM6 = `gather`
	opSrcXMM6  = `gather`
	maskXMM6   = `-`

	opOverXMM8 = `scatterAndMulBy0x101`
	opSrcXMM8  = `-`
	maskXMM8   = `-`

	opOverXMM9 = `fxAlmost65536`
	opSrcXMM9  = `-`
	maskXMM9   = `-`

	opOverXMM10 = `inverseFFFF`
	opSrcXMM10  = `-`
	maskXMM10   = `-`

	// opOverLoadXMMRegs loads the constants that opOverStore4 relies on.
	opOverLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
// fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
// inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
MOVOU gather<>(SB), X6
MOVOU scatterAndMulBy0x101<>(SB), X8
MOVOU fxAlmost65536<>(SB), X9
MOVOU inverseFFFF<>(SB), X10
`

	// opSrcLoadXMMRegs loads the shuffle mask that opSrcStore4 relies on.
	opSrcLoadXMMRegs = `
// gather := XMM(see above) // PSHUFB shuffle mask.
MOVOU gather<>(SB), X6
`

	// maskLoadXMMRegs is empty: the mask stores need no extra constants.
	maskLoadXMMRegs = ``
)
|