gen.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. package main
  6. import (
  7. "bytes"
  8. "io/ioutil"
  9. "log"
  10. "strings"
  11. "text/template"
  12. )
  13. const (
  14. copyright = "" +
  15. "// Copyright 2016 The Go Authors. All rights reserved.\n" +
  16. "// Use of this source code is governed by a BSD-style\n" +
  17. "// license that can be found in the LICENSE file.\n"
  18. doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n"
  19. dashDashDash = "// --------"
  20. )
  21. func main() {
  22. tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
  23. if err != nil {
  24. log.Fatalf("ReadFile: %v", err)
  25. }
  26. if !bytes.HasPrefix(tmpl, []byte(copyright)) {
  27. log.Fatal("source template did not start with the copyright header")
  28. }
  29. tmpl = tmpl[len(copyright):]
  30. preamble := []byte(nil)
  31. if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
  32. log.Fatalf("source template did not contain %q", dashDashDash)
  33. } else {
  34. preamble, tmpl = tmpl[:i], tmpl[i:]
  35. }
  36. t, err := template.New("").Parse(string(tmpl))
  37. if err != nil {
  38. log.Fatalf("Parse: %v", err)
  39. }
  40. out := bytes.NewBuffer(nil)
  41. out.WriteString(doNotEdit)
  42. out.Write(preamble)
  43. for i, v := range instances {
  44. if i != 0 {
  45. out.WriteString("\n")
  46. }
  47. if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
  48. v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
  49. }
  50. if err := t.Execute(out, v); err != nil {
  51. log.Fatalf("Execute(%q): %v", v.ShortName, err)
  52. }
  53. }
  54. if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
  55. log.Fatalf("WriteFile: %v", err)
  56. }
  57. }
  58. var instances = []struct {
  59. LongName string
  60. ShortName string
  61. FrameSize string
  62. ArgsSize string
  63. Args string
  64. DstElemSize1 int
  65. DstElemSize4 int
  66. XMM3 string
  67. XMM4 string
  68. XMM5 string
  69. XMM6 string
  70. XMM8 string
  71. XMM9 string
  72. XMM10 string
  73. LoadArgs string
  74. Setup string
  75. LoadXMMRegs string
  76. Add string
  77. ClampAndScale string
  78. ConvertToInt32 string
  79. Store4 string
  80. Store1 string
  81. }{{
  82. LongName: "fixedAccumulateOpOver",
  83. ShortName: "fxAccOpOver",
  84. FrameSize: fxFrameSize,
  85. ArgsSize: twoArgArgsSize,
  86. Args: "dst []uint8, src []uint32",
  87. DstElemSize1: 1 * sizeOfUint8,
  88. DstElemSize4: 4 * sizeOfUint8,
  89. XMM3: fxXMM3,
  90. XMM4: fxXMM4,
  91. XMM5: fxXMM5,
  92. XMM6: opOverXMM6,
  93. XMM8: opOverXMM8,
  94. XMM9: opOverXMM9,
  95. XMM10: opOverXMM10,
  96. LoadArgs: twoArgLoadArgs,
  97. Setup: fxSetup,
  98. LoadXMMRegs: fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
  99. Add: fxAdd,
  100. ClampAndScale: fxClampAndScale,
  101. ConvertToInt32: fxConvertToInt32,
  102. Store4: opOverStore4,
  103. Store1: opOverStore1,
  104. }, {
  105. LongName: "fixedAccumulateOpSrc",
  106. ShortName: "fxAccOpSrc",
  107. FrameSize: fxFrameSize,
  108. ArgsSize: twoArgArgsSize,
  109. Args: "dst []uint8, src []uint32",
  110. DstElemSize1: 1 * sizeOfUint8,
  111. DstElemSize4: 4 * sizeOfUint8,
  112. XMM3: fxXMM3,
  113. XMM4: fxXMM4,
  114. XMM5: fxXMM5,
  115. XMM6: opSrcXMM6,
  116. XMM8: opSrcXMM8,
  117. XMM9: opSrcXMM9,
  118. XMM10: opSrcXMM10,
  119. LoadArgs: twoArgLoadArgs,
  120. Setup: fxSetup,
  121. LoadXMMRegs: fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
  122. Add: fxAdd,
  123. ClampAndScale: fxClampAndScale,
  124. ConvertToInt32: fxConvertToInt32,
  125. Store4: opSrcStore4,
  126. Store1: opSrcStore1,
  127. }, {
  128. LongName: "fixedAccumulateMask",
  129. ShortName: "fxAccMask",
  130. FrameSize: fxFrameSize,
  131. ArgsSize: oneArgArgsSize,
  132. Args: "buf []uint32",
  133. DstElemSize1: 1 * sizeOfUint32,
  134. DstElemSize4: 4 * sizeOfUint32,
  135. XMM3: fxXMM3,
  136. XMM4: fxXMM4,
  137. XMM5: fxXMM5,
  138. XMM6: maskXMM6,
  139. XMM8: maskXMM8,
  140. XMM9: maskXMM9,
  141. XMM10: maskXMM10,
  142. LoadArgs: oneArgLoadArgs,
  143. Setup: fxSetup,
  144. LoadXMMRegs: fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
  145. Add: fxAdd,
  146. ClampAndScale: fxClampAndScale,
  147. ConvertToInt32: fxConvertToInt32,
  148. Store4: maskStore4,
  149. Store1: maskStore1,
  150. }, {
  151. LongName: "floatingAccumulateOpOver",
  152. ShortName: "flAccOpOver",
  153. FrameSize: flFrameSize,
  154. ArgsSize: twoArgArgsSize,
  155. Args: "dst []uint8, src []float32",
  156. DstElemSize1: 1 * sizeOfUint8,
  157. DstElemSize4: 4 * sizeOfUint8,
  158. XMM3: flXMM3,
  159. XMM4: flXMM4,
  160. XMM5: flXMM5,
  161. XMM6: opOverXMM6,
  162. XMM8: opOverXMM8,
  163. XMM9: opOverXMM9,
  164. XMM10: opOverXMM10,
  165. LoadArgs: twoArgLoadArgs,
  166. Setup: flSetup,
  167. LoadXMMRegs: flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
  168. Add: flAdd,
  169. ClampAndScale: flClampAndScale,
  170. ConvertToInt32: flConvertToInt32,
  171. Store4: opOverStore4,
  172. Store1: opOverStore1,
  173. }, {
  174. LongName: "floatingAccumulateOpSrc",
  175. ShortName: "flAccOpSrc",
  176. FrameSize: flFrameSize,
  177. ArgsSize: twoArgArgsSize,
  178. Args: "dst []uint8, src []float32",
  179. DstElemSize1: 1 * sizeOfUint8,
  180. DstElemSize4: 4 * sizeOfUint8,
  181. XMM3: flXMM3,
  182. XMM4: flXMM4,
  183. XMM5: flXMM5,
  184. XMM6: opSrcXMM6,
  185. XMM8: opSrcXMM8,
  186. XMM9: opSrcXMM9,
  187. XMM10: opSrcXMM10,
  188. LoadArgs: twoArgLoadArgs,
  189. Setup: flSetup,
  190. LoadXMMRegs: flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
  191. Add: flAdd,
  192. ClampAndScale: flClampAndScale,
  193. ConvertToInt32: flConvertToInt32,
  194. Store4: opSrcStore4,
  195. Store1: opSrcStore1,
  196. }, {
  197. LongName: "floatingAccumulateMask",
  198. ShortName: "flAccMask",
  199. FrameSize: flFrameSize,
  200. ArgsSize: twoArgArgsSize,
  201. Args: "dst []uint32, src []float32",
  202. DstElemSize1: 1 * sizeOfUint32,
  203. DstElemSize4: 4 * sizeOfUint32,
  204. XMM3: flXMM3,
  205. XMM4: flXMM4,
  206. XMM5: flXMM5,
  207. XMM6: maskXMM6,
  208. XMM8: maskXMM8,
  209. XMM9: maskXMM9,
  210. XMM10: maskXMM10,
  211. LoadArgs: twoArgLoadArgs,
  212. Setup: flSetup,
  213. LoadXMMRegs: flLoadXMMRegs + "\n" + maskLoadXMMRegs,
  214. Add: flAdd,
  215. ClampAndScale: flClampAndScale,
  216. ConvertToInt32: flConvertToInt32,
  217. Store4: maskStore4,
  218. Store1: maskStore1,
  219. }}
  220. const (
  221. fxFrameSize = `0`
  222. flFrameSize = `8`
  223. oneArgArgsSize = `24`
  224. twoArgArgsSize = `48`
  225. sizeOfUint8 = 1
  226. sizeOfUint32 = 4
  227. fxXMM3 = `-`
  228. flXMM3 = `flSignMask`
  229. fxXMM4 = `-`
  230. flXMM4 = `flOne`
  231. fxXMM5 = `fxAlmost65536`
  232. flXMM5 = `flAlmost65536`
  233. oneArgLoadArgs = `
  234. MOVQ buf_base+0(FP), DI
  235. MOVQ buf_len+8(FP), BX
  236. MOVQ buf_base+0(FP), SI
  237. MOVQ buf_len+8(FP), R10
  238. `
  239. twoArgLoadArgs = `
  240. MOVQ dst_base+0(FP), DI
  241. MOVQ dst_len+8(FP), BX
  242. MOVQ src_base+24(FP), SI
  243. MOVQ src_len+32(FP), R10
  244. // Sanity check that len(dst) >= len(src).
  245. CMPQ BX, R10
  246. JLT {{.ShortName}}End
  247. `
  248. fxSetup = ``
  249. flSetup = `
  250. // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
  251. // "Round To Zero".
  252. STMXCSR mxcsrOrig-8(SP)
  253. MOVL mxcsrOrig-8(SP), AX
  254. ORL $0x6000, AX
  255. MOVL AX, mxcsrNew-4(SP)
  256. `
  257. fxLoadXMMRegs = `
  258. // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16.
  259. MOVOU fxAlmost65536<>(SB), X5
  260. `
  261. flLoadXMMRegs = `
  262. // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
  263. // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32.
  264. // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
  265. MOVOU flSignMask<>(SB), X3
  266. MOVOU flOne<>(SB), X4
  267. MOVOU flAlmost65536<>(SB), X5
  268. `
  269. fxAdd = `PADDD`
  270. flAdd = `ADDPS`
  271. fxClampAndScale = `
  272. // y = abs(x)
  273. // y >>= 2 // Shift by 2*ϕ - 16.
  274. // y = min(y, fxAlmost65536)
  275. //
  276. // pabsd %xmm1,%xmm2
  277. // psrld $0x2,%xmm2
  278. // pminud %xmm5,%xmm2
  279. //
  280. // Hopefully we'll get these opcode mnemonics into the assembler for Go
  281. // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
  282. // it's similar.
  283. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
  284. BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
  285. BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
  286. `
  287. flClampAndScale = `
  288. // y = x & flSignMask
  289. // y = min(y, flOne)
  290. // y = mul(y, flAlmost65536)
  291. MOVOU X3, X2
  292. ANDPS X1, X2
  293. MINPS X4, X2
  294. MULPS X5, X2
  295. `
  296. fxConvertToInt32 = `
  297. // z = convertToInt32(y)
  298. // No-op.
  299. `
  300. flConvertToInt32 = `
  301. // z = convertToInt32(y)
  302. LDMXCSR mxcsrNew-4(SP)
  303. CVTPS2PL X2, X2
  304. LDMXCSR mxcsrOrig-8(SP)
  305. `
  306. opOverStore4 = `
  307. // Blend over the dst's prior value. SIMD for i in 0..3:
  308. //
  309. // dstA := uint32(dst[i]) * 0x101
  310. // maskA := z@i
  311. // outA := dstA*(0xffff-maskA)/0xffff + maskA
  312. // dst[i] = uint8(outA >> 8)
  313. //
  314. // First, set X0 to dstA*(0xfff-maskA).
  315. MOVL (DI), X0
  316. PSHUFB X8, X0
  317. MOVOU X9, X11
  318. PSUBL X2, X11
  319. PMULLD X11, X0
  320. // We implement uint32 division by 0xffff as multiplication by a magic
  321. // constant (0x800080001) and then a shift by a magic constant (47).
  322. // See TestDivideByFFFF for a justification.
  323. //
  324. // That multiplication widens from uint32 to uint64, so we have to
  325. // duplicate and shift our four uint32s from one XMM register (X0) to
  326. // two XMM registers (X0 and X11).
  327. //
  328. // Move the second and fourth uint32s in X0 to be the first and third
  329. // uint32s in X11.
  330. MOVOU X0, X11
  331. PSRLQ $32, X11
  332. // Multiply by magic, shift by magic.
  333. //
  334. // pmuludq %xmm10,%xmm0
  335. // pmuludq %xmm10,%xmm11
  336. BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
  337. BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
  338. PSRLQ $47, X0
  339. PSRLQ $47, X11
  340. // Merge the two registers back to one, X11, and add maskA.
  341. PSLLQ $32, X11
  342. XORPS X0, X11
  343. PADDD X11, X2
  344. // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
  345. PSHUFB X6, X2
  346. MOVL X2, (DI)
  347. `
  348. opSrcStore4 = `
  349. // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
  350. // copy(dst[:4], low4BytesOf(z))
  351. PSHUFB X6, X2
  352. MOVL X2, (DI)
  353. `
  354. maskStore4 = `
  355. // copy(dst[:4], z)
  356. MOVOU X2, (DI)
  357. `
  358. opOverStore1 = `
  359. // Blend over the dst's prior value.
  360. //
  361. // dstA := uint32(dst[0]) * 0x101
  362. // maskA := z
  363. // outA := dstA*(0xffff-maskA)/0xffff + maskA
  364. // dst[0] = uint8(outA >> 8)
  365. MOVBLZX (DI), R12
  366. IMULL $0x101, R12
  367. MOVL X2, R13
  368. MOVL $0xffff, AX
  369. SUBL R13, AX
  370. MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX.
  371. MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant...
  372. MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX.
  373. SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15).
  374. ADDL DX, R13
  375. SHRL $8, R13
  376. MOVB R13, (DI)
  377. `
  378. opSrcStore1 = `
  379. // dst[0] = uint8(z>>8)
  380. MOVL X2, BX
  381. SHRL $8, BX
  382. MOVB BX, (DI)
  383. `
  384. maskStore1 = `
  385. // dst[0] = uint32(z)
  386. MOVL X2, (DI)
  387. `
  388. opOverXMM6 = `gather`
  389. opSrcXMM6 = `gather`
  390. maskXMM6 = `-`
  391. opOverXMM8 = `scatterAndMulBy0x101`
  392. opSrcXMM8 = `-`
  393. maskXMM8 = `-`
  394. opOverXMM9 = `fxAlmost65536`
  395. opSrcXMM9 = `-`
  396. maskXMM9 = `-`
  397. opOverXMM10 = `inverseFFFF`
  398. opSrcXMM10 = `-`
  399. maskXMM10 = `-`
  400. opOverLoadXMMRegs = `
  401. // gather := XMM(see above) // PSHUFB shuffle mask.
  402. // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
  403. // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff.
  404. // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
  405. MOVOU gather<>(SB), X6
  406. MOVOU scatterAndMulBy0x101<>(SB), X8
  407. MOVOU fxAlmost65536<>(SB), X9
  408. MOVOU inverseFFFF<>(SB), X10
  409. `
  410. opSrcLoadXMMRegs = `
  411. // gather := XMM(see above) // PSHUFB shuffle mask.
  412. MOVOU gather<>(SB), X6
  413. `
  414. maskLoadXMMRegs = ``
  415. )