// crc32_amd64.s — SSE4.2 CRC32, match-length and histogram helpers (Go assembler, amd64).
//+build !noasm
//+build !appengine
//+build !gccgo

// Copyright 2015, Klaus Post, see LICENSE for details.
// func crc32sse(a []byte) uint32
// Returns the hardware CRC32 (SSE4.2, Castagnoli polynomial) of the FIRST
// 4 bytes of a only — this is a hashing helper, not a full-buffer checksum.
// NOTE(review): no length check is performed; assumes len(a) >= 4 — confirm
// all callers guarantee this.
TEXT ·crc32sse(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ a+0(FP), R10 // R10 = &a[0]
	XORQ BX, BX       // BX = 0: CRC accumulator seed

	// CRC32 dword (R10), EBX
	// (hand-encoded SSE4.2 CRC32 r32, m32 — the assembler lacks the mnemonic)
	BYTE $0xF2; BYTE $0x41; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0x1a

	MOVL BX, ret+24(FP) // return the 32-bit CRC
	RET
// func crc32sseAll(a []byte, dst []uint32)
// Computes the hardware CRC32 (SSE4.2) of every 4-byte window a[i:i+4]
// for i = 0 .. len(a)-4, storing each result in dst[i].
// Does nothing when len(a) < 4.
// NOTE(review): assumes len(dst) >= len(a)-3 — no bounds check is done here;
// confirm callers size dst accordingly.
TEXT ·crc32sseAll(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ a+0(FP), R8      // R8: src pointer
	MOVQ a_len+8(FP), R10 // R10: input length
	MOVQ dst+24(FP), R9   // R9: dst pointer
	SUBQ $4, R10          // R10 = len-4 (number of windows minus one)
	JS   end              // len < 4: nothing to do
	JZ   one_crc          // len == 4: exactly one window
	MOVQ R10, R13
	SHRQ $2, R10 // len/4 — main-loop iterations (4 windows each)
	ANDQ $3, R13 // len&3
	XORQ BX, BX
	ADDQ $1, R13 // R13 = windows left for rem_loop (1..4)
	TESTQ R10, R10
	JZ   rem_loop // fewer than 4 windows beyond the first: skip unrolled loop

crc_loop:
	// One 8-byte load covers four consecutive overlapping 4-byte windows:
	// AX = offset 0, CX = offset 1, SI = offset 2, R11 = offset 3.
	MOVQ (R8), R11
	XORQ BX, BX // zero the three CRC accumulators
	XORQ DX, DX
	XORQ DI, DI
	MOVQ R11, R12
	SHRQ $8, R11 // shift off 1 byte
	MOVQ R12, AX // AX = window at offset 0
	MOVQ R11, CX // CX = window at offset 1
	SHRQ $16, R12
	SHRQ $16, R11 // R11 = window at offset 3 (used below)
	MOVQ R12, SI  // SI = window at offset 2

	// Hand-encoded SSE4.2 CRC32 r32, r32 instructions follow
	// (the assembler lacks the mnemonic).

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	// CRC32 ECX, EDX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd1

	// CRC32 ESI, EDI
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xfe

	MOVL BX, (R9)  // dst[i+0]
	MOVL DX, 4(R9) // dst[i+1]
	MOVL DI, 8(R9) // dst[i+2]

	XORQ BX, BX
	MOVL R11, AX // fourth window (offset 3)

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, 12(R9) // dst[i+3]

	ADDQ $16, R9 // dst advances 4 entries
	ADDQ $4, R8  // src advances 4 bytes (windows overlap)
	XORQ BX, BX
	SUBQ $1, R10
	JNZ  crc_loop

rem_loop:
	// Handle the remaining 1..4 windows one at a time.
	MOVL (R8), AX

	// CRC32 EAX, EBX
	BYTE $0xF2; BYTE $0x0f
	BYTE $0x38; BYTE $0xf1; BYTE $0xd8

	MOVL BX, (R9)
	ADDQ $4, R9
	ADDQ $1, R8
	XORQ BX, BX
	SUBQ $1, R13
	JNZ  rem_loop

end:
	RET

one_crc:
	MOVQ $1, R13 // exactly one window; reuse rem_loop
	XORQ BX, BX
	JMP  rem_loop
// func matchLenSSE4(a, b []byte, max int) int
// Returns the number of leading bytes that are equal between a and b,
// capped at max. Despite the name, this routine uses plain 64-bit compares
// plus BSF — no SSE4 instructions appear in the body.
// NOTE(review): assumes max <= len(a) and max <= len(b); the 8-byte loads
// stay within max, so that bound is the only safety requirement — confirm
// callers enforce it.
TEXT ·matchLenSSE4(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ a_base+0(FP), SI  // SI: current position in a
	MOVQ b_base+24(FP), DI // DI: current position in b
	MOVQ DI, DX            // DX: &b[0], kept to compute the return index
	MOVQ max+48(FP), CX    // CX: bytes remaining to compare

cmp8:
	// As long as we are 8 or more bytes before the end of max, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ CX, $8
	JLT  cmp1
	MOVQ (SI), AX
	MOVQ (DI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, SI
	ADDQ $8, DI
	SUBQ $8, CX
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, DI

	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET

cmp1:
	// In the slices' tail, compare 1 byte at a time.
	CMPQ CX, $0
	JEQ  matchLenEnd
	MOVB (SI), AX // MOVB writes only AL; CMPB below compares bytes only
	MOVB (DI), BX
	CMPB AX, BX
	JNE  matchLenEnd
	ADDQ $1, SI
	ADDQ $1, DI
	SUBQ $1, CX
	JMP  cmp1

matchLenEnd:
	// Subtract off &b[0] to convert from &b[ret] to ret, and return.
	SUBQ DX, DI
	MOVQ DI, ret+56(FP)
	RET
// func histogram(b []byte, h []int32)
// Increments h[v] once for every byte value v occurring in b.
// NOTE(review): assumes len(h) >= 256 and that h is zeroed by the caller if
// fresh counts are wanted — this code only increments; no bounds check.
TEXT ·histogram(SB), 4, $0 // flag 4 = NOSPLIT
	MOVQ b+0(FP), SI     // SI: &b[0]
	MOVQ b_len+8(FP), R9 // R9: len(b)
	MOVQ h+24(FP), DI    // DI: histogram base (4-byte int32 entries)
	MOVQ R9, R8
	SHRQ $3, R8 // R8 = len(b)/8: iterations of the 8-bytes-at-a-time loop
	JZ   hist1
	// MOVB below writes only the low byte of R11, so zero it once up front;
	// the upper 56 bits then stay zero for every (DI)(R11*4) index.
	XORQ R11, R11

loop_hist8:
	// Load 8 bytes and count them, peeling one byte off per 8-bit shift.
	MOVQ (SI), R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 0]++
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 1]++
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 2]++
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 3]++
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 4]++
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 5]++
	SHRQ $8, R10

	MOVB R10, R11
	INCL (DI)(R11*4) // h[byte 6]++
	SHRQ $8, R10

	// After seven shifts R10 itself is the top byte (0..255).
	INCL (DI)(R10*4) // h[byte 7]++

	ADDQ $8, SI
	DECQ R8
	JNZ  loop_hist8

hist1:
	ANDQ $7, R9 // 0..7 tail bytes remain
	JZ   end_hist
	XORQ R10, R10 // pre-zero: MOVB writes only the low byte

loop_hist1:
	MOVB (SI), R10
	INCL (DI)(R10*4)
	INCQ SI
	DECQ R9
	JNZ  loop_hist1

end_hist:
	RET
  177. RET