// Reference: www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
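//
// These routines multiply a data vector by a constant in GF(2^8) using the
// split-table (nibble lookup) method from the paper above: every byte is
// split into its low and high 4-bit halves, each half indexes a 16-entry
// lookup table expanded for that constant, and the two lookups are XORed.
// A rough pure-Go sketch of the per-byte operation (names are illustrative
// only, not part of this package):
//
//	// lowTbl[i]  = mulGF(c, i)     for i = 0..15
//	// highTbl[i] = mulGF(c, i<<4)  for i = 0..15
//	func mulByte(lowTbl, highTbl *[16]byte, b byte) byte {
//		return lowTbl[b&0x0f] ^ highTbl[b>>4]
//	}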
#include "textflag.h"
#define low_tbl Y0
#define high_tbl Y1
#define mask Y2
#define in0 Y3
#define in1 Y4
#define in2 Y5
#define in3 Y6
#define in4 Y7
#define in5 Y8
#define in0_h Y10
#define in1_h Y11
#define in2_h Y12
#define in3_h Y13
#define in4_h Y14
#define in5_h Y15
#define in BX
#define out DI
#define len R8
#define pos R9
#define tmp0 R10
#define low_tblx X0
#define high_tblx X1
#define maskx X2
#define in0x X3
#define in0_hx X10
#define tmp0x X9
#define tmp1x X11
#define tmp2x X12
#define tmp3x X13
// func mulVectAVX2(tbl, d, p []byte)
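// Argument layout on amd64 (each slice header is 24 bytes: data, len, cap):
// tbl.data is at 0(FP), d.data at 24(FP), d.len at 32(FP), p.data at 48(FP);
// the i/o/in_len operand names below refer to those offsets.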
TEXT ·mulVectAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX                 // DL = 0x0f (low-nibble mask)
LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
VPBROADCASTB maskx, maskx      // broadcast 0x0f into every byte of maskx
MOVQ in_len+32(FP), len
TESTQ $31, len                 // length not a multiple of 32?
JNZ one16b                     // handle the trailing 16 bytes first
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
// 256bytes/loop
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
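// not_aligned: the length is not a multiple of 256. Consume len % 256 in
// 32-byte chunks, working backwards from the end, then jump back to the
// 256-byte loop above if at least 256 bytes remain.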
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
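// one16b: the length is not a multiple of 32; this path assumes it is still
// a multiple of 16. Handle the final 16 bytes with XMM registers, then
// continue with the remaining 32-byte-multiple length via the YMM path.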
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectAddAVX2(tbl, d, p []byte)
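// Identical to mulVectAVX2 except that each product is XORed into the bytes
// already present in the output slice (multiply-accumulate) instead of
// overwriting them.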
TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
VMOVDQU (tmp0), low_tblx
VMOVDQU 16(tmp0), high_tblx
MOVB $0x0f, DX
LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
VPBROADCASTB maskx, maskx
MOVQ in_len+32(FP), len
TESTQ $31, len
JNZ one16b
ymm:
VINSERTI128 $1, low_tblx, low_tbl, low_tbl
VINSERTI128 $1, high_tblx, high_tbl, high_tbl
VINSERTI128 $1, maskx, mask, mask
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
VMOVDQU (in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR (out)(pos*1), in0, in0
VMOVDQU in0, (out)(pos*1)
VMOVDQU 32(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 32(out)(pos*1), in1, in1
VMOVDQU in1, 32(out)(pos*1)
VMOVDQU 64(in)(pos*1), in2
VPSRLQ $4, in2, in2_h
VPAND mask, in2_h, in2_h
VPAND mask, in2, in2
VPSHUFB in2_h, high_tbl, in2_h
VPSHUFB in2, low_tbl, in2
VPXOR in2, in2_h, in2
VPXOR 64(out)(pos*1), in2, in2
VMOVDQU in2, 64(out)(pos*1)
VMOVDQU 96(in)(pos*1), in3
VPSRLQ $4, in3, in3_h
VPAND mask, in3_h, in3_h
VPAND mask, in3, in3
VPSHUFB in3_h, high_tbl, in3_h
VPSHUFB in3, low_tbl, in3
VPXOR in3, in3_h, in3
VPXOR 96(out)(pos*1), in3, in3
VMOVDQU in3, 96(out)(pos*1)
VMOVDQU 128(in)(pos*1), in4
VPSRLQ $4, in4, in4_h
VPAND mask, in4_h, in4_h
VPAND mask, in4, in4
VPSHUFB in4_h, high_tbl, in4_h
VPSHUFB in4, low_tbl, in4
VPXOR in4, in4_h, in4
VPXOR 128(out)(pos*1), in4, in4
VMOVDQU in4, 128(out)(pos*1)
VMOVDQU 160(in)(pos*1), in5
VPSRLQ $4, in5, in5_h
VPAND mask, in5_h, in5_h
VPAND mask, in5, in5
VPSHUFB in5_h, high_tbl, in5_h
VPSHUFB in5, low_tbl, in5
VPXOR in5, in5_h, in5
VPXOR 160(out)(pos*1), in5, in5
VMOVDQU in5, 160(out)(pos*1)
VMOVDQU 192(in)(pos*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR 192(out)(pos*1), in0, in0
VMOVDQU in0, 192(out)(pos*1)
VMOVDQU 224(in)(pos*1), in1
VPSRLQ $4, in1, in1_h
VPAND mask, in1_h, in1_h
VPAND mask, in1, in1
VPSHUFB in1_h, high_tbl, in1_h
VPSHUFB in1, low_tbl, in1
VPXOR in1, in1_h, in1
VPXOR 224(out)(pos*1), in1, in1
VMOVDQU in1, 224(out)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
not_aligned:
MOVQ len, tmp0
ANDQ $255, tmp0
loop32b:
VMOVDQU -32(in)(len*1), in0
VPSRLQ $4, in0, in0_h
VPAND mask, in0_h, in0_h
VPAND mask, in0, in0
VPSHUFB in0_h, high_tbl, in0_h
VPSHUFB in0, low_tbl, in0
VPXOR in0, in0_h, in0
VPXOR -32(out)(len*1), in0, in0
VMOVDQU in0, -32(out)(len*1)
SUBQ $32, len
SUBQ $32, tmp0
JG loop32b
CMPQ len, $256
JGE aligned
VZEROUPPER
RET
one16b:
VMOVDQU -16(in)(len*1), in0x
VPSRLQ $4, in0x, in0_hx
VPAND maskx, in0x, in0x
VPAND maskx, in0_hx, in0_hx
VPSHUFB in0_hx, high_tblx, in0_hx
VPSHUFB in0x, low_tblx, in0x
VPXOR in0x, in0_hx, in0x
VPXOR -16(out)(len*1), in0x, in0x
VMOVDQU in0x, -16(out)(len*1)
SUBQ $16, len
CMPQ len, $0
JNE ymm
RET
// func mulVectSSSE3(tbl, d, p []byte)
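// SSSE3 fallback: the same nibble-table multiply, 16 bytes per iteration.
// Only (len/16)*16 bytes are processed; any tail shorter than 16 bytes is
// left untouched.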
TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx // broadcast 0x0f into every byte of maskx
MOVQ in_len+32(FP), len
SHRQ $4, len        // number of 16-byte blocks
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func mulVectAddSSSE3(tbl, d, p []byte)
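// SSSE3 multiply-accumulate variant: XORs each 16-byte product into the
// existing output instead of overwriting it.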
TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
MOVQ i+24(FP), in
MOVQ o+48(FP), out
MOVQ tbl+0(FP), tmp0
MOVOU (tmp0), low_tblx
MOVOU 16(tmp0), high_tblx
MOVB $15, tmp0
MOVQ tmp0, maskx
PXOR tmp0x, tmp0x
PSHUFB tmp0x, maskx
MOVQ in_len+32(FP), len
SHRQ $4, len
loop:
MOVOU (in), in0x
MOVOU in0x, in0_hx
PSRLQ $4, in0_hx
PAND maskx, in0x
PAND maskx, in0_hx
MOVOU low_tblx, tmp1x
MOVOU high_tblx, tmp2x
PSHUFB in0x, tmp1x
PSHUFB in0_hx, tmp2x
PXOR tmp1x, tmp2x
MOVOU (out), tmp3x
PXOR tmp3x, tmp2x
MOVOU tmp2x, (out)
ADDQ $16, in
ADDQ $16, out
SUBQ $1, len
JNZ loop
RET
// func copy32B(dst, src []byte)
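// Copies exactly 32 bytes from src to dst using two unaligned 16-byte moves;
// the caller is expected to pass slices of at least 32 bytes.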
TEXT ·copy32B(SB), NOSPLIT, $0
MOVQ dst+0(FP), SI
MOVQ src+24(FP), DX
MOVOU (DX), X0
MOVOU 16(DX), X1
MOVOU X0, (SI)
MOVOU X1, 16(SI)
RET