avx2_amd64.s
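
// AVX2 XOR kernels. The bytesAVX2* routines XOR two byte slices (src0 and
// src1) into dst; the matrixAVX2* routines XOR a whole set of source
// vectors into dst. The mini/small/big suffixes reflect how much work is
// done per loop iteration and how results are stored: mini handles 32
// bytes per pass, small unrolls to 128 bytes, and big also handles 128
// bytes but uses non-temporal stores, presumably to avoid cache pollution
// when the buffers are much larger than the cache. Each TEXT symbol below
// is expected to be declared as a Go function (likely with //go:noescape)
// in an accompanying Go file; those declarations are not part of this file.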

#include "textflag.h"

// memory addresses
#define DST  BX
#define SRC  SI
#define SRC0 TMP4
#define SRC1 TMP5

// loop arguments
// number of vectors
#define VECT CX
#define LEN  DX
// position in the matrix
#define POS  R8

// temporary registers
// number of vectors or ...
#define TMP1 R9
// position in the matrix or ...
#define TMP2 R10
// address of data/parity or ...
#define TMP3 R11
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
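
// Argument layout: on amd64 every []byte argument is passed as a 24-byte
// slice header (pointer, length, capacity). For the bytes* routines dst
// therefore starts at 0(FP), src0 at 24(FP), src1 at 48(FP) and the
// trailing int at 72(FP); for the matrix* routines dst's length is at
// 8(FP) and the length of src (the vector count) at 32(FP). The same
// 24-byte stride is what the matrix loops use to walk src (ADDQ $24, TMP2).
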
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
    MOVQ  len+72(FP), LEN
    CMPQ  LEN, $0
    JE    ret
    MOVQ  dst+0(FP), DST
    MOVQ  src0+24(FP), SRC0
    MOVQ  src1+48(FP), SRC1
    TESTQ $31, LEN
    JNZ   not_aligned

aligned:
    MOVQ $0, POS

loop32b:
    VMOVDQU (SRC0)(POS*1), Y0
    VPXOR   (SRC1)(POS*1), Y0, Y0
    VMOVDQU Y0, (DST)(POS*1)
    ADDQ    $32, POS
    CMPQ    LEN, POS
    JNE     loop32b
    VZEROUPPER
    RET
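
// Tail handling (the same shape is used by every routine in this file):
// when the length is not a multiple of the wide-loop width (32 bytes here,
// 128 in the variants below), not_aligned peels the remainder off the end
// of the buffers. loop_1b XORs one byte at a time until the remaining
// length is a multiple of 8, loop_8b then XORs 8-byte words until the
// remainder of the width is consumed, and once the remaining length is an
// exact multiple of the width, control jumps back to the aligned loop.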
loop_1b:
    MOVB  -1(SRC0)(LEN*1), TMP1
    MOVB  -1(SRC1)(LEN*1), TMP2
    XORB  TMP1, TMP2
    MOVB  TMP2, -1(DST)(LEN*1)
    SUBQ  $1, LEN
    TESTQ $7, LEN
    JNZ   loop_1b

    CMPQ  LEN, $0
    JE    ret
    TESTQ $31, LEN
    JZ    aligned

not_aligned:
    TESTQ $7, LEN
    JNE   loop_1b
    MOVQ  LEN, TMP1
    ANDQ  $31, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG   loop_8b

    CMPQ LEN, $32
    JGE  aligned
    RET

ret:
    RET
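
// bytesAVX2small has the same structure as bytesAVX2mini, but its aligned
// loop is unrolled to 128 bytes (four YMM registers) per iteration.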
// func bytesAVX2small(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2small(SB), NOSPLIT, $0
    MOVQ  len+72(FP), LEN
    CMPQ  LEN, $0
    JE    ret
    MOVQ  dst+0(FP), DST
    MOVQ  src0+24(FP), SRC0
    MOVQ  src1+48(FP), SRC1
    TESTQ $127, LEN
    JNZ   not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    VMOVDQU (SRC0)(POS*1), Y0
    VMOVDQU 32(SRC0)(POS*1), Y1
    VMOVDQU 64(SRC0)(POS*1), Y2
    VMOVDQU 96(SRC0)(POS*1), Y3
    VPXOR   (SRC1)(POS*1), Y0, Y0
    VPXOR   32(SRC1)(POS*1), Y1, Y1
    VPXOR   64(SRC1)(POS*1), Y2, Y2
    VPXOR   96(SRC1)(POS*1), Y3, Y3
    VMOVDQU Y0, (DST)(POS*1)
    VMOVDQU Y1, 32(DST)(POS*1)
    VMOVDQU Y2, 64(DST)(POS*1)
    VMOVDQU Y3, 96(DST)(POS*1)
    ADDQ    $128, POS
    CMPQ    LEN, POS
    JNE     loop128b
    VZEROUPPER
    RET

loop_1b:
    MOVB  -1(SRC0)(LEN*1), TMP1
    MOVB  -1(SRC1)(LEN*1), TMP2
    XORB  TMP1, TMP2
    MOVB  TMP2, -1(DST)(LEN*1)
    SUBQ  $1, LEN
    TESTQ $7, LEN
    JNZ   loop_1b

    CMPQ  LEN, $0
    JE    ret
    TESTQ $127, LEN
    JZ    aligned

not_aligned:
    TESTQ $7, LEN
    JNE   loop_1b
    MOVQ  LEN, TMP1
    ANDQ  $127, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG   loop_8b

    CMPQ LEN, $128
    JGE  aligned
    RET

ret:
    RET
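
// bytesAVX2big matches bytesAVX2small except that dst is written with
// non-temporal stores (VMOVNTDQ, byte-encoded below) followed by an
// SFENCE, presumably so that very large buffers bypass the cache.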
// func bytesAVX2big(dst, src0, src1 []byte, size int)
TEXT ·bytesAVX2big(SB), NOSPLIT, $0
    MOVQ  len+72(FP), LEN
    CMPQ  LEN, $0
    JE    ret
    MOVQ  dst+0(FP), DST
    MOVQ  src0+24(FP), SRC0
    MOVQ  src1+48(FP), SRC1
    TESTQ $127, LEN
    JNZ   not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    VMOVDQU (SRC0)(POS*1), Y0
    VMOVDQU 32(SRC0)(POS*1), Y1
    VMOVDQU 64(SRC0)(POS*1), Y2
    VMOVDQU 96(SRC0)(POS*1), Y3
    VPXOR   (SRC1)(POS*1), Y0, Y0
    VPXOR   32(SRC1)(POS*1), Y1, Y1
    VPXOR   64(SRC1)(POS*1), Y2, Y2
    VPXOR   96(SRC1)(POS*1), Y3, Y3
    LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ Y0, (DST)(POS*1)
    LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 // VMOVNTDQ Y1, 32(DST)(POS*1)
    LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 // VMOVNTDQ Y2, 64(DST)(POS*1)
    LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 // VMOVNTDQ Y3, 96(DST)(POS*1)
    ADDQ    $128, POS
    CMPQ    LEN, POS
    JNE     loop128b
    SFENCE
    VZEROUPPER
    RET

loop_1b:
    MOVB  -1(SRC0)(LEN*1), TMP1
    MOVB  -1(SRC1)(LEN*1), TMP2
    XORB  TMP1, TMP2
    MOVB  TMP2, -1(DST)(LEN*1)
    SUBQ  $1, LEN
    TESTQ $7, LEN
    JNZ   loop_1b

    CMPQ  LEN, $0
    JE    ret
    TESTQ $127, LEN
    JZ    aligned

not_aligned:
    TESTQ $7, LEN
    JNE   loop_1b
    MOVQ  LEN, TMP1
    ANDQ  $127, TMP1

loop_8b:
    MOVQ -8(SRC0)(LEN*1), TMP2
    MOVQ -8(SRC1)(LEN*1), TMP3
    XORQ TMP2, TMP3
    MOVQ TMP3, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP1
    JG   loop_8b

    CMPQ LEN, $128
    JGE  aligned
    RET

ret:
    RET
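
// The matrix routines XOR every source vector of src ([][]byte) into dst.
// The first vector is loaded directly (Y0-Y3 in the wide loop, TMP5 in the
// tail loops); next_vect then steps through the remaining slice headers 24
// bytes at a time and folds each vector in with VPXOR. The loop counter
// TMP1 starts at VECT-2 and the loop runs while it stays >= 0 (JGE), i.e.
// VECT-1 times, which covers every vector after the first.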
// func matrixAVX2small(dst []byte, src [][]byte)
TEXT ·matrixAVX2small(SB), NOSPLIT, $0
    MOVQ  dst+0(FP), DST
    MOVQ  src+24(FP), SRC
    MOVQ  vec+32(FP), VECT
    MOVQ  len+8(FP), LEN
    TESTQ $127, LEN
    JNZ   not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    MOVQ    VECT, TMP1
    SUBQ    $2, TMP1
    MOVQ    $0, TMP2
    MOVQ    (SRC)(TMP2*1), TMP3
    MOVQ    TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y0
    VMOVDQU 32(TMP4)(POS*1), Y1
    VMOVDQU 64(TMP3)(POS*1), Y2
    VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
    ADDQ    $24, TMP2
    MOVQ    (SRC)(TMP2*1), TMP3
    MOVQ    TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y4
    VMOVDQU 32(TMP4)(POS*1), Y5
    VMOVDQU 64(TMP3)(POS*1), Y6
    VMOVDQU 96(TMP4)(POS*1), Y7
    VPXOR   Y4, Y0, Y0
    VPXOR   Y5, Y1, Y1
    VPXOR   Y6, Y2, Y2
    VPXOR   Y7, Y3, Y3
    SUBQ    $1, TMP1
    JGE     next_vect

    VMOVDQU Y0, (DST)(POS*1)
    VMOVDQU Y1, 32(DST)(POS*1)
    VMOVDQU Y2, 64(DST)(POS*1)
    VMOVDQU Y3, 96(DST)(POS*1)
    ADDQ    $128, POS
    CMPQ    LEN, POS
    JNE     loop128b
    VZEROUPPER
    RET
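
// The tail loops below mirror those of the bytes* routines, except that
// every 8-byte or 1-byte chunk re-walks the full list of source vectors
// (next_vect_8b / next_vect_1b).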
loop_1b:
    MOVQ  VECT, TMP1
    MOVQ  $0, TMP2
    MOVQ  (SRC)(TMP2*1), TMP3
    SUBQ  $2, TMP1
    MOVB  -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ  $24, TMP2
    MOVQ  (SRC)(TMP2*1), TMP3
    MOVB  -1(TMP3)(LEN*1), TMP6
    XORB  TMP6, TMP5
    SUBQ  $1, TMP1
    JGE   next_vect_1b

    MOVB  TMP5, -1(DST)(LEN*1)
    SUBQ  $1, LEN
    TESTQ $7, LEN
    JNZ   loop_1b

    CMPQ  LEN, $0
    JE    ret
    TESTQ $127, LEN
    JZ    aligned

not_aligned:
    TESTQ $7, LEN
    JNE   loop_1b
    MOVQ  LEN, TMP4
    ANDQ  $127, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE  next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG   loop_8b

    CMPQ LEN, $128
    JGE  aligned
    RET

ret:
    RET
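
// matrixAVX2big is identical to matrixAVX2small except that the 128-byte
// loop writes dst with byte-encoded non-temporal stores (VMOVNTDQ),
// mirroring bytesAVX2big.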
// func matrixAVX2big(dst []byte, src [][]byte)
TEXT ·matrixAVX2big(SB), NOSPLIT, $0
    MOVQ  dst+0(FP), DST
    MOVQ  src+24(FP), SRC
    MOVQ  vec+32(FP), VECT
    MOVQ  len+8(FP), LEN
    TESTQ $127, LEN
    JNZ   not_aligned

aligned:
    MOVQ $0, POS

loop128b:
    MOVQ    VECT, TMP1
    SUBQ    $2, TMP1
    MOVQ    $0, TMP2
    MOVQ    (SRC)(TMP2*1), TMP3
    MOVQ    TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y0
    VMOVDQU 32(TMP4)(POS*1), Y1
    VMOVDQU 64(TMP3)(POS*1), Y2
    VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
    ADDQ    $24, TMP2
    MOVQ    (SRC)(TMP2*1), TMP3
    MOVQ    TMP3, TMP4
    VMOVDQU (TMP3)(POS*1), Y4
    VMOVDQU 32(TMP4)(POS*1), Y5
    VMOVDQU 64(TMP3)(POS*1), Y6
    VMOVDQU 96(TMP4)(POS*1), Y7
    VPXOR   Y4, Y0, Y0
    VPXOR   Y5, Y1, Y1
    VPXOR   Y6, Y2, Y2
    VPXOR   Y7, Y3, Y3
    SUBQ    $1, TMP1
    JGE     next_vect
    LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ Y0, (DST)(POS*1) (byte-encoded; Go 1.8's assembler supports VMOVNTDQ natively)
    LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 // VMOVNTDQ Y1, 32(DST)(POS*1)
    LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 // VMOVNTDQ Y2, 64(DST)(POS*1)
    LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 // VMOVNTDQ Y3, 96(DST)(POS*1)
    ADDQ    $128, POS
    CMPQ    LEN, POS
    JNE     loop128b
    SFENCE     // order the non-temporal stores, as in bytesAVX2big
    VZEROUPPER
    RET
loop_1b:
    MOVQ  VECT, TMP1
    MOVQ  $0, TMP2
    MOVQ  (SRC)(TMP2*1), TMP3
    SUBQ  $2, TMP1
    MOVB  -1(TMP3)(LEN*1), TMP5

next_vect_1b:
    ADDQ  $24, TMP2
    MOVQ  (SRC)(TMP2*1), TMP3
    MOVB  -1(TMP3)(LEN*1), TMP6
    XORB  TMP6, TMP5
    SUBQ  $1, TMP1
    JGE   next_vect_1b

    MOVB  TMP5, -1(DST)(LEN*1)
    SUBQ  $1, LEN
    TESTQ $7, LEN
    JNZ   loop_1b

    CMPQ  LEN, $0
    JE    ret
    TESTQ $127, LEN
    JZ    aligned

not_aligned:
    TESTQ $7, LEN
    JNE   loop_1b
    MOVQ  LEN, TMP4
    ANDQ  $127, TMP4

loop_8b:
    MOVQ VECT, TMP1
    MOVQ $0, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    SUBQ $2, TMP1
    MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
    ADDQ $24, TMP2
    MOVQ (SRC)(TMP2*1), TMP3
    MOVQ -8(TMP3)(LEN*1), TMP6
    XORQ TMP6, TMP5
    SUBQ $1, TMP1
    JGE  next_vect_8b

    MOVQ TMP5, -8(DST)(LEN*1)
    SUBQ $8, LEN
    SUBQ $8, TMP4
    JG   loop_8b

    CMPQ LEN, $128
    JGE  aligned
    RET

ret:
    RET