sse2_amd64.s

#include "textflag.h"

// Addresses of memory operands.
#define DST  BX
#define SRC  SI
#define SRC0 TMP4
#define SRC1 TMP5

// Loop arguments.
#define VECT CX // number of vectors
#define LEN  DX // length in bytes
#define POS  R8 // current byte position within the vectors

// Temporary registers.
#define TMP1 R9  // vector counter, or scratch
#define TMP2 R10 // offset into the vector matrix, or scratch
#define TMP3 R11 // address of a data/parity vector, or scratch
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
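
// All XOR kernels below share one shape: when LEN is a multiple of the
// wide block size (16 or 64 bytes), the "aligned" loop XORs whole blocks
// in SSE2 registers; otherwise the tail is trimmed first, byte by byte
// (loop_1b) until LEN is a multiple of 8, then 8 bytes at a time
// (loop_8b), after which control falls back into the wide loop.
//
// The matching Go declarations are assumed to live in a sibling .go
// file, e.g.:
//
//	func xorSrc0(dst, src0, src1 []byte)
//	func bytesSSE2small(dst, src0, src1 []byte, size int)
//	func matrixSSE2small(dst []byte, src [][]byte)
//	func hasSSE2() bool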
// func xorSrc0(dst, src0, src1 []byte)
TEXT ·xorSrc0(SB), NOSPLIT, $0
	MOVQ  len+32(FP), LEN // byte count = len(src0)
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $15, LEN
	JNZ   not_aligned

// LEN is a multiple of 16: XOR one 16-byte block per iteration.
aligned:
	MOVQ $0, POS

loop16b:
	MOVOU (SRC0)(POS*1), X0
	XORPD (SRC1)(POS*1), X0 // legacy SSE: the memory operand must be 16-byte aligned
	MOVOU X0, (DST)(POS*1)
	ADDQ  $16, POS
	CMPQ  LEN, POS
	JNE   loop16b
	RET

// XOR single bytes off the end until LEN is a multiple of 8.
loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $15, LEN
	JZ    aligned

// Trim the tail: bytes first if LEN is not a multiple of 8, then
// 8-byte words until LEN is a multiple of 16 again.
not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $15, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $16
	JGE  aligned
	RET

ret:
	RET
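
// xorSrc1 is identical to xorSrc0 except that the byte count comes
// from len(src1) (len+56(FP)) instead of len(src0).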
// func xorSrc1(dst, src0, src1 []byte)
TEXT ·xorSrc1(SB), NOSPLIT, $0
	MOVQ  len+56(FP), LEN // byte count = len(src1)
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $15, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop16b:
	MOVOU (SRC0)(POS*1), X0
	XORPD (SRC1)(POS*1), X0
	MOVOU X0, (DST)(POS*1)
	ADDQ  $16, POS
	CMPQ  LEN, POS
	JNE   loop16b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $15, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $15, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $16
	JGE  aligned
	RET

ret:
	RET
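
// bytesSSE2mini keeps the same 16-byte loop but takes the byte count
// from the explicit size argument; presumably tuned for very short
// buffers, where a deeper unroll would not pay off.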
// func bytesSSE2mini(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN // byte count = the size argument
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $15, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop16b:
	MOVOU (SRC0)(POS*1), X0
	XORPD (SRC1)(POS*1), X0
	// Unaligned-safe alternative to the XORPD above:
	// MOVOU (SRC1)(POS*1), X4
	// PXOR X4, X0
	MOVOU X0, (DST)(POS*1)
	ADDQ  $16, POS
	CMPQ  LEN, POS
	JNE   loop16b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $15, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $15, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $16
	JGE  aligned
	RET

ret:
	RET
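
// bytesSSE2small unrolls to 64 bytes per iteration, using unaligned
// loads (MOVOU) and PXOR throughout, with ordinary cacheable stores.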
// func bytesSSE2small(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2small(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVOU (SRC0)(POS*1), X0
	MOVOU 16(SRC0)(POS*1), X1
	MOVOU 32(SRC0)(POS*1), X2
	MOVOU 48(SRC0)(POS*1), X3
	MOVOU (SRC1)(POS*1), X4
	MOVOU 16(SRC1)(POS*1), X5
	MOVOU 32(SRC1)(POS*1), X6
	MOVOU 48(SRC1)(POS*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	MOVOU X0, (DST)(POS*1)
	MOVOU X1, 16(DST)(POS*1)
	MOVOU X2, 32(DST)(POS*1)
	MOVOU X3, 48(DST)(POS*1)
	ADDQ  $64, POS
	CMPQ  LEN, POS
	JNE   loop64b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $63, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET
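
// bytesSSE2big is bytesSSE2small with the stores replaced by
// non-temporal MOVNTDQ, which writes around the cache; useful when the
// destination is too large to be re-read from cache soon anyway.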
// func bytesSSE2big(dst, src0, src1 []byte, size int)
TEXT ·bytesSSE2big(SB), NOSPLIT, $0
	MOVQ  len+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVOU (SRC0)(POS*1), X0
	MOVOU 16(SRC0)(POS*1), X1
	MOVOU 32(SRC0)(POS*1), X2
	MOVOU 48(SRC0)(POS*1), X3
	MOVOU (SRC1)(POS*1), X4
	MOVOU 16(SRC1)(POS*1), X5
	MOVOU 32(SRC1)(POS*1), X6
	MOVOU 48(SRC1)(POS*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	// Hand-encoded non-temporal stores of X0..X3 to DST+POS+0/16/32/48.
	LONG $0xe70f4266; WORD $0x0304             // MOVNTDQ X0, (DST)(POS*1)
	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10 // MOVNTDQ X1, 16(DST)(POS*1)
	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20 // MOVNTDQ X2, 32(DST)(POS*1)
	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30 // MOVNTDQ X3, 48(DST)(POS*1)
	ADDQ $64, POS
	CMPQ LEN, POS
	JNE  loop64b
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $63, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b
	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET
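
// matrixSSE2small XORs len(src) source vectors into dst, 64 bytes per
// iteration. TMP2 indexes the []byte slice headers inside src (24 bytes
// apart on amd64); the first vector seeds X0..X3 and next_vect folds in
// the remaining ones.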
// func matrixSSE2small(dst []byte, src [][]byte)
TEXT ·matrixSSE2small(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT // number of source vectors = len(src)
	MOVQ  len+8(FP), LEN   // byte count = len(dst)
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVQ  VECT, TMP1
	SUBQ  $2, TMP1 // first vector is loaded below; JGE runs next_vect VECT-1 times
	MOVQ  $0, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4 // duplicate the base address so loads can alternate registers
	MOVOU (TMP3)(POS*1), X0
	MOVOU 16(TMP4)(POS*1), X1
	MOVOU 32(TMP3)(POS*1), X2
	MOVOU 48(TMP4)(POS*1), X3

next_vect:
	ADDQ  $24, TMP2 // advance to the next []byte header
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X4
	MOVOU 16(TMP4)(POS*1), X5
	MOVOU 32(TMP3)(POS*1), X6
	MOVOU 48(TMP4)(POS*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	SUBQ  $1, TMP1
	JGE   next_vect
	MOVOU X0, (DST)(POS*1)
	MOVOU X1, 16(DST)(POS*1)
	MOVOU X2, 32(DST)(POS*1)
	MOVOU X3, 48(DST)(POS*1)
	ADDQ  $64, POS
	CMPQ  LEN, POS
	JNE   loop64b
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVB  -1(TMP3)(LEN*1), TMP6
	XORB  TMP6, TMP5
	SUBQ  $1, TMP1
	JGE   next_vect_1b
	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $63, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b
	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b
	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET
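
// matrixSSE2big is matrixSSE2small with non-temporal stores, mirroring
// the bytesSSE2small/bytesSSE2big split.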
// func matrixSSE2big(dst []byte, src [][]byte)
TEXT ·matrixSSE2big(SB), NOSPLIT, $0
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  vec+32(FP), VECT
	MOVQ  len+8(FP), LEN
	TESTQ $63, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop64b:
	MOVQ  VECT, TMP1
	SUBQ  $2, TMP1
	MOVQ  $0, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X0
	MOVOU 16(TMP4)(POS*1), X1
	MOVOU 32(TMP3)(POS*1), X2
	MOVOU 48(TMP4)(POS*1), X3

next_vect:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVQ  TMP3, TMP4
	MOVOU (TMP3)(POS*1), X4
	MOVOU 16(TMP4)(POS*1), X5
	MOVOU 32(TMP3)(POS*1), X6
	MOVOU 48(TMP4)(POS*1), X7
	PXOR  X4, X0
	PXOR  X5, X1
	PXOR  X6, X2
	PXOR  X7, X3
	SUBQ  $1, TMP1
	JGE   next_vect
	// Non-temporal stores of X0..X3 (same encodings as in bytesSSE2big).
	LONG $0xe70f4266; WORD $0x0304             // MOVNTDQ X0, (DST)(POS*1)
	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10 // MOVNTDQ X1, 16(DST)(POS*1)
	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20 // MOVNTDQ X2, 32(DST)(POS*1)
	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30 // MOVNTDQ X3, 48(DST)(POS*1)
	ADDQ $64, POS
	CMPQ LEN, POS
	JNE  loop64b
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ  $24, TMP2
	MOVQ  (SRC)(TMP2*1), TMP3
	MOVB  -1(TMP3)(LEN*1), TMP6
	XORB  TMP6, TMP5
	SUBQ  $1, TMP1
	JGE   next_vect_1b
	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $63, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNE   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $63, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b
	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b
	CMPQ LEN, $64
	JGE  aligned
	RET

ret:
	RET
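
// hasSSE2 reports whether the CPU supports SSE2 (CPUID leaf 1, EDX bit
// 26). SSE2 is part of the amd64 baseline, so on this build target it
// should always return true.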
// func hasSSE2() bool
TEXT ·hasSSE2(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX // CPUID leaf 1: feature flags
	CPUID
	SHRQ $26, DX // SSE2 flag is EDX bit 26
	ANDQ $1, DX
	MOVB DX, ret+0(FP)
	RET