galois_amd64.s 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164
  1. //+build !noasm !appengine
  2. // Copyright 2015, Klaus Post, see LICENSE for details.
  3. // Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
  4. // and http://jerasure.org/jerasure/gf-complete/tree/master
  // func galMulSSSE3Xor(low, high, in, out []byte)
  //
  // Nibble-table lookup with accumulate, 16 bytes per iteration (SSSE3 PSHUFB).
  // For every complete 16-byte group of in:
  //
  //     out[i] ^= low[in[i] & 0x0f] ^ high[in[i] >> 4]
  //
  // Contract, as implied by the code below:
  //   - exactly 16 bytes are read from low and from high, unconditionally,
  //     so both must hold at least 16 bytes;
  //   - only len(in)/16 whole groups are processed; the trailing
  //     len(in) % 16 bytes are left untouched;
  //   - len(out) is never consulted, so out must be at least as long as the
  //     processed part of in.
  //
  // Frame: $0-96 = no locals, four 24-byte slice headers as arguments.
  // Flags: 7 = NOPROF|DUPOK|NOSPLIT.
  //
  // Register roles inside the loop:
  //   SI = input cursor        DX = output cursor      R9 = groups remaining
  //   X6 = low-nibble table    X7 = high-nibble table  X8 = 0x0f in every byte
  TEXT ·galMulSSSE3Xor(SB), 7, $0-96
      MOVQ   low_base+0(FP), SI    // SI = &low[0]
      MOVQ   high_base+24(FP), DX  // DX = &high[0]
      MOVOU  (SI), X6              // X6 = low-nibble table (16 bytes)
      MOVOU  (DX), X7              // X7 = high-nibble table (16 bytes)
      MOVQ   $15, BX               // BX = 0x0f nibble mask
      MOVQ   BX, X8                // X8 byte 0 = 0x0f, remaining bytes zero
      PXOR   X5, X5                // X5 = all-zero shuffle control
      MOVQ   in_base+48(FP), SI    // SI = &in[0]
      MOVQ   in_len+56(FP), R9     // R9 = len(in)
      MOVQ   out_base+72(FP), DX   // DX = &out[0]
      PSHUFB X5, X8                // every lane picks byte 0: X8 = 0x0f x16
      SHRQ   $4, R9                // R9 = len(in) / 16
      TESTQ  R9, R9
      JZ     done_xor              // fewer than 16 input bytes: nothing to do

  loopback_xor:
      MOVOU  (SI), X0              // X0 = 16 input bytes
      MOVOU  (DX), X4              // X4 = current contents of out
      MOVOU  X0, X1                // X1 = copy of input for the high nibbles
      MOVOU  X6, X2                // X2 = low table (PSHUFB overwrites its dst)
      MOVOU  X7, X3                // X3 = high table (PSHUFB overwrites its dst)
      PSRLQ  $4, X1                // 64-bit shift: neighbouring bits leak in...
      PAND   X8, X0                // X0 = low nibble of each input byte
      PAND   X8, X1                // ...and are masked off: X1 = high nibbles
      PSHUFB X0, X2                // X2 = low[in & 0x0f]
      PSHUFB X1, X3                // X3 = high[in >> 4]
      PXOR   X2, X3                // X3 = table product
      PXOR   X4, X3                // X3 = product ^ existing out
      MOVOU  X3, (DX)              // store back to out
      ADDQ   $16, SI               // in  += 16
      ADDQ   $16, DX               // out += 16
      SUBQ   $1, R9
      JNZ    loopback_xor

  done_xor:
      RET
  // func galMulSSSE3(low, high, in, out []byte)
  //
  // Nibble-table lookup, 16 bytes per iteration (SSSE3 PSHUFB).
  // For every complete 16-byte group of in:
  //
  //     out[i] = low[in[i] & 0x0f] ^ high[in[i] >> 4]
  //
  // The previous contents of out are overwritten, not accumulated.
  //
  // Contract, as implied by the code below:
  //   - exactly 16 bytes are read from low and from high, unconditionally,
  //     so both must hold at least 16 bytes;
  //   - only len(in)/16 whole groups are processed; the trailing
  //     len(in) % 16 bytes are left untouched;
  //   - len(out) is never consulted, so out must be at least as long as the
  //     processed part of in.
  //
  // Frame: $0-96 = no locals, four 24-byte slice headers as arguments.
  // Flags: 7 = NOPROF|DUPOK|NOSPLIT.
  //
  // Register roles inside the loop:
  //   SI = input cursor        DX = output cursor      R9 = groups remaining
  //   X6 = low-nibble table    X7 = high-nibble table  X8 = 0x0f in every byte
  TEXT ·galMulSSSE3(SB), 7, $0-96
      MOVQ   low_base+0(FP), SI    // SI = &low[0]
      MOVQ   high_base+24(FP), DX  // DX = &high[0]
      MOVOU  (SI), X6              // X6 = low-nibble table (16 bytes)
      MOVOU  (DX), X7              // X7 = high-nibble table (16 bytes)
      MOVQ   $15, BX               // BX = 0x0f nibble mask
      MOVQ   BX, X8                // X8 byte 0 = 0x0f, remaining bytes zero
      PXOR   X5, X5                // X5 = all-zero shuffle control
      MOVQ   in_base+48(FP), SI    // SI = &in[0]
      MOVQ   in_len+56(FP), R9     // R9 = len(in)
      MOVQ   out_base+72(FP), DX   // DX = &out[0]
      PSHUFB X5, X8                // every lane picks byte 0: X8 = 0x0f x16
      SHRQ   $4, R9                // R9 = len(in) / 16
      TESTQ  R9, R9
      JZ     done                  // fewer than 16 input bytes: nothing to do

  loopback:
      MOVOU  (SI), X0              // X0 = 16 input bytes
      MOVOU  X0, X1                // X1 = copy of input for the high nibbles
      MOVOU  X6, X2                // X2 = low table (PSHUFB overwrites its dst)
      MOVOU  X7, X3                // X3 = high table (PSHUFB overwrites its dst)
      PSRLQ  $4, X1                // 64-bit shift: neighbouring bits leak in...
      PAND   X8, X0                // X0 = low nibble of each input byte
      PAND   X8, X1                // ...and are masked off: X1 = high nibbles
      PSHUFB X0, X2                // X2 = low[in & 0x0f]
      PSHUFB X1, X3                // X3 = high[in >> 4]
      PXOR   X2, X3                // X3 = table product
      MOVOU  X3, (DX)              // store to out
      ADDQ   $16, SI               // in  += 16
      ADDQ   $16, DX               // out += 16
      SUBQ   $1, R9
      JNZ    loopback

  done:
      RET
  // func galMulAVX2Xor(low, high, in, out []byte)
  //
  // Nibble-table lookup with accumulate, 32 bytes per iteration (AVX2 VPSHUFB).
  // For every complete 32-byte group of in:
  //
  //     out[i] ^= low[in[i] & 0x0f] ^ high[in[i] >> 4]
  //
  // The AVX2 instructions are emitted as raw bytes; the Intel-syntax mnemonic
  // for each encoding is given in the trailing comment.
  //
  // Contract, as implied by the code below:
  //   - exactly 16 bytes are read from low and from high, unconditionally,
  //     so both must hold at least 16 bytes;
  //   - only len(in)/32 whole groups are processed; the trailing
  //     len(in) % 32 bytes are left untouched;
  //   - len(out) is never consulted, so out must be at least as long as the
  //     processed part of in.
  //
  // Frame: $0-96 = no locals, four 24-byte slice headers as arguments.
  // Flags: 7 = NOPROF|DUPOK|NOSPLIT.
  //
  // Register roles inside the loop:
  //   SI = input cursor          DX = output cursor        R9 = groups remaining
  //   YMM6 = low-nibble table    YMM7 = high-nibble table  YMM8 = 0x0f in every byte
  TEXT ·galMulAVX2Xor(SB), 7, $0-96
      MOVQ  low_base+0(FP), SI     // SI = &low[0]
      MOVQ  high_base+24(FP), DX   // DX = &high[0]
      MOVQ  $15, BX                // BX = 0x0f nibble mask
      MOVQ  BX, X5                 // X5 byte 0 = 0x0f (broadcast source)
      MOVOU (SI), X6               // X6 = low-nibble table (16 bytes)
      MOVOU (DX), X7               // X7 = high-nibble table (16 bytes)
      MOVQ  in_len+56(FP), R9      // R9 = len(in)

      // VPSHUFB shuffles within each 128-bit lane, so each table is copied
      // into the upper lane as well.
      LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low table in both lanes
      LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high table in both lanes
      LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; YMM8 = 0x0f x32

      SHRQ  $5, R9                 // R9 = len(in) / 32
      MOVQ  out_base+72(FP), DX    // DX = &out[0]
      MOVQ  in_base+48(FP), SI     // SI = &in[0]
      TESTQ R9, R9
      JZ    done_xor_avx2          // fewer than 32 input bytes: nothing to do

  loopback_xor_avx2:
      LONG $0x066ffec5               // VMOVDQU YMM0, [rsi]        ; YMM0 = 32 input bytes
      LONG $0x226ffec5               // VMOVDQU YMM4, [rdx]        ; YMM4 = current contents of out
      LONG $0xd073f5c5; BYTE $0x04   // VPSRLQ  YMM1, YMM0, 4      ; 64-bit shift, masked below
      LONG $0xdb7dc1c4; BYTE $0xc0   // VPAND   YMM0, YMM0, YMM8   ; YMM0 = low nibbles
      LONG $0xdb75c1c4; BYTE $0xc8   // VPAND   YMM1, YMM1, YMM8   ; YMM1 = high nibbles
      LONG $0x004de2c4; BYTE $0xd0   // VPSHUFB YMM2, YMM6, YMM0   ; YMM2 = low[in & 0x0f]
      LONG $0x0045e2c4; BYTE $0xd9   // VPSHUFB YMM3, YMM7, YMM1   ; YMM3 = high[in >> 4]
      LONG $0xdbefedc5               // VPXOR   YMM3, YMM2, YMM3   ; YMM3 = table product
      LONG $0xe4efe5c5               // VPXOR   YMM4, YMM3, YMM4   ; YMM4 = product ^ existing out
      LONG $0x227ffec5               // VMOVDQU [rdx], YMM4        ; store back to out
      ADDQ $32, SI                 // in  += 32
      ADDQ $32, DX                 // out += 32
      SUBQ $1, R9
      JNZ  loopback_xor_avx2

  done_xor_avx2:
      // Clear the upper YMM halves before returning to code that may use
      // legacy SSE encodings. Also reached on the early exit, because the
      // VINSERTI128 setup above has already written the upper lanes.
      BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
      RET
  // func galMulAVX2(low, high, in, out []byte)
  //
  // Nibble-table lookup, 32 bytes per iteration (AVX2 VPSHUFB).
  // For every complete 32-byte group of in:
  //
  //     out[i] = low[in[i] & 0x0f] ^ high[in[i] >> 4]
  //
  // The previous contents of out are overwritten, not accumulated.
  // The AVX2 instructions are emitted as raw bytes; the Intel-syntax mnemonic
  // for each encoding is given in the trailing comment.
  //
  // Contract, as implied by the code below:
  //   - exactly 16 bytes are read from low and from high, unconditionally,
  //     so both must hold at least 16 bytes;
  //   - only len(in)/32 whole groups are processed; the trailing
  //     len(in) % 32 bytes are left untouched;
  //   - len(out) is never consulted, so out must be at least as long as the
  //     processed part of in.
  //
  // Frame: $0-96 = no locals, four 24-byte slice headers as arguments.
  // Flags: 7 = NOPROF|DUPOK|NOSPLIT.
  //
  // Register roles inside the loop:
  //   SI = input cursor          DX = output cursor        R9 = groups remaining
  //   YMM6 = low-nibble table    YMM7 = high-nibble table  YMM8 = 0x0f in every byte
  TEXT ·galMulAVX2(SB), 7, $0-96
      MOVQ  low_base+0(FP), SI     // SI = &low[0]
      MOVQ  high_base+24(FP), DX   // DX = &high[0]
      MOVQ  $15, BX                // BX = 0x0f nibble mask
      MOVQ  BX, X5                 // X5 byte 0 = 0x0f (broadcast source)
      MOVOU (SI), X6               // X6 = low-nibble table (16 bytes)
      MOVOU (DX), X7               // X7 = high-nibble table (16 bytes)
      MOVQ  in_len+56(FP), R9      // R9 = len(in)

      // VPSHUFB shuffles within each 128-bit lane, so each table is copied
      // into the upper lane as well.
      LONG $0x384de3c4; WORD $0x01f6 // VINSERTI128 YMM6, YMM6, XMM6, 1 ; low table in both lanes
      LONG $0x3845e3c4; WORD $0x01ff // VINSERTI128 YMM7, YMM7, XMM7, 1 ; high table in both lanes
      LONG $0x787d62c4; BYTE $0xc5   // VPBROADCASTB YMM8, XMM5         ; YMM8 = 0x0f x32

      SHRQ  $5, R9                 // R9 = len(in) / 32
      MOVQ  out_base+72(FP), DX    // DX = &out[0]
      MOVQ  in_base+48(FP), SI     // SI = &in[0]
      TESTQ R9, R9
      JZ    done_avx2              // fewer than 32 input bytes: nothing to do

  loopback_avx2:
      LONG $0x066ffec5               // VMOVDQU YMM0, [rsi]        ; YMM0 = 32 input bytes
      LONG $0xd073f5c5; BYTE $0x04   // VPSRLQ  YMM1, YMM0, 4      ; 64-bit shift, masked below
      LONG $0xdb7dc1c4; BYTE $0xc0   // VPAND   YMM0, YMM0, YMM8   ; YMM0 = low nibbles
      LONG $0xdb75c1c4; BYTE $0xc8   // VPAND   YMM1, YMM1, YMM8   ; YMM1 = high nibbles
      LONG $0x004de2c4; BYTE $0xd0   // VPSHUFB YMM2, YMM6, YMM0   ; YMM2 = low[in & 0x0f]
      LONG $0x0045e2c4; BYTE $0xd9   // VPSHUFB YMM3, YMM7, YMM1   ; YMM3 = high[in >> 4]
      LONG $0xe3efedc5               // VPXOR   YMM4, YMM2, YMM3   ; YMM4 = table product
      LONG $0x227ffec5               // VMOVDQU [rdx], YMM4        ; store to out
      ADDQ $32, SI                 // in  += 32
      ADDQ $32, DX                 // out += 32
      SUBQ $1, R9
      JNZ  loopback_avx2

  done_avx2:
      // Clear the upper YMM halves before returning to code that may use
      // legacy SSE encodings. Also reached on the early exit, because the
      // VINSERTI128 setup above has already written the upper lanes.
      BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
      RET