- // Reference: J. S. Plank, K. M. Greenan, E. L. Miller, "Screaming Fast
- // Galois Field Arithmetic Using Intel SIMD Instructions" (FAST '13):
- // www.ssrc.ucsc.edu/Papers/plank-fast13.pdf
- #include "textflag.h"
- #define low_tbl Y0
- #define high_tbl Y1
- #define mask Y2
- #define in0 Y3
- #define in1 Y4
- #define in2 Y5
- #define in3 Y6
- #define in4 Y7
- #define in5 Y8
- #define in0_h Y10
- #define in1_h Y11
- #define in2_h Y12
- #define in3_h Y13
- #define in4_h Y14
- #define in5_h Y15
- #define in BX
- #define out DI
- #define len R8
- #define pos R9
- #define tmp0 R10
- #define low_tblx X0
- #define high_tblx X1
- #define maskx X2
- #define in0x X3
- #define in0_hx X10
- #define tmp0x X9
- #define tmp1x X11
- #define tmp2x X12
- #define tmp3x X13
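- // GF(2^8) multiplication via two 4-bit table lookups: tbl holds a 16-byte
- // low-nibble table followed by a 16-byte high-nibble table, and each product
- // byte is low_tbl[b&0x0f] XOR high_tbl[b>>4]. PSHUFB/VPSHUFB performs 16 or
- // 32 such lookups per instruction. A rough pure-Go equivalent of mulVect
- // (an illustrative sketch only; mulVectGo is not part of this file):
- //
- //	func mulVectGo(tbl, d, p []byte) {
- //		lo, hi := tbl[:16], tbl[16:32]
- //		for i, b := range d {
- //			p[i] = lo[b&0x0f] ^ hi[b>>4]
- //		}
- //	}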
- // func mulVectAVX2(tbl, d, p []byte)
- TEXT ·mulVectAVX2(SB), NOSPLIT, $0
- MOVQ d+24(FP), in
- MOVQ p+48(FP), out
- MOVQ tbl+0(FP), tmp0
- VMOVDQU (tmp0), low_tblx
- VMOVDQU 16(tmp0), high_tblx
- MOVB $0x0f, DX
- LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
- VPBROADCASTB maskx, maskx
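- // DX holds 0x0f; the encoded VPINSRB places it in byte 0 of X2, and
- // VPBROADCASTB replicates it, leaving the nibble mask 0x0f in every byte.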
- MOVQ d_len+32(FP), len
- TESTQ $31, len
- JNZ one16b
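- // Dispatch: if len is not a multiple of 32, one16b first strips a 16-byte
- // tail with XMM registers (len is assumed to be a multiple of 16). The
- // remaining length is then handled with YMM registers: 256-byte chunks in
- // loop256b and the residue modulo 256 in loop32b.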
- ymm:
- VINSERTI128 $1, low_tblx, low_tbl, low_tbl
- VINSERTI128 $1, high_tblx, high_tbl, high_tbl
- VINSERTI128 $1, maskx, mask, mask
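- // The VINSERTI128s duplicate the 16-byte tables and mask into both 128-bit
- // lanes, so each VPSHUFB performs two independent 16-byte lookups per YMM.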
- TESTQ $255, len
- JNZ not_aligned
- // Aligned path: 256 bytes (8 × 32-byte blocks) per loop iteration.
- aligned:
- MOVQ $0, pos
- loop256b:
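- // Each 32-byte block follows the same pattern: load, split into nibbles
- // (VPSRLQ $4 plus VPAND with the mask), look up both tables with VPSHUFB,
- // XOR the two halves, and store.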
- VMOVDQU (in)(pos*1), in0
- VPSRLQ $4, in0, in0_h
- VPAND mask, in0_h, in0_h
- VPAND mask, in0, in0
- VPSHUFB in0_h, high_tbl, in0_h
- VPSHUFB in0, low_tbl, in0
- VPXOR in0, in0_h, in0
- VMOVDQU in0, (out)(pos*1)
- VMOVDQU 32(in)(pos*1), in1
- VPSRLQ $4, in1, in1_h
- VPAND mask, in1_h, in1_h
- VPAND mask, in1, in1
- VPSHUFB in1_h, high_tbl, in1_h
- VPSHUFB in1, low_tbl, in1
- VPXOR in1, in1_h, in1
- VMOVDQU in1, 32(out)(pos*1)
- VMOVDQU 64(in)(pos*1), in2
- VPSRLQ $4, in2, in2_h
- VPAND mask, in2_h, in2_h
- VPAND mask, in2, in2
- VPSHUFB in2_h, high_tbl, in2_h
- VPSHUFB in2, low_tbl, in2
- VPXOR in2, in2_h, in2
- VMOVDQU in2, 64(out)(pos*1)
- VMOVDQU 96(in)(pos*1), in3
- VPSRLQ $4, in3, in3_h
- VPAND mask, in3_h, in3_h
- VPAND mask, in3, in3
- VPSHUFB in3_h, high_tbl, in3_h
- VPSHUFB in3, low_tbl, in3
- VPXOR in3, in3_h, in3
- VMOVDQU in3, 96(out)(pos*1)
- VMOVDQU 128(in)(pos*1), in4
- VPSRLQ $4, in4, in4_h
- VPAND mask, in4_h, in4_h
- VPAND mask, in4, in4
- VPSHUFB in4_h, high_tbl, in4_h
- VPSHUFB in4, low_tbl, in4
- VPXOR in4, in4_h, in4
- VMOVDQU in4, 128(out)(pos*1)
- VMOVDQU 160(in)(pos*1), in5
- VPSRLQ $4, in5, in5_h
- VPAND mask, in5_h, in5_h
- VPAND mask, in5, in5
- VPSHUFB in5_h, high_tbl, in5_h
- VPSHUFB in5, low_tbl, in5
- VPXOR in5, in5_h, in5
- VMOVDQU in5, 160(out)(pos*1)
- VMOVDQU 192(in)(pos*1), in0
- VPSRLQ $4, in0, in0_h
- VPAND mask, in0_h, in0_h
- VPAND mask, in0, in0
- VPSHUFB in0_h, high_tbl, in0_h
- VPSHUFB in0, low_tbl, in0
- VPXOR in0, in0_h, in0
- VMOVDQU in0, 192(out)(pos*1)
- VMOVDQU 224(in)(pos*1), in1
- VPSRLQ $4, in1, in1_h
- VPAND mask, in1_h, in1_h
- VPAND mask, in1, in1
- VPSHUFB in1_h, high_tbl, in1_h
- VPSHUFB in1, low_tbl, in1
- VPXOR in1, in1_h, in1
- VMOVDQU in1, 224(out)(pos*1)
- ADDQ $256, pos
- CMPQ len, pos
- JNE loop256b
- VZEROUPPER
- RET
- not_aligned:
- MOVQ len, tmp0
- ANDQ $255, tmp0
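- // tmp0 = len mod 256; consume the unaligned tail 32 bytes at a time from
- // the end, then fall through to the 256-byte loop if at least 256 bytes
- // remain.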
- loop32b:
- VMOVDQU -32(in)(len*1), in0
- VPSRLQ $4, in0, in0_h
- VPAND mask, in0_h, in0_h
- VPAND mask, in0, in0
- VPSHUFB in0_h, high_tbl, in0_h
- VPSHUFB in0, low_tbl, in0
- VPXOR in0, in0_h, in0
- VMOVDQU in0, -32(out)(len*1)
- SUBQ $32, len
- SUBQ $32, tmp0
- JG loop32b
- CMPQ len, $256
- JGE aligned
- VZEROUPPER
- RET
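- // Handle a single trailing 16-byte block with XMM registers; afterwards the
- // remaining length is a multiple of 32 (assuming len is a multiple of 16),
- // so re-enter the YMM path unless nothing is left.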
- one16b:
- VMOVDQU -16(in)(len*1), in0x
- VPSRLQ $4, in0x, in0_hx
- VPAND maskx, in0x, in0x
- VPAND maskx, in0_hx, in0_hx
- VPSHUFB in0_hx, high_tblx, in0_hx
- VPSHUFB in0x, low_tblx, in0x
- VPXOR in0x, in0_hx, in0x
- VMOVDQU in0x, -16(out)(len*1)
- SUBQ $16, len
- CMPQ len, $0
- JNE ymm
- RET
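- // mulVectAddAVX2 mirrors mulVectAVX2, except that each product is also
- // XORed with the bytes already at the destination (p ^= tbl·d rather than
- // p = tbl·d), i.e. the accumulating step of the matrix multiply.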
- // func mulVectAddAVX2(tbl, d, p []byte)
- TEXT ·mulVectAddAVX2(SB), NOSPLIT, $0
- MOVQ d+24(FP), in
- MOVQ p+48(FP), out
- MOVQ tbl+0(FP), tmp0
- VMOVDQU (tmp0), low_tblx
- VMOVDQU 16(tmp0), high_tblx
- MOVB $0x0f, DX
- LONG $0x2069e3c4; WORD $0x00d2 // VPINSRB $0x00, EDX, XMM2, XMM2
- VPBROADCASTB maskx, maskx
- MOVQ d_len+32(FP), len
- TESTQ $31, len
- JNZ one16b
- ymm:
- VINSERTI128 $1, low_tblx, low_tbl, low_tbl
- VINSERTI128 $1, high_tblx, high_tbl, high_tbl
- VINSERTI128 $1, maskx, mask, mask
- TESTQ $255, len
- JNZ not_aligned
- aligned:
- MOVQ $0, pos
- loop256b:
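- // Same 8 × 32-byte pattern as mulVectAVX2, with one extra VPXOR against the
- // current destination bytes before each store.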
- VMOVDQU (in)(pos*1), in0
- VPSRLQ $4, in0, in0_h
- VPAND mask, in0_h, in0_h
- VPAND mask, in0, in0
- VPSHUFB in0_h, high_tbl, in0_h
- VPSHUFB in0, low_tbl, in0
- VPXOR in0, in0_h, in0
- VPXOR (out)(pos*1), in0, in0
- VMOVDQU in0, (out)(pos*1)
- VMOVDQU 32(in)(pos*1), in1
- VPSRLQ $4, in1, in1_h
- VPAND mask, in1_h, in1_h
- VPAND mask, in1, in1
- VPSHUFB in1_h, high_tbl, in1_h
- VPSHUFB in1, low_tbl, in1
- VPXOR in1, in1_h, in1
- VPXOR 32(out)(pos*1), in1, in1
- VMOVDQU in1, 32(out)(pos*1)
- VMOVDQU 64(in)(pos*1), in2
- VPSRLQ $4, in2, in2_h
- VPAND mask, in2_h, in2_h
- VPAND mask, in2, in2
- VPSHUFB in2_h, high_tbl, in2_h
- VPSHUFB in2, low_tbl, in2
- VPXOR in2, in2_h, in2
- VPXOR 64(out)(pos*1), in2, in2
- VMOVDQU in2, 64(out)(pos*1)
- VMOVDQU 96(in)(pos*1), in3
- VPSRLQ $4, in3, in3_h
- VPAND mask, in3_h, in3_h
- VPAND mask, in3, in3
- VPSHUFB in3_h, high_tbl, in3_h
- VPSHUFB in3, low_tbl, in3
- VPXOR in3, in3_h, in3
- VPXOR 96(out)(pos*1), in3, in3
- VMOVDQU in3, 96(out)(pos*1)
- VMOVDQU 128(in)(pos*1), in4
- VPSRLQ $4, in4, in4_h
- VPAND mask, in4_h, in4_h
- VPAND mask, in4, in4
- VPSHUFB in4_h, high_tbl, in4_h
- VPSHUFB in4, low_tbl, in4
- VPXOR in4, in4_h, in4
- VPXOR 128(out)(pos*1), in4, in4
- VMOVDQU in4, 128(out)(pos*1)
- VMOVDQU 160(in)(pos*1), in5
- VPSRLQ $4, in5, in5_h
- VPAND mask, in5_h, in5_h
- VPAND mask, in5, in5
- VPSHUFB in5_h, high_tbl, in5_h
- VPSHUFB in5, low_tbl, in5
- VPXOR in5, in5_h, in5
- VPXOR 160(out)(pos*1), in5, in5
- VMOVDQU in5, 160(out)(pos*1)
- VMOVDQU 192(in)(pos*1), in0
- VPSRLQ $4, in0, in0_h
- VPAND mask, in0_h, in0_h
- VPAND mask, in0, in0
- VPSHUFB in0_h, high_tbl, in0_h
- VPSHUFB in0, low_tbl, in0
- VPXOR in0, in0_h, in0
- VPXOR 192(out)(pos*1), in0, in0
- VMOVDQU in0, 192(out)(pos*1)
- VMOVDQU 224(in)(pos*1), in1
- VPSRLQ $4, in1, in1_h
- VPAND mask, in1_h, in1_h
- VPAND mask, in1, in1
- VPSHUFB in1_h, high_tbl, in1_h
- VPSHUFB in1, low_tbl, in1
- VPXOR in1, in1_h, in1
- VPXOR 224(out)(pos*1), in1, in1
- VMOVDQU in1, 224(out)(pos*1)
- ADDQ $256, pos
- CMPQ len, pos
- JNE loop256b
- VZEROUPPER
- RET
- not_aligned:
- MOVQ len, tmp0
- ANDQ $255, tmp0
- loop32b:
- VMOVDQU -32(in)(len*1), in0
- VPSRLQ $4, in0, in0_h
- VPAND mask, in0_h, in0_h
- VPAND mask, in0, in0
- VPSHUFB in0_h, high_tbl, in0_h
- VPSHUFB in0, low_tbl, in0
- VPXOR in0, in0_h, in0
- VPXOR -32(out)(len*1), in0, in0
- VMOVDQU in0, -32(out)(len*1)
- SUBQ $32, len
- SUBQ $32, tmp0
- JG loop32b
- CMPQ len, $256
- JGE aligned
- VZEROUPPER
- RET
- one16b:
- VMOVDQU -16(in)(len*1), in0x
- VPSRLQ $4, in0x, in0_hx
- VPAND maskx, in0x, in0x
- VPAND maskx, in0_hx, in0_hx
- VPSHUFB in0_hx, high_tblx, in0_hx
- VPSHUFB in0x, low_tblx, in0x
- VPXOR in0x, in0_hx, in0x
- VPXOR -16(out)(len*1), in0x, in0x
- VMOVDQU in0x, -16(out)(len*1)
- SUBQ $16, len
- CMPQ len, $0
- JNE ymm
- RET
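- // mulVectSSSE3 is the fallback for CPUs without AVX2: 16 bytes per
- // iteration in XMM registers. SHRQ $4 turns len into a block count and
- // discards any remainder, so len is assumed to be a multiple of 16.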
- // func mulVectSSSE3(tbl, d, p []byte)
- TEXT ·mulVectSSSE3(SB), NOSPLIT, $0
- MOVQ d+24(FP), in
- MOVQ p+48(FP), out
- MOVQ tbl+0(FP), tmp0
- MOVOU (tmp0), low_tblx
- MOVOU 16(tmp0), high_tblx
- MOVB $15, tmp0
- MOVQ tmp0, maskx
- PXOR tmp0x, tmp0x
- PSHUFB tmp0x, maskx
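- // PSHUFB with an all-zero index register replicates byte 0, broadcasting
- // 0x0f across maskx (SSSE3 has no VPBROADCASTB).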
- MOVQ d_len+32(FP), len
- SHRQ $4, len
- loop:
- MOVOU (in), in0x
- MOVOU in0x, in0_hx
- PSRLQ $4, in0_hx
- PAND maskx, in0x
- PAND maskx, in0_hx
- MOVOU low_tblx, tmp1x
- MOVOU high_tblx, tmp2x
- PSHUFB in0x, tmp1x
- PSHUFB in0_hx, tmp2x
- PXOR tmp1x, tmp2x
- MOVOU tmp2x, (out)
- ADDQ $16, in
- ADDQ $16, out
- SUBQ $1, len
- JNZ loop
- RET
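- // mulVectAddSSSE3: as mulVectSSSE3, plus an XOR with the existing
- // destination bytes (the accumulating variant).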
- // func mulVectAddSSSE3(tbl, d, p []byte)
- TEXT ·mulVectAddSSSE3(SB), NOSPLIT, $0
- MOVQ d+24(FP), in
- MOVQ p+48(FP), out
- MOVQ tbl+0(FP), tmp0
- MOVOU (tmp0), low_tblx
- MOVOU 16(tmp0), high_tblx
- MOVB $15, tmp0
- MOVQ tmp0, maskx
- PXOR tmp0x, tmp0x
- PSHUFB tmp0x, maskx
- MOVQ d_len+32(FP), len
- SHRQ $4, len
- loop:
- MOVOU (in), in0x
- MOVOU in0x, in0_hx
- PSRLQ $4, in0_hx
- PAND maskx, in0x
- PAND maskx, in0_hx
- MOVOU low_tblx, tmp1x
- MOVOU high_tblx, tmp2x
- PSHUFB in0x, tmp1x
- PSHUFB in0_hx, tmp2x
- PXOR tmp1x, tmp2x
- MOVOU (out), tmp3x
- PXOR tmp3x, tmp2x
- MOVOU tmp2x, (out)
- ADDQ $16, in
- ADDQ $16, out
- SUBQ $1, len
- JNZ loop
- RET
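- // copy32B copies exactly 32 bytes using two unaligned 16-byte moves; the
- // caller is assumed to guarantee that both slices hold at least 32 bytes,
- // as there are no bounds checks here.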
- // func copy32B(dst, src []byte)
- TEXT ·copy32B(SB), NOSPLIT, $0
- MOVQ dst+0(FP), SI
- MOVQ src+24(FP), DX
- MOVOU (DX), X0
- MOVOU 16(DX), X1
- MOVOU X0, (SI)
- MOVOU X1, 16(SI)
- RET
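- // Go-side declarations that would accompany this file (a sketch, assuming a
- // separate .go file in the same package; names match the TEXT symbols):
- //
- //	//go:noescape
- //	func mulVectAVX2(tbl, d, p []byte)
- //
- //	//go:noescape
- //	func mulVectAddAVX2(tbl, d, p []byte)
- //
- //	//go:noescape
- //	func mulVectSSSE3(tbl, d, p []byte)
- //
- //	//go:noescape
- //	func mulVectAddSSSE3(tbl, d, p []byte)
- //
- //	//go:noescape
- //	func copy32B(dst, src []byte)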
-