#include "textflag.h"

// Memory addresses.
#define DST BX
#define SRC SI
#define SRC0 TMP4 // aliases TMP4; only the bytes* routines use SRC0/SRC1
#define SRC1 TMP5 // aliases TMP5

// Loop arguments.
#define VECT CX // number of vectors (rows) in src
#define LEN DX  // number of bytes to process

// Current byte offset into the vectors.
#define POS R8

// Temporaries.
#define TMP1 R9  // vector counter, tail counter, or byte scratch
#define TMP2 R10 // offset into the slice-header array, or scratch
#define TMP3 R11 // address of a data/parity row, or scratch
#define TMP4 R12
#define TMP5 R13
#define TMP6 R14
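
// All routines below share the same tail-handling strategy: when LEN is not
// a multiple of the SIMD block size, bytes are XORed from the end of the
// buffers one at a time until LEN is a multiple of 8, then eight at a time
// until it is a multiple of the block size, after which the remaining bulk
// is processed with full-width AVX2 loads, VPXORs and stores.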
// func bytesAVX2mini(dst, src0, src1 []byte, size int)
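//
// A minimal Go sketch of what the three bytesAVX2* routines compute,
// assuming dst, src0 and src1 each hold at least size bytes:
//
//	for i := 0; i < size; i++ {
//		dst[i] = src0[i] ^ src1[i]
//	}
//
// They differ only in block width and store type, as noted below.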
TEXT ·bytesAVX2mini(SB), NOSPLIT, $0-80
	MOVQ  size+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $31, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop32b:
	VMOVDQU (SRC0)(POS*1), Y0
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VMOVDQU Y0, (DST)(POS*1)
	ADDQ    $32, POS
	CMPQ    LEN, POS
	JNE     loop32b
	VZEROUPPER
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $31, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNZ   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $31, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $32
	JGE  aligned
	RET

ret:
	RET
// func bytesAVX2small(dst, src0, src1 []byte, size int)
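//
// bytesAVX2small is bytesAVX2mini widened to 128-byte blocks (four YMM
// registers per iteration) for higher throughput on mid-sized buffers.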
TEXT ·bytesAVX2small(SB), NOSPLIT, $0-80
	MOVQ  size+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)
	ADDQ    $128, POS
	CMPQ    LEN, POS
	JNE     loop128b
	VZEROUPPER
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNZ   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET
// func bytesAVX2big(dst, src0, src1 []byte, size int)
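//
// bytesAVX2big is bytesAVX2small with the block stores replaced by
// non-temporal VMOVNTDQ (byte-encoded below), so destinations much larger
// than the cache are written without polluting it. Non-temporal stores are
// weakly ordered, hence the SFENCE before returning.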
TEXT ·bytesAVX2big(SB), NOSPLIT, $0-80
	MOVQ  size+72(FP), LEN
	CMPQ  LEN, $0
	JE    ret
	MOVQ  dst+0(FP), DST
	MOVQ  src0+24(FP), SRC0
	MOVQ  src1+48(FP), SRC1
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	VMOVDQU (SRC0)(POS*1), Y0
	VMOVDQU 32(SRC0)(POS*1), Y1
	VMOVDQU 64(SRC0)(POS*1), Y2
	VMOVDQU 96(SRC0)(POS*1), Y3
	VPXOR   (SRC1)(POS*1), Y0, Y0
	VPXOR   32(SRC1)(POS*1), Y1, Y1
	VPXOR   64(SRC1)(POS*1), Y2, Y2
	VPXOR   96(SRC1)(POS*1), Y3, Y3
	LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ Y0, (DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 // VMOVNTDQ Y1, 32(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 // VMOVNTDQ Y2, 64(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 // VMOVNTDQ Y3, 96(DST)(POS*1)
	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE                                     // order the non-temporal stores
	VZEROUPPER
	RET

loop_1b:
	MOVB  -1(SRC0)(LEN*1), TMP1
	MOVB  -1(SRC1)(LEN*1), TMP2
	XORB  TMP1, TMP2
	MOVB  TMP2, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNZ   loop_1b
	MOVQ  LEN, TMP1
	ANDQ  $127, TMP1

loop_8b:
	MOVQ -8(SRC0)(LEN*1), TMP2
	MOVQ -8(SRC1)(LEN*1), TMP3
	XORQ TMP2, TMP3
	MOVQ TMP3, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP1
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET
// func matrixAVX2small(dst []byte, src [][]byte)
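//
// A minimal Go sketch of what the two matrixAVX2* routines compute,
// assuming every row of src holds at least len(dst) bytes:
//
//	copy(dst, src[0])
//	for _, s := range src[1:] {
//		for i := range dst {
//			dst[i] ^= s[i]
//		}
//	}
//
// The rows are visited by stepping TMP2 through src 24 bytes (one slice
// header) at a time and loading each header's data pointer.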
TEXT ·matrixAVX2small(SB), NOSPLIT, $0-48
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  src_len+32(FP), VECT
	MOVQ  dst_len+8(FP), LEN
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	MOVQ    VECT, TMP1
	SUBQ    $2, TMP1                // next_vect runs VECT-1 times
	MOVQ    $0, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3     // data pointer of src[0]
	MOVQ    TMP3, TMP4              // second copy of the row pointer
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ    $24, TMP2               // advance to the next slice header
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, TMP1
	JGE     next_vect

	VMOVDQU Y0, (DST)(POS*1)
	VMOVDQU Y1, 32(DST)(POS*1)
	VMOVDQU Y2, 64(DST)(POS*1)
	VMOVDQU Y3, 96(DST)(POS*1)
	ADDQ    $128, POS
	CMPQ    LEN, POS
	JNE     loop128b
	VZEROUPPER
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNZ   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET
// func matrixAVX2big(dst []byte, src [][]byte)
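//
// matrixAVX2big is matrixAVX2small with the 128-byte block stores replaced
// by non-temporal VMOVNTDQ (byte-encoded below); see bytesAVX2big.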
TEXT ·matrixAVX2big(SB), NOSPLIT, $0-48
	MOVQ  dst+0(FP), DST
	MOVQ  src+24(FP), SRC
	MOVQ  src_len+32(FP), VECT
	MOVQ  dst_len+8(FP), LEN
	TESTQ $127, LEN
	JNZ   not_aligned

aligned:
	MOVQ $0, POS

loop128b:
	MOVQ    VECT, TMP1
	SUBQ    $2, TMP1                // next_vect runs VECT-1 times
	MOVQ    $0, TMP2
	MOVQ    (SRC)(TMP2*1), TMP3     // data pointer of src[0]
	MOVQ    TMP3, TMP4              // second copy of the row pointer
	VMOVDQU (TMP3)(POS*1), Y0
	VMOVDQU 32(TMP4)(POS*1), Y1
	VMOVDQU 64(TMP3)(POS*1), Y2
	VMOVDQU 96(TMP4)(POS*1), Y3

next_vect:
	ADDQ    $24, TMP2               // advance to the next slice header
	MOVQ    (SRC)(TMP2*1), TMP3
	MOVQ    TMP3, TMP4
	VMOVDQU (TMP3)(POS*1), Y4
	VMOVDQU 32(TMP4)(POS*1), Y5
	VMOVDQU 64(TMP3)(POS*1), Y6
	VMOVDQU 96(TMP4)(POS*1), Y7
	VPXOR   Y4, Y0, Y0
	VPXOR   Y5, Y1, Y1
	VPXOR   Y6, Y2, Y2
	VPXOR   Y7, Y3, Y3
	SUBQ    $1, TMP1
	JGE     next_vect

	LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ Y0, (DST)(POS*1); assembled natively since Go 1.8
	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 // VMOVNTDQ Y1, 32(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 // VMOVNTDQ Y2, 64(DST)(POS*1)
	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 // VMOVNTDQ Y3, 96(DST)(POS*1)
	ADDQ $128, POS
	CMPQ LEN, POS
	JNE  loop128b
	SFENCE                                     // order the non-temporal stores (matches bytesAVX2big)
	VZEROUPPER
	RET

loop_1b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVB -1(TMP3)(LEN*1), TMP5

next_vect_1b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVB -1(TMP3)(LEN*1), TMP6
	XORB TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_1b

	MOVB  TMP5, -1(DST)(LEN*1)
	SUBQ  $1, LEN
	TESTQ $7, LEN
	JNZ   loop_1b
	CMPQ  LEN, $0
	JE    ret
	TESTQ $127, LEN
	JZ    aligned

not_aligned:
	TESTQ $7, LEN
	JNZ   loop_1b
	MOVQ  LEN, TMP4
	ANDQ  $127, TMP4

loop_8b:
	MOVQ VECT, TMP1
	MOVQ $0, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	SUBQ $2, TMP1
	MOVQ -8(TMP3)(LEN*1), TMP5

next_vect_8b:
	ADDQ $24, TMP2
	MOVQ (SRC)(TMP2*1), TMP3
	MOVQ -8(TMP3)(LEN*1), TMP6
	XORQ TMP6, TMP5
	SUBQ $1, TMP1
	JGE  next_vect_8b

	MOVQ TMP5, -8(DST)(LEN*1)
	SUBQ $8, LEN
	SUBQ $8, TMP4
	JG   loop_8b

	CMPQ LEN, $128
	JGE  aligned
	RET

ret:
	RET