// sum_amd64.s
  // Copyright 2012 The Go Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.

  // +build amd64,!gccgo,!appengine
  5. #include "textflag.h"
  // POLY1305_ADD(src, acc0, acc1, acc2) adds one 16-byte block to the
  // three-limb accumulator acc0 + acc1*2^64 + acc2*2^128 and steps the
  // source pointer past that block:
  //
  //   acc0 += quadword at 0(src)
  //   acc1 += quadword at 8(src) + carry out of acc0
  //   acc2 += 1                  + carry out of acc1
  //   src  += 16
  //
  // Adding 1 to the top limb is the same as adding 2^128, i.e. the block
  // is absorbed with one extra bit set just above its 128 data bits.
  //
  // All four arguments must be registers and all four are modified. The
  // flags are modified by the ADDQ/ADCQ chain.
  #define POLY1305_ADD(src, acc0, acc1, acc2) \
      ADDQ 0(src), acc0;                      \
      ADCQ 8(src), acc1;                      \
      ADCQ $1, acc2;                          \
      LEAQ 16(src), src
  // POLY1305_MUL(acc0, acc1, acc2, key0, key1, prod0, prod1, prod2, prod3)
  //
  // Multiplies the three-limb accumulator
  //     h = acc0 + acc1*2^64 + acc2*2^128
  // by the two-limb multiplier
  //     r = key0 + key1*2^64
  // and folds the product back into acc0..acc2.
  //
  // Step 1 - multiply by key0:
  //     prod0:prod1  = key0*acc0
  //     prod1, DX   += key0*acc1        (carry pushed into DX)
  //     prod2        = key0*acc2 + DX
  // Step 2 - multiply by key1, one limb higher:
  //     prod1       += low(key1*acc0); its high half is parked in acc0,
  //                    which is free to be used as scratch at this point
  //     prod3        = key1*acc2
  //     prod2:prod3 += key1*acc1, then += the parked high half
  //   The product is now prod0 + prod1*2^64 + prod2*2^128 + prod3*2^192.
  // Step 3 - fold everything at or above bit 130 back in, multiplied by 5:
  //     acc = (prod0, prod1, prod2 & 3)
  //     acc += (prod2 & ~3, prod3)      which is 4*c
  //     acc += (prod3:prod2) >> 2       which is   c
  //   where c is the product shifted right by 130 bits.
  //
  // No conditional subtraction of the modulus happens here; the caller is
  // left with three limbs that may still need a final reduction.
  //
  // NOTE(review): IMULQ keeps only the low 64 bits of key0*acc2 and
  // key1*acc2, so this presumably relies on acc2 and the multiplier being
  // small enough that nothing is lost - the macro itself does not check.
  //
  // Reads:    key0, key1 (left unchanged)
  // Writes:   acc0, acc1, acc2
  // Clobbers: AX, DX, prod0..prod3, flags
  //
  // The instruction order is significant: each ADCQ consumes the carry of
  // the ADDQ/ADCQ immediately before it.
  #define POLY1305_MUL(acc0, acc1, acc2, key0, key1, prod0, prod1, prod2, prod3) \
      MOVQ  key0, AX;                   \
      MULQ  acc0;                       \
      MOVQ  AX, prod0;                  \
      MOVQ  DX, prod1;                  \
      MOVQ  key0, AX;                   \
      MULQ  acc1;                       \
      ADDQ  AX, prod1;                  \
      ADCQ  $0, DX;                     \
      MOVQ  key0, prod2;                \
      IMULQ acc2, prod2;                \
      ADDQ  DX, prod2;                  \
                                        \
      MOVQ  key1, AX;                   \
      MULQ  acc0;                       \
      ADDQ  AX, prod1;                  \
      ADCQ  $0, DX;                     \
      MOVQ  DX, acc0;                   \
      MOVQ  key1, prod3;                \
      IMULQ acc2, prod3;                \
      MOVQ  key1, AX;                   \
      MULQ  acc1;                       \
      ADDQ  AX, prod2;                  \
      ADCQ  DX, prod3;                  \
      ADDQ  acc0, prod2;                \
      ADCQ  $0, prod3;                  \
                                        \
      MOVQ  prod0, acc0;                \
      MOVQ  prod1, acc1;                \
      MOVQ  prod2, acc2;                \
      ANDQ  $3, acc2;                   \
      MOVQ  prod2, prod0;               \
      ANDQ  $0xFFFFFFFFFFFFFFFC, prod0; \
      ADDQ  prod0, acc0;                \
      ADCQ  prod3, acc1;                \
      ADCQ  $0, acc2;                   \
      SHRQ  $2, prod3, prod2;           \
      SHRQ  $2, prod3;                  \
      ADDQ  prod2, acc0;                \
      ADCQ  prod3, acc1;                \
      ADCQ  $0, acc2
  // poly1305Mask<> is a file-local, read-only pair of quadwords (16 bytes)
  // that ·poly1305 ANDs into the first two quadwords of the key.
  //   +0: clears the top 4 bits of each 32-bit half, and the low 2 bits of
  //       the upper half
  //   +8: clears the top 4 bits and the low 2 bits of each 32-bit half
  DATA ·poly1305Mask<>+0(SB)/8, $0x0FFFFFFC0FFFFFFF
  DATA ·poly1305Mask<>+8(SB)/8, $0x0FFFFFFC0FFFFFFC
  GLOBL ·poly1305Mask<>(SB), RODATA, $16
  // func poly1305(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
  //
  // NOTE(review): the previous header spelled the last parameter type as
  // *[32]key, presumably a typo for *[32]byte - confirm against the Go
  // declaration.
  //
  // Writes a 16-byte result to out, computed over the mlen bytes starting
  // at m with the 32 bytes at key:
  //   1. r = key[0:16] ANDed with poly1305Mask<>, h = 0.
  //   2. For every full 16-byte block: h += block (POLY1305_ADD), then
  //      h *= r (POLY1305_MUL).
  //   3. A trailing 1..15 bytes are packed little-endian into BX:CX with
  //      a single 1 byte placed just above the last message byte, added
  //      to h, and multiplied once more.
  //   4. h - (2^130 - 5) is computed; if that borrows, h is kept instead.
  //   5. key[16:32] is added to the low 128 bits (carry out discarded)
  //      and the two quadwords are stored to out.
  //
  // Frame: $0-32 (no locals, 32 bytes of arguments).
  //
  // Register roles:
  //   DI                out
  //   SI                read cursor into m
  //   R15               bytes of m still to absorb
  //   R11, R12          r0, r1
  //   R8, R9, R10       h0, h1, h2
  //   BX, CX, R13, R14  temporaries for POLY1305_MUL; BX, CX, R13 also
  //                     build the partial final block
  //   AX, DX            clobbered by POLY1305_MUL; AX:BX hold the result
  TEXT ·poly1305(SB), $0-32
      // h = 0
      XORQ R8, R8                      // h0
      XORQ R9, R9                      // h1
      XORQ R10, R10                    // h2

      // r = key[0:16] & poly1305Mask<>
      MOVQ key+24(FP), AX
      MOVQ (AX), R11
      MOVQ 8(AX), R12
      ANDQ ·poly1305Mask<>(SB), R11    // r0
      ANDQ ·poly1305Mask<>+8(SB), R12  // r1

      MOVQ out+0(FP), DI
      MOVQ m+8(FP), SI
      MOVQ mlen+16(FP), R15

      // Fewer than 16 bytes in total: skip the full-block loop.
      CMPQ R15, $16
      JB   tail

  absorb_block:
      POLY1305_ADD(SI, R8, R9, R10)

  mul_step:
      POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
      SUBQ $16, R15
      CMPQ R15, $16
      JAE  absorb_block

  tail:
      // R15 is now 0..15. Nothing left means we can finish.
      TESTQ R15, R15
      JZ    finalize

      // Build the last block in CX:BX. BX starts as 1 so that, after one
      // 8-bit shift per message byte, a 1 sits directly above the data.
      ADDQ R15, SI                     // SI = one past the last byte
      XORQ R13, R13                    // upper bits stay zero; MOVB fills the low byte
      XORQ CX, CX
      MOVQ $1, BX

  pack_tail:
      // Walk backwards so the first remaining byte ends up lowest.
      SHLQ $8, BX, CX                  // CX = CX<<8 | top byte of BX
      SHLQ $8, BX
      DECQ SI
      MOVB (SI), R13
      XORQ R13, BX                     // low byte of BX is zero after the shift
      DECQ R15                         // must stay last: JNZ tests its result
      JNZ  pack_tail

      // h += padded block. Unlike POLY1305_ADD, nothing extra is added
      // to h2 here; the marker is already inside CX:BX.
      ADDQ BX, R8
      ADCQ CX, R9
      ADCQ $0, R10

      // Pretend exactly one block is left so mul_step's SUBQ leaves
      // R15 = 0 and control falls through tail into finalize.
      MOVQ $16, R15
      JMP  mul_step

  finalize:
      // AX:BX:R10 = h - (2^130 - 5). A borrow (carry set) means h was
      // already below the modulus, so the original low limbs are restored.
      MOVQ    R8, AX
      MOVQ    R9, BX
      SUBQ    $0xFFFFFFFFFFFFFFFB, AX
      SBBQ    $0xFFFFFFFFFFFFFFFF, BX
      SBBQ    $3, R10
      CMOVQCS R8, AX
      CMOVQCS R9, BX

      // out = low 128 bits of (h + key[16:32]).
      MOVQ key+24(FP), R8
      ADDQ 16(R8), AX
      ADCQ 24(R8), BX
      MOVQ AX, (DI)
      MOVQ BX, 8(DI)
      RET