// File: fp2_1271_AVX2.S
//***********************************************************************************
// FourQlib: a high-performance crypto library based on the elliptic curve FourQ
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Abstract: arithmetic over GF(p^2) using x64 assembly for Linux with AVX2 support
//***********************************************************************************
#include "consts.s"
.intel_syntax noprefix
// Registers that are used for parameter passing:
#define reg_p1 rdi
#define reg_p2 rsi
#define reg_p3 rdx
#define reg_p4 rcx
.text
//**************************************************************************
//  Quadratic extension field multiplication using lazy reduction
//  Based on schoolbook method
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] in GF(p^2), p = 2^127-1
//
//  Each operand is read/written as four 64-bit words: words 0-1 hold the
//  first coordinate (x0), words 2-3 hold the second coordinate (x1).
//  The routine computes
//        c0 = a0*b0 - a1*b1
//        c1 = a0*b1 + a1*b0
//  and folds each 256-bit intermediate modulo p using 2^127 = 1 (mod p).
//  Assumes every input coordinate has bit 127 clear -- TODO confirm
//  against callers.
//
//  Aliasing: c may alias a (in-place a = a*b). c must NOT alias b, because
//  c0 is stored while b0 is still being read.
//
//  Registers: rcx holds the output pointer, since rdx (reg_p3) is the
//  implicit source operand of mulx. r12-r15 are callee-saved and are
//  preserved on the stack (32 bytes). rax, rcx, rdx, r8-r11 and the flags
//  are clobbered.
//
//  Scheduling note: mov, mulx, push and pop leave the flags untouched, so
//  the add/adc and sub/sbb carry chains below stay valid even though they
//  are interleaved with those instructions. Do not reorder flag-writing
//  instructions across a chain.
//**************************************************************************
.global fp2mul1271_a
fp2mul1271_a:
    mov    rcx, reg_p3                  // rcx = c, frees rdx for mulx

    // T0 = a0 * b0, (r11, r10, r9, r8) <- [reg_p1_0-8] * [reg_p2_0-8]
    mov    rdx, [reg_p2]
    mulx   r9, r8, [reg_p1]
    mulx   rax, r10, [reg_p1+8]
    push   r15
    push   r14
    add    r9, r10
    mov    rdx, [reg_p2+8]
    mulx   r11, r10, [reg_p1+8]
    push   r13
    adc    r10, rax
    push   r12
    mulx   rax, rdx, [reg_p1]
    adc    r11, 0
    add    r9, rdx                      // carry is consumed by the adc r10 below

    // T1 = a1 * b1, (r15, r14, r13, r12) <- [reg_p1_16-24] * [reg_p2_16-24]
    mov    rdx, [reg_p2+16]
    mulx   r13, r12, [reg_p1+16]
    adc    r10, rax                     // completes the T0 accumulation
    mulx   rax, r14, [reg_p1+24]
    adc    r11, 0
    mov    rdx, [reg_p2+24]
    add    r13, r14
    mulx   r15, r14, [reg_p1+24]
    adc    r14, rax
    adc    r15, 0
    mulx   rax, rdx, [reg_p1+16]
    add    r13, rdx
    adc    r14, rax
    adc    r15, 0

    // c0 = T0 - T1 = a0*b0 - a1*b1 (256-bit, may be negative)
    sub    r8, r12
    sbb    r9, r13
    sbb    r10, r14
    sbb    r11, r15

    // Split at bit 127: (r11, r10) <- bits 127 and up, (r9, r8) <- bits 0..126
    shld   r11, r10, 1
    shld   r10, r9, 1
    mov    rdx, [reg_p2+16]
    btr    r9, 63

    // T0 = a0 * b1, (r15, r14, r13, r12) <- [reg_p1_0-8] * [reg_p2_16-24]
    mulx   r13, r12, [reg_p1]
    btr    r11, 63                      // Add prime if borrow=1
    sbb    r10, 0
    sbb    r11, 0
    mulx   rax, r14, [reg_p1+8]
    add    r13, r14
    mov    rdx, [reg_p2+24]
    mulx   r15, r14, [reg_p1+8]
    adc    r14, rax
    adc    r15, 0
    mulx   rax, rdx, [reg_p1]
    add    r13, rdx
    adc    r14, rax
    adc    r15, 0

    // Reducing c0: add the high and low halves, then fold bit 127 back in
    add    r10, r8
    adc    r11, r9
    btr    r11, 63
    adc    r10, 0
    adc    r11, 0

    // T1 = a1 * b0, (r11, r10, r9, r8) <- [reg_p1_16-24] * [reg_p2_0-8]
    // The two c0 stores are interleaved here; from this point on only a1 and
    // the upper word of b0 are still read, which is why c may alias a but not b.
    mov    rdx, [reg_p2]
    mulx   r9, r8, [reg_p1+16]
    mov    [rcx], r10
    mulx   rax, r10, [reg_p1+24]
    mov    [rcx+8], r11
    add    r9, r10
    mov    rdx, [reg_p2+8]
    mulx   r11, r10, [reg_p1+24]
    adc    r10, rax
    adc    r11, 0
    mulx   rax, rdx, [reg_p1+16]
    add    r9, rdx
    adc    r10, rax
    adc    r11, 0

    // c1 = T0 + T1 = a0*b1 + a1*b0 (callee-saved registers restored as they free up)
    add    r8, r12
    pop    r12
    adc    r9, r13
    pop    r13
    adc    r10, r14
    pop    r14
    adc    r11, r15

    // Reducing and storing c1
    shld   r11, r10, 1
    shld   r10, r9, 1
    btr    r9, 63
    btr    r11, 63                      // CF <- bit 254 of the sum, added back in below
    adc    r8, r10
    adc    r9, r11
    btr    r9, 63
    pop    r15
    adc    r8, 0
    adc    r9, 0
    mov    [rcx+16], r8
    mov    [rcx+24], r9
    ret
//***********************************************************************
//  Quadratic extension field squaring
//  Operation: c [reg_p2] = a^2 [reg_p1] in GF(p^2), p = 2^127-1
//  NOTE: a=c is not allowed for fp2sqr1271_a(a, c)
//
//  Each operand is four 64-bit words: words 0-1 = first coordinate (x0),
//  words 2-3 = second coordinate (x1). The routine computes
//        c0 = (a0 + a1) * (a0 - a1)
//        c1 = (2*a0) * a1
//  and folds each 256-bit product modulo p using 2^127 = 1 (mod p).
//  Assumes both input coordinates have bit 127 clear -- TODO confirm
//  against callers.
//
//  Registers: r12-r14 are callee-saved and are preserved on the stack
//  (24 bytes). rax, rcx, rdx, r8-r11 and the flags are clobbered.
//  mov, mulx, push and pop do not modify flags, so they may sit inside a
//  carry chain; add/adc/sub/sbb/btr/shld may not.
//***********************************************************************
.global fp2sqr1271_a
fp2sqr1271_a:
    // t1 = (r11, r10) = a0 - a1 mod p, (rcx, r14) <- a1
    mov    r10, [reg_p1]
    push   r14
    mov    r14, [reg_p1+16]
    sub    r10, r14
    mov    r11, [reg_p1+8]
    mov    rcx, [reg_p1+24]
    sbb    r11, rcx
    push   r13
    btr    r11, 63                      // CF = 1 when a0 < a1; correction subtracts 1 (adds p)
    push   r12
    sbb    r10, 0
    // The borrow out of r10 must reach r11 before any other flag-writing
    // instruction runs. The add/adc pair that forms t0 below overwrites CF,
    // so this sbb cannot be scheduled after it.
    sbb    r11, 0

    // t0 = (r9, r8) = a0 + a1 (left unreduced, at most 128 bits)
    mov    rdx, r10
    mov    r8, [reg_p1]
    add    r8, r14
    mov    r9, [reg_p1+8]
    adc    r9, rcx

    // c0 = t0 * t1 = (a0 + a1)*(a0 - a1), (rcx, r14, r13, r12) <- (r9, r8) * (r11, r10)
    mulx   r13, r12, r8
    mulx   rax, r14, r9
    mov    rdx, r11
    add    r13, r14
    mulx   rcx, r14, r9
    mov    r9, [reg_p1+8]               // reload a0 (high word) for t2
    adc    r14, rax
    adc    rcx, 0
    mulx   rax, rdx, r8
    mov    r8, [reg_p1]                 // reload a0 (low word) for t2
    add    r13, rdx
    adc    r14, rax
    adc    rcx, 0

    // t2 = (r9, r8) = 2*a0
    add    r8, r8
    adc    r9, r9

    // Reducing and storing c0
    shld   rcx, r14, 1
    shld   r14, r13, 1
    btr    r13, 63
    btr    rcx, 63                      // CF <- bit 254 of the product, added back in below
    adc    r12, r14
    adc    r13, rcx
    btr    r13, 63
    adc    r12, 0
    adc    r13, 0
    mov    [reg_p2], r12
    mov    [reg_p2+8], r13

    // c1 = 2a0 * a1, (rcx, r14, r11, r10) <- (r9, r8) * [reg_p1_16-24]
    mov    rdx, [reg_p1+16]
    mulx   r11, r10, r8
    pop    r12
    mulx   rax, r14, r9
    pop    r13
    add    r11, r14
    mov    rdx, [reg_p1+24]
    mulx   rcx, r14, r9
    adc    r14, rax
    adc    rcx, 0
    mulx   rax, rdx, r8
    add    r11, rdx
    adc    r14, rax
    adc    rcx, 0

    // Reducing and storing c1
    shld   rcx, r14, 1
    shld   r14, r11, 1
    btr    r11, 63
    btr    rcx, 63
    adc    r10, r14
    adc    r11, rcx
    btr    r11, 63
    pop    r14
    adc    r10, 0
    adc    r11, 0
    mov    [reg_p2+16], r10
    mov    [reg_p2+24], r11
    ret
//***************************************************************************
//  Quadratic extension field doubling followed by subtraction
//  Operation: c [reg_p3] = 2*a [reg_p1] - b [reg_p2] in GF(p^2), p = 2^127-1
//
//  The two 128-bit coordinates (byte offsets 0 and 16) are processed one
//  after the other with the same sequence:
//    1. double the coordinate, move bit 127 into CF and add it back
//       (2^127 = 1 mod p),
//    2. subtract the matching coordinate of b, move the sign bit into CF
//       and subtract it (equivalent to adding p when the result is negative).
//  Leaf routine, no stack use. Clobbers rax, rcx and the flags.
//***************************************************************************
.global fp2addsub1271_a
fp2addsub1271_a:
    // ---- first coordinate: c0 = 2*a0 - b0 ----
    mov    rax, [reg_p1]
    mov    rcx, [reg_p1+8]
    add    rax, rax                     // (rcx, rax) = 2*a0
    adc    rcx, rcx
    btr    rcx, 63                      // CF <- bit 127
    adc    rax, 0                       // fold it back into the low end
    adc    rcx, 0
    sub    rax, [reg_p2]                // (rcx, rax) -= b0
    sbb    rcx, [reg_p2+8]
    btr    rcx, 63                      // CF <- 1 when the difference went negative
    sbb    rax, 0
    sbb    rcx, 0
    mov    [reg_p3], rax
    mov    [reg_p3+8], rcx

    // ---- second coordinate: c1 = 2*a1 - b1 ----
    mov    rax, [reg_p1+16]
    mov    rcx, [reg_p1+24]
    add    rax, rax                     // (rcx, rax) = 2*a1
    adc    rcx, rcx
    btr    rcx, 63
    adc    rax, 0
    adc    rcx, 0
    sub    rax, [reg_p2+16]             // (rcx, rax) -= b1
    sbb    rcx, [reg_p2+24]
    btr    rcx, 63
    sbb    rax, 0
    sbb    rcx, 0
    mov    [reg_p3+16], rax
    mov    [reg_p3+24], rcx
    ret
//***********************************************************************************************
//  Constant-time table lookup to extract a point
//  Inputs: sign_mask, digit, table containing 8 points
//  Output: P = sign*table[digit], where sign=1 if sign_mask=0xFF...FF and sign=-1 if sign_mask=0
//
//  Arguments as used by the code:
//    reg_p1 = table, read as 8 entries of 128 bytes (four 32-byte vectors per entry)
//    reg_p2 = P, 128-byte output (four 32-byte vectors)
//    reg_p3 = address of the 32-bit digit (expected range 0..7)
//    reg_p4 = address of the 32-bit sign_mask
//
//  Every table entry is loaded regardless of digit and the choice is made with
//  xor/and masking, so neither the memory access pattern nor the control flow
//  depends on digit or sign_mask.
//
//  ONEx8, TWOx8 and PRIME1271 are constants defined outside this file section
//  (see consts.s); presumably ONEx8/TWOx8 hold 1 and 2 in every 32-bit lane --
//  the counter comments below rely on that.
//
//  Leaf routine, no stack use. Clobbers ymm0-ymm11, ymm14, ymm15.
//***********************************************************************************************

// One step of the scan. Loads the 128-byte table entry at byte offset entry_off and
// replaces the running selection (ymm0..ymm3) with it when the 32-bit lanes of counter
// are >= 0; keeps the running selection when they are negative.
// sel = ((sel ^ entry) & mask) ^ entry, with mask = counter >> 31 (arithmetic).
// Clobbers ymm6-ymm9 and ymm15.
.macro LUT1x8_SELECT entry_off, counter
    vmovdqu      ymm6, YMMWORD PTR [reg_p1+\entry_off]
    vmovdqu      ymm7, YMMWORD PTR [reg_p1+\entry_off+32]
    vmovdqu      ymm8, YMMWORD PTR [reg_p1+\entry_off+64]
    vmovdqu      ymm9, YMMWORD PTR [reg_p1+\entry_off+96]
    vpsrad       ymm15, \counter, 31
    vpxor        ymm0, ymm0, ymm6
    vpxor        ymm1, ymm1, ymm7
    vpxor        ymm2, ymm2, ymm8
    vpxor        ymm3, ymm3, ymm9
    vpand        ymm0, ymm0, ymm15
    vpand        ymm1, ymm1, ymm15
    vpand        ymm2, ymm2, ymm15
    vpand        ymm3, ymm3, ymm15
    vpxor        ymm0, ymm0, ymm6
    vpxor        ymm1, ymm1, ymm7
    vpxor        ymm2, ymm2, ymm8
    vpxor        ymm3, ymm3, ymm9
.endm

.global table_lookup_1x8_a
table_lookup_1x8_a:
    vpbroadcastd ymm4, DWORD PTR [reg_p3]       // digit in every 32-bit lane
    vpbroadcastd ymm14, DWORD PTR [reg_p4]      // sign_mask in every 32-bit lane
    vmovdqu      ymm5, [ONEx8+rip]
    vmovdqu      ymm11, [TWOx8+rip]

    // Running selection starts as table[0]
    vmovdqu      ymm0, YMMWORD PTR [reg_p1]
    vmovdqu      ymm1, YMMWORD PTR [reg_p1+32]
    vmovdqu      ymm2, YMMWORD PTR [reg_p1+64]
    vmovdqu      ymm3, YMMWORD PTR [reg_p1+96]

    // While digit-k >= 0 the selection is replaced by table[k]; once the counter
    // goes negative the selection is kept. Two counters (ymm4 odd k, ymm10 even k)
    // are stepped alternately.
    vmovdqu      ymm10, ymm4
    vpsubd       ymm4, ymm4, ymm5               // digit-1
    vpsubd       ymm10, ymm10, ymm11            // digit-2
    LUT1x8_SELECT 128, ymm4                     // table[1]
    LUT1x8_SELECT 256, ymm10                    // table[2]

    vpsubd       ymm4, ymm10, ymm5              // digit-3
    vpsubd       ymm10, ymm10, ymm11            // digit-4
    LUT1x8_SELECT 384, ymm4                     // table[3]
    LUT1x8_SELECT 512, ymm10                    // table[4]

    vpsubd       ymm4, ymm10, ymm5              // digit-5
    vpsubd       ymm10, ymm10, ymm11            // digit-6
    LUT1x8_SELECT 640, ymm4                     // table[5]
    LUT1x8_SELECT 768, ymm10                    // table[6]

    vpsubd       ymm4, ymm10, ymm5              // digit-7
    LUT1x8_SELECT 896, ymm4                     // table[7]

    // point: x+y,y-x,2dt, temp_point: y-x,x+y,-2dt coordinate
    // If sign_mask = 0 then choose negative of the point:
    // swap the first two vectors and negate the fourth; the third is stored unchanged.
    vmovdqu      ymm5, [PRIME1271+rip]
    vmovdqu      ymm6, ymm0                     // keep the original first vector
    vpsubq       ymm7, ymm5, ymm3               // Negate 2dt coordinate
    vpxor        ymm10, ymm0, ymm1
    vpand        ymm10, ymm10, ymm14
    vpxor        ymm0, ymm1, ymm10              // mask set: ymm0 kept, mask clear: ymm0 <- ymm1
    vpxor        ymm10, ymm6, ymm1
    vpand        ymm10, ymm10, ymm14
    vpxor        ymm1, ymm6, ymm10              // mask set: ymm1 kept, mask clear: ymm1 <- old ymm0
    vpblendvb    ymm3, ymm7, ymm3, ymm14        // mask set: ymm3 kept, mask clear: negated value

    vmovdqu      YMMWORD PTR [reg_p2], ymm0
    vmovdqu      YMMWORD PTR [reg_p2+32], ymm1
    vmovdqu      YMMWORD PTR [reg_p2+64], ymm2
    vmovdqu      YMMWORD PTR [reg_p2+96], ymm3

    // Clear the upper ymm halves before returning so that non-VEX SSE code run
    // afterwards does not pay AVX-SSE transition or false-dependency penalties.
    // All ymm registers are caller-saved in the System V ABI, so this is not
    // observable by callers.
    vzeroupper
    ret