-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathfp2_1271.S
354 lines (306 loc) · 7.35 KB
/
fp2_1271.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
//***********************************************************************************
// FourQlib: a high-performance crypto library based on the elliptic curve FourQ
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Abstract: arithmetic over GF(p^2) using x64 assembly for Linux
//***********************************************************************************
.intel_syntax noprefix
// Registers that are used for parameter passing:
#define reg_p1 rdi
#define reg_p2 rsi
#define reg_p3 rdx
#define reg_p4 rcx
.text
//**************************************************************************
// Quadratic extension field multiplication using lazy reduction
// Based on schoolbook method
// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] in GF(p^2), p = 2^127-1
//   c0 = a0*b0 - a1*b1,  c1 = a0*b1 + a1*b0
// Layout: x0 is read from byte offsets 0/8 and x1 from offsets 16/24 of each
//   operand; c0 and c1 are written to the same offsets of c.
// NOTE: only a=c is allowed for fp2mul1271_a(a, b, c). c0 is stored while
//   the high word of b0 is still to be read, so b=c would corrupt the result.
// Clobbers: rax, rcx, rdx, rsi, r8-r11, flags. r12-r15 are saved/restored.
// PUSH_SET: when defined, r12/r14/r13 are pushed up front; otherwise the same
//   pushes are interleaved with the first product. The stack order is the
//   same in both cases (r15, r12, r14, r13), matching the pops further down.
// NOTE(review): the carry handling appears to assume every coordinate is
//   below 2^127 (top bit of each high word clear) - confirm against callers.
//**************************************************************************
.global fp2mul1271_a
fp2mul1271_a:
  push   r15
#if defined(PUSH_SET)
  push   r12
  push   r14
  push   r13
#endif
  mov    rcx, reg_p3               // rcx = c, because every mul overwrites rdx

  // T0 = a0 * b0, (r11, r10, r9, r8) <- [reg_p1_0-8] * [reg_p2_0-8]
  mov    rax, [reg_p1]
  mov    r11, [reg_p2]
  mul    r11
#if !defined(PUSH_SET)
  push   r12
#endif
  xor    r10, r10
  mov    r8, rax
  mov    r9, rdx
  mov    r12, [reg_p2+8]
  mov    rax, [reg_p1]
  mul    r12
  add    r9, rax
#if !defined(PUSH_SET)
  push   r14                       // push leaves CF intact for the adc below
#endif
  adc    r10, rdx
  mov    rax, [reg_p1+8]
  mul    r11
  add    r9, rax
#if !defined(PUSH_SET)
  push   r13
#endif
  adc    r10, rdx
  mov    rax, [reg_p1+8]
  mul    r12
  add    r10, rax
  mov    r11, 0                    // mov rather than xor: CF must reach the adc
  adc    r11, rdx

  // T1 = a1 * b1, (r15, r14, r13, r12) <- [reg_p1_16-24] * [reg_p2_16-24]
  xor    r14, r14
  mov    rax, [reg_p1+16]
  mov    r15, [reg_p2+16]
  mul    r15
  mov    r12, rax
  mov    rax, [reg_p2+24]
  mov    r13, rdx
  mov    rdx, [reg_p1+16]
  mul    rdx
  add    r13, rax
  mov    rax, [reg_p1+24]
  adc    r14, rdx
  mul    r15
  add    r13, rax
  adc    r14, rdx
  mov    r15, [reg_p2+24]
  mov    rax, [reg_p1+24]
  mul    r15
  mov    r15, 0
  add    r14, rax
  adc    r15, rdx

  // c0 = T0 - T1 = a0*b0 - a1*b1
  // The sign of the 256-bit difference is recovered below from bit 254
  // (btr r11, 63 after the shift), so no separate borrow register is kept.
  sub    r8, r12
  sbb    r9, r13
  sbb    r10, r14
  sbb    r11, r15
  // Split at bit 127: (r9, r8) keeps bits 0-126, (r11, r10) gets bits 127-254
  shld   r11, r10, 1
  shld   r10, r9, 1
  mov    r15, [reg_p2+16]
  mov    rax, [reg_p1]
  btr    r9, 63

  // T0 = a0 * b1, (r15, r14, r13, r12) <- [reg_p1_0-8] * [reg_p2_16-24]
  // (interleaved with the tail of the c0 reduction)
  mul    r15
  btr    r11, 63                   // Add prime if borrow=1
  sbb    r10, 0
  sbb    r11, 0
  xor    r14, r14
  mov    r12, rax
  mov    rax, [reg_p2+24]
  mov    r13, rdx
  mov    rdx, [reg_p1]
  mul    rdx
  add    r13, rax
  mov    rax, [reg_p1+8]
  adc    r14, rdx
  mul    r15
  xor    r15, r15
  add    r13, rax
  mov    rax, [reg_p1+8]
  adc    r14, rdx
  mul    qword ptr [reg_p2+24]
  add    r8, r10                   // c0: low part + high part (2^127 = 1 mod p)
  adc    r9, r11
  add    r14, rax
  adc    r15, rdx

  // Reducing and storing c0
  btr    r9, 63                    // fold bit 127 back into bit 0
  adc    r8, 0
  mov    r11, [reg_p2]
  adc    r9, 0

  // T1 = a1 * b0, (r11, r10, r9, r8) <- [reg_p1_16-24] * [reg_p2_0-8]
  mov    rax, [reg_p1+16]
  mul    r11
  mov    [rcx], r8                 // c0 stored; a0 is not read again after this
  mov    [rcx+8], r9
  mov    r8, rax
  mov    r9, rdx
  mov    rax, [reg_p1+16]
  mov    rsi, [reg_p2+8]           // last read of b; reg_p2 is reused as scratch
  mul    rsi
  xor    r10, r10
  add    r9, rax
  adc    r10, rdx
  mov    rax, [reg_p1+24]
  mul    r11
  add    r9, rax
  adc    r10, rdx
  xor    r11, r11
  mov    rax, [reg_p1+24]
  mul    rsi
  add    r10, rax
  adc    r11, rdx

  // c1 = T0 + T1 = a0*b1 + a1*b0
  // pop does not modify flags, so the carry chain runs through the restores
  add    r8, r12
  adc    r9, r13
  pop    r13
  adc    r10, r14
  pop    r14
  pop    r12
  adc    r11, r15
  pop    r15

  // Reducing and storing c1
  shld   r11, r10, 1
  shld   r10, r9, 1
  btr    r9, 63
  btr    r11, 63                   // CF = bit 254 of the sum; 2^254 = 1 mod p
  adc    r8, r10                   // low + high + that bit
  adc    r9, r11
  btr    r9, 63
  adc    r8, 0
  adc    r9, 0
  mov    [rcx+16], r8
  mov    [rcx+24], r9
  ret
//***********************************************************************
// Quadratic extension field squaring
// Operation: c [reg_p2] = a^2 [reg_p1] in GF(p^2), p = 2^127-1
//   c0 = (a0 + a1)*(a0 - a1),  c1 = 2*a0*a1
// Layout: a0 is read from byte offsets 0/8 and a1 from offsets 16/24 of
//   reg_p1; c0 and c1 are written to the same offsets of reg_p2.
// NOTE: a=c is allowed for fp2sqr1271_a(a, c): both words of a0 are read
//   for the last time before c0 is stored, and a1 is only read before c1
//   is stored. Other overlaps (e.g. c = a+16 bytes) can overwrite a1
//   before it is re-read.
// Clobbers: rax, rcx, rdx, r8-r11, flags. r12-r14 are saved and restored.
// NOTE(review): the carry handling appears to assume a0, a1 < 2^127 (top
//   bit of each high word clear) - confirm against callers.
//***********************************************************************
.global fp2sqr1271_a
fp2sqr1271_a:
push r14
// t0 = (r9, r8) = a0 + a1, (rcx, r14) <- a1
mov r8, [reg_p1]
mov r14, [reg_p1+16]
add r8, r14
mov r9, [reg_p1+8]
mov rcx, [reg_p1+24]
adc r9, rcx
btr r9, 63 // CF = bit 127 of the sum; 2^127 = 1 mod p, so it is added back at bit 0
push r12 // push does not modify flags; CF still feeds the adc below
adc r8, 0
adc r9, 0
// t1 = (r11, r10) = a0 - a1
mov r10, [reg_p1]
sub r10, r14
mov r11, [reg_p1+8]
sbb r11, rcx
btr r11, 63 // CF = bit 127, set when the subtraction went negative
sbb r10, 0 // clearing bit 127 and subtracting CF adds p in that case
push r13
sbb r11, 0
// c0 = t0 * t1 = (a0 + a1)*(a0 - a1), (rcx, r14, r13, r12) <- (r9, r8) * (r11, r10)
xor r14, r14
mov rax, r8
mul r10
mov r12, rax
mov rax, r11
mov r13, rdx
mul r8
xor rcx, rcx
add r13, rax
adc r14, rdx
mov rax, r9
mul r10
mov r8, [reg_p1] // low word of t0 is no longer needed; reload a0 for t2
add r13, rax
adc r14, rdx
mov rax, r9
mul r11
mov r9, [reg_p1+8] // last read of a0 (before c0 is stored)
add r14, rax
adc rcx, rdx
// t2 = (r9, r8) = 2*a0
add r8, r8
adc r9, r9
btr r9, 63
adc r8, 0
adc r9, 0
// Reducing and storing c0
// Split the product at bit 127: (r13, r12) keeps bits 0-126, (rcx, r14) gets the bits above
shld rcx, r14, 1
shld r14, r13, 1
btr r13, 63
add r12, r14 // low + high, since 2^127 = 1 mod p
adc r13, rcx
btr r13, 63 // fold the carry into bit 127 back to bit 0
adc r12, 0
adc r13, 0
mov [reg_p2], r12
mov [reg_p2+8], r13
// c1 = 2a0 * a1, (rcx, r14, r11, r10) <- (r9, r8) * [reg_p1_16-24]
mov rcx, [reg_p1+16]
mov rax, r8
mul rcx
mov r10, rax
mov r11, rdx
mov rax, [reg_p1+24]
xor r14, r14
mul r8
add r11, rax
adc r14, rdx
mov rax, rcx
mul r9
add r11, rax
adc r14, rdx
mov rax, [reg_p1+24]
mul r9
xor rcx, rcx
add r14, rax
pop r13 // pop does not modify flags; CF from the add still reaches the adc
adc rcx, rdx
// Reducing and storing c1
shld rcx, r14, 1
shld r14, r11, 1
btr r11, 63
add r10, r14
pop r12
adc r11, rcx
btr r11, 63
adc r10, 0
pop r14
adc r11, 0
mov [reg_p2+16], r10
mov [reg_p2+24], r11
ret
//***************************************************************************
// Quadratic extension field addition/subtraction
// Operation: c [reg_p3] = 2*a [reg_p1] - b [reg_p2] in GF(p^2), p = 2^127-1
// The two coordinates (byte offsets 0 and 16) are processed independently
// by the same instruction sequence, expanded from the macro below.
// Clobbers: r8, r9, flags.
//***************************************************************************

// One coordinate at byte displacement disp:
//   (r9, r8) = 2*a[disp] - b[disp], stored to c[disp]
.macro FP2ADDSUB_COORD disp
  // (r9, r8) = 2*a
  mov    r8, [reg_p1+\disp]
  mov    r9, [reg_p1+\disp+8]
  add    r8, r8
  adc    r9, r9
  btr    r9, 63                    // CF = bit 127; 2^127 = 1 mod p
  adc    r8, 0                     // so it is added back at bit 0
  adc    r9, 0
  // (r9, r8) -= b
  sub    r8, [reg_p2+\disp]
  sbb    r9, [reg_p2+\disp+8]
  btr    r9, 63                    // CF = bit 127, set on a negative result
  sbb    r8, 0                     // clear bit 127 and subtract CF
  sbb    r9, 0
  // both words of b were read above, before c is written
  mov    [reg_p3+\disp], r8
  mov    [reg_p3+\disp+8], r9
.endm

.global fp2addsub1271_a
fp2addsub1271_a:
  FP2ADDSUB_COORD 0
  FP2ADDSUB_COORD 16
  ret