1 |
/* Copyright 2008, Google Inc. |
2 |
* All rights reserved. |
3 |
* |
4 |
* Redistribution and use in source and binary forms, with or without |
5 |
* modification, are permitted provided that the following conditions are |
6 |
* met: |
7 |
* |
8 |
* * Redistributions of source code must retain the above copyright |
9 |
* notice, this list of conditions and the following disclaimer. |
10 |
* * Redistributions in binary form must reproduce the above |
11 |
* copyright notice, this list of conditions and the following disclaimer |
12 |
* in the documentation and/or other materials provided with the |
13 |
* distribution. |
14 |
* * Neither the name of Google Inc. nor the names of its |
15 |
* contributors may be used to endorse or promote products derived from |
16 |
* this software without specific prior written permission. |
17 |
* |
18 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 |
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 |
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 |
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 |
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 |
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 |
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 |
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 |
* |
30 |
* curve25519-donna: Curve25519 elliptic curve, public key function |
31 |
* |
32 |
* http://code.google.com/p/curve25519-donna/ |
33 |
* |
34 |
* Adam Langley <agl@imperialviolet.org> |
35 |
* |
36 |
* Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> |
37 |
* |
38 |
* More information about curve25519 can be found here |
39 |
* http://cr.yp.to/ecdh.html |
40 |
* |
41 |
* djb's sample implementation of curve25519 is written in a special assembly |
42 |
* language called qhasm and uses the floating point registers. |
43 |
* |
44 |
* This is, almost, a clean room reimplementation from the curve25519 paper. It |
45 |
* uses many of the tricks described therein. Only the crecip function is taken |
46 |
* from the sample implementation. */ |
47 |
|
48 |
#include <string.h> |
49 |
#include <stdint.h> |
50 |
|
51 |
#ifdef _MSC_VER |
52 |
#define inline __inline |
53 |
#endif |
54 |
|
55 |
typedef uint8_t u8; |
56 |
typedef int32_t s32; |
57 |
typedef int64_t limb; |
58 |
|
59 |
/* Field element representation: |
60 |
* |
61 |
* Field elements are written as an array of signed, 64-bit limbs, least |
62 |
* significant first. The value of the field element is: |
63 |
* x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... |
64 |
* |
65 |
* i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ |
66 |
|
67 |
/* Sum two numbers: output += in */ |
68 |
static void fsum(limb *output, const limb *in) { |
69 |
unsigned i; |
70 |
for (i = 0; i < 10; i += 2) { |
71 |
output[0+i] = output[0+i] + in[0+i]; |
72 |
output[1+i] = output[1+i] + in[1+i]; |
73 |
} |
74 |
} |
75 |
|
76 |
/* Find the difference of two numbers: output = in - output |
77 |
* (note the order of the arguments!). */ |
78 |
static void fdifference(limb *output, const limb *in) { |
79 |
unsigned i; |
80 |
for (i = 0; i < 10; ++i) { |
81 |
output[i] = in[i] - output[i]; |
82 |
} |
83 |
} |
84 |
|
85 |
/* Multiply a number by a scalar: output = in * scalar */ |
86 |
static void fscalar_product(limb *output, const limb *in, const limb scalar) { |
87 |
unsigned i; |
88 |
for (i = 0; i < 10; ++i) { |
89 |
output[i] = in[i] * scalar; |
90 |
} |
91 |
} |
92 |
|
93 |
/* Multiply two numbers: output = in2 * in |
94 |
* |
95 |
* output must be distinct to both inputs. The inputs are reduced coefficient |
96 |
* form, the output is not. |
97 |
* |
98 |
* output[x] <= 14 * the largest product of the input limbs. */ |
99 |
static void fproduct(limb *output, const limb *in2, const limb *in) { |
100 |
output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); |
101 |
output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + |
102 |
((limb) ((s32) in2[1])) * ((s32) in[0]); |
103 |
output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + |
104 |
((limb) ((s32) in2[0])) * ((s32) in[2]) + |
105 |
((limb) ((s32) in2[2])) * ((s32) in[0]); |
106 |
output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) + |
107 |
((limb) ((s32) in2[2])) * ((s32) in[1]) + |
108 |
((limb) ((s32) in2[0])) * ((s32) in[3]) + |
109 |
((limb) ((s32) in2[3])) * ((s32) in[0]); |
110 |
output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) + |
111 |
2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) + |
112 |
((limb) ((s32) in2[3])) * ((s32) in[1])) + |
113 |
((limb) ((s32) in2[0])) * ((s32) in[4]) + |
114 |
((limb) ((s32) in2[4])) * ((s32) in[0]); |
115 |
output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) + |
116 |
((limb) ((s32) in2[3])) * ((s32) in[2]) + |
117 |
((limb) ((s32) in2[1])) * ((s32) in[4]) + |
118 |
((limb) ((s32) in2[4])) * ((s32) in[1]) + |
119 |
((limb) ((s32) in2[0])) * ((s32) in[5]) + |
120 |
((limb) ((s32) in2[5])) * ((s32) in[0]); |
121 |
output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) + |
122 |
((limb) ((s32) in2[1])) * ((s32) in[5]) + |
123 |
((limb) ((s32) in2[5])) * ((s32) in[1])) + |
124 |
((limb) ((s32) in2[2])) * ((s32) in[4]) + |
125 |
((limb) ((s32) in2[4])) * ((s32) in[2]) + |
126 |
((limb) ((s32) in2[0])) * ((s32) in[6]) + |
127 |
((limb) ((s32) in2[6])) * ((s32) in[0]); |
128 |
output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) + |
129 |
((limb) ((s32) in2[4])) * ((s32) in[3]) + |
130 |
((limb) ((s32) in2[2])) * ((s32) in[5]) + |
131 |
((limb) ((s32) in2[5])) * ((s32) in[2]) + |
132 |
((limb) ((s32) in2[1])) * ((s32) in[6]) + |
133 |
((limb) ((s32) in2[6])) * ((s32) in[1]) + |
134 |
((limb) ((s32) in2[0])) * ((s32) in[7]) + |
135 |
((limb) ((s32) in2[7])) * ((s32) in[0]); |
136 |
output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) + |
137 |
2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) + |
138 |
((limb) ((s32) in2[5])) * ((s32) in[3]) + |
139 |
((limb) ((s32) in2[1])) * ((s32) in[7]) + |
140 |
((limb) ((s32) in2[7])) * ((s32) in[1])) + |
141 |
((limb) ((s32) in2[2])) * ((s32) in[6]) + |
142 |
((limb) ((s32) in2[6])) * ((s32) in[2]) + |
143 |
((limb) ((s32) in2[0])) * ((s32) in[8]) + |
144 |
((limb) ((s32) in2[8])) * ((s32) in[0]); |
145 |
output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) + |
146 |
((limb) ((s32) in2[5])) * ((s32) in[4]) + |
147 |
((limb) ((s32) in2[3])) * ((s32) in[6]) + |
148 |
((limb) ((s32) in2[6])) * ((s32) in[3]) + |
149 |
((limb) ((s32) in2[2])) * ((s32) in[7]) + |
150 |
((limb) ((s32) in2[7])) * ((s32) in[2]) + |
151 |
((limb) ((s32) in2[1])) * ((s32) in[8]) + |
152 |
((limb) ((s32) in2[8])) * ((s32) in[1]) + |
153 |
((limb) ((s32) in2[0])) * ((s32) in[9]) + |
154 |
((limb) ((s32) in2[9])) * ((s32) in[0]); |
155 |
output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) + |
156 |
((limb) ((s32) in2[3])) * ((s32) in[7]) + |
157 |
((limb) ((s32) in2[7])) * ((s32) in[3]) + |
158 |
((limb) ((s32) in2[1])) * ((s32) in[9]) + |
159 |
((limb) ((s32) in2[9])) * ((s32) in[1])) + |
160 |
((limb) ((s32) in2[4])) * ((s32) in[6]) + |
161 |
((limb) ((s32) in2[6])) * ((s32) in[4]) + |
162 |
((limb) ((s32) in2[2])) * ((s32) in[8]) + |
163 |
((limb) ((s32) in2[8])) * ((s32) in[2]); |
164 |
output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) + |
165 |
((limb) ((s32) in2[6])) * ((s32) in[5]) + |
166 |
((limb) ((s32) in2[4])) * ((s32) in[7]) + |
167 |
((limb) ((s32) in2[7])) * ((s32) in[4]) + |
168 |
((limb) ((s32) in2[3])) * ((s32) in[8]) + |
169 |
((limb) ((s32) in2[8])) * ((s32) in[3]) + |
170 |
((limb) ((s32) in2[2])) * ((s32) in[9]) + |
171 |
((limb) ((s32) in2[9])) * ((s32) in[2]); |
172 |
output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) + |
173 |
2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) + |
174 |
((limb) ((s32) in2[7])) * ((s32) in[5]) + |
175 |
((limb) ((s32) in2[3])) * ((s32) in[9]) + |
176 |
((limb) ((s32) in2[9])) * ((s32) in[3])) + |
177 |
((limb) ((s32) in2[4])) * ((s32) in[8]) + |
178 |
((limb) ((s32) in2[8])) * ((s32) in[4]); |
179 |
output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) + |
180 |
((limb) ((s32) in2[7])) * ((s32) in[6]) + |
181 |
((limb) ((s32) in2[5])) * ((s32) in[8]) + |
182 |
((limb) ((s32) in2[8])) * ((s32) in[5]) + |
183 |
((limb) ((s32) in2[4])) * ((s32) in[9]) + |
184 |
((limb) ((s32) in2[9])) * ((s32) in[4]); |
185 |
output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) + |
186 |
((limb) ((s32) in2[5])) * ((s32) in[9]) + |
187 |
((limb) ((s32) in2[9])) * ((s32) in[5])) + |
188 |
((limb) ((s32) in2[6])) * ((s32) in[8]) + |
189 |
((limb) ((s32) in2[8])) * ((s32) in[6]); |
190 |
output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) + |
191 |
((limb) ((s32) in2[8])) * ((s32) in[7]) + |
192 |
((limb) ((s32) in2[6])) * ((s32) in[9]) + |
193 |
((limb) ((s32) in2[9])) * ((s32) in[6]); |
194 |
output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) + |
195 |
2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) + |
196 |
((limb) ((s32) in2[9])) * ((s32) in[7])); |
197 |
output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + |
198 |
((limb) ((s32) in2[9])) * ((s32) in[8]); |
199 |
output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); |
200 |
} |
201 |
|
202 |
/* Reduce a long form to a short form by taking the input mod 2^255 - 19. |
203 |
* |
204 |
* On entry: |output[i]| < 14*2^54 |
205 |
* On exit: |output[0..8]| < 280*2^54 */ |
206 |
static void freduce_degree(limb *output) { |
207 |
/* Each of these shifts and adds ends up multiplying the value by 19. |
208 |
* |
209 |
* For output[0..8], the absolute entry value is < 14*2^54 and we add, at |
210 |
* most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */ |
211 |
output[8] += output[18] << 4; |
212 |
output[8] += output[18] << 1; |
213 |
output[8] += output[18]; |
214 |
output[7] += output[17] << 4; |
215 |
output[7] += output[17] << 1; |
216 |
output[7] += output[17]; |
217 |
output[6] += output[16] << 4; |
218 |
output[6] += output[16] << 1; |
219 |
output[6] += output[16]; |
220 |
output[5] += output[15] << 4; |
221 |
output[5] += output[15] << 1; |
222 |
output[5] += output[15]; |
223 |
output[4] += output[14] << 4; |
224 |
output[4] += output[14] << 1; |
225 |
output[4] += output[14]; |
226 |
output[3] += output[13] << 4; |
227 |
output[3] += output[13] << 1; |
228 |
output[3] += output[13]; |
229 |
output[2] += output[12] << 4; |
230 |
output[2] += output[12] << 1; |
231 |
output[2] += output[12]; |
232 |
output[1] += output[11] << 4; |
233 |
output[1] += output[11] << 1; |
234 |
output[1] += output[11]; |
235 |
output[0] += output[10] << 4; |
236 |
output[0] += output[10] << 1; |
237 |
output[0] += output[10]; |
238 |
} |
239 |
|
240 |
#if (-1 & 3) != 3 |
241 |
#error "This code only works on a two's complement system" |
242 |
#endif |
243 |
|
244 |
/* return v / 2^26, using only shifts and adds. |
245 |
* |
246 |
* On entry: v can take any value. */ |
247 |
static inline limb |
248 |
div_by_2_26(const limb v) |
249 |
{ |
250 |
/* High word of v; no shift needed. */ |
251 |
const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
252 |
/* Set to all 1s if v was negative; else set to 0s. */ |
253 |
const int32_t sign = ((int32_t) highword) >> 31; |
254 |
/* Set to 0x3ffffff if v was negative; else set to 0. */ |
255 |
const int32_t roundoff = ((uint32_t) sign) >> 6; |
256 |
/* Should return v / (1<<26) */ |
257 |
return (v + roundoff) >> 26; |
258 |
} |
259 |
|
260 |
/* return v / (2^25), using only shifts and adds. |
261 |
* |
262 |
* On entry: v can take any value. */ |
263 |
static inline limb |
264 |
div_by_2_25(const limb v) |
265 |
{ |
266 |
/* High word of v; no shift needed*/ |
267 |
const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
268 |
/* Set to all 1s if v was negative; else set to 0s. */ |
269 |
const int32_t sign = ((int32_t) highword) >> 31; |
270 |
/* Set to 0x1ffffff if v was negative; else set to 0. */ |
271 |
const int32_t roundoff = ((uint32_t) sign) >> 7; |
272 |
/* Should return v / (1<<25) */ |
273 |
return (v + roundoff) >> 25; |
274 |
} |
275 |
|
276 |
/* Reduce all coefficients of the short form input so that |x| < 2^26. |
277 |
* |
278 |
* On entry: |output[i]| < 280*2^54 */ |
279 |
static void freduce_coefficients(limb *output) { |
280 |
unsigned i; |
281 |
|
282 |
output[10] = 0; |
283 |
|
284 |
for (i = 0; i < 10; i += 2) { |
285 |
limb over = div_by_2_26(output[i]); |
286 |
/* The entry condition (that |output[i]| < 280*2^54) means that over is, at |
287 |
* most, 280*2^28 in the first iteration of this loop. This is added to the |
288 |
* next limb and we can approximate the resulting bound of that limb by |
289 |
* 281*2^54. */ |
290 |
output[i] -= over << 26; |
291 |
output[i+1] += over; |
292 |
|
293 |
/* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < |
294 |
* 281*2^29. When this is added to the next limb, the resulting bound can |
295 |
* be approximated as 281*2^54. |
296 |
* |
297 |
* For subsequent iterations of the loop, 281*2^54 remains a conservative |
298 |
* bound and no overflow occurs. */ |
299 |
over = div_by_2_25(output[i+1]); |
300 |
output[i+1] -= over << 25; |
301 |
output[i+2] += over; |
302 |
} |
303 |
/* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ |
304 |
output[0] += output[10] << 4; |
305 |
output[0] += output[10] << 1; |
306 |
output[0] += output[10]; |
307 |
|
308 |
output[10] = 0; |
309 |
|
310 |
/* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 |
311 |
* So |over| will be no more than 2^16. */ |
312 |
{ |
313 |
limb over = div_by_2_26(output[0]); |
314 |
output[0] -= over << 26; |
315 |
output[1] += over; |
316 |
} |
317 |
|
318 |
/* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The |
319 |
* bound on |output[1]| is sufficient to meet our needs. */ |
320 |
} |
321 |
|
322 |
/* A helpful wrapper around fproduct: output = in * in2. |
323 |
* |
324 |
* On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. |
325 |
* |
326 |
* output must be distinct to both inputs. The output is reduced degree |
327 |
* (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */ |
328 |
static void |
329 |
fmul(limb *output, const limb *in, const limb *in2) { |
330 |
limb t[19]; |
331 |
fproduct(t, in, in2); |
332 |
/* |t[i]| < 14*2^54 */ |
333 |
freduce_degree(t); |
334 |
freduce_coefficients(t); |
335 |
/* |t[i]| < 2^26 */ |
336 |
memcpy(output, t, sizeof(limb) * 10); |
337 |
} |
338 |
|
339 |
/* Square a number: output = in**2 |
340 |
* |
341 |
* output must be distinct from the input. The inputs are reduced coefficient |
342 |
* form, the output is not. |
343 |
* |
344 |
* output[x] <= 14 * the largest product of the input limbs. */ |
345 |
static void fsquare_inner(limb *output, const limb *in) { |
346 |
output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); |
347 |
output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); |
348 |
output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + |
349 |
((limb) ((s32) in[0])) * ((s32) in[2])); |
350 |
output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) + |
351 |
((limb) ((s32) in[0])) * ((s32) in[3])); |
352 |
output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + |
353 |
4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + |
354 |
2 * ((limb) ((s32) in[0])) * ((s32) in[4]); |
355 |
output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) + |
356 |
((limb) ((s32) in[1])) * ((s32) in[4]) + |
357 |
((limb) ((s32) in[0])) * ((s32) in[5])); |
358 |
output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) + |
359 |
((limb) ((s32) in[2])) * ((s32) in[4]) + |
360 |
((limb) ((s32) in[0])) * ((s32) in[6]) + |
361 |
2 * ((limb) ((s32) in[1])) * ((s32) in[5])); |
362 |
output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) + |
363 |
((limb) ((s32) in[2])) * ((s32) in[5]) + |
364 |
((limb) ((s32) in[1])) * ((s32) in[6]) + |
365 |
((limb) ((s32) in[0])) * ((s32) in[7])); |
366 |
output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + |
367 |
2 * (((limb) ((s32) in[2])) * ((s32) in[6]) + |
368 |
((limb) ((s32) in[0])) * ((s32) in[8]) + |
369 |
2 * (((limb) ((s32) in[1])) * ((s32) in[7]) + |
370 |
((limb) ((s32) in[3])) * ((s32) in[5]))); |
371 |
output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) + |
372 |
((limb) ((s32) in[3])) * ((s32) in[6]) + |
373 |
((limb) ((s32) in[2])) * ((s32) in[7]) + |
374 |
((limb) ((s32) in[1])) * ((s32) in[8]) + |
375 |
((limb) ((s32) in[0])) * ((s32) in[9])); |
376 |
output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) + |
377 |
((limb) ((s32) in[4])) * ((s32) in[6]) + |
378 |
((limb) ((s32) in[2])) * ((s32) in[8]) + |
379 |
2 * (((limb) ((s32) in[3])) * ((s32) in[7]) + |
380 |
((limb) ((s32) in[1])) * ((s32) in[9]))); |
381 |
output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) + |
382 |
((limb) ((s32) in[4])) * ((s32) in[7]) + |
383 |
((limb) ((s32) in[3])) * ((s32) in[8]) + |
384 |
((limb) ((s32) in[2])) * ((s32) in[9])); |
385 |
output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + |
386 |
2 * (((limb) ((s32) in[4])) * ((s32) in[8]) + |
387 |
2 * (((limb) ((s32) in[5])) * ((s32) in[7]) + |
388 |
((limb) ((s32) in[3])) * ((s32) in[9]))); |
389 |
output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) + |
390 |
((limb) ((s32) in[5])) * ((s32) in[8]) + |
391 |
((limb) ((s32) in[4])) * ((s32) in[9])); |
392 |
output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) + |
393 |
((limb) ((s32) in[6])) * ((s32) in[8]) + |
394 |
2 * ((limb) ((s32) in[5])) * ((s32) in[9])); |
395 |
output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) + |
396 |
((limb) ((s32) in[6])) * ((s32) in[9])); |
397 |
output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + |
398 |
4 * ((limb) ((s32) in[7])) * ((s32) in[9]); |
399 |
output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); |
400 |
output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); |
401 |
} |
402 |
|
403 |
/* fsquare sets output = in^2. |
404 |
* |
405 |
* On entry: The |in| argument is in reduced coefficients form and |in[i]| < |
406 |
* 2^27. |
407 |
* |
408 |
* On exit: The |output| argument is in reduced coefficients form (indeed, one |
409 |
* need only provide storage for 10 limbs) and |out[i]| < 2^26. */ |
410 |
static void |
411 |
fsquare(limb *output, const limb *in) { |
412 |
limb t[19]; |
413 |
fsquare_inner(t, in); |
414 |
/* |t[i]| < 14*2^54 because the largest product of two limbs will be < |
415 |
* 2^(27+27) and fsquare_inner adds together, at most, 14 of those |
416 |
* products. */ |
417 |
freduce_degree(t); |
418 |
freduce_coefficients(t); |
419 |
/* |t[i]| < 2^26 */ |
420 |
memcpy(output, t, sizeof(limb) * 10); |
421 |
} |
422 |
|
423 |
/* Take a little-endian, 32-byte number and expand it into polynomial form */ |
424 |
static void |
425 |
fexpand(limb *output, const u8 *input) { |
426 |
#define F(n,start,shift,mask) \ |
427 |
output[n] = ((((limb) input[start + 0]) | \ |
428 |
((limb) input[start + 1]) << 8 | \ |
429 |
((limb) input[start + 2]) << 16 | \ |
430 |
((limb) input[start + 3]) << 24) >> shift) & mask; |
431 |
F(0, 0, 0, 0x3ffffff); |
432 |
F(1, 3, 2, 0x1ffffff); |
433 |
F(2, 6, 3, 0x3ffffff); |
434 |
F(3, 9, 5, 0x1ffffff); |
435 |
F(4, 12, 6, 0x3ffffff); |
436 |
F(5, 16, 0, 0x1ffffff); |
437 |
F(6, 19, 1, 0x3ffffff); |
438 |
F(7, 22, 3, 0x1ffffff); |
439 |
F(8, 25, 4, 0x3ffffff); |
440 |
F(9, 28, 6, 0x1ffffff); |
441 |
#undef F |
442 |
} |
443 |
|
444 |
#if (-32 >> 1) != -16 |
445 |
#error "This code only works when >> does sign-extension on negative numbers" |
446 |
#endif |
447 |
|
448 |
/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ |
449 |
static s32 s32_eq(s32 a, s32 b) { |
450 |
a = ~(a ^ b); |
451 |
a &= a << 16; |
452 |
a &= a << 8; |
453 |
a &= a << 4; |
454 |
a &= a << 2; |
455 |
a &= a << 1; |
456 |
return a >> 31; |
457 |
} |
458 |
|
459 |
/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are |
460 |
* both non-negative. */ |
461 |
static s32 s32_gte(s32 a, s32 b) { |
462 |
a -= b; |
463 |
/* a >= 0 iff a >= b. */ |
464 |
return ~(a >> 31); |
465 |
} |
466 |
|
467 |
/* Take a fully reduced polynomial form number and contract it into a |
468 |
* little-endian, 32-byte array. |
469 |
* |
470 |
* On entry: |input_limbs[i]| < 2^26 */ |
471 |
static void |
472 |
fcontract(u8 *output, limb *input_limbs) { |
473 |
int i; |
474 |
int j; |
475 |
s32 input[10]; |
476 |
s32 mask; |
477 |
|
478 |
/* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ |
479 |
for (i = 0; i < 10; i++) { |
480 |
input[i] = input_limbs[i]; |
481 |
} |
482 |
|
483 |
for (j = 0; j < 2; ++j) { |
484 |
for (i = 0; i < 9; ++i) { |
485 |
if ((i & 1) == 1) { |
486 |
/* This calculation is a time-invariant way to make input[i] |
487 |
* non-negative by borrowing from the next-larger limb. */ |
488 |
const s32 mask = input[i] >> 31; |
489 |
const s32 carry = -((input[i] & mask) >> 25); |
490 |
input[i] = input[i] + (carry << 25); |
491 |
input[i+1] = input[i+1] - carry; |
492 |
} else { |
493 |
const s32 mask = input[i] >> 31; |
494 |
const s32 carry = -((input[i] & mask) >> 26); |
495 |
input[i] = input[i] + (carry << 26); |
496 |
input[i+1] = input[i+1] - carry; |
497 |
} |
498 |
} |
499 |
|
500 |
/* There's no greater limb for input[9] to borrow from, but we can multiply |
501 |
* by 19 and borrow from input[0], which is valid mod 2^255-19. */ |
502 |
{ |
503 |
const s32 mask = input[9] >> 31; |
504 |
const s32 carry = -((input[9] & mask) >> 25); |
505 |
input[9] = input[9] + (carry << 25); |
506 |
input[0] = input[0] - (carry * 19); |
507 |
} |
508 |
|
509 |
/* After the first iteration, input[1..9] are non-negative and fit within |
510 |
* 25 or 26 bits, depending on position. However, input[0] may be |
511 |
* negative. */ |
512 |
} |
513 |
|
514 |
/* The first borrow-propagation pass above ended with every limb |
515 |
except (possibly) input[0] non-negative. |
516 |
|
517 |
If input[0] was negative after the first pass, then it was because of a |
518 |
carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, |
519 |
one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. |
520 |
|
521 |
In the second pass, each limb is decreased by at most one. Thus the second |
522 |
borrow-propagation pass could only have wrapped around to decrease |
523 |
input[0] again if the first pass left input[0] negative *and* input[1] |
524 |
through input[9] were all zero. In that case, input[1] is now 2^25 - 1, |
525 |
and this last borrow-propagation step will leave input[1] non-negative. */ |
526 |
{ |
527 |
const s32 mask = input[0] >> 31; |
528 |
const s32 carry = -((input[0] & mask) >> 26); |
529 |
input[0] = input[0] + (carry << 26); |
530 |
input[1] = input[1] - carry; |
531 |
} |
532 |
|
533 |
/* All input[i] are now non-negative. However, there might be values between |
534 |
* 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ |
535 |
for (j = 0; j < 2; j++) { |
536 |
for (i = 0; i < 9; i++) { |
537 |
if ((i & 1) == 1) { |
538 |
const s32 carry = input[i] >> 25; |
539 |
input[i] &= 0x1ffffff; |
540 |
input[i+1] += carry; |
541 |
} else { |
542 |
const s32 carry = input[i] >> 26; |
543 |
input[i] &= 0x3ffffff; |
544 |
input[i+1] += carry; |
545 |
} |
546 |
} |
547 |
|
548 |
{ |
549 |
const s32 carry = input[9] >> 25; |
550 |
input[9] &= 0x1ffffff; |
551 |
input[0] += 19*carry; |
552 |
} |
553 |
} |
554 |
|
555 |
/* If the first carry-chain pass, just above, ended up with a carry from |
556 |
* input[9], and that caused input[0] to be out-of-bounds, then input[0] was |
557 |
* < 2^26 + 2*19, because the carry was, at most, two. |
558 |
* |
559 |
* If the second pass carried from input[9] again then input[0] is < 2*19 and |
560 |
* the input[9] -> input[0] carry didn't push input[0] out of bounds. */ |
561 |
|
562 |
/* It still remains the case that input might be between 2^255-19 and 2^255. |
563 |
* In this case, input[1..9] must take their maximum value and input[0] must |
564 |
* be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ |
565 |
mask = s32_gte(input[0], 0x3ffffed); |
566 |
for (i = 1; i < 10; i++) { |
567 |
if ((i & 1) == 1) { |
568 |
mask &= s32_eq(input[i], 0x1ffffff); |
569 |
} else { |
570 |
mask &= s32_eq(input[i], 0x3ffffff); |
571 |
} |
572 |
} |
573 |
|
574 |
/* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus |
575 |
* this conditionally subtracts 2^255-19. */ |
576 |
input[0] -= mask & 0x3ffffed; |
577 |
|
578 |
for (i = 1; i < 10; i++) { |
579 |
if ((i & 1) == 1) { |
580 |
input[i] -= mask & 0x1ffffff; |
581 |
} else { |
582 |
input[i] -= mask & 0x3ffffff; |
583 |
} |
584 |
} |
585 |
|
586 |
input[1] <<= 2; |
587 |
input[2] <<= 3; |
588 |
input[3] <<= 5; |
589 |
input[4] <<= 6; |
590 |
input[6] <<= 1; |
591 |
input[7] <<= 3; |
592 |
input[8] <<= 4; |
593 |
input[9] <<= 6; |
594 |
#define F(i, s) \ |
595 |
output[s+0] |= input[i] & 0xff; \ |
596 |
output[s+1] = (input[i] >> 8) & 0xff; \ |
597 |
output[s+2] = (input[i] >> 16) & 0xff; \ |
598 |
output[s+3] = (input[i] >> 24) & 0xff; |
599 |
output[0] = 0; |
600 |
output[16] = 0; |
601 |
F(0,0); |
602 |
F(1,3); |
603 |
F(2,6); |
604 |
F(3,9); |
605 |
F(4,12); |
606 |
F(5,16); |
607 |
F(6,19); |
608 |
F(7,22); |
609 |
F(8,25); |
610 |
F(9,28); |
611 |
#undef F |
612 |
} |
613 |
|
614 |
/* Input: Q, Q', Q-Q' |
615 |
* Output: 2Q, Q+Q' |
616 |
* |
617 |
* x2 z3: long form |
618 |
* x3 z3: long form |
619 |
* x z: short form, destroyed |
620 |
* xprime zprime: short form, destroyed |
621 |
* qmqp: short form, preserved |
622 |
* |
623 |
* On entry and exit, the absolute value of the limbs of all inputs and outputs |
624 |
* are < 2^26. */ |
625 |
static void fmonty(limb *x2, limb *z2, /* output 2Q */ |
626 |
limb *x3, limb *z3, /* output Q + Q' */ |
627 |
limb *x, limb *z, /* input Q */ |
628 |
limb *xprime, limb *zprime, /* input Q' */ |
629 |
const limb *qmqp /* input Q - Q' */) { |
630 |
limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], |
631 |
zzprime[19], zzzprime[19], xxxprime[19]; |
632 |
|
633 |
memcpy(origx, x, 10 * sizeof(limb)); |
634 |
fsum(x, z); |
635 |
/* |x[i]| < 2^27 */ |
636 |
fdifference(z, origx); /* does x - z */ |
637 |
/* |z[i]| < 2^27 */ |
638 |
|
639 |
memcpy(origxprime, xprime, sizeof(limb) * 10); |
640 |
fsum(xprime, zprime); |
641 |
/* |xprime[i]| < 2^27 */ |
642 |
fdifference(zprime, origxprime); |
643 |
/* |zprime[i]| < 2^27 */ |
644 |
fproduct(xxprime, xprime, z); |
645 |
/* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < |
646 |
* 2^(27+27) and fproduct adds together, at most, 14 of those products. |
647 |
* (Approximating that to 2^58 doesn't work out.) */ |
648 |
fproduct(zzprime, x, zprime); |
649 |
/* |zzprime[i]| < 14*2^54 */ |
650 |
freduce_degree(xxprime); |
651 |
freduce_coefficients(xxprime); |
652 |
/* |xxprime[i]| < 2^26 */ |
653 |
freduce_degree(zzprime); |
654 |
freduce_coefficients(zzprime); |
655 |
/* |zzprime[i]| < 2^26 */ |
656 |
memcpy(origxprime, xxprime, sizeof(limb) * 10); |
657 |
fsum(xxprime, zzprime); |
658 |
/* |xxprime[i]| < 2^27 */ |
659 |
fdifference(zzprime, origxprime); |
660 |
/* |zzprime[i]| < 2^27 */ |
661 |
fsquare(xxxprime, xxprime); |
662 |
/* |xxxprime[i]| < 2^26 */ |
663 |
fsquare(zzzprime, zzprime); |
664 |
/* |zzzprime[i]| < 2^26 */ |
665 |
fproduct(zzprime, zzzprime, qmqp); |
666 |
/* |zzprime[i]| < 14*2^52 */ |
667 |
freduce_degree(zzprime); |
668 |
freduce_coefficients(zzprime); |
669 |
/* |zzprime[i]| < 2^26 */ |
670 |
memcpy(x3, xxxprime, sizeof(limb) * 10); |
671 |
memcpy(z3, zzprime, sizeof(limb) * 10); |
672 |
|
673 |
fsquare(xx, x); |
674 |
/* |xx[i]| < 2^26 */ |
675 |
fsquare(zz, z); |
676 |
/* |zz[i]| < 2^26 */ |
677 |
fproduct(x2, xx, zz); |
678 |
/* |x2[i]| < 14*2^52 */ |
679 |
freduce_degree(x2); |
680 |
freduce_coefficients(x2); |
681 |
/* |x2[i]| < 2^26 */ |
682 |
fdifference(zz, xx); // does zz = xx - zz |
683 |
/* |zz[i]| < 2^27 */ |
684 |
memset(zzz + 10, 0, sizeof(limb) * 9); |
685 |
fscalar_product(zzz, zz, 121665); |
686 |
/* |zzz[i]| < 2^(27+17) */ |
687 |
/* No need to call freduce_degree here: |
688 |
fscalar_product doesn't increase the degree of its input. */ |
689 |
freduce_coefficients(zzz); |
690 |
/* |zzz[i]| < 2^26 */ |
691 |
fsum(zzz, xx); |
692 |
/* |zzz[i]| < 2^27 */ |
693 |
fproduct(z2, zz, zzz); |
694 |
/* |z2[i]| < 14*2^(26+27) */ |
695 |
freduce_degree(z2); |
696 |
freduce_coefficients(z2); |
697 |
/* |z2|i| < 2^26 */ |
698 |
} |
699 |
|
700 |
/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave |
701 |
* them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid |
702 |
* side-channel attacks. |
703 |
* |
704 |
* NOTE that this function requires that 'iswap' be 1 or 0; other values give |
705 |
* wrong results. Also, the two limb arrays must be in reduced-coefficient, |
706 |
* reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, |
707 |
* and all all values in a[0..9],b[0..9] must have magnitude less than |
708 |
* INT32_MAX. */ |
709 |
static void |
710 |
swap_conditional(limb a[19], limb b[19], limb iswap) { |
711 |
unsigned i; |
712 |
const s32 swap = (s32) -iswap; |
713 |
|
714 |
for (i = 0; i < 10; ++i) { |
715 |
const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) ); |
716 |
a[i] = ((s32)a[i]) ^ x; |
717 |
b[i] = ((s32)b[i]) ^ x; |
718 |
} |
719 |
} |
720 |
|
721 |
/* Calculates nQ where Q is the x-coordinate of a point on the curve |
722 |
* |
723 |
* resultx/resultz: the x coordinate of the resulting curve point (short form) |
724 |
* n: a little endian, 32-byte number |
725 |
* q: a point of the curve (short form) */ |
726 |
static void |
727 |
cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { |
728 |
limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; |
729 |
limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; |
730 |
limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; |
731 |
limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; |
732 |
|
733 |
unsigned i, j; |
734 |
|
735 |
memcpy(nqpqx, q, sizeof(limb) * 10); |
736 |
|
737 |
for (i = 0; i < 32; ++i) { |
738 |
u8 byte = n[31 - i]; |
739 |
for (j = 0; j < 8; ++j) { |
740 |
const limb bit = byte >> 7; |
741 |
|
742 |
swap_conditional(nqx, nqpqx, bit); |
743 |
swap_conditional(nqz, nqpqz, bit); |
744 |
fmonty(nqx2, nqz2, |
745 |
nqpqx2, nqpqz2, |
746 |
nqx, nqz, |
747 |
nqpqx, nqpqz, |
748 |
q); |
749 |
swap_conditional(nqx2, nqpqx2, bit); |
750 |
swap_conditional(nqz2, nqpqz2, bit); |
751 |
|
752 |
t = nqx; |
753 |
nqx = nqx2; |
754 |
nqx2 = t; |
755 |
t = nqz; |
756 |
nqz = nqz2; |
757 |
nqz2 = t; |
758 |
t = nqpqx; |
759 |
nqpqx = nqpqx2; |
760 |
nqpqx2 = t; |
761 |
t = nqpqz; |
762 |
nqpqz = nqpqz2; |
763 |
nqpqz2 = t; |
764 |
|
765 |
byte <<= 1; |
766 |
} |
767 |
} |
768 |
|
769 |
memcpy(resultx, nqx, sizeof(limb) * 10); |
770 |
memcpy(resultz, nqz, sizeof(limb) * 10); |
771 |
} |
772 |
|
773 |
// ----------------------------------------------------------------------------- |
774 |
// Shamelessly copied from djb's code |
775 |
// ----------------------------------------------------------------------------- |
776 |
static void |
777 |
crecip(limb *out, const limb *z) { |
778 |
limb z2[10]; |
779 |
limb z9[10]; |
780 |
limb z11[10]; |
781 |
limb z2_5_0[10]; |
782 |
limb z2_10_0[10]; |
783 |
limb z2_20_0[10]; |
784 |
limb z2_50_0[10]; |
785 |
limb z2_100_0[10]; |
786 |
limb t0[10]; |
787 |
limb t1[10]; |
788 |
int i; |
789 |
|
790 |
/* 2 */ fsquare(z2,z); |
791 |
/* 4 */ fsquare(t1,z2); |
792 |
/* 8 */ fsquare(t0,t1); |
793 |
/* 9 */ fmul(z9,t0,z); |
794 |
/* 11 */ fmul(z11,z9,z2); |
795 |
/* 22 */ fsquare(t0,z11); |
796 |
/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9); |
797 |
|
798 |
/* 2^6 - 2^1 */ fsquare(t0,z2_5_0); |
799 |
/* 2^7 - 2^2 */ fsquare(t1,t0); |
800 |
/* 2^8 - 2^3 */ fsquare(t0,t1); |
801 |
/* 2^9 - 2^4 */ fsquare(t1,t0); |
802 |
/* 2^10 - 2^5 */ fsquare(t0,t1); |
803 |
/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0); |
804 |
|
805 |
/* 2^11 - 2^1 */ fsquare(t0,z2_10_0); |
806 |
/* 2^12 - 2^2 */ fsquare(t1,t0); |
807 |
/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
808 |
/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0); |
809 |
|
810 |
/* 2^21 - 2^1 */ fsquare(t0,z2_20_0); |
811 |
/* 2^22 - 2^2 */ fsquare(t1,t0); |
812 |
/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
813 |
/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0); |
814 |
|
815 |
/* 2^41 - 2^1 */ fsquare(t1,t0); |
816 |
/* 2^42 - 2^2 */ fsquare(t0,t1); |
817 |
/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } |
818 |
/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0); |
819 |
|
820 |
/* 2^51 - 2^1 */ fsquare(t0,z2_50_0); |
821 |
/* 2^52 - 2^2 */ fsquare(t1,t0); |
822 |
/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
823 |
/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0); |
824 |
|
825 |
/* 2^101 - 2^1 */ fsquare(t1,z2_100_0); |
826 |
/* 2^102 - 2^2 */ fsquare(t0,t1); |
827 |
/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } |
828 |
/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0); |
829 |
|
830 |
/* 2^201 - 2^1 */ fsquare(t0,t1); |
831 |
/* 2^202 - 2^2 */ fsquare(t1,t0); |
832 |
/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
833 |
/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0); |
834 |
|
835 |
/* 2^251 - 2^1 */ fsquare(t1,t0); |
836 |
/* 2^252 - 2^2 */ fsquare(t0,t1); |
837 |
/* 2^253 - 2^3 */ fsquare(t1,t0); |
838 |
/* 2^254 - 2^4 */ fsquare(t0,t1); |
839 |
/* 2^255 - 2^5 */ fsquare(t1,t0); |
840 |
/* 2^255 - 21 */ fmul(out,t1,z11); |
841 |
} |
842 |
|
843 |
int |
844 |
curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { |
845 |
limb bp[10], x[10], z[11], zmone[10]; |
846 |
uint8_t e[32]; |
847 |
int i; |
848 |
|
849 |
for (i = 0; i < 32; ++i) e[i] = secret[i]; |
850 |
e[0] &= 248; |
851 |
e[31] &= 127; |
852 |
e[31] |= 64; |
853 |
|
854 |
fexpand(bp, basepoint); |
855 |
cmult(x, z, e, bp); |
856 |
crecip(zmone, z); |
857 |
fmul(z, x, zmone); |
858 |
fcontract(mypublic, z); |
859 |
return 0; |
860 |
} |