… | |
… | |
41 | * djb's sample implementation of curve25519 is written in a special assembly |
41 | * djb's sample implementation of curve25519 is written in a special assembly |
42 | * language called qhasm and uses the floating point registers. |
42 | * language called qhasm and uses the floating point registers. |
43 | * |
43 | * |
44 | * This is, almost, a clean room reimplementation from the curve25519 paper. It |
44 | * This is, almost, a clean room reimplementation from the curve25519 paper. It |
45 | * uses many of the tricks described therein. Only the crecip function is taken |
45 | * uses many of the tricks described therein. Only the crecip function is taken |
46 | * from the sample implementation. |
46 | * from the sample implementation. */ |
47 | */ |
|
|
48 | |
47 | |
49 | #include <string.h> |
48 | #include <string.h> |
50 | #include <stdint.h> |
49 | #include <stdint.h> |
51 | |
50 | |
52 | #ifdef _MSC_VER |
51 | #ifdef _MSC_VER |
… | |
… | |
61 | * |
60 | * |
62 | * Field elements are written as an array of signed, 64-bit limbs, least |
61 | * Field elements are written as an array of signed, 64-bit limbs, least |
63 | * significant first. The value of the field element is: |
62 | * significant first. The value of the field element is: |
64 | * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... |
63 | * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... |
65 | * |
64 | * |
66 | * i.e. the limbs are 26, 25, 26, 25, ... bits wide. |
65 | * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ |
67 | */ |
|
|
68 | |
66 | |
69 | /* Sum two numbers: output += in */ |
67 | /* Sum two numbers: output += in */ |
70 | static void fsum(limb *output, const limb *in) { |
68 | static void fsum(limb *output, const limb *in) { |
71 | unsigned i; |
69 | unsigned i; |
72 | for (i = 0; i < 10; i += 2) { |
70 | for (i = 0; i < 10; i += 2) { |
73 | output[0+i] = (output[0+i] + in[0+i]); |
71 | output[0+i] = output[0+i] + in[0+i]; |
74 | output[1+i] = (output[1+i] + in[1+i]); |
72 | output[1+i] = output[1+i] + in[1+i]; |
75 | } |
73 | } |
76 | } |
74 | } |
77 | |
75 | |
78 | /* Find the difference of two numbers: output = in - output |
76 | /* Find the difference of two numbers: output = in - output |
79 | * (note the order of the arguments!) |
77 | * (note the order of the arguments!). */ |
80 | */ |
|
|
81 | static void fdifference(limb *output, const limb *in) { |
78 | static void fdifference(limb *output, const limb *in) { |
82 | unsigned i; |
79 | unsigned i; |
83 | for (i = 0; i < 10; ++i) { |
80 | for (i = 0; i < 10; ++i) { |
84 | output[i] = (in[i] - output[i]); |
81 | output[i] = in[i] - output[i]; |
85 | } |
82 | } |
86 | } |
83 | } |
87 | |
84 | |
88 | /* Multiply a number by a scalar: output = in * scalar */ |
85 | /* Multiply a number by a scalar: output = in * scalar */ |
89 | static void fscalar_product(limb *output, const limb *in, const limb scalar) { |
86 | static void fscalar_product(limb *output, const limb *in, const limb scalar) { |
… | |
… | |
95 | |
92 | |
96 | /* Multiply two numbers: output = in2 * in |
93 | /* Multiply two numbers: output = in2 * in |
97 | * |
94 | * |
98 | * output must be distinct to both inputs. The inputs are reduced coefficient |
95 | * output must be distinct to both inputs. The inputs are reduced coefficient |
99 | * form, the output is not. |
96 | * form, the output is not. |
100 | */ |
97 | * |
|
|
98 | * output[x] <= 14 * the largest product of the input limbs. */ |
101 | static void fproduct(limb *output, const limb *in2, const limb *in) { |
99 | static void fproduct(limb *output, const limb *in2, const limb *in) { |
102 | output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); |
100 | output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); |
103 | output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + |
101 | output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + |
104 | ((limb) ((s32) in2[1])) * ((s32) in[0]); |
102 | ((limb) ((s32) in2[1])) * ((s32) in[0]); |
105 | output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + |
103 | output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + |
… | |
… | |
199 | output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + |
197 | output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + |
200 | ((limb) ((s32) in2[9])) * ((s32) in[8]); |
198 | ((limb) ((s32) in2[9])) * ((s32) in[8]); |
201 | output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); |
199 | output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); |
202 | } |
200 | } |
203 | |
201 | |
204 | /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */ |
202 | /* Reduce a long form to a short form by taking the input mod 2^255 - 19. |
|
|
203 | * |
|
|
204 | * On entry: |output[i]| < 14*2^54 |
|
|
205 | * On exit: |output[0..8]| < 280*2^54 */ |
205 | static void freduce_degree(limb *output) { |
206 | static void freduce_degree(limb *output) { |
206 | /* Each of these shifts and adds ends up multiplying the value by 19. */ |
207 | /* Each of these shifts and adds ends up multiplying the value by 19. |
|
|
208 | * |
|
|
209 | * For output[0..8], the absolute entry value is < 14*2^54 and we add, at |
|
|
210 | * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */ |
207 | output[8] += output[18] << 4; |
211 | output[8] += output[18] << 4; |
208 | output[8] += output[18] << 1; |
212 | output[8] += output[18] << 1; |
209 | output[8] += output[18]; |
213 | output[8] += output[18]; |
210 | output[7] += output[17] << 4; |
214 | output[7] += output[17] << 4; |
211 | output[7] += output[17] << 1; |
215 | output[7] += output[17] << 1; |
… | |
… | |
235 | |
239 | |
236 | #if (-1 & 3) != 3 |
240 | #if (-1 & 3) != 3 |
237 | #error "This code only works on a two's complement system" |
241 | #error "This code only works on a two's complement system" |
238 | #endif |
242 | #endif |
239 | |
243 | |
240 | /* return v / 2^26, using only shifts and adds. */ |
244 | /* return v / 2^26, using only shifts and adds. |
|
|
245 | * |
|
|
246 | * On entry: v can take any value. */ |
241 | static inline limb |
247 | static inline limb |
242 | div_by_2_26(const limb v) |
248 | div_by_2_26(const limb v) |
243 | { |
249 | { |
244 | /* High word of v; no shift needed*/ |
250 | /* High word of v; no shift needed. */ |
245 | const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
251 | const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
246 | /* Set to all 1s if v was negative; else set to 0s. */ |
252 | /* Set to all 1s if v was negative; else set to 0s. */ |
247 | const int32_t sign = ((int32_t) highword) >> 31; |
253 | const int32_t sign = ((int32_t) highword) >> 31; |
248 | /* Set to 0x3ffffff if v was negative; else set to 0. */ |
254 | /* Set to 0x3ffffff if v was negative; else set to 0. */ |
249 | const int32_t roundoff = ((uint32_t) sign) >> 6; |
255 | const int32_t roundoff = ((uint32_t) sign) >> 6; |
250 | /* Should return v / (1<<26) */ |
256 | /* Should return v / (1<<26) */ |
251 | return (v + roundoff) >> 26; |
257 | return (v + roundoff) >> 26; |
252 | } |
258 | } |
253 | |
259 | |
254 | /* return v / (2^25), using only shifts and adds. */ |
260 | /* return v / (2^25), using only shifts and adds. |
|
|
261 | * |
|
|
262 | * On entry: v can take any value. */ |
255 | static inline limb |
263 | static inline limb |
256 | div_by_2_25(const limb v) |
264 | div_by_2_25(const limb v) |
257 | { |
265 | { |
258 | /* High word of v; no shift needed*/ |
266 | /* High word of v; no shift needed*/ |
259 | const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
267 | const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
… | |
… | |
263 | const int32_t roundoff = ((uint32_t) sign) >> 7; |
271 | const int32_t roundoff = ((uint32_t) sign) >> 7; |
264 | /* Should return v / (1<<25) */ |
272 | /* Should return v / (1<<25) */ |
265 | return (v + roundoff) >> 25; |
273 | return (v + roundoff) >> 25; |
266 | } |
274 | } |
267 | |
275 | |
268 | static inline s32 |
|
|
269 | div_s32_by_2_25(const s32 v) |
|
|
270 | { |
|
|
271 | const s32 roundoff = ((uint32_t)(v >> 31)) >> 7; |
|
|
272 | return (v + roundoff) >> 25; |
|
|
273 | } |
|
|
274 | |
|
|
275 | /* Reduce all coefficients of the short form input so that |x| < 2^26. |
276 | /* Reduce all coefficients of the short form input so that |x| < 2^26. |
276 | * |
277 | * |
277 | * On entry: |output[i]| < 2^62 |
278 | * On entry: |output[i]| < 280*2^54 */ |
278 | */ |
|
|
279 | static void freduce_coefficients(limb *output) { |
279 | static void freduce_coefficients(limb *output) { |
280 | unsigned i; |
280 | unsigned i; |
281 | |
281 | |
282 | output[10] = 0; |
282 | output[10] = 0; |
283 | |
283 | |
284 | for (i = 0; i < 10; i += 2) { |
284 | for (i = 0; i < 10; i += 2) { |
285 | limb over = div_by_2_26(output[i]); |
285 | limb over = div_by_2_26(output[i]); |
|
|
286 | /* The entry condition (that |output[i]| < 280*2^54) means that over is, at |
|
|
287 | * most, 280*2^28 in the first iteration of this loop. This is added to the |
|
|
288 | * next limb and we can approximate the resulting bound of that limb by |
|
|
289 | * 281*2^54. */ |
286 | output[i] -= over << 26; |
290 | output[i] -= over << 26; |
287 | output[i+1] += over; |
291 | output[i+1] += over; |
288 | |
292 | |
|
|
293 | /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < |
|
|
294 | * 281*2^29. When this is added to the next limb, the resulting bound can |
|
|
295 | * be approximated as 281*2^54. |
|
|
296 | * |
|
|
297 | * For subsequent iterations of the loop, 281*2^54 remains a conservative |
|
|
298 | * bound and no overflow occurs. */ |
289 | over = div_by_2_25(output[i+1]); |
299 | over = div_by_2_25(output[i+1]); |
290 | output[i+1] -= over << 25; |
300 | output[i+1] -= over << 25; |
291 | output[i+2] += over; |
301 | output[i+2] += over; |
292 | } |
302 | } |
293 | /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */ |
303 | /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ |
294 | output[0] += output[10] << 4; |
304 | output[0] += output[10] << 4; |
295 | output[0] += output[10] << 1; |
305 | output[0] += output[10] << 1; |
296 | output[0] += output[10]; |
306 | output[0] += output[10]; |
297 | |
307 | |
298 | output[10] = 0; |
308 | output[10] = 0; |
299 | |
309 | |
300 | /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38 |
310 | /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 |
301 | * So |over| will be no more than 77825 */ |
311 | * So |over| will be no more than 2^16. */ |
302 | { |
312 | { |
303 | limb over = div_by_2_26(output[0]); |
313 | limb over = div_by_2_26(output[0]); |
304 | output[0] -= over << 26; |
314 | output[0] -= over << 26; |
305 | output[1] += over; |
315 | output[1] += over; |
306 | } |
316 | } |
307 | |
317 | |
308 | /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825 |
318 | /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The |
309 | * So |over| will be no more than 1. */ |
319 | * bound on |output[1]| is sufficient to meet our needs. */ |
310 | { |
|
|
311 | /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */ |
|
|
312 | s32 over32 = div_s32_by_2_25((s32) output[1]); |
|
|
313 | output[1] -= over32 << 25; |
|
|
314 | output[2] += over32; |
|
|
315 | } |
|
|
316 | |
|
|
317 | /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced": |
|
|
318 | * we have |output[2]| <= 2^26. This is good enough for all of our math, |
|
|
319 | * but it will require an extra freduce_coefficients before fcontract. */ |
|
|
320 | } |
320 | } |
321 | |
321 | |
322 | /* A helpful wrapper around fproduct: output = in * in2. |
322 | /* A helpful wrapper around fproduct: output = in * in2. |
323 | * |
323 | * |
|
|
324 | * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. |
|
|
325 | * |
324 | * output must be distinct to both inputs. The output is reduced degree and |
326 | * output must be distinct to both inputs. The output is reduced degree |
325 | * reduced coefficient. |
327 | * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */ |
326 | */ |
|
|
327 | static void |
328 | static void |
328 | fmul(limb *output, const limb *in, const limb *in2) { |
329 | fmul(limb *output, const limb *in, const limb *in2) { |
329 | limb t[19]; |
330 | limb t[19]; |
330 | fproduct(t, in, in2); |
331 | fproduct(t, in, in2); |
|
|
332 | /* |t[i]| < 14*2^54 */ |
331 | freduce_degree(t); |
333 | freduce_degree(t); |
332 | freduce_coefficients(t); |
334 | freduce_coefficients(t); |
|
|
335 | /* |t[i]| < 2^26 */ |
333 | memcpy(output, t, sizeof(limb) * 10); |
336 | memcpy(output, t, sizeof(limb) * 10); |
334 | } |
337 | } |
335 | |
338 | |
|
|
339 | /* Square a number: output = in**2 |
|
|
340 | * |
|
|
341 | * output must be distinct from the input. The inputs are reduced coefficient |
|
|
342 | * form, the output is not. |
|
|
343 | * |
|
|
344 | * output[x] <= 14 * the largest product of the input limbs. */ |
336 | static void fsquare_inner(limb *output, const limb *in) { |
345 | static void fsquare_inner(limb *output, const limb *in) { |
337 | output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); |
346 | output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); |
338 | output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); |
347 | output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); |
339 | output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + |
348 | output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + |
340 | ((limb) ((s32) in[0])) * ((s32) in[2])); |
349 | ((limb) ((s32) in[0])) * ((s32) in[2])); |
… | |
… | |
389 | 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); |
398 | 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); |
390 | output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); |
399 | output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); |
391 | output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); |
400 | output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); |
392 | } |
401 | } |
393 | |
402 | |
|
|
403 | /* fsquare sets output = in^2. |
|
|
404 | * |
|
|
405 | * On entry: The |in| argument is in reduced coefficients form and |in[i]| < |
|
|
406 | * 2^27. |
|
|
407 | * |
|
|
408 | * On exit: The |output| argument is in reduced coefficients form (indeed, one |
|
|
409 | * need only provide storage for 10 limbs) and |out[i]| < 2^26. */ |
394 | static void |
410 | static void |
395 | fsquare(limb *output, const limb *in) { |
411 | fsquare(limb *output, const limb *in) { |
396 | limb t[19]; |
412 | limb t[19]; |
397 | fsquare_inner(t, in); |
413 | fsquare_inner(t, in); |
|
|
414 | /* |t[i]| < 14*2^54 because the largest product of two limbs will be < |
|
|
415 | * 2^(27+27) and fsquare_inner adds together, at most, 14 of those |
|
|
416 | * products. */ |
398 | freduce_degree(t); |
417 | freduce_degree(t); |
399 | freduce_coefficients(t); |
418 | freduce_coefficients(t); |
|
|
419 | /* |t[i]| < 2^26 */ |
400 | memcpy(output, t, sizeof(limb) * 10); |
420 | memcpy(output, t, sizeof(limb) * 10); |
401 | } |
421 | } |
402 | |
422 | |
403 | /* Take a little-endian, 32-byte number and expand it into polynomial form */ |
423 | /* Take a little-endian, 32-byte number and expand it into polynomial form */ |
404 | static void |
424 | static void |
… | |
… | |
423 | |
443 | |
424 | #if (-32 >> 1) != -16 |
444 | #if (-32 >> 1) != -16 |
425 | #error "This code only works when >> does sign-extension on negative numbers" |
445 | #error "This code only works when >> does sign-extension on negative numbers" |
426 | #endif |
446 | #endif |
427 | |
447 | |
|
|
448 | /* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ |
|
|
449 | static s32 s32_eq(s32 a, s32 b) { |
|
|
450 | a = ~(a ^ b); |
|
|
451 | a &= a << 16; |
|
|
452 | a &= a << 8; |
|
|
453 | a &= a << 4; |
|
|
454 | a &= a << 2; |
|
|
455 | a &= a << 1; |
|
|
456 | return a >> 31; |
|
|
457 | } |
|
|
458 | |
|
|
459 | /* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are |
|
|
460 | * both non-negative. */ |
|
|
461 | static s32 s32_gte(s32 a, s32 b) { |
|
|
462 | a -= b; |
|
|
463 | /* a >= 0 iff a >= b. */ |
|
|
464 | return ~(a >> 31); |
|
|
465 | } |
|
|
466 | |
428 | /* Take a fully reduced polynomial form number and contract it into a |
467 | /* Take a fully reduced polynomial form number and contract it into a |
429 | * little-endian, 32-byte array |
468 | * little-endian, 32-byte array. |
430 | */ |
469 | * |
|
|
470 | * On entry: |input_limbs[i]| < 2^26 */ |
431 | static void |
471 | static void |
432 | fcontract(u8 *output, limb *input) { |
472 | fcontract(u8 *output, limb *input_limbs) { |
433 | int i; |
473 | int i; |
434 | int j; |
474 | int j; |
|
|
475 | s32 input[10]; |
|
|
476 | s32 mask; |
|
|
477 | |
|
|
478 | /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ |
|
|
479 | for (i = 0; i < 10; i++) { |
|
|
480 | input[i] = input_limbs[i]; |
|
|
481 | } |
435 | |
482 | |
436 | for (j = 0; j < 2; ++j) { |
483 | for (j = 0; j < 2; ++j) { |
437 | for (i = 0; i < 9; ++i) { |
484 | for (i = 0; i < 9; ++i) { |
438 | if ((i & 1) == 1) { |
485 | if ((i & 1) == 1) { |
439 | /* This calculation is a time-invariant way to make input[i] positive |
486 | /* This calculation is a time-invariant way to make input[i] |
440 | by borrowing from the next-larger limb. |
487 | * non-negative by borrowing from the next-larger limb. */ |
441 | */ |
|
|
442 | const s32 mask = (s32)(input[i]) >> 31; |
488 | const s32 mask = input[i] >> 31; |
443 | const s32 carry = -(((s32)(input[i]) & mask) >> 25); |
489 | const s32 carry = -((input[i] & mask) >> 25); |
444 | input[i] = (s32)(input[i]) + (carry << 25); |
490 | input[i] = input[i] + (carry << 25); |
445 | input[i+1] = (s32)(input[i+1]) - carry; |
491 | input[i+1] = input[i+1] - carry; |
446 | } else { |
492 | } else { |
447 | const s32 mask = (s32)(input[i]) >> 31; |
493 | const s32 mask = input[i] >> 31; |
448 | const s32 carry = -(((s32)(input[i]) & mask) >> 26); |
494 | const s32 carry = -((input[i] & mask) >> 26); |
449 | input[i] = (s32)(input[i]) + (carry << 26); |
495 | input[i] = input[i] + (carry << 26); |
450 | input[i+1] = (s32)(input[i+1]) - carry; |
496 | input[i+1] = input[i+1] - carry; |
451 | } |
497 | } |
452 | } |
498 | } |
|
|
499 | |
|
|
500 | /* There's no greater limb for input[9] to borrow from, but we can multiply |
|
|
501 | * by 19 and borrow from input[0], which is valid mod 2^255-19. */ |
453 | { |
502 | { |
454 | const s32 mask = (s32)(input[9]) >> 31; |
503 | const s32 mask = input[9] >> 31; |
455 | const s32 carry = -(((s32)(input[9]) & mask) >> 25); |
504 | const s32 carry = -((input[9] & mask) >> 25); |
456 | input[9] = (s32)(input[9]) + (carry << 25); |
505 | input[9] = input[9] + (carry << 25); |
457 | input[0] = (s32)(input[0]) - (carry * 19); |
506 | input[0] = input[0] - (carry * 19); |
458 | } |
507 | } |
|
|
508 | |
|
|
509 | /* After the first iteration, input[1..9] are non-negative and fit within |
|
|
510 | * 25 or 26 bits, depending on position. However, input[0] may be |
|
|
511 | * negative. */ |
459 | } |
512 | } |
460 | |
513 | |
461 | /* The first borrow-propagation pass above ended with every limb |
514 | /* The first borrow-propagation pass above ended with every limb |
462 | except (possibly) input[0] non-negative. |
515 | except (possibly) input[0] non-negative. |
463 | |
516 | |
464 | Since each input limb except input[0] is decreased by at most 1 |
517 | If input[0] was negative after the first pass, then it was because of a |
465 | by a borrow-propagation pass, the second borrow-propagation pass |
518 | carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, |
466 | could only have wrapped around to decrease input[0] again if the |
519 | one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. |
467 | first pass left input[0] negative *and* input[1] through input[9] |
520 | |
|
|
521 | In the second pass, each limb is decreased by at most one. Thus the second |
|
|
522 | borrow-propagation pass could only have wrapped around to decrease |
|
|
523 | input[0] again if the first pass left input[0] negative *and* input[1] |
468 | were all zero. In that case, input[1] is now 2^25 - 1, and this |
524 | through input[9] were all zero. In that case, input[1] is now 2^25 - 1, |
469 | last borrow-propagation step will leave input[1] non-negative. |
525 | and this last borrow-propagation step will leave input[1] non-negative. */ |
470 | */ |
|
|
471 | { |
526 | { |
472 | const s32 mask = (s32)(input[0]) >> 31; |
527 | const s32 mask = input[0] >> 31; |
473 | const s32 carry = -(((s32)(input[0]) & mask) >> 26); |
528 | const s32 carry = -((input[0] & mask) >> 26); |
474 | input[0] = (s32)(input[0]) + (carry << 26); |
529 | input[0] = input[0] + (carry << 26); |
475 | input[1] = (s32)(input[1]) - carry; |
530 | input[1] = input[1] - carry; |
|
|
531 | } |
|
|
532 | |
|
|
533 | /* All input[i] are now non-negative. However, there might be values between |
|
|
534 | * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ |
|
|
535 | for (j = 0; j < 2; j++) { |
|
|
536 | for (i = 0; i < 9; i++) { |
|
|
537 | if ((i & 1) == 1) { |
|
|
538 | const s32 carry = input[i] >> 25; |
|
|
539 | input[i] &= 0x1ffffff; |
|
|
540 | input[i+1] += carry; |
|
|
541 | } else { |
|
|
542 | const s32 carry = input[i] >> 26; |
|
|
543 | input[i] &= 0x3ffffff; |
|
|
544 | input[i+1] += carry; |
|
|
545 | } |
476 | } |
546 | } |
477 | |
547 | |
478 | /* Both passes through the above loop, plus the last 0-to-1 step, are |
548 | { |
479 | necessary: if input[9] is -1 and input[0] through input[8] are 0, |
549 | const s32 carry = input[9] >> 25; |
480 | negative values will remain in the array until the end. |
550 | input[9] &= 0x1ffffff; |
|
|
551 | input[0] += 19*carry; |
|
|
552 | } |
|
|
553 | } |
|
|
554 | |
|
|
555 | /* If the first carry-chain pass, just above, ended up with a carry from |
|
|
556 | * input[9], and that caused input[0] to be out-of-bounds, then input[0] was |
|
|
557 | * < 2^26 + 2*19, because the carry was, at most, two. |
481 | */ |
558 | * |
|
|
559 | * If the second pass carried from input[9] again then input[0] is < 2*19 and |
|
|
560 | * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ |
|
|
561 | |
|
|
562 | /* It still remains the case that input might be between 2^255-19 and 2^255. |
|
|
563 | * In this case, input[1..9] must take their maximum value and input[0] must |
|
|
564 | * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ |
|
|
565 | mask = s32_gte(input[0], 0x3ffffed); |
|
|
566 | for (i = 1; i < 10; i++) { |
|
|
567 | if ((i & 1) == 1) { |
|
|
568 | mask &= s32_eq(input[i], 0x1ffffff); |
|
|
569 | } else { |
|
|
570 | mask &= s32_eq(input[i], 0x3ffffff); |
|
|
571 | } |
|
|
572 | } |
|
|
573 | |
|
|
574 | /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus |
|
|
575 | * this conditionally subtracts 2^255-19. */ |
|
|
576 | input[0] -= mask & 0x3ffffed; |
|
|
577 | |
|
|
578 | for (i = 1; i < 10; i++) { |
|
|
579 | if ((i & 1) == 1) { |
|
|
580 | input[i] -= mask & 0x1ffffff; |
|
|
581 | } else { |
|
|
582 | input[i] -= mask & 0x3ffffff; |
|
|
583 | } |
|
|
584 | } |
482 | |
585 | |
483 | input[1] <<= 2; |
586 | input[1] <<= 2; |
484 | input[2] <<= 3; |
587 | input[2] <<= 3; |
485 | input[3] <<= 5; |
588 | input[3] <<= 5; |
486 | input[4] <<= 6; |
589 | input[4] <<= 6; |
… | |
… | |
514 | * x2 z3: long form |
617 | * x2 z3: long form |
515 | * x3 z3: long form |
618 | * x3 z3: long form |
516 | * x z: short form, destroyed |
619 | * x z: short form, destroyed |
517 | * xprime zprime: short form, destroyed |
620 | * xprime zprime: short form, destroyed |
518 | * qmqp: short form, preserved |
621 | * qmqp: short form, preserved |
519 | */ |
622 | * |
|
|
623 | * On entry and exit, the absolute value of the limbs of all inputs and outputs |
|
|
624 | * are < 2^26. */ |
520 | static void fmonty(limb *x2, limb *z2, /* output 2Q */ |
625 | static void fmonty(limb *x2, limb *z2, /* output 2Q */ |
521 | limb *x3, limb *z3, /* output Q + Q' */ |
626 | limb *x3, limb *z3, /* output Q + Q' */ |
522 | limb *x, limb *z, /* input Q */ |
627 | limb *x, limb *z, /* input Q */ |
523 | limb *xprime, limb *zprime, /* input Q' */ |
628 | limb *xprime, limb *zprime, /* input Q' */ |
524 | const limb *qmqp /* input Q - Q' */) { |
629 | const limb *qmqp /* input Q - Q' */) { |
525 | limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], |
630 | limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], |
526 | zzprime[19], zzzprime[19], xxxprime[19]; |
631 | zzprime[19], zzzprime[19], xxxprime[19]; |
527 | |
632 | |
528 | memcpy(origx, x, 10 * sizeof(limb)); |
633 | memcpy(origx, x, 10 * sizeof(limb)); |
529 | fsum(x, z); |
634 | fsum(x, z); |
|
|
635 | /* |x[i]| < 2^27 */ |
530 | fdifference(z, origx); // does x - z |
636 | fdifference(z, origx); /* does x - z */ |
|
|
637 | /* |z[i]| < 2^27 */ |
531 | |
638 | |
532 | memcpy(origxprime, xprime, sizeof(limb) * 10); |
639 | memcpy(origxprime, xprime, sizeof(limb) * 10); |
533 | fsum(xprime, zprime); |
640 | fsum(xprime, zprime); |
|
|
641 | /* |xprime[i]| < 2^27 */ |
534 | fdifference(zprime, origxprime); |
642 | fdifference(zprime, origxprime); |
|
|
643 | /* |zprime[i]| < 2^27 */ |
535 | fproduct(xxprime, xprime, z); |
644 | fproduct(xxprime, xprime, z); |
|
|
645 | /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < |
|
|
646 | * 2^(27+27) and fproduct adds together, at most, 14 of those products. |
|
|
647 | * (Approximating that to 2^58 doesn't work out.) */ |
536 | fproduct(zzprime, x, zprime); |
648 | fproduct(zzprime, x, zprime); |
|
|
649 | /* |zzprime[i]| < 14*2^54 */ |
537 | freduce_degree(xxprime); |
650 | freduce_degree(xxprime); |
538 | freduce_coefficients(xxprime); |
651 | freduce_coefficients(xxprime); |
|
|
652 | /* |xxprime[i]| < 2^26 */ |
539 | freduce_degree(zzprime); |
653 | freduce_degree(zzprime); |
540 | freduce_coefficients(zzprime); |
654 | freduce_coefficients(zzprime); |
|
|
655 | /* |zzprime[i]| < 2^26 */ |
541 | memcpy(origxprime, xxprime, sizeof(limb) * 10); |
656 | memcpy(origxprime, xxprime, sizeof(limb) * 10); |
542 | fsum(xxprime, zzprime); |
657 | fsum(xxprime, zzprime); |
|
|
658 | /* |xxprime[i]| < 2^27 */ |
543 | fdifference(zzprime, origxprime); |
659 | fdifference(zzprime, origxprime); |
|
|
660 | /* |zzprime[i]| < 2^27 */ |
544 | fsquare(xxxprime, xxprime); |
661 | fsquare(xxxprime, xxprime); |
|
|
662 | /* |xxxprime[i]| < 2^26 */ |
545 | fsquare(zzzprime, zzprime); |
663 | fsquare(zzzprime, zzprime); |
|
|
664 | /* |zzzprime[i]| < 2^26 */ |
546 | fproduct(zzprime, zzzprime, qmqp); |
665 | fproduct(zzprime, zzzprime, qmqp); |
|
|
666 | /* |zzprime[i]| < 14*2^52 */ |
547 | freduce_degree(zzprime); |
667 | freduce_degree(zzprime); |
548 | freduce_coefficients(zzprime); |
668 | freduce_coefficients(zzprime); |
|
|
669 | /* |zzprime[i]| < 2^26 */ |
549 | memcpy(x3, xxxprime, sizeof(limb) * 10); |
670 | memcpy(x3, xxxprime, sizeof(limb) * 10); |
550 | memcpy(z3, zzprime, sizeof(limb) * 10); |
671 | memcpy(z3, zzprime, sizeof(limb) * 10); |
551 | |
672 | |
552 | fsquare(xx, x); |
673 | fsquare(xx, x); |
|
|
674 | /* |xx[i]| < 2^26 */ |
553 | fsquare(zz, z); |
675 | fsquare(zz, z); |
|
|
676 | /* |zz[i]| < 2^26 */ |
554 | fproduct(x2, xx, zz); |
677 | fproduct(x2, xx, zz); |
|
|
678 | /* |x2[i]| < 14*2^52 */ |
555 | freduce_degree(x2); |
679 | freduce_degree(x2); |
556 | freduce_coefficients(x2); |
680 | freduce_coefficients(x2); |
|
|
681 | /* |x2[i]| < 2^26 */ |
557 | fdifference(zz, xx); // does zz = xx - zz |
682 | fdifference(zz, xx); // does zz = xx - zz |
|
|
683 | /* |zz[i]| < 2^27 */ |
558 | memset(zzz + 10, 0, sizeof(limb) * 9); |
684 | memset(zzz + 10, 0, sizeof(limb) * 9); |
559 | fscalar_product(zzz, zz, 121665); |
685 | fscalar_product(zzz, zz, 121665); |
|
|
686 | /* |zzz[i]| < 2^(27+17) */ |
560 | /* No need to call freduce_degree here: |
687 | /* No need to call freduce_degree here: |
561 | fscalar_product doesn't increase the degree of its input. */ |
688 | fscalar_product doesn't increase the degree of its input. */ |
562 | freduce_coefficients(zzz); |
689 | freduce_coefficients(zzz); |
|
|
690 | /* |zzz[i]| < 2^26 */ |
563 | fsum(zzz, xx); |
691 | fsum(zzz, xx); |
|
|
692 | /* |zzz[i]| < 2^27 */ |
564 | fproduct(z2, zz, zzz); |
693 | fproduct(z2, zz, zzz); |
|
|
694 | /* |z2[i]| < 14*2^(26+27) */ |
565 | freduce_degree(z2); |
695 | freduce_degree(z2); |
566 | freduce_coefficients(z2); |
696 | freduce_coefficients(z2); |
|
|
697 | /* |z2|i| < 2^26 */ |
567 | } |
698 | } |
568 | |
699 | |
569 | /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave |
700 | /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave |
570 | * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid |
701 | * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid |
571 | * side-channel attacks. |
702 | * side-channel attacks. |
572 | * |
703 | * |
573 | * NOTE that this function requires that 'iswap' be 1 or 0; other values give |
704 | * NOTE that this function requires that 'iswap' be 1 or 0; other values give |
574 | * wrong results. Also, the two limb arrays must be in reduced-coefficient, |
705 | * wrong results. Also, the two limb arrays must be in reduced-coefficient, |
575 | * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, |
706 | * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, |
576 | * and all all values in a[0..9],b[0..9] must have magnitude less than |
707 | * and all all values in a[0..9],b[0..9] must have magnitude less than |
577 | * INT32_MAX. |
708 | * INT32_MAX. */ |
578 | */ |
|
|
579 | static void |
709 | static void |
580 | swap_conditional(limb a[19], limb b[19], limb iswap) { |
710 | swap_conditional(limb a[19], limb b[19], limb iswap) { |
581 | unsigned i; |
711 | unsigned i; |
582 | const s32 swap = (s32) -iswap; |
712 | const s32 swap = (s32) -iswap; |
583 | |
713 | |
… | |
… | |
590 | |
720 | |
591 | /* Calculates nQ where Q is the x-coordinate of a point on the curve |
721 | /* Calculates nQ where Q is the x-coordinate of a point on the curve |
592 | * |
722 | * |
593 | * resultx/resultz: the x coordinate of the resulting curve point (short form) |
723 | * resultx/resultz: the x coordinate of the resulting curve point (short form) |
594 | * n: a little endian, 32-byte number |
724 | * n: a little endian, 32-byte number |
595 | * q: a point of the curve (short form) |
725 | * q: a point of the curve (short form) */ |
596 | */ |
|
|
597 | static void |
726 | static void |
598 | cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { |
727 | cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { |
599 | limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; |
728 | limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; |
600 | limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; |
729 | limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; |
601 | limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; |
730 | limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; |
… | |
… | |
709 | /* 2^254 - 2^4 */ fsquare(t0,t1); |
838 | /* 2^254 - 2^4 */ fsquare(t0,t1); |
710 | /* 2^255 - 2^5 */ fsquare(t1,t0); |
839 | /* 2^255 - 2^5 */ fsquare(t1,t0); |
711 | /* 2^255 - 21 */ fmul(out,t1,z11); |
840 | /* 2^255 - 21 */ fmul(out,t1,z11); |
712 | } |
841 | } |
713 | |
842 | |
714 | int curve25519_donna(u8 *, const u8 *, const u8 *); |
|
|
715 | |
|
|
716 | int |
843 | int |
717 | curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { |
844 | curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { |
718 | limb bp[10], x[10], z[11], zmone[10]; |
845 | limb bp[10], x[10], z[11], zmone[10]; |
719 | uint8_t e[32]; |
846 | uint8_t e[32]; |
720 | int i; |
847 | int i; |
… | |
… | |
726 | |
853 | |
727 | fexpand(bp, basepoint); |
854 | fexpand(bp, basepoint); |
728 | cmult(x, z, e, bp); |
855 | cmult(x, z, e, bp); |
729 | crecip(zmone, z); |
856 | crecip(zmone, z); |
730 | fmul(z, x, zmone); |
857 | fmul(z, x, zmone); |
731 | freduce_coefficients(z); |
|
|
732 | fcontract(mypublic, z); |
858 | fcontract(mypublic, z); |
733 | return 0; |
859 | return 0; |
734 | } |
860 | } |