1 |
root |
1.1 |
/* Copyright 2008, Google Inc. |
2 |
|
|
* All rights reserved. |
3 |
|
|
* |
4 |
|
|
* Redistribution and use in source and binary forms, with or without |
5 |
|
|
* modification, are permitted provided that the following conditions are |
6 |
|
|
* met: |
7 |
|
|
* |
8 |
|
|
* * Redistributions of source code must retain the above copyright |
9 |
|
|
* notice, this list of conditions and the following disclaimer. |
10 |
|
|
* * Redistributions in binary form must reproduce the above |
11 |
|
|
* copyright notice, this list of conditions and the following disclaimer |
12 |
|
|
* in the documentation and/or other materials provided with the |
13 |
|
|
* distribution. |
14 |
|
|
* * Neither the name of Google Inc. nor the names of its |
15 |
|
|
* contributors may be used to endorse or promote products derived from |
16 |
|
|
* this software without specific prior written permission. |
17 |
|
|
* |
18 |
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 |
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 |
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
21 |
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
22 |
|
|
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
23 |
|
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
24 |
|
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
25 |
|
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
26 |
|
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
27 |
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 |
|
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 |
|
|
* |
30 |
|
|
* curve25519-donna: Curve25519 elliptic curve, public key function |
31 |
|
|
* |
32 |
|
|
* http://code.google.com/p/curve25519-donna/ |
33 |
|
|
* |
34 |
|
|
* Adam Langley <agl@imperialviolet.org> |
35 |
|
|
* |
36 |
|
|
* Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> |
37 |
|
|
* |
38 |
|
|
* More information about curve25519 can be found here |
39 |
|
|
* http://cr.yp.to/ecdh.html |
40 |
|
|
* |
41 |
|
|
* djb's sample implementation of curve25519 is written in a special assembly |
42 |
|
|
* language called qhasm and uses the floating point registers. |
43 |
|
|
* |
44 |
|
|
* This is, almost, a clean room reimplementation from the curve25519 paper. It |
45 |
|
|
* uses many of the tricks described therein. Only the crecip function is taken |
46 |
|
|
* from the sample implementation. |
47 |
|
|
*/ |
48 |
|
|
|
49 |
|
|
#include <string.h> |
50 |
|
|
#include <stdint.h> |
51 |
|
|
|
52 |
|
|
#ifdef _MSC_VER |
53 |
|
|
#define inline __inline |
54 |
|
|
#endif |
55 |
|
|
|
56 |
|
|
typedef uint8_t u8; |
57 |
|
|
typedef int32_t s32; |
58 |
|
|
typedef int64_t limb; |
59 |
|
|
|
60 |
|
|
/* Field element representation: |
61 |
|
|
* |
62 |
|
|
* Field elements are written as an array of signed, 64-bit limbs, least |
63 |
|
|
* significant first. The value of the field element is: |
64 |
|
|
* x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... |
65 |
|
|
* |
66 |
|
|
* i.e. the limbs are 26, 25, 26, 25, ... bits wide. |
67 |
|
|
*/ |
68 |
|
|
|
69 |
|
|
/* Sum two numbers: output += in */ |
70 |
|
|
static void fsum(limb *output, const limb *in) { |
71 |
|
|
unsigned i; |
72 |
|
|
for (i = 0; i < 10; i += 2) { |
73 |
|
|
output[0+i] = (output[0+i] + in[0+i]); |
74 |
|
|
output[1+i] = (output[1+i] + in[1+i]); |
75 |
|
|
} |
76 |
|
|
} |
77 |
|
|
|
78 |
|
|
/* Find the difference of two numbers: output = in - output |
79 |
|
|
* (note the order of the arguments!) |
80 |
|
|
*/ |
81 |
|
|
static void fdifference(limb *output, const limb *in) { |
82 |
|
|
unsigned i; |
83 |
|
|
for (i = 0; i < 10; ++i) { |
84 |
|
|
output[i] = (in[i] - output[i]); |
85 |
|
|
} |
86 |
|
|
} |
87 |
|
|
|
88 |
|
|
/* Multiply a number by a scalar: output = in * scalar */ |
89 |
|
|
static void fscalar_product(limb *output, const limb *in, const limb scalar) { |
90 |
|
|
unsigned i; |
91 |
|
|
for (i = 0; i < 10; ++i) { |
92 |
|
|
output[i] = in[i] * scalar; |
93 |
|
|
} |
94 |
|
|
} |
95 |
|
|
|
96 |
|
|
/* Multiply two numbers: output = in2 * in |
97 |
|
|
* |
98 |
|
|
* output must be distinct to both inputs. The inputs are reduced coefficient |
99 |
|
|
* form, the output is not. |
100 |
|
|
*/ |
101 |
|
|
static void fproduct(limb *output, const limb *in2, const limb *in) { |
102 |
|
|
output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); |
103 |
|
|
output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + |
104 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[0]); |
105 |
|
|
output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + |
106 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[2]) + |
107 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[0]); |
108 |
|
|
output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) + |
109 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[1]) + |
110 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[3]) + |
111 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[0]); |
112 |
|
|
output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) + |
113 |
|
|
2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) + |
114 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[1])) + |
115 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[4]) + |
116 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[0]); |
117 |
|
|
output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) + |
118 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[2]) + |
119 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[4]) + |
120 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[1]) + |
121 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[5]) + |
122 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[0]); |
123 |
|
|
output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) + |
124 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[5]) + |
125 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[1])) + |
126 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[4]) + |
127 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[2]) + |
128 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[6]) + |
129 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[0]); |
130 |
|
|
output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) + |
131 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[3]) + |
132 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[5]) + |
133 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[2]) + |
134 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[6]) + |
135 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[1]) + |
136 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[7]) + |
137 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[0]); |
138 |
|
|
output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) + |
139 |
|
|
2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) + |
140 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[3]) + |
141 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[7]) + |
142 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[1])) + |
143 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[6]) + |
144 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[2]) + |
145 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[8]) + |
146 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[0]); |
147 |
|
|
output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) + |
148 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[4]) + |
149 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[6]) + |
150 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[3]) + |
151 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[7]) + |
152 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[2]) + |
153 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[8]) + |
154 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[1]) + |
155 |
|
|
((limb) ((s32) in2[0])) * ((s32) in[9]) + |
156 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[0]); |
157 |
|
|
output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) + |
158 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[7]) + |
159 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[3]) + |
160 |
|
|
((limb) ((s32) in2[1])) * ((s32) in[9]) + |
161 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[1])) + |
162 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[6]) + |
163 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[4]) + |
164 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[8]) + |
165 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[2]); |
166 |
|
|
output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) + |
167 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[5]) + |
168 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[7]) + |
169 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[4]) + |
170 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[8]) + |
171 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[3]) + |
172 |
|
|
((limb) ((s32) in2[2])) * ((s32) in[9]) + |
173 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[2]); |
174 |
|
|
output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) + |
175 |
|
|
2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) + |
176 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[5]) + |
177 |
|
|
((limb) ((s32) in2[3])) * ((s32) in[9]) + |
178 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[3])) + |
179 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[8]) + |
180 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[4]); |
181 |
|
|
output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) + |
182 |
|
|
((limb) ((s32) in2[7])) * ((s32) in[6]) + |
183 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[8]) + |
184 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[5]) + |
185 |
|
|
((limb) ((s32) in2[4])) * ((s32) in[9]) + |
186 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[4]); |
187 |
|
|
output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) + |
188 |
|
|
((limb) ((s32) in2[5])) * ((s32) in[9]) + |
189 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[5])) + |
190 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[8]) + |
191 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[6]); |
192 |
|
|
output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) + |
193 |
|
|
((limb) ((s32) in2[8])) * ((s32) in[7]) + |
194 |
|
|
((limb) ((s32) in2[6])) * ((s32) in[9]) + |
195 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[6]); |
196 |
|
|
output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) + |
197 |
|
|
2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) + |
198 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[7])); |
199 |
|
|
output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + |
200 |
|
|
((limb) ((s32) in2[9])) * ((s32) in[8]); |
201 |
|
|
output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); |
202 |
|
|
} |
203 |
|
|
|
204 |
|
|
/* Reduce a long form to a short form by taking the input mod 2^255 - 19. */ |
205 |
|
|
static void freduce_degree(limb *output) { |
206 |
|
|
/* Each of these shifts and adds ends up multiplying the value by 19. */ |
207 |
|
|
output[8] += output[18] << 4; |
208 |
|
|
output[8] += output[18] << 1; |
209 |
|
|
output[8] += output[18]; |
210 |
|
|
output[7] += output[17] << 4; |
211 |
|
|
output[7] += output[17] << 1; |
212 |
|
|
output[7] += output[17]; |
213 |
|
|
output[6] += output[16] << 4; |
214 |
|
|
output[6] += output[16] << 1; |
215 |
|
|
output[6] += output[16]; |
216 |
|
|
output[5] += output[15] << 4; |
217 |
|
|
output[5] += output[15] << 1; |
218 |
|
|
output[5] += output[15]; |
219 |
|
|
output[4] += output[14] << 4; |
220 |
|
|
output[4] += output[14] << 1; |
221 |
|
|
output[4] += output[14]; |
222 |
|
|
output[3] += output[13] << 4; |
223 |
|
|
output[3] += output[13] << 1; |
224 |
|
|
output[3] += output[13]; |
225 |
|
|
output[2] += output[12] << 4; |
226 |
|
|
output[2] += output[12] << 1; |
227 |
|
|
output[2] += output[12]; |
228 |
|
|
output[1] += output[11] << 4; |
229 |
|
|
output[1] += output[11] << 1; |
230 |
|
|
output[1] += output[11]; |
231 |
|
|
output[0] += output[10] << 4; |
232 |
|
|
output[0] += output[10] << 1; |
233 |
|
|
output[0] += output[10]; |
234 |
|
|
} |
235 |
|
|
|
236 |
|
|
#if (-1 & 3) != 3 |
237 |
|
|
#error "This code only works on a two's complement system" |
238 |
|
|
#endif |
239 |
|
|
|
240 |
|
|
/* return v / 2^26, using only shifts and adds. */ |
241 |
|
|
static inline limb |
242 |
|
|
div_by_2_26(const limb v) |
243 |
|
|
{ |
244 |
|
|
/* High word of v; no shift needed*/ |
245 |
|
|
const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
246 |
|
|
/* Set to all 1s if v was negative; else set to 0s. */ |
247 |
|
|
const int32_t sign = ((int32_t) highword) >> 31; |
248 |
|
|
/* Set to 0x3ffffff if v was negative; else set to 0. */ |
249 |
|
|
const int32_t roundoff = ((uint32_t) sign) >> 6; |
250 |
|
|
/* Should return v / (1<<26) */ |
251 |
|
|
return (v + roundoff) >> 26; |
252 |
|
|
} |
253 |
|
|
|
254 |
|
|
/* return v / (2^25), using only shifts and adds. */ |
255 |
|
|
static inline limb |
256 |
|
|
div_by_2_25(const limb v) |
257 |
|
|
{ |
258 |
|
|
/* High word of v; no shift needed*/ |
259 |
|
|
const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
260 |
|
|
/* Set to all 1s if v was negative; else set to 0s. */ |
261 |
|
|
const int32_t sign = ((int32_t) highword) >> 31; |
262 |
|
|
/* Set to 0x1ffffff if v was negative; else set to 0. */ |
263 |
|
|
const int32_t roundoff = ((uint32_t) sign) >> 7; |
264 |
|
|
/* Should return v / (1<<25) */ |
265 |
|
|
return (v + roundoff) >> 25; |
266 |
|
|
} |
267 |
|
|
|
268 |
|
|
static inline s32 |
269 |
|
|
div_s32_by_2_25(const s32 v) |
270 |
|
|
{ |
271 |
|
|
const s32 roundoff = ((uint32_t)(v >> 31)) >> 7; |
272 |
|
|
return (v + roundoff) >> 25; |
273 |
|
|
} |
274 |
|
|
|
275 |
|
|
/* Reduce all coefficients of the short form input so that |x| < 2^26. |
276 |
|
|
* |
277 |
|
|
* On entry: |output[i]| < 2^62 |
278 |
|
|
*/ |
279 |
|
|
static void freduce_coefficients(limb *output) { |
280 |
|
|
unsigned i; |
281 |
|
|
|
282 |
|
|
output[10] = 0; |
283 |
|
|
|
284 |
|
|
for (i = 0; i < 10; i += 2) { |
285 |
|
|
limb over = div_by_2_26(output[i]); |
286 |
|
|
output[i] -= over << 26; |
287 |
|
|
output[i+1] += over; |
288 |
|
|
|
289 |
|
|
over = div_by_2_25(output[i+1]); |
290 |
|
|
output[i+1] -= over << 25; |
291 |
|
|
output[i+2] += over; |
292 |
|
|
} |
293 |
|
|
/* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */ |
294 |
|
|
output[0] += output[10] << 4; |
295 |
|
|
output[0] += output[10] << 1; |
296 |
|
|
output[0] += output[10]; |
297 |
|
|
|
298 |
|
|
output[10] = 0; |
299 |
|
|
|
300 |
|
|
/* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38 |
301 |
|
|
* So |over| will be no more than 77825 */ |
302 |
|
|
{ |
303 |
|
|
limb over = div_by_2_26(output[0]); |
304 |
|
|
output[0] -= over << 26; |
305 |
|
|
output[1] += over; |
306 |
|
|
} |
307 |
|
|
|
308 |
|
|
/* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825 |
309 |
|
|
* So |over| will be no more than 1. */ |
310 |
|
|
{ |
311 |
|
|
/* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */ |
312 |
|
|
s32 over32 = div_s32_by_2_25((s32) output[1]); |
313 |
|
|
output[1] -= over32 << 25; |
314 |
|
|
output[2] += over32; |
315 |
|
|
} |
316 |
|
|
|
317 |
|
|
/* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced": |
318 |
|
|
* we have |output[2]| <= 2^26. This is good enough for all of our math, |
319 |
|
|
* but it will require an extra freduce_coefficients before fcontract. */ |
320 |
|
|
} |
321 |
|
|
|
322 |
|
|
/* A helpful wrapper around fproduct: output = in * in2. |
323 |
|
|
* |
324 |
|
|
* output must be distinct to both inputs. The output is reduced degree and |
325 |
|
|
* reduced coefficient. |
326 |
|
|
*/ |
327 |
|
|
static void |
328 |
|
|
fmul(limb *output, const limb *in, const limb *in2) { |
329 |
|
|
limb t[19]; |
330 |
|
|
fproduct(t, in, in2); |
331 |
|
|
freduce_degree(t); |
332 |
|
|
freduce_coefficients(t); |
333 |
|
|
memcpy(output, t, sizeof(limb) * 10); |
334 |
|
|
} |
335 |
|
|
|
336 |
|
|
static void fsquare_inner(limb *output, const limb *in) { |
337 |
|
|
output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); |
338 |
|
|
output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); |
339 |
|
|
output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + |
340 |
|
|
((limb) ((s32) in[0])) * ((s32) in[2])); |
341 |
|
|
output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) + |
342 |
|
|
((limb) ((s32) in[0])) * ((s32) in[3])); |
343 |
|
|
output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + |
344 |
|
|
4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + |
345 |
|
|
2 * ((limb) ((s32) in[0])) * ((s32) in[4]); |
346 |
|
|
output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) + |
347 |
|
|
((limb) ((s32) in[1])) * ((s32) in[4]) + |
348 |
|
|
((limb) ((s32) in[0])) * ((s32) in[5])); |
349 |
|
|
output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) + |
350 |
|
|
((limb) ((s32) in[2])) * ((s32) in[4]) + |
351 |
|
|
((limb) ((s32) in[0])) * ((s32) in[6]) + |
352 |
|
|
2 * ((limb) ((s32) in[1])) * ((s32) in[5])); |
353 |
|
|
output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) + |
354 |
|
|
((limb) ((s32) in[2])) * ((s32) in[5]) + |
355 |
|
|
((limb) ((s32) in[1])) * ((s32) in[6]) + |
356 |
|
|
((limb) ((s32) in[0])) * ((s32) in[7])); |
357 |
|
|
output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + |
358 |
|
|
2 * (((limb) ((s32) in[2])) * ((s32) in[6]) + |
359 |
|
|
((limb) ((s32) in[0])) * ((s32) in[8]) + |
360 |
|
|
2 * (((limb) ((s32) in[1])) * ((s32) in[7]) + |
361 |
|
|
((limb) ((s32) in[3])) * ((s32) in[5]))); |
362 |
|
|
output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) + |
363 |
|
|
((limb) ((s32) in[3])) * ((s32) in[6]) + |
364 |
|
|
((limb) ((s32) in[2])) * ((s32) in[7]) + |
365 |
|
|
((limb) ((s32) in[1])) * ((s32) in[8]) + |
366 |
|
|
((limb) ((s32) in[0])) * ((s32) in[9])); |
367 |
|
|
output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) + |
368 |
|
|
((limb) ((s32) in[4])) * ((s32) in[6]) + |
369 |
|
|
((limb) ((s32) in[2])) * ((s32) in[8]) + |
370 |
|
|
2 * (((limb) ((s32) in[3])) * ((s32) in[7]) + |
371 |
|
|
((limb) ((s32) in[1])) * ((s32) in[9]))); |
372 |
|
|
output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) + |
373 |
|
|
((limb) ((s32) in[4])) * ((s32) in[7]) + |
374 |
|
|
((limb) ((s32) in[3])) * ((s32) in[8]) + |
375 |
|
|
((limb) ((s32) in[2])) * ((s32) in[9])); |
376 |
|
|
output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + |
377 |
|
|
2 * (((limb) ((s32) in[4])) * ((s32) in[8]) + |
378 |
|
|
2 * (((limb) ((s32) in[5])) * ((s32) in[7]) + |
379 |
|
|
((limb) ((s32) in[3])) * ((s32) in[9]))); |
380 |
|
|
output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) + |
381 |
|
|
((limb) ((s32) in[5])) * ((s32) in[8]) + |
382 |
|
|
((limb) ((s32) in[4])) * ((s32) in[9])); |
383 |
|
|
output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) + |
384 |
|
|
((limb) ((s32) in[6])) * ((s32) in[8]) + |
385 |
|
|
2 * ((limb) ((s32) in[5])) * ((s32) in[9])); |
386 |
|
|
output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) + |
387 |
|
|
((limb) ((s32) in[6])) * ((s32) in[9])); |
388 |
|
|
output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + |
389 |
|
|
4 * ((limb) ((s32) in[7])) * ((s32) in[9]); |
390 |
|
|
output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); |
391 |
|
|
output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); |
392 |
|
|
} |
393 |
|
|
|
394 |
|
|
static void |
395 |
|
|
fsquare(limb *output, const limb *in) { |
396 |
|
|
limb t[19]; |
397 |
|
|
fsquare_inner(t, in); |
398 |
|
|
freduce_degree(t); |
399 |
|
|
freduce_coefficients(t); |
400 |
|
|
memcpy(output, t, sizeof(limb) * 10); |
401 |
|
|
} |
402 |
|
|
|
403 |
|
|
/* Take a little-endian, 32-byte number and expand it into polynomial form */ |
404 |
|
|
static void |
405 |
|
|
fexpand(limb *output, const u8 *input) { |
406 |
|
|
#define F(n,start,shift,mask) \ |
407 |
|
|
output[n] = ((((limb) input[start + 0]) | \ |
408 |
|
|
((limb) input[start + 1]) << 8 | \ |
409 |
|
|
((limb) input[start + 2]) << 16 | \ |
410 |
|
|
((limb) input[start + 3]) << 24) >> shift) & mask; |
411 |
|
|
F(0, 0, 0, 0x3ffffff); |
412 |
|
|
F(1, 3, 2, 0x1ffffff); |
413 |
|
|
F(2, 6, 3, 0x3ffffff); |
414 |
|
|
F(3, 9, 5, 0x1ffffff); |
415 |
|
|
F(4, 12, 6, 0x3ffffff); |
416 |
|
|
F(5, 16, 0, 0x1ffffff); |
417 |
|
|
F(6, 19, 1, 0x3ffffff); |
418 |
|
|
F(7, 22, 3, 0x1ffffff); |
419 |
|
|
F(8, 25, 4, 0x3ffffff); |
420 |
|
|
F(9, 28, 6, 0x1ffffff); |
421 |
|
|
#undef F |
422 |
|
|
} |
423 |
|
|
|
424 |
|
|
#if (-32 >> 1) != -16 |
425 |
|
|
#error "This code only works when >> does sign-extension on negative numbers" |
426 |
|
|
#endif |
427 |
|
|
|
428 |
|
|
/* Take a fully reduced polynomial form number and contract it into a |
429 |
|
|
* little-endian, 32-byte array |
430 |
|
|
*/ |
431 |
|
|
static void |
432 |
|
|
fcontract(u8 *output, limb *input) { |
433 |
|
|
int i; |
434 |
|
|
int j; |
435 |
|
|
|
436 |
|
|
for (j = 0; j < 2; ++j) { |
437 |
|
|
for (i = 0; i < 9; ++i) { |
438 |
|
|
if ((i & 1) == 1) { |
439 |
|
|
/* This calculation is a time-invariant way to make input[i] positive |
440 |
|
|
by borrowing from the next-larger limb. |
441 |
|
|
*/ |
442 |
|
|
const s32 mask = (s32)(input[i]) >> 31; |
443 |
|
|
const s32 carry = -(((s32)(input[i]) & mask) >> 25); |
444 |
|
|
input[i] = (s32)(input[i]) + (carry << 25); |
445 |
|
|
input[i+1] = (s32)(input[i+1]) - carry; |
446 |
|
|
} else { |
447 |
|
|
const s32 mask = (s32)(input[i]) >> 31; |
448 |
|
|
const s32 carry = -(((s32)(input[i]) & mask) >> 26); |
449 |
|
|
input[i] = (s32)(input[i]) + (carry << 26); |
450 |
|
|
input[i+1] = (s32)(input[i+1]) - carry; |
451 |
|
|
} |
452 |
|
|
} |
453 |
|
|
{ |
454 |
|
|
const s32 mask = (s32)(input[9]) >> 31; |
455 |
|
|
const s32 carry = -(((s32)(input[9]) & mask) >> 25); |
456 |
|
|
input[9] = (s32)(input[9]) + (carry << 25); |
457 |
|
|
input[0] = (s32)(input[0]) - (carry * 19); |
458 |
|
|
} |
459 |
|
|
} |
460 |
|
|
|
461 |
|
|
/* The first borrow-propagation pass above ended with every limb |
462 |
|
|
except (possibly) input[0] non-negative. |
463 |
|
|
|
464 |
|
|
Since each input limb except input[0] is decreased by at most 1 |
465 |
|
|
by a borrow-propagation pass, the second borrow-propagation pass |
466 |
|
|
could only have wrapped around to decrease input[0] again if the |
467 |
|
|
first pass left input[0] negative *and* input[1] through input[9] |
468 |
|
|
were all zero. In that case, input[1] is now 2^25 - 1, and this |
469 |
|
|
last borrow-propagation step will leave input[1] non-negative. |
470 |
|
|
*/ |
471 |
|
|
{ |
472 |
|
|
const s32 mask = (s32)(input[0]) >> 31; |
473 |
|
|
const s32 carry = -(((s32)(input[0]) & mask) >> 26); |
474 |
|
|
input[0] = (s32)(input[0]) + (carry << 26); |
475 |
|
|
input[1] = (s32)(input[1]) - carry; |
476 |
|
|
} |
477 |
|
|
|
478 |
|
|
/* Both passes through the above loop, plus the last 0-to-1 step, are |
479 |
|
|
necessary: if input[9] is -1 and input[0] through input[8] are 0, |
480 |
|
|
negative values will remain in the array until the end. |
481 |
|
|
*/ |
482 |
|
|
|
483 |
|
|
input[1] <<= 2; |
484 |
|
|
input[2] <<= 3; |
485 |
|
|
input[3] <<= 5; |
486 |
|
|
input[4] <<= 6; |
487 |
|
|
input[6] <<= 1; |
488 |
|
|
input[7] <<= 3; |
489 |
|
|
input[8] <<= 4; |
490 |
|
|
input[9] <<= 6; |
491 |
|
|
#define F(i, s) \ |
492 |
|
|
output[s+0] |= input[i] & 0xff; \ |
493 |
|
|
output[s+1] = (input[i] >> 8) & 0xff; \ |
494 |
|
|
output[s+2] = (input[i] >> 16) & 0xff; \ |
495 |
|
|
output[s+3] = (input[i] >> 24) & 0xff; |
496 |
|
|
output[0] = 0; |
497 |
|
|
output[16] = 0; |
498 |
|
|
F(0,0); |
499 |
|
|
F(1,3); |
500 |
|
|
F(2,6); |
501 |
|
|
F(3,9); |
502 |
|
|
F(4,12); |
503 |
|
|
F(5,16); |
504 |
|
|
F(6,19); |
505 |
|
|
F(7,22); |
506 |
|
|
F(8,25); |
507 |
|
|
F(9,28); |
508 |
|
|
#undef F |
509 |
|
|
} |
510 |
|
|
|
511 |
|
|
/* Input: Q, Q', Q-Q' |
512 |
|
|
* Output: 2Q, Q+Q' |
513 |
|
|
* |
514 |
|
|
* x2 z3: long form |
515 |
|
|
* x3 z3: long form |
516 |
|
|
* x z: short form, destroyed |
517 |
|
|
* xprime zprime: short form, destroyed |
518 |
|
|
* qmqp: short form, preserved |
519 |
|
|
*/ |
520 |
|
|
static void fmonty(limb *x2, limb *z2, /* output 2Q */ |
521 |
|
|
limb *x3, limb *z3, /* output Q + Q' */ |
522 |
|
|
limb *x, limb *z, /* input Q */ |
523 |
|
|
limb *xprime, limb *zprime, /* input Q' */ |
524 |
|
|
const limb *qmqp /* input Q - Q' */) { |
525 |
|
|
limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], |
526 |
|
|
zzprime[19], zzzprime[19], xxxprime[19]; |
527 |
|
|
|
528 |
|
|
memcpy(origx, x, 10 * sizeof(limb)); |
529 |
|
|
fsum(x, z); |
530 |
|
|
fdifference(z, origx); // does x - z |
531 |
|
|
|
532 |
|
|
memcpy(origxprime, xprime, sizeof(limb) * 10); |
533 |
|
|
fsum(xprime, zprime); |
534 |
|
|
fdifference(zprime, origxprime); |
535 |
|
|
fproduct(xxprime, xprime, z); |
536 |
|
|
fproduct(zzprime, x, zprime); |
537 |
|
|
freduce_degree(xxprime); |
538 |
|
|
freduce_coefficients(xxprime); |
539 |
|
|
freduce_degree(zzprime); |
540 |
|
|
freduce_coefficients(zzprime); |
541 |
|
|
memcpy(origxprime, xxprime, sizeof(limb) * 10); |
542 |
|
|
fsum(xxprime, zzprime); |
543 |
|
|
fdifference(zzprime, origxprime); |
544 |
|
|
fsquare(xxxprime, xxprime); |
545 |
|
|
fsquare(zzzprime, zzprime); |
546 |
|
|
fproduct(zzprime, zzzprime, qmqp); |
547 |
|
|
freduce_degree(zzprime); |
548 |
|
|
freduce_coefficients(zzprime); |
549 |
|
|
memcpy(x3, xxxprime, sizeof(limb) * 10); |
550 |
|
|
memcpy(z3, zzprime, sizeof(limb) * 10); |
551 |
|
|
|
552 |
|
|
fsquare(xx, x); |
553 |
|
|
fsquare(zz, z); |
554 |
|
|
fproduct(x2, xx, zz); |
555 |
|
|
freduce_degree(x2); |
556 |
|
|
freduce_coefficients(x2); |
557 |
|
|
fdifference(zz, xx); // does zz = xx - zz |
558 |
|
|
memset(zzz + 10, 0, sizeof(limb) * 9); |
559 |
|
|
fscalar_product(zzz, zz, 121665); |
560 |
|
|
/* No need to call freduce_degree here: |
561 |
|
|
fscalar_product doesn't increase the degree of its input. */ |
562 |
|
|
freduce_coefficients(zzz); |
563 |
|
|
fsum(zzz, xx); |
564 |
|
|
fproduct(z2, zz, zzz); |
565 |
|
|
freduce_degree(z2); |
566 |
|
|
freduce_coefficients(z2); |
567 |
|
|
} |
568 |
|
|
|
569 |
|
|
/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave |
570 |
|
|
* them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid |
571 |
|
|
* side-channel attacks. |
572 |
|
|
* |
573 |
|
|
* NOTE that this function requires that 'iswap' be 1 or 0; other values give |
574 |
|
|
* wrong results. Also, the two limb arrays must be in reduced-coefficient, |
575 |
|
|
* reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, |
576 |
|
|
* and all all values in a[0..9],b[0..9] must have magnitude less than |
577 |
|
|
* INT32_MAX. |
578 |
|
|
*/ |
579 |
|
|
static void |
580 |
|
|
swap_conditional(limb a[19], limb b[19], limb iswap) { |
581 |
|
|
unsigned i; |
582 |
|
|
const s32 swap = (s32) -iswap; |
583 |
|
|
|
584 |
|
|
for (i = 0; i < 10; ++i) { |
585 |
|
|
const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) ); |
586 |
|
|
a[i] = ((s32)a[i]) ^ x; |
587 |
|
|
b[i] = ((s32)b[i]) ^ x; |
588 |
|
|
} |
589 |
|
|
} |
590 |
|
|
|
591 |
|
|
/* Calculates nQ where Q is the x-coordinate of a point on the curve |
592 |
|
|
* |
593 |
|
|
* resultx/resultz: the x coordinate of the resulting curve point (short form) |
594 |
|
|
* n: a little endian, 32-byte number |
595 |
|
|
* q: a point of the curve (short form) |
596 |
|
|
*/ |
597 |
|
|
static void |
598 |
|
|
cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { |
599 |
|
|
limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; |
600 |
|
|
limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; |
601 |
|
|
limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; |
602 |
|
|
limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; |
603 |
|
|
|
604 |
|
|
unsigned i, j; |
605 |
|
|
|
606 |
|
|
memcpy(nqpqx, q, sizeof(limb) * 10); |
607 |
|
|
|
608 |
|
|
for (i = 0; i < 32; ++i) { |
609 |
|
|
u8 byte = n[31 - i]; |
610 |
|
|
for (j = 0; j < 8; ++j) { |
611 |
|
|
const limb bit = byte >> 7; |
612 |
|
|
|
613 |
|
|
swap_conditional(nqx, nqpqx, bit); |
614 |
|
|
swap_conditional(nqz, nqpqz, bit); |
615 |
|
|
fmonty(nqx2, nqz2, |
616 |
|
|
nqpqx2, nqpqz2, |
617 |
|
|
nqx, nqz, |
618 |
|
|
nqpqx, nqpqz, |
619 |
|
|
q); |
620 |
|
|
swap_conditional(nqx2, nqpqx2, bit); |
621 |
|
|
swap_conditional(nqz2, nqpqz2, bit); |
622 |
|
|
|
623 |
|
|
t = nqx; |
624 |
|
|
nqx = nqx2; |
625 |
|
|
nqx2 = t; |
626 |
|
|
t = nqz; |
627 |
|
|
nqz = nqz2; |
628 |
|
|
nqz2 = t; |
629 |
|
|
t = nqpqx; |
630 |
|
|
nqpqx = nqpqx2; |
631 |
|
|
nqpqx2 = t; |
632 |
|
|
t = nqpqz; |
633 |
|
|
nqpqz = nqpqz2; |
634 |
|
|
nqpqz2 = t; |
635 |
|
|
|
636 |
|
|
byte <<= 1; |
637 |
|
|
} |
638 |
|
|
} |
639 |
|
|
|
640 |
|
|
memcpy(resultx, nqx, sizeof(limb) * 10); |
641 |
|
|
memcpy(resultz, nqz, sizeof(limb) * 10); |
642 |
|
|
} |
643 |
|
|
|
644 |
|
|
// ----------------------------------------------------------------------------- |
645 |
|
|
// Shamelessly copied from djb's code |
646 |
|
|
// ----------------------------------------------------------------------------- |
647 |
|
|
static void |
648 |
|
|
crecip(limb *out, const limb *z) { |
649 |
|
|
limb z2[10]; |
650 |
|
|
limb z9[10]; |
651 |
|
|
limb z11[10]; |
652 |
|
|
limb z2_5_0[10]; |
653 |
|
|
limb z2_10_0[10]; |
654 |
|
|
limb z2_20_0[10]; |
655 |
|
|
limb z2_50_0[10]; |
656 |
|
|
limb z2_100_0[10]; |
657 |
|
|
limb t0[10]; |
658 |
|
|
limb t1[10]; |
659 |
|
|
int i; |
660 |
|
|
|
661 |
|
|
/* 2 */ fsquare(z2,z); |
662 |
|
|
/* 4 */ fsquare(t1,z2); |
663 |
|
|
/* 8 */ fsquare(t0,t1); |
664 |
|
|
/* 9 */ fmul(z9,t0,z); |
665 |
|
|
/* 11 */ fmul(z11,z9,z2); |
666 |
|
|
/* 22 */ fsquare(t0,z11); |
667 |
|
|
/* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9); |
668 |
|
|
|
669 |
|
|
/* 2^6 - 2^1 */ fsquare(t0,z2_5_0); |
670 |
|
|
/* 2^7 - 2^2 */ fsquare(t1,t0); |
671 |
|
|
/* 2^8 - 2^3 */ fsquare(t0,t1); |
672 |
|
|
/* 2^9 - 2^4 */ fsquare(t1,t0); |
673 |
|
|
/* 2^10 - 2^5 */ fsquare(t0,t1); |
674 |
|
|
/* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0); |
675 |
|
|
|
676 |
|
|
/* 2^11 - 2^1 */ fsquare(t0,z2_10_0); |
677 |
|
|
/* 2^12 - 2^2 */ fsquare(t1,t0); |
678 |
|
|
/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
679 |
|
|
/* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0); |
680 |
|
|
|
681 |
|
|
/* 2^21 - 2^1 */ fsquare(t0,z2_20_0); |
682 |
|
|
/* 2^22 - 2^2 */ fsquare(t1,t0); |
683 |
|
|
/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
684 |
|
|
/* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0); |
685 |
|
|
|
686 |
|
|
/* 2^41 - 2^1 */ fsquare(t1,t0); |
687 |
|
|
/* 2^42 - 2^2 */ fsquare(t0,t1); |
688 |
|
|
/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } |
689 |
|
|
/* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0); |
690 |
|
|
|
691 |
|
|
/* 2^51 - 2^1 */ fsquare(t0,z2_50_0); |
692 |
|
|
/* 2^52 - 2^2 */ fsquare(t1,t0); |
693 |
|
|
/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
694 |
|
|
/* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0); |
695 |
|
|
|
696 |
|
|
/* 2^101 - 2^1 */ fsquare(t1,z2_100_0); |
697 |
|
|
/* 2^102 - 2^2 */ fsquare(t0,t1); |
698 |
|
|
/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } |
699 |
|
|
/* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0); |
700 |
|
|
|
701 |
|
|
/* 2^201 - 2^1 */ fsquare(t0,t1); |
702 |
|
|
/* 2^202 - 2^2 */ fsquare(t1,t0); |
703 |
|
|
/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } |
704 |
|
|
/* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0); |
705 |
|
|
|
706 |
|
|
/* 2^251 - 2^1 */ fsquare(t1,t0); |
707 |
|
|
/* 2^252 - 2^2 */ fsquare(t0,t1); |
708 |
|
|
/* 2^253 - 2^3 */ fsquare(t1,t0); |
709 |
|
|
/* 2^254 - 2^4 */ fsquare(t0,t1); |
710 |
|
|
/* 2^255 - 2^5 */ fsquare(t1,t0); |
711 |
|
|
/* 2^255 - 21 */ fmul(out,t1,z11); |
712 |
|
|
} |
713 |
|
|
|
714 |
|
|
int curve25519_donna(u8 *, const u8 *, const u8 *); |
715 |
|
|
|
716 |
|
|
int |
717 |
|
|
curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { |
718 |
|
|
limb bp[10], x[10], z[11], zmone[10]; |
719 |
|
|
uint8_t e[32]; |
720 |
|
|
int i; |
721 |
|
|
|
722 |
|
|
for (i = 0; i < 32; ++i) e[i] = secret[i]; |
723 |
|
|
e[0] &= 248; |
724 |
|
|
e[31] &= 127; |
725 |
|
|
e[31] |= 64; |
726 |
|
|
|
727 |
|
|
fexpand(bp, basepoint); |
728 |
|
|
cmult(x, z, e, bp); |
729 |
|
|
crecip(zmone, z); |
730 |
|
|
fmul(z, x, zmone); |
731 |
|
|
freduce_coefficients(z); |
732 |
|
|
fcontract(mypublic, z); |
733 |
|
|
return 0; |
734 |
|
|
} |