411 lines
12 KiB
C
Vendored
411 lines
12 KiB
C
Vendored
/* The MIT License (MIT)
|
|
*
|
|
* Copyright (c) 2015 mehdi sotoodeh
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining
|
|
* a copy of this software and associated documentation files (the
|
|
* "Software"), to deal in the Software without restriction, including
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
* the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included
|
|
* in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
#include "../include/external_calls.h"
|
|
#include "curve25519_mehdi.h"
|
|
|
|
/*
|
|
The curve used is y2 = x^3 + 486662x^2 + x, a Montgomery curve, over
|
|
the prime field defined by the prime number 2^255 - 19, and it uses the
|
|
base point x = 9.
|
|
Protocol uses compressed elliptic point (only X coordinates), so it
|
|
allows for efficient use of the Montgomery ladder for ECDH, using only
|
|
XZ coordinates.
|
|
|
|
The curve is birationally equivalent to Ed25519 (Twisted Edwards curve).
|
|
|
|
b = 256
|
|
p = 2**255 - 19
|
|
l = 2**252 + 27742317777372353535851937790883648493
|
|
|
|
This library is a constant-time implementation of field operations
|
|
*/
|
|
|
|
typedef struct
|
|
{
|
|
U32 X[8]; /* x = X/Z */
|
|
U32 Z[8]; /* */
|
|
} XZ_POINT;
|
|
|
|
const U32 _w_P[8] = {
|
|
0xFFFFFFED,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
|
|
0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x7FFFFFFF
|
|
};
|
|
|
|
/* Maximum number of prime p that fits into 256-bits */
|
|
const U32 _w_maxP[8] = { /* 2*P < 2**256 */
|
|
0xFFFFFFDA,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
|
|
0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
|
|
};
|
|
|
|
void ecp_SetValue(U32* X, U32 value)
|
|
{
|
|
X[0] = value;
|
|
X[1] = X[2] = X[3] = X[4] = X[5] = X[6] = X[7] = 0;
|
|
}
|
|
|
|
/* Y = X */
|
|
void ecp_Copy(U32* Y, const U32* X)
|
|
{
|
|
memcpy(Y, X, 8*sizeof(U32));
|
|
}
|
|
|
|
int ecp_CmpNE(const U32* X, const U32* Y)
|
|
{
|
|
return ((X[0] ^ Y[0]) | (X[1] ^ Y[1]) | (X[2] ^ Y[2]) | (X[3] ^ Y[3]) |
|
|
(X[4] ^ Y[4]) | (X[5] ^ Y[5]) | (X[6] ^ Y[6]) | (X[7] ^ Y[7]));
|
|
}
|
|
|
|
int ecp_CmpLT(const U32* X, const U32* Y)
|
|
{
|
|
U32 T[8];
|
|
return ecp_Sub(T, X, Y);
|
|
}
|
|
|
|
#define ECP_ADD_C0(Y,X,V) c.u64 = (U64)(X) + (V); Y = c.u32.lo;
|
|
#define ECP_ADD_C1(Y,X) c.u64 = (U64)(X) + c.u32.hi; Y = c.u32.lo;
|
|
|
|
#define ECP_SUB_C0(Y,X,V) c.s64 = (U64)(X) - (V); Y = c.u32.lo;
|
|
#define ECP_SUB_C1(Y,X) c.s64 = (U64)(X) + (S64)c.s32.hi; Y = c.u32.lo;
|
|
|
|
#define ECP_MULSET_W0(Y,b,X) c.u64 = (U64)(b)*(X); Y = c.u32.lo;
|
|
#define ECP_MULSET_W1(Y,b,X) c.u64 = (U64)(b)*(X) + c.u32.hi; Y = c.u32.lo;
|
|
|
|
#define ECP_MULADD_W0(Z,Y,b,X) c.u64 = (U64)(b)*(X) + (Y); Z = c.u32.lo;
|
|
#define ECP_MULADD_W1(Z,Y,b,X) c.u64 = (U64)(b)*(X) + (U64)(Y) + c.u32.hi; Z = c.u32.lo;
|
|
|
|
#define ECP_ADD32(Z,X,Y) c.u64 = (U64)(X) + (Y); Z = c.u32.lo;
|
|
#define ECP_ADC32(Z,X,Y) c.u64 = (U64)(X) + (U64)(Y) + c.u32.hi; Z = c.u32.lo;
|
|
#define ECP_SUB32(Z,X,Y) b.s64 = (S64)(X) - (Y); Z = b.s32.lo;
|
|
#define ECP_SBC32(Z,X,Y) b.s64 = (S64)(X) - (U64)(Y) + b.s32.hi; Z = b.s32.lo;
|
|
|
|
/* Computes Z = X+Y */
|
|
U32 ecp_Add(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
M64 c;
|
|
|
|
ECP_ADD32(Z[0], X[0], Y[0]);
|
|
ECP_ADC32(Z[1], X[1], Y[1]);
|
|
ECP_ADC32(Z[2], X[2], Y[2]);
|
|
ECP_ADC32(Z[3], X[3], Y[3]);
|
|
ECP_ADC32(Z[4], X[4], Y[4]);
|
|
ECP_ADC32(Z[5], X[5], Y[5]);
|
|
ECP_ADC32(Z[6], X[6], Y[6]);
|
|
ECP_ADC32(Z[7], X[7], Y[7]);
|
|
return c.u32.hi;
|
|
}
|
|
|
|
/* Computes Z = X-Y */
|
|
S32 ecp_Sub(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
M64 b;
|
|
ECP_SUB32(Z[0], X[0], Y[0]);
|
|
ECP_SBC32(Z[1], X[1], Y[1]);
|
|
ECP_SBC32(Z[2], X[2], Y[2]);
|
|
ECP_SBC32(Z[3], X[3], Y[3]);
|
|
ECP_SBC32(Z[4], X[4], Y[4]);
|
|
ECP_SBC32(Z[5], X[5], Y[5]);
|
|
ECP_SBC32(Z[6], X[6], Y[6]);
|
|
ECP_SBC32(Z[7], X[7], Y[7]);
|
|
return b.s32.hi;
|
|
}
|
|
|
|
/* Computes Z = X+Y mod P */
|
|
void ecp_AddReduce(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
M64 c;
|
|
c.u32.hi = ecp_Add(Z, X, Y) * 38;
|
|
|
|
/* Z += c.u32.hi * 38 */
|
|
ECP_ADD_C0(Z[0], Z[0], c.u32.hi);
|
|
ECP_ADD_C1(Z[1], Z[1]);
|
|
ECP_ADD_C1(Z[2], Z[2]);
|
|
ECP_ADD_C1(Z[3], Z[3]);
|
|
ECP_ADD_C1(Z[4], Z[4]);
|
|
ECP_ADD_C1(Z[5], Z[5]);
|
|
ECP_ADD_C1(Z[6], Z[6]);
|
|
ECP_ADD_C1(Z[7], Z[7]);
|
|
|
|
/* One more carry at most */
|
|
ECP_ADD_C0(Z[0], Z[0], c.u32.hi*38);
|
|
ECP_ADD_C1(Z[1], Z[1]);
|
|
ECP_ADD_C1(Z[2], Z[2]);
|
|
ECP_ADD_C1(Z[3], Z[3]);
|
|
ECP_ADD_C1(Z[4], Z[4]);
|
|
ECP_ADD_C1(Z[5], Z[5]);
|
|
ECP_ADD_C1(Z[6], Z[6]);
|
|
ECP_ADD_C1(Z[7], Z[7]);
|
|
}
|
|
|
|
/* Computes Z = X-Y mod P */
|
|
void ecp_SubReduce(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
M64 c;
|
|
c.u32.hi = ecp_Sub(Z, X, Y) & 38;
|
|
|
|
ECP_SUB_C0(Z[0], Z[0], c.u32.hi);
|
|
ECP_SUB_C1(Z[1], Z[1]);
|
|
ECP_SUB_C1(Z[2], Z[2]);
|
|
ECP_SUB_C1(Z[3], Z[3]);
|
|
ECP_SUB_C1(Z[4], Z[4]);
|
|
ECP_SUB_C1(Z[5], Z[5]);
|
|
ECP_SUB_C1(Z[6], Z[6]);
|
|
ECP_SUB_C1(Z[7], Z[7]);
|
|
|
|
ECP_SUB_C0(Z[0], Z[0], c.u32.hi & 38);
|
|
ECP_SUB_C1(Z[1], Z[1]);
|
|
ECP_SUB_C1(Z[2], Z[2]);
|
|
ECP_SUB_C1(Z[3], Z[3]);
|
|
ECP_SUB_C1(Z[4], Z[4]);
|
|
ECP_SUB_C1(Z[5], Z[5]);
|
|
ECP_SUB_C1(Z[6], Z[6]);
|
|
ECP_SUB_C1(Z[7], Z[7]);
|
|
}
|
|
|
|
void ecp_Mod(U32 *X)
|
|
{
|
|
U32 T[8];
|
|
U32 c = (U32)ecp_Sub(X, X, _w_P);
|
|
|
|
/* set T = 0 if c=0, else T = P */
|
|
|
|
T[0] = c & 0xFFFFFFED;
|
|
T[1] = T[2] = T[3] = T[4] = T[5] = T[6] = c;
|
|
T[7] = c >> 1;
|
|
|
|
ecp_Add(X, X, T); /* X += 0 or P */
|
|
|
|
/* In case there is another P there */
|
|
|
|
c = (U32)ecp_Sub(X, X, _w_P);
|
|
|
|
/* set T = 0 if c=0, else T = P */
|
|
|
|
T[0] = c & 0xFFFFFFED;
|
|
T[1] = T[2] = T[3] = T[4] = T[5] = T[6] = c;
|
|
T[7] = c >> 1;
|
|
|
|
ecp_Add(X, X, T); /* X += 0 or P */
|
|
}
|
|
|
|
/* Computes Y = b*X */
|
|
static void ecp_mul_set(U32* Y, U32 b, const U32* X)
|
|
{
|
|
M64 c;
|
|
ECP_MULSET_W0(Y[0], b, X[0]);
|
|
ECP_MULSET_W1(Y[1], b, X[1]);
|
|
ECP_MULSET_W1(Y[2], b, X[2]);
|
|
ECP_MULSET_W1(Y[3], b, X[3]);
|
|
ECP_MULSET_W1(Y[4], b, X[4]);
|
|
ECP_MULSET_W1(Y[5], b, X[5]);
|
|
ECP_MULSET_W1(Y[6], b, X[6]);
|
|
ECP_MULSET_W1(Y[7], b, X[7]);
|
|
Y[8] = c.u32.hi;
|
|
}
|
|
|
|
/* Computes Y += b*X */
|
|
/* Addition is performed on lower 8-words of Y */
|
|
static void ecp_mul_add(U32* Y, U32 b, const U32* X)
|
|
{
|
|
M64 c;
|
|
ECP_MULADD_W0(Y[0], Y[0], b, X[0]);
|
|
ECP_MULADD_W1(Y[1], Y[1], b, X[1]);
|
|
ECP_MULADD_W1(Y[2], Y[2], b, X[2]);
|
|
ECP_MULADD_W1(Y[3], Y[3], b, X[3]);
|
|
ECP_MULADD_W1(Y[4], Y[4], b, X[4]);
|
|
ECP_MULADD_W1(Y[5], Y[5], b, X[5]);
|
|
ECP_MULADD_W1(Y[6], Y[6], b, X[6]);
|
|
ECP_MULADD_W1(Y[7], Y[7], b, X[7]);
|
|
Y[8] = c.u32.hi;
|
|
}
|
|
|
|
/* Computes Z = Y + b*X and return carry */
|
|
void ecp_WordMulAddReduce(U32 *Z, const U32* Y, U32 b, const U32* X)
|
|
{
|
|
M64 c;
|
|
ECP_MULADD_W0(Z[0], Y[0], b, X[0]);
|
|
ECP_MULADD_W1(Z[1], Y[1], b, X[1]);
|
|
ECP_MULADD_W1(Z[2], Y[2], b, X[2]);
|
|
ECP_MULADD_W1(Z[3], Y[3], b, X[3]);
|
|
ECP_MULADD_W1(Z[4], Y[4], b, X[4]);
|
|
ECP_MULADD_W1(Z[5], Y[5], b, X[5]);
|
|
ECP_MULADD_W1(Z[6], Y[6], b, X[6]);
|
|
ECP_MULADD_W1(Z[7], Y[7], b, X[7]);
|
|
|
|
/* Z += c.u32.hi * 38 */
|
|
ECP_MULADD_W0(Z[0], Z[0], c.u32.hi, 38);
|
|
ECP_ADD_C1(Z[1], Z[1]);
|
|
ECP_ADD_C1(Z[2], Z[2]);
|
|
ECP_ADD_C1(Z[3], Z[3]);
|
|
ECP_ADD_C1(Z[4], Z[4]);
|
|
ECP_ADD_C1(Z[5], Z[5]);
|
|
ECP_ADD_C1(Z[6], Z[6]);
|
|
ECP_ADD_C1(Z[7], Z[7]);
|
|
|
|
/* One more time at most */
|
|
ECP_MULADD_W0(Z[0], Z[0], c.u32.hi, 38);
|
|
ECP_ADD_C1(Z[1], Z[1]);
|
|
ECP_ADD_C1(Z[2], Z[2]);
|
|
ECP_ADD_C1(Z[3], Z[3]);
|
|
ECP_ADD_C1(Z[4], Z[4]);
|
|
ECP_ADD_C1(Z[5], Z[5]);
|
|
ECP_ADD_C1(Z[6], Z[6]);
|
|
ECP_ADD_C1(Z[7], Z[7]);
|
|
}
|
|
|
|
/* Computes Z = X*Y mod P. */
|
|
/* Output fits into 8 words but could be greater than P */
|
|
void ecp_MulReduce(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
U32 T[16];
|
|
|
|
ecp_mul_set(T+0, X[0], Y);
|
|
ecp_mul_add(T+1, X[1], Y);
|
|
ecp_mul_add(T+2, X[2], Y);
|
|
ecp_mul_add(T+3, X[3], Y);
|
|
ecp_mul_add(T+4, X[4], Y);
|
|
ecp_mul_add(T+5, X[5], Y);
|
|
ecp_mul_add(T+6, X[6], Y);
|
|
ecp_mul_add(T+7, X[7], Y);
|
|
|
|
/* We have T = X*Y, now do the reduction in size */
|
|
|
|
ecp_WordMulAddReduce(Z, T, 38, T+8);
|
|
}
|
|
|
|
/* Computes Z = X*Y */
|
|
void ecp_Mul(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
ecp_mul_set(Z+0, X[0], Y);
|
|
ecp_mul_add(Z+1, X[1], Y);
|
|
ecp_mul_add(Z+2, X[2], Y);
|
|
ecp_mul_add(Z+3, X[3], Y);
|
|
ecp_mul_add(Z+4, X[4], Y);
|
|
ecp_mul_add(Z+5, X[5], Y);
|
|
ecp_mul_add(Z+6, X[6], Y);
|
|
ecp_mul_add(Z+7, X[7], Y);
|
|
}
|
|
|
|
/* Computes Z = X*Y mod P. */
|
|
void ecp_SqrReduce(U32* Y, const U32* X)
|
|
{
|
|
/* TBD: Implementation is based on multiply */
|
|
/* Optimize for squaring */
|
|
|
|
U32 T[16];
|
|
|
|
ecp_mul_set(T+0, X[0], X);
|
|
ecp_mul_add(T+1, X[1], X);
|
|
ecp_mul_add(T+2, X[2], X);
|
|
ecp_mul_add(T+3, X[3], X);
|
|
ecp_mul_add(T+4, X[4], X);
|
|
ecp_mul_add(T+5, X[5], X);
|
|
ecp_mul_add(T+6, X[6], X);
|
|
ecp_mul_add(T+7, X[7], X);
|
|
|
|
/* We have T = X*X, now do the reduction in size */
|
|
|
|
ecp_WordMulAddReduce(Y, T, 38, T+8);
|
|
}
|
|
|
|
/* Computes Z = X*Y mod P. */
|
|
void ecp_MulMod(U32* Z, const U32* X, const U32* Y)
|
|
{
|
|
ecp_MulReduce(Z, X, Y);
|
|
ecp_Mod(Z);
|
|
}
|
|
|
|
/* Courtesy of DJB */
|
|
/* Return out = 1/z mod P */
|
|
void ecp_Inverse(U32 *out, const U32 *z)
|
|
{
|
|
int i;
|
|
U32 t0[8],t1[8],z2[8],z9[8],z11[8];
|
|
U32 z2_5_0[8],z2_10_0[8],z2_20_0[8],z2_50_0[8],z2_100_0[8];
|
|
|
|
/* 2 */ ecp_SqrReduce(z2,z);
|
|
/* 4 */ ecp_SqrReduce(t1,z2);
|
|
/* 8 */ ecp_SqrReduce(t0,t1);
|
|
/* 9 */ ecp_MulReduce(z9,t0,z);
|
|
/* 11 */ ecp_MulReduce(z11,z9,z2);
|
|
/* 22 */ ecp_SqrReduce(t0,z11);
|
|
/* 2^5 - 2^0 = 31 */ ecp_MulReduce(z2_5_0,t0,z9);
|
|
|
|
/* 2^6 - 2^1 */ ecp_SqrReduce(t0,z2_5_0);
|
|
/* 2^7 - 2^2 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^8 - 2^3 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^9 - 2^4 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^10 - 2^5 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^10 - 2^0 */ ecp_MulReduce(z2_10_0,t0,z2_5_0);
|
|
|
|
/* 2^11 - 2^1 */ ecp_SqrReduce(t0,z2_10_0);
|
|
/* 2^12 - 2^2 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) {
|
|
ecp_SqrReduce(t0,t1);
|
|
ecp_SqrReduce(t1,t0); }
|
|
/* 2^20 - 2^0 */ ecp_MulReduce(z2_20_0,t1,z2_10_0);
|
|
|
|
/* 2^21 - 2^1 */ ecp_SqrReduce(t0,z2_20_0);
|
|
/* 2^22 - 2^2 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) {
|
|
ecp_SqrReduce(t0,t1);
|
|
ecp_SqrReduce(t1,t0); }
|
|
/* 2^40 - 2^0 */ ecp_MulReduce(t0,t1,z2_20_0);
|
|
|
|
/* 2^41 - 2^1 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^42 - 2^2 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) {
|
|
ecp_SqrReduce(t1,t0);
|
|
ecp_SqrReduce(t0,t1); }
|
|
/* 2^50 - 2^0 */ ecp_MulReduce(z2_50_0,t0,z2_10_0);
|
|
|
|
/* 2^51 - 2^1 */ ecp_SqrReduce(t0,z2_50_0);
|
|
/* 2^52 - 2^2 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) {
|
|
ecp_SqrReduce(t0,t1);
|
|
ecp_SqrReduce(t1,t0); }
|
|
/* 2^100 - 2^0 */ ecp_MulReduce(z2_100_0,t1,z2_50_0);
|
|
|
|
/* 2^101 - 2^1 */ ecp_SqrReduce(t1,z2_100_0);
|
|
/* 2^102 - 2^2 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) {
|
|
ecp_SqrReduce(t1,t0);
|
|
ecp_SqrReduce(t0,t1); }
|
|
/* 2^200 - 2^0 */ ecp_MulReduce(t1,t0,z2_100_0);
|
|
|
|
/* 2^201 - 2^1 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^202 - 2^2 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) {
|
|
ecp_SqrReduce(t0,t1);
|
|
ecp_SqrReduce(t1,t0); }
|
|
/* 2^250 - 2^0 */ ecp_MulReduce(t0,t1,z2_50_0);
|
|
|
|
/* 2^251 - 2^1 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^252 - 2^2 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^253 - 2^3 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^254 - 2^4 */ ecp_SqrReduce(t0,t1);
|
|
/* 2^255 - 2^5 */ ecp_SqrReduce(t1,t0);
|
|
/* 2^255 - 21 */ ecp_MulReduce(out,t1,z11);
|
|
}
|
|
|