osb/source/extern/curve25519/source/curve25519_mehdi.c
2023-07-12 15:13:30 +10:00

411 lines
12 KiB
C
Vendored

/* The MIT License (MIT)
*
* Copyright (c) 2015 mehdi sotoodeh
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "../include/external_calls.h"
#include "curve25519_mehdi.h"
/*
The curve used is y2 = x^3 + 486662x^2 + x, a Montgomery curve, over
the prime field defined by the prime number 2^255 - 19, and it uses the
base point x = 9.
Protocol uses compressed elliptic point (only X coordinates), so it
allows for efficient use of the Montgomery ladder for ECDH, using only
XZ coordinates.
The curve is birationally equivalent to Ed25519 (Twisted Edwards curve).
b = 256
p = 2**255 - 19
l = 2**252 + 27742317777372353535851937790883648493
This library is a constant-time implementation of field operations
*/
typedef struct
{
U32 X[8]; /* x = X/Z */
U32 Z[8]; /* */
} XZ_POINT;
const U32 _w_P[8] = {
0xFFFFFFED,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x7FFFFFFF
};
/* Maximum number of prime p that fits into 256-bits */
const U32 _w_maxP[8] = { /* 2*P < 2**256 */
0xFFFFFFDA,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
};
void ecp_SetValue(U32* X, U32 value)
{
X[0] = value;
X[1] = X[2] = X[3] = X[4] = X[5] = X[6] = X[7] = 0;
}
/* Y = X */
void ecp_Copy(U32* Y, const U32* X)
{
memcpy(Y, X, 8*sizeof(U32));
}
int ecp_CmpNE(const U32* X, const U32* Y)
{
return ((X[0] ^ Y[0]) | (X[1] ^ Y[1]) | (X[2] ^ Y[2]) | (X[3] ^ Y[3]) |
(X[4] ^ Y[4]) | (X[5] ^ Y[5]) | (X[6] ^ Y[6]) | (X[7] ^ Y[7]));
}
int ecp_CmpLT(const U32* X, const U32* Y)
{
U32 T[8];
return ecp_Sub(T, X, Y);
}
#define ECP_ADD_C0(Y,X,V) c.u64 = (U64)(X) + (V); Y = c.u32.lo;
#define ECP_ADD_C1(Y,X) c.u64 = (U64)(X) + c.u32.hi; Y = c.u32.lo;
#define ECP_SUB_C0(Y,X,V) c.s64 = (U64)(X) - (V); Y = c.u32.lo;
#define ECP_SUB_C1(Y,X) c.s64 = (U64)(X) + (S64)c.s32.hi; Y = c.u32.lo;
#define ECP_MULSET_W0(Y,b,X) c.u64 = (U64)(b)*(X); Y = c.u32.lo;
#define ECP_MULSET_W1(Y,b,X) c.u64 = (U64)(b)*(X) + c.u32.hi; Y = c.u32.lo;
#define ECP_MULADD_W0(Z,Y,b,X) c.u64 = (U64)(b)*(X) + (Y); Z = c.u32.lo;
#define ECP_MULADD_W1(Z,Y,b,X) c.u64 = (U64)(b)*(X) + (U64)(Y) + c.u32.hi; Z = c.u32.lo;
#define ECP_ADD32(Z,X,Y) c.u64 = (U64)(X) + (Y); Z = c.u32.lo;
#define ECP_ADC32(Z,X,Y) c.u64 = (U64)(X) + (U64)(Y) + c.u32.hi; Z = c.u32.lo;
#define ECP_SUB32(Z,X,Y) b.s64 = (S64)(X) - (Y); Z = b.s32.lo;
#define ECP_SBC32(Z,X,Y) b.s64 = (S64)(X) - (U64)(Y) + b.s32.hi; Z = b.s32.lo;
/* Computes Z = X+Y */
U32 ecp_Add(U32* Z, const U32* X, const U32* Y)
{
M64 c;
ECP_ADD32(Z[0], X[0], Y[0]);
ECP_ADC32(Z[1], X[1], Y[1]);
ECP_ADC32(Z[2], X[2], Y[2]);
ECP_ADC32(Z[3], X[3], Y[3]);
ECP_ADC32(Z[4], X[4], Y[4]);
ECP_ADC32(Z[5], X[5], Y[5]);
ECP_ADC32(Z[6], X[6], Y[6]);
ECP_ADC32(Z[7], X[7], Y[7]);
return c.u32.hi;
}
/* Computes Z = X-Y */
S32 ecp_Sub(U32* Z, const U32* X, const U32* Y)
{
M64 b;
ECP_SUB32(Z[0], X[0], Y[0]);
ECP_SBC32(Z[1], X[1], Y[1]);
ECP_SBC32(Z[2], X[2], Y[2]);
ECP_SBC32(Z[3], X[3], Y[3]);
ECP_SBC32(Z[4], X[4], Y[4]);
ECP_SBC32(Z[5], X[5], Y[5]);
ECP_SBC32(Z[6], X[6], Y[6]);
ECP_SBC32(Z[7], X[7], Y[7]);
return b.s32.hi;
}
/* Computes Z = X+Y mod P */
void ecp_AddReduce(U32* Z, const U32* X, const U32* Y)
{
M64 c;
c.u32.hi = ecp_Add(Z, X, Y) * 38;
/* Z += c.u32.hi * 38 */
ECP_ADD_C0(Z[0], Z[0], c.u32.hi);
ECP_ADD_C1(Z[1], Z[1]);
ECP_ADD_C1(Z[2], Z[2]);
ECP_ADD_C1(Z[3], Z[3]);
ECP_ADD_C1(Z[4], Z[4]);
ECP_ADD_C1(Z[5], Z[5]);
ECP_ADD_C1(Z[6], Z[6]);
ECP_ADD_C1(Z[7], Z[7]);
/* One more carry at most */
ECP_ADD_C0(Z[0], Z[0], c.u32.hi*38);
ECP_ADD_C1(Z[1], Z[1]);
ECP_ADD_C1(Z[2], Z[2]);
ECP_ADD_C1(Z[3], Z[3]);
ECP_ADD_C1(Z[4], Z[4]);
ECP_ADD_C1(Z[5], Z[5]);
ECP_ADD_C1(Z[6], Z[6]);
ECP_ADD_C1(Z[7], Z[7]);
}
/* Computes Z = X-Y mod P */
void ecp_SubReduce(U32* Z, const U32* X, const U32* Y)
{
M64 c;
c.u32.hi = ecp_Sub(Z, X, Y) & 38;
ECP_SUB_C0(Z[0], Z[0], c.u32.hi);
ECP_SUB_C1(Z[1], Z[1]);
ECP_SUB_C1(Z[2], Z[2]);
ECP_SUB_C1(Z[3], Z[3]);
ECP_SUB_C1(Z[4], Z[4]);
ECP_SUB_C1(Z[5], Z[5]);
ECP_SUB_C1(Z[6], Z[6]);
ECP_SUB_C1(Z[7], Z[7]);
ECP_SUB_C0(Z[0], Z[0], c.u32.hi & 38);
ECP_SUB_C1(Z[1], Z[1]);
ECP_SUB_C1(Z[2], Z[2]);
ECP_SUB_C1(Z[3], Z[3]);
ECP_SUB_C1(Z[4], Z[4]);
ECP_SUB_C1(Z[5], Z[5]);
ECP_SUB_C1(Z[6], Z[6]);
ECP_SUB_C1(Z[7], Z[7]);
}
void ecp_Mod(U32 *X)
{
U32 T[8];
U32 c = (U32)ecp_Sub(X, X, _w_P);
/* set T = 0 if c=0, else T = P */
T[0] = c & 0xFFFFFFED;
T[1] = T[2] = T[3] = T[4] = T[5] = T[6] = c;
T[7] = c >> 1;
ecp_Add(X, X, T); /* X += 0 or P */
/* In case there is another P there */
c = (U32)ecp_Sub(X, X, _w_P);
/* set T = 0 if c=0, else T = P */
T[0] = c & 0xFFFFFFED;
T[1] = T[2] = T[3] = T[4] = T[5] = T[6] = c;
T[7] = c >> 1;
ecp_Add(X, X, T); /* X += 0 or P */
}
/* Computes Y = b*X */
static void ecp_mul_set(U32* Y, U32 b, const U32* X)
{
M64 c;
ECP_MULSET_W0(Y[0], b, X[0]);
ECP_MULSET_W1(Y[1], b, X[1]);
ECP_MULSET_W1(Y[2], b, X[2]);
ECP_MULSET_W1(Y[3], b, X[3]);
ECP_MULSET_W1(Y[4], b, X[4]);
ECP_MULSET_W1(Y[5], b, X[5]);
ECP_MULSET_W1(Y[6], b, X[6]);
ECP_MULSET_W1(Y[7], b, X[7]);
Y[8] = c.u32.hi;
}
/* Computes Y += b*X */
/* Addition is performed on lower 8-words of Y */
static void ecp_mul_add(U32* Y, U32 b, const U32* X)
{
M64 c;
ECP_MULADD_W0(Y[0], Y[0], b, X[0]);
ECP_MULADD_W1(Y[1], Y[1], b, X[1]);
ECP_MULADD_W1(Y[2], Y[2], b, X[2]);
ECP_MULADD_W1(Y[3], Y[3], b, X[3]);
ECP_MULADD_W1(Y[4], Y[4], b, X[4]);
ECP_MULADD_W1(Y[5], Y[5], b, X[5]);
ECP_MULADD_W1(Y[6], Y[6], b, X[6]);
ECP_MULADD_W1(Y[7], Y[7], b, X[7]);
Y[8] = c.u32.hi;
}
/* Computes Z = Y + b*X and return carry */
void ecp_WordMulAddReduce(U32 *Z, const U32* Y, U32 b, const U32* X)
{
M64 c;
ECP_MULADD_W0(Z[0], Y[0], b, X[0]);
ECP_MULADD_W1(Z[1], Y[1], b, X[1]);
ECP_MULADD_W1(Z[2], Y[2], b, X[2]);
ECP_MULADD_W1(Z[3], Y[3], b, X[3]);
ECP_MULADD_W1(Z[4], Y[4], b, X[4]);
ECP_MULADD_W1(Z[5], Y[5], b, X[5]);
ECP_MULADD_W1(Z[6], Y[6], b, X[6]);
ECP_MULADD_W1(Z[7], Y[7], b, X[7]);
/* Z += c.u32.hi * 38 */
ECP_MULADD_W0(Z[0], Z[0], c.u32.hi, 38);
ECP_ADD_C1(Z[1], Z[1]);
ECP_ADD_C1(Z[2], Z[2]);
ECP_ADD_C1(Z[3], Z[3]);
ECP_ADD_C1(Z[4], Z[4]);
ECP_ADD_C1(Z[5], Z[5]);
ECP_ADD_C1(Z[6], Z[6]);
ECP_ADD_C1(Z[7], Z[7]);
/* One more time at most */
ECP_MULADD_W0(Z[0], Z[0], c.u32.hi, 38);
ECP_ADD_C1(Z[1], Z[1]);
ECP_ADD_C1(Z[2], Z[2]);
ECP_ADD_C1(Z[3], Z[3]);
ECP_ADD_C1(Z[4], Z[4]);
ECP_ADD_C1(Z[5], Z[5]);
ECP_ADD_C1(Z[6], Z[6]);
ECP_ADD_C1(Z[7], Z[7]);
}
/* Computes Z = X*Y mod P. */
/* Output fits into 8 words but could be greater than P */
void ecp_MulReduce(U32* Z, const U32* X, const U32* Y)
{
U32 T[16];
ecp_mul_set(T+0, X[0], Y);
ecp_mul_add(T+1, X[1], Y);
ecp_mul_add(T+2, X[2], Y);
ecp_mul_add(T+3, X[3], Y);
ecp_mul_add(T+4, X[4], Y);
ecp_mul_add(T+5, X[5], Y);
ecp_mul_add(T+6, X[6], Y);
ecp_mul_add(T+7, X[7], Y);
/* We have T = X*Y, now do the reduction in size */
ecp_WordMulAddReduce(Z, T, 38, T+8);
}
/* Computes Z = X*Y */
void ecp_Mul(U32* Z, const U32* X, const U32* Y)
{
ecp_mul_set(Z+0, X[0], Y);
ecp_mul_add(Z+1, X[1], Y);
ecp_mul_add(Z+2, X[2], Y);
ecp_mul_add(Z+3, X[3], Y);
ecp_mul_add(Z+4, X[4], Y);
ecp_mul_add(Z+5, X[5], Y);
ecp_mul_add(Z+6, X[6], Y);
ecp_mul_add(Z+7, X[7], Y);
}
/* Computes Z = X*Y mod P. */
void ecp_SqrReduce(U32* Y, const U32* X)
{
/* TBD: Implementation is based on multiply */
/* Optimize for squaring */
U32 T[16];
ecp_mul_set(T+0, X[0], X);
ecp_mul_add(T+1, X[1], X);
ecp_mul_add(T+2, X[2], X);
ecp_mul_add(T+3, X[3], X);
ecp_mul_add(T+4, X[4], X);
ecp_mul_add(T+5, X[5], X);
ecp_mul_add(T+6, X[6], X);
ecp_mul_add(T+7, X[7], X);
/* We have T = X*X, now do the reduction in size */
ecp_WordMulAddReduce(Y, T, 38, T+8);
}
/* Computes Z = X*Y mod P. */
void ecp_MulMod(U32* Z, const U32* X, const U32* Y)
{
ecp_MulReduce(Z, X, Y);
ecp_Mod(Z);
}
/* Courtesy of DJB */
/* Return out = 1/z mod P */
void ecp_Inverse(U32 *out, const U32 *z)
{
int i;
U32 t0[8],t1[8],z2[8],z9[8],z11[8];
U32 z2_5_0[8],z2_10_0[8],z2_20_0[8],z2_50_0[8],z2_100_0[8];
/* 2 */ ecp_SqrReduce(z2,z);
/* 4 */ ecp_SqrReduce(t1,z2);
/* 8 */ ecp_SqrReduce(t0,t1);
/* 9 */ ecp_MulReduce(z9,t0,z);
/* 11 */ ecp_MulReduce(z11,z9,z2);
/* 22 */ ecp_SqrReduce(t0,z11);
/* 2^5 - 2^0 = 31 */ ecp_MulReduce(z2_5_0,t0,z9);
/* 2^6 - 2^1 */ ecp_SqrReduce(t0,z2_5_0);
/* 2^7 - 2^2 */ ecp_SqrReduce(t1,t0);
/* 2^8 - 2^3 */ ecp_SqrReduce(t0,t1);
/* 2^9 - 2^4 */ ecp_SqrReduce(t1,t0);
/* 2^10 - 2^5 */ ecp_SqrReduce(t0,t1);
/* 2^10 - 2^0 */ ecp_MulReduce(z2_10_0,t0,z2_5_0);
/* 2^11 - 2^1 */ ecp_SqrReduce(t0,z2_10_0);
/* 2^12 - 2^2 */ ecp_SqrReduce(t1,t0);
/* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) {
ecp_SqrReduce(t0,t1);
ecp_SqrReduce(t1,t0); }
/* 2^20 - 2^0 */ ecp_MulReduce(z2_20_0,t1,z2_10_0);
/* 2^21 - 2^1 */ ecp_SqrReduce(t0,z2_20_0);
/* 2^22 - 2^2 */ ecp_SqrReduce(t1,t0);
/* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) {
ecp_SqrReduce(t0,t1);
ecp_SqrReduce(t1,t0); }
/* 2^40 - 2^0 */ ecp_MulReduce(t0,t1,z2_20_0);
/* 2^41 - 2^1 */ ecp_SqrReduce(t1,t0);
/* 2^42 - 2^2 */ ecp_SqrReduce(t0,t1);
/* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) {
ecp_SqrReduce(t1,t0);
ecp_SqrReduce(t0,t1); }
/* 2^50 - 2^0 */ ecp_MulReduce(z2_50_0,t0,z2_10_0);
/* 2^51 - 2^1 */ ecp_SqrReduce(t0,z2_50_0);
/* 2^52 - 2^2 */ ecp_SqrReduce(t1,t0);
/* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) {
ecp_SqrReduce(t0,t1);
ecp_SqrReduce(t1,t0); }
/* 2^100 - 2^0 */ ecp_MulReduce(z2_100_0,t1,z2_50_0);
/* 2^101 - 2^1 */ ecp_SqrReduce(t1,z2_100_0);
/* 2^102 - 2^2 */ ecp_SqrReduce(t0,t1);
/* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) {
ecp_SqrReduce(t1,t0);
ecp_SqrReduce(t0,t1); }
/* 2^200 - 2^0 */ ecp_MulReduce(t1,t0,z2_100_0);
/* 2^201 - 2^1 */ ecp_SqrReduce(t0,t1);
/* 2^202 - 2^2 */ ecp_SqrReduce(t1,t0);
/* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) {
ecp_SqrReduce(t0,t1);
ecp_SqrReduce(t1,t0); }
/* 2^250 - 2^0 */ ecp_MulReduce(t0,t1,z2_50_0);
/* 2^251 - 2^1 */ ecp_SqrReduce(t1,t0);
/* 2^252 - 2^2 */ ecp_SqrReduce(t0,t1);
/* 2^253 - 2^3 */ ecp_SqrReduce(t1,t0);
/* 2^254 - 2^4 */ ecp_SqrReduce(t0,t1);
/* 2^255 - 2^5 */ ecp_SqrReduce(t1,t0);
/* 2^255 - 21 */ ecp_MulReduce(out,t1,z11);
}