/* osb/source/extern/curve25519/source/curve25519_mehdi.c */

/* The MIT License (MIT)
 *
 * Copyright (c) 2015 mehdi sotoodeh
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "../include/external_calls.h"
#include "curve25519_mehdi.h"

/*
    The curve used is y^2 = x^3 + 486662x^2 + x, a Montgomery curve, over
    the prime field defined by the prime number 2^255 - 19, and it uses the
    base point x = 9.
    Protocol uses compressed elliptic point (only X coordinates), so it
    allows for efficient use of the Montgomery ladder for ECDH, using only
    XZ coordinates.

    The curve is birationally equivalent to Ed25519 (Twisted Edwards curve).

    b = 256
    p = 2**255 - 19
    l = 2**252 + 27742317777372353535851937790883648493

    This library is a constant-time implementation of field operations
*/

/* A point in projective XZ form: the x-coordinate it represents is the
 * field quotient X/Z. Each coordinate is stored as 8 U32 words. */
typedef struct
{
    U32 X[8];   /* numerator:   x = X/Z */
    U32 Z[8];   /* denominator of x */
} XZ_POINT;

/* The field prime P = 2^255 - 19 as 8 words of 32 bits each,
 * least significant word first. */
const U32 _w_P[8] = {
    0xFFFFFFED,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
    0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0x7FFFFFFF
};

/* Largest multiple of the prime P that still fits into 256 bits:
 * 2*P = 2^256 - 38, stored least significant word first. */
const U32 _w_maxP[8] = {   /* 2*P < 2**256 */
    0xFFFFFFDA,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,
    0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF
};

/* Loads a single-word integer into an 8-word value: */
/* word 0 receives `value`, words 1..7 are cleared. */
void ecp_SetValue(U32* X, U32 value)
{
    int i;

    for (i = 7; i > 0; i--)
        X[i] = 0;
    X[0] = value;
}

/* Y = X: copies all 8 words of X into Y. */
/* Done with memcpy, so Y and X are expected not to overlap. */
void ecp_Copy(U32* Y, const U32* X)
{
    memcpy(Y, X, sizeof(U32[8]));
}

/* Returns zero when X and Y hold the same 8 words, non-zero otherwise. */
/* Every word is always examined; there is no early exit on a mismatch. */
int ecp_CmpNE(const U32* X, const U32* Y)
{
    U32 diff = 0;
    int i;

    /* Accumulate every differing bit of every word pair */
    for (i = 0; i < 8; i++)
        diff |= X[i] ^ Y[i];

    return diff;
}

int ecp_CmpLT(const U32* X, const U32* Y)
{
    U32 T[8];
    return ecp_Sub(T, X, Y);
}

/*
 * Word-level arithmetic helpers.
 *
 * Every macro expands to TWO statements and relies on a local M64 variable
 * with a fixed name being in scope: `c` for the add and multiply families,
 * `b` for ECP_SUB32/ECP_SBC32. M64 is declared elsewhere; as used here it
 * offers a 64-bit view (u64/s64) together with its 32-bit halves
 * (u32.lo/u32.hi and s32.lo/s32.hi). The low half is stored to the output
 * operand, the high half is the carry/borrow consumed by the next step.
 *
 * The expansions are not wrapped in do { } while (0), so they must not be
 * used as the unbraced body of an if/for/while.
 */

/* Carry chain with a single operand: C0 starts it with Y = X + V, */
/* C1 continues it with Y = X + carry of the previous step. */
#define ECP_ADD_C0(Y,X,V) c.u64 = (U64)(X) + (V); Y = c.u32.lo;
#define ECP_ADD_C1(Y,X) c.u64 = (U64)(X) + c.u32.hi; Y = c.u32.lo;

/* Borrow chain with a single operand: C0 starts it with Y = X - V, */
/* C1 continues it by adding the signed high half of the previous step. */
#define ECP_SUB_C0(Y,X,V) c.s64 = (U64)(X) - (V); Y = c.u32.lo;
#define ECP_SUB_C1(Y,X) c.s64 = (U64)(X) + (S64)c.s32.hi; Y = c.u32.lo;

/* Y = b*X, one word at a time: W0 is the first word, */
/* W1 adds the high word left over from the previous step. */
#define ECP_MULSET_W0(Y,b,X) c.u64 = (U64)(b)*(X); Y = c.u32.lo;
#define ECP_MULSET_W1(Y,b,X) c.u64 = (U64)(b)*(X) + c.u32.hi; Y = c.u32.lo;

/* Z = Y + b*X, one word at a time: W0 is the first word, */
/* W1 adds the high word left over from the previous step. */
#define ECP_MULADD_W0(Z,Y,b,X) c.u64 = (U64)(b)*(X) + (Y); Z = c.u32.lo;
#define ECP_MULADD_W1(Z,Y,b,X) c.u64 = (U64)(b)*(X) + (U64)(Y) + c.u32.hi; Z = c.u32.lo;

/* Two-operand add (ADD32), add with carry-in (ADC32); these use `c`. */
/* Two-operand subtract (SUB32), subtract with borrow-in (SBC32); these use `b`. */
#define ECP_ADD32(Z,X,Y) c.u64 = (U64)(X) + (Y); Z = c.u32.lo;
#define ECP_ADC32(Z,X,Y) c.u64 = (U64)(X) + (U64)(Y) + c.u32.hi; Z = c.u32.lo;
#define ECP_SUB32(Z,X,Y) b.s64 = (S64)(X) - (Y); Z = b.s32.lo;
#define ECP_SBC32(Z,X,Y) b.s64 = (S64)(X) - (U64)(Y) + b.s32.hi; Z = b.s32.lo;

/* Computes Z = X+Y over 8 words, without any modular reduction. */
/* Returns the carry out of the most significant word (0 or 1). */
/* Each output word is stored only after the matching input words have */
/* been read, so Z may be the same array as X or Y. */
U32 ecp_Add(U32* Z, const U32* X, const U32* Y)
{
    M64 c;
    int i;

    /* Lowest word has no carry-in */
    ECP_ADD32(Z[0], X[0], Y[0]);

    /* Remaining words ripple the carry upwards */
    for (i = 1; i < 8; i++) {
        ECP_ADC32(Z[i], X[i], Y[i]);
    }

    return c.u32.hi;
}

/* Computes Z = X-Y over 8 words, without any modular reduction. */
/* Returns the borrow out of the most significant word: 0 when X >= Y, */
/* -1 when X < Y (Z then holds the difference wrapped modulo 2^256). */
/* Each output word is stored only after the matching input words have */
/* been read, so Z may be the same array as X or Y. */
S32 ecp_Sub(U32* Z, const U32* X, const U32* Y)
{
    M64 b;
    int i;

    /* Lowest word has no borrow-in */
    ECP_SUB32(Z[0], X[0], Y[0]);

    /* Remaining words ripple the borrow upwards */
    for (i = 1; i < 8; i++) {
        ECP_SBC32(Z[i], X[i], Y[i]);
    }

    return b.s32.hi;
}

/* Computes Z = X+Y mod P. */
/* The result fits into 8 words but is not necessarily smaller than P. */
/* A carry out of bit 255 is worth 2^256, which is congruent to 38 mod P, */
/* so every carry is folded back into the low end as +38. */
void ecp_AddReduce(U32* Z, const U32* X, const U32* Y)
{
    M64 c;
    int i;
    U32 fold = ecp_Add(Z, X, Y) * 38;   /* 38 if the plain add carried, else 0 */

    /* First pass: Z += fold */
    ECP_ADD_C0(Z[0], Z[0], fold);
    for (i = 1; i < 8; i++) {
        ECP_ADD_C1(Z[i], Z[i]);
    }

    /* Second pass: the first pass can carry out once more, at most */
    fold = c.u32.hi * 38;
    ECP_ADD_C0(Z[0], Z[0], fold);
    for (i = 1; i < 8; i++) {
        ECP_ADD_C1(Z[i], Z[i]);
    }
}

/* Computes Z = X-Y mod P. */
/* The result fits into 8 words but is not necessarily smaller than P. */
/* A borrow means the stored difference is 2^256 too large; since 2^256 is */
/* congruent to 38 mod P, every borrow is compensated by subtracting 38. */
void ecp_SubReduce(U32* Z, const U32* X, const U32* Y)
{
    M64 c;
    int i;
    U32 fold = ecp_Sub(Z, X, Y) & 38;   /* 38 if the plain subtract borrowed, else 0 */

    /* First pass: Z -= fold */
    ECP_SUB_C0(Z[0], Z[0], fold);
    for (i = 1; i < 8; i++) {
        ECP_SUB_C1(Z[i], Z[i]);
    }

    /* Second pass: handle a borrow produced by the first pass */
    fold = c.u32.hi & 38;
    ECP_SUB_C0(Z[0], Z[0], fold);
    for (i = 1; i < 8; i++) {
        ECP_SUB_C1(Z[i], Z[i]);
    }
}

/* Reduces the 8-word value X in place so that it ends up below P. */
/* Each pass subtracts P and then adds back either P or 0, chosen by a mask */
/* built from the borrow, so there is no data-dependent branch. */
/* Two passes are made because X may contain P more than once. */
void ecp_Mod(U32 *X)
{
    U32 T[8];
    U32 mask;
    int pass, i;

    for (pass = 0; pass < 2; pass++) {
        /* mask = all ones if X < P (the subtraction borrowed), else 0 */
        mask = (U32)ecp_Sub(X, X, _w_P);

        /* T = P under an all-ones mask, T = 0 under a zero mask */
        T[0] = mask & 0xFFFFFFED;
        for (i = 1; i < 7; i++)
            T[i] = mask;
        T[7] = mask >> 1;

        ecp_Add(X, X, T);   /* undo the subtraction when it borrowed */
    }
}

/* Computes Y = b*X, where X has 8 words and b is a single word. */
/* Writes 9 words: Y[0..7] hold the low part, Y[8] the final high word. */
static void ecp_mul_set(U32* Y, U32 b, const U32* X)
{
    M64 c;
    int i;

    ECP_MULSET_W0(Y[0], b, X[0]);
    for (i = 1; i < 8; i++) {
        ECP_MULSET_W1(Y[i], b, X[i]);
    }
    Y[8] = c.u32.hi;
}

/* Computes Y += b*X, where X has 8 words and b is a single word. */
/* Only Y[0..7] take part in the addition; Y[8] is overwritten with the */
/* final high word (its previous content is not added in). */
static void ecp_mul_add(U32* Y, U32 b, const U32* X)
{
    M64 c;
    int i;

    ECP_MULADD_W0(Y[0], Y[0], b, X[0]);
    for (i = 1; i < 8; i++) {
        ECP_MULADD_W1(Y[i], Y[i], b, X[i]);
    }
    Y[8] = c.u32.hi;
}

/* Computes Z = Y + b*X mod P, where X and Y have 8 words and b is one word. */
/* Nothing is returned: the word that overflows past 8 words is folded back */
/* into Z using 2^256 = 38 (mod P). */
/* The result fits into 8 words but is not necessarily smaller than P. */
void ecp_WordMulAddReduce(U32 *Z, const U32* Y, U32 b, const U32* X)
{
    M64 c;
    int pass, i;

    /* Z = low 8 words of Y + b*X; c.u32.hi is left holding the 9th word */
    ECP_MULADD_W0(Z[0], Y[0], b, X[0]);
    for (i = 1; i < 8; i++) {
        ECP_MULADD_W1(Z[i], Y[i], b, X[i]);
    }

    /* Z += c.u32.hi * 38, then once more for the carry of that addition */
    for (pass = 0; pass < 2; pass++) {
        ECP_MULADD_W0(Z[0], Z[0], c.u32.hi, 38);
        for (i = 1; i < 8; i++) {
            ECP_ADD_C1(Z[i], Z[i]);
        }
    }
}

/* Computes Z = X*Y mod P. */
/* Output fits into 8 words but could be greater than P. */
/* The product is built in a local buffer, so Z may be the same array */
/* as X or Y. */
void ecp_MulReduce(U32* Z, const U32* X, const U32* Y)
{
    U32 wide[16];
    int i;

    /* Schoolbook multiply: row i adds X[i]*Y at word offset i */
    ecp_mul_set(wide, X[0], Y);
    for (i = 1; i < 8; i++)
        ecp_mul_add(wide + i, X[i], Y);

    /* wide = X*Y; fold the upper 8 words onto the lower 8 (2^256 = 38 mod P) */
    ecp_WordMulAddReduce(Z, wide, 38, wide + 8);
}

/* Computes the full product Z = X*Y with no reduction. */
/* X and Y have 8 words each; Z receives 16 words. Z is written while X and */
/* Y are still being read, so it must not overlap either input. */
void ecp_Mul(U32* Z, const U32* X, const U32* Y)
{
    int i;

    /* Schoolbook multiply: row i adds X[i]*Y at word offset i */
    ecp_mul_set(Z, X[0], Y);
    for (i = 1; i < 8; i++)
        ecp_mul_add(Z + i, X[i], Y);
}

/* Computes Y = X*X mod P. */
/* Output fits into 8 words but could be greater than P. */
/* The product is built in a local buffer, so Y may be the same array as X. */
void ecp_SqrReduce(U32* Y, const U32* X)
{
    /* TBD: this is the generic multiply applied to (X, X); */
    /*      a dedicated squaring routine would be cheaper */

    U32 wide[16];
    int i;

    ecp_mul_set(wide, X[0], X);
    for (i = 1; i < 8; i++)
        ecp_mul_add(wide + i, X[i], X);

    /* wide = X*X; fold the upper 8 words onto the lower 8 (2^256 = 38 mod P) */
    ecp_WordMulAddReduce(Y, wide, 38, wide + 8);
}

/* Computes Z = X*Y mod P, with the result brought below P. */
/* ecp_MulReduce alone only guarantees a value that fits into 8 words */
/* (it could still be greater than P); ecp_Mod finishes the reduction. */
void ecp_MulMod(U32* Z, const U32* X, const U32* Y)
{
    ecp_MulReduce(Z, X, Y);
    ecp_Mod(Z);
}

/* Courtesy of DJB */
/* Return out = 1/z mod P */
/*
 * The inverse is obtained by raising z to the power 2^255 - 21, which is
 * P - 2 for P = 2^255 - 19, using a fixed chain of squarings and
 * multiplications. The comment in front of each step gives the exponent of
 * z held in the destination after that step. The sequence of operations
 * is the same for every input.
 *
 * All steps use ecp_SqrReduce/ecp_MulReduce, so `out` fits into 8 words
 * but could be greater than P; apply ecp_Mod if a value below P is needed.
 */
void ecp_Inverse(U32 *out, const U32 *z)
{
  int i;
  U32 t0[8],t1[8],z2[8],z9[8],z11[8];
  U32 z2_5_0[8],z2_10_0[8],z2_20_0[8],z2_50_0[8],z2_100_0[8];

  /* 2 */               ecp_SqrReduce(z2,z);
  /* 4 */               ecp_SqrReduce(t1,z2);
  /* 8 */               ecp_SqrReduce(t0,t1);
  /* 9 */               ecp_MulReduce(z9,t0,z);
  /* 11 */              ecp_MulReduce(z11,z9,z2);
  /* 22 */              ecp_SqrReduce(t0,z11);
  /* 2^5 - 2^0 = 31 */  ecp_MulReduce(z2_5_0,t0,z9);

  /* 2^6 - 2^1 */       ecp_SqrReduce(t0,z2_5_0);
  /* 2^7 - 2^2 */       ecp_SqrReduce(t1,t0);
  /* 2^8 - 2^3 */       ecp_SqrReduce(t0,t1);
  /* 2^9 - 2^4 */       ecp_SqrReduce(t1,t0);
  /* 2^10 - 2^5 */      ecp_SqrReduce(t0,t1);
  /* 2^10 - 2^0 */      ecp_MulReduce(z2_10_0,t0,z2_5_0);

  /* 2^11 - 2^1 */      ecp_SqrReduce(t0,z2_10_0);
  /* 2^12 - 2^2 */      ecp_SqrReduce(t1,t0);
  /* 2^20 - 2^10 */     for (i = 2;i < 10;i += 2) {
                            ecp_SqrReduce(t0,t1);
                            ecp_SqrReduce(t1,t0); }
  /* 2^20 - 2^0 */      ecp_MulReduce(z2_20_0,t1,z2_10_0);

  /* 2^21 - 2^1 */      ecp_SqrReduce(t0,z2_20_0);
  /* 2^22 - 2^2 */      ecp_SqrReduce(t1,t0);
  /* 2^40 - 2^20 */     for (i = 2;i < 20;i += 2) {
                            ecp_SqrReduce(t0,t1);
                            ecp_SqrReduce(t1,t0); }
  /* 2^40 - 2^0 */      ecp_MulReduce(t0,t1,z2_20_0);

  /* 2^41 - 2^1 */      ecp_SqrReduce(t1,t0);
  /* 2^42 - 2^2 */      ecp_SqrReduce(t0,t1);
  /* 2^50 - 2^10 */     for (i = 2;i < 10;i += 2) {
                            ecp_SqrReduce(t1,t0);
                            ecp_SqrReduce(t0,t1); }
  /* 2^50 - 2^0 */      ecp_MulReduce(z2_50_0,t0,z2_10_0);

  /* 2^51 - 2^1 */      ecp_SqrReduce(t0,z2_50_0);
  /* 2^52 - 2^2 */      ecp_SqrReduce(t1,t0);
  /* 2^100 - 2^50 */    for (i = 2;i < 50;i += 2) {
                            ecp_SqrReduce(t0,t1);
                            ecp_SqrReduce(t1,t0); }
  /* 2^100 - 2^0 */     ecp_MulReduce(z2_100_0,t1,z2_50_0);

  /* 2^101 - 2^1 */     ecp_SqrReduce(t1,z2_100_0);
  /* 2^102 - 2^2 */     ecp_SqrReduce(t0,t1);
  /* 2^200 - 2^100 */   for (i = 2;i < 100;i += 2) {
                            ecp_SqrReduce(t1,t0);
                            ecp_SqrReduce(t0,t1); }
  /* 2^200 - 2^0 */     ecp_MulReduce(t1,t0,z2_100_0);

  /* 2^201 - 2^1 */     ecp_SqrReduce(t0,t1);
  /* 2^202 - 2^2 */     ecp_SqrReduce(t1,t0);
  /* 2^250 - 2^50 */    for (i = 2;i < 50;i += 2) {
                            ecp_SqrReduce(t0,t1);
                            ecp_SqrReduce(t1,t0); }
  /* 2^250 - 2^0 */     ecp_MulReduce(t0,t1,z2_50_0);

  /* 2^251 - 2^1 */     ecp_SqrReduce(t1,t0);
  /* 2^252 - 2^2 */     ecp_SqrReduce(t0,t1);
  /* 2^253 - 2^3 */     ecp_SqrReduce(t1,t0);
  /* 2^254 - 2^4 */     ecp_SqrReduce(t0,t1);
  /* 2^255 - 2^5 */     ecp_SqrReduce(t1,t0);
  /* 2^255 - 21 */      ecp_MulReduce(out,t1,z11);
}