mirror of https://github.com/starwels/secp256k1
6 changed files with 506 additions and 642 deletions
@ -1,57 +0,0 @@
@@ -1,57 +0,0 @@
|
||||
#! /bin/sh
# Translate a compiler-style command line into one that NASM/YASM accepts,
# print it, and exec it.
#
# Every argument is filtered and appended to $command; the first surviving
# argument therefore becomes the program that is exec'd:
#   * PIC flags (-DPIC, -fPIC, ...) collapse into a single -DPIC
#   * known -f<format> options pass through, any other -f* is dropped
#   * -I/-i include directories get the trailing slash NASM requires
#   * without an -o option the output is named <input basename>.o
#
# NOTE: $command is a flat string that the shell re-splits on exec, so
# arguments containing whitespace are not supported.

command=""
infile=""
o_opt=no
pic=no

while [ $# -gt 0 ]; do
    case "$1" in
        -DPIC|-fPIC|-fpic|-Kpic|-KPIC)
            # Emit -DPIC only once, however many PIC flags were given.
            if [ "$pic" != "yes" ] ; then
                command="$command -DPIC"
                pic=yes
            fi
            ;;
        -f|-fbin|-faout|-faoutb|-fcoff|-felf|-felf64|-fas86| \
        -fobj|-fwin32|-fwin64|-frdf|-fieee|-fmacho|-fmacho64)
            # it's a file format specifier for nasm.
            command="$command $1"
            ;;
        -f*)
            # maybe a code-generation flag for gcc; silently dropped.
            ;;
        -[Ii]*)
            incdir=`echo "$1" | sed 's/^-[Ii]//'`
            # "-I dir" form: take the directory from the next argument,
            # unless that argument looks like another option.
            if [ "x$incdir" = x ] && [ "x$2" != x ] ; then
                case "$2" in
                    -*) ;;
                    *) incdir="$2"; shift;;
                esac
            fi
            if [ "x$incdir" != x ] ; then
                # In the case of NASM, the trailing slash is necessary.
                incdir=`echo "$incdir" | sed 's%/*$%/%'`
                command="$command -I$incdir"
            fi
            ;;
        -o*)
            o_opt=yes
            command="$command $1"
            ;;
        *.asm)
            infile=$1
            command="$command $1"
            ;;
        *)
            command="$command $1"
            ;;
    esac
    shift
done

# By default, NASM creates an output file in the same directory as the
# input file.  When no -o was given, name the output <basename>.o with the
# directory part stripped instead.  Skip this when no .asm input was seen,
# so that a meaningless "-o .o" is never passed.
if [ "$o_opt" != yes ] && [ "x$infile" != x ] ; then
    outbase=`echo "$infile" | sed -e 's%^.*/%%' -e 's%\.[^.]*$%%'`
    command="$command -o $outbase.o"
fi

# Word splitting of $command is intentional here (see NOTE above).
echo $command
exec $command
@ -1,529 +0,0 @@
@@ -1,529 +0,0 @@
|
||||
;; Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille |
||||
;; Distributed under the MIT software license, see the accompanying |
||||
;; file COPYING or http://www.opensource.org/licenses/mit-license.php. |
||||
|
||||
;; Changelog: |
||||
;; * March 2013, Diederik Huys: Original version |
||||
;; * November 2014, Pieter Wuille: Updated to use Peter Dettman's parallel |
||||
;; multiplication algorithm |
||||
;; |
||||
;; Provided public procedures: |
||||
;; secp256k1_fe_mul_inner |
||||
;; secp256k1_fe_sqr_inner |
||||
;; |
||||
;; Needed tools: YASM (http://yasm.tortall.net) |
||||
;; |
||||
;; |
||||
|
||||
BITS 64 |
||||
|
||||
;; SYM(x): spelling of an exported symbol name for the current output format.
;; When the output format is macho64 the name is prefixed with an underscore;
;; for every other format it is used unchanged.
%ifidn __OUTPUT_FORMAT__,macho64 |
||||
%define SYM(x) _ %+ x |
||||
%else |
||||
%define SYM(x) x |
||||
%endif |
||||
|
||||
;; secp256k1_fe_mul_inner (formerly "Procedure ExSetMult")
;;
;; Reads five 64-bit limbs from each input and writes five 64-bit limbs to
;; the output.  Intermediate sums are kept as 128-bit values split at 52 bits
;; with the mask M = 0xFFFFFFFFFFFFF and folded back using the constant
;; R = 0x1000003D10.
;;
;; Register Layout:
;; INPUT:    rdi        = a->n
;;           rsi        = b->n
;;           rdx        = r->n
;;
;; INTERNAL: rdx:rax    = multiplication accumulator
;;           r9:r8      = c
;;           r10-r14    = a0-a4
;;           rcx:rbx    = d
;;           rbp        = R (temporarily tx)
;;           rdi        = t?
;;           r15        = b->n
;;           rsi        = r->n
;;
;; Saved/restored: rbp, rbx, r12-r15.
;; Clobbered:      rax, rcx, rdx, rsi, rdi, r8-r11, flags.
;; Stack:          six saved registers plus two temporaries (t3, t4).
GLOBAL SYM(secp256k1_fe_mul_inner)
ALIGN 32
SYM(secp256k1_fe_mul_inner):
    push    rbp
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
    mov     r10, [rdi + 0*8]
    mov     r11, [rdi + 1*8]
    mov     r12, [rdi + 2*8]
    mov     r13, [rdi + 3*8]
    mov     r14, [rdi + 4*8]
    mov     rbp, 0x1000003d10
    mov     r15, rsi
    mov     rsi, rdx

    ;; d = a3 * b0
    mov     rax, [r15 + 0*8]
    mul     r13
    mov     rbx, rax
    mov     rcx, rdx
    ;; d += a2 * b1
    mov     rax, [r15 + 1*8]
    mul     r12
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a1 * b2
    mov     rax, [r15 + 2*8]
    mul     r11
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a0 * b3
    mov     rax, [r15 + 3*8]
    mul     r10
    add     rbx, rax
    adc     rcx, rdx
    ;; c = a4 * b4
    mov     rax, [r15 + 4*8]
    mul     r14
    mov     r8, rax
    mov     r9, rdx
    ;; d += (c & M) * R
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mul     rbp
    add     rbx, rax
    adc     rcx, rdx
    ;; c >>= 52 (r8 only)
    shrd    r8, r9, 52
    ;; t3 (stack) = d & M
    mov     rdi, rbx
    mov     rdx, 0xfffffffffffff
    and     rdi, rdx
    push    rdi
    ;; d >>= 52 (flags are dead here, so xor is a safe way to zero rcx)
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; d += a4 * b0
    mov     rax, [r15 + 0*8]
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a3 * b1
    mov     rax, [r15 + 1*8]
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a2 * b2
    mov     rax, [r15 + 2*8]
    mul     r12
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a1 * b3
    mov     rax, [r15 + 3*8]
    mul     r11
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a0 * b4
    mov     rax, [r15 + 4*8]
    mul     r10
    add     rbx, rax
    adc     rcx, rdx
    ;; d += c * R
    mov     rax, r8
    mul     rbp
    add     rbx, rax
    adc     rcx, rdx
    ;; t4 = d & M (rdi)
    mov     rdi, rbx
    mov     rdx, 0xfffffffffffff
    and     rdi, rdx
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; tx = t4 >> 48 (rbp, overwrites R)
    mov     rbp, rdi
    shr     rbp, 48
    ;; t4 &= (M >> 4) (stack)
    mov     rax, 0xffffffffffff
    and     rdi, rax
    push    rdi
    ;; c = a0 * b0
    mov     rax, [r15 + 0*8]
    mul     r10
    mov     r8, rax
    mov     r9, rdx
    ;; d += a4 * b1
    mov     rax, [r15 + 1*8]
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a3 * b2
    mov     rax, [r15 + 2*8]
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a2 * b3
    mov     rax, [r15 + 3*8]
    mul     r12
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a1 * b4
    mov     rax, [r15 + 4*8]
    mul     r11
    add     rbx, rax
    adc     rcx, rdx
    ;; u0 = d & M (rdi)
    mov     rdi, rbx
    mov     rdx, 0xfffffffffffff
    and     rdi, rdx
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; u0 = (u0 << 4) | tx (rdi)
    shl     rdi, 4
    or      rdi, rbp
    ;; c += u0 * (R >> 4)
    mov     rax, 0x1000003d1
    mul     rdi
    add     r8, rax
    adc     r9, rdx
    ;; r[0] = c & M
    mov     rax, r8
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mov     [rsi + 0*8], rax
    ;; c >>= 52
    shrd    r8, r9, 52
    xor     r9d, r9d
    ;; c += a1 * b0
    mov     rax, [r15 + 0*8]
    mul     r11
    add     r8, rax
    adc     r9, rdx
    ;; c += a0 * b1
    mov     rax, [r15 + 1*8]
    mul     r10
    add     r8, rax
    adc     r9, rdx
    ;; d += a4 * b2
    mov     rax, [r15 + 2*8]
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a3 * b3
    mov     rax, [r15 + 3*8]
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a2 * b4
    mov     rax, [r15 + 4*8]
    mul     r12
    add     rbx, rax
    adc     rcx, rdx
    ;; restore rbp = R (it held tx until the u0 step above)
    mov     rbp, 0x1000003d10
    ;; c += (d & M) * R
    mov     rax, rbx
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mul     rbp
    add     r8, rax
    adc     r9, rdx
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; r[1] = c & M
    mov     rax, r8
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mov     [rsi + 1*8], rax
    ;; c >>= 52
    shrd    r8, r9, 52
    xor     r9d, r9d
    ;; c += a2 * b0
    mov     rax, [r15 + 0*8]
    mul     r12
    add     r8, rax
    adc     r9, rdx
    ;; c += a1 * b1
    mov     rax, [r15 + 1*8]
    mul     r11
    add     r8, rax
    adc     r9, rdx
    ;; c += a0 * b2 (last use of r10 = a0)
    mov     rax, [r15 + 2*8]
    mul     r10
    add     r8, rax
    adc     r9, rdx
    ;; fetch t4 (rdi) and t3 (r10, overwrites a0); popped in reverse push order
    pop     rdi
    pop     r10
    ;; d += a4 * b3
    mov     rax, [r15 + 3*8]
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a3 * b4
    mov     rax, [r15 + 4*8]
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; c += (d & M) * R
    mov     rax, rbx
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mul     rbp
    add     r8, rax
    adc     r9, rdx
    ;; d >>= 52 (rbx only)
    shrd    rbx, rcx, 52
    ;; r[2] = c & M
    mov     rax, r8
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mov     [rsi + 2*8], rax
    ;; c >>= 52
    shrd    r8, r9, 52
    xor     r9d, r9d
    ;; c += t3
    add     r8, r10
    ;; c += d * R
    mov     rax, rbx
    mul     rbp
    add     r8, rax
    adc     r9, rdx
    ;; r[3] = c & M
    mov     rax, r8
    mov     rdx, 0xfffffffffffff
    and     rax, rdx
    mov     [rsi + 3*8], rax
    ;; c >>= 52 (r8 only)
    shrd    r8, r9, 52
    ;; c += t4 (r8 only)
    add     r8, rdi
    ;; r[4] = c
    mov     [rsi + 4*8], r8

    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    pop     rbp
    ret
||||
|
||||
|
||||
;; secp256k1_fe_sqr_inner (formerly "PROC ExSetSquare")
;;
;; Reads five 64-bit limbs from the input and writes five 64-bit limbs to
;; the output.  Cross products are formed once with one factor doubled.
;;
;; Register Layout:
;; INPUT:    rdi        = a.n
;;           rsi        = r.n
;; INTERNAL: rdx:rax    = multiplication accumulator
;;           r9:r8      = c
;;           r10-r14    = a0-a4 (a4 and a0 are doubled in place part-way)
;;           rcx:rbx    = d
;;           rbp        = R (temporarily tx)
;;           rdi        = t?
;;           r15        = M
;;
;; Saved/restored: rbp, rbx, r12-r15.
;; Clobbered:      rax, rcx, rdx, rdi, r8-r11, flags.
GLOBAL SYM(secp256k1_fe_sqr_inner)
ALIGN 32
SYM(secp256k1_fe_sqr_inner):
    push    rbp
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15
    mov     r10, [rdi + 0*8]
    mov     r11, [rdi + 1*8]
    mov     r12, [rdi + 2*8]
    mov     r13, [rdi + 3*8]
    mov     r14, [rdi + 4*8]
    mov     rbp, 0x1000003d10
    mov     r15, 0xfffffffffffff

    ;; d = (a0*2) * a3
    lea     rax, [r10 + r10]
    mul     r13
    mov     rbx, rax
    mov     rcx, rdx
    ;; d += (a1*2) * a2
    lea     rax, [r11 + r11]
    mul     r12
    add     rbx, rax
    adc     rcx, rdx
    ;; c = a4 * a4
    mov     rax, r14
    mul     r14
    mov     r8, rax
    mov     r9, rdx
    ;; d += (c & M) * R
    and     rax, r15
    mul     rbp
    add     rbx, rax
    adc     rcx, rdx
    ;; c >>= 52 (r8 only)
    shrd    r8, r9, 52
    ;; t3 (stack) = d & M
    mov     rdi, rbx
    and     rdi, r15
    push    rdi
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; a4 *= 2 (r14 holds 2*a4 from here on)
    add     r14, r14
    ;; d += a0 * a4
    mov     rax, r10
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += (a1*2) * a3
    lea     rax, [r11 + r11]
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a2 * a2
    mov     rax, r12
    mul     r12
    add     rbx, rax
    adc     rcx, rdx
    ;; d += c * R
    mov     rax, r8
    mul     rbp
    add     rbx, rax
    adc     rcx, rdx
    ;; t4 = d & M (rdi)
    mov     rdi, rbx
    and     rdi, r15
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; tx = t4 >> 48 (rbp, overwrites constant)
    mov     rbp, rdi
    shr     rbp, 48
    ;; t4 &= (M >> 4) (stack)
    mov     rax, 0xffffffffffff
    and     rdi, rax
    push    rdi
    ;; c = a0 * a0
    mov     rax, r10
    mul     r10
    mov     r8, rax
    mov     r9, rdx
    ;; d += a1 * a4
    mov     rax, r11
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += (a2*2) * a3
    lea     rax, [r12 + r12]
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; u0 = d & M (rdi)
    mov     rdi, rbx
    and     rdi, r15
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; u0 = (u0 << 4) | tx (rdi)
    shl     rdi, 4
    or      rdi, rbp
    ;; c += u0 * (R >> 4)
    mov     rax, 0x1000003d1
    mul     rdi
    add     r8, rax
    adc     r9, rdx
    ;; r[0] = c & M
    mov     rax, r8
    and     rax, r15
    mov     [rsi + 0*8], rax
    ;; c >>= 52
    shrd    r8, r9, 52
    xor     r9d, r9d
    ;; a0 *= 2 (r10 holds 2*a0 from here on)
    add     r10, r10
    ;; c += a0 * a1
    mov     rax, r10
    mul     r11
    add     r8, rax
    adc     r9, rdx
    ;; d += a2 * a4
    mov     rax, r12
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; d += a3 * a3
    mov     rax, r13
    mul     r13
    add     rbx, rax
    adc     rcx, rdx
    ;; reload R into rbp (it held tx until the u0 step above)
    mov     rbp, 0x1000003d10
    ;; c += (d & M) * R
    mov     rax, rbx
    and     rax, r15
    mul     rbp
    add     r8, rax
    adc     r9, rdx
    ;; d >>= 52
    shrd    rbx, rcx, 52
    xor     ecx, ecx
    ;; r[1] = c & M
    mov     rax, r8
    and     rax, r15
    mov     [rsi + 1*8], rax
    ;; c >>= 52
    shrd    r8, r9, 52
    xor     r9d, r9d
    ;; c += a0 * a2 (last use of r10)
    mov     rax, r10
    mul     r12
    add     r8, rax
    adc     r9, rdx
    ;; fetch t4 (rdi) and t3 (r10, overwrites a0); popped in reverse push order
    pop     rdi
    pop     r10
    ;; c += a1 * a1
    mov     rax, r11
    mul     r11
    add     r8, rax
    adc     r9, rdx
    ;; d += a3 * a4
    mov     rax, r13
    mul     r14
    add     rbx, rax
    adc     rcx, rdx
    ;; c += (d & M) * R
    mov     rax, rbx
    and     rax, r15
    mul     rbp
    add     r8, rax
    adc     r9, rdx
    ;; d >>= 52 (rbx only)
    shrd    rbx, rcx, 52
    ;; r[2] = c & M
    mov     rax, r8
    and     rax, r15
    mov     [rsi + 2*8], rax
    ;; c >>= 52
    shrd    r8, r9, 52
    xor     r9d, r9d
    ;; c += t3
    add     r8, r10
    ;; c += d * R
    mov     rax, rbx
    mul     rbp
    add     r8, rax
    adc     r9, rdx
    ;; r[3] = c & M
    mov     rax, r8
    and     rax, r15
    mov     [rsi + 3*8], rax
    ;; c >>= 52 (r8 only)
    shrd    r8, r9, 52
    ;; c += t4 (r8 only)
    add     r8, rdi
    ;; r[4] = c
    mov     [rsi + 4*8], r8

    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    pop     rbp
    ret
@ -1,13 +1,502 @@
@@ -1,13 +1,502 @@
|
||||
/**********************************************************************
|
||||
* Copyright (c) 2013 Pieter Wuille * |
||||
* Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * |
||||
* Distributed under the MIT software license, see the accompanying * |
||||
* file COPYING or http://www.opensource.org/licenses/mit-license.php.*
|
||||
**********************************************************************/ |
||||
|
||||
/**
|
||||
* Changelog: |
||||
* - March 2013, Diederik Huys: original version |
||||
* - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm |
||||
* - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly |
||||
*/ |
||||
|
||||
#ifndef _SECP256K1_FIELD_INNER5X52_IMPL_H_ |
||||
#define _SECP256K1_FIELD_INNER5X52_IMPL_H_ |
||||
|
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t *b, uint64_t *r); |
||||
void __attribute__ ((sysv_abi)) secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r); |
||||
SECP256K1_INLINE static void secp256k1_fe_mul_inner(const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b, uint64_t *r) {
/**
 * Reads five 64-bit limbs from a and from b and writes five 64-bit limbs
 * to r.
 *
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            r15:rcx = d
 *            r10-r14 = a0-a4
 *            rbx     = b
 *            %2      = r
 *            %0      = a / t?
 *            rbp     = R (0x1000003d10), temporarily tx
 *
 * rbp is not in the clobber list; it is saved and restored by hand, and two
 * temporaries (t3, t4) are also kept on the stack.  Because this function may
 * be inlined into code that keeps data in the 128-byte red zone below %rsp,
 * %rsp is first moved down past the red zone so those pushes cannot
 * overwrite it.  leaq is used for the adjustment because it leaves the flags
 * untouched.
 */
    __asm__ __volatile__(
    /* skip the caller's red zone before touching the stack */
    "leaq -128(%%rsp),%%rsp\n"
    "pushq %%rbp\n"

    "movq 0(%0),%%r10\n"
    "movq 8(%0),%%r11\n"
    "movq 16(%0),%%r12\n"
    "movq 24(%0),%%r13\n"
    "movq 32(%0),%%r14\n"
    "movq $0x1000003d10,%%rbp\n"

    /* d = a3 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rcx\n"
    "movq %%rdx,%%r15\n"
    /* d += a2 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c = a4 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c >>= 52 (r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (stack) = d & M */
    "movq %%rcx,%0\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%0\n"
    "pushq %0\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* d += a4 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a0 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* t4 = d & M (%0) */
    "movq %%rcx,%0\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%0\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* tx = t4 >> 48 (rbp, overwrites R) */
    "movq %0,%%rbp\n"
    "shrq $48,%%rbp\n"
    /* t4 &= (M >> 4) (stack) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%0\n"
    "pushq %0\n"
    /* c = a0 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a4 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a1 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* u0 = d & M (%0) */
    "movq %%rcx,%0\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%0\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* u0 = (u0 << 4) | tx (%0) */
    "shlq $4,%0\n"
    "orq %%rbp,%0\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %0\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,0(%2)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a1 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a4 * b2 */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a2 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* restore rbp = R (it held tx until the u0 step above) */
    "movq $0x1000003d10,%%rbp\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%r15,%%rcx\n"
    "xorq %%r15,%%r15\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,8(%2)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a2 * b0 */
    "movq 0(%%rbx),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a1 * b1 */
    "movq 8(%%rbx),%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* c += a0 * b2 (last use of r10 = a0) */
    "movq 16(%%rbx),%%rax\n"
    "mulq %%r10\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t4 (%0) and t3 (r10, overwrites a0); reverse push order */
    "popq %0\n"
    "popq %%r10\n"
    /* d += a4 * b3 */
    "movq 24(%%rbx),%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* d += a3 * b4 */
    "movq 32(%%rbx),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rcx\n"
    "adcq %%rdx,%%r15\n"
    /* c += (d & M) * R */
    "movq %%rcx,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (rcx only) */
    "shrdq $52,%%r15,%%rcx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,16(%2)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rcx,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "movq $0xfffffffffffff,%%rdx\n"
    "andq %%rdx,%%rax\n"
    "movq %%rax,24(%2)\n"
    /* c >>= 52 (r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (r8 only) */
    "addq %0,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%2)\n"

    "popq %%rbp\n"
    /* undo the red-zone skip */
    "leaq 128(%%rsp),%%rsp\n"
: "+S"(a)
: "b"(b), "D"(r)
: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
}
||||
|
||||
SECP256K1_INLINE static void secp256k1_fe_sqr_inner(const uint64_t *a, uint64_t *r) {
/**
 * Reads five 64-bit limbs from a and writes five 64-bit limbs to r.  Cross
 * products are formed once with one factor doubled.
 *
 * Registers: rdx:rax = multiplication accumulator
 *            r9:r8   = c
 *            rcx:rbx = d
 *            r10-r14 = a0-a4 (a4 and a0 are doubled in place part-way)
 *            r15     = M (0xfffffffffffff)
 *            %1      = r
 *            %0      = a / t?
 *            rbp     = R (0x1000003d10), temporarily tx
 *
 * rbp is not in the clobber list; it is saved and restored by hand, and two
 * temporaries (t3, t4) are also kept on the stack.  Because this function may
 * be inlined into code that keeps data in the 128-byte red zone below %rsp,
 * %rsp is first moved down past the red zone so those pushes cannot
 * overwrite it.  leaq is used for the adjustment because it leaves the flags
 * untouched.
 */
    __asm__ __volatile__(
    /* skip the caller's red zone before touching the stack */
    "leaq -128(%%rsp),%%rsp\n"
    "pushq %%rbp\n"

    "movq 0(%0),%%r10\n"
    "movq 8(%0),%%r11\n"
    "movq 16(%0),%%r12\n"
    "movq 24(%0),%%r13\n"
    "movq 32(%0),%%r14\n"
    "movq $0x1000003d10,%%rbp\n"
    "movq $0xfffffffffffff,%%r15\n"

    /* d = (a0*2) * a3 */
    "leaq (%%r10,%%r10,1),%%rax\n"
    "mulq %%r13\n"
    "movq %%rax,%%rbx\n"
    "movq %%rdx,%%rcx\n"
    /* d += (a1*2) * a2 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c = a4 * a4 */
    "movq %%r14,%%rax\n"
    "mulq %%r14\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += (c & M) * R */
    "andq %%r15,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c >>= 52 (r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* t3 (stack) = d & M */
    "movq %%rbx,%0\n"
    "andq %%r15,%0\n"
    "pushq %0\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* a4 *= 2 (r14 holds 2*a4 from here on) */
    "addq %%r14,%%r14\n"
    /* d += a0 * a4 */
    "movq %%r10,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a1*2) * a3 */
    "leaq (%%r11,%%r11,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a2 * a2 */
    "movq %%r12,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += c * R */
    "movq %%r8,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* t4 = d & M (%0) */
    "movq %%rbx,%0\n"
    "andq %%r15,%0\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* tx = t4 >> 48 (rbp, overwrites constant) */
    "movq %0,%%rbp\n"
    "shrq $48,%%rbp\n"
    /* t4 &= (M >> 4) (stack) */
    "movq $0xffffffffffff,%%rax\n"
    "andq %%rax,%0\n"
    "pushq %0\n"
    /* c = a0 * a0 */
    "movq %%r10,%%rax\n"
    "mulq %%r10\n"
    "movq %%rax,%%r8\n"
    "movq %%rdx,%%r9\n"
    /* d += a1 * a4 */
    "movq %%r11,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += (a2*2) * a3 */
    "leaq (%%r12,%%r12,1),%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* u0 = d & M (%0) */
    "movq %%rbx,%0\n"
    "andq %%r15,%0\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* u0 = (u0 << 4) | tx (%0) */
    "shlq $4,%0\n"
    "orq %%rbp,%0\n"
    /* c += u0 * (R >> 4) */
    "movq $0x1000003d1,%%rax\n"
    "mulq %0\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[0] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,0(%1)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* a0 *= 2 (r10 holds 2*a0 from here on) */
    "addq %%r10,%%r10\n"
    /* c += a0 * a1 */
    "movq %%r10,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a2 * a4 */
    "movq %%r12,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* d += a3 * a3 */
    "movq %%r13,%%rax\n"
    "mulq %%r13\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* reload R into rbp (it held tx until the u0 step above) */
    "movq $0x1000003d10,%%rbp\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 */
    "shrdq $52,%%rcx,%%rbx\n"
    "xorq %%rcx,%%rcx\n"
    /* r[1] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,8(%1)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += a0 * a2 (last use of r10) */
    "movq %%r10,%%rax\n"
    "mulq %%r12\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* fetch t4 (%0) and t3 (r10, overwrites a0); reverse push order */
    "popq %0\n"
    "popq %%r10\n"
    /* c += a1 * a1 */
    "movq %%r11,%%rax\n"
    "mulq %%r11\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d += a3 * a4 */
    "movq %%r13,%%rax\n"
    "mulq %%r14\n"
    "addq %%rax,%%rbx\n"
    "adcq %%rdx,%%rcx\n"
    /* c += (d & M) * R */
    "movq %%rbx,%%rax\n"
    "andq %%r15,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* d >>= 52 (rbx only) */
    "shrdq $52,%%rcx,%%rbx\n"
    /* r[2] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,16(%1)\n"
    /* c >>= 52 */
    "shrdq $52,%%r9,%%r8\n"
    "xorq %%r9,%%r9\n"
    /* c += t3 */
    "addq %%r10,%%r8\n"
    /* c += d * R */
    "movq %%rbx,%%rax\n"
    "mulq %%rbp\n"
    "addq %%rax,%%r8\n"
    "adcq %%rdx,%%r9\n"
    /* r[3] = c & M */
    "movq %%r8,%%rax\n"
    "andq %%r15,%%rax\n"
    "movq %%rax,24(%1)\n"
    /* c >>= 52 (r8 only) */
    "shrdq $52,%%r9,%%r8\n"
    /* c += t4 (r8 only) */
    "addq %0,%%r8\n"
    /* r[4] = c */
    "movq %%r8,32(%1)\n"

    "popq %%rbp\n"
    /* undo the red-zone skip */
    "leaq 128(%%rsp),%%rsp\n"
: "+S"(a)
: "D"(r)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
);
}
||||
|
||||
#endif |
||||
|
Loading…
Reference in new issue