Skip to content

Commit 47e465c

Browse files
theRealAph authored and Simon Tooke committed Jun 25, 2020
8243114: Implement montgomery{Multiply,Square}intrinsics on Windows
Reviewed-by: dholmes, andrew
1 parent 0f2ac20 commit 47e465c

File tree

2 files changed

+74
-57
lines changed

2 files changed

+74
-57
lines changed
 

‎src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp

+74-55
Original file line number | Diff line number | Diff line change
@@ -3679,14 +3679,11 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
36793679

36803680
#ifndef _WINDOWS
36813681

3682-
#define ASM_SUBTRACT
3683-
3684-
#ifdef ASM_SUBTRACT
36853682
// Subtract 0:b from carry:a. Return carry.
3686-
static unsigned long
3687-
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3688-
long i = 0, cnt = len;
3689-
unsigned long tmp;
3683+
static julong
3684+
sub(julong a[], julong b[], julong carry, long len) {
3685+
long long i = 0, cnt = len;
3686+
julong tmp;
36903687
asm volatile("clc; "
36913688
"0: ; "
36923689
"mov (%[b], %[i], 8), %[tmp]; "
@@ -3699,24 +3696,6 @@ sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
36993696
: "memory");
37003697
return tmp;
37013698
}
3702-
#else // ASM_SUBTRACT
3703-
typedef int __attribute__((mode(TI))) int128;
3704-
3705-
// Subtract 0:b from carry:a. Return carry.
3706-
static unsigned long
3707-
sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3708-
int128 tmp = 0;
3709-
int i;
3710-
for (i = 0; i < len; i++) {
3711-
tmp += a[i];
3712-
tmp -= b[i];
3713-
a[i] = tmp;
3714-
tmp >>= 64;
3715-
assert(-1 <= tmp && tmp <= 0, "invariant");
3716-
}
3717-
return tmp + carry;
3718-
}
3719-
#endif // ! ASM_SUBTRACT
37203699

37213700
// Multiply (unsigned) Long A by Long B, accumulating the double-
37223701
// length result into the accumulator formed of T0, T1, and T2.
@@ -3739,17 +3718,59 @@ do { \
37393718
: "r"(A), "a"(B) : "cc"); \
37403719
} while(0)
37413720

3721+
#else //_WINDOWS
3722+
3723+
static julong
3724+
sub(julong a[], julong b[], julong carry, long len) {
3725+
long i;
3726+
julong tmp;
3727+
unsigned char c = 1;
3728+
for (i = 0; i < len; i++) {
3729+
c = _addcarry_u64(c, a[i], ~b[i], &tmp);
3730+
a[i] = tmp;
3731+
}
3732+
c = _addcarry_u64(c, carry, ~0, &tmp);
3733+
return tmp;
3734+
}
3735+
3736+
// Multiply (unsigned) Long A by Long B, accumulating the double-
3737+
// length result into the accumulator formed of T0, T1, and T2.
3738+
#define MACC(A, B, T0, T1, T2) \
3739+
do { \
3740+
julong hi, lo; \
3741+
lo = _umul128(A, B, &hi); \
3742+
unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3743+
c = _addcarry_u64(c, hi, T1, &T1); \
3744+
_addcarry_u64(c, T2, 0, &T2); \
3745+
} while(0)
3746+
3747+
// As above, but add twice the double-length result into the
3748+
// accumulator.
3749+
#define MACC2(A, B, T0, T1, T2) \
3750+
do { \
3751+
julong hi, lo; \
3752+
lo = _umul128(A, B, &hi); \
3753+
unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
3754+
c = _addcarry_u64(c, hi, T1, &T1); \
3755+
_addcarry_u64(c, T2, 0, &T2); \
3756+
c = _addcarry_u64(0, lo, T0, &T0); \
3757+
c = _addcarry_u64(c, hi, T1, &T1); \
3758+
_addcarry_u64(c, T2, 0, &T2); \
3759+
} while(0)
3760+
3761+
#endif //_WINDOWS
3762+
37423763
// Fast Montgomery multiplication. The derivation of the algorithm is
37433764
// in A Cryptographic Library for the Motorola DSP56000,
37443765
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
37453766

3746-
static void __attribute__((noinline))
3747-
montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3748-
unsigned long m[], unsigned long inv, int len) {
3749-
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3767+
static void NOINLINE
3768+
montgomery_multiply(julong a[], julong b[], julong n[],
3769+
julong m[], julong inv, int len) {
3770+
julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
37503771
int i;
37513772

3752-
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3773+
assert(inv * n[0] == -1ULL, "broken inverse in Montgomery multiply");
37533774

37543775
for (i = 0; i < len; i++) {
37553776
int j;
@@ -3785,13 +3806,13 @@ montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
37853806
// multiplication. However, its loop control is more complex and it
37863807
// may actually run slower on some machines.
37873808

3788-
static void __attribute__((noinline))
3789-
montgomery_square(unsigned long a[], unsigned long n[],
3790-
unsigned long m[], unsigned long inv, int len) {
3791-
unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3809+
static void NOINLINE
3810+
montgomery_square(julong a[], julong n[],
3811+
julong m[], julong inv, int len) {
3812+
julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
37923813
int i;
37933814

3794-
assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3815+
assert(inv * n[0] == -1ULL, "broken inverse in Montgomery square");
37953816

37963817
for (i = 0; i < len; i++) {
37973818
int j;
@@ -3837,13 +3858,13 @@ montgomery_square(unsigned long a[], unsigned long n[],
38373858
}
38383859

38393860
// Swap words in a longword.
3840-
static unsigned long swap(unsigned long x) {
3861+
static julong swap(julong x) {
38413862
return (x << 32) | (x >> 32);
38423863
}
38433864

38443865
// Copy len longwords from s to d, word-swapping as we go. The
38453866
// destination array is reversed.
3846-
static void reverse_words(unsigned long *s, unsigned long *d, int len) {
3867+
static void reverse_words(julong *s, julong *d, int len) {
38473868
d += len;
38483869
while(len-- > 0) {
38493870
d--;
@@ -3865,24 +3886,24 @@ void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints
38653886
// Make very sure we don't use so much space that the stack might
38663887
// overflow. 512 jints correspond to a 16384-bit integer and
38673888
// will use here a total of 8k bytes of stack space.
3868-
int total_allocation = longwords * sizeof (unsigned long) * 4;
3889+
int total_allocation = longwords * sizeof (julong) * 4;
38693890
guarantee(total_allocation <= 8192, "must be");
3870-
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3891+
julong *scratch = (julong *)alloca(total_allocation);
38713892

38723893
// Local scratch arrays
3873-
unsigned long
3894+
julong
38743895
*a = scratch + 0 * longwords,
38753896
*b = scratch + 1 * longwords,
38763897
*n = scratch + 2 * longwords,
38773898
*m = scratch + 3 * longwords;
38783899

3879-
reverse_words((unsigned long *)a_ints, a, longwords);
3880-
reverse_words((unsigned long *)b_ints, b, longwords);
3881-
reverse_words((unsigned long *)n_ints, n, longwords);
3900+
reverse_words((julong *)a_ints, a, longwords);
3901+
reverse_words((julong *)b_ints, b, longwords);
3902+
reverse_words((julong *)n_ints, n, longwords);
38823903

3883-
::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
3904+
::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
38843905

3885-
reverse_words(m, (unsigned long *)m_ints, longwords);
3906+
reverse_words(m, (julong *)m_ints, longwords);
38863907
}
38873908

38883909
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
@@ -3894,30 +3915,28 @@ void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
38943915
// Make very sure we don't use so much space that the stack might
38953916
// overflow. 512 jints correspond to a 16384-bit integer and
38963917
// will use here a total of 6k bytes of stack space.
3897-
int total_allocation = longwords * sizeof (unsigned long) * 3;
3918+
int total_allocation = longwords * sizeof (julong) * 3;
38983919
guarantee(total_allocation <= 8192, "must be");
3899-
unsigned long *scratch = (unsigned long *)alloca(total_allocation);
3920+
julong *scratch = (julong *)alloca(total_allocation);
39003921

39013922
// Local scratch arrays
3902-
unsigned long
3923+
julong
39033924
*a = scratch + 0 * longwords,
39043925
*n = scratch + 1 * longwords,
39053926
*m = scratch + 2 * longwords;
39063927

3907-
reverse_words((unsigned long *)a_ints, a, longwords);
3908-
reverse_words((unsigned long *)n_ints, n, longwords);
3928+
reverse_words((julong *)a_ints, a, longwords);
3929+
reverse_words((julong *)n_ints, n, longwords);
39093930

39103931
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3911-
::montgomery_square(a, n, m, (unsigned long)inv, longwords);
3932+
::montgomery_square(a, n, m, (julong)inv, longwords);
39123933
} else {
3913-
::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
3934+
::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
39143935
}
39153936

3916-
reverse_words(m, (unsigned long *)m_ints, longwords);
3937+
reverse_words(m, (julong *)m_ints, longwords);
39173938
}
39183939

3919-
#endif // WINDOWS
3920-
39213940
#ifdef COMPILER2
39223941
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
39233942
//

‎src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

-2
Original file line number | Diff line number | Diff line change
@@ -6566,7 +6566,6 @@ address generate_avx_ghash_processBlocks() {
65666566
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
65676567
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
65686568
}
6569-
#ifndef _WINDOWS
65706569
if (UseMontgomeryMultiplyIntrinsic) {
65716570
StubRoutines::_montgomeryMultiply
65726571
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
@@ -6575,7 +6574,6 @@ address generate_avx_ghash_processBlocks() {
65756574
StubRoutines::_montgomerySquare
65766575
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
65776576
}
6578-
#endif // WINDOWS
65796577
#endif // COMPILER2
65806578

65816579
if (UseVectorizedMismatchIntrinsic) {

0 commit comments

Comments
 (0)
Please sign in to comment.