@@ -3679,14 +3679,11 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
3679
3679
3680
3680
#ifndef _WINDOWS
3681
3681
3682
- #define ASM_SUBTRACT
3683
-
3684
- #ifdef ASM_SUBTRACT
3685
3682
// Subtract 0:b from carry:a. Return carry.
3686
- static unsigned long
3687
- sub (unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3688
- long i = 0 , cnt = len;
3689
- unsigned long tmp;
3683
+ static julong
3684
+ sub (julong a[], julong b[], julong carry, long len) {
3685
+ long long i = 0 , cnt = len;
3686
+ julong tmp;
3690
3687
asm volatile (" clc; "
3691
3688
" 0: ; "
3692
3689
" mov (%[b], %[i], 8), %[tmp]; "
@@ -3699,24 +3696,6 @@ sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
3699
3696
: " memory" );
3700
3697
return tmp;
3701
3698
}
3702
- #else // ASM_SUBTRACT
3703
- typedef int __attribute__ ((mode(TI))) int128;
3704
-
3705
- // Subtract 0:b from carry:a. Return carry.
3706
- static unsigned long
3707
- sub (unsigned long a[], unsigned long b[], unsigned long carry, int len) {
3708
- int128 tmp = 0 ;
3709
- int i;
3710
- for (i = 0 ; i < len; i++) {
3711
- tmp += a[i];
3712
- tmp -= b[i];
3713
- a[i] = tmp;
3714
- tmp >>= 64 ;
3715
- assert (-1 <= tmp && tmp <= 0 , " invariant" );
3716
- }
3717
- return tmp + carry;
3718
- }
3719
- #endif // ! ASM_SUBTRACT
3720
3699
3721
3700
// Multiply (unsigned) Long A by Long B, accumulating the double-
3722
3701
// length result into the accumulator formed of T0, T1, and T2.
@@ -3739,17 +3718,59 @@ do { \
3739
3718
: " r" (A), " a" (B) : " cc" ); \
3740
3719
} while (0 )
3741
3720
3721
+ #else // _WINDOWS
3722
+
3723
+ static julong
3724
+ sub (julong a[], julong b[], julong carry, long len) {
3725
+ long i;
3726
+ julong tmp;
3727
+ unsigned char c = 1 ;
3728
+ for (i = 0 ; i < len; i++) {
3729
+ c = _addcarry_u64 (c, a[i], ~b[i], &tmp);
3730
+ a[i] = tmp;
3731
+ }
3732
+ c = _addcarry_u64 (c, carry, ~0 , &tmp);
3733
+ return tmp;
3734
+ }
3735
+
3736
+ // Multiply (unsigned) Long A by Long B, accumulating the double-
3737
+ // length result into the accumulator formed of T0, T1, and T2.
3738
+ #define MACC (A, B, T0, T1, T2 ) \
3739
+ do { \
3740
+ julong hi, lo; \
3741
+ lo = _umul128 (A, B, &hi); \
3742
+ unsigned char c = _addcarry_u64 (0 , lo, T0, &T0); \
3743
+ c = _addcarry_u64 (c, hi, T1, &T1); \
3744
+ _addcarry_u64 (c, T2, 0 , &T2); \
3745
+ } while (0 )
3746
+
3747
+ // As above, but add twice the double-length result into the
3748
+ // accumulator.
3749
+ #define MACC2 (A, B, T0, T1, T2 ) \
3750
+ do { \
3751
+ julong hi, lo; \
3752
+ lo = _umul128 (A, B, &hi); \
3753
+ unsigned char c = _addcarry_u64 (0 , lo, T0, &T0); \
3754
+ c = _addcarry_u64 (c, hi, T1, &T1); \
3755
+ _addcarry_u64 (c, T2, 0 , &T2); \
3756
+ c = _addcarry_u64 (0 , lo, T0, &T0); \
3757
+ c = _addcarry_u64 (c, hi, T1, &T1); \
3758
+ _addcarry_u64 (c, T2, 0 , &T2); \
3759
+ } while (0 )
3760
+
3761
+ #endif // _WINDOWS
3762
+
3742
3763
// Fast Montgomery multiplication. The derivation of the algorithm is
3743
3764
// in A Cryptographic Library for the Motorola DSP56000,
3744
3765
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3745
3766
3746
- static void __attribute__ ((noinline))
3747
- montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3748
- unsigned long m[], unsigned long inv, int len) {
3749
- unsigned long t0 = 0 , t1 = 0 , t2 = 0 ; // Triple-precision accumulator
3767
+ static void NOINLINE
3768
+ montgomery_multiply (julong a[], julong b[], julong n[],
3769
+ julong m[], julong inv, int len) {
3770
+ julong t0 = 0 , t1 = 0 , t2 = 0 ; // Triple-precision accumulator
3750
3771
int i;
3751
3772
3752
- assert (inv * n[0 ] == -1UL , " broken inverse in Montgomery multiply" );
3773
+ assert (inv * n[0 ] == -1ULL , " broken inverse in Montgomery multiply" );
3753
3774
3754
3775
for (i = 0 ; i < len; i++) {
3755
3776
int j;
@@ -3785,13 +3806,13 @@ montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
3785
3806
// multiplication. However, its loop control is more complex and it
3786
3807
// may actually run slower on some machines.
3787
3808
3788
- static void __attribute__ ((noinline))
3789
- montgomery_square(unsigned long a[], unsigned long n[],
3790
- unsigned long m[], unsigned long inv, int len) {
3791
- unsigned long t0 = 0 , t1 = 0 , t2 = 0 ; // Triple-precision accumulator
3809
+ static void NOINLINE
3810
+ montgomery_square (julong a[], julong n[],
3811
+ julong m[], julong inv, int len) {
3812
+ julong t0 = 0 , t1 = 0 , t2 = 0 ; // Triple-precision accumulator
3792
3813
int i;
3793
3814
3794
- assert (inv * n[0 ] == -1UL , " broken inverse in Montgomery multiply " );
3815
+ assert (inv * n[0 ] == -1ULL , " broken inverse in Montgomery square " );
3795
3816
3796
3817
for (i = 0 ; i < len; i++) {
3797
3818
int j;
@@ -3837,13 +3858,13 @@ montgomery_square(unsigned long a[], unsigned long n[],
3837
3858
}
3838
3859
3839
3860
// Swap words in a longword.
3840
- static unsigned long swap (unsigned long x) {
3861
+ static julong swap (julong x) {
3841
3862
return (x << 32 ) | (x >> 32 );
3842
3863
}
3843
3864
3844
3865
// Copy len longwords from s to d, word-swapping as we go. The
3845
3866
// destination array is reversed.
3846
- static void reverse_words (unsigned long *s, unsigned long *d, int len) {
3867
+ static void reverse_words (julong *s, julong *d, int len) {
3847
3868
d += len;
3848
3869
while (len-- > 0 ) {
3849
3870
d--;
@@ -3865,24 +3886,24 @@ void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints
3865
3886
// Make very sure we don't use so much space that the stack might
3866
3887
// overflow. 512 jints corresponds to a 16384-bit integer and
3867
3888
// will use here a total of 8k bytes of stack space.
3868
- int total_allocation = longwords * sizeof (unsigned long ) * 4 ;
3889
+ int total_allocation = longwords * sizeof (julong ) * 4 ;
3869
3890
guarantee (total_allocation <= 8192 , " must be" );
3870
- unsigned long *scratch = (unsigned long *)alloca (total_allocation);
3891
+ julong *scratch = (julong *)alloca (total_allocation);
3871
3892
3872
3893
// Local scratch arrays
3873
- unsigned long
3894
+ julong
3874
3895
*a = scratch + 0 * longwords,
3875
3896
*b = scratch + 1 * longwords,
3876
3897
*n = scratch + 2 * longwords,
3877
3898
*m = scratch + 3 * longwords;
3878
3899
3879
- reverse_words ((unsigned long *)a_ints, a, longwords);
3880
- reverse_words ((unsigned long *)b_ints, b, longwords);
3881
- reverse_words ((unsigned long *)n_ints, n, longwords);
3900
+ reverse_words ((julong *)a_ints, a, longwords);
3901
+ reverse_words ((julong *)b_ints, b, longwords);
3902
+ reverse_words ((julong *)n_ints, n, longwords);
3882
3903
3883
- ::montgomery_multiply (a, b, n, m, (unsigned long )inv, longwords);
3904
+ ::montgomery_multiply (a, b, n, m, (julong )inv, longwords);
3884
3905
3885
- reverse_words (m, (unsigned long *)m_ints, longwords);
3906
+ reverse_words (m, (julong *)m_ints, longwords);
3886
3907
}
3887
3908
3888
3909
void SharedRuntime::montgomery_square (jint *a_ints, jint *n_ints,
@@ -3894,30 +3915,28 @@ void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
3894
3915
// Make very sure we don't use so much space that the stack might
3895
3916
// overflow. 512 jints corresponds to a 16384-bit integer and
3896
3917
// will use here a total of 6k bytes of stack space.
3897
- int total_allocation = longwords * sizeof (unsigned long ) * 3 ;
3918
+ int total_allocation = longwords * sizeof (julong ) * 3 ;
3898
3919
guarantee (total_allocation <= 8192 , " must be" );
3899
- unsigned long *scratch = (unsigned long *)alloca (total_allocation);
3920
+ julong *scratch = (julong *)alloca (total_allocation);
3900
3921
3901
3922
// Local scratch arrays
3902
- unsigned long
3923
+ julong
3903
3924
*a = scratch + 0 * longwords,
3904
3925
*n = scratch + 1 * longwords,
3905
3926
*m = scratch + 2 * longwords;
3906
3927
3907
- reverse_words ((unsigned long *)a_ints, a, longwords);
3908
- reverse_words ((unsigned long *)n_ints, n, longwords);
3928
+ reverse_words ((julong *)a_ints, a, longwords);
3929
+ reverse_words ((julong *)n_ints, n, longwords);
3909
3930
3910
3931
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
3911
- ::montgomery_square (a, n, m, (unsigned long )inv, longwords);
3932
+ ::montgomery_square (a, n, m, (julong )inv, longwords);
3912
3933
} else {
3913
- ::montgomery_multiply (a, a, n, m, (unsigned long )inv, longwords);
3934
+ ::montgomery_multiply (a, a, n, m, (julong )inv, longwords);
3914
3935
}
3915
3936
3916
- reverse_words (m, (unsigned long *)m_ints, longwords);
3937
+ reverse_words (m, (julong *)m_ints, longwords);
3917
3938
}
3918
3939
3919
- #endif // WINDOWS
3920
-
3921
3940
#ifdef COMPILER2
3922
3941
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
3923
3942
//
0 commit comments