@@ -3968,6 +3968,238 @@ class StubGenerator: public StubCodeGenerator {
3968
3968
return start;
3969
3969
}
3970
3970
3971
+ // Emits the bigIntegerRightShiftWorker stub: a right shift over 32-bit words that pulls the vacated high bits from the neighbouring source word.
3972
+ // For i in [0, numIter): newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount)), processed from the highest i down.
3973
+ // Input:
3974
+ // c_rarg0 - newArr address (destination, 32-bit words)
3975
+ // c_rarg1 - oldArr address (source; words 0..numIter are read)
3976
+ // c_rarg2 - newIdx (word index in newArr where the output starts)
3977
+ // c_rarg3 - shiftCount (in bits; presumably 0 < shiftCount < 32 - TODO confirm against the caller)
3978
+ // c_rarg4 - numIter (number of output words; 0 returns immediately)
3979
+ // Returns the entry address of the generated code.
3980
+ address generate_bigIntegerRightShift () {
3981
+ __ align (CodeEntryAlignment);
3982
+ StubCodeMark mark (this , " StubRoutines" , " bigIntegerRightShiftWorker" );
3983
+ address start = __ pc ();
3984
+ // Branch targets: 4-word SIMD loop, 2-word SIMD loop, scalar tail entries, and the common exit.
3985
+ Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
3986
+
3987
+ Register newArr = c_rarg0;
3988
+ Register oldArr = c_rarg1;
3989
+ Register newIdx = c_rarg2;
3990
+ Register shiftCount = c_rarg3;
3991
+ Register numIter = c_rarg4;
3992
+ Register idx = numIter; // alias: numIter is counted down in place and doubles as the word index
3993
+
3994
+ Register newArrCur = rscratch1;
3995
+ Register shiftRevCount = rscratch2;
3996
+ Register oldArrCur = r13;
3997
+ Register oldArrNext = r14;
3998
+
3999
+ FloatRegister oldElem0 = v0;
4000
+ FloatRegister oldElem1 = v1;
4001
+ FloatRegister newElem = v2;
4002
+ FloatRegister shiftVCount = v3;
4003
+ FloatRegister shiftVRevCount = v4;
4004
+ // Nothing to do when numIter == 0.
4005
+ __ cbz (idx, Exit);
4006
+ // Advance newArr to &newArr[newIdx] (index scaled by 4 bytes).
4007
+ __ add (newArr, newArr, newIdx, Assembler::LSL, 2 );
4008
+
4009
+ // left shift count: shiftRevCount = 32 - shiftCount
4010
+ __ movw (shiftRevCount, 32 );
4011
+ __ subw (shiftRevCount, shiftRevCount, shiftCount);
4012
+
4013
+ // numIter < 4 is too small for the 4-word SIMD loop; fall back to the scalar tail
4014
+ __ cmp (numIter, (u1)4 );
4015
+ __ br (Assembler::LT, ShiftThree);
4016
+ // Broadcast both counts to every lane; the right-shift count is negated because USHL shifts right for a negative per-lane count.
4017
+ __ dup (shiftVCount, __ T4S, shiftCount);
4018
+ __ dup (shiftVRevCount, __ T4S, shiftRevCount);
4019
+ __ negr (shiftVCount, __ T4S, shiftVCount);
4020
+
4021
+ __ BIND (ShiftSIMDLoop);
4022
+ // Each pass handles the top 4 remaining output words: idx -= 4, then words idx..idx+3.
4023
+ // Calculate the load addresses: oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1], newArrCur = &newArr[idx]
4024
+ __ sub (idx, idx, 4 );
4025
+ __ add (oldArrNext, oldArr, idx, Assembler::LSL, 2 );
4026
+ __ add (newArrCur, newArr, idx, Assembler::LSL, 2 );
4027
+ __ add (oldArrCur, oldArrNext, 4 );
4028
+
4029
+ // Load 4 words from each source position, shift, OR together and store
4030
+ __ ld1 (oldElem0, __ T4S, Address (oldArrCur));
4031
+ __ ld1 (oldElem1, __ T4S, Address (oldArrNext));
4032
+ __ ushl (oldElem0, __ T4S, oldElem0, shiftVCount);
4033
+ __ ushl (oldElem1, __ T4S, oldElem1, shiftVRevCount);
4034
+ __ orr (newElem, __ T16B, oldElem0, oldElem1);
4035
+ __ st1 (newElem, __ T4S, Address (newArrCur));
4036
+ // Fewer than 4 words left: finish in the 2-word loop; otherwise iterate.
4037
+ __ cmp (idx, (u1)4 );
4038
+ __ br (Assembler::LT, ShiftTwoLoop);
4039
+ __ b (ShiftSIMDLoop);
4040
+ // 2-word SIMD loop; reuses the lane counts set up before ShiftSIMDLoop. Exits at 0 remaining and hands a single leftover word to ShiftOne.
4041
+ __ BIND (ShiftTwoLoop);
4042
+ __ cbz (idx, Exit);
4043
+ __ cmp (idx, (u1)1 );
4044
+ __ br (Assembler::EQ, ShiftOne);
4045
+
4046
+ // Calculate the load addresses (same layout as the 4-word loop, stepping by 2)
4047
+ __ sub (idx, idx, 2 );
4048
+ __ add (oldArrNext, oldArr, idx, Assembler::LSL, 2 );
4049
+ __ add (newArrCur, newArr, idx, Assembler::LSL, 2 );
4050
+ __ add (oldArrCur, oldArrNext, 4 );
4051
+
4052
+ // Load 2 words from each source position, shift, OR together and store
4053
+ __ ld1 (oldElem0, __ T2S, Address (oldArrCur));
4054
+ __ ld1 (oldElem1, __ T2S, Address (oldArrNext));
4055
+ __ ushl (oldElem0, __ T2S, oldElem0, shiftVCount);
4056
+ __ ushl (oldElem1, __ T2S, oldElem1, shiftVRevCount);
4057
+ __ orr (newElem, __ T8B, oldElem0, oldElem1);
4058
+ __ st1 (newElem, __ T2S, Address (newArrCur));
4059
+ __ b (ShiftTwoLoop);
4060
+ // Scalar tail for 1 <= numIter <= 3: dispatch on the low two bits, then fall through from output word 2 down to word 0.
4061
+ __ BIND (ShiftThree);
4062
+ __ tbz (idx, 1 , ShiftOne); // bit 1 clear: numIter == 1
4063
+ __ tbz (idx, 0 , ShiftTwo); // bit 0 clear: numIter == 2
4064
+ __ ldrw (r10, Address (oldArr, 12 )); // numIter == 3: output word 2 from oldArr[3] and oldArr[2]
4065
+ __ ldrw (r11, Address (oldArr, 8 ));
4066
+ __ lsrvw (r10, r10, shiftCount);
4067
+ __ lslvw (r11, r11, shiftRevCount);
4068
+ __ orrw (r12, r10, r11);
4069
+ __ strw (r12, Address (newArr, 8 ));
4070
+ // Output word 1 from oldArr[2] and oldArr[1].
4071
+ __ BIND (ShiftTwo);
4072
+ __ ldrw (r10, Address (oldArr, 8 ));
4073
+ __ ldrw (r11, Address (oldArr, 4 ));
4074
+ __ lsrvw (r10, r10, shiftCount);
4075
+ __ lslvw (r11, r11, shiftRevCount);
4076
+ __ orrw (r12, r10, r11);
4077
+ __ strw (r12, Address (newArr, 4 ));
4078
+ // Output word 0 from oldArr[1] and oldArr[0]; also the single word left over by the SIMD loops.
4079
+ __ BIND (ShiftOne);
4080
+ __ ldrw (r10, Address (oldArr, 4 ));
4081
+ __ ldrw (r11, Address (oldArr));
4082
+ __ lsrvw (r10, r10, shiftCount);
4083
+ __ lslvw (r11, r11, shiftRevCount);
4084
+ __ orrw (r12, r10, r11);
4085
+ __ strw (r12, Address (newArr));
4086
+
4087
+ __ BIND (Exit);
4088
+ __ ret (lr);
4089
+
4090
+ return start;
4091
+ }
4092
+
4093
+ // Emits the bigIntegerLeftShiftWorker stub: a left shift over 32-bit words that pulls the vacated low bits from the following source word.
4094
+ // For i in [0, numIter): newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount)), processed from i = 0 upward.
4095
+ // Input:
4096
+ // c_rarg0 - newArr address (destination, 32-bit words)
4097
+ // c_rarg1 - oldArr address (source; words 0..numIter are read)
4098
+ // c_rarg2 - newIdx (word index in newArr where the output starts)
4099
+ // c_rarg3 - shiftCount (in bits; presumably 0 < shiftCount < 32 - TODO confirm against the caller)
4100
+ // c_rarg4 - numIter (number of output words; 0 returns immediately)
4101
+ // Returns the entry address of the generated code.
4102
+ address generate_bigIntegerLeftShift () {
4103
+ __ align (CodeEntryAlignment);
4104
+ StubCodeMark mark (this , " StubRoutines" , " bigIntegerLeftShiftWorker" );
4105
+ address start = __ pc ();
4106
+ // Branch targets: 4-word SIMD loop, 2-word SIMD loop, scalar tail entries, and the common exit.
4107
+ Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4108
+
4109
+ Register newArr = c_rarg0;
4110
+ Register oldArr = c_rarg1;
4111
+ Register newIdx = c_rarg2;
4112
+ Register shiftCount = c_rarg3;
4113
+ Register numIter = c_rarg4;
4114
+
4115
+ Register shiftRevCount = rscratch1;
4116
+ Register oldArrNext = rscratch2; // stays one word ahead of oldArr
4117
+
4118
+ FloatRegister oldElem0 = v0;
4119
+ FloatRegister oldElem1 = v1;
4120
+ FloatRegister newElem = v2;
4121
+ FloatRegister shiftVCount = v3;
4122
+ FloatRegister shiftVRevCount = v4;
4123
+ // Nothing to do when numIter == 0.
4124
+ __ cbz (numIter, Exit);
4125
+ // oldArrNext = &oldArr[1]; newArr = &newArr[newIdx] (index scaled by 4 bytes).
4126
+ __ add (oldArrNext, oldArr, 4 );
4127
+ __ add (newArr, newArr, newIdx, Assembler::LSL, 2 );
4128
+
4129
+ // right shift count: shiftRevCount = 32 - shiftCount
4130
+ __ movw (shiftRevCount, 32 );
4131
+ __ subw (shiftRevCount, shiftRevCount, shiftCount);
4132
+
4133
+ // numIter < 4 is too small for the 4-word SIMD loop; fall back to the scalar tail
4134
+ __ cmp (numIter, (u1)4 );
4135
+ __ br (Assembler::LT, ShiftThree);
4136
+ // Broadcast both counts to every lane; the right-shift count is negated because USHL shifts right for a negative per-lane count.
4137
+ __ dup (shiftVCount, __ T4S, shiftCount);
4138
+ __ dup (shiftVRevCount, __ T4S, shiftRevCount);
4139
+ __ negr (shiftVRevCount, __ T4S, shiftVRevCount);
4140
+
4141
+ __ BIND (ShiftSIMDLoop);
4142
+
4143
+ // load 4 words from each source pointer, shift, OR together and store; post-increment advances all three pointers by 16 bytes
4144
+ __ ld1 (oldElem0, __ T4S, __ post (oldArr, 16 ));
4145
+ __ ld1 (oldElem1, __ T4S, __ post (oldArrNext, 16 ));
4146
+ __ ushl (oldElem0, __ T4S, oldElem0, shiftVCount);
4147
+ __ ushl (oldElem1, __ T4S, oldElem1, shiftVRevCount);
4148
+ __ orr (newElem, __ T16B, oldElem0, oldElem1);
4149
+ __ st1 (newElem, __ T4S, __ post (newArr, 16 ));
4150
+ __ sub (numIter, numIter, 4 );
4151
+ // Fewer than 4 words left: finish in the 2-word loop; otherwise iterate.
4152
+ __ cmp (numIter, (u1)4 );
4153
+ __ br (Assembler::LT, ShiftTwoLoop);
4154
+ __ b (ShiftSIMDLoop);
4155
+ // 2-word SIMD loop; reuses the lane counts set up before ShiftSIMDLoop. Exits at 0 remaining and hands a single leftover word to ShiftOne.
4156
+ __ BIND (ShiftTwoLoop);
4157
+ __ cbz (numIter, Exit);
4158
+ __ cmp (numIter, (u1)1 );
4159
+ __ br (Assembler::EQ, ShiftOne);
4160
+
4161
+ // load 2 words from each source pointer, shift, OR together and store; pointers advance by 8 bytes
4162
+ __ ld1 (oldElem0, __ T2S, __ post (oldArr, 8 ));
4163
+ __ ld1 (oldElem1, __ T2S, __ post (oldArrNext, 8 ));
4164
+ __ ushl (oldElem0, __ T2S, oldElem0, shiftVCount);
4165
+ __ ushl (oldElem1, __ T2S, oldElem1, shiftVRevCount);
4166
+ __ orr (newElem, __ T8B, oldElem0, oldElem1);
4167
+ __ st1 (newElem, __ T2S, __ post (newArr, 8 ));
4168
+ __ sub (numIter, numIter, 2 );
4169
+ __ b (ShiftTwoLoop);
4170
+ // Scalar tail for 1 <= numIter <= 3: the first word is shifted unconditionally, then the low two bits of numIter pick how many more follow.
4171
+ __ BIND (ShiftThree);
4172
+ __ ldrw (r10, __ post (oldArr, 4 ));
4173
+ __ ldrw (r11, __ post (oldArrNext, 4 ));
4174
+ __ lslvw (r10, r10, shiftCount);
4175
+ __ lsrvw (r11, r11, shiftRevCount);
4176
+ __ orrw (r12, r10, r11);
4177
+ __ strw (r12, __ post (newArr, 4 ));
4178
+ __ tbz (numIter, 1 , Exit); // bit 1 clear: numIter == 1, already done
4179
+ __ tbz (numIter, 0 , ShiftOne); // bit 0 clear: numIter == 2, one word left
4180
+ // numIter == 3: second of three words, then fall through to the last one.
4181
+ __ BIND (ShiftTwo);
4182
+ __ ldrw (r10, __ post (oldArr, 4 ));
4183
+ __ ldrw (r11, __ post (oldArrNext, 4 ));
4184
+ __ lslvw (r10, r10, shiftCount);
4185
+ __ lsrvw (r11, r11, shiftRevCount);
4186
+ __ orrw (r12, r10, r11);
4187
+ __ strw (r12, __ post (newArr, 4 ));
4188
+ // Final word at the current pointers (also the single word left over by the SIMD loops); no post-increment is needed.
4189
+ __ BIND (ShiftOne);
4190
+ __ ldrw (r10, Address (oldArr));
4191
+ __ ldrw (r11, Address (oldArrNext));
4192
+ __ lslvw (r10, r10, shiftCount);
4193
+ __ lsrvw (r11, r11, shiftRevCount);
4194
+ __ orrw (r12, r10, r11);
4195
+ __ strw (r12, Address (newArr));
4196
+
4197
+ __ BIND (Exit);
4198
+ __ ret (lr);
4199
+
4200
+ return start;
4201
+ }
4202
+
3971
4203
void ghash_multiply (FloatRegister result_lo, FloatRegister result_hi,
3972
4204
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3973
4205
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@@ -6224,6 +6456,11 @@ class StubGenerator: public StubCodeGenerator {
6224
6456
StubRoutines::_mulAdd = generate_mulAdd ();
6225
6457
}
6226
6458
6459
+ if (UseSIMDForBigIntegerShiftIntrinsics) {
6460
+ StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift ();
6461
+ StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift ();
6462
+ }
6463
+
6227
6464
if (UseMontgomeryMultiplyIntrinsic) {
6228
6465
StubCodeMark mark (this , " StubRoutines" , " montgomeryMultiply" );
6229
6466
MontgomeryMultiplyGenerator g (_masm, /* squaring*/ false );
0 commit comments