Skip to content

Commit 6b2d11b

Browse files
Dong Bo authored and RealFYang committed on Oct 28, 2020
8255246: AArch64: Implement BigInteger shiftRight and shiftLeft accelerator/intrinsic
Reviewed-by: aph
1 parent 591e7e2 commit 6b2d11b

File tree

3 files changed

+275
-2
lines changed

3 files changed

+275
-2
lines changed
 

‎src/hotspot/cpu/aarch64/globals_aarch64.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ define_pd_global(intx, InlineSmallCode, 1000);
9393
"Use SIMD instructions in generated array equals code") \
9494
product(bool, UseSimpleArrayEquals, false, \
9595
"Use simpliest and shortest implementation for array equals") \
96+
product(bool, UseSIMDForBigIntegerShiftIntrinsics, true, \
97+
"Use SIMD instructions for left/right shift of BigInteger") \
9698
product(bool, AvoidUnalignedAccesses, false, \
9799
"Avoid generating unaligned memory accesses") \
98100
product(bool, UseLSE, false, \

‎src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

+237
Original file line numberDiff line numberDiff line change
@@ -3968,6 +3968,238 @@ class StubGenerator: public StubCodeGenerator {
39683968
return start;
39693969
}
39703970

3971+
// Arguments:
3972+
//
3973+
// Input:
3974+
// c_rarg0 - newArr address
3975+
// c_rarg1 - oldArr address
3976+
// c_rarg2 - newIdx
3977+
// c_rarg3 - shiftCount
3978+
// c_rarg4 - numIter
3979+
//
3980+
address generate_bigIntegerRightShift() {
3981+
__ align(CodeEntryAlignment);
3982+
StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
3983+
address start = __ pc();
3984+
3985+
Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
3986+
3987+
Register newArr = c_rarg0;
3988+
Register oldArr = c_rarg1;
3989+
Register newIdx = c_rarg2;
3990+
Register shiftCount = c_rarg3;
3991+
Register numIter = c_rarg4;
3992+
Register idx = numIter;
3993+
3994+
Register newArrCur = rscratch1;
3995+
Register shiftRevCount = rscratch2;
3996+
Register oldArrCur = r13;
3997+
Register oldArrNext = r14;
3998+
3999+
FloatRegister oldElem0 = v0;
4000+
FloatRegister oldElem1 = v1;
4001+
FloatRegister newElem = v2;
4002+
FloatRegister shiftVCount = v3;
4003+
FloatRegister shiftVRevCount = v4;
4004+
4005+
__ cbz(idx, Exit);
4006+
4007+
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4008+
4009+
// left shift count
4010+
__ movw(shiftRevCount, 32);
4011+
__ subw(shiftRevCount, shiftRevCount, shiftCount);
4012+
4013+
// numIter too small to allow a 4-words SIMD loop, rolling back
4014+
__ cmp(numIter, (u1)4);
4015+
__ br(Assembler::LT, ShiftThree);
4016+
4017+
__ dup(shiftVCount, __ T4S, shiftCount);
4018+
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
4019+
__ negr(shiftVCount, __ T4S, shiftVCount);
4020+
4021+
__ BIND(ShiftSIMDLoop);
4022+
4023+
// Calculate the load addresses
4024+
__ sub(idx, idx, 4);
4025+
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4026+
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
4027+
__ add(oldArrCur, oldArrNext, 4);
4028+
4029+
// Load 4 words and process
4030+
__ ld1(oldElem0, __ T4S, Address(oldArrCur));
4031+
__ ld1(oldElem1, __ T4S, Address(oldArrNext));
4032+
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
4033+
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
4034+
__ orr(newElem, __ T16B, oldElem0, oldElem1);
4035+
__ st1(newElem, __ T4S, Address(newArrCur));
4036+
4037+
__ cmp(idx, (u1)4);
4038+
__ br(Assembler::LT, ShiftTwoLoop);
4039+
__ b(ShiftSIMDLoop);
4040+
4041+
__ BIND(ShiftTwoLoop);
4042+
__ cbz(idx, Exit);
4043+
__ cmp(idx, (u1)1);
4044+
__ br(Assembler::EQ, ShiftOne);
4045+
4046+
// Calculate the load addresses
4047+
__ sub(idx, idx, 2);
4048+
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4049+
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
4050+
__ add(oldArrCur, oldArrNext, 4);
4051+
4052+
// Load 2 words and process
4053+
__ ld1(oldElem0, __ T2S, Address(oldArrCur));
4054+
__ ld1(oldElem1, __ T2S, Address(oldArrNext));
4055+
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4056+
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4057+
__ orr(newElem, __ T8B, oldElem0, oldElem1);
4058+
__ st1(newElem, __ T2S, Address(newArrCur));
4059+
__ b(ShiftTwoLoop);
4060+
4061+
__ BIND(ShiftThree);
4062+
__ tbz(idx, 1, ShiftOne);
4063+
__ tbz(idx, 0, ShiftTwo);
4064+
__ ldrw(r10, Address(oldArr, 12));
4065+
__ ldrw(r11, Address(oldArr, 8));
4066+
__ lsrvw(r10, r10, shiftCount);
4067+
__ lslvw(r11, r11, shiftRevCount);
4068+
__ orrw(r12, r10, r11);
4069+
__ strw(r12, Address(newArr, 8));
4070+
4071+
__ BIND(ShiftTwo);
4072+
__ ldrw(r10, Address(oldArr, 8));
4073+
__ ldrw(r11, Address(oldArr, 4));
4074+
__ lsrvw(r10, r10, shiftCount);
4075+
__ lslvw(r11, r11, shiftRevCount);
4076+
__ orrw(r12, r10, r11);
4077+
__ strw(r12, Address(newArr, 4));
4078+
4079+
__ BIND(ShiftOne);
4080+
__ ldrw(r10, Address(oldArr, 4));
4081+
__ ldrw(r11, Address(oldArr));
4082+
__ lsrvw(r10, r10, shiftCount);
4083+
__ lslvw(r11, r11, shiftRevCount);
4084+
__ orrw(r12, r10, r11);
4085+
__ strw(r12, Address(newArr));
4086+
4087+
__ BIND(Exit);
4088+
__ ret(lr);
4089+
4090+
return start;
4091+
}
4092+
4093+
// Arguments:
4094+
//
4095+
// Input:
4096+
// c_rarg0 - newArr address
4097+
// c_rarg1 - oldArr address
4098+
// c_rarg2 - newIdx
4099+
// c_rarg3 - shiftCount
4100+
// c_rarg4 - numIter
4101+
//
4102+
address generate_bigIntegerLeftShift() {
4103+
__ align(CodeEntryAlignment);
4104+
StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
4105+
address start = __ pc();
4106+
4107+
Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4108+
4109+
Register newArr = c_rarg0;
4110+
Register oldArr = c_rarg1;
4111+
Register newIdx = c_rarg2;
4112+
Register shiftCount = c_rarg3;
4113+
Register numIter = c_rarg4;
4114+
4115+
Register shiftRevCount = rscratch1;
4116+
Register oldArrNext = rscratch2;
4117+
4118+
FloatRegister oldElem0 = v0;
4119+
FloatRegister oldElem1 = v1;
4120+
FloatRegister newElem = v2;
4121+
FloatRegister shiftVCount = v3;
4122+
FloatRegister shiftVRevCount = v4;
4123+
4124+
__ cbz(numIter, Exit);
4125+
4126+
__ add(oldArrNext, oldArr, 4);
4127+
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4128+
4129+
// right shift count
4130+
__ movw(shiftRevCount, 32);
4131+
__ subw(shiftRevCount, shiftRevCount, shiftCount);
4132+
4133+
// numIter too small to allow a 4-words SIMD loop, rolling back
4134+
__ cmp(numIter, (u1)4);
4135+
__ br(Assembler::LT, ShiftThree);
4136+
4137+
__ dup(shiftVCount, __ T4S, shiftCount);
4138+
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
4139+
__ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4140+
4141+
__ BIND(ShiftSIMDLoop);
4142+
4143+
// load 4 words and process
4144+
__ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
4145+
__ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
4146+
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
4147+
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
4148+
__ orr(newElem, __ T16B, oldElem0, oldElem1);
4149+
__ st1(newElem, __ T4S, __ post(newArr, 16));
4150+
__ sub(numIter, numIter, 4);
4151+
4152+
__ cmp(numIter, (u1)4);
4153+
__ br(Assembler::LT, ShiftTwoLoop);
4154+
__ b(ShiftSIMDLoop);
4155+
4156+
__ BIND(ShiftTwoLoop);
4157+
__ cbz(numIter, Exit);
4158+
__ cmp(numIter, (u1)1);
4159+
__ br(Assembler::EQ, ShiftOne);
4160+
4161+
// load 2 words and process
4162+
__ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
4163+
__ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
4164+
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4165+
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4166+
__ orr(newElem, __ T8B, oldElem0, oldElem1);
4167+
__ st1(newElem, __ T2S, __ post(newArr, 8));
4168+
__ sub(numIter, numIter, 2);
4169+
__ b(ShiftTwoLoop);
4170+
4171+
__ BIND(ShiftThree);
4172+
__ ldrw(r10, __ post(oldArr, 4));
4173+
__ ldrw(r11, __ post(oldArrNext, 4));
4174+
__ lslvw(r10, r10, shiftCount);
4175+
__ lsrvw(r11, r11, shiftRevCount);
4176+
__ orrw(r12, r10, r11);
4177+
__ strw(r12, __ post(newArr, 4));
4178+
__ tbz(numIter, 1, Exit);
4179+
__ tbz(numIter, 0, ShiftOne);
4180+
4181+
__ BIND(ShiftTwo);
4182+
__ ldrw(r10, __ post(oldArr, 4));
4183+
__ ldrw(r11, __ post(oldArrNext, 4));
4184+
__ lslvw(r10, r10, shiftCount);
4185+
__ lsrvw(r11, r11, shiftRevCount);
4186+
__ orrw(r12, r10, r11);
4187+
__ strw(r12, __ post(newArr, 4));
4188+
4189+
__ BIND(ShiftOne);
4190+
__ ldrw(r10, Address(oldArr));
4191+
__ ldrw(r11, Address(oldArrNext));
4192+
__ lslvw(r10, r10, shiftCount);
4193+
__ lsrvw(r11, r11, shiftRevCount);
4194+
__ orrw(r12, r10, r11);
4195+
__ strw(r12, Address(newArr));
4196+
4197+
__ BIND(Exit);
4198+
__ ret(lr);
4199+
4200+
return start;
4201+
}
4202+
39714203
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
39724204
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
39734205
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@@ -6224,6 +6456,11 @@ class StubGenerator: public StubCodeGenerator {
62246456
StubRoutines::_mulAdd = generate_mulAdd();
62256457
}
62266458

6459+
if (UseSIMDForBigIntegerShiftIntrinsics) {
6460+
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
6461+
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
6462+
}
6463+
62276464
if (UseMontgomeryMultiplyIntrinsic) {
62286465
StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
62296466
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);

‎test/micro/org/openjdk/bench/java/math/BigIntegers.java

+36-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -34,6 +34,7 @@
3434
import org.openjdk.jmh.annotations.Scope;
3535
import org.openjdk.jmh.annotations.Setup;
3636
import org.openjdk.jmh.annotations.State;
37+
import org.openjdk.jmh.annotations.Param;
3738
import org.openjdk.jmh.infra.Blackhole;
3839

3940
import java.math.BigInteger;
@@ -45,11 +46,14 @@
4546
@State(Scope.Thread)
4647
public class BigIntegers {
4748

48-
private BigInteger[] hugeArray, largeArray, smallArray, shiftArray;
49+
private BigInteger[] hugeArray, largeArray, smallArray, shiftArray, smallShiftArray;
4950
public String[] dummyStringArray;
5051
public Object[] dummyArr;
5152
private static final int TESTSIZE = 1000;
5253

54+
@Param({"32", "64", "96", "128", "160", "192", "224", "256"})
55+
private int maxNumbits;
56+
5357
@Setup
5458
public void setup() {
5559
Random r = new Random(1123);
@@ -72,6 +76,9 @@ public void setup() {
7276
* Each array entry is at most 16k bits
7377
* in size
7478
*/
79+
smallShiftArray = new BigInteger[TESTSIZE]; /*
80+
* Small numbers, bit count in range [maxNumbits - 31, maxNumbits]
81+
*/
7582

7683
dummyStringArray = new String[TESTSIZE];
7784
dummyArr = new Object[TESTSIZE];
@@ -84,6 +91,7 @@ public void setup() {
8491
largeArray[i] = new BigInteger("" + ((long) value + (long) Integer.MAX_VALUE));
8592
smallArray[i] = new BigInteger("" + ((long) value / 1000));
8693
shiftArray[i] = new BigInteger(numbits, r);
94+
smallShiftArray[i] = new BigInteger(Math.max(maxNumbits - value % 32, 0), r);
8795
}
8896
}
8997

@@ -177,4 +185,30 @@ public void testRightShift(Blackhole bh) {
177185
}
178186
bh.consume(tmp);
179187
}
188+
189+
/** Invokes the shiftLeft method of small BigInteger with different values. */
190+
@Benchmark
191+
@OperationsPerInvocation(TESTSIZE)
192+
public void testSmallLeftShift(Blackhole bh) {
193+
Random rand = new Random();
194+
int shift = rand.nextInt(30) + 1;
195+
BigInteger tmp = null;
196+
for (BigInteger s : smallShiftArray) {
197+
tmp = s.shiftLeft(shift);
198+
bh.consume(tmp);
199+
}
200+
}
201+
202+
/** Invokes the shiftRight method of small BigInteger with different values. */
203+
@Benchmark
204+
@OperationsPerInvocation(TESTSIZE)
205+
public void testSmallRightShift(Blackhole bh) {
206+
Random rand = new Random();
207+
int shift = rand.nextInt(30) + 1;
208+
BigInteger tmp = null;
209+
for (BigInteger s : smallShiftArray) {
210+
tmp = s.shiftRight(shift);
211+
bh.consume(tmp);
212+
}
213+
}
180214
}

0 commit comments

Comments
 (0)
Please sign in to comment.