8255246: AArch64: Implement BigInteger shiftRight and shiftLeft accelerator/intrinsic

Dong Bo · RealFYang · commit 6b2d11ba243e · 2020-10-28T11:52:07.000Z
Reviewed-by: aph
diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp
@@ -93,6 +93,8 @@ define_pd_global(intx, InlineSmallCode,          1000);
           "Use SIMD instructions in generated array equals code")       \
   product(bool, UseSimpleArrayEquals, false,                            \
           "Use simpliest and shortest implementation for array equals") \
+  product(bool, UseSIMDForBigIntegerShiftIntrinsics, true,              \
+          "Use SIMD instructions for left/right shift of BigInteger")   \
   product(bool, AvoidUnalignedAccesses, false,                          \
           "Avoid generating unaligned memory accesses")                 \
   product(bool, UseLSE, false,                                          \
diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -3968,6 +3968,238 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  // Generates the "bigIntegerRightShiftWorker" stub. On 32-bit words, for 0 <= i < numIter:
+  //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
+  // Input:
+  //   c_rarg0   - newArr address (advanced by newIdx words before any store)
+  //   c_rarg1   - oldArr address (words oldArr[0] .. oldArr[numIter] are read)
+  //   c_rarg2   - newIdx
+  //   c_rarg3   - shiftCount (assumed to be in 1..31 - TODO confirm against the caller)
+  //   c_rarg4   - numIter (number of result words; 0 returns without touching memory)
+  // Writes rscratch1, rscratch2, r10-r14, v0-v4, c_rarg0 and c_rarg4 (not all on every path).
+  address generate_bigIntegerRightShift() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
+    address start = __ pc();
+
+    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
+
+    Register newArr        = c_rarg0;
+    Register oldArr        = c_rarg1;
+    Register newIdx        = c_rarg2;
+    Register shiftCount    = c_rarg3;
+    Register numIter       = c_rarg4;
+    Register idx           = numIter;   // alias: numIter is counted down in place
+
+    Register newArrCur     = rscratch1;
+    Register shiftRevCount = rscratch2;
+    Register oldArrCur     = r13;
+    Register oldArrNext    = r14;
+
+    FloatRegister oldElem0        = v0;
+    FloatRegister oldElem1        = v1;
+    FloatRegister newElem         = v2;
+    FloatRegister shiftVCount     = v3;
+    FloatRegister shiftVRevCount  = v4;
+
+    __ cbz(idx, Exit);   // numIter == 0: nothing to do
+
+    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);   // newArr += newIdx * 4 bytes
+
+    // shiftRevCount = 32 - shiftCount: the left shift applied to the lower-indexed word
+    __ movw(shiftRevCount, 32);
+    __ subw(shiftRevCount, shiftRevCount, shiftCount);
+
+    // numIter too small for the 4-word SIMD loop: use the scalar code at ShiftThree instead
+    __ cmp(numIter, (u1)4);
+    __ br(Assembler::LT, ShiftThree);
+
+    __ dup(shiftVCount,    __ T4S, shiftCount);      // broadcast shiftCount to all four lanes
+    __ dup(shiftVRevCount, __ T4S, shiftRevCount);   // broadcast 32 - shiftCount
+    __ negr(shiftVCount,   __ T4S, shiftVCount);     // negative lane count makes ushl shift right
+
+    __ BIND(ShiftSIMDLoop);
+
+    // Step idx down by 4 words and calculate the load/store addresses
+    __ sub(idx, idx, 4);
+    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);   // &oldArr[idx]
+    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);   // &newArr[idx] (newArr already offset)
+    __ add(oldArrCur,  oldArrNext, 4);                    // &oldArr[idx + 1]
+
+    // Load 4 words from each of the two overlapping windows and process
+    __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
+    __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
+    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);      // oldArr[idx+1 ..] >>> shiftCount
+    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);   // oldArr[idx ..] << (32 - shiftCount)
+    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
+    __ st1(newElem,   __ T4S,  Address(newArrCur));
+
+    __ cmp(idx, (u1)4);
+    __ br(Assembler::LT, ShiftTwoLoop);   // fewer than 4 words left: finish with the tail code
+    __ b(ShiftSIMDLoop);
+
+    __ BIND(ShiftTwoLoop);   // only reached from the SIMD loop, so v3/v4 hold the shift counts
+    __ cbz(idx, Exit);
+    __ cmp(idx, (u1)1);
+    __ br(Assembler::EQ, ShiftOne);   // a single word left: handled by the scalar code
+
+    // Step idx down by 2 words and calculate the load/store addresses
+    __ sub(idx, idx, 2);
+    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
+    __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
+    __ add(oldArrCur,  oldArrNext, 4);
+
+    // Load 2 words and process (same scheme as above on the low 64 bits of the vectors)
+    __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
+    __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
+    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
+    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
+    __ orr(newElem,   __ T8B, oldElem0, oldElem1);
+    __ st1(newElem,   __ T2S, Address(newArrCur));
+    __ b(ShiftTwoLoop);
+
+    __ BIND(ShiftThree);   // scalar path for numIter below 4; each case falls through to the next
+    __ tbz(idx, 1, ShiftOne);   // bit 1 clear: idx == 1
+    __ tbz(idx, 0, ShiftTwo);   // bit 0 clear: idx == 2; otherwise idx == 3, fall through
+    __ ldrw(r10,  Address(oldArr, 12));   // oldArr[3]
+    __ ldrw(r11,  Address(oldArr, 8));    // oldArr[2]
+    __ lsrvw(r10, r10, shiftCount);
+    __ lslvw(r11, r11, shiftRevCount);
+    __ orrw(r12,  r10, r11);
+    __ strw(r12,  Address(newArr, 8));    // result word 2
+
+    __ BIND(ShiftTwo);
+    __ ldrw(r10,  Address(oldArr, 8));    // oldArr[2]
+    __ ldrw(r11,  Address(oldArr, 4));    // oldArr[1]
+    __ lsrvw(r10, r10, shiftCount);
+    __ lslvw(r11, r11, shiftRevCount);
+    __ orrw(r12,  r10, r11);
+    __ strw(r12,  Address(newArr, 4));    // result word 1
+
+    __ BIND(ShiftOne);
+    __ ldrw(r10,  Address(oldArr, 4));    // oldArr[1]
+    __ ldrw(r11,  Address(oldArr));       // oldArr[0]
+    __ lsrvw(r10, r10, shiftCount);
+    __ lslvw(r11, r11, shiftRevCount);
+    __ orrw(r12,  r10, r11);
+    __ strw(r12,  Address(newArr));       // result word 0
+
+    __ BIND(Exit);
+    __ ret(lr);
+
+    return start;   // entry address of the generated stub
+  }
+
+  // Generates the "bigIntegerLeftShiftWorker" stub. On 32-bit words, for 0 <= i < numIter:
+  //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
+  // Input:
+  //   c_rarg0   - newArr address (advanced by newIdx words before any store)
+  //   c_rarg1   - oldArr address (words oldArr[0] .. oldArr[numIter] are read)
+  //   c_rarg2   - newIdx
+  //   c_rarg3   - shiftCount (assumed to be in 1..31 - TODO confirm against the caller)
+  //   c_rarg4   - numIter (number of result words; 0 returns without touching memory)
+  // Writes rscratch1, rscratch2, r10-r12, v0-v4, c_rarg0, c_rarg1 and c_rarg4 (path dependent).
+  address generate_bigIntegerLeftShift() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
+    address start = __ pc();
+
+    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
+
+    Register newArr        = c_rarg0;
+    Register oldArr        = c_rarg1;
+    Register newIdx        = c_rarg2;
+    Register shiftCount    = c_rarg3;
+    Register numIter       = c_rarg4;
+
+    Register shiftRevCount = rscratch1;
+    Register oldArrNext    = rscratch2;
+
+    FloatRegister oldElem0        = v0;
+    FloatRegister oldElem1        = v1;
+    FloatRegister newElem         = v2;
+    FloatRegister shiftVCount     = v3;
+    FloatRegister shiftVRevCount  = v4;
+
+    __ cbz(numIter, Exit);   // numIter == 0: nothing to do
+
+    __ add(oldArrNext, oldArr, 4);                       // &oldArr[1]
+    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);   // newArr += newIdx * 4 bytes
+
+    // shiftRevCount = 32 - shiftCount: the right shift applied to the higher-indexed word
+    __ movw(shiftRevCount, 32);
+    __ subw(shiftRevCount, shiftRevCount, shiftCount);
+
+    // numIter too small for the 4-word SIMD loop: use the scalar code at ShiftThree instead
+    __ cmp(numIter, (u1)4);
+    __ br(Assembler::LT, ShiftThree);
+
+    __ dup(shiftVCount,     __ T4S, shiftCount);       // broadcast shiftCount to all four lanes
+    __ dup(shiftVRevCount,  __ T4S, shiftRevCount);    // broadcast 32 - shiftCount
+    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);   // negative lane count makes ushl shift right
+
+    __ BIND(ShiftSIMDLoop);
+
+    // load 4 words from each of the two overlapping windows and process; pointers post-increment
+    __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
+    __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
+    __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);      // oldArr[i ..] << shiftCount
+    __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);   // oldArr[i+1 ..] >>> (32 - shiftCount)
+    __ orr(newElem,   __ T16B, oldElem0, oldElem1);
+    __ st1(newElem,   __ T4S,  __ post(newArr, 16));
+    __ sub(numIter,   numIter, 4);
+
+    __ cmp(numIter, (u1)4);
+    __ br(Assembler::LT, ShiftTwoLoop);   // fewer than 4 words left: finish with the tail code
+    __ b(ShiftSIMDLoop);
+
+    __ BIND(ShiftTwoLoop);   // only reached from the SIMD loop, so v3/v4 hold the shift counts
+    __ cbz(numIter, Exit);
+    __ cmp(numIter, (u1)1);
+    __ br(Assembler::EQ, ShiftOne);   // a single word left: handled by the scalar code
+
+    // load 2 words and process (same scheme as above on the low 64 bits of the vectors)
+    __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
+    __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
+    __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
+    __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
+    __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
+    __ st1(newElem,   __ T2S,  __ post(newArr, 8));
+    __ sub(numIter,   numIter, 2);
+    __ b(ShiftTwoLoop);
+
+    __ BIND(ShiftThree);   // scalar path for numIter below 4; the first word is always processed
+    __ ldrw(r10,  __ post(oldArr, 4));
+    __ ldrw(r11,  __ post(oldArrNext, 4));
+    __ lslvw(r10, r10, shiftCount);
+    __ lsrvw(r11, r11, shiftRevCount);
+    __ orrw(r12,  r10, r11);
+    __ strw(r12,  __ post(newArr, 4));
+    __ tbz(numIter, 1, Exit);       // numIter is unchanged here; bit 1 clear: numIter == 1, done
+    __ tbz(numIter, 0, ShiftOne);   // bit 0 clear: numIter == 2, one word left; else 3, fall through
+
+    __ BIND(ShiftTwo);   // entered by fall-through only: two words left
+    __ ldrw(r10,  __ post(oldArr, 4));
+    __ ldrw(r11,  __ post(oldArrNext, 4));
+    __ lslvw(r10, r10, shiftCount);
+    __ lsrvw(r11, r11, shiftRevCount);
+    __ orrw(r12,  r10, r11);
+    __ strw(r12,  __ post(newArr, 4));
+
+    __ BIND(ShiftOne);   // last word: no post-increment needed
+    __ ldrw(r10,  Address(oldArr));
+    __ ldrw(r11,  Address(oldArrNext));
+    __ lslvw(r10, r10, shiftCount);
+    __ lsrvw(r11, r11, shiftRevCount);
+    __ orrw(r12,  r10, r11);
+    __ strw(r12,  Address(newArr));
+
+    __ BIND(Exit);
+    __ ret(lr);
+
+    return start;   // entry address of the generated stub
+  }
+
   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@@ -6224,6 +6456,11 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
 
+    if (UseSIMDForBigIntegerShiftIntrinsics) {
+      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
+      StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
+    }
+
     if (UseMontgomeryMultiplyIntrinsic) {
       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
diff --git a/test/micro/org/openjdk/bench/java/math/BigIntegers.java b/test/micro/org/openjdk/bench/java/math/BigIntegers.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -34,6 +34,7 @@
 import org.openjdk.jmh.annotations.Scope;
 import org.openjdk.jmh.annotations.Setup;
 import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Param;
 import org.openjdk.jmh.infra.Blackhole;
 
 import java.math.BigInteger;
@@ -45,11 +46,14 @@
 @State(Scope.Thread)
 public class BigIntegers {
 
-    private BigInteger[] hugeArray, largeArray, smallArray, shiftArray;
+    private BigInteger[] hugeArray, largeArray, smallArray, shiftArray, smallShiftArray;
     public String[] dummyStringArray;
     public Object[] dummyArr;
     private static final int TESTSIZE = 1000;
 
+    @Param({"32", "64", "96", "128", "160", "192", "224", "256"})
+    private int maxNumbits;
+
     @Setup
     public void setup() {
         Random r = new Random(1123);
@@ -72,6 +76,9 @@ public void setup() {
          * Each array entry is atmost 16k bits
          * in size
          */
+        smallShiftArray = new BigInteger[TESTSIZE]; /*
+        * Small numbers, each generated with a bit count in the range [maxNumbits - 31, maxNumbits]
+        */
 
         dummyStringArray = new String[TESTSIZE];
         dummyArr = new Object[TESTSIZE];
@@ -84,6 +91,7 @@ public void setup() {
             largeArray[i] = new BigInteger("" + ((long) value + (long) Integer.MAX_VALUE));
             smallArray[i] = new BigInteger("" + ((long) value / 1000));
             shiftArray[i] = new BigInteger(numbits, r);
+            smallShiftArray[i] = new BigInteger(Math.max(maxNumbits - value % 32, 0), r);
         }
     }
 
@@ -177,4 +185,30 @@ public void testRightShift(Blackhole bh) {
         }
         bh.consume(tmp);
     }
+
+    /** Invokes shiftLeft on every entry of smallShiftArray, using one random shift count in [1, 30] per invocation. */
+    @Benchmark
+    @OperationsPerInvocation(TESTSIZE)
+    public void testSmallLeftShift(Blackhole bh) {
+        Random rand = new Random();         // not explicitly seeded, unlike the Random(1123) used in setup()
+        int shift = rand.nextInt(30) + 1;   // nextInt(30) yields 0..29, so shift is 1..30
+        BigInteger tmp = null;
+        for (BigInteger s : smallShiftArray) {
+            tmp = s.shiftLeft(shift);
+            bh.consume(tmp);                // every result is handed to the Blackhole
+        }
+    }
+
+    /** Invokes shiftRight on every entry of smallShiftArray, using one random shift count in [1, 30] per invocation. */
+    @Benchmark
+    @OperationsPerInvocation(TESTSIZE)
+    public void testSmallRightShift(Blackhole bh) {
+        Random rand = new Random();         // not explicitly seeded, unlike the Random(1123) used in setup()
+        int shift = rand.nextInt(30) + 1;   // nextInt(30) yields 0..29, so shift is 1..30
+        BigInteger tmp = null;
+        for (BigInteger s : smallShiftArray) {
+            tmp = s.shiftRight(shift);
+            bh.consume(tmp);                // every result is handed to the Blackhole
+        }
+    }
 }