Skip to content

Commit e9f45bb

Browse files
e1iuXiaohong Gong
authored and
Xiaohong Gong
committedMay 12, 2022
8282966: AArch64: Optimize VectorMask.toLong with SVE2
Reviewed-by: xgong, ngasson
1 parent 57a7670 commit e9f45bb

File tree

7 files changed

+116
-52
lines changed

7 files changed

+116
-52
lines changed
 

‎src/hotspot/cpu/aarch64/aarch64_sve.ad

+3-2
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ source %{
149149
case Op_LoadVector:
150150
case Op_StoreVector:
151151
return Matcher::vector_size_supported(bt, vlen);
152+
case Op_VectorMaskToLong:
153+
if (vlen > 64) return false;
152154
default:
153155
break;
154156
}
@@ -5487,8 +5489,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
54875489
%}
54885490

54895491
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
5490-
predicate(UseSVE > 0 &&
5491-
n->in(1)->bottom_type()->is_vect()->length() <= 64);
5492+
predicate(UseSVE > 0);
54925493
match(Set dst (VectorMaskToLong src));
54935494
effect(TEMP vtmp1, TEMP vtmp2);
54945495
ins_cost(13 * SVE_COST);

‎src/hotspot/cpu/aarch64/aarch64_sve_ad.m4

+3-2
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,8 @@ source %{
144144
case Op_LoadVector:
145145
case Op_StoreVector:
146146
return Matcher::vector_size_supported(bt, vlen);
147+
case Op_VectorMaskToLong:
148+
if (vlen > 64) return false;
147149
default:
148150
break;
149151
}
@@ -3055,8 +3057,7 @@ instruct vmask_lasttrue(iRegINoSp dst, pReg src, pReg ptmp) %{
30553057
%}
30563058

30573059
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2) %{
3058-
predicate(UseSVE > 0 &&
3059-
n->in(1)->bottom_type()->is_vect()->length() <= 64);
3060+
predicate(UseSVE > 0);
30603061
match(Set dst (VectorMaskToLong src));
30613062
effect(TEMP vtmp1, TEMP vtmp2);
30623063
ins_cost(13 * SVE_COST);

‎src/hotspot/cpu/aarch64/assembler_aarch64.hpp

+13
Original file line numberDiff line numberDiff line change
@@ -3819,6 +3819,19 @@ void sve_fcm(Condition cond, PRegister Pd, SIMD_RegVariant T,
38193819
f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0);
38203820
}
38213821

3822+
// SVE2 bitwise permute
3823+
#define INSN(NAME, opc) \
3824+
void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \
3825+
starti; \
3826+
assert(T != Q, "invalid size"); \
3827+
f(0b01000101, 31, 24), f(T, 23, 22), f(0b0, 21); \
3828+
rf(Zm, 16), f(0b1011, 15, 12), f(opc, 11, 10); \
3829+
rf(Zn, 5), rf(Zd, 0); \
3830+
}
3831+
3832+
INSN(sve_bext, 0b00);
3833+
#undef INSN
3834+
38223835
Assembler(CodeBuffer* code) : AbstractAssembler(code) {
38233836
}
38243837

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

+55-13
Original file line numberDiff line numberDiff line change
@@ -958,32 +958,74 @@ void C2_MacroAssembler::bytemask_compress(Register dst) {
958958

959959
// Pack the lowest-numbered bit of each mask element in src into a long value
960960
// in dst, at most the first 64 lane elements.
961-
// Clobbers: rscratch1
961+
// Clobbers: rscratch1 if hardware doesn't support FEAT_BITPERM.
962962
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
963963
FloatRegister vtmp1, FloatRegister vtmp2) {
964964
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
965965
assert_different_registers(dst, rscratch1);
966966
assert_different_registers(vtmp1, vtmp2);
967967

968968
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
969+
// Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
970+
// Expected: dst = 0x658D
969971

970-
// Pack the mask into vector with sequential bytes.
972+
// Convert the mask into a vector with sequential bytes.
973+
// vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
971974
sve_cpy(vtmp1, size, src, 1, false);
972975
if (bt != T_BYTE) {
973976
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
974977
}
975978

976-
// Compress the lowest 8 bytes.
977-
fmovd(dst, vtmp1);
978-
bytemask_compress(dst);
979-
if (lane_cnt <= 8) return;
980-
981-
// Repeat on higher bytes and join the results.
982-
// Compress 8 bytes in each iteration.
983-
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
984-
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
985-
bytemask_compress(rscratch1);
986-
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
979+
if (UseSVE > 0 && !VM_Version::supports_svebitperm()) {
980+
// Compress the lowest 8 bytes.
981+
fmovd(dst, vtmp1);
982+
bytemask_compress(dst);
983+
if (lane_cnt <= 8) return;
984+
985+
// Repeat on higher bytes and join the results.
986+
// Compress 8 bytes in each iteration.
987+
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
988+
sve_extract_integral(rscratch1, D, vtmp1, idx, /* is_signed */ false, vtmp2);
989+
bytemask_compress(rscratch1);
990+
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
991+
}
992+
} else if (UseSVE == 2 && VM_Version::supports_svebitperm()) {
993+
// Given a vector with value 0x00 or 0x01 in each byte, the basic idea
994+
// is to compress each significant bit of the byte in a cross-lane way. Due
995+
// to the lack of a cross-lane bit-compress instruction, here we use BEXT
996+
// (bit-compress in each lane) with the biggest lane size (T = D) and
997+
// concatenate the results afterwards.
998+
999+
// The second source input of BEXT, initialized with 0x01 in each byte.
1000+
// vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1001+
sve_dup(vtmp2, B, 1);
1002+
1003+
// BEXT vtmp1.D, vtmp1.D, vtmp2.D
1004+
// vtmp1 = 0x0001010000010001 | 0x0100000001010001
1005+
// vtmp2 = 0x0101010101010101 | 0x0101010101010101
1006+
// ---------------------------------------
1007+
// vtmp1 = 0x0000000000000065 | 0x000000000000008D
1008+
sve_bext(vtmp1, D, vtmp1, vtmp2);
1009+
1010+
// Concatenate the least significant 8 bits of each 8-byte lane, and extract the
1011+
// result to dst.
1012+
// vtmp1 = 0x0000000000000000 | 0x000000000000658D
1013+
// dst = 0x658D
1014+
if (lane_cnt <= 8) {
1015+
// No need to concatenate.
1016+
umov(dst, vtmp1, B, 0);
1017+
} else if (lane_cnt <= 16) {
1018+
ins(vtmp1, B, vtmp1, 1, 8);
1019+
umov(dst, vtmp1, H, 0);
1020+
} else {
1021+
// As the lane count is 64 at most, the final expected value must be in
1022+
// the lowest 64 bits after narrowing vtmp1 from D to B.
1023+
sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1024+
umov(dst, vtmp1, D, 0);
1025+
}
1026+
} else {
1027+
assert(false, "unsupported");
1028+
ShouldNotReachHere();
9871029
}
9881030
}
9891031

‎src/hotspot/cpu/aarch64/register_aarch64.hpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2000, 2022, Oracle and/or its affiliates. All rights reserved.
33
* Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*
@@ -275,6 +275,9 @@ class PRegisterImpl: public AbstractRegisterImpl {
275275
REGISTER_IMPL_DECLARATION(PRegister, PRegisterImpl, PRegisterImpl::number_of_registers);
276276

277277
// The predicate registers of SVE.
278+
//
279+
CONSTANT_REGISTER_DECLARATION(PRegister, pnoreg, (-1));
280+
278281
CONSTANT_REGISTER_DECLARATION(PRegister, p0, ( 0));
279282
CONSTANT_REGISTER_DECLARATION(PRegister, p1, ( 1));
280283
CONSTANT_REGISTER_DECLARATION(PRegister, p2, ( 2));

‎test/hotspot/gtest/aarch64/aarch64-asmtest.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1894,6 +1894,8 @@ def generate(kind, names):
18941894
["bic", "ZZZ"],
18951895
["uzp1", "ZZZ"],
18961896
["uzp2", "ZZZ"],
1897+
# SVE2 instructions
1898+
["bext", "ZZZ"],
18971899
])
18981900

18991901
generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
@@ -1904,8 +1906,9 @@ def generate(kind, names):
19041906

19051907
outfile.close()
19061908

1907-
# compile for sve with 8.3 and sha3 because of SHA3 crypto extension.
1908-
subprocess.check_call([AARCH64_AS, "-march=armv8.3-a+sha3+sve", "aarch64ops.s", "-o", "aarch64ops.o"])
1909+
# compile for sve with armv9-a+sha3+sve2-bitperm because of SHA3 crypto extension and SVE2 bitperm instructions.
1910+
# armv9-a enables sve and sve2 by default.
1911+
subprocess.check_call([AARCH64_AS, "-march=armv9-a+sha3+sve2-bitperm", "aarch64ops.s", "-o", "aarch64ops.o"])
19091912

19101913
print
19111914
print "/*"

‎test/hotspot/gtest/aarch64/asmtest.out.h

+33-32
Original file line numberDiff line numberDiff line change
@@ -1183,17 +1183,18 @@
11831183
__ sve_bic(z8, z2, z0); // bic z8.d, z2.d, z0.d
11841184
__ sve_uzp1(z23, __ S, z22, z0); // uzp1 z23.s, z22.s, z0.s
11851185
__ sve_uzp2(z25, __ H, z26, z23); // uzp2 z25.h, z26.h, z23.h
1186+
__ sve_bext(z21, __ B, z21, z1); // bext z21.b, z21.b, z1.b
11861187

11871188
// SVEReductionOp
1188-
__ sve_andv(v21, __ B, p5, z1); // andv b21, p5, z1.b
1189-
__ sve_orv(v10, __ S, p5, z11); // orv s10, p5, z11.s
1190-
__ sve_eorv(v23, __ D, p6, z8); // eorv d23, p6, z8.d
1191-
__ sve_smaxv(v17, __ S, p5, z19); // smaxv s17, p5, z19.s
1192-
__ sve_sminv(v4, __ D, p5, z13); // sminv d4, p5, z13.d
1193-
__ sve_fminv(v22, __ D, p7, z30); // fminv d22, p7, z30.d
1194-
__ sve_fmaxv(v17, __ S, p4, z14); // fmaxv s17, p4, z14.s
1195-
__ sve_fadda(v12, __ S, p7, z20); // fadda s12, p7, s12, z20.s
1196-
__ sve_uaddv(v1, __ B, p3, z13); // uaddv d1, p3, z13.b
1189+
__ sve_andv(v10, __ S, p5, z11); // andv s10, p5, z11.s
1190+
__ sve_orv(v23, __ D, p6, z8); // orv d23, p6, z8.d
1191+
__ sve_eorv(v17, __ S, p5, z19); // eorv s17, p5, z19.s
1192+
__ sve_smaxv(v4, __ D, p5, z13); // smaxv d4, p5, z13.d
1193+
__ sve_sminv(v22, __ D, p7, z30); // sminv d22, p7, z30.d
1194+
__ sve_fminv(v17, __ S, p4, z14); // fminv s17, p4, z14.s
1195+
__ sve_fmaxv(v12, __ S, p7, z20); // fmaxv s12, p7, z20.s
1196+
__ sve_fadda(v1, __ S, p3, z13); // fadda s1, p3, s1, z13.s
1197+
__ sve_uaddv(v7, __ S, p2, z11); // uaddv d7, p2, z11.s
11971198

11981199
__ bind(forth);
11991200

@@ -1212,30 +1213,30 @@
12121213
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
12131214
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
12141215
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
1215-
0x14000000, 0x17ffffd7, 0x140003e4, 0x94000000,
1216-
0x97ffffd4, 0x940003e1, 0x3400000a, 0x34fffa2a,
1217-
0x34007bca, 0x35000008, 0x35fff9c8, 0x35007b68,
1218-
0xb400000b, 0xb4fff96b, 0xb4007b0b, 0xb500001d,
1219-
0xb5fff91d, 0xb5007abd, 0x10000013, 0x10fff8b3,
1220-
0x10007a53, 0x90000013, 0x36300016, 0x3637f836,
1221-
0x363079d6, 0x3758000c, 0x375ff7cc, 0x3758796c,
1216+
0x14000000, 0x17ffffd7, 0x140003e5, 0x94000000,
1217+
0x97ffffd4, 0x940003e2, 0x3400000a, 0x34fffa2a,
1218+
0x34007bea, 0x35000008, 0x35fff9c8, 0x35007b88,
1219+
0xb400000b, 0xb4fff96b, 0xb4007b2b, 0xb500001d,
1220+
0xb5fff91d, 0xb5007add, 0x10000013, 0x10fff8b3,
1221+
0x10007a73, 0x90000013, 0x36300016, 0x3637f836,
1222+
0x363079f6, 0x3758000c, 0x375ff7cc, 0x3758798c,
12221223
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
12231224
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
12241225
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
12251226
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
1226-
0x54007740, 0x54000001, 0x54fff541, 0x540076e1,
1227-
0x54000002, 0x54fff4e2, 0x54007682, 0x54000002,
1228-
0x54fff482, 0x54007622, 0x54000003, 0x54fff423,
1229-
0x540075c3, 0x54000003, 0x54fff3c3, 0x54007563,
1230-
0x54000004, 0x54fff364, 0x54007504, 0x54000005,
1231-
0x54fff305, 0x540074a5, 0x54000006, 0x54fff2a6,
1232-
0x54007446, 0x54000007, 0x54fff247, 0x540073e7,
1233-
0x54000008, 0x54fff1e8, 0x54007388, 0x54000009,
1234-
0x54fff189, 0x54007329, 0x5400000a, 0x54fff12a,
1235-
0x540072ca, 0x5400000b, 0x54fff0cb, 0x5400726b,
1236-
0x5400000c, 0x54fff06c, 0x5400720c, 0x5400000d,
1237-
0x54fff00d, 0x540071ad, 0x5400000e, 0x54ffefae,
1238-
0x5400714e, 0x5400000f, 0x54ffef4f, 0x540070ef,
1227+
0x54007760, 0x54000001, 0x54fff541, 0x54007701,
1228+
0x54000002, 0x54fff4e2, 0x540076a2, 0x54000002,
1229+
0x54fff482, 0x54007642, 0x54000003, 0x54fff423,
1230+
0x540075e3, 0x54000003, 0x54fff3c3, 0x54007583,
1231+
0x54000004, 0x54fff364, 0x54007524, 0x54000005,
1232+
0x54fff305, 0x540074c5, 0x54000006, 0x54fff2a6,
1233+
0x54007466, 0x54000007, 0x54fff247, 0x54007407,
1234+
0x54000008, 0x54fff1e8, 0x540073a8, 0x54000009,
1235+
0x54fff189, 0x54007349, 0x5400000a, 0x54fff12a,
1236+
0x540072ea, 0x5400000b, 0x54fff0cb, 0x5400728b,
1237+
0x5400000c, 0x54fff06c, 0x5400722c, 0x5400000d,
1238+
0x54fff00d, 0x540071cd, 0x5400000e, 0x54ffefae,
1239+
0x5400716e, 0x5400000f, 0x54ffef4f, 0x5400710f,
12391240
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
12401241
0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f,
12411242
0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf,
@@ -1459,8 +1460,8 @@
14591460
0x65f1af0b, 0x65eec9f1, 0x65a7fed6, 0x65aa5f65,
14601461
0x65b47aae, 0x04c55723, 0x0441723d, 0x042d33ae,
14611462
0x04be3051, 0x047d32b6, 0x04e03048, 0x05a06ad7,
1462-
0x05776f59, 0x041a3435, 0x0498356a, 0x04d93917,
1463-
0x04883671, 0x04ca35a4, 0x65c73fd6, 0x658631d1,
1464-
0x65983e8c, 0x04012da1,
1463+
0x05776f59, 0x4501b2b5, 0x049a356a, 0x04d83917,
1464+
0x04993671, 0x04c835a4, 0x04ca3fd6, 0x658731d1,
1465+
0x65863e8c, 0x65982da1, 0x04812967,
14651466
};
14661467
// END Generated code -- do not edit

1 commit comments

Comments
 (1)

openjdk-notifier[bot] commented on May 12, 2022

@openjdk-notifier[bot]
Please sign in to comment.