Skip to content

Commit 6dae52f

Browse files
e1iu (Ningsheng Jian) authored and Ningsheng Jian committed on Dec 9, 2021
8276985: AArch64: [vectorapi] Backend support of VectorMaskToLongNode
Reviewed-by: njian, aph
1 parent 08aad85 commit 6dae52f

6 files changed

+149
-0
lines changed
 

‎src/hotspot/cpu/aarch64/aarch64_neon.ad

+32
Original file line numberDiff line numberDiff line change
@@ -5617,3 +5617,35 @@ instruct vmask_lasttrue16B(iRegINoSp dst, vecX src) %{
56175617
%}
56185618
ins_pipe(pipe_slow);
56195619
%}
5620+
5621+
// Convert an 8-byte boolean vector mask (vecD: one byte per lane, each byte
// 0x00 or 0x01) into a long scalar holding one bit per lane in bits [7:0].
instruct vmask_tolong8B(iRegLNoSp dst, vecD src) %{
  match(Set dst (VectorMaskToLong src));
  ins_cost(5 * INSN_COST);
  format %{ "vmask_tolong $dst, $src\t# convert mask to long (8B)" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.

    // Move all 8 mask bytes into the 64-bit destination GPR, then compress
    // the least-significant bit of each byte down into bits [7:0]
    // (see C2_MacroAssembler::bytemask_compress).
    __ fmovd(as_Register($dst$$reg), as_FloatRegister($src$$reg));
    __ bytemask_compress(as_Register($dst$$reg));
  %}
  ins_pipe(pipe_slow);
%}
5634+
5635+
// Convert a 16-byte boolean vector mask (vecX: one byte per lane, each byte
// 0x00 or 0x01) into a long scalar holding one bit per lane in bits [15:0].
// Uses rscratch1 as a temporary for the high doubleword.
instruct vmask_tolong16B(iRegLNoSp dst, vecX src) %{
  match(Set dst (VectorMaskToLong src));
  ins_cost(11 * INSN_COST);
  format %{ "vmask_tolong $dst, $src\t# convert mask to long (16B)" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.

    // Extract the low (lane 0) and high (lane 1) 64-bit halves of the vector
    // into GPRs, compress each half's per-byte mask bits into 8 bits, then
    // merge the high half's result into bits [15:8] via an ORR with LSL #8.
    __ umov(as_Register($dst$$reg), as_FloatRegister($src$$reg), __ D, 0);
    __ umov(rscratch1, as_FloatRegister($src$$reg), __ D, 1);
    __ bytemask_compress(as_Register($dst$$reg));
    __ bytemask_compress(rscratch1);
    __ orr(as_Register($dst$$reg), as_Register($dst$$reg),
           rscratch1, Assembler::LSL, 8);
  %}
  ins_pipe(pipe_slow);
%}

‎src/hotspot/cpu/aarch64/aarch64_neon_ad.m4

+32
Original file line numberDiff line numberDiff line change
@@ -2481,3 +2481,35 @@ instruct vmask_lasttrue16B(iRegINoSp dst, vecX src) %{
24812481
%}
24822482
ins_pipe(pipe_slow);
24832483
%}
2484+
2485+
// m4 source for the generated aarch64_neon.ad rule of the same name:
// convert an 8-byte boolean vector mask (vecD: one byte per lane, each byte
// 0x00 or 0x01) into a long scalar holding one bit per lane in bits [7:0].
instruct vmask_tolong8B(iRegLNoSp dst, vecD src) %{
  match(Set dst (VectorMaskToLong src));
  ins_cost(5 * INSN_COST);
  format %{ "vmask_tolong $dst, $src\t# convert mask to long (8B)" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.

    // Move all 8 mask bytes into the 64-bit destination GPR, then compress
    // the least-significant bit of each byte down into bits [7:0]
    // (see C2_MacroAssembler::bytemask_compress).
    __ fmovd(as_Register($dst$$reg), as_FloatRegister($src$$reg));
    __ bytemask_compress(as_Register($dst$$reg));
  %}
  ins_pipe(pipe_slow);
%}
2498+
2499+
// m4 source for the generated aarch64_neon.ad rule of the same name:
// convert a 16-byte boolean vector mask (vecX: one byte per lane, each byte
// 0x00 or 0x01) into a long scalar holding one bit per lane in bits [15:0].
// Uses rscratch1 as a temporary for the high doubleword.
instruct vmask_tolong16B(iRegLNoSp dst, vecX src) %{
  match(Set dst (VectorMaskToLong src));
  ins_cost(11 * INSN_COST);
  format %{ "vmask_tolong $dst, $src\t# convert mask to long (16B)" %}
  ins_encode %{
    // Input "src" is a vector of boolean represented as
    // bytes with 0x00/0x01 as element values.

    // Extract the low (lane 0) and high (lane 1) 64-bit halves of the vector
    // into GPRs, compress each half's per-byte mask bits into 8 bits, then
    // merge the high half's result into bits [15:8] via an ORR with LSL #8.
    __ umov(as_Register($dst$$reg), as_FloatRegister($src$$reg), __ D, 0);
    __ umov(rscratch1, as_FloatRegister($src$$reg), __ D, 1);
    __ bytemask_compress(as_Register($dst$$reg));
    __ bytemask_compress(rscratch1);
    __ orr(as_Register($dst$$reg), as_Register($dst$$reg),
           rscratch1, Assembler::LSL, 8);
  %}
  ins_pipe(pipe_slow);
%}

‎src/hotspot/cpu/aarch64/aarch64_sve.ad

+16
Original file line numberDiff line numberDiff line change
@@ -5746,6 +5746,22 @@ instruct vmask_lasttrue_partial(iRegINoSp dst, pReg src, pReg ptmp, rFlagsReg cr
57465746
ins_pipe(pipe_slow);
57475747
%}
57485748

5749+
// SVE: convert a predicate mask register into a long scalar, one bit per
// lane. Restricted by the predicate to at most 64 lanes so every lane bit
// fits in the 64-bit long result.
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            n->in(1)->bottom_type()->is_vect()->length() <= 64);
  match(Set dst (VectorMaskToLong src));
  // Two vector temps plus a governing predicate temp are consumed by the
  // macro-assembler expansion; flags are killed.
  effect(TEMP vtmp1, TEMP vtmp2, TEMP pgtmp, KILL cr);
  ins_cost(13 * SVE_COST);
  format %{ "vmask_tolong $dst, $src\t# vector mask tolong (sve)" %}
  ins_encode %{
    // Delegates to C2_MacroAssembler::sve_vmask_tolong, which materializes
    // the predicate into a byte vector and compresses it 8 bytes at a time.
    // NOTE(review): that helper also clobbers rscratch1.
    __ sve_vmask_tolong(as_Register($dst$$reg), as_PRegister($src$$reg),
                        Matcher::vector_element_basic_type(this, $src),
                        Matcher::vector_length(this, $src),
                        as_FloatRegister($vtmp1$$reg), as_FloatRegister($vtmp2$$reg),
                        as_PRegister($pgtmp$$reg));
  %}
  ins_pipe(pipe_slow);
%}
57495765
// ---------------------------- Vector mask generation ---------------------------
57505766
instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
57515767
predicate(UseSVE > 0);

‎src/hotspot/cpu/aarch64/aarch64_sve_ad.m4

+17
Original file line numberDiff line numberDiff line change
@@ -3176,6 +3176,23 @@ instruct vmask_lasttrue_partial(iRegINoSp dst, pReg src, pReg ptmp, rFlagsReg cr
31763176
ins_pipe(pipe_slow);
31773177
%}
31783178

3179+
// m4 source for the generated aarch64_sve.ad rule of the same name.
// SVE: convert a predicate mask register into a long scalar, one bit per
// lane. Restricted by the predicate to at most 64 lanes so every lane bit
// fits in the 64-bit long result.
instruct vmask_tolong(iRegLNoSp dst, pReg src, vReg vtmp1, vReg vtmp2, pRegGov pgtmp, rFlagsReg cr) %{
  predicate(UseSVE > 0 &&
            n->in(1)->bottom_type()->is_vect()->length() <= 64);
  match(Set dst (VectorMaskToLong src));
  // Two vector temps plus a governing predicate temp are consumed by the
  // macro-assembler expansion; flags are killed.
  effect(TEMP vtmp1, TEMP vtmp2, TEMP pgtmp, KILL cr);
  ins_cost(13 * SVE_COST);
  format %{ "vmask_tolong $dst, $src\t# vector mask tolong (sve)" %}
  ins_encode %{
    // Delegates to C2_MacroAssembler::sve_vmask_tolong, which materializes
    // the predicate into a byte vector and compresses it 8 bytes at a time.
    __ sve_vmask_tolong(as_Register($dst$$reg), as_PRegister($src$$reg),
                        Matcher::vector_element_basic_type(this, $src),
                        Matcher::vector_length(this, $src),
                        as_FloatRegister($vtmp1$$reg), as_FloatRegister($vtmp2$$reg),
                        as_PRegister($pgtmp$$reg));
  %}
  ins_pipe(pipe_slow);
%}dnl
3195+
31793196
// ---------------------------- Vector mask generation ---------------------------
31803197
instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
31813198
predicate(UseSVE > 0);

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

+43
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,48 @@ void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegis
946946
}
947947
}
948948

949+
// Compress the least significant bit of each byte to the rightmost and clear
// the higher garbage bits.
// Precondition: every byte of dst is 0x00 or 0x01. Postcondition: bit i of
// dst equals the LSB of input byte i (i in [0, 7]); bits [63:8] are zero.
void C2_MacroAssembler::bytemask_compress(Register dst) {
  // Example input, dst = 0x01 00 00 00 01 01 00 01
  // The "??" bytes are garbage.
  // Each ORR folds the register onto a logically-right-shifted copy of
  // itself, so after shifts of 7, 14 and 28 the eight lane bits have been
  // gathered pairwise, then 4-wise, then 8-wise into the low byte.
  orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
  orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
  orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
  andr(dst, dst, 0xff);                   // dst = 0x8D, garbage bits cleared
}
959+
960+
// Pack the lowest-numbered bit of each mask element in src into a long value
961+
// in dst, at most the first 64 lane elements.
962+
// Clobbers: rscratch1
963+
void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
964+
FloatRegister vtmp1, FloatRegister vtmp2, PRegister pgtmp) {
965+
assert(pgtmp->is_governing(), "This register has to be a governing predicate register.");
966+
assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
967+
assert_different_registers(dst, rscratch1);
968+
969+
Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
970+
971+
// Pack the mask into vector with sequential bytes.
972+
sve_cpy(vtmp1, size, src, 1, false);
973+
if (bt != T_BYTE) {
974+
sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
975+
}
976+
977+
// Compress the lowest 8 bytes.
978+
fmovd(dst, vtmp1);
979+
bytemask_compress(dst);
980+
if (lane_cnt <= 8) return;
981+
982+
// Repeat on higher bytes and join the results.
983+
// Compress 8 bytes in each iteration.
984+
for (int idx = 1; idx < (lane_cnt / 8); idx++) {
985+
idx == 1 ? fmovhid(rscratch1, vtmp1) : sve_extract(rscratch1, D, pgtmp, vtmp1, idx);
986+
bytemask_compress(rscratch1);
987+
orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
988+
}
989+
}
990+
949991
void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
950992
FloatRegister zn, FloatRegister zm, int cond) {
951993
assert(pg->is_governing(), "This register has to be a governing predicate register");
@@ -1021,6 +1063,7 @@ void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst
10211063
FloatRegister src, SIMD_RegVariant src_size,
10221064
FloatRegister tmp) {
10231065
assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1066+
assert_different_registers(src, tmp);
10241067
sve_dup(tmp, src_size, 0);
10251068
if (src_size == D) {
10261069
switch (dst_size) {

‎src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp

+9
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,15 @@
5555
FloatRegister ztmp1, FloatRegister ztmp2,
5656
PRegister pgtmp, PRegister ptmp, bool isL);
5757

58+
// Compress the least significant bit of each byte to the rightmost and clear
59+
// the higher garbage bits.
60+
void bytemask_compress(Register dst);
61+
62+
// Pack the lowest-numbered bit of each mask element in src into a long value
63+
// in dst, at most the first 64 lane elements.
64+
void sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
65+
FloatRegister vtmp1, FloatRegister vtmp2, PRegister pgtmp);
66+
5867
// SIMD&FP comparison
5968
void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
6069
FloatRegister src2, int cond, bool isQ);

0 commit comments

Comments
 (0)
Please sign in to comment.