Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8279508: Auto-vectorize Math.round API #7094

Closed
wants to merge 23 commits into from
Closed
Changes from 2 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
0fe0150
8279508: Auto-vectorize Math.round API
Jan 14, 2022
575d293
8279508: Adding a test for scalar intrinsification.
Jan 19, 2022
d610bd6
Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8279508
Feb 12, 2022
2dc364f
8279508: Adding vectorized algorithms to match the semantics of round…
Feb 12, 2022
1c9ff77
8279508: Replacing by efficient instruction sequence based on MXCSR.R…
Feb 15, 2022
2f55569
Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8279508
Feb 15, 2022
73674fe
8279508: Adding few descriptive comments.
Feb 16, 2022
f35ed9c
8279508: Fixing for windows failure.
Feb 17, 2022
6c869c7
8279508: Review comments resolved.
Feb 22, 2022
f7dec3d
8279508: Review comments resolved.
jatin-bhateja Feb 24, 2022
54d4ea3
8279508: Adding descriptive comments.
Feb 24, 2022
3b90ae5
8279508: Review comments resolved.
Mar 1, 2022
57b1b13
8279508: Removing +LogCompilation flag.
Mar 1, 2022
bf1532f
Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8279508
Mar 8, 2022
547f4e3
8279508: Preventing domain switch-over penalty for Math.round(float) …
Mar 8, 2022
fcb7321
8279508: Review comments resolution.
Mar 10, 2022
2519a58
8279508: Reducing the invocation count and compile thresholds for Rou…
Mar 11, 2022
e4d4e29
8279508: Creating separate test for round double under feature check.
Mar 12, 2022
c881d11
8279508: Styling comments resolved.
Mar 12, 2022
b1323a8
8279508: Windows build failure fix.
Mar 12, 2022
962d751
Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8279508
Mar 18, 2022
c17440c
8279508: Using an explicit scratch register since rscratch1 is bound …
Mar 18, 2022
621bd69
8279508: Removing redundant test point.
Apr 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -6464,6 +6464,21 @@ void Assembler::vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vec
}


// Emit VROUNDPS: round packed single-precision values in 'src' into 'dst'
// using the rounding mode given by the 'rmode' immediate.
// Encoded with the legacy VEX prefix (66 0F 3A /0x08), so it is available on
// any AVX-capable target; the EVEX counterpart is vrndscaleps() below.
void Assembler::vroundps(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len) {
assert(VM_Version::supports_avx(), "");
// legacy_mode=true forces the VEX (non-EVEX) encoding; uses_vl=false since
// this legacy form does not take an EVEX vector-length qualifier.
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
// opcode byte, ModRM (0xC0 = register-direct form), then the rounding-mode immediate
emit_int24(0x08, (0xC0 | encode), (rmode));
}

// Emit VRNDSCALEPS: EVEX-encoded rounding of packed single-precision values
// in 'src' into 'dst' per the 'rmode' immediate. Requires AVX-512 (EVEX);
// use vroundps() above on plain AVX targets.
void Assembler::vrndscaleps(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
// opcode byte, ModRM (0xC0 = register-direct form), then the rounding-mode immediate
emit_int24(0x08, (0xC0 | encode), (rmode));
}

void Assembler::vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attributes(vector_len, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
4 changes: 4 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
@@ -2249,6 +2249,10 @@ class Assembler : public AbstractAssembler {
void vrndscalepd(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
void vrndscalepd(XMMRegister dst, Address src, int32_t rmode, int vector_len);

// Round Packed Single precision value.
void vroundps(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);
void vrndscaleps(XMMRegister dst, XMMRegister src, int32_t rmode, int vector_len);

// Bitwise Logical AND of Packed Floating-Point Values
void andpd(XMMRegister dst, XMMRegister src);
void andps(XMMRegister dst, XMMRegister src);
236 changes: 184 additions & 52 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -4013,51 +4013,11 @@ void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
}
}

/*
* Algorithm for vector D2L and F2I conversions:-
* a) Perform vector D2L/F2I cast.
* b) Choose fast path if none of the result vector lane contains 0x80000000 value.
* It signifies that source value could be any of the special floating point
* values(NaN,-Inf,Inf,Max,-Min).
* c) Set destination to zero if source is NaN value.
* d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
*/

void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
Register scratch, int vec_enc, bool roundD) {
void C2_MacroAssembler::vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This special handling is really large, could we use a stub routine for it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion, but as of now we are not using vector calling conventions for stubs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this comment. If the stub is only to be used by you, then you can determine your own calling convention.

Copy link
Member Author

@jatin-bhateja jatin-bhateja Mar 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are passing mixture of scalar, vector and opmask register to special handling function, only way we can pass them reliably to callee stub without having an elaborate mixed calling convention will be by bounding the machine operands.

XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
Register scratch, AddressLiteral float_sign_flip,
int vec_enc) {
Label done;
if (roundD) {
evcvtpd2qq(dst, src, vec_enc);
} else {
evcvttpd2qq(dst, src, vec_enc);
}
evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);
evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
kortestwl(ktmp1, ktmp1);
jccb(Assembler::equal, done);

vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

kxorwl(ktmp1, ktmp1, ktmp2);
evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
bind(done);
}

void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
AddressLiteral float_sign_flip, Register scratch, int vec_enc,
bool roundF) {
Label done;
if (roundF) {
vcvtps2dq(dst, src, vec_enc);
} else {
vcvttps2dq(dst, src, vec_enc);
}
vmovdqu(xtmp1, float_sign_flip, scratch, vec_enc);
vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
vptest(xtmp2, xtmp2, vec_enc);
@@ -4082,15 +4042,11 @@ void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMM
bind(done);
}

void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
Register scratch, int vec_enc, bool roundF) {
void C2_MacroAssembler::vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
Register scratch, AddressLiteral float_sign_flip,
int vec_enc) {
Label done;
if (roundF) {
vcvtps2dq(dst, src, vec_enc);
} else {
vcvttps2dq(dst, src, vec_enc);
}
evmovdqul(xtmp1, k0, float_sign_flip, false, vec_enc, scratch);
Assembler::evpcmpeqd(ktmp1, k0, xtmp1, dst, vec_enc);
kortestwl(ktmp1, ktmp1);
@@ -4107,6 +4063,182 @@ void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XM
bind(done);
}

// Post-conversion fixup for vector D2L casts (see the D2L/F2I algorithm
// comment below): lanes of 'dst' equal to the sentinel value loaded from
// 'double_sign_flip' (presumably 0x8000000000000000 — the value CVTPD2QQ
// produces for unrepresentable inputs; confirm against the constant's
// definition) came from special sources and are patched to match Java
// semantics: 0 for NaN, Long.MAX_VALUE for positive out-of-range values.
void C2_MacroAssembler::vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2,
Register scratch, AddressLiteral double_sign_flip,
int vec_enc) {
Label done;
// ktmp1 = lanes where the conversion produced the sentinel value.
evmovdqul(xtmp1, k0, double_sign_flip, false, vec_enc, scratch);
evpcmpeqq(ktmp1, xtmp1, dst, vec_enc);
kortestwl(ktmp1, ktmp1);
// Fast path: no special lanes, nothing to patch.
jccb(Assembler::equal, done);

// NaN sources (src unordered with itself) become 0.
vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
evcmppd(ktmp2, k0, src, src, Assembler::UNORD_Q, vec_enc);
evmovdquq(dst, ktmp2, xtmp2, true, vec_enc);

// Remaining special lanes (sentinel but not NaN) that are >= 0.0 get the
// bitwise complement of the sentinel (ternlog 0x11 = NOT of the operand),
// i.e. the max-long bit pattern; negative overflows keep the sentinel.
kxorwl(ktmp1, ktmp1, ktmp2);
evcmppd(ktmp1, ktmp1, src, xtmp2, Assembler::NLT_UQ, vec_enc);
vpternlogq(xtmp2, 0x11, xtmp1, xtmp1, vec_enc);
evmovdquq(dst, ktmp1, xtmp2, true, vec_enc);
bind(done);
}

/*
* Algorithm for vector D2L and F2I conversions:-
* a) Perform vector D2L/F2I cast.
* b) Choose fast path if none of the result vector lane contains 0x80000000 value.
* It signifies that source value could be any of the special floating point
* values(NaN,-Inf,Inf,Max,-Min).
* c) Set destination to zero if source is NaN value.
* d) Replace 0x80000000 with MaxInt if source lane contains a +ve value.
*/

// Vector double -> long cast: truncating conversion (evcvttpd2qq rounds
// toward zero) followed by the special-case fixup described above.
void C2_MacroAssembler::vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
Register scratch, int vec_enc) {
evcvttpd2qq(dst, src, vec_enc);
vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, double_sign_flip, vec_enc);
}

// Vector float -> int cast for AVX targets: truncating conversion
// (vcvttps2dq rounds toward zero), then patch lanes whose source was a
// special value (NaN / out of int range) via the shared fixup helper.
void C2_MacroAssembler::vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
vcvttps2dq(dst, src, vec_enc);
vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp3, xtmp4, scratch, float_sign_flip, vec_enc);
}

// Vector float -> int cast for AVX-512 targets: truncating conversion
// (vcvttps2dq rounds toward zero), then patch special-value lanes using
// opmask registers via the EVEX fixup helper.
void C2_MacroAssembler::vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
Register scratch, int vec_enc) {
vcvttps2dq(dst, src, vec_enc);
vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp1, ktmp2, scratch, float_sign_flip, vec_enc);
}

#ifdef _LP64
// Vectorized Math.round(double) for AVX-512: lanes with small exponents are
// rounded in the integer domain by bit-twiddling the significand (avoiding
// any MXCSR rounding-mode switch); remaining lanes use evcvtpd2qq plus the
// standard special-case fixup, and the two results are blended at the end.
void C2_MacroAssembler::vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
XMMRegister xtmp3, KRegister ktmp1, KRegister ktmp2, KRegister ktmp3,
AddressLiteral double_sign_flip, Register scratch, int vec_enc) {
// Following assembly snippet is vectorized translation of Math.round(double) algorithm
// for AVX512 target.
// Keep a working copy of the source bits; xtmp1 becomes the significand below.
evmovdquq(xtmp1, k0, src, true, vec_enc);
// Biased exponent per lane: (bits & 0x7ff0000000000000) >> 52.
movptr(scratch, 0x7ff0000000000000L);
evpbroadcastq(xtmp2, scratch, vec_enc);
evpandq(xtmp2, k0, xtmp2, xtmp1, true, vec_enc);
Assembler::evpsraq(xtmp2, k0, xtmp2, 0x34, true, vec_enc);
// Per-lane right-shift count: 0x432 (1074) - exponent.
mov64(scratch, 0x432);
evpbroadcastq(dst, scratch, vec_enc);
vpsubq(dst, dst, xtmp2, vec_enc);
evmovdquq(xtmp3, k0, dst, true, vec_enc);
// ktmp1 = lanes whose shift count lies in [0, 63] (high bits masked by
// 0xffffffffffffffc0 are zero); only those take the bit-twiddling path.
mov64(scratch, 0xffffffffffffffc0L);
evpbroadcastq(xtmp2, scratch, vec_enc);
evpandq(xtmp2, k0, dst, xtmp2, true, vec_enc);
vpxor(dst, dst, dst, vec_enc);
Assembler::evpcmpeqq(ktmp1, xtmp2, dst, vec_enc);
// Reconstruct the significand: (bits & 0xfffffffffffff) | 0x10000000000000
// (ternlog immediate 0xea computes C | (A & B)).
mov64(scratch, 0xfffffffffffffL);
evpbroadcastq(xtmp2, scratch, vec_enc);
mov64(scratch, 0x10000000000000L);
evpbroadcastq(dst, scratch, vec_enc);
evpternlogq(xtmp1, 0xea, k0, xtmp2, dst, true, vec_enc);
// Negate the significand on bit-twiddling lanes whose source is negative.
vpxor(dst, dst, dst, vec_enc);
evpcmpq(ktmp2, k0, src, dst, Assembler::lt, true, vec_enc);
kandwl(ktmp2, ktmp2, ktmp1);
evpsubq(xtmp1, ktmp2, dst, xtmp1, true, vec_enc);
// Variable arithmetic shift by the per-lane count, add 1, then shift right
// by one more — realizes the +0.5-then-floor rounding in integer arithmetic.
evpsravq(xtmp1, ktmp1, xtmp1, xtmp3, true, vec_enc);
mov64(scratch, 0x1);
evpbroadcastq(xtmp3, scratch, vec_enc);
evpaddq(xtmp1, ktmp1, xtmp1, xtmp3, true, vec_enc);
evpsravq(xtmp3, ktmp1, xtmp1, xtmp3, true, vec_enc);
// Fast path for the remaining lanes: convert, then fix NaN/overflow lanes.
evcvtpd2qq(dst, src, vec_enc);
vector_cast_double_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp2, ktmp3, scratch, double_sign_flip, vec_enc);
// Blend: bit-twiddled result for ktmp1 lanes, converted result otherwise.
evpblendmq(dst, ktmp1, dst, xtmp3, true, vec_enc);
}

// Vectorized Math.round(float) for AVX-512: lanes with small exponents are
// rounded in the integer domain by bit-twiddling the significand (avoiding
// any MXCSR rounding-mode switch); remaining lanes use vcvtps2dq plus the
// standard special-case fixup, and the two results are blended at the end.
// Fix: hex literals normalized to lowercase '0x' prefix for consistency
// with the rest of the file (was 0XFFFFFFE0, 0X007FFFFF, 0X00800000).
void C2_MacroAssembler::vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
XMMRegister xtmp3, KRegister ktmp1, KRegister ktmp2, KRegister ktmp3,
AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
// Following assembly snippet is vectorized translation of Math.round(float) algorithm
// for AVX512 target.
// Keep a working copy of the source bits; xtmp1 becomes the significand below.
evmovdquq(xtmp1, k0, src, true, vec_enc);
// Biased exponent per lane: (bits & 0x7F800000) >> 23.
movl(scratch, 0x7F800000);
evpbroadcastd(xtmp2, scratch, vec_enc);
evpandd(xtmp2, k0, xtmp2, xtmp1, true, vec_enc);
Assembler::evpsrad(xtmp2, k0, xtmp2, 0x17, true, vec_enc);
// Per-lane right-shift count: 0x95 (149) - exponent.
movl(scratch, 0x95);
evpbroadcastd(dst, scratch, vec_enc);
vpsubd(dst, dst, xtmp2, vec_enc);
evmovdquq(xtmp3, k0, dst, true, vec_enc);
// ktmp1 = lanes whose shift count lies in [0, 31] (high bits masked by
// 0xFFFFFFE0 are zero); only those take the bit-twiddling path.
movl(scratch, 0xFFFFFFE0);
evpbroadcastd(xtmp2, scratch, vec_enc);
evpandd(xtmp2, k0, dst, xtmp2, true, vec_enc);
vpxor(dst, dst, dst, vec_enc);
Assembler::evpcmpeqd(ktmp1, k0, xtmp2, dst, vec_enc);
// Reconstruct the significand: (bits & 0x007FFFFF) | 0x00800000
// (ternlog immediate 0xea computes C | (A & B)).
movl(scratch, 0x007FFFFF);
evpbroadcastd(xtmp2, scratch, vec_enc);
movl(scratch, 0x00800000);
evpbroadcastd(dst, scratch, vec_enc);
evpternlogd(xtmp1, 0xea, k0, xtmp2, dst, true, vec_enc);
// Negate the significand on bit-twiddling lanes whose source is negative.
vpxor(dst, dst, dst, vec_enc);
evpcmpd(ktmp2, k0, src, dst, Assembler::lt, true, vec_enc);
kandwl(ktmp2, ktmp2, ktmp1);
evpsubd(xtmp1, ktmp2, dst, xtmp1, true, vec_enc);
// Variable arithmetic shift by the per-lane count, add 1, then shift right
// by one more — realizes the +0.5-then-floor rounding in integer arithmetic.
evpsravd(xtmp1, ktmp1, xtmp1, xtmp3, true, vec_enc);
movl(scratch, 0x1);
evpbroadcastd(xtmp3, scratch, vec_enc);
evpaddd(xtmp1, ktmp1, xtmp1, xtmp3, true, vec_enc);
evpsravd(xtmp3, ktmp1, xtmp1, xtmp3, true, vec_enc);
// Fast path for the remaining lanes: convert, then fix NaN/overflow lanes.
vcvtps2dq(dst, src, vec_enc);
vector_cast_float_special_cases_evex(dst, src, xtmp1, xtmp2, ktmp2, ktmp3, scratch, float_sign_flip, vec_enc);
// Blend: bit-twiddled result for ktmp1 lanes, converted result otherwise.
evpblendmd(dst, ktmp1, dst, xtmp3, true, vec_enc);
}

// Vectorized Math.round(float) for AVX2 (no opmask registers): the same
// bit-twiddling scheme as the EVEX variant, using full-width vector masks
// (xtmp4, xtmp5) and vblendvps merges instead of masked EVEX operations.
void C2_MacroAssembler::vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, XMMRegister xtmp6,
AddressLiteral float_sign_flip, Register scratch, int vec_enc) {
// Following assembly snippet is vectorized translation of Math.round(float) algorithm
// for AVX2 target.
// Keep a working copy of the source bits; xtmp1 becomes the significand below.
vmovdqu(xtmp1, src);
// Biased exponent per lane: (bits & 0x7F800000) >> 23.
movl(scratch, 0x7F800000);
movdl(xtmp2, scratch);
vpbroadcastd(xtmp2, xtmp2, vec_enc);
vpand(xtmp2, xtmp2, xtmp1, vec_enc);
Assembler::vpsrad(xtmp2, xtmp2, 0x17, vec_enc);
// Per-lane right-shift count: 0x95 (149) - exponent.
movl(scratch, 0x95);
movdl(dst, scratch);
vpbroadcastd(dst, dst, vec_enc);
vpsubd(dst, dst, xtmp2, vec_enc);
vmovdqu(xtmp3, dst);
// xtmp5 = all-ones mask for lanes whose shift count lies in [0, 31]
// (high bits masked by 0xFFFFFFE0 are zero): the bit-twiddling lanes.
movl(scratch, 0xFFFFFFE0);
movdl(xtmp2, scratch);
vpbroadcastd(xtmp2, xtmp2, vec_enc);
vpand(xtmp2, dst, xtmp2, vec_enc);
vpxor(dst, dst, dst, vec_enc);
Assembler::vpcmpeqd(xtmp5, xtmp2, dst, vec_enc);
// Reconstruct the significand: (bits & 0x007FFFFF) | 0x00800000.
movl(scratch, 0x007FFFFF);
movdl(xtmp2, scratch);
vpbroadcastd(xtmp2, xtmp2, vec_enc);
movl(scratch, 0x00800000);
movdl(dst, scratch);
vpbroadcastd(dst, dst, vec_enc);
vpand(xtmp1, xtmp2, xtmp1, vec_enc);
vpor(xtmp1, xtmp1, dst, vec_enc);
// xtmp4 = mask of negative sources on the bit-twiddling path; select the
// negated significand (0 - xtmp1) for those lanes via vblendvps.
// NOTE(review): xtmp2 is passed to vpcmpCCW as an extra operand —
// presumably a scratch register; confirm against vpcmpCCW's definition.
vpxor(dst, dst, dst, vec_enc);
vpcmpCCW(xtmp4, src, dst, xtmp2, Assembler::lt, Assembler::D, vec_enc);
vpand(xtmp4, xtmp4, xtmp5, vec_enc);
vpsubd(dst, dst, xtmp1, vec_enc);
vblendvps(xtmp1, xtmp1, dst, xtmp4, vec_enc);
// Variable arithmetic shift by the per-lane count, add 1, then shift right
// by one more — realizes the +0.5-then-floor rounding in integer arithmetic.
vpsravd(xtmp1, xtmp1, xtmp3, vec_enc);
movl(scratch, 0x1);
movdl(xtmp4, scratch);
vpbroadcastd(xtmp4, xtmp4, vec_enc);
vpaddd(xtmp1, xtmp1, xtmp4, vec_enc);
Assembler::vpsrad(xtmp3, xtmp1, 0x1, vec_enc);
// Fast path for the remaining lanes: convert, then fix NaN/overflow lanes.
vcvtps2dq(dst, src, vec_enc);
vector_cast_float_special_cases_avx(dst, src, xtmp1, xtmp2, xtmp6, xtmp4, scratch, float_sign_flip, vec_enc);
// Blend: bit-twiddled result (xtmp3) for xtmp5 lanes, converted otherwise.
vblendvps(dst, dst, xtmp3, xtmp5, vec_enc);
}
#endif

void C2_MacroAssembler::evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
bool merge, BasicType bt, int vlen_enc) {
if (bt == T_INT) {
35 changes: 32 additions & 3 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
@@ -298,15 +298,44 @@

void vector_castF2I_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
AddressLiteral float_sign_flip, Register scratch, int vec_enc, bool roundF);
AddressLiteral float_sign_flip, Register scratch, int vec_enc);

void vector_castF2I_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, AddressLiteral float_sign_flip,
Register scratch, int vec_enc, bool roundF);
Register scratch, int vec_enc);


void vector_castD2L_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, AddressLiteral double_sign_flip,
Register scratch, int vec_enc, bool roundD);
Register scratch, int vec_enc);

void vector_cast_double_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral double_sign_flip,
int vec_enc);

void vector_cast_float_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
KRegister ktmp1, KRegister ktmp2, Register scratch, AddressLiteral float_sign_flip,
int vec_enc);

void vector_cast_float_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4,
Register scratch, AddressLiteral float_sign_flip,
int vec_enc);

#ifdef _LP64
void vector_round_double_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
XMMRegister xtmp3, KRegister ktmp1, KRegister ktmp2, KRegister ktmp3,
AddressLiteral double_sign_flip, Register scratch, int vec_enc);


void vector_round_float_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
XMMRegister xtmp3, KRegister ktmp1, KRegister ktmp2, KRegister ktmp3,
AddressLiteral float_sign_flip, Register scratch, int vec_enc);

void vector_round_float_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1, XMMRegister xtmp2,
XMMRegister xtmp3, XMMRegister xtmp4, XMMRegister xtmp5, XMMRegister xtmp6,
AddressLiteral float_sign_flip, Register scratch, int vec_enc);
#endif

void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, XMMRegister src3,
bool merge, BasicType bt, int vlen_enc);
Loading