Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8282664: Unroll by hand StringUTF16, StringLatin1, and Arrays polynomial hash loops #7700

Closed
Closed
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f7dda1d
8282664: Unroll by hand StringUTF16 and StringLatin1 polynomial hash …
luhenry Mar 4, 2022
8e2e59b
Add UTF-16 benchmarks
luhenry Mar 4, 2022
5febe14
Add first pass at vectorized intrinsic for StringLatin1.hashCode
luhenry Apr 4, 2022
79e0a82
Leave Java method unchanged
luhenry Apr 4, 2022
ed9904b
Reduce overhead of method handle
luhenry Apr 4, 2022
95f56b1
Improve performance of short strings
luhenry Apr 4, 2022
6c1f574
Reduce code duplication
luhenry Apr 5, 2022
026e971
Use intrinsic for StringUTF16
luhenry Apr 5, 2022
e1a5942
{wip} Generalize string hashcode to Arrays.hashCode
luhenry Apr 6, 2022
ff8eb33
Some small refactoring: store power_of_31_backwards in the code direc…
luhenry Apr 6, 2022
8a78584
Merge branch 'master' of https://github.com/openjdk/jdk into vectoriz…
luhenry Apr 6, 2022
1935cf3
Disable Arrays.hashCode intrinsic by default for CI
luhenry Apr 6, 2022
a6f75c2
Fix some merge conflicts
luhenry Apr 7, 2022
2b631dd
Add missing check for AryHashCode node
luhenry May 10, 2022
af7b445
Fix h when vectorized for Arrays.hashCode
luhenry May 10, 2022
721899e
Merge branch 'master' of https://github.com/openjdk/jdk into vectoriz…
luhenry May 10, 2022
7232036
Actually fix h when hashcode is vectorized
luhenry May 10, 2022
29dab16
Fix overlapping registers
luhenry May 10, 2022
c362466
Merge branch 'master' of https://github.com/openjdk/jdk into vectoriz…
luhenry May 11, 2022
34b90e8
Ensure a proper register is used + Slight performance optimizations
luhenry May 12, 2022
5d86266
Reenable SpecialArraysHashCode by default
luhenry May 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
286 changes: 286 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -2962,6 +2962,292 @@ void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Regis
bind(DONE_LABEL);
} // stringL_indexof_char

// Returns the size in bytes of one array element of type `eltype`.
// Only T_BYTE, T_SHORT, T_CHAR, T_INT and T_FLOAT are accepted; any other
// type trips ShouldNotReachHere() and yields -1.
int C2_MacroAssembler::arrays_hashcode_elsize(BasicType eltype) {
  int size_in_bytes = -1;
  switch (eltype) {
  case T_BYTE:
    size_in_bytes = sizeof(jbyte);
    break;
  case T_SHORT:
    size_in_bytes = sizeof(jshort);
    break;
  case T_CHAR:
    size_in_bytes = sizeof(jchar);
    break;
  case T_INT:
    size_in_bytes = sizeof(jint);
    break;
  case T_FLOAT:
    size_in_bytes = sizeof(jfloat);
    break;
  default:
    ShouldNotReachHere();
    break;
  }
  return size_in_bytes;
}

// Emits a scalar load of one element of type `eltype` from `src` into the
// 32-bit register `dst`, widening sub-int elements to 32 bits.
//
//   dst    - destination general purpose register
//   src    - address of the element to load
//   eltype - element type; anything other than the types handled below
//            trips ShouldNotReachHere()
void C2_MacroAssembler::arrays_hashcode_elload(Register dst, Address src, BasicType eltype) {
  switch (eltype) {
  case T_BYTE:
    // Zero-extended, which is what a Latin1 string hash needs (value & 0xff).
    // NOTE(review): Arrays.hashCode(byte[]) sign-extends its elements; this
    // helper cannot tell the two uses apart from eltype alone -- confirm how
    // callers encode the byte[] case.
    movzbl(dst, src);
    break;
  case T_SHORT:
    // Java shorts are signed: Arrays.hashCode(short[]) adds the sign-extended
    // element, so the load must sign-extend (was movzwl).
    movswl(dst, src);
    break;
  case T_CHAR:
    // chars are unsigned 16-bit values.
    movzwl(dst, src);
    break;
  case T_INT:
  case T_FLOAT:
    // NOTE(review): for T_FLOAT this uses the raw bit pattern, whereas
    // Float.hashCode() canonicalizes NaNs -- confirm this is acceptable.
    movl(dst, src);
    break;
  default:
    ShouldNotReachHere();
  }
}

// Emits a vector load from the memory operand `src` into `dst`. The number of
// bytes handed to load_vector() is eight times the element size of `eltype`,
// i.e. eight consecutive, not yet widened, elements.
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype) {
  const int bytes_per_element = arrays_hashcode_elsize(eltype);
  load_vector(dst, src, bytes_per_element * 8);
}

// AddressLiteral flavour of the vector element load: emits a load of eight
// elements' worth of bytes (8 * element size of `eltype`) from `src` into
// `dst` through load_vector().
void C2_MacroAssembler::arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype) {
  const int bytes_per_element = arrays_hashcode_elsize(eltype);
  load_vector(dst, src, bytes_per_element * 8);
}

// Emits the in-place widening of the packed elements held in `dst` to 32-bit
// lanes of a 256-bit vector, so that they can be multiplied by int
// coefficients. Elements that are already 32 bits wide are left untouched.
//
//   dst    - vector register holding the packed elements (input and output)
//   eltype - element type; anything other than the types handled below
//            trips ShouldNotReachHere()
void C2_MacroAssembler::arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype) {
  switch (eltype) {
  case T_BYTE:
    // Zero-extended, which is what a Latin1 string hash needs (value & 0xff).
    // NOTE(review): Arrays.hashCode(byte[]) sign-extends its elements; this
    // helper cannot tell the two uses apart from eltype alone -- confirm how
    // callers encode the byte[] case.
    vector_unsigned_cast(dst, dst, Assembler::AVX_256bit, T_BYTE, T_INT);
    break;
  case T_SHORT:
    // Java shorts are signed: widen with sign extension so that negative
    // elements contribute the same value as in Arrays.hashCode(short[])
    // (was an unsigned cast).
    vpmovsxwd(dst, dst, Assembler::AVX_256bit);
    break;
  case T_CHAR:
    // chars are unsigned 16-bit values.
    vector_unsigned_cast(dst, dst, Assembler::AVX_256bit, T_SHORT, T_INT);
    break;
  case T_INT:
  case T_FLOAT:
    // already 32-bit lanes, nothing to do
    break;
  default:
    ShouldNotReachHere();
  }
}

// Emits code computing the base-31 polynomial hash
//
//   h = seed * 31^n + a[0] * 31^(n-1) + ... + a[n-1]
//
// over n elements of type `eltype`, where seed is 0 for strings and 1 for
// arrays. Fewer than 32 elements are handled by a scalar loop unrolled by
// four; otherwise the trailing (n % 32) elements are accumulated by a scalar
// loop and the remaining multiple of 32 by an AVX2 loop that consumes four
// vectors of eight ints per iteration, walking from the end of the data
// towards index 0.
//
//   ary1      - !is_string_hashcode: array reference; a null reference yields
//               0. It is advanced to the first element.
//               is_string_hashcode: used as-is as the address of the first
//               element.
//   cnt1      - is_string_hashcode: length on entry, converted to an element
//               count by dividing by the element size (presumably a byte
//               length -- confirm against the caller).
//               !is_string_hashcode: overwritten with the array length.
//               Clobbered in both cases.
//   result    - receives the hash code.
//   i, coef, tmp - scratch general purpose registers.
//   vnext, vcoef0-3, vresult0-3, vtmp0-3 - scratch vector registers.
//   eltype    - element type, see arrays_hashcode_elsize() for accepted values.
//   is_string_hashcode - selects the string (seed 0, no header/null handling)
//               or array (seed 1, null check, length read from the array)
//               flavour.
//
// NOTE(review): for T_FLOAT the raw bit pattern of each element is hashed,
// whereas Float.hashCode() canonicalizes NaNs -- confirm this is acceptable.
void C2_MacroAssembler::arrays_hashcode(Register ary1, Register cnt1, Register result,
                                        Register i, Register coef, Register tmp, XMMRegister vnext,
                                        XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
                                        XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
                                        XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
                                        BasicType eltype, bool is_string_hashcode) {
  ShortBranchVerifier sbv(this);
  assert(UseAVX >= 2, "AVX2 intrinsics are required");

  Label SHORT, SHORT_UNROLLED_LOOP_BEGIN, SHORT_UNROLLED_LOOP_END, SHORT_SCALAR_LOOP_BEGIN, SHORT_SCALAR_LOOP_END,
        LONG, LONG_INIT, LONG_SCALAR_LOOP_BEGIN, LONG_SCALAR_LOOP_END, LONG_VECTOR_LOOP_BEGIN, LONG_VECTOR_LOOP_END,
        NONNULL, END;

  // For "renaming" for readability of the code
  Register bound;

  XMMRegister vcoef[] = { vcoef0, vcoef1, vcoef2, vcoef3 },
              vresult[] = { vresult0, vresult1, vresult2, vresult3 },
              vtmp[] = { vtmp0, vtmp1, vtmp2, vtmp3 };

  const int elsize = arrays_hashcode_elsize(eltype);

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset = arrayOopDesc::base_offset_in_bytes(eltype);

  if (!is_string_hashcode) {
    // if (ary1 == null) return 0;
    testptr(ary1, ary1);
    jcc(Assembler::notZero, NONNULL);
    movl(result, 0);
    jmp(END);
    bind(NONNULL);
    // cnt1 = ary1.length (an element count); ary1 = address of ary1[0]
    movl(cnt1, Address(ary1, length_offset));
    lea(ary1, Address(ary1, base_offset));
  }

  // int result = 0|1;
  movl(result, is_string_hashcode ? 0 : 1);

  // if (cnt1 == 0) {
  cmpl(cnt1, 0);
  jcc(Assembler::equal, END);

  // cnt1 /= elsize
  // Only the string flavour needs this conversion to an element count. For
  // arrays cnt1 was just loaded from the array length field and already
  // counts elements; shifting it would drop most of a multi-byte array.
  if (is_string_hashcode && Address::times(elsize) != 0) {
    shrl(cnt1, Address::times(elsize));
  }

  // } else if (cnt1 < 32) {
  bind(SHORT);
  cmpl(cnt1, 32);
  jcc(Assembler::greaterEqual, LONG);

  // int i = 0;
  movl(i, 0);
  // int bound = cnt1 & ~(4 - 1);
  bound = coef; // coef is not otherwise used on the short path
  movl(bound, cnt1);
  andl(bound, ~(4-1));

  // for (; i < bound; i += 4) {
  bind(SHORT_UNROLLED_LOOP_BEGIN);
  // i < bound;
  cmpl(i, bound);
  jcc(Assembler::greaterEqual, SHORT_UNROLLED_LOOP_END);
  for (int idx = 0; idx < 4; idx++) {
    // h = (h << 5) - h;   i.e. h *= 31
    movl(tmp, result);
    shll(result, 5);
    subl(result, tmp);
    // h += ary1[i + idx];
    arrays_hashcode_elload(tmp, Address(ary1, i, Address::times(elsize), idx*elsize), eltype);
    addl(result, tmp);
  }
  addl(i, 4);
  jmp(SHORT_UNROLLED_LOOP_BEGIN);
  bind(SHORT_UNROLLED_LOOP_END);
  // }

  // for (; i < cnt1; i += 1) {
  bind(SHORT_SCALAR_LOOP_BEGIN);
  // i < cnt1;
  cmpl(i, cnt1);
  jcc(Assembler::greaterEqual, SHORT_SCALAR_LOOP_END);
  // h = (h << 5) - h;   i.e. h *= 31
  movl(tmp, result);
  shll(result, 5);
  subl(result, tmp);
  // h += ary1[i];
  arrays_hashcode_elload(tmp, Address(ary1, i, Address::times(elsize)), eltype);
  addl(result, tmp);
  // i += 1;
  addl(i, 1);
  jmp(SHORT_SCALAR_LOOP_BEGIN);
  bind(SHORT_SCALAR_LOOP_END);
  // }

  jmp(END);

  // } else { // cnt1 >= 32
  bind(LONG);

  // The table of powers of 31 is emitted inline in the code stream, so
  // execution has to jump over it.
  // power_of_31_backwards[k] == 31^(32-k) (mod 2^32), for k in [0, 32].
  jmp(LONG_INIT);
  address power_of_31_backwards = pc();
  emit_int32( 2111290369);
  emit_int32(-2010103841);
  emit_int32( 350799937);
  emit_int32( 11316127);
  emit_int32( 693101697);
  emit_int32( -254736545);
  emit_int32( 961614017);
  emit_int32( 31019807);
  emit_int32(-2077209343);
  emit_int32( -67006753);
  emit_int32( 1244764481);
  emit_int32(-2038056289);
  emit_int32( 211350913);
  emit_int32( -408824225);
  emit_int32( -844471871);
  emit_int32( -997072353);
  emit_int32( 1353309697);
  emit_int32( -510534177);
  emit_int32( 1507551809);
  emit_int32( -505558625);
  emit_int32( -293403007);
  emit_int32( 129082719);
  emit_int32(-1796951359);
  emit_int32( -196513505);
  emit_int32(-1807454463);
  emit_int32( 1742810335);
  emit_int32( 887503681);
  emit_int32( 28629151);
  emit_int32( 923521);
  emit_int32( 29791);
  emit_int32( 961);
  emit_int32( 31);
  emit_int32( 1);
  bind(LONG_INIT);

  // int coef = 1;
  movl(coef, 1);
  // int i = cnt1 - 1;
  movl(i, cnt1);
  subl(i, 1);
  // bound = cnt1 & ~(32-1);
  // bound aliases cnt1 here: the element count is no longer needed once i
  // has been derived from it, so cnt1 is overwritten.
  bound = cnt1;
  movl(bound, cnt1);
  andl(bound, ~(32-1));

  if (!is_string_hashcode) {
    // result = 0;
    // The contribution of the initial value 1 (1 * 31^cnt) is added back
    // after the vector loop.
    movl(result, 0);
  }

  // Trailing (cnt % 32) elements, last element first.
  // for (; i >= bound; i -= 1) {
  bind(LONG_SCALAR_LOOP_BEGIN);
  // i >= bound;
  cmpl(i, bound);
  jcc(Assembler::less, LONG_SCALAR_LOOP_END);
  // result += coef * ary1[i];
  arrays_hashcode_elload(tmp, Address(ary1, i, Address::times(elsize)), eltype);
  imull(tmp, coef);
  addl(result, tmp);
  // coef *= 31;
  movl(tmp, 31);
  imull(coef, tmp);
  // i -= 1;
  subl(i, 1);
  jmp(LONG_SCALAR_LOOP_BEGIN);
  bind(LONG_SCALAR_LOOP_END);
  // }
  // Here coef == 31^(cnt % 32) and i == bound - 1.

  for (int idx = 0; idx < 4; idx++) {
    // vresult = IntVector.zero(I256);
    vpxor(vresult[idx], vresult[idx]);
  }
  // vnext = IntVector.broadcast(I256, power_of_31_backwards[0]);   (31^32)
  movdl(vnext, InternalAddress(power_of_31_backwards+(0*sizeof(jint))));
  vpbroadcastd(vnext, vnext, Assembler::AVX_256bit);
  // vcoef = IntVector.fromArray(I256, power_of_31_backwards, 1);
  // vcoef[0..3] together hold 31^31 ... 31^0, highest power in lane 0 of vcoef[0].
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vcoef[idx], InternalAddress(power_of_31_backwards+((8*idx+1)*sizeof(jint))), T_INT);
  }
  // vcoef *= coef
  movdl(vtmp0, coef);
  vpbroadcastd(vtmp0, vtmp0, Assembler::AVX_256bit);
  for (int idx = 0; idx < 4; idx++) {
    vpmulld(vcoef[idx], vcoef[idx], vtmp0, Assembler::AVX_256bit);
  }

  // for (i &= ~(8*4-1); i >= 0; i -= 8*4) {
  // i &= ~(8*4-1);
  andl(i, ~(8*4-1));
  bind(LONG_VECTOR_LOOP_BEGIN);
  // loop fission to upfront the cost of fetching from memory, OOO execution
  // can then hopefully do a better job of prefetching
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvload(vtmp[idx], Address(ary1, i, Address::times(elsize), 8*idx*elsize), eltype);
  }
  // vresult += vcoef * ary1[i+8*idx:i+8*idx+7]; vcoef *= vnext;
  for (int idx = 0; idx < 4; idx++) {
    arrays_hashcode_elvcast(vtmp[idx], eltype);
    vpmulld(vtmp[idx], vtmp[idx], vcoef[idx], Assembler::AVX_256bit);
    vpaddd(vresult[idx], vresult[idx], vtmp[idx], Assembler::AVX_256bit);
    vpmulld(vcoef[idx], vcoef[idx], vnext, Assembler::AVX_256bit);
  }
  // i -= 8*4;
  subl(i, 8*4);
  // i >= 0;
  cmpl(i, 0);
  jcc(Assembler::greaterEqual, LONG_VECTOR_LOOP_BEGIN);
  // }

  if (!is_string_hashcode) {
    // result += 31^cnt;   (contribution of the initial value 1)
    // Every vcoef lane has been multiplied by coef and then by 31^32 once per
    // vector iteration. The last lane (lane 7 of vcoef3) started at 31^0, so
    // it now holds exactly 31^(cnt % 32) * 31^(cnt & ~31) == 31^cnt. Lane 0 of
    // vcoef0 started at 31^31 and would be off by a factor of 31^31.
    vextracti128_high(vtmp0, vcoef3); // vtmp0 is free until reduceI below
    pextrd(tmp, vtmp0, 3);
    addl(result, tmp);
  }

  // result += vresult.reduceLanes(ADD);
  for (int idx = 0; idx < 4; idx++) {
    reduceI(Op_AddReductionVI, 256/(sizeof(jint)*8), result, result, vresult[idx], vtmp[(idx*2+0)%4], vtmp[(idx*2+1)%4]);
  }

  // }

  bind(END);

} // arrays_hashcode

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
Address::ScaleFactor scale, Address::ScaleFactor scale1,
14 changes: 14 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
@@ -276,6 +276,20 @@
Register limit, Register result, Register chr,
XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg);

// Emits code computing the base-31 polynomial hash code of a string
// (is_string_hashcode == true, initial value 0) or of an array
// (is_string_hashcode == false, initial value 1, null reference yields 0)
// whose elements have type eltype. The hash is left in result.
// str1 is the array reference (array flavour) or the address of the first
// element (string flavour); cnt1 carries the length for strings and is
// overwritten with the array length for arrays.
// i, coef and tmp are scratch general purpose registers; vnext, vcoef0-3,
// vresult0-3 and vtmp0-3 are scratch vector registers. Requires UseAVX >= 2.
void arrays_hashcode(Register str1, Register cnt1, Register result,
Register i, Register coef, Register tmp, XMMRegister vnext,
XMMRegister vcoef0, XMMRegister vcoef1, XMMRegister vcoef2, XMMRegister vcoef3,
XMMRegister vresult0, XMMRegister vresult1, XMMRegister vresult2, XMMRegister vresult3,
XMMRegister vtmp0, XMMRegister vtmp1, XMMRegister vtmp2, XMMRegister vtmp3,
BasicType eltype, bool is_string_hashcode);

// helper functions for arrays_hashcode
// Size in bytes of one element of type eltype (T_BYTE, T_SHORT, T_CHAR,
// T_INT or T_FLOAT only).
int arrays_hashcode_elsize(BasicType eltype);
// Scalar load of one element of type eltype from src into the 32-bit
// register dst.
void arrays_hashcode_elload(Register dst, Address src, BasicType eltype);
// Vector load of eight elements' worth of bytes (8 * element size) from src
// into dst; the elements are not widened.
void arrays_hashcode_elvload(XMMRegister dst, Address src, BasicType eltype);
// Same as above for a source given as an AddressLiteral.
void arrays_hashcode_elvload(XMMRegister dst, AddressLiteral src, BasicType eltype);
// Widens, in place, the packed elements in dst to 32-bit lanes of a 256-bit
// vector; a no-op for element types that are already 32 bits wide.
void arrays_hashcode_elvcast(XMMRegister dst, BasicType eltype);


void evmasked_op(int ideal_opc, BasicType eType, KRegister mask,
XMMRegister dst, XMMRegister src1, XMMRegister src2,
5 changes: 5 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
@@ -1502,6 +1502,11 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_StrHashCode:
if (!UseSSE42Intrinsics) {
return false;
}
break;
case Op_OnSpinWait:
if (VM_Version::supports_on_spin_wait() == false) {
return false;
Loading