Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8281146: Replace StringCoding.hasNegatives with countPositives #7231

Closed
wants to merge 43 commits into from
Closed
Changes from 1 commit
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
d59de38
Add StringCoding.countPositives, enabling optimizing decoding/encodin…
cl4es Jan 24, 2022
cc5d864
Merge branch 'master' of https://github.com/openjdk/jdk into count_po…
cl4es Jan 24, 2022
030eb78
Various tweaks and improvements
cl4es Jan 25, 2022
fde25ab
Merge branch 'master' into count_positives
cl4es Jan 26, 2022
2e5a5d1
Fix outdated ref in shenandoahSupport
cl4es Jan 26, 2022
622774c
Merge branch 'master' into count_positives
cl4es Jan 27, 2022
bdaa322
Fix incorrectly encoding latin1 to utf-8
cl4es Jan 27, 2022
b8fd8dd
Cleanup, minor improvements
cl4es Jan 27, 2022
0049d33
Add latin1 StringEncode micros
cl4es Jan 27, 2022
6ca30be
Improve benchmark, fine-tune intrinsic code
cl4es Jan 28, 2022
b87d534
Further tuning of intrinsic to improve precision on short strings wit…
cl4es Jan 28, 2022
3e3e451
Adjust the countPositives intrinsic to count the bytes exactly.
cl4es Jan 31, 2022
6f2aed3
Add shortMixed micros, cleanups
cl4es Jan 31, 2022
fc01b89
Resolve issues in the precise implementation
cl4es Jan 31, 2022
49602d2
Add more comments, simplify tail branching in AVX512 variant
cl4es Jan 31, 2022
175a650
Remove has_negatives intrinsic on x86 (and hook up 32-bit x86 to use …
cl4es Feb 1, 2022
c253157
Remove unused tail_mask
cl4es Feb 1, 2022
0b12c1a
Reduce jumps in the ascii path
cl4es Feb 1, 2022
37813af
Fix little-endian error caught by testing
cl4es Feb 2, 2022
251fe38
Simplify changes to encodeUTF8
cl4es Feb 2, 2022
2a855eb
Let countPositives use hasNegatives to allow ports not implementing t…
cl4es Feb 7, 2022
5d83d8c
Restore partial vector checks in AVX2 and SSE intrinsic variants
cl4es Feb 11, 2022
c4bb361
Merge branch 'master' into count_positives
cl4es Feb 11, 2022
47b2785
Merge branch 'master' of https://github.com/cl4es/jdk into count_posi…
cl4es Feb 17, 2022
531139a
Revert micro changes, split out to #7516
cl4es Feb 17, 2022
a5e28b3
Switch aarch64 intrinsic to a variant of countPositives returning len…
cl4es Feb 18, 2022
7427f6f
aarch64: fix issue with short inputs divisible by wordSize
cl4es Feb 18, 2022
a95680c
Fix TestCountPositives to correctly allow 0 return when expected != l…
cl4es Feb 23, 2022
685795c
Resolve merge conflict
cl4es Feb 23, 2022
a6becdf
Merge master
cl4es Mar 1, 2022
7789349
Narrow the bottom_type of CountPositivesNode (always results in a pos…
cl4es Mar 2, 2022
b826ef8
PPC impl provided by @TheRealMDoerr
cl4es Mar 2, 2022
3c41857
s390 impl provided by @RealLucy
cl4es Mar 2, 2022
3207c09
Clean out and remove vmIntrinsics::_hasNegatives and all related code
cl4es Mar 2, 2022
85be36a
Document that it's allowed for implementations to return values less …
cl4es Mar 3, 2022
81ef04e
Better implementation for aarch64 returning roughly the count of posi…
cl4es Mar 7, 2022
5c6194b
ary1 not required to have USE_KILL effect
cl4es Mar 7, 2022
934b5b8
use 32-bit mask to calculate correct remainder value
cl4es Mar 7, 2022
3d155c8
Restructure encodeUTF8 to reduce code gen issues
cl4es Mar 9, 2022
30739e1
Merge branch 'master' into count_positives
cl4es Mar 9, 2022
58ee73b
Revert encodeUTF8 for this PR due to issues with fragile optimization
cl4es Mar 9, 2022
bc5a8c8
Fix copyright year in new test
cl4es Mar 9, 2022
6f22e1a
Disallow negative values in TestCountPositives test
cl4es Mar 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
@@ -4372,10 +4372,11 @@ address MacroAssembler::count_positives(Register ary1, Register len, Register re

BIND(END);
ldr(rscratch1, Address(ary1));
sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
lslv(rscratch1, rscratch1, len);
sub(rscratch2, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
lslv(rscratch1, rscratch1, rscratch2);
tst(rscratch1, UPPER_BIT_MASK);
b(SET_RESULT);
br(NE, SET_RESULT);
b(DONE);

BIND(STUB);
RuntimeAddress count_pos = RuntimeAddress(StubRoutines::aarch64::count_positives());
@@ -4400,7 +4401,9 @@ address MacroAssembler::count_positives(Register ary1, Register len, Register re
b(DONE);

BIND(SET_RESULT);
csel(result, zr, result, NE); // set len or 0

add(len, len, wordSize);
sub(result, result, len);

BIND(DONE);
postcond(pc() != badAddress);
37 changes: 22 additions & 15 deletions src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
Original file line number Diff line number Diff line change
@@ -4674,7 +4674,7 @@ class StubGenerator: public StubCodeGenerator {
// precondition: a copy of len is already in result
// __ mov(result, len);

Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16,
Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

__ cmp(len, (u1)15);
@@ -4695,10 +4695,11 @@ class StubGenerator: public StubCodeGenerator {
__ ldp(rscratch1, rscratch2, Address(ary1, -16));
__ sub(len, len, 8); // no data dep., then sub can be executed while loading
__ tst(rscratch2, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE_NO_POP);
__ br(Assembler::NE, RET_NO_POP);
__ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
__ lsrv(rscratch1, rscratch1, rscratch2);
__ tst(rscratch1, UPPER_BIT_MASK);
__ bind(RET_NO_POP);
__ csel(result, zr, result, Assembler::NE);
__ leave();
__ ret(lr);
@@ -4718,10 +4719,10 @@ class StubGenerator: public StubCodeGenerator {
__ mov(tmp5, 16);
__ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
__ add(ary1, ary1, rscratch1);
__ sub(len, len, rscratch1);
__ orr(tmp6, tmp6, tmp1);
__ tst(tmp6, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE);
__ br(Assembler::NE, RET_ADJUST);
__ sub(len, len, rscratch1);

__ bind(ALIGNED);
__ cmp(len, large_loop_size);
@@ -4736,7 +4737,7 @@ class StubGenerator: public StubCodeGenerator {
__ sub(len, len, 16);
__ orr(tmp6, tmp6, tmp1);
__ tst(tmp6, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE);
__ br(Assembler::NE, RET_ADJUST_16);
__ cmp(len, large_loop_size);
__ br(Assembler::LT, CHECK_16);

@@ -4768,7 +4769,7 @@ class StubGenerator: public StubCodeGenerator {
__ orr(rscratch1, rscratch1, tmp6);
__ orr(tmp2, tmp2, rscratch1);
__ tst(tmp2, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE);
__ br(Assembler::NE, RET_ADJUST_LONG);
__ cmp(len, large_loop_size);
__ br(Assembler::GE, LARGE_LOOP);

@@ -4781,38 +4782,44 @@ class StubGenerator: public StubCodeGenerator {
__ sub(len, len, 16);
__ orr(tmp2, tmp2, tmp3);
__ tst(tmp2, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE);
__ br(Assembler::NE, RET_ADJUST_16);
__ cmp(len, (u1)16);
__ br(Assembler::GE, LOOP16); // 16-byte load loop end

__ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
__ cmp(len, (u1)8);
__ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
__ ldr(tmp3, Address(__ post(ary1, 8)));
__ sub(len, len, 8);
__ tst(tmp3, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE);
__ br(Assembler::NE, RET_ADJUST);
__ sub(len, len, 8);

__ bind(POST_LOOP16_LOAD_TAIL);
__ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
__ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
__ ldr(tmp1, Address(ary1));
__ mov(tmp2, 64);
__ sub(tmp4, tmp2, len, __ LSL, 3);
__ lslv(tmp1, tmp1, tmp4);
__ tst(tmp1, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_TRUE);
__ br(Assembler::NE, RET_ADJUST);
// Fallthrough

__ bind(RET_FALSE);
__ bind(RET_LEN);
__ pop(spilled_regs, sp);
__ leave();
__ ret(lr);

__ bind(RET_TRUE);
// difference result - len is the count of guaranteed to be
// positive bytes

__ bind(RET_ADJUST_LONG);
__ add(len, len, (u1)(large_loop_size - 16));
__ bind(RET_ADJUST_16);
__ add(len, len, 16);
__ bind(RET_ADJUST);
__ pop(spilled_regs, sp);
__ bind(RET_TRUE_NO_POP);
__ leave();
__ mov(result, zr);
__ sub(result, result, len);
__ ret(lr);

return entry;
4 changes: 2 additions & 2 deletions test/micro/org/openjdk/bench/java/lang/StringDecode.java
Original file line number Diff line number Diff line change
@@ -87,7 +87,7 @@ public void decodeAsciiShort(Blackhole bh) throws Exception {
bh.consume(new String(asciiString, charset));
bh.consume(new String(longAsciiString, 0, 15, charset));
bh.consume(new String(asciiString, 0, 3, charset));
bh.consume(new String(longAsciiString, 512, 512 + 7, charset));
bh.consume(new String(longAsciiString, 512, 7, charset));
}

@Benchmark
@@ -103,7 +103,7 @@ public void decodeLatin1Short(Blackhole bh) throws Exception {
bh.consume(new String(latin1String, charset));
bh.consume(new String(latin1String, 0, 15, charset));
bh.consume(new String(latin1String, 0, 3, charset));
bh.consume(new String(longLatin1OnlyString, 512, 512 + 7, charset));
bh.consume(new String(longLatin1OnlyString, 512, 7, charset));
}

@Benchmark