Skip to content

Commit e0f1fc7

Browse files
asgibbonsgbtuckerScott Gibbons
authored and
Sandhya Viswanathan
committed Dec 2, 2021
8277358: Accelerate CRC32-C
Co-authored-by: Greg Tucker <greg.b.tucker@intel.com> Co-authored-by: Scott Gibbons <sgibbons@openjdk.org> Reviewed-by: kvn, sviswanathan, ecaspole
1 parent 73a9654 commit e0f1fc7

File tree

5 files changed

+122
-28
lines changed

5 files changed

+122
-28
lines changed
 

‎src/hotspot/cpu/x86/macroAssembler_x86.cpp

+20-18
Original file line numberDiff line numberDiff line change
@@ -7030,7 +7030,7 @@ void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, X
70307030

70317031
// Helper function for AVX 512 CRC32
70327032
// Compute CRC32 for < 256B buffers
7033-
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
7033+
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos,
70347034
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
70357035
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
70367036

@@ -7043,7 +7043,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
70437043
jcc(Assembler::less, L_less_than_32);
70447044

70457045
// if there is, load the constants
7046-
movdqu(xmm10, Address(key, 1 * 16)); //rk1 and rk2 in xmm10
7046+
movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
70477047
movdl(xmm0, crc); // get the initial crc value
70487048
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
70497049
pxor(xmm7, xmm0);
@@ -7070,7 +7070,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
70707070
pxor(xmm7, xmm0); //xor the initial crc value
70717071
addl(pos, 16);
70727072
subl(len, 16);
7073-
movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10
7073+
movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
70747074
jmp(L_get_last_two_xmms);
70757075

70767076
bind(L_less_than_16_left);
@@ -7190,12 +7190,17 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
71907190
* param crc register containing existing CRC (32-bit)
71917191
* param buf register pointing to input byte buffer (byte*)
71927192
* param len register containing number of bytes
7193+
* param table register containing the address of the crc or crc32c constant table
71937194
* param tmp1 scratch register
71947195
* param tmp2 scratch register
71957196
* return rax result register
7197+
*
7198+
* This routine is identical for crc32c with the exception of the precomputed constant
7199+
* table which will be passed as the table argument. The calculation steps are
7200+
* the same for both variants.
71967201
*/
7197-
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
7198-
assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);
7202+
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
7203+
assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
71997204

72007205
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
72017206
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
@@ -7210,8 +7215,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
72107215
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
72117216
// context for the registers used, where all instructions below are using 128-bit mode
72127217
// On EVEX without VL and BW, these instructions will all be AVX.
7213-
lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
7214-
notl(crc);
72157218
movl(pos, 0);
72167219

72177220
// check if smaller than 256B
@@ -7225,15 +7228,15 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
72257228
evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
72267229
evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
72277230
evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7228-
evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
7231+
evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
72297232

72307233
subl(len, 256);
72317234
cmpl(len, 256);
72327235
jcc(Assembler::less, L_fold_128_B_loop);
72337236

72347237
evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
72357238
evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
7236-
evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
7239+
evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2
72377240
subl(len, 256);
72387241

72397242
bind(L_fold_256_B_loop);
@@ -7279,8 +7282,8 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
72797282
// at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
72807283
// the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
72817284
bind(L_fold_128_B_register);
7282-
evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7283-
evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7285+
evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
7286+
evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
72847287
evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
72857288
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
72867289
// save last that has no multiplicand
@@ -7289,7 +7292,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
72897292
evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
72907293
evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
72917294
// Needed later in reduction loop
7292-
movdqu(xmm10, Address(key, 1 * 16));
7295+
movdqu(xmm10, Address(table, 1 * 16));
72937296
vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
72947297
vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
72957298

@@ -7305,7 +7308,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
73057308
jcc(Assembler::less, L_final_reduction_for_128);
73067309

73077310
bind(L_16B_reduction_loop);
7308-
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7311+
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
73097312
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
73107313
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
73117314
movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
@@ -7336,14 +7339,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
73367339
vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);
73377340

73387341
blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7339-
vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
7342+
vpclmulqdq(xmm8, xmm7, xmm10, 0x01);
73407343
vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
73417344
vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
73427345
vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);
73437346

73447347
bind(L_128_done);
73457348
// compute crc of a 128-bit value
7346-
movdqu(xmm10, Address(key, 3 * 16));
7349+
movdqu(xmm10, Address(table, 3 * 16));
73477350
movdqu(xmm0, xmm7);
73487351

73497352
// 64b fold
@@ -7359,14 +7362,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
73597362
jmp(L_barrett);
73607363

73617364
bind(L_less_than_256);
7362-
kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7365+
kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
73637366

73647367
//barrett reduction
73657368
bind(L_barrett);
73667369
vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
73677370
movdqu(xmm1, xmm7);
73687371
movdqu(xmm2, xmm7);
7369-
movdqu(xmm10, Address(key, 4 * 16));
7372+
movdqu(xmm10, Address(table, 4 * 16));
73707373

73717374
pclmulqdq(xmm7, xmm10, 0x0);
73727375
pxor(xmm7, xmm2);
@@ -7378,7 +7381,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
73787381
pextrd(crc, xmm7, 2);
73797382

73807383
bind(L_cleanup);
7381-
notl(crc); // ~c
73827384
addptr(rsp, 16 * 2 + 8);
73837385
pop(r12);
73847386
}

‎src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

+23-10
Original file line numberDiff line numberDiff line change
@@ -6528,7 +6528,13 @@ address generate_avx_ghash_processBlocks() {
65286528
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
65296529
VM_Version::supports_avx512bw() &&
65306530
VM_Version::supports_avx512vl()) {
6531+
// The constants used in the CRC32 algorithm require the 1's complement of the initial crc value.
6532+
// However, the constant table for CRC32-C assumes the original crc value. Account for this
6533+
// difference before calling and after returning.
6534+
__ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
6535+
__ notl(crc);
65316536
__ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
6537+
__ notl(crc);
65326538
} else {
65336539
__ kernel_crc32(crc, buf, len, table, tmp1);
65346540
}
@@ -6580,20 +6586,27 @@ address generate_avx_ghash_processBlocks() {
65806586

65816587
BLOCK_COMMENT("Entry:");
65826588
__ enter(); // required for proper stackwalking of RuntimeStub frame
6589+
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
6590+
VM_Version::supports_avx512bw() &&
6591+
VM_Version::supports_avx512vl()) {
6592+
__ lea(j, ExternalAddress(StubRoutines::x86::crc32c_table_avx512_addr()));
6593+
__ kernel_crc32_avx512(crc, buf, len, j, l, k);
6594+
} else {
65836595
#ifdef _WIN64
6584-
__ push(y);
6585-
__ push(z);
6596+
__ push(y);
6597+
__ push(z);
65866598
#endif
6587-
__ crc32c_ipl_alg2_alt2(crc, buf, len,
6588-
a, j, k,
6589-
l, y, z,
6590-
c_farg0, c_farg1, c_farg2,
6591-
is_pclmulqdq_supported);
6592-
__ movl(rax, crc);
6599+
__ crc32c_ipl_alg2_alt2(crc, buf, len,
6600+
a, j, k,
6601+
l, y, z,
6602+
c_farg0, c_farg1, c_farg2,
6603+
is_pclmulqdq_supported);
65936604
#ifdef _WIN64
6594-
__ pop(z);
6595-
__ pop(y);
6605+
__ pop(z);
6606+
__ pop(y);
65966607
#endif
6608+
}
6609+
__ movl(rax, crc);
65976610
__ vzeroupper();
65986611
__ leave(); // required for proper stackwalking of RuntimeStub frame
65996612
__ ret(0);

‎src/hotspot/cpu/x86/stubRoutines_x86.cpp

+17
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,23 @@ juint StubRoutines::x86::_crc_table_avx512[] =
221221
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
222222
};
223223

224+
juint StubRoutines::x86::_crc32c_table_avx512[] =
225+
{
226+
0xb9e02b86UL, 0x00000000UL, 0xdcb17aa4UL, 0x00000000UL,
227+
0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL,
228+
0x06e38d70UL, 0x00000002UL, 0x6992cea2UL, 0x00000000UL,
229+
0x493c7d27UL, 0x00000000UL, 0xdd45aab8UL, 0x00000000UL,
230+
0xdea713f0UL, 0x00000000UL, 0x05ec76f0UL, 0x00000001UL,
231+
0x47db8317UL, 0x00000000UL, 0x2ad91c30UL, 0x00000000UL,
232+
0x0715ce53UL, 0x00000000UL, 0xc49f4f67UL, 0x00000000UL,
233+
0x39d3b296UL, 0x00000000UL, 0x083a6eecUL, 0x00000000UL,
234+
0x9e4addf8UL, 0x00000000UL, 0x740eef02UL, 0x00000000UL,
235+
0xddc0152bUL, 0x00000000UL, 0x1c291d04UL, 0x00000000UL,
236+
0xba4fc28eUL, 0x00000000UL, 0x3da6d0cbUL, 0x00000000UL,
237+
0x493c7d27UL, 0x00000000UL, 0xc1068c50UL, 0x0000000eUL,
238+
0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
239+
};
240+
224241
juint StubRoutines::x86::_crc_by128_masks_avx512[] =
225242
{
226243
0xffffffffUL, 0xffffffffUL, 0x00000000UL, 0x00000000UL,

‎src/hotspot/cpu/x86/stubRoutines_x86.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ class x86 {
137137
#ifdef _LP64
138138
static juint _crc_by128_masks_avx512[];
139139
static juint _crc_table_avx512[];
140+
static juint _crc32c_table_avx512[];
140141
static juint _shuf_table_crc32_avx512[];
141142
static juint _adler32_shuf0_table[];
142143
static juint _adler32_shuf1_table[];
@@ -256,6 +257,7 @@ class x86 {
256257
static address crc_by128_masks_avx512_addr() { return (address)_crc_by128_masks_avx512; }
257258
static address shuf_table_crc32_avx512_addr() { return (address)_shuf_table_crc32_avx512; }
258259
static address crc_table_avx512_addr() { return (address)_crc_table_avx512; }
260+
static address crc32c_table_avx512_addr() { return (address)_crc32c_table_avx512; }
259261
static address ghash_polynomial512_addr() { return _ghash_poly512_addr; }
260262
#endif // _LP64
261263
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/*
2+
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
package org.openjdk.bench.java.util;
24+
25+
import java.util.Random;
26+
import java.util.concurrent.TimeUnit;
27+
import java.util.zip.CRC32C;
28+
import org.openjdk.jmh.annotations.*;
29+
30+
@BenchmarkMode(Mode.Throughput)
31+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
32+
@State(Scope.Benchmark)
33+
@Fork(value = 2)
34+
35+
public class TestCRC32C {
36+
37+
private CRC32C crc32c;
38+
private Random random;
39+
private byte[] bytes;
40+
41+
@Param({"64", "128", "256", "512", "1024", "2048", "4096", "8192", "16384", "32768", "65536"})
42+
private int count;
43+
44+
public TestCRC32C() {
45+
crc32c = new CRC32C();
46+
random = new Random(2147483648L);
47+
bytes = new byte[1000000];
48+
random.nextBytes(bytes);
49+
}
50+
51+
@Setup(Level.Iteration)
52+
public void setupBytes() {
53+
crc32c.reset();
54+
}
55+
56+
@Benchmark
57+
public void testCRC32CUpdate() {
58+
crc32c.update(bytes, 0, count);
59+
}
60+
}

0 commit comments

Comments
 (0)
Please sign in to comment.