@@ -7030,7 +7030,7 @@ void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, X
7030
7030
7031
7031
// Helper function for AVX 512 CRC32
7032
7032
// Compute CRC32 (or CRC32C, selected by the constant table passed in) for buffers smaller than 256 bytes
7033
- void MacroAssembler::kernel_crc32_avx512_256B (Register crc, Register buf, Register len, Register key , Register pos,
7033
+ void MacroAssembler::kernel_crc32_avx512_256B (Register crc, Register buf, Register len, Register table , Register pos,
7034
7034
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
7035
7035
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {
7036
7036
@@ -7043,7 +7043,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
7043
7043
jcc (Assembler::less, L_less_than_32);
7044
7044
7045
7045
// if there is, load the constants
7046
- movdqu (xmm10, Address (key , 1 * 16 )); // rk1 and rk2 in xmm10
7046
+ movdqu (xmm10, Address (table , 1 * 16 )); // rk1 and rk2 in xmm10
7047
7047
movdl (xmm0, crc); // get the initial crc value
7048
7048
movdqu (xmm7, Address (buf, pos, Address::times_1, 0 * 16 )); // load the plaintext
7049
7049
pxor (xmm7, xmm0);
@@ -7070,7 +7070,7 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
7070
7070
pxor (xmm7, xmm0); // xor the initial crc value
7071
7071
addl (pos, 16 );
7072
7072
subl (len, 16 );
7073
- movdqu (xmm10, Address (key , 1 * 16 )); // rk1 and rk2 in xmm10
7073
+ movdqu (xmm10, Address (table , 1 * 16 )); // rk1 and rk2 in xmm10
7074
7074
jmp (L_get_last_two_xmms);
7075
7075
7076
7076
bind (L_less_than_16_left);
@@ -7190,12 +7190,17 @@ void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Regist
7190
7190
* param crc register containing existing CRC (32-bit)
7191
7191
* param buf register pointing to input byte buffer (byte*)
7192
7192
* param len register containing number of bytes
7193
+ * param table register holding the address of the precomputed constant table (CRC32 or CRC32C)
7193
7194
* param tmp1 scratch register
7194
7195
* param tmp2 scratch register
7195
7196
* return rax result register
7197
+ *
7198
+ * This routine is identical for crc32c with the exception of the precomputed constant
7199
+ * table which will be passed as the table argument. The calculation steps are
7200
+ * the same for both variants.
7196
7201
*/
7197
- void MacroAssembler::kernel_crc32_avx512 (Register crc, Register buf, Register len, Register key , Register tmp1, Register tmp2) {
7198
- assert_different_registers (crc, buf, len, key , tmp1, tmp2, rax);
7202
+ void MacroAssembler::kernel_crc32_avx512 (Register crc, Register buf, Register len, Register table , Register tmp1, Register tmp2) {
7203
+ assert_different_registers (crc, buf, len, table , tmp1, tmp2, rax, r12 );
7199
7204
7200
7205
Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7201
7206
Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
@@ -7210,8 +7215,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7210
7215
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
7211
7216
// context for the registers used, where all instructions below are using 128-bit mode
7212
7217
// On EVEX without VL and BW, these instructions will all be AVX.
7213
- lea (key, ExternalAddress (StubRoutines::x86::crc_table_avx512_addr ()));
7214
- notl (crc);
7215
7218
movl (pos, 0 );
7216
7219
7217
7220
// check if smaller than 256B
@@ -7225,15 +7228,15 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7225
7228
evmovdquq (xmm0, Address (buf, pos, Address::times_1, 0 * 64 ), Assembler::AVX_512bit);
7226
7229
evmovdquq (xmm4, Address (buf, pos, Address::times_1, 1 * 64 ), Assembler::AVX_512bit);
7227
7230
evpxorq (xmm0, xmm0, xmm10, Assembler::AVX_512bit);
7228
- evbroadcasti32x4 (xmm10, Address (key , 2 * 16 ), Assembler::AVX_512bit); // zmm10 has rk3 and rk4
7231
+ evbroadcasti32x4 (xmm10, Address (table , 2 * 16 ), Assembler::AVX_512bit); // zmm10 has rk3 and rk4
7229
7232
7230
7233
subl (len, 256 );
7231
7234
cmpl (len, 256 );
7232
7235
jcc (Assembler::less, L_fold_128_B_loop);
7233
7236
7234
7237
evmovdquq (xmm7, Address (buf, pos, Address::times_1, 2 * 64 ), Assembler::AVX_512bit);
7235
7238
evmovdquq (xmm8, Address (buf, pos, Address::times_1, 3 * 64 ), Assembler::AVX_512bit);
7236
- evbroadcasti32x4 (xmm16, Address (key , 0 * 16 ), Assembler::AVX_512bit); // zmm16 has rk-1 and rk-2
7239
+ evbroadcasti32x4 (xmm16, Address (table , 0 * 16 ), Assembler::AVX_512bit); // zmm16 has rk-1 and rk-2
7237
7240
subl (len, 256 );
7238
7241
7239
7242
bind (L_fold_256_B_loop);
@@ -7279,8 +7282,8 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7279
7282
// at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
7280
7283
// the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
7281
7284
bind (L_fold_128_B_register);
7282
- evmovdquq (xmm16, Address (key , 5 * 16 ), Assembler::AVX_512bit); // multiply by rk9-rk16
7283
- evmovdquq (xmm11, Address (key , 9 * 16 ), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7285
+ evmovdquq (xmm16, Address (table , 5 * 16 ), Assembler::AVX_512bit); // multiply by rk9-rk16
7286
+ evmovdquq (xmm11, Address (table , 9 * 16 ), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
7284
7287
evpclmulqdq (xmm1, xmm0, xmm16, 0x01 , Assembler::AVX_512bit);
7285
7288
evpclmulqdq (xmm2, xmm0, xmm16, 0x10 , Assembler::AVX_512bit);
7286
7289
// save last that has no multiplicand
@@ -7289,7 +7292,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7289
7292
evpclmulqdq (xmm5, xmm4, xmm11, 0x01 , Assembler::AVX_512bit);
7290
7293
evpclmulqdq (xmm6, xmm4, xmm11, 0x10 , Assembler::AVX_512bit);
7291
7294
// Needed later in reduction loop
7292
- movdqu (xmm10, Address (key , 1 * 16 ));
7295
+ movdqu (xmm10, Address (table , 1 * 16 ));
7293
7296
vpternlogq (xmm1, 0x96 , xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
7294
7297
vpternlogq (xmm1, 0x96 , xmm6, xmm7, Assembler::AVX_512bit); // xor ABC
7295
7298
@@ -7305,7 +7308,7 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7305
7308
jcc (Assembler::less, L_final_reduction_for_128);
7306
7309
7307
7310
bind (L_16B_reduction_loop);
7308
- vpclmulqdq (xmm8, xmm7, xmm10, 0x1 );
7311
+ vpclmulqdq (xmm8, xmm7, xmm10, 0x01 );
7309
7312
vpclmulqdq (xmm7, xmm7, xmm10, 0x10 );
7310
7313
vpxor (xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7311
7314
movdqu (xmm0, Address (buf, pos, Address::times_1, 0 * 16 ));
@@ -7336,14 +7339,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7336
7339
vpshufb (xmm2, xmm2, xmm0, Assembler::AVX_128bit);
7337
7340
7338
7341
blendvpb (xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
7339
- vpclmulqdq (xmm8, xmm7, xmm10, 0x1 );
7342
+ vpclmulqdq (xmm8, xmm7, xmm10, 0x01 );
7340
7343
vpclmulqdq (xmm7, xmm7, xmm10, 0x10 );
7341
7344
vpxor (xmm7, xmm7, xmm8, Assembler::AVX_128bit);
7342
7345
vpxor (xmm7, xmm7, xmm2, Assembler::AVX_128bit);
7343
7346
7344
7347
bind (L_128_done);
7345
7348
// compute crc of a 128-bit value
7346
- movdqu (xmm10, Address (key , 3 * 16 ));
7349
+ movdqu (xmm10, Address (table , 3 * 16 ));
7347
7350
movdqu (xmm0, xmm7);
7348
7351
7349
7352
// 64b fold
@@ -7359,14 +7362,14 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7359
7362
jmp (L_barrett);
7360
7363
7361
7364
bind (L_less_than_256);
7362
- kernel_crc32_avx512_256B (crc, buf, len, key , pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7365
+ kernel_crc32_avx512_256B (crc, buf, len, table , pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);
7363
7366
7364
7367
// barrett reduction
7365
7368
bind (L_barrett);
7366
7369
vpand (xmm7, xmm7, ExternalAddress (StubRoutines::x86::crc_by128_masks_avx512_addr () + 1 * 16 ), Assembler::AVX_128bit, tmp2);
7367
7370
movdqu (xmm1, xmm7);
7368
7371
movdqu (xmm2, xmm7);
7369
- movdqu (xmm10, Address (key , 4 * 16 ));
7372
+ movdqu (xmm10, Address (table , 4 * 16 ));
7370
7373
7371
7374
pclmulqdq (xmm7, xmm10, 0x0 );
7372
7375
pxor (xmm7, xmm2);
@@ -7378,7 +7381,6 @@ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register le
7378
7381
pextrd (crc, xmm7, 2 );
7379
7382
7380
7383
bind (L_cleanup);
7381
- notl (crc); // ~c
7382
7384
addptr (rsp, 16 * 2 + 8 );
7383
7385
pop (r12);
7384
7386
}
0 commit comments