@@ -4161,7 +4161,245 @@ void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRe
4161
4161
vpsrlq(dst, nds, src, vector_len);
4162
4162
}
4163
4163
}
4164
+
4165
+ // Reductions for vectors of ints, longs, floats, and doubles.
4166
+
4167
+ void MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
4168
+ int vector_len = Assembler::AVX_128bit;
4169
+
4170
+ switch (opcode) {
4171
+ case Op_AndReductionV: pand(dst, src); break;
4172
+ case Op_OrReductionV: por (dst, src); break;
4173
+ case Op_XorReductionV: pxor(dst, src); break;
4174
+
4175
+ case Op_AddReductionVF: addss(dst, src); break;
4176
+ case Op_AddReductionVD: addsd(dst, src); break;
4177
+ case Op_AddReductionVI: paddd(dst, src); break;
4178
+ case Op_AddReductionVL: paddq(dst, src); break;
4179
+
4180
+ case Op_MulReductionVF: mulss(dst, src); break;
4181
+ case Op_MulReductionVD: mulsd(dst, src); break;
4182
+ case Op_MulReductionVI: pmulld(dst, src); break;
4183
+ case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
4184
+
4185
+ default: assert(false, "wrong opcode");
4186
+ }
4187
+ }
4188
+
4189
+ void MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
4190
+ int vector_len = Assembler::AVX_256bit;
4191
+
4192
+ switch (opcode) {
4193
+ case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
4194
+ case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
4195
+ case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
4196
+
4197
+ case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
4198
+ case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
4199
+
4200
+ case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
4201
+ case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
4202
+
4203
+ default: assert(false, "wrong opcode");
4204
+ }
4205
+ }
4206
+
4207
+ void MacroAssembler::reduce_fp(int opcode, int vlen,
4208
+ XMMRegister dst, XMMRegister src,
4209
+ XMMRegister vtmp1, XMMRegister vtmp2) {
4210
+ switch (opcode) {
4211
+ case Op_AddReductionVF:
4212
+ case Op_MulReductionVF:
4213
+ reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
4214
+ break;
4215
+
4216
+ case Op_AddReductionVD:
4217
+ case Op_MulReductionVD:
4218
+ reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
4219
+ break;
4220
+
4221
+ default: assert(false, "wrong opcode");
4222
+ }
4223
+ }
4224
+
4225
+ void MacroAssembler::reduceI(int opcode, int vlen,
4226
+ Register dst, Register src1, XMMRegister src2,
4227
+ XMMRegister vtmp1, XMMRegister vtmp2) {
4228
+ switch (vlen) {
4229
+ case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
4230
+ case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
4231
+ case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
4232
+ case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4233
+
4234
+ default: assert(false, "wrong vector length");
4235
+ }
4236
+ }
4237
+
4238
+ #ifdef _LP64
4239
+ void MacroAssembler::reduceL(int opcode, int vlen,
4240
+ Register dst, Register src1, XMMRegister src2,
4241
+ XMMRegister vtmp1, XMMRegister vtmp2) {
4242
+ switch (vlen) {
4243
+ case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4244
+ case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4245
+ case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4246
+
4247
+ default: assert(false, "wrong vector length");
4248
+ }
4249
+ }
4250
+ #endif // _LP64
4251
+
4252
+ void MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4253
+ switch (vlen) {
4254
+ case 2:
4255
+ assert(vtmp2 == xnoreg, "");
4256
+ reduce2F(opcode, dst, src, vtmp1);
4257
+ break;
4258
+ case 4:
4259
+ assert(vtmp2 == xnoreg, "");
4260
+ reduce4F(opcode, dst, src, vtmp1);
4261
+ break;
4262
+ case 8:
4263
+ reduce8F(opcode, dst, src, vtmp1, vtmp2);
4264
+ break;
4265
+ case 16:
4266
+ reduce16F(opcode, dst, src, vtmp1, vtmp2);
4267
+ break;
4268
+ default: assert(false, "wrong vector length");
4269
+ }
4270
+ }
4271
+
4272
+ void MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4273
+ switch (vlen) {
4274
+ case 2:
4275
+ assert(vtmp2 == xnoreg, "");
4276
+ reduce2D(opcode, dst, src, vtmp1);
4277
+ break;
4278
+ case 4:
4279
+ reduce4D(opcode, dst, src, vtmp1, vtmp2);
4280
+ break;
4281
+ case 8:
4282
+ reduce8D(opcode, dst, src, vtmp1, vtmp2);
4283
+ break;
4284
+ default: assert(false, "wrong vector length");
4285
+ }
4286
+ }
4287
+
4288
+ void MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4289
+ if (opcode == Op_AddReductionVI) {
4290
+ if (vtmp1 != src2) {
4291
+ movdqu(vtmp1, src2);
4292
+ }
4293
+ phaddd(vtmp1, vtmp1);
4294
+ } else {
4295
+ pshufd(vtmp1, src2, 0x1);
4296
+ reduce_operation_128(opcode, vtmp1, src2);
4297
+ }
4298
+ movdl(vtmp2, src1);
4299
+ reduce_operation_128(opcode, vtmp1, vtmp2);
4300
+ movdl(dst, vtmp1);
4301
+ }
4302
+
4303
+ void MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4304
+ if (opcode == Op_AddReductionVI) {
4305
+ if (vtmp1 != src2) {
4306
+ movdqu(vtmp1, src2);
4307
+ }
4308
+ phaddd(vtmp1, src2);
4309
+ reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4310
+ } else {
4311
+ pshufd(vtmp2, src2, 0xE);
4312
+ reduce_operation_128(opcode, vtmp2, src2);
4313
+ reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
4314
+ }
4315
+ }
4316
+
4317
+ void MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4318
+ if (opcode == Op_AddReductionVI) {
4319
+ vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
4320
+ vextracti128_high(vtmp2, vtmp1);
4321
+ vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
4322
+ reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4323
+ } else {
4324
+ vextracti128_high(vtmp1, src2);
4325
+ reduce_operation_128(opcode, vtmp1, src2);
4326
+ reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4327
+ }
4328
+ }
4329
+
4330
+ void MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4331
+ vextracti64x4_high(vtmp2, src2);
4332
+ reduce_operation_256(opcode, vtmp2, vtmp2, src2);
4333
+ reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
4334
+ }
4335
+
4336
+ #ifdef _LP64
4337
+ void MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4338
+ pshufd(vtmp2, src2, 0xE);
4339
+ reduce_operation_128(opcode, vtmp2, src2);
4340
+ movdq(vtmp1, src1);
4341
+ reduce_operation_128(opcode, vtmp1, vtmp2);
4342
+ movdq(dst, vtmp1);
4343
+ }
4344
+
4345
+ void MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4346
+ vextracti128_high(vtmp1, src2);
4347
+ reduce_operation_128(opcode, vtmp1, src2);
4348
+ reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4349
+ }
4350
+
4351
+ void MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4352
+ vextracti64x4_high(vtmp2, src2);
4353
+ reduce_operation_256(opcode, vtmp2, vtmp2, src2);
4354
+ reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
4355
+ }
4356
+ #endif // _LP64
4357
+
4358
+ void MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
4359
+ reduce_operation_128(opcode, dst, src);
4360
+ pshufd(vtmp, src, 0x1);
4361
+ reduce_operation_128(opcode, dst, vtmp);
4362
+ }
4363
+
4364
+ void MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
4365
+ reduce2F(opcode, dst, src, vtmp);
4366
+ pshufd(vtmp, src, 0x2);
4367
+ reduce_operation_128(opcode, dst, vtmp);
4368
+ pshufd(vtmp, src, 0x3);
4369
+ reduce_operation_128(opcode, dst, vtmp);
4370
+ }
4371
+
4372
+ void MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4373
+ reduce4F(opcode, dst, src, vtmp2);
4374
+ vextractf128_high(vtmp2, src);
4375
+ reduce4F(opcode, dst, vtmp2, vtmp1);
4376
+ }
4377
+
4378
+ void MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4379
+ reduce8F(opcode, dst, src, vtmp1, vtmp2);
4380
+ vextracti64x4_high(vtmp1, src);
4381
+ reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
4382
+ }
4383
+
4384
+ void MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
4385
+ reduce_operation_128(opcode, dst, src);
4386
+ pshufd(vtmp, src, 0xE);
4387
+ reduce_operation_128(opcode, dst, vtmp);
4388
+ }
4389
+
4390
+ void MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4391
+ reduce2D(opcode, dst, src, vtmp2);
4392
+ vextractf128_high(vtmp2, src);
4393
+ reduce2D(opcode, dst, vtmp2, vtmp1);
4394
+ }
4395
+
4396
+ void MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4397
+ reduce4D(opcode, dst, src, vtmp1, vtmp2);
4398
+ vextracti64x4_high(vtmp1, src);
4399
+ reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
4400
+ }
4164
4401
#endif
4402
+
4165
4403
//-------------------------------------------------------------------------------------------
4166
4404
4167
4405
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {
0 commit comments