Skip to content
This repository was archived by the owner on Aug 27, 2022. It is now read-only.
/ lanai Public archive

Commit 398ce29

Browse files
Sandhya Viswanathan, srukmann
Sandhya Viswanathan
and committed Mar 23, 2020
8240248: Extend superword reduction optimizations for x86
Add support for and, or, xor reduction
Co-authored-by: Shravya Rukmannagari <shravya.rukmannagari@intel.com>
Reviewed-by: vlivanov, thartmann
1 parent 75a8b7f commit 398ce29

File tree

13 files changed

+1089
-702
lines changed

13 files changed

+1089
-702
lines changed
 

‎make/devkit/createJMHBundle.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ mkdir -p $BUILD_DIR $JAR_DIR
4040
cd $JAR_DIR
4141
rm -f *
4242

43-
wget http://central.maven.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
44-
wget http://central.maven.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
45-
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
46-
wget http://central.maven.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
43+
wget https://repo.maven.apache.org/maven2/org/apache/commons/commons-math3/$COMMONS_MATH3_VERSION/commons-math3-$COMMONS_MATH3_VERSION.jar
44+
wget https://repo.maven.apache.org/maven2/net/sf/jopt-simple/jopt-simple/$JOPT_SIMPLE_VERSION/jopt-simple-$JOPT_SIMPLE_VERSION.jar
45+
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-core/$JMH_VERSION/jmh-core-$JMH_VERSION.jar
46+
wget https://repo.maven.apache.org/maven2/org/openjdk/jmh/jmh-generator-annprocess/$JMH_VERSION/jmh-generator-annprocess-$JMH_VERSION.jar
4747

4848
tar -cvzf ../$BUNDLE_NAME *
4949

‎src/hotspot/cpu/x86/macroAssembler_x86.cpp

+238
Original file line numberDiff line numberDiff line change
@@ -4161,7 +4161,245 @@ void MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRe
41614161
vpsrlq(dst, nds, src, vector_len);
41624162
}
41634163
}
4164+
4165+
// Reductions for vectors of ints, longs, floats, and doubles.
4166+
4167+
void MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
4168+
int vector_len = Assembler::AVX_128bit;
4169+
4170+
switch (opcode) {
4171+
case Op_AndReductionV: pand(dst, src); break;
4172+
case Op_OrReductionV: por (dst, src); break;
4173+
case Op_XorReductionV: pxor(dst, src); break;
4174+
4175+
case Op_AddReductionVF: addss(dst, src); break;
4176+
case Op_AddReductionVD: addsd(dst, src); break;
4177+
case Op_AddReductionVI: paddd(dst, src); break;
4178+
case Op_AddReductionVL: paddq(dst, src); break;
4179+
4180+
case Op_MulReductionVF: mulss(dst, src); break;
4181+
case Op_MulReductionVD: mulsd(dst, src); break;
4182+
case Op_MulReductionVI: pmulld(dst, src); break;
4183+
case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;
4184+
4185+
default: assert(false, "wrong opcode");
4186+
}
4187+
}
4188+
4189+
void MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
4190+
int vector_len = Assembler::AVX_256bit;
4191+
4192+
switch (opcode) {
4193+
case Op_AndReductionV: vpand(dst, src1, src2, vector_len); break;
4194+
case Op_OrReductionV: vpor (dst, src1, src2, vector_len); break;
4195+
case Op_XorReductionV: vpxor(dst, src1, src2, vector_len); break;
4196+
4197+
case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
4198+
case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
4199+
4200+
case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
4201+
case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
4202+
4203+
default: assert(false, "wrong opcode");
4204+
}
4205+
}
4206+
4207+
void MacroAssembler::reduce_fp(int opcode, int vlen,
4208+
XMMRegister dst, XMMRegister src,
4209+
XMMRegister vtmp1, XMMRegister vtmp2) {
4210+
switch (opcode) {
4211+
case Op_AddReductionVF:
4212+
case Op_MulReductionVF:
4213+
reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
4214+
break;
4215+
4216+
case Op_AddReductionVD:
4217+
case Op_MulReductionVD:
4218+
reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
4219+
break;
4220+
4221+
default: assert(false, "wrong opcode");
4222+
}
4223+
}
4224+
4225+
void MacroAssembler::reduceI(int opcode, int vlen,
4226+
Register dst, Register src1, XMMRegister src2,
4227+
XMMRegister vtmp1, XMMRegister vtmp2) {
4228+
switch (vlen) {
4229+
case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
4230+
case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
4231+
case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
4232+
case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4233+
4234+
default: assert(false, "wrong vector length");
4235+
}
4236+
}
4237+
4238+
#ifdef _LP64
4239+
void MacroAssembler::reduceL(int opcode, int vlen,
4240+
Register dst, Register src1, XMMRegister src2,
4241+
XMMRegister vtmp1, XMMRegister vtmp2) {
4242+
switch (vlen) {
4243+
case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4244+
case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4245+
case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
4246+
4247+
default: assert(false, "wrong vector length");
4248+
}
4249+
}
4250+
#endif // _LP64
4251+
4252+
void MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4253+
switch (vlen) {
4254+
case 2:
4255+
assert(vtmp2 == xnoreg, "");
4256+
reduce2F(opcode, dst, src, vtmp1);
4257+
break;
4258+
case 4:
4259+
assert(vtmp2 == xnoreg, "");
4260+
reduce4F(opcode, dst, src, vtmp1);
4261+
break;
4262+
case 8:
4263+
reduce8F(opcode, dst, src, vtmp1, vtmp2);
4264+
break;
4265+
case 16:
4266+
reduce16F(opcode, dst, src, vtmp1, vtmp2);
4267+
break;
4268+
default: assert(false, "wrong vector length");
4269+
}
4270+
}
4271+
4272+
void MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4273+
switch (vlen) {
4274+
case 2:
4275+
assert(vtmp2 == xnoreg, "");
4276+
reduce2D(opcode, dst, src, vtmp1);
4277+
break;
4278+
case 4:
4279+
reduce4D(opcode, dst, src, vtmp1, vtmp2);
4280+
break;
4281+
case 8:
4282+
reduce8D(opcode, dst, src, vtmp1, vtmp2);
4283+
break;
4284+
default: assert(false, "wrong vector length");
4285+
}
4286+
}
4287+
4288+
void MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4289+
if (opcode == Op_AddReductionVI) {
4290+
if (vtmp1 != src2) {
4291+
movdqu(vtmp1, src2);
4292+
}
4293+
phaddd(vtmp1, vtmp1);
4294+
} else {
4295+
pshufd(vtmp1, src2, 0x1);
4296+
reduce_operation_128(opcode, vtmp1, src2);
4297+
}
4298+
movdl(vtmp2, src1);
4299+
reduce_operation_128(opcode, vtmp1, vtmp2);
4300+
movdl(dst, vtmp1);
4301+
}
4302+
4303+
void MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4304+
if (opcode == Op_AddReductionVI) {
4305+
if (vtmp1 != src2) {
4306+
movdqu(vtmp1, src2);
4307+
}
4308+
phaddd(vtmp1, src2);
4309+
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4310+
} else {
4311+
pshufd(vtmp2, src2, 0xE);
4312+
reduce_operation_128(opcode, vtmp2, src2);
4313+
reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
4314+
}
4315+
}
4316+
4317+
void MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4318+
if (opcode == Op_AddReductionVI) {
4319+
vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
4320+
vextracti128_high(vtmp2, vtmp1);
4321+
vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
4322+
reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4323+
} else {
4324+
vextracti128_high(vtmp1, src2);
4325+
reduce_operation_128(opcode, vtmp1, src2);
4326+
reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4327+
}
4328+
}
4329+
4330+
void MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4331+
vextracti64x4_high(vtmp2, src2);
4332+
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
4333+
reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
4334+
}
4335+
4336+
#ifdef _LP64
4337+
void MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4338+
pshufd(vtmp2, src2, 0xE);
4339+
reduce_operation_128(opcode, vtmp2, src2);
4340+
movdq(vtmp1, src1);
4341+
reduce_operation_128(opcode, vtmp1, vtmp2);
4342+
movdq(dst, vtmp1);
4343+
}
4344+
4345+
void MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4346+
vextracti128_high(vtmp1, src2);
4347+
reduce_operation_128(opcode, vtmp1, src2);
4348+
reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
4349+
}
4350+
4351+
void MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
4352+
vextracti64x4_high(vtmp2, src2);
4353+
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
4354+
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
4355+
}
4356+
#endif // _LP64
4357+
4358+
void MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
4359+
reduce_operation_128(opcode, dst, src);
4360+
pshufd(vtmp, src, 0x1);
4361+
reduce_operation_128(opcode, dst, vtmp);
4362+
}
4363+
4364+
void MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
4365+
reduce2F(opcode, dst, src, vtmp);
4366+
pshufd(vtmp, src, 0x2);
4367+
reduce_operation_128(opcode, dst, vtmp);
4368+
pshufd(vtmp, src, 0x3);
4369+
reduce_operation_128(opcode, dst, vtmp);
4370+
}
4371+
4372+
void MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4373+
reduce4F(opcode, dst, src, vtmp2);
4374+
vextractf128_high(vtmp2, src);
4375+
reduce4F(opcode, dst, vtmp2, vtmp1);
4376+
}
4377+
4378+
void MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4379+
reduce8F(opcode, dst, src, vtmp1, vtmp2);
4380+
vextracti64x4_high(vtmp1, src);
4381+
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
4382+
}
4383+
4384+
void MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
4385+
reduce_operation_128(opcode, dst, src);
4386+
pshufd(vtmp, src, 0xE);
4387+
reduce_operation_128(opcode, dst, vtmp);
4388+
}
4389+
4390+
void MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4391+
reduce2D(opcode, dst, src, vtmp2);
4392+
vextractf128_high(vtmp2, src);
4393+
reduce2D(opcode, dst, vtmp2, vtmp1);
4394+
}
4395+
4396+
void MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
4397+
reduce4D(opcode, dst, src, vtmp1, vtmp2);
4398+
vextracti64x4_high(vtmp1, src);
4399+
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
4400+
}
41644401
#endif
4402+
41654403
//-------------------------------------------------------------------------------------------
41664404

41674405
void MacroAssembler::clear_jweak_tag(Register possibly_jweak) {

‎src/hotspot/cpu/x86/macroAssembler_x86.hpp

+40
Original file line numberDiff line numberDiff line change
@@ -1649,8 +1649,48 @@ class MacroAssembler: public Assembler {
16491649
void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
16501650
void vshiftq(int opcode, XMMRegister dst, XMMRegister src);
16511651
void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1652+
1653+
// Reductions for vectors of ints, longs, floats, and doubles.
1654+
1655+
// dst = src1 op reduce(op, src2), where op is the operation selected by opcode; vtmp1/vtmp2 are temps
1656+
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1657+
#ifdef _LP64
1658+
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1659+
#endif // _LP64
1660+
1661+
// dst = dst op reduce(op, src), where op is the add/mul selected by opcode; vtmp1/vtmp2 are temps
1662+
void reduce_fp(int opcode, int vlen,
1663+
XMMRegister dst, XMMRegister src,
1664+
XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
1665+
private:
1666+
void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
1667+
void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
1668+
1669+
void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1670+
void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1671+
void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1672+
void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1673+
1674+
#ifdef _LP64
1675+
void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1676+
void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1677+
void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
1678+
#endif // _LP64
1679+
1680+
void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
1681+
void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
1682+
void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
1683+
void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
1684+
1685+
void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
1686+
void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
1687+
void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
1688+
1689+
void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src);
1690+
void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
16521691
#endif
16531692

1693+
public:
16541694
// C2 compiled method's prolog code.
16551695
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);
16561696

0 commit comments

Comments
 (0)
This repository has been archived.