Skip to content

Fast Path StringCoding.countPositives and hasNegatives for Power #21597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 16 commits into
base: master
Choose a base branch
from
14 changes: 14 additions & 0 deletions runtime/compiler/p/codegen/J9CodeGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,20 @@ J9::Power::CodeGenerator::initialize()
cg->setEnableTLHPrefetching();
}

// Opt-out switches for the two inline StringCoding sequences. Only the presence of
// the environment variable is tested, and because these are function-local statics
// the environment is queried once rather than on every call.
static bool disableInlineStringCodingHasNegatives =
feGetEnv("TR_DisableInlineStringCodingHasNegatives") != NULL;
static bool disableInlineStringCodingCountPositives =
feGetEnv("TR_DisableInlineStringCodingCountPositives") != NULL;
// The inline sequences are only advertised on P8 or newer processors that report
// the VSX feature, and only when arraylets cannot be generated.
if (comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8) &&
comp->target().cpu.supportsFeature(OMR_FEATURE_PPC_HAS_VSX) &&
!TR::Compiler->om.canGenerateArraylets())
{
// Each intrinsic is enabled independently of the other.
if (!disableInlineStringCodingHasNegatives)
cg->setSupportsInlineStringCodingHasNegatives();
if (!disableInlineStringCodingCountPositives)
cg->setSupportsInlineStringCodingCountPositives();
}

//This env-var does 3 things:
// 1. Prevents batch clear in frontend/j9/rossa.cpp
// 2. Prevents all allocations to nonZeroTLH
Expand Down
294 changes: 294 additions & 0 deletions runtime/compiler/p/codegen/J9TreeEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11693,6 +11693,285 @@ static bool inlineIntrinsicInflate(TR::Node *node, TR::CodeGenerator *cg)
return true;
}

/**
 * \brief
 *    Emits the inline Power instruction sequence shared by the
 *    StringCoding.hasNegatives and StringCoding.countPositives intrinsics.
 *
 * \details
 *    The first byte is tested on its own. The remaining bytes are then scanned
 *    16 at a time with a vector signed-byte compare against zero, 4 at a time
 *    with a word load (masked with 0x80808080, or tested with per-byte masks for
 *    countPositives on little-endian before P9), and finally one byte at a time.
 *    All indices used inside the sequence are relative to the starting offset.
 *
 * \param node
 *    The call node. Child 0 is the byte array, child 1 the starting offset and
 *    child 2 the length.
 *
 * \param cg
 *    The code generator used to emit instructions and manage registers.
 *
 * \param isCountPositives
 *    true to generate countPositives, false to generate hasNegatives.
 *
 * \return
 *    The register that was also set on \p node. For countPositives it holds the
 *    offset-relative index of the first byte with its sign bit set, or the length
 *    if there is none. For hasNegatives it holds 1 if such a byte exists and 0
 *    otherwise.
 */
static TR::Register *inlineStringCodingHasNegativesOrCountPositives(TR::Node *node,
                                                                    TR::CodeGenerator *cg,
                                                                    bool isCountPositives)
   {
   TR::Compilation *comp = cg->comp();
   bool isLE = comp->target().cpu.isLittleEndian();
   bool p9Plus = comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P9);

   // startReg and indexReg are overwritten below, hence the clobber evaluates;
   // lengthReg is only ever read.
   TR::Register *startReg = cg->gprClobberEvaluate(node->getChild(0)); // array
   TR::Register *indexReg = cg->gprClobberEvaluate(node->getChild(1)); // offset
   TR::Register *lengthReg = cg->evaluate(node->getChild(2)); // length

   TR::Register *tempReg = cg->allocateRegister();

   TR::Register *cr6 = cg->allocateRegister(TR_CCR);
   TR::Register *cr0 = cg->allocateRegister(TR_CCR);

   TR::Register *vconstant0Reg = cg->allocateRegister(TR_VRF);
   TR::Register *vtmp1Reg = cg->allocateRegister(TR_VRF);

   TR::Register *storeReg = cg->allocateRegister();
   TR::Register *maskReg = cg->allocateRegister();

   TR::LabelSymbol *VSXLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *serialPrepLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *serialUnrollLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *serialLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *matchLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *resultLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *endLabel = generateLabelSymbol(cg);

   // check empty: a length <= 0 goes straight to the default result
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, lengthReg, 0);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::ble, node, resultLabel, cr6);

   // skip over or load the header
#if defined(J9VM_GC_SPARSE_HEAP_ALLOCATION)
   if (TR::Compiler->om.isOffHeapAllocationEnabled())
      {
      // off-heap: the data address is read from the array header
      generateTrg1MemInstruction(
         cg, TR::InstOpCode::ld, node, startReg,
         TR::MemoryReference::createWithDisplacement(
            cg, startReg, TR::Compiler->om.offsetOfContiguousDataAddrField(), 8)
      );
      }
   else
#endif /* J9VM_GC_SPARSE_HEAP_ALLOCATION */
      {
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, startReg, startReg,
                                     TR::Compiler->om.contiguousArrayHeaderSizeInBytes());
      }

   // The offset is a 32-bit Java int but the add below is a full-width register add.
   // Sign-extend it first so that any stale upper bits in the register cannot leak
   // into the computed address. indexReg was clobber-evaluated, so it is ours to modify.
   if (comp->target().is64Bit())
      generateTrg1Src1Instruction(cg, TR::InstOpCode::extsw, node, indexReg, indexReg);

   // get the starting address
   generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, startReg, startReg, indexReg);
   // make the index 0 since everything we need is relative to the offset
   generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, indexReg, 0);

   // check the first byte
   generateTrg1MemInstruction(cg, TR::InstOpCode::lbz, node, tempReg,
                              TR::MemoryReference::createWithIndexReg(cg, NULL, startReg, 1));
   // check the negative bit; the record form leaves the outcome in cr0
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, tempReg, 0x80);
   if (isCountPositives) // when counting positives, just return the index which is 0
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);
   else // when seeking negatives, we need to return 1
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, matchLabel, cr0);

   // if we only have one byte end it here, and return 0 for hasNegatives
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, lengthReg);
   if (isCountPositives)
      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, endLabel, cr6);
   else
      generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, resultLabel, cr6);

   // ready the zero reg
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vxor, node, vconstant0Reg, vconstant0Reg, vconstant0Reg);
   // tempReg marks the end where we could use a 16-byte vector load
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, tempReg, lengthReg, -15);

   // --- start of VSXLoop
   generateLabelInstruction(cg, TR::InstOpCode::label, node, VSXLabel);
   // go to residue if we don't have enough items to do one load
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, tempReg);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, serialPrepLabel, cr6);

   // load 16 items; before P9 the element order is not relied upon, because the
   // exact position is then found by the serial loops instead
   if (p9Plus)
      generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvb16x, node, vtmp1Reg, startReg, indexReg);
   else
      generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvw4x, node, vtmp1Reg, startReg, indexReg);
   // compare 0 > byte for every byte;
   // bit 2 of cr6 (ZERO) will not be set if any comparison is true
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmpgtsb_r, node, vtmp1Reg, vconstant0Reg, vtmp1Reg);
   // branch when the ZERO bit is not set
   generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, matchLabel, cr6);

   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 16);
   generateLabelInstruction(cg, TR::InstOpCode::b, node, VSXLabel);

   // --- when there is a match but we don't know the exact location yet
   generateLabelInstruction(cg, TR::InstOpCode::label, node, matchLabel);
   if (isCountPositives)
      {
      if (p9Plus) // just count the leading non-matching bytes for P9+
         {
         generateTrg1Src1Instruction(cg, TR::InstOpCode::vclzlsbb, node, tempReg, vtmp1Reg);
         generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, indexReg, tempReg, indexReg);
         generateLabelInstruction(cg, TR::InstOpCode::b, node, endLabel);
         }
      else // otherwise, we use the serial loop to go through the items
         {
         generateLabelInstruction(cg, TR::InstOpCode::b, node, serialPrepLabel);
         }
      }
   else // just report 1
      {
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
      generateLabelInstruction(cg, TR::InstOpCode::b, node, endLabel);
      }

   // --- serialPrepLabel to deal with whatever remains
   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialPrepLabel);
   // do we have enough elements to use the unroll loop?
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, tempReg, lengthReg, -3);
   // countPositives() in LE before P9 uses 4 individual masks instead,
   // and treats maskReg purely as a scratch target
   if (!(isLE && isCountPositives && !p9Plus))
      {
      // we want to load 0x80808080 into maskReg, but lis was designed for signed values,
      // and would throw an error for 0x8080, yet it could accept the equivalent negative value of it;
      // we don't worry about sign extension since the upper word should be 0 in storeReg after lwzx
      generateTrg1ImmInstruction(cg, TR::InstOpCode::lis, node, maskReg, -32640);
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::ori, node, maskReg, maskReg, 0x8080);
      }

   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialUnrollLabel);
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, tempReg);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, serialLabel, cr6);
   // loading 4 bytes at once is slightly faster
   generateTrg1MemInstruction(cg, TR::InstOpCode::lwzx, node, storeReg,
                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 4));

   if (isCountPositives) // when counting positives, we must consider every byte separately
      {
      if (isLE) // in LE, count the number of trailing zeroes or use masks
         {
         if (p9Plus)
            {
            // (trailing zero bits of the masked word) / 8 is the number of leading positive bytes
            generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
            generateTrg1Src1Instruction(cg, TR::InstOpCode::cnttzw, node, storeReg, storeReg);
            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, node, storeReg, storeReg, 3);
            generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, indexReg, storeReg, indexReg);
            // 4 means we need to keep checking
            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 4);
            generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
            }
         else // before P9, we cannot count trailing zeroes
            {
            // test the sign bit of each of the 4 bytes in turn, lowest-order byte first,
            // advancing the index after every byte that passes
            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, maskReg, storeReg, 0x80);
            generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, maskReg, storeReg, 0x8000);
            generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, maskReg, storeReg, 0x80);
            generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, maskReg, storeReg, 0x8000);
            generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

            generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
            }
         }
      else // in BE, count the leading zeroes
         {
         generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
         generateTrg1Src1Instruction(cg, TR::InstOpCode::cntlzw, node, storeReg, storeReg);
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, node, storeReg, storeReg, 3);
         generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, indexReg, storeReg, indexReg);
         // 4 means we need to keep checking
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 4);
         generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
         }
      }
   else
      {
      // hasNegatives only needs to know whether any sign bit in the word is set
      generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 0);
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, matchLabel, cr6);
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 4);
      }
   generateLabelInstruction(cg, TR::InstOpCode::b, node, serialUnrollLabel);

   // --- byte-at-a-time loop for the final (fewer than 4) bytes
   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialLabel);
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, lengthReg);
   // if we reach the end, indexReg is len already, so we don't need to do anything for countPositives
   if (isCountPositives)
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, endLabel, cr6);
   else
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, resultLabel, cr6);

   generateTrg1MemInstruction(cg, TR::InstOpCode::lbzx, node, tempReg,
                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 1));
   // check the negative bit
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, tempReg, tempReg, 0x80);
   if (isCountPositives)
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);
   else
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, matchLabel, cr0);

   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
   generateLabelInstruction(cg, TR::InstOpCode::b, node, serialLabel);

   // --- load the length for countPositives; load 0 for hasNegatives
   generateLabelInstruction(cg, TR::InstOpCode::label, node, resultLabel);
   if (isCountPositives)
      generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, indexReg, lengthReg);
   else
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 0);
   // end

   // Every virtual register used in the sequence is listed as a post condition on
   // the end label (10 in total).
   TR::RegisterDependencyConditions *deps =
      new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 10, cg->trMemory());

   // startReg, indexReg and lengthReg are excluded from GPR0
   deps->addPostCondition(startReg, TR::RealRegister::NoReg);
   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();

   deps->addPostCondition(indexReg, TR::RealRegister::NoReg);
   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();

   deps->addPostCondition(lengthReg, TR::RealRegister::NoReg);
   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();

   deps->addPostCondition(tempReg, TR::RealRegister::NoReg);

   // the two condition registers are pinned to the real cr6 and cr0
   deps->addPostCondition(cr6, TR::RealRegister::cr6);
   deps->addPostCondition(cr0, TR::RealRegister::cr0);

   deps->addPostCondition(vconstant0Reg, TR::RealRegister::NoReg);
   deps->addPostCondition(vtmp1Reg, TR::RealRegister::NoReg);

   deps->addPostCondition(storeReg, TR::RealRegister::NoReg);
   deps->addPostCondition(maskReg, TR::RealRegister::NoReg);

   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);

   // countPositives: indexReg holds the index of the first negative value (or the length);
   // hasNegatives: tempReg holds zero or one.
   // Whichever of the two is not the result is released.
   TR::Register *resultReg = isCountPositives ? indexReg : tempReg;
   TR::Register *unusedReg = isCountPositives ? tempReg : indexReg;
   node->setRegister(resultReg);
   cg->stopUsingRegister(unusedReg);

   cg->stopUsingRegister(startReg);
   // NOTE(review): lengthReg came from cg->evaluate() rather than a clobber evaluate,
   // so it may still be the register of child 2 - confirm releasing it here, in
   // addition to the decReferenceCount below, is intended.
   cg->stopUsingRegister(lengthReg);
   cg->stopUsingRegister(cr6);
   cg->stopUsingRegister(cr0);
   cg->stopUsingRegister(vconstant0Reg);
   cg->stopUsingRegister(vtmp1Reg);

   cg->stopUsingRegister(storeReg);
   cg->stopUsingRegister(maskReg);

   for (int32_t i = 0; i < node->getNumChildren(); i++)
      {
      cg->decReferenceCount(node->getChild(i));
      }

   return resultReg;
   }

/*
* Arraycopy evaluator needs a version of inlineArrayCopy that can be used inside internal control flow. For this version of inlineArrayCopy, registers must
* be allocated outside of this function so the dependency at the end of the control flow knows about them.
Expand Down Expand Up @@ -12261,6 +12540,21 @@ J9::Power::CodeGenerator::inlineDirectCall(TR::Node *node, TR::Register *&result
}
break;

// StringCoding.hasNegatives: use the shared inline sequence (isCountPositives = false)
// when the code generator reports support for it; otherwise leave the switch
// without inlining.
case TR::java_lang_StringCoding_hasNegatives:
if (cg->getSupportsInlineStringCodingHasNegatives())
{
resultReg = inlineStringCodingHasNegativesOrCountPositives(node, cg, false);
return true;
}
break;
// StringCoding.countPositives: same helper, with isCountPositives = true.
case TR::java_lang_StringCoding_countPositives:
if (cg->getSupportsInlineStringCodingCountPositives())
{
resultReg = inlineStringCodingHasNegativesOrCountPositives(node, cg, true);
return true;
}
break;

case TR::sun_misc_Unsafe_compareAndSwapInt_jlObjectJII_Z:
// In Java9 this can be either the jdk.internal JNI method or the sun.misc Java wrapper.
// In Java8 it will be sun.misc which will contain the JNI directly.
Expand Down