8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-22 20:43:02 +01:00

Enable merge joins. Attempted a cost-based choice between hash and merge joins.

This commit is contained in:
Dmitry Yemanov 2022-04-02 09:05:10 +03:00
parent e4a7614aeb
commit f023ba9957
3 changed files with 89 additions and 65 deletions

View File

@ -294,8 +294,8 @@ namespace
if (node1 == node2)
return true;
const FieldNode* fieldNode1 = nodeAs<FieldNode>(node1);
const FieldNode* fieldNode2 = nodeAs<FieldNode>(node2);
const auto fieldNode1 = nodeAs<FieldNode>(node1);
const auto fieldNode2 = nodeAs<FieldNode>(node2);
if (fieldNode1 && fieldNode2)
{
@ -1654,7 +1654,6 @@ void Optimizer::checkSorts()
sort = rse->rse_sorted = nullptr;
}
}
}
// Examine the ORDER BY and DISTINCT clauses; if all the fields in the
@ -2343,102 +2342,116 @@ bool Optimizer::generateEquiJoin(RiverList& orgRivers)
if (!riverCount)
return false;
HalfStaticArray<RecordSource*, OPT_STATIC_ITEMS> rsbs;
HalfStaticArray<NestValueArray*, OPT_STATIC_ITEMS> keys;
bool useMergeJoin = false;
// AB: Get the lowest river position from the rivers that are merged
// Prepare rivers for joining
StreamList streams;
RiverList rivers;
unsigned number = 0, lowestPosition = MAX_ULONG;
HalfStaticArray<NestValueArray*, OPT_STATIC_ITEMS> keys;
unsigned position = 0, maxCardinalityPosition = 0, lowestPosition = MAX_ULONG;
double maxCardinality1 = 0, maxCardinality2 = 0;
for (River** iter = orgRivers.begin(); iter < orgRivers.end(); number++)
for (auto iter = orgRivers.begin(); iter < orgRivers.end(); position++)
{
River* const river = *iter;
if (!(TEST_DEP_BIT(selected_rivers, number)))
if (!(TEST_DEP_BIT(selected_rivers, position)))
{
iter++;
continue;
}
if (number < lowestPosition)
lowestPosition = number;
const auto river = *iter;
// Get the lowest river position
if (position < lowestPosition)
lowestPosition = position;
// Find position of the river with maximum cardinality
const auto rsb = river->getRecordSource();
const auto cardinality = rsb->getCardinality();
if (cardinality > maxCardinality1)
{
maxCardinality2 = maxCardinality1;
maxCardinality1 = cardinality;
maxCardinalityPosition = rivers.getCount();
}
else if (cardinality > maxCardinality2)
maxCardinality2 = cardinality;
streams.join(river->getStreams());
rivers.add(river);
orgRivers.remove(iter);
auto rsb = river->getRecordSource();
// Collect keys to join on
// Collect RSBs and keys to join
keys.add(FB_NEW_POOL(getPool()) NestValueArray(getPool()));
const auto key = FB_NEW_POOL(getPool()) SortNode(getPool());
if (useMergeJoin)
{
ValueExprNode*** selected_class;
for (selected_class = selected_classes.begin();
selected_class != selected_classes.end(); ++selected_class)
{
key->direction.add(ORDER_ASC); // Ascending sort
key->nullOrder.add(NULLS_DEFAULT); // Default nulls placement
key->expressions.add((*selected_class)[number]);
for (const auto eq_class : selected_classes)
keys.back()->add(eq_class[position]);
}
rsb = generateSort(river->getStreams(), nullptr, rsb, key, favorFirstRows(), false);
}
else
{
ValueExprNode*** selected_class;
const bool hashOverflow = (maxCardinality2 > HashJoin::maxCapacity());
for (selected_class = selected_classes.begin();
selected_class != selected_classes.end(); ++selected_class)
{
key->expressions.add((*selected_class)[number]);
}
}
// If any of the to-be-hashed rivers is too large to be hashed efficiently,
// then prefer a merge join instead of a hash join.
// It seems that rivers are already sorted by their cardinality.
// For a hash join, we need to choose the smallest ones as inner sub-streams,
// hence we reverse the order when storing them in the temporary arrays.
if (useMergeJoin)
{
rsbs.add(rsb);
keys.add(&key->expressions);
}
else
{
rsbs.insert(0, rsb);
keys.insert(0, &key->expressions);
}
}
fb_assert(rsbs.getCount() == keys.getCount());
const bool useMergeJoin = hashOverflow;
// Build a join stream
RecordSource* rsb = nullptr;
HalfStaticArray<RecordSource*, OPT_STATIC_ITEMS> rsbs;
RecordSource* finalRsb = nullptr;
if (useMergeJoin)
{
rsb = FB_NEW_POOL(getPool())
position = 0;
for (const auto river : rivers)
{
const auto sort = FB_NEW_POOL(getPool()) SortNode(getPool());
for (const auto key : *keys[position++])
{
fb_assert(river->isReferenced(key));
sort->direction.add(ORDER_ASC); // ascending sort
sort->nullOrder.add(NULLS_DEFAULT); // default nulls placement
sort->expressions.add(key);
}
const auto rsb = generateSort(river->getStreams(), nullptr,
river->getRecordSource(), sort, favorFirstRows(), false);
rsbs.add(rsb);
}
finalRsb = FB_NEW_POOL(getPool())
MergeJoin(csb, rsbs.getCount(), (SortedStream**) rsbs.begin(), keys.begin());
}
else
{
rsb = FB_NEW_POOL(getPool())
// Ensure that the largest river is placed at the first position.
// It's important for a hash join to be efficient.
const auto maxCardinalityRiver = rivers[maxCardinalityPosition];
rivers[maxCardinalityPosition] = rivers[0];
rivers[0] = maxCardinalityRiver;
const auto maxCardinalityKey = keys[maxCardinalityPosition];
keys[maxCardinalityPosition] = keys[0];
keys[0] = maxCardinalityKey;
for (const auto river : rivers)
rsbs.add(river->getRecordSource());
finalRsb = FB_NEW_POOL(getPool())
HashJoin(tdbb, csb, rsbs.getCount(), rsbs.begin(), keys.begin());
}
// Pick up any boolean that may apply
rsb = applyLocalBoolean(rsb, streams, iter);
finalRsb = applyLocalBoolean(finalRsb, streams, iter);
const auto finalRiver = FB_NEW_POOL(getPool()) River(csb, rsb, rivers);
const auto finalRiver = FB_NEW_POOL(getPool()) River(csb, finalRsb, rivers);
orgRivers.insert(lowestPosition, finalRiver);
return true;

View File

@ -45,6 +45,15 @@ using namespace Jrd;
static const ULONG HASH_SIZE = 1009;
static const ULONG BUCKET_PREALLOCATE_SIZE = 32; // 256 bytes per slot
unsigned HashJoin::maxCapacity()
{
	// A binary search through 1000 collisions is computationally comparable
	// to a linear search through 10 collisions. That figure, taken per hash
	// slot, serves as a rough estimate of whether the lookup performance
	// is likely to remain acceptable.
	const unsigned collisionsPerSlot = 1000;

	return HASH_SIZE * collisionsPerSlot;
}
class HashJoin::HashTable : public PermanentStorage
{
class CollisionList

View File

@ -1092,6 +1092,8 @@ namespace Jrd
void findUsedStreams(StreamList& streams, bool expandAll = false) const override;
void nullRecords(thread_db* tdbb) const override;
static unsigned maxCapacity();
private:
ULONG computeHash(thread_db* tdbb, Request* request,
const SubStream& sub, UCHAR* buffer) const;