Fix #6873 - SIMILAR TO does not use index when pattern starts with non-wildcard character

(unlike LIKE, which already does).
2025-01-23 04:43:03 +01:00 · 2021-07-06 16:13:24 -03:00 · 2021-07-06 16:13:24 -03:00 · d2addfa578
commit d2addfa578
parent 23607dc3cf
3 changed files with 185 additions and 132 deletions
--- a/src/common/SimilarToRegex.cpp
+++ b/src/common/SimilarToRegex.cpp
@ -142,31 +142,6 @@ namespace
 			return c == '*' || c == '+' || c == '?' || c == '{';
 		}

-		bool isSpecial(UChar32 c)
-		{
-			switch (c)
-			{
-				case '^':
-				case '-':
-				case '_':
-				case '%':
-				case '[':
-				case ']':
-				case '(':
-				case ')':
-				case '{':
-				case '}':
-				case '|':
-				case '?':
-				case '+':
-				case '*':
-					return true;
-
-				default:
-					return false;
-			}
-		}
-
 		bool isRe2Special(UChar32 c)
 		{
 			switch (c)
@ -401,7 +376,7 @@ namespace
 						charSavePos = patternPos;
 						c = getPatternChar();

-						if (!(c == escapeChar || isSpecial(c)))
+						if (!(c == escapeChar || SimilarToRegex::isSpecialChar(c)))
 							status_exception::raise(Arg::Gds(isc_escape_invalid));
 					}
 					else
@ -470,7 +445,7 @@ namespace
 								charSavePos = patternPos;
 								c = getPatternChar();

-								if (!(c == escapeChar || isSpecial(c)))
+								if (!(c == escapeChar || SimilarToRegex::isSpecialChar(c)))
 									status_exception::raise(Arg::Gds(isc_escape_invalid));
 							}

@ -638,12 +613,12 @@ namespace
 						charSavePos = patternPos;
 						op = getPatternChar();

-						if (!isSpecial(op) && op != escapeChar)
+						if (!SimilarToRegex::isSpecialChar(op) && op != escapeChar)
 							status_exception::raise(Arg::Gds(isc_escape_invalid));
 					}
 					else
 					{
-						if (isSpecial(op))
+						if (SimilarToRegex::isSpecialChar(op))
 						{
 							controlChar = true;
 							patternPos = charSavePos;
--- a/src/common/SimilarToRegex.h
+++ b/src/common/SimilarToRegex.h
@ -51,6 +51,31 @@ public:
 		const char* patternStr, unsigned patternLen, const char* escapeStr, unsigned escapeLen);
 	~SimilarToRegex();

+public:
+	static bool isSpecialChar(ULONG c)	// Tells whether c is one of the pattern metacharacters enumerated in the switch below.
+	{
+		switch (c)
+		{
+			case '^':
+			case '-':
+			case '_':
+			case '%':
+			case '[':
+			case ']':
+			case '(':
+			case ')':
+			case '{':
+			case '}':
+			case '|':
+			case '?':
+			case '+':
+			case '*':
+				return true;	// every listed character shares this single result
+		}

+		return false;	// any value not listed above is reported as non-special
+	}
+
 private:
 	static void finalize(SimilarToRegex* self);

--- a/src/jrd/opt.cpp
+++ b/src/jrd/opt.cpp
@ -352,7 +352,7 @@ static bool map_equal(const ValueExprNode*, const ValueExprNode*, const MapNode*
 static void mark_indices(CompilerScratch::csb_repeat* csbTail, SSHORT relationId);
 static bool node_equality(const ValueExprNode*, const ValueExprNode*);
 static bool node_equality(const BoolExprNode*, const BoolExprNode*);
-static ValueExprNode* optimize_like(thread_db*, CompilerScratch*, ComparativeBoolNode*);
+static ValueExprNode* optimize_like_similar(thread_db*, CompilerScratch*, ComparativeBoolNode*);
 static USHORT river_count(USHORT count, ValueExprNode** eq_class);
 static bool search_stack(const ValueExprNode*, const ValueExprNodeStack&);
 static void set_direction(SortNode*, SortNode*);
@ -1445,12 +1445,13 @@ static SLONG decompose(thread_db* tdbb, BoolExprNode* boolNode, BoolExprNodeStac
 			return 2;
 		}

-		// turn a LIKE into a LIKE and a STARTING WITH, if it starts
+		// turn a LIKE/SIMILAR into a LIKE/SIMILAR and a STARTING WITH, if it starts
 		// with anything other than a pattern-matching character

 		ValueExprNode* arg;

-		if (cmpNode->blrOp == blr_like && (arg = optimize_like(tdbb, csb, cmpNode)))
+		if ((cmpNode->blrOp == blr_like || cmpNode->blrOp == blr_similar) &&
+			(arg = optimize_like_similar(tdbb, csb, cmpNode)))
 		{
 			ComparativeBoolNode* newCmpNode = FB_NEW_POOL(csb->csb_pool) ComparativeBoolNode(
 				csb->csb_pool, blr_starting);
@ -3389,81 +3390,80 @@ static bool node_equality(const BoolExprNode* node1, const BoolExprNode* node2)
 }


-static ValueExprNode* optimize_like(thread_db* tdbb, CompilerScratch* csb, ComparativeBoolNode* like_node)
+static ValueExprNode* optimize_like_similar(thread_db* tdbb, CompilerScratch* csb, ComparativeBoolNode* cmpNode)
 {
 /**************************************
 *
- *	o p t i m i z e _ l i k e
+ *	o p t i m i z e _ l i k e _ s i m i l a r
 *
 **************************************
 *
 * Functional description
- *	Optimize a LIKE expression, if possible,
- *	into a "starting with" AND a "like".  This
+ *	Optimize a LIKE/SIMILAR expression, if possible,
+ *	into a "starting with" AND a "LIKE/SIMILAR".  This
 *	will allow us to use the index for the
- *	starting with, and the LIKE can just tag
+ *	starting with, and the LIKE/SIMILAR can just tag
 *	along for the ride.
 *	But on the ride it does useful work, consider
- *	match LIKE "ab%c".  This is optimized by adding
- *	AND starting_with "ab", but the LIKE clause is
+ *	match LIKE/SIMILAR "ab%c".  This is optimized by adding
+ *	AND starting_with "ab", but the LIKE/SIMILAR clause is
 *	still needed.
 *
 **************************************/
 	SET_TDBB(tdbb);

-	ValueExprNode* match_node = like_node->arg1;
-	ValueExprNode* pattern_node = like_node->arg2;
-	ValueExprNode* escape_node = like_node->arg3;
+	ValueExprNode* matchNode = cmpNode->arg1;
+	ValueExprNode* patternNode = cmpNode->arg2;
+	ValueExprNode* escapeNode = cmpNode->arg3;

 	// if the pattern string or the escape string can't be
 	// evaluated at compile time, forget it
-	if (!nodeIs<LiteralNode>(pattern_node) || (escape_node && !nodeIs<LiteralNode>(escape_node)))
-		return NULL;
+	if (!nodeIs<LiteralNode>(patternNode) || (escapeNode && !nodeIs<LiteralNode>(escapeNode)))
+		return nullptr;

-	dsc match_desc;
-	match_node->getDesc(tdbb, csb, &match_desc);
+	dsc matchDesc;
+	matchNode->getDesc(tdbb, csb, &matchDesc);

-	dsc* pattern_desc = &nodeAs<LiteralNode>(pattern_node)->litDesc;
-	dsc* escape_desc = NULL;
+	dsc* patternDesc = &nodeAs<LiteralNode>(patternNode)->litDesc;
+	dsc* escapeDesc = nullptr;

-	if (escape_node)
-		escape_desc = &nodeAs<LiteralNode>(escape_node)->litDesc;
+	if (escapeNode)
+		escapeDesc = &nodeAs<LiteralNode>(escapeNode)->litDesc;

 	// if either is not a character expression, forget it
-	if ((match_desc.dsc_dtype > dtype_any_text) ||
-		(pattern_desc->dsc_dtype > dtype_any_text) ||
-		(escape_node && escape_desc->dsc_dtype > dtype_any_text))
+	if ((matchDesc.dsc_dtype > dtype_any_text) ||
+		(patternDesc->dsc_dtype > dtype_any_text) ||
+		(escapeNode && escapeDesc->dsc_dtype > dtype_any_text))
 	{
-		return NULL;
+		return nullptr;
 	}

-	TextType* matchTextType = INTL_texttype_lookup(tdbb, INTL_TTYPE(&match_desc));
+	TextType* matchTextType = INTL_texttype_lookup(tdbb, INTL_TTYPE(&matchDesc));
 	CharSet* matchCharset = matchTextType->getCharSet();
-	TextType* patternTextType = INTL_texttype_lookup(tdbb, INTL_TTYPE(pattern_desc));
+	TextType* patternTextType = INTL_texttype_lookup(tdbb, INTL_TTYPE(patternDesc));
 	CharSet* patternCharset = patternTextType->getCharSet();

+	if (cmpNode->blrOp == blr_like)
+	{
 		UCHAR escape_canonic[sizeof(ULONG)];
 		UCHAR first_ch[sizeof(ULONG)];
 		ULONG first_len;
 		UCHAR* p;
 		USHORT p_count;
+		MoveBuffer escapeBuffer;

 		// Get the escape character, if any
-	if (escape_node)
+		if (escapeNode)
 		{
 			// Ensure escape string is same character set as match string
-
-		MoveBuffer escape_buffer;
-
-		p_count = MOV_make_string2(tdbb, escape_desc, INTL_TTYPE(&match_desc), &p, escape_buffer);
+			p_count = MOV_make_string2(tdbb, escapeDesc, INTL_TTYPE(&matchDesc), &p, escapeBuffer);

 			first_len = matchCharset->substring(p_count, p, sizeof(first_ch), first_ch, 0, 1);
 			matchTextType->canonical(first_len, p, sizeof(escape_canonic), escape_canonic);
 		}

-	MoveBuffer pattern_buffer;
-
-	p_count = MOV_make_string2(tdbb, pattern_desc, INTL_TTYPE(&match_desc), &p, pattern_buffer);
+		MoveBuffer patternBuffer;
+		p_count = MOV_make_string2(tdbb, patternDesc, INTL_TTYPE(&matchDesc), &p, patternBuffer);

 		first_len = matchCharset->substring(p_count, p, sizeof(first_ch), first_ch, 0, 1);

@ -3473,26 +3473,24 @@ static ValueExprNode* optimize_like(thread_db* tdbb, CompilerScratch* csb, Compa
 		const BYTE canWidth = matchTextType->getCanonicalWidth();

 		const UCHAR* matchOneChar = matchCharset->getSqlMatchOneLength() != 0 ?
-		matchTextType->getCanonicalChar(TextType::CHAR_SQL_MATCH_ONE) : NULL;
+			matchTextType->getCanonicalChar(TextType::CHAR_SQL_MATCH_ONE) : nullptr;
 		const UCHAR* matchAnyChar = matchCharset->getSqlMatchAnyLength() != 0 ?
-		matchTextType->getCanonicalChar(TextType::CHAR_SQL_MATCH_ANY) : NULL;
+			matchTextType->getCanonicalChar(TextType::CHAR_SQL_MATCH_ANY) : nullptr;

 		// If the first character is a wildcard char, forget it.
-	if ((!escape_node || memcmp(first_canonic, escape_canonic, canWidth) != 0) &&
+		if ((!escapeNode || memcmp(first_canonic, escape_canonic, canWidth) != 0) &&
 			((matchOneChar && memcmp(first_canonic, matchOneChar, canWidth) == 0) ||
 			(matchAnyChar && memcmp(first_canonic, matchAnyChar, canWidth) == 0)))
 		{
-		return NULL;
+			return nullptr;
 		}

 		// allocate a literal node to store the starting with string;
 		// assume it will be shorter than the pattern string
-	// CVC: This assumption may not be true if we use "value like field".

 		LiteralNode* literal = FB_NEW_POOL(csb->csb_pool) LiteralNode(csb->csb_pool);
-	literal->litDesc = *pattern_desc;
-	UCHAR* q = literal->litDesc.dsc_address = FB_NEW_POOL(csb->csb_pool)
-		UCHAR[literal->litDesc.dsc_length];
+		literal->litDesc = *patternDesc;
+		UCHAR* q = literal->litDesc.dsc_address = FB_NEW_POOL(csb->csb_pool) UCHAR[literal->litDesc.dsc_length];

 		// Set the string length to point till the first wildcard character.

@ -3504,12 +3502,11 @@ static ValueExprNode* optimize_like(thread_db* tdbb, CompilerScratch* csb, Compa

 		for (const UCHAR* patternPtr = patternCanonical.begin(); patternPtr < patternCanonical.end(); )
 		{
-		// if there are escape characters, skip past them and
-		// don't treat the next char as a wildcard
+			// if there are escape characters, skip past them and don't treat the next char as a wildcard
 			const UCHAR* patternPtrStart = patternPtr;
 			patternPtr += canWidth;

-		if (escape_node && (memcmp(patternPtrStart, escape_canonic, canWidth) == 0))
+			if (escapeNode && (memcmp(patternPtrStart, escape_canonic, canWidth) == 0))
 			{
 				// Check for Escape character at end of string
 				if (!(patternPtr < patternCanonical.end()))
@ -3524,8 +3521,8 @@ static ValueExprNode* optimize_like(thread_db* tdbb, CompilerScratch* csb, Compa
 				break;
 			}

-		q += patternCharset->substring(pattern_desc->dsc_length,
-			pattern_desc->dsc_address,
+			q += patternCharset->substring(patternDesc->dsc_length,
+					patternDesc->dsc_address,
 					literal->litDesc.dsc_length - (q - literal->litDesc.dsc_address), q,
 					(patternPtrStart - patternCanonical.begin()) / canWidth, 1);
 		}
@ -3533,6 +3530,62 @@ static ValueExprNode* optimize_like(thread_db* tdbb, CompilerScratch* csb, Compa
 		literal->litDesc.dsc_length = q - literal->litDesc.dsc_address;

 		return literal;
+	}
+	else
+	{
+		// SIMILAR TO branch.
+		// Build the literal prefix that EVERY string matching the pattern must start with,
+		// so it can be used as an indexable STARTING WITH condition. Returns nullptr when
+		// no such non-empty prefix can be guaranteed (the SIMILAR TO predicate itself is
+		// always kept by the caller, so giving up is always safe).
+		fb_assert(cmpNode->blrOp == blr_similar);

+		MoveBuffer escapeBuffer;
+		UCHAR* escapeStart = nullptr;
+		ULONG escapeLen = 0;

+		// Get the escape character, if any
+		if (escapeNode)
+		{
+			// Ensure escape string is same character set as match string
+			escapeLen = MOV_make_string2(tdbb, escapeDesc, INTL_TTYPE(&matchDesc), &escapeStart, escapeBuffer);
+		}

+		MoveBuffer patternBuffer;
+		UCHAR* patternStart;
+		ULONG patternLen = MOV_make_string2(tdbb, patternDesc, INTL_TTYPE(&matchDesc), &patternStart, patternBuffer);
+		const auto patternEnd = patternStart + patternLen;
+		const UCHAR* patternPtr = patternStart;

+		MoveBuffer prefixBuffer;
+		ULONG prefixLen = 0;		// number of leading bytes of prefixBuffer that form the mandatory prefix
+		ULONG lastCharLen = 0;		// byte length of the last character appended to the prefix
+		ULONG charLen = 0;
+		bool prefixDone = false;	// set once the first unescaped special character has been seen

+		// The whole pattern is scanned, not only its head: an unescaped '|' appearing
+		// anywhere may introduce an alternative that does not begin with the collected
+		// literals (e.g. 'ab|cd' or 'ab%|cd'), so its presence cancels the optimization.
+		while (IntlUtil::readOneChar(matchCharset, &patternPtr, patternEnd, &charLen))
+		{
+			if (escapeNode && charLen == escapeLen && memcmp(patternPtr, escapeStart, escapeLen) == 0)
+			{
+				if (!IntlUtil::readOneChar(matchCharset, &patternPtr, patternEnd, &charLen) ||
+					!((charLen == escapeLen && memcmp(patternPtr, escapeStart, escapeLen) == 0) ||
+					  (charLen == 1 && SimilarToRegex::isSpecialChar(*patternPtr))))
+				{
+					// Invalid escape.
+					return nullptr;
+				}

+				// The escaped character is a plain literal; it is handled below.
+			}
+			else if (charLen == 1 && SimilarToRegex::isSpecialChar(*patternPtr))
+			{
+				const UCHAR special = *patternPtr;

+				// Alternation: the leading literals are not mandatory for every match.
+				// Checked conservatively, without tracking grouping or bracket nesting.
+				if (special == '|')
+					return nullptr;

+				if (!prefixDone)
+				{
+					prefixDone = true;

+					// '*', '?' and '{n,m}' may allow zero occurrences of the preceding
+					// character ('ab*c' matches 'ac'), so that character cannot be part
+					// of the mandatory prefix. '+' requires at least one occurrence and
+					// therefore keeps it.
+					if (special == '*' || special == '?' || special == '{')
+						prefixLen -= lastCharLen;
+				}

+				continue;
+			}

+			if (!prefixDone)
+			{
+				prefixBuffer.push(patternPtr, charLen);
+				prefixLen += charLen;
+				lastCharLen = charLen;
+			}
+		}

+		if (prefixLen == 0)
+			return nullptr;

+		// Allocate a literal node to store the starting with string.
+		// Use the match text type as the pattern string is converted to it.

+		LiteralNode* literal = FB_NEW_POOL(csb->csb_pool) LiteralNode(csb->csb_pool);
+		literal->litDesc.makeText(prefixLen, INTL_TTYPE(&matchDesc),
+			FB_NEW_POOL(csb->csb_pool) UCHAR[prefixLen]);
+		memcpy(literal->litDesc.dsc_address, prefixBuffer.begin(), prefixLen);

+		return literal;
+	}
 }