From fa9c6d4d3c696f4928b282b451c48e70c4aaa267 Mon Sep 17 00:00:00 2001 From: Adriano dos Santos Fernandes <529415+asfernandes@users.noreply.github.com> Date: Mon, 18 Sep 2023 07:36:12 -0300 Subject: [PATCH] Fix #7715 - Alternative String Literals and multibyte (UTF8) alternatives. (#7742) --- src/dsql/Parser.cpp | 80 +++++++++++++++++++++++++++++++++------------ src/dsql/Parser.h | 8 +++-- src/dsql/parse.y | 16 ++++++--- 3 files changed, 76 insertions(+), 28 deletions(-) diff --git a/src/dsql/Parser.cpp b/src/dsql/Parser.cpp index e9e6908265..ba089ad88e 100644 --- a/src/dsql/Parser.cpp +++ b/src/dsql/Parser.cpp @@ -27,6 +27,7 @@ #include "../dsql/chars.h" #include "../jrd/jrd.h" #include "../jrd/DataTypeUtil.h" +#include "../dsql/metd_proto.h" #include "../jrd/intl_proto.h" #ifdef HAVE_FLOAT_H @@ -40,7 +41,7 @@ using namespace Jrd; Parser::Parser(thread_db* tdbb, MemoryPool& pool, MemoryPool* aStatementPool, DsqlCompilerScratch* aScratch, - USHORT aClientDialect, USHORT aDbDialect, const TEXT* string, size_t length, SSHORT characterSet) + USHORT aClientDialect, USHORT aDbDialect, const TEXT* string, size_t length, SSHORT charSetId) : PermanentStorage(pool), statementPool(aStatementPool), scratch(aScratch), @@ -50,6 +51,8 @@ Parser::Parser(thread_db* tdbb, MemoryPool& pool, MemoryPool* aStatementPool, Ds strMarks(pool), stmt_ambiguous(false) { + charSet = INTL_charset_lookup(tdbb, charSetId); + yyps = 0; yypath = 0; yylvals = 0; @@ -76,7 +79,7 @@ Parser::Parser(thread_db* tdbb, MemoryPool& pool, MemoryPool* aStatementPool, Ds lex.line_start = lex.last_token = lex.ptr = lex.leadingPtr = string; lex.end = string + length; lex.lines = 1; - lex.att_charset = characterSet; + lex.charSetId = charSetId; lex.line_start_bk = lex.line_start; lex.lines_bk = lex.lines; lex.param_number = 1; @@ -715,31 +718,66 @@ int Parser::yylexAux() if ((c == 'q' || c == 'Q') && lex.ptr + 3 < lex.end && *lex.ptr == '\'') { + auto currentCharSet = charSet; + + if (introducerCharSetName) + { + const auto symbol = METD_get_charset(scratch->getTransaction(), + introducerCharSetName->length(), introducerCharSetName->c_str()); + + if (!symbol) + { + // character set name is not defined + ERRD_post( + Arg::Gds(isc_sqlerr) << Arg::Num(-504) << + Arg::Gds(isc_charset_not_found) << *introducerCharSetName); + } + + currentCharSet = INTL_charset_lookup(tdbb, symbol->intlsym_ttype); + } + StrMark mark; mark.pos = lex.last_token - lex.start; - char endChar = *++lex.ptr; - switch (endChar) + const auto* endChar = ++lex.ptr; + ULONG endCharSize = 0; + + if (!IntlUtil::readOneChar(currentCharSet, reinterpret_cast(&lex.ptr), + reinterpret_cast(lex.end), &endCharSize)) { - case '{': - endChar = '}'; - break; - case '(': - endChar = ')'; - break; - case '[': - endChar = ']'; - break; - case '<': - endChar = '>'; - break; + endCharSize = 1; } - while (++lex.ptr + 1 < lex.end) + if (endCharSize == 1) { - if (*lex.ptr == endChar && lex.ptr[1] == '\'') + switch (*endChar) { - size_t len = ++lex.ptr - lex.last_token - 4; + case '{': + endChar = "}"; + break; + case '(': + endChar = ")"; + break; + case '[': + endChar = "]"; + break; + case '<': + endChar = ">"; + break; + } + } + + const auto start = lex.ptr + endCharSize; + ULONG charSize = endCharSize; + + while (IntlUtil::readOneChar(currentCharSet, reinterpret_cast(&lex.ptr), + reinterpret_cast(lex.end), &charSize)) + { + if (charSize == endCharSize && + memcmp(lex.ptr, endChar, endCharSize) == 0 && + lex.ptr[endCharSize] == '\'') + { + size_t len = lex.ptr - start; if (len > MAX_STR_SIZE) { @@ -749,9 +787,9 @@ int Parser::yylexAux() Arg::Num(MAX_STR_SIZE)); } - yylval.intlStringPtr = newIntlString(Firebird::string(lex.last_token + 3, len)); + yylval.intlStringPtr = newIntlString(Firebird::string(start, len)); - ++lex.ptr; + lex.ptr += endCharSize + 1; mark.length = lex.ptr - lex.last_token; mark.str = yylval.intlStringPtr; diff --git a/src/dsql/Parser.h b/src/dsql/Parser.h index 28e22323f5..6fabb8f7d8 100644 --- a/src/dsql/Parser.h +++ b/src/dsql/Parser.h @@ -99,7 +99,7 @@ private: const TEXT* line_start; const TEXT* last_token_bk; const TEXT* line_start_bk; - SSHORT att_charset; + SSHORT charSetId; SLONG lines, lines_bk; int prev_keyword; USHORT param_number; @@ -131,7 +131,7 @@ public: public: Parser(thread_db* tdbb, MemoryPool& pool, MemoryPool* aStatementPool, DsqlCompilerScratch* aScratch, - USHORT aClientDialect, USHORT aDbDialect, const TEXT* string, size_t length, SSHORT characterSet); + USHORT aClientDialect, USHORT aDbDialect, const TEXT* string, size_t length, SSHORT charSetId); ~Parser(); public: @@ -375,6 +375,7 @@ private: USHORT client_dialect; USHORT db_dialect; USHORT parser_version; + CharSet* charSet; CharSet* metadataCharSet; Firebird::string transformedString; @@ -382,6 +383,9 @@ private: bool stmt_ambiguous; DsqlStatement* parsedStatement; + // Parser feedback for lexer + MetaName* introducerCharSetName = nullptr; + // These value/posn are taken from the lexer YYSTYPE yylval; YYPOSN yyposn; diff --git a/src/dsql/parse.y b/src/dsql/parse.y index 28d70f6ff8..2c93e06ee1 100644 --- a/src/dsql/parse.y +++ b/src/dsql/parse.y @@ -7747,7 +7747,7 @@ ul_numeric_constant u_constant : u_numeric_constant | sql_string - { $$ = MAKE_str_constant($1, lex.att_charset); } + { $$ = MAKE_str_constant($1, lex.charSetId); } | DATE STRING { if (client_dialect < SQL_DIALECT_V6_TRANSITION) @@ -7837,12 +7837,18 @@ error_context %type sql_string sql_string : STRING // string in current charset - | INTRODUCER STRING // string in specific charset + | INTRODUCER + [ + // feedback for lexer + introducerCharSetName = $1; + ] + STRING // string in specific charset + [ introducerCharSetName = nullptr; ] { - $$ = $2; + $$ = $3; $$->setCharSet(*$1); - StrMark* mark = strMarks.get($2); + StrMark* mark = strMarks.get($3); if (mark) // hex string is not in strMarks mark->introduced = true; @@ -8179,7 +8185,7 @@ window_frame_exclusion_opt %type delimiter_opt delimiter_opt - : /* nothing */ { $$ = MAKE_str_constant(newIntlString(","), lex.att_charset); } + : /* nothing */ { $$ = MAKE_str_constant(newIntlString(","), lex.charSetId); } | ',' value { $$ = $2; } ;