/* * PROGRAM: JRD International support * MODULE: SimilarToMatcher.h * DESCRIPTION: SIMILAR TO predicate * * The contents of this file are subject to the Initial * Developer's Public License Version 1.0 (the "License"); * you may not use this file except in compliance with the * License. You may obtain a copy of the License at * http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl. * * Software distributed under the License is distributed AS IS, * WITHOUT WARRANTY OF ANY KIND, either express or implied. * See the License for the specific language governing rights * and limitations under the License. * * The Original Code was created by Adriano dos Santos Fernandes * for the Firebird Open Source RDBMS project. * * Copyright (c) 2007 Adriano dos Santos Fernandes * and all contributors signed below. * * All Rights Reserved. * Contributor(s): ______________________________________. */ #ifndef JRD_SIMILAR_TO_EVALUATOR_H #define JRD_SIMILAR_TO_EVALUATOR_H #include "../jrd/intl_classes.h" #include "../jrd/evl_string.h" // #define DEBUG_SIMILAR #ifdef DEBUG_SIMILAR // #define RECURSIVE_SIMILAR // useless in production due to stack overflow #endif namespace Firebird { template > class SimilarToMatcher : public Jrd::PatternMatcher { private: typedef Jrd::CharSet CharSet; typedef Jrd::TextType TextType; // This class is based on work of Zafir Anjum // http://www.codeguru.com/Cpp/Cpp/string/regex/article.php/c2791 // which has been derived from work by Henry Spencer. // // The original copyright notice follows: // // Copyright (c) 1986, 1993, 1995 by University of Toronto. // Written by Henry Spencer. Not derived from licensed software. // // Permission is granted to anyone to use this software for any // purpose on any computer system, and to redistribute it in any way, // subject to the following restrictions: // // 1. The author is not responsible for the consequences of use of // this software, no matter how awful, even if they arise // from defects in it. // // 2. The origin of this software must not be misrepresented, either // by explicit claim or by omission. // // 3. Altered versions must be plainly marked as such, and must not // be misrepresented (by explicit claim or omission) as being // the original software. // // 4. This notice must not be removed or altered. class Evaluator : private StaticAllocator { public: Evaluator(MemoryPool& pool, TextType* aTextType, const UCHAR* patternStr, SLONG patternLen, CharType aEscapeChar, bool aUseEscape); ~Evaluator() { delete[] branches; } bool getResult(); bool processNextChunk(const UCHAR* data, SLONG dataLen); void reset(); private: enum Op { opBranch, opStart, opEnd, opRef, opRepeatingRefStart, opRepeatingRefEnd, opNothing, opAny, opAnyOf, opExactly, opExactlyOne, // optimization for opExactly with a single character // Implementation details of the non-recursive match opRet, opRepeatingRestore // If new codes are added, shifts in MatchState codes may need to change. }; struct Node { explicit Node(Op aOp, const CharType* aStr = NULL, SLONG aLen = 0) : op(aOp), str(aStr), len(aLen), str2(NULL), len2(0), str3(aStr), len3(aLen), str4(NULL), len4(0), ref(0), branchNum(-1) { } Node(Op aOp, SLONG aLen1, SLONG aLen2, int aRef) : op(aOp), str(NULL), len(aLen1), str2(NULL), len2(aLen2), str3(NULL), len3(0), str4(NULL), len4(0), ref(aRef), branchNum(-1) { } Node(Op aOp, int aRef) : op(aOp), str(NULL), len(0), str2(NULL), len2(0), str3(NULL), len3(0), str4(NULL), len4(0), ref(aRef), branchNum(-1) { } Node(const Node& node) : op(node.op), str(node.str), len(node.len), str2(node.str2), len2(node.len2), str3(node.str3), len3(node.len3), str4(node.str4), len4(node.len4), ref(node.ref), branchNum(node.branchNum) { } #ifdef DEBUG_SIMILAR void dump(string& text, int i) const { string temp; switch (op) { case opBranch: if (branchNum == -1) temp.printf("opBranch(%d)", i + ref); else temp.printf("opBranch(%d, %d)", i + ref, branchNum); break; case opStart: temp = "opStart"; break; case opEnd: temp = "opEnd"; break; case opRef: if (branchNum == -1) temp.printf("opRef(%d)", i + ref); else temp.printf("opRef(%d, %d)", i + ref, branchNum); break; case opRepeatingRefStart: temp.printf("opRepeatingRefStart(%d, %d)", i + ref, len); break; case opRepeatingRefEnd: temp.printf("opRepeatingRefEnd(%d)", i + ref); break; case opNothing: temp = "opNothing"; break; case opAny: temp = "opAny"; break; case opAnyOf: temp.printf("opAnyOf(%.*s, %d, %.*s, %d, %.*s, %d, %.*s, %d)", len, str, len, len2, str2, len2, len3, str3, len3, len4, str4, len4); break; case opExactly: temp.printf("opExactly(%.*s, %d)", len, str, len); break; case opExactlyOne: temp.printf("opExactlyOne(%.*s)", len, str); break; case opRet: temp.printf("opRet"); break; case opRepeatingRestore: temp.printf("opRepeatingRestore"); break; default: temp = "unknown"; break; } text.printf("%d: %s", i, temp.c_str()); } #endif // DEBUG_SIMILAR Op op; const CharType* str; SLONG len; const UCHAR* str2; SLONG len2; const CharType* str3; SLONG len3; const UCHAR* str4; SLONG len4; int ref; int branchNum; }; #ifndef RECURSIVE_SIMILAR // Struct used to evaluate expressions without recursion. // Represents local variables to implement a "virtual stack". struct Scope { inline explicit Scope(const Node* ai) : i(ai), save(NULL) { } inline void operator =(const Node* ai) { i = ai; save = NULL; } const Node* i; const CharType* save; }; // Stack for recursion emulation. template class SimpleStack { public: SimpleStack() : size(INCREASE_FACTOR) { data = FB_NEW_POOL(*getDefaultMemoryPool()) UCHAR[(size + 1) * sizeof(T)]; back = (T*) FB_ALIGN(data.get(), sizeof(T)); end = back + size; // 'back' starts before initial element, then always points to the last pushed element. --back; } template inline void push(T2 node) { // If the limit is reached, resize. if (++back == end) { unsigned newSize = size + INCREASE_FACTOR; UCHAR* newData = FB_NEW_POOL(*getDefaultMemoryPool()) UCHAR[(newSize + 1) * sizeof(T)]; T* p = (T*) FB_ALIGN(newData, sizeof(T)); memcpy(p, end - size, size * sizeof(T)); back = p + size; end = p + newSize; size = newSize; data.reset(newData); } *back = node; } inline T pop() { fb_assert(getCount() > 0); return *back--; } inline T* begin() const { return (T*) FB_ALIGN(data.get(), sizeof(T)); } inline FB_SIZE_T getCount() const { return (back + 1) - begin(); } public: T* back; private: static const unsigned INCREASE_FACTOR = 50; unsigned size; AutoPtr data; T* end; }; #endif // RECURSIVE_SIMILAR static const int FLAG_NOT_EMPTY = 1; // known never to match empty string static const int FLAG_EXACTLY = 2; // non-escaped string private: void parseExpr(int* flagp); void parseTerm(int* flagp); void parseFactor(int* flagp); void parsePrimary(int* flagp); bool isRep(CharType c) const; CharType canonicalChar(int ch) const { return *reinterpret_cast(textType->getCanonicalChar(ch)); } #ifdef DEBUG_SIMILAR void dump() const; #endif private: #ifdef RECURSIVE_SIMILAR bool match(int start); #else bool match(); #endif private: static SLONG notInSet(const CharType* str, SLONG strLen, const CharType* set, SLONG setLen); private: struct Range { unsigned start; unsigned length; }; #ifdef DEBUG_SIMILAR Array debugLog; int debugLevel; #endif TextType* textType; CharType escapeChar; bool useEscape; HalfStaticArray buffer; const UCHAR* originalPatternStr; SLONG originalPatternLen; StrConverter patternCvt; CharSet* charSet; Array nodes; const CharType* patternStart; const CharType* patternEnd; const CharType* patternPos; const CharType* bufferStart; const CharType* bufferEnd; const CharType* bufferPos; CharType metaCharacters[15]; public: unsigned branchNum; Range* branches; }; public: SimilarToMatcher(MemoryPool& pool, TextType* ttype, const UCHAR* str, SLONG strLen, CharType escape, bool useEscape) : PatternMatcher(pool, ttype), evaluator(pool, ttype, str, strLen, escape, useEscape) { } void reset() { evaluator.reset(); } bool result() { return evaluator.getResult(); } bool process(const UCHAR* str, SLONG length) { return evaluator.processNextChunk(str, length); } unsigned getNumBranches() { return evaluator.branchNum; } void getBranchInfo(unsigned n, unsigned* start, unsigned* length) { fb_assert(n <= evaluator.branchNum); *start = evaluator.branches[n].start; *length = evaluator.branches[n].length; } static SimilarToMatcher* create(MemoryPool& pool, TextType* ttype, const UCHAR* str, SLONG length, const UCHAR* escape, SLONG escapeLen) { StrConverter cvt_escape(pool, ttype, escape, escapeLen); return FB_NEW_POOL(pool) SimilarToMatcher(pool, ttype, str, length, (escape ? *reinterpret_cast(escape) : 0), escapeLen != 0); } static bool evaluate(MemoryPool& pool, TextType* ttype, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escapeLen) { StrConverter cvt_escape(pool, ttype, escape, escapeLen); Evaluator evaluator(pool, ttype, p, pl, (escape ? *reinterpret_cast(escape) : 0), escapeLen != 0); evaluator.processNextChunk(s, sl); return evaluator.getResult(); } private: Evaluator evaluator; }; template SimilarToMatcher::Evaluator::Evaluator( MemoryPool& pool, TextType* aTextType, const UCHAR* patternStr, SLONG patternLen, CharType aEscapeChar, bool aUseEscape) : StaticAllocator(pool), #ifdef DEBUG_SIMILAR debugLog(pool), debugLevel(-1), #endif textType(aTextType), escapeChar(aEscapeChar), useEscape(aUseEscape), buffer(pool), originalPatternStr(patternStr), originalPatternLen(patternLen), patternCvt(pool, textType, patternStr, patternLen), charSet(textType->getCharSet()), nodes(pool), branchNum(0) { fb_assert(patternLen % sizeof(CharType) == 0); patternLen /= sizeof(CharType); CharType* p = metaCharacters; *p++ = canonicalChar(TextType::CHAR_CIRCUMFLEX); *p++ = canonicalChar(TextType::CHAR_MINUS); *p++ = canonicalChar(TextType::CHAR_UNDERLINE); *p++ = canonicalChar(TextType::CHAR_PERCENT); *p++ = canonicalChar(TextType::CHAR_OPEN_BRACKET); *p++ = canonicalChar(TextType::CHAR_CLOSE_BRACKET); *p++ = canonicalChar(TextType::CHAR_OPEN_PAREN); *p++ = canonicalChar(TextType::CHAR_CLOSE_PAREN); *p++ = canonicalChar(TextType::CHAR_OPEN_BRACE); *p++ = canonicalChar(TextType::CHAR_CLOSE_BRACE); *p++ = canonicalChar(TextType::CHAR_VERTICAL_BAR); *p++ = canonicalChar(TextType::CHAR_QUESTION_MARK); *p++ = canonicalChar(TextType::CHAR_PLUS); *p++ = canonicalChar(TextType::CHAR_ASTERISK); if (useEscape) *p++ = escapeChar; else *p++ = canonicalChar(TextType::CHAR_ASTERISK); // just repeat something fb_assert(p - metaCharacters == FB_NELEM(metaCharacters)); patternStart = patternPos = (const CharType*) patternStr; patternEnd = patternStart + patternLen; nodes.push(Node(opStart)); int flags; parseExpr(&flags); nodes.push(Node(opEnd)); #ifdef DEBUG_SIMILAR dump(); #endif // Check for proper termination. if (patternPos < patternEnd) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); branches = FB_NEW_POOL(pool) Range[branchNum + 1]; reset(); } template bool SimilarToMatcher::Evaluator::getResult() { const UCHAR* str = buffer.begin(); SLONG len = buffer.getCount(); // note that StrConverter changes str and len variables StrConverter cvt(pool, textType, str, len); fb_assert(len % sizeof(CharType) == 0); bufferStart = bufferPos = (const CharType*) str; bufferEnd = bufferStart + len / sizeof(CharType); #ifdef DEBUG_SIMILAR debugLog.clear(); debugLevel = -1; #endif const bool matched = #ifdef RECURSIVE_SIMILAR match(0); #else match(); #endif #ifdef DEBUG_SIMILAR if (matched) { for (unsigned i = 0; i <= branchNum; ++i) { string x; x.printf("%d: %d, %d\n", i, branches[i].start, branches[i].length); debugLog.add(x.c_str(), x.length()); } debugLog.add('\0'); gds__log("\n%s", debugLog.begin()); } #endif // DEBUG_SIMILAR return matched; } template bool SimilarToMatcher::Evaluator::processNextChunk(const UCHAR* data, SLONG dataLen) { const FB_SIZE_T pos = buffer.getCount(); memcpy(buffer.getBuffer(pos + dataLen) + pos, data, dataLen); return true; } template void SimilarToMatcher::Evaluator::reset() { buffer.shrink(0); memset(branches, 0, sizeof(Range) * (branchNum + 1)); } template void SimilarToMatcher::Evaluator::parseExpr(int* flagp) { *flagp = FLAG_NOT_EMPTY; bool first = true; Array refs; int start; while (first || (patternPos < patternEnd && *patternPos == canonicalChar(TextType::CHAR_VERTICAL_BAR))) { if (first) first = false; else ++patternPos; int thisBranchNum = branchNum; start = nodes.getCount(); nodes.push(Node(opBranch)); nodes.back().branchNum = thisBranchNum; int flags; parseTerm(&flags); *flagp &= ~(~flags & FLAG_NOT_EMPTY); *flagp |= flags; refs.push(nodes.getCount()); nodes.push(Node(opRef)); nodes.back().branchNum = thisBranchNum; nodes[start].ref = nodes.getCount() - start; } nodes[start].ref = 0; for (Array::iterator i = refs.begin(); i != refs.end(); ++i) nodes[*i].ref = nodes.getCount() - *i; } template void SimilarToMatcher::Evaluator::parseTerm(int* flagp) { *flagp = 0; bool first = true; CharType c; int flags; while ((patternPos < patternEnd) && (c = *patternPos) != canonicalChar(TextType::CHAR_VERTICAL_BAR) && c != canonicalChar(TextType::CHAR_CLOSE_PAREN)) { parseFactor(&flags); *flagp |= flags & FLAG_NOT_EMPTY; if (first) { *flagp |= flags; first = false; } } if (first) nodes.push(Node(opNothing)); } template void SimilarToMatcher::Evaluator::parseFactor(int* flagp) { int atomPos = nodes.getCount(); int flags; parsePrimary(&flags); CharType op; if (patternPos >= patternEnd || !isRep((op = *patternPos))) { *flagp = flags; return; } if (!(flags & FLAG_NOT_EMPTY) && op != canonicalChar(TextType::CHAR_QUESTION_MARK)) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); // If the last primary is a string, split the last character if (flags & FLAG_EXACTLY) { fb_assert(nodes.back().op == opExactly || nodes.back().op == opExactlyOne); if (nodes.back().op == opExactly && nodes.back().len > 1) { Node last = nodes.back(); last.op = opExactlyOne; last.str += nodes.back().len - 1; last.len = 1; --nodes.back().len; atomPos = nodes.getCount(); nodes.push(last); } } fb_assert( op == canonicalChar(TextType::CHAR_ASTERISK) || op == canonicalChar(TextType::CHAR_PLUS) || op == canonicalChar(TextType::CHAR_QUESTION_MARK) || op == canonicalChar(TextType::CHAR_OPEN_BRACE)); if (op == canonicalChar(TextType::CHAR_ASTERISK)) { *flagp = 0; nodes.insert(atomPos, Node(opBranch, nodes.getCount() - atomPos + 2)); nodes.push(Node(opRef, atomPos - nodes.getCount())); nodes.push(Node(opBranch)); } else if (op == canonicalChar(TextType::CHAR_PLUS)) { *flagp = FLAG_NOT_EMPTY; nodes.push(Node(opBranch, 2)); nodes.push(Node(opRef, atomPos - nodes.getCount())); nodes.push(Node(opBranch)); } else if (op == canonicalChar(TextType::CHAR_QUESTION_MARK)) { *flagp = 0; nodes.insert(atomPos, Node(opBranch, nodes.getCount() - atomPos + 1)); nodes.push(Node(opBranch)); } else if (op == canonicalChar(TextType::CHAR_OPEN_BRACE)) { ++patternPos; UCharBuffer dummy; const UCHAR* p = originalPatternStr + charSet->substring(originalPatternLen, originalPatternStr, originalPatternLen, dummy.getBuffer(originalPatternLen), 1, patternPos - patternStart); ULONG size = 0; bool comma = false; string s1, s2; bool ok; while ((ok = IntlUtil::readOneChar(charSet, &p, originalPatternStr + originalPatternLen, &size))) { if (*patternPos == canonicalChar(TextType::CHAR_CLOSE_BRACE)) { if (s1.isEmpty()) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); break; } else if (*patternPos == canonicalChar(TextType::CHAR_COMMA)) { if (comma) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); comma = true; } else { ULONG ch = 0; charSet->getConvToUnicode().convert(size, p, sizeof(ch), reinterpret_cast(&ch)); if (ch >= '0' && ch <= '9') { if (comma) s2 += (char) ch; else s1 += (char) ch; } else status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); } ++patternPos; } if (!ok || s1.length() > 9 || s2.length() > 9) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); const int n1 = atoi(s1.c_str()); const int n2 = s2.isEmpty() ? (comma ? INT_MAX : n1) : atoi(s2.c_str()); if (n2 < n1) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); *flagp = n1 == 0 ? 0 : FLAG_NOT_EMPTY; if (n1 == 0 && n2 == INT_MAX) { // Tranforms x{0,} to x* nodes.insert(atomPos, Node(opBranch, nodes.getCount() - atomPos + 2)); nodes.push(Node(opRef, atomPos - nodes.getCount())); nodes.push(Node(opBranch)); } else { if (n1 == 0) { // Tranforms x{,n} to (x?){n} nodes.insert(atomPos, Node(opBranch, nodes.getCount() - atomPos + 1)); nodes.push(Node(opBranch)); } int exprPos = atomPos + 1; int exprSize = nodes.getCount() - exprPos + 1; nodes.insert(atomPos, Node(opRepeatingRefStart, (n1 == 0 ? n2 : n1), 0, nodes.getCount() - atomPos + 1)); nodes.push(Node(opRepeatingRefEnd, atomPos - nodes.getCount())); if (n2 != n1 && n1 != 0) { if (n2 == INT_MAX) { // Tranforms x{n,} to x{n}x* nodes.push(Node(opBranch, exprSize + 2)); for (int i = 0; i < exprSize; ++i) { Node copy(nodes[exprPos + i]); nodes.push(copy); } nodes.push(Node(opRef, -exprSize - 1)); nodes.push(Node(opBranch)); } else { // Tranforms x{n,m} to x{n}(x?){m-n} nodes.push(Node(opRepeatingRefStart, n2 - n1, 0, exprSize + 3)); nodes.push(Node(opBranch, exprSize + 1)); for (int i = 0; i < exprSize; ++i) { Node copy(nodes[exprPos + i]); nodes.push(copy); } nodes.push(Node(opBranch)); nodes.push(Node(opRepeatingRefEnd, -exprSize -3)); } } } } ++patternPos; if (patternPos < patternEnd && isRep(*patternPos)) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); } template void SimilarToMatcher::Evaluator::parsePrimary(int* flagp) { *flagp = 0; const CharType op = *patternPos++; if (op == canonicalChar(TextType::CHAR_UNDERLINE)) { nodes.push(Node(opAny)); *flagp |= FLAG_NOT_EMPTY; } else if (op == canonicalChar(TextType::CHAR_PERCENT)) { nodes.push(Node(opBranch, 3)); nodes.push(Node(opAny)); nodes.push(Node(opRef, -2)); nodes.push(Node(opBranch)); *flagp = 0; return; } else if (op == canonicalChar(TextType::CHAR_OPEN_BRACKET)) { nodes.push(Node(opAnyOf)); HalfStaticArray charsBuffer; HalfStaticArray rangeBuffer; Node& node = nodes.back(); const CharType** nodeChars = &node.str; SLONG* nodeCharsLen = &node.len; const UCHAR** nodeRange = &node.str2; SLONG* nodeRangeLen = &node.len2; bool but = false; do { if (patternPos >= patternEnd) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); bool range = false; bool charClass = false; if (useEscape && *patternPos == escapeChar) { if (++patternPos >= patternEnd) status_exception::raise(Arg::Gds(isc_escape_invalid)); if (*patternPos != escapeChar && notInSet(patternPos, 1, metaCharacters, FB_NELEM(metaCharacters)) != 0) { status_exception::raise(Arg::Gds(isc_escape_invalid)); } if (patternPos + 1 < patternEnd) range = (patternPos[1] == canonicalChar(TextType::CHAR_MINUS)); } else { if (*patternPos == canonicalChar(TextType::CHAR_OPEN_BRACKET)) charClass = true; else if (*patternPos == canonicalChar(TextType::CHAR_CIRCUMFLEX)) { if (but) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); but = true; CharType* p = (CharType*) alloc(charsBuffer.getCount() * sizeof(CharType)); memcpy(p, charsBuffer.begin(), charsBuffer.getCount() * sizeof(CharType)); *nodeChars = p; *nodeCharsLen = charsBuffer.getCount(); if (rangeBuffer.getCount() > 0) { UCHAR* p = (UCHAR*) alloc(rangeBuffer.getCount()); memcpy(p, rangeBuffer.begin(), rangeBuffer.getCount()); *nodeRange = p; } *nodeRangeLen = rangeBuffer.getCount(); charsBuffer.clear(); rangeBuffer.clear(); nodeChars = &node.str3; nodeCharsLen = &node.len3; nodeRange = &node.str4; nodeRangeLen = &node.len4; ++patternPos; continue; } else if (patternPos + 1 < patternEnd) range = (patternPos[1] == canonicalChar(TextType::CHAR_MINUS)); } if (charClass) { if (++patternPos >= patternEnd || *patternPos != canonicalChar(TextType::CHAR_COLON)) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); const CharType* start = ++patternPos; while (patternPos < patternEnd && *patternPos != canonicalChar(TextType::CHAR_COLON)) ++patternPos; const SLONG len = patternPos++ - start; if (patternPos >= patternEnd || *patternPos++ != canonicalChar(TextType::CHAR_CLOSE_BRACKET)) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); typedef const UCHAR* (TextType::*GetCanonicalFunc)(int*) const; static const GetCanonicalFunc alNum[] = {&TextType::getCanonicalUpperLetters, &TextType::getCanonicalLowerLetters, &TextType::getCanonicalNumbers, NULL}; static const GetCanonicalFunc alpha[] = {&TextType::getCanonicalUpperLetters, &TextType::getCanonicalLowerLetters, NULL}; static const GetCanonicalFunc digit[] = {&TextType::getCanonicalNumbers, NULL}; static const GetCanonicalFunc lower[] = {&TextType::getCanonicalLowerLetters, NULL}; static const GetCanonicalFunc space[] = {&TextType::getCanonicalSpace, NULL}; static const GetCanonicalFunc upper[] = {&TextType::getCanonicalUpperLetters, NULL}; static const GetCanonicalFunc whitespace[] = {&TextType::getCanonicalWhiteSpaces, NULL}; struct { const GetCanonicalFunc* funcs; const ULONG nameLen; // in bytes, not characters because all functions accept length in bytes const USHORT name[10]; } static const classes[] = { // Names are in utf16 in order not to convert them every time for comparison and thus save some CPU {alNum, 10, {'A','L','N','U','M'}}, {alpha, 10, {'A','L','P','H','A'}}, {digit, 10, {'D','I','G','I','T'}}, {lower, 10, {'L','O','W','E','R'}}, {space, 10, {'S','P','A','C','E'}}, {upper, 10, {'U','P','P','E','R'}}, {whitespace, 20, {'W','H','I','T','E','S','P','A','C','E'}} }; // Get the exact original substring correspondent to the canonical bytes. HalfStaticArray classNameStr( len * charSet->maxBytesPerChar()); ULONG classNameStrLen = charSet->substring(originalPatternLen, originalPatternStr, classNameStr.getCapacity(), classNameStr.begin(), start - patternStart, len); // And then convert it to UTF-16. HalfStaticArray classNameUtf16( len * sizeof(ULONG)); ULONG classNameUtf16Len = charSet->getConvToUnicode().convert( classNameStrLen, classNameStr.begin(), classNameUtf16.getCapacity() * sizeof(USHORT), classNameUtf16.begin()); // Bring class name to uppercase for case-insensitivity. // Do it in UTF-16 because original collation can have no uppercase conversion. classNameUtf16Len = Jrd::UnicodeUtil::utf16UpperCase( classNameUtf16Len, classNameUtf16.begin(), classNameUtf16.getCapacity() * sizeof(USHORT), classNameUtf16.begin(), NULL); int classN; for (classN = 0; classN < FB_NELEM(classes); ++classN) { INTL_BOOL errorFlag; if (Jrd::UnicodeUtil::utf16Compare(classNameUtf16Len, classNameUtf16.begin(), classes[classN].nameLen, classes[classN].name, &errorFlag) == 0) { for (const GetCanonicalFunc* func = classes[classN].funcs; *func; ++func) { int count; const CharType* canonic = (const CharType*) (textType->**func)(&count); charsBuffer.push(canonic, count); } break; } } if (classN >= FB_NELEM(classes)) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); } else { charsBuffer.push(*patternPos++); if (range) { --patternPos; // go back to first char UCHAR c[sizeof(ULONG)]; ULONG len = charSet->substring(originalPatternLen, originalPatternStr, sizeof(c), c, patternPos - patternStart, 1); rangeBuffer.push(len); FB_SIZE_T rangeCount = rangeBuffer.getCount(); memcpy(rangeBuffer.getBuffer(rangeCount + len) + rangeCount, &c, len); ++patternPos; // character ++patternPos; // minus if (patternPos >= patternEnd) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); if (useEscape && *patternPos == escapeChar) { if (++patternPos >= patternEnd) status_exception::raise(Arg::Gds(isc_escape_invalid)); if (*patternPos != escapeChar && notInSet(patternPos, 1, metaCharacters, FB_NELEM(metaCharacters)) != 0) { status_exception::raise(Arg::Gds(isc_escape_invalid)); } } len = charSet->substring(originalPatternLen, originalPatternStr, sizeof(c), c, patternPos - patternStart, 1); rangeBuffer.push(len); rangeCount = rangeBuffer.getCount(); memcpy(rangeBuffer.getBuffer(rangeCount + len) + rangeCount, &c, len); charsBuffer.push(*patternPos++); } } if (patternPos >= patternEnd) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); } while (*patternPos != canonicalChar(TextType::CHAR_CLOSE_BRACKET)); CharType* p = (CharType*) alloc(charsBuffer.getCount() * sizeof(CharType)); memcpy(p, charsBuffer.begin(), charsBuffer.getCount() * sizeof(CharType)); *nodeChars = p; *nodeCharsLen = charsBuffer.getCount(); if (rangeBuffer.getCount() > 0) { UCHAR* r = (UCHAR*) alloc(rangeBuffer.getCount()); memcpy(r, rangeBuffer.begin(), rangeBuffer.getCount()); *nodeRange = r; } *nodeRangeLen = rangeBuffer.getCount(); ++patternPos; *flagp |= FLAG_NOT_EMPTY; } else if (op == canonicalChar(TextType::CHAR_OPEN_PAREN)) { int flags; parseExpr(&flags); ++branchNum; // This is used for the trace stuff. if (patternPos >= patternEnd || *patternPos++ != canonicalChar(TextType::CHAR_CLOSE_PAREN)) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); *flagp |= flags & FLAG_NOT_EMPTY; } else if (useEscape && op == escapeChar) { if (patternPos >= patternEnd) status_exception::raise(Arg::Gds(isc_escape_invalid)); if (*patternPos != escapeChar && notInSet(patternPos, 1, metaCharacters, FB_NELEM(metaCharacters)) != 0) { status_exception::raise(Arg::Gds(isc_escape_invalid)); } nodes.push(Node(opExactlyOne, patternPos++, 1)); *flagp |= FLAG_NOT_EMPTY; } else { --patternPos; const SLONG len = notInSet(patternPos, patternEnd - patternPos, metaCharacters, FB_NELEM(metaCharacters)); if (len == 0) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); *flagp |= FLAG_NOT_EMPTY | FLAG_EXACTLY; nodes.push(Node((len == 1 ? opExactlyOne : opExactly), patternPos, len)); patternPos += len; } } template bool SimilarToMatcher::Evaluator::isRep(CharType c) const { return (c == canonicalChar(TextType::CHAR_ASTERISK) || c == canonicalChar(TextType::CHAR_PLUS) || c == canonicalChar(TextType::CHAR_QUESTION_MARK) || c == canonicalChar(TextType::CHAR_OPEN_BRACE)); } #ifdef DEBUG_SIMILAR template void SimilarToMatcher::Evaluator::dump() const { string text; for (unsigned i = 0; i < nodes.getCount(); ++i) { string type; nodes[i].dump(type, i); string s; s.printf("%s%s", (i > 0 ? ", " : ""), type.c_str()); text += s; } gds__log("%s", text.c_str()); } #endif // DEBUG_SIMILAR template #ifdef RECURSIVE_SIMILAR bool SimilarToMatcher::Evaluator::match(int start) { #ifdef DEBUG_SIMILAR AutoSetRestore autoDebugLevel(&debugLevel, debugLevel + 1); #endif for (int i = start;; ++i) { const Node* node = &nodes[i]; #ifdef DEBUG_SIMILAR string s; node->dump(s, i); for (int debugLevelI = 0; debugLevelI < debugLevel; ++debugLevelI) s = " " + s; s = "\n" + s; debugLog.add(s.c_str(), s.length()); #endif switch (node->op) { case opBranch: { const CharType* const save = bufferPos; while (true) { if (node->branchNum != -1) branches[node->branchNum].start = save - bufferStart; if (match(i + 1)) return true; bufferPos = save; if (node->ref == 0) return false; i += node->ref; node = &nodes[i]; if (node->ref == 0) break; #ifdef DEBUG_SIMILAR node->dump(s, i); for (int debugLevelI = 0; debugLevelI < debugLevel; ++debugLevelI) s = " " + s; s = "\n" + s; debugLog.add(s.c_str(), s.length()); #endif } break; } case opStart: if (bufferPos != bufferStart) return false; break; case opEnd: return (bufferPos == bufferEnd); case opRef: if (node->branchNum != -1) { fb_assert(unsigned(node->branchNum) <= branchNum); branches[node->branchNum].length = bufferPos - bufferStart - branches[node->branchNum].start; } if (node->ref == 1) // avoid recursion break; return match(i + node->ref); //// FIXME: opRepeatingRefStart, opRepeatingRefEnd case opNothing: break; case opAny: #ifdef DEBUG_SIMILAR if (bufferPos >= bufferEnd) s = " -> "; else s.printf(" -> %d", *bufferPos); debugLog.add(s.c_str(), s.length()); #endif if (bufferPos >= bufferEnd) return false; ++bufferPos; break; case opAnyOf: #ifdef DEBUG_SIMILAR if (bufferPos >= bufferEnd) s = " -> "; else s.printf(" -> %d", *bufferPos); debugLog.add(s.c_str(), s.length()); #endif if (bufferPos >= bufferEnd) return false; if (notInSet(bufferPos, 1, node->str, node->len) != 0) { const UCHAR* const end = node->str2 + node->len2; const UCHAR* p = node->str2; while (p < end) { UCHAR c[sizeof(ULONG)]; ULONG len = charSet->substring(buffer.getCount(), buffer.begin(), sizeof(c), c, bufferPos - bufferStart, 1); if (textType->compare(len, c, p[0], p + 1) >= 0 && textType->compare(len, c, p[1 + p[0]], p + 2 + p[0]) <= 0) { break; } p += 2 + p[0] + p[1 + p[0]]; } if (node->len + node->len2 != 0 && p >= end) return false; } if (notInSet(bufferPos, 1, node->str3, node->len3) == 0) return false; else { const UCHAR* const end = node->str4 + node->len4; const UCHAR* p = node->str4; while (p < end) { UCHAR c[sizeof(ULONG)]; const ULONG len = charSet->substring(buffer.getCount(), buffer.begin(), sizeof(c), c, bufferPos - bufferStart, 1); if (textType->compare(len, c, p[0], p + 1) >= 0 && textType->compare(len, c, p[1 + p[0]], p + 2 + p[0]) <= 0) { break; } p += 2 + p[0] + p[1 + p[0]]; } if (p < end) return false; } ++bufferPos; break; case opExactly: if (bufferEnd - bufferPos >= node->len && memcmp(node->str, bufferPos, node->len * sizeof(CharType)) == 0) { bufferPos += node->len; break; } else return false; case opExactlyOne: if (bufferEnd - bufferPos >= 1 && *node->str == *bufferPos) { bufferPos += node->len; break; } else return false; default: fb_assert(false); return false; } } return true; } #else bool SimilarToMatcher::Evaluator::match() { // Left shift by 4 to OR MatchState's and Op's without additional runtime shifts. static const unsigned MATCH_STATE_SHIFT = 4; enum MatchState { msIterating = 0x00 << MATCH_STATE_SHIFT, msReturningFalse = 0x01 << MATCH_STATE_SHIFT, msReturningTrue = 0x02 << MATCH_STATE_SHIFT, msReturningMask = (msReturningFalse | msReturningTrue) }; SimpleStack scopeStack; // Add special node to return without needing additional comparison after popping // the stack on each return. Node nodeRet(opRet); scopeStack.push(&nodeRet); scopeStack.push(nodes.begin()); MatchState state = msIterating; SimpleStack repeatStack; SLONG repeatCount = 0; Node nodeRepeatingRestore(opRepeatingRestore); while (true) { fb_assert(scopeStack.getCount() > 0); Scope* const scope = scopeStack.back; const Node* const node = scope->i; #ifdef DEBUG_SIMILAR string debugText; node->dump(debugText, (node == &nodeRet ? -1 : node - nodes.begin())); for (const CharType* p = bufferPos; p != bufferEnd; ++p) { string s; s.printf(" %04d", *p); debugText += s; } debugText += "\nrepeat:"; for (const int* p = repeatStack.begin(); p <= repeatStack.back; ++p) { string s; s.printf(" %d", *p); debugText += s; } { string s; s.printf(" %d", repeatCount); debugText += s; } debugText += "\nscope:"; for (const Scope* p = scopeStack.begin(); p <= scopeStack.back; ++p) { string s; s.printf(" %d", (p->i == &nodeRet ? -1 : (p->i == &nodeRepeatingRestore ? -2 : p->i - nodes.begin()))); debugText += s; } gds__log("%d, %s", state, debugText.c_str()); #endif #define ENCODE_OP_STATE(op, state) ((op) | (state)) // Go directly to op and state with a single switch. switch (ENCODE_OP_STATE(node->op, state)) { case ENCODE_OP_STATE(opBranch, msIterating): if (node->branchNum != -1) branches[node->branchNum].start = bufferPos - bufferStart; scope->save = bufferPos; scopeStack.push(scope->i + 1); continue; case ENCODE_OP_STATE(opBranch, msReturningFalse): bufferPos = scope->save; if (node->ref != 0) { state = msIterating; scope->i += node->ref; if (scope->i->ref != 0) { scope->save = bufferPos; scopeStack.push(scope->i + 1); continue; } } break; case ENCODE_OP_STATE(opBranch, msReturningTrue): break; case ENCODE_OP_STATE(opStart, msIterating): if (bufferPos != bufferStart) state = msReturningFalse; break; case ENCODE_OP_STATE(opEnd, msIterating): state = (bufferPos == bufferEnd ? msReturningTrue : msReturningFalse); break; case ENCODE_OP_STATE(opRef, msIterating): if (node->branchNum != -1) { fb_assert(unsigned(node->branchNum) <= branchNum); branches[node->branchNum].length = bufferPos - bufferStart - branches[node->branchNum].start; } scope->i += node->ref; scope->save = NULL; continue; case ENCODE_OP_STATE(opRef, msReturningFalse): case ENCODE_OP_STATE(opRef, msReturningTrue): break; case ENCODE_OP_STATE(opRepeatingRefStart, msIterating): repeatStack.push(repeatCount); repeatCount = node->len; scopeStack.push(scope->i + node->ref); continue; case ENCODE_OP_STATE(opRepeatingRefStart, msReturningFalse): case ENCODE_OP_STATE(opRepeatingRefStart, msReturningTrue): repeatCount = repeatStack.pop(); break; case ENCODE_OP_STATE(opRepeatingRefEnd, msIterating): if (repeatCount > 0) { --repeatCount; scopeStack.push(scope->i + node->ref + 1); } else { repeatCount = repeatStack.pop(); scopeStack.push(&nodeRepeatingRestore); scopeStack.push(scope->i + 1); } continue; case ENCODE_OP_STATE(opRepeatingRefEnd, msReturningFalse): ++repeatCount; break; case ENCODE_OP_STATE(opRepeatingRefEnd, msReturningTrue): break; case ENCODE_OP_STATE(opRepeatingRestore, msReturningFalse): case ENCODE_OP_STATE(opRepeatingRestore, msReturningTrue): repeatStack.push(repeatCount); repeatCount = -1; break; case ENCODE_OP_STATE(opNothing, msIterating): case ENCODE_OP_STATE(opNothing, msReturningFalse): case ENCODE_OP_STATE(opNothing, msReturningTrue): break; case ENCODE_OP_STATE(opAny, msIterating): if (bufferPos >= bufferEnd) state = msReturningFalse; else ++bufferPos; break; case ENCODE_OP_STATE(opAnyOf, msIterating): if (bufferPos >= bufferEnd) state = msReturningFalse; else { if (notInSet(bufferPos, 1, node->str, node->len) != 0) { const UCHAR* const end = node->str2 + node->len2; const UCHAR* p = node->str2; while (p < end) { UCHAR c[sizeof(ULONG)]; const ULONG len = charSet->substring(buffer.getCount(), buffer.begin(), sizeof(c), c, bufferPos - bufferStart, 1); if (textType->compare(len, c, p[0], p + 1) >= 0 && textType->compare(len, c, p[1 + p[0]], p + 2 + p[0]) <= 0) { break; } p += 2 + p[0] + p[1 + p[0]]; } if (node->len + node->len2 != 0 && p >= end) { state = msReturningFalse; break; } } if (notInSet(bufferPos, 1, node->str3, node->len3) == 0) state = msReturningFalse; else { const UCHAR* const end = node->str4 + node->len4; const UCHAR* p = node->str4; while (p < end) { UCHAR c[sizeof(ULONG)]; const ULONG len = charSet->substring( buffer.getCount(), buffer.begin(), sizeof(c), c, bufferPos - bufferStart, 1); if (textType->compare(len, c, p[0], p + 1) >= 0 && textType->compare(len, c, p[1 + p[0]], p + 2 + p[0]) <= 0) { break; } p += 2 + p[0] + p[1 + p[0]]; } if (p < end) state = msReturningFalse; } } if (state == msIterating) ++bufferPos; break; case ENCODE_OP_STATE(opExactly, msIterating): if (bufferEnd - bufferPos >= node->len && memcmp(node->str, bufferPos, node->len * sizeof(CharType)) == 0) { bufferPos += node->len; } else state = msReturningFalse; break; case ENCODE_OP_STATE(opExactlyOne, msIterating): if (bufferEnd - bufferPos >= 1 && *node->str == *bufferPos) ++bufferPos; else state = msReturningFalse; break; case ENCODE_OP_STATE(opRet, msReturningFalse): case ENCODE_OP_STATE(opRet, msReturningTrue): fb_assert(repeatStack.getCount() == 0); return state == msReturningTrue; default: fb_assert(false); return false; } #undef ENCODE_OP_STATE switch (state) { case msIterating: ++scope->i; break; case msReturningFalse: case msReturningTrue: scopeStack.pop(); break; } } fb_assert(false); return false; } #endif // Returns the number of characters up to first one present in set. template SLONG SimilarToMatcher::Evaluator::notInSet( const CharType* str, SLONG strLen, const CharType* set, SLONG setLen) { for (const CharType* begin = str; str - begin < strLen; ++str) { for (const CharType* p = set; p - set < setLen; ++p) { if (*p == *str) return str - begin; } } return strLen; } // Given a regular expression R1#R2#R3 and the string S: // - Find the shortest substring of S that matches R1 while the remainder (S23) matches R2R3; // - Find the longest (S2) substring of S23 that matches R2 while the remainder matches R3; // - Return S2. template > class SubstringSimilarMatcher : public Jrd::BaseSubstringSimilarMatcher { private: typedef Jrd::CharSet CharSet; typedef Jrd::TextType TextType; public: SubstringSimilarMatcher(MemoryPool& pool, TextType* ttype, const UCHAR* patternStr, SLONG patternLen, CharType aEscapeChar) : BaseSubstringSimilarMatcher(pool, ttype), escapeChar(aEscapeChar), originalPatternStr(patternStr), originalPatternLen(patternLen), patternCvt(pool, textType, patternStr, patternLen), buffer(pool) { CharSet* charSet = textType->getCharSet(); // Make a new string without the . While doing it, get the byte // length of each segment. UCharBuffer newExpr(originalPatternLen); UCHAR* newExprPos = newExpr.begin(); const UCHAR* originalPatternEnd = originalPatternStr + originalPatternLen; const UCHAR* originalPatternPos = originalPatternStr; const CharType* lastStart = reinterpret_cast(patternStr); const CharType* end = lastStart + patternLen; unsigned lengths[3]; unsigned lengthsNum = 0; UCHAR dummy[sizeof(ULONG) * 2]; for (const CharType* p = lastStart; p < end; ++p) { if (*p != escapeChar) continue; if (++p >= end) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); if (*p == canonicalChar(TextType::CHAR_DOUBLE_QUOTE)) { if (lengthsNum >= 2) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); // Get the byte length since the last segment. ULONG len = charSet->substring(originalPatternEnd - originalPatternPos, originalPatternPos, newExpr.begin() + originalPatternLen - newExprPos, newExprPos, 0, p - lastStart - 1); lengths[lengthsNum++] = len; newExprPos += len; originalPatternPos += len; // Advance two () characters. originalPatternPos += charSet->substring(originalPatternEnd - originalPatternPos, originalPatternPos, sizeof(dummy), dummy, 0, 2); lastStart = p + 1; // Register the start of the next segment. } } if (lengthsNum != 2) status_exception::raise(Arg::Gds(isc_invalid_similar_pattern)); // Get the byte length of the last segment. lengths[2] = charSet->substring(originalPatternEnd - originalPatternPos, originalPatternPos, newExpr.begin() + originalPatternLen - newExprPos, newExprPos, 0, end - lastStart); // Construct the needed regular expressions. r1 = FB_NEW_POOL(pool) SimilarToMatcher(pool, ttype, newExpr.begin(), lengths[0], escapeChar, true); r2 = FB_NEW_POOL(pool) SimilarToMatcher(pool, ttype, newExpr.begin() + lengths[0], lengths[1], escapeChar, true); r3 = FB_NEW_POOL(pool) SimilarToMatcher(pool, ttype, newExpr.begin() + lengths[0] + lengths[1], lengths[2], escapeChar, true); r23 = FB_NEW_POOL(pool) SimilarToMatcher(pool, ttype, newExpr.begin() + lengths[0], lengths[1] + lengths[2], escapeChar, true); } static SubstringSimilarMatcher* create(MemoryPool& pool, TextType* ttype, const UCHAR* str, SLONG length, const UCHAR* escape, SLONG escapeLen) { StrConverter cvt_escape(pool, ttype, escape, escapeLen); return FB_NEW_POOL(pool) SubstringSimilarMatcher(pool, ttype, str, length, *reinterpret_cast(escape)); } void reset() { buffer.shrink(0); r1->reset(); r2->reset(); r3->reset(); r23->reset(); } bool result() { CharSet* charSet = textType->getCharSet(); const UCHAR* p = buffer.begin(); UCharBuffer temp(buffer.getCount()); UCHAR dummy[sizeof(ULONG)]; // Find the shortest substring that matches R1 while the full expression matches R1R2R3. do { r1->reset(); r1->process(buffer.begin(), p - buffer.begin()); if (r1->result()) { // We have a initial substring matching R1. Let's see if the remainder matches R2R3. r23->reset(); r23->process(p, buffer.end() - p); if (r23->result()) { // Now we start to find the longest substring that matches R2 while the // remainder matches R3. Once we found it, it's the result string. // We already know its start, based on the substring that matched R1. matchedStart = p - buffer.begin(); const UCHAR* p3 = buffer.end(); SLONG charLen23 = -1; memcpy(temp.begin(), p, p3 - p); while (true) { r2->reset(); r2->process(temp.begin(), p3 - p); if (r2->result()) { r3->reset(); r3->process(p3, buffer.end() - p3); if (r3->result()) { matchedLength = p3 - buffer.begin() - matchedStart; return true; } } if (charLen23 == -1) charLen23 = charSet->length(p3 - p, p, true); if (charLen23-- == 0) break; // Shrink in one character the string to match R2. // Move back one character to match R3. p3 = p + charSet->substring(buffer.end() - p, p, temp.getCapacity(), temp.begin(), 0, charLen23); } } } // Advance a character. p += charSet->substring(buffer.end() - p, p, sizeof(dummy), dummy, 0, 1); } while (p < buffer.end()); return false; } bool process(const UCHAR* str, SLONG length) { const FB_SIZE_T pos = buffer.getCount(); memcpy(buffer.getBuffer(pos + length) + pos, str, length); return true; } // We return byte-base start and length. void getResultInfo(unsigned* start, unsigned* length) { *start = matchedStart; *length = matchedLength; } private: CharType canonicalChar(int ch) const { return *reinterpret_cast(textType->getCanonicalChar(ch)); } private: CharType escapeChar; const UCHAR* originalPatternStr; SLONG originalPatternLen; StrConverter patternCvt; HalfStaticArray buffer; AutoPtr r1, r2, r3, r23; unsigned matchedStart; unsigned matchedLength; }; } // namespace Firebird #endif // JRD_SIMILAR_TO_EVALUATOR_H