8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-28 02:43:03 +01:00
firebird-mirror/src/jrd/evl_string.h

432 lines
11 KiB
C
Raw Normal View History

/*
* PROGRAM: JRD Access Method
* MODULE: evl_string.h
* DESCRIPTION: Streamed string functions
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
* You may obtain a copy of the Licence at
* http://www.gnu.org/licences/lgpl.html
*
* As a special exception this file can also be included in modules
* with other source code as long as that source code has been
* released under an Open Source Initiative certificed licence.
* More information about OSI certification can be found at:
* http://www.opensource.org
*
* This module is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public Licence for more details.
*
* This module was created by members of the firebird development
* team. All individual contributions remain the Copyright (C) of
* those individuals and all rights are reserved. Contributors to
* this file are either listed below or can be obtained from a CVS
* history command.
*
* Created by: Nickolay Samofatov <skidder@bssys.com>
*
* Contributor(s):
*
*
* $Id: evl_string.h,v 1.1 2003-11-16 22:10:26 skidder Exp $
*
*/
#ifndef EVL_STRING_H
#define EVL_STRING_H
#include "..\common\classes\alloc.h"
#include "..\common\classes\array.h"
// Inline (non-heap) capacity of the compiled pattern item list
#define STATIC_PATTERN_ITEMS 16

// Inline (non-heap) capacity of the list of branches matched in parallel
#define STATIC_STATUS_ITEMS 16

// Size in bytes of the scratch buffer embedded in every evaluator.
// KMP jump tables and unescaped pattern strings are carved out of it.
// Builds defining TESTING_ONLY get a deliberately tiny buffer.
#if !defined(TESTING_ONLY)
#define STATIC_PATTERN_BUFFER 1024
#else
#define STATIC_PATTERN_BUFFER 16
#endif
namespace Firebird {
// Kind of a single element of a compiled LIKE pattern
enum PatternItemType {
	// Placeholder; after compilation it remains only for an empty pattern
	// or for a pattern that consists solely of match-any characters
	piNone = 0,
	// Literal string located with Knuth-Morris-Pratt search
	piSearch = 1,
	// Fixed number of characters to skip
	piSkipFixed = 2,
	// Literal string compared character by character at the current position
	piDirectMatch = 3,
	// Compilation phase only: literal string that still contains escape characters
	piEscapedString = 4,
	// Compilation phase only: match-any marker, used to optimize
	// subpatterns like "%_%____%_"; never seen by the matching phase
	piSkipMore = 5
};
// Verdict of the evaluator for the data processed so far
enum MatchType {
	// The data seen so far does not match
	MATCH_NONE = 0,
	// The data seen so far matches exactly; any further data revokes the match
	MATCH_FIXED = 1,
	// The pattern is satisfied regardless of any data that may follow
	MATCH_ANY = 2
};
// Streaming evaluator of SQL LIKE patterns over characters of type CharType.
//
// prepare() compiles a pattern into a list of pattern items and leaves the
// evaluator ready for matching. The string under test is then supplied in one
// or more pieces through processNextChunk(), and getResult() reports the
// verdict for the data seen so far. reset() rewinds the matcher so that the
// compiled pattern can be applied to another string.
template <typename CharType>
class LikeEvaluator {
public:
	// Creates an evaluator without a compiled pattern.
	// _pool is used for the item/branch arrays and for scratch allocations
	// that do not fit into the inline buffer.
	LikeEvaluator(MemoryPool* _pool) : pool(_pool), patternItems(_pool),
		branches(_pool), chunksToFree(_pool), allocated(0), match_type(MATCH_NONE)
	{ }

	// Returns every pool-allocated scratch chunk to the pool.
	~LikeEvaluator() {
		for (int i = 0; i < chunksToFree.getCount(); i++)
			pool->deallocate(chunksToFree[i]);
	}

	// Restarts matching from the first pattern item.
	// Requires a compiled pattern (at least one pattern item).
	void reset() {
		assert(patternItems.getCount());
		branches.shrink(0);
		if (patternItems[0].type == piNone) {
			// Nothing to match against: the verdict is known before any data
			// arrives. Either everything matches or only the empty string does.
			match_type = (patternItems[0].match_any ? MATCH_ANY : MATCH_FIXED);
		} else {
			BranchItem temp = {&patternItems[0], 0};
			branches.add(temp);
			match_type = MATCH_NONE;
		}
	}

	// Returns true if the data processed so far matches the pattern.
	bool getResult() {
		return match_type != MATCH_NONE;
	}

	// Compiles the pattern and gets the evaluator ready for matching.
	void prepare(const CharType* pattern_str, SSHORT pattern_len, CharType escape_char,
		CharType sql_match_any, CharType sql_match_one);

	// Returns true if more data can change the result of evaluation
	bool processNextChunk(const CharType* data, SSHORT data_len);

private:
	// One element of the compiled pattern
	struct PatternItem {
		PatternItemType type;
		union {
			struct {
				SSHORT length;
				CharType* data;
				SSHORT* kmpNext; // Jump table for Knuth-Morris-Pratt algorithm
			} str;
			SSHORT skipCount;
		};
		// True if any amount of data may follow this item
		bool match_any;
	};

	// One of the alternative match attempts tracked in parallel
	struct BranchItem {
		PatternItem* pattern;
		SSHORT offset; // Match offset inside this pattern
	};

	MemoryPool* pool;
	HalfStaticArray<PatternItem, STATIC_PATTERN_ITEMS> patternItems;
	HalfStaticArray<BranchItem, STATIC_STATUS_ITEMS> branches;
	Array<CharType*> chunksToFree;
	char allocBuffer[STATIC_PATTERN_BUFFER];
	int allocated;
	MatchType match_type;

	// Hands out count bytes of scratch storage that lives as long as the
	// evaluator: from the inline buffer while it has room, otherwise from
	// the pool (such chunks are remembered and released by the destructor).
	void* alloc(SSHORT count) {
		// The storage is used both for CharType strings and for SSHORT jump
		// tables, so every request is rounded up to pointer size to keep the
		// following request from starting at an odd offset.
		// NOTE(review): this relies on allocBuffer itself starting at a
		// pointer-aligned address inside the object - confirm for all targets.
		const int alignment = sizeof(void*);
		const int rounded = (count + alignment - 1) / alignment * alignment;

		if (allocated + rounded <= STATIC_PATTERN_BUFFER) {
			// void* rather than CharType*: the buffer is made of plain chars
			// and char* does not convert to CharType* for other char types
			void* const result = allocBuffer + allocated;
			allocated += rounded;
			return result;
		}

		CharType* const chunk = reinterpret_cast<CharType*>(pool->allocate(count));
		chunksToFree.add(chunk);
		return chunk;
	}

	// Builds the Knuth-Morris-Pratt jump table for string x of m characters.
	// kmpNext must have room for m + 1 entries; entry m is the offset at
	// which the search resumes after a complete match.
	static void preKmp(CharType *x, int m, SSHORT kmpNext[]) {
		SSHORT i, j;
		i = 0;
		j = kmpNext[0] = -1;
		while (i < m) {
			while (j > -1 && x[i] != x[j])
				j = kmpNext[j];
			i++;
			j++;
			// x is not NUL-terminated, so x[m] must never be read:
			// the last entry (i == m) always receives j
			if (i < m && x[i] == x[j])
				kmpNext[i] = kmpNext[j];
			else
				kmpNext[i] = j;
		}
	}
};
// Compiles a LIKE pattern into patternItems and gets the evaluator ready
// for matching (reset() is called at the end).
//
//   pattern_str, pattern_len - the pattern; literal strings that need no
//                              unescaping keep pointing into this buffer,
//                              so it has to outlive the matching
//   escape_char              - escape character, 0 if there is none
//   sql_match_any            - character matching any run of characters ('%')
//   sql_match_one            - character matching exactly one character ('_')
//
// Posts gds_like_escape_invalid if the escape character is not followed by
// the escape character itself or by one of the two match characters.
template <typename CharType>
void LikeEvaluator<CharType>::prepare(const CharType* pattern_str,
	SSHORT pattern_len, CharType escape_char, CharType sql_match_any, CharType sql_match_one)
{
	// Scratch storage handed out for a previously compiled pattern is not
	// referenced by anything once the item list is cleared, so the inline
	// buffer can be reused. Pool chunks stay owned by chunksToFree and are
	// released by the destructor.
	allocated = 0;

	// PASS1. Parse pattern.
	SSHORT pattern_pos = 0;
	patternItems.shrink(0);

	// Pristine item: piNone, zero length / skip count, match_any == false.
	// grow() is not relied upon to initialise the elements it appends, so
	// every appended item is overwritten with this value before use.
	PatternItem blank = {piNone, 0};
	patternItems.add(blank);
	PatternItem *item = patternItems.begin();

	while (pattern_pos < pattern_len) {
		CharType c = pattern_str[pattern_pos++];

		// Escaped symbol
		if (escape_char && c == escape_char) {
			if (pattern_pos < pattern_len) {
				c = pattern_str[pattern_pos++];
				/* Note: SQL II says <escape_char><escape_char> is error condition */
				if (c == escape_char || c == sql_match_any || c == sql_match_one)
				{
					switch (item->type) {
					case piSkipFixed:
					case piSkipMore:
						patternItems.grow(patternItems.getCount()+1);
						item = patternItems.end()-1;
						*item = blank;
						// Note: fall into
					case piNone:
						item->type = piEscapedString;
						// The string starts at the escape character itself;
						// length counts characters after unescaping
						item->str.data = const_cast<CharType*>(pattern_str + pattern_pos - 2);
						item->str.length = 1;
						break;
					case piSearch:
						item->type = piEscapedString;
						// Note: fall into
					case piEscapedString:
						item->str.length++;
						break;
					}
					continue;
				}
			}
			ERR_post(gds_like_escape_invalid, 0);
		}

		// percent sign
		if (c == sql_match_any) {
			switch(item->type) {
			case piSearch:
			case piEscapedString:
				patternItems.grow(patternItems.getCount()+1);
				item = patternItems.end()-1;
				// Also gives the new piSkipMore item a skip count of zero
				*item = blank;
				// Note: fall into
			case piSkipFixed:
			case piNone:
				// A preceding run of '_' keeps its skip count
				item->type = piSkipMore;
				break;
			}
			continue;
		}

		// underscore
		if (c == sql_match_one) {
			switch(item->type) {
			case piSearch:
			case piEscapedString:
				patternItems.grow(patternItems.getCount()+1);
				item = patternItems.end()-1;
				*item = blank;
				// Note: fall into
			case piNone:
				item->type = piSkipFixed;
				item->skipCount = 1;
				break;
			case piSkipFixed:
			case piSkipMore:
				item->skipCount++;
				break;
			}
			continue;
		}

		// anything else
		switch (item->type) {
		case piSkipFixed:
		case piSkipMore:
			patternItems.grow(patternItems.getCount()+1);
			item = patternItems.end()-1;
			*item = blank;
			// Note: fall into
		case piNone:
			item->type = piSearch;
			item->str.data = const_cast<CharType*>(pattern_str + pattern_pos - 1);
			item->str.length = 1;
			break;
		case piSearch:
		case piEscapedString:
			item->str.length++;
			break;
		}
	}

	// PASS2. Compilation/Optimization
	// Unescape strings, mark direct match items, pre-compile KMP tables and
	// optimize out piSkipMore nodes
	//
	// directMatch is true while the position of the next literal string is
	// fixed, i.e. no match-any has been seen since the previous literal
	bool directMatch = true;
	for (int i=0; i < patternItems.getCount();) {
		PatternItem *node = &patternItems[i];
		switch (node->type) {
		case piEscapedString: {
			// Copy the string into scratch storage, dropping escape characters
			const CharType *curPos = node->str.data;
			node->str.data =
				reinterpret_cast<CharType*>(alloc(node->str.length*sizeof(CharType)));
			for (SSHORT j=0; j < node->str.length; j++) {
				if (*curPos == escape_char)
					curPos++;
				node->str.data[j] = *curPos++;
			}
			node->type = piSearch;
			// Note: fall into
		}
		case piSearch:
			if (directMatch)
				node->type = piDirectMatch;
			else {
				// One extra table entry: the resume offset after a full match
				node->str.kmpNext =
					reinterpret_cast<SSHORT*>(alloc((node->str.length+1)*sizeof(SSHORT)));
				preKmp(node->str.data, node->str.length, node->str.kmpNext);
				directMatch = true;
			}
			break;
		case piSkipMore:
			// Optimize out piSkipMore
			directMatch = false;
			if (node->skipCount != 0) {
				// Convert this node to SkipFixed if possible
				node->type = piSkipFixed;
				node->match_any = true;
			} else {
				if (i > 0) {
					// Mark previous node if it exists
					patternItems[i-1].match_any = true;
					patternItems.remove(i);
					continue;
				}
				// Remove node if we have other nodes
				if (patternItems.getCount() != 1) {
					patternItems.remove(i);
					continue;
				}
				// Our pattern is single %
				node->type = piNone;
				node->match_any = true;
			}
			break;
		}
		i++;
	}

	// Get ready for parsing
	reset();
}
// Runs the next piece of the string under test through the compiled pattern.
//
//   data, data_len - the piece to process; data_len must not be zero
//
// Returns true if more data can change the result of evaluation, false once
// the verdict reported by getResult() is final.
//
// Every entry of branches is an independent match attempt (pattern item plus
// offset inside it). Each input character is applied to all live branches.
template <typename CharType>
bool LikeEvaluator<CharType>::processNextChunk(const CharType* data, SSHORT data_len) {
assert(data_len);
assert(patternItems.getCount());
// An exact match of the data seen before no longer holds: more data has arrived
if (match_type == MATCH_FIXED)
match_type = MATCH_NONE;
// No live match attempts are left, so nothing can change the verdict any more
if (branches.getCount() == 0)
return false;
SSHORT data_pos = 0;
// Position in this chunk at which the whole pattern was last completed.
// It only counts as a match if it is the last character of the chunk.
SSHORT finishCandidate = -1;
while (data_pos < data_len) {
int branch_number = 0;
while (branch_number < branches.getCount())
{
BranchItem *current_branch = &branches[branch_number];
PatternItem *current_pattern = current_branch->pattern;
switch(current_pattern->type) {
case piDirectMatch:
if (data[data_pos] != current_pattern->str.data[current_branch->offset]) {
// Terminate matching branch
branches.remove(branch_number);
if (branches.getCount() == 0)
return false;
// The next branch has moved into this slot, so do not advance branch_number
continue;
}
// Note: fall into
case piSkipFixed:
current_branch->offset++;
// For piSkipFixed items the count was stored through skipCount,
// which overlays str.length in the PatternItem union
if (current_branch->offset >= current_pattern->str.length) {
// Switch to next subpattern or finish matching
if (current_pattern->match_any) {
current_pattern++;
if (current_pattern >= patternItems.end()) {
// Last item and anything may follow it: final positive verdict
branches.shrink(0);
match_type = MATCH_ANY;
return false;
}
// Drop all other attempts and carry on with the next item
// in a single branch, starting from the next character
branches.shrink(1);
branches[0].pattern = current_pattern;
branches[0].offset = 0;
branch_number = 0;
break;
}
current_pattern++;
if (current_pattern >= patternItems.end()) {
// Pattern completed; this is a match only if the data ends here
finishCandidate = data_pos;
branches.remove(branch_number);
if (branches.getCount() == 0) {
if (data_pos == data_len-1) {
// Matched up to the end of this chunk; a further chunk would revoke it
match_type = MATCH_FIXED;
return true;
}
// Unmatched characters remain and no attempt is left
return false;
}
continue;
}
current_branch->pattern = current_pattern;
current_branch->offset = 0;
}
break;
case piSearch:
// Knuth-Morris-Pratt search algorithm
while (current_branch->offset >= 0 &&
current_pattern->str.data[current_branch->offset] != data[data_pos])
{
current_branch->offset = current_pattern->str.kmpNext[current_branch->offset];
}
current_branch->offset++;
if (current_branch->offset >= current_pattern->str.length) {
// Occurrence found. Reposition through the last table entry so
// that this branch keeps looking for further occurrences.
current_branch->offset = current_pattern->str.kmpNext[current_branch->offset];
if (current_pattern + 1 >= patternItems.end()) {
if (current_pattern->match_any) {
branches.shrink(0);
match_type = MATCH_ANY;
return false;
}
// We are looking for the pattern at the end of string
finishCandidate = data_pos;
} else {
// Try to apply further patterns, but continue searching
BranchItem temp = {current_pattern + 1, 0};
branches.insert(branch_number+1, temp); // +1 is to reduce movement effort :)
branch_number++; // Skip newly inserted branch in this cycle
}
}
break;
default:
// Other item types are not expected in the matching phase
assert(false);
}
branch_number++;
}
data_pos++;
}
// The pattern was completed exactly at the last character of this chunk
if (finishCandidate == data_len - 1)
match_type = MATCH_FIXED;
return true;
}
}
#endif