8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-27 06:03:02 +01:00
firebird-mirror/src/jrd/intl_classes.h
2005-06-20 05:40:38 +00:00

633 lines
18 KiB
C++

/*
* PROGRAM: JRD International support
* MODULE: intl_classes.h
* DESCRIPTION: International text handling definitions
*
* The contents of this file are subject to the Initial
* Developer's Public License Version 1.0 (the "License");
* you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
* http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl.
*
* Software distributed under the License is distributed AS IS,
* WITHOUT WARRANTY OF ANY KIND, either express or implied.
* See the License for the specific language governing rights
* and limitations under the License.
*
* The Original Code was created by Nickolay Samofatov
* for the Firebird Open Source RDBMS project.
*
* Copyright (c) 2004 Nickolay Samofatov <nickolay@broadviewsoftware.com>
* and all contributors signed below.
*
* All Rights Reserved.
* Contributor(s): ______________________________________.
*
*/
#ifndef JRD_INTL_CLASSES_H
#define JRD_INTL_CLASSES_H
#include "firebird.h"
#include "../jrd/jrd.h"
#include "../jrd/intlobj_new.h"
#include "../jrd/constants.h"
#include "../jrd/unicode_util.h"
typedef SSHORT CHARSET_ID;
typedef SSHORT COLLATE_ID;
typedef USHORT TTYPE_ID;
namespace Jrd {
// Abstract interface with a reset / process / result protocol; judging by
// its name it backs LIKE evaluation (TextType::like_create() returns one).
// NOTE(review): the meaning of process()'s return value and of its
// arguments is not visible in this header - confirm against implementations.
class LikeObject
{
public:
	// Puts the object back into its initial state.
	virtual void reset() = 0;

	// Supplies a buffer (pointer + SLONG length) to the object.
	virtual bool process(thread_db* tdbb, TextType* textType, const UCHAR* data, SLONG dataLen) = 0;

	// Reports the outcome.
	virtual bool result() = 0;

	virtual ~LikeObject()
	{
	}
};
class ContainsObject {
public:
virtual void reset() = 0;
virtual bool process(Jrd::thread_db*, Jrd::TextType*, const UCHAR*, SLONG) = 0;
virtual bool result() = 0;
virtual ~ContainsObject() {}
};
// Lightweight, copyable handle for a csconvert descriptor. The handle stores
// only the pointer it was given and forwards every call to the conversion
// callback found in that descriptor.
class CsConvert
{
public:
	// Deliberately not explicit: CharSet::getConvToUnicode() and
	// getConvFromUnicode() return a csconvert* where a CsConvert is expected.
	CsConvert(csconvert* _cnvt)
		: cnvt(_cnvt)
	{
	}

	CsConvert(const CsConvert& obj)
		: cnvt(obj.cnvt)
	{
	}

	// CVC: Beware of this can of worms: csconvert_fn_convert gets assigned
	// different functions that do not necessarily take the same argument
	// types. Typically the source and destination pointers refer to
	// different element types, which is why the typed overloads below cast
	// to plain bytes before reaching the callback.
	//
	// Arguments shared by all convert() overloads, in callback order:
	//   a - source length, b - source buffer,
	//   c - destination capacity, d - destination buffer,
	//   e - receives an error code, f - receives an error position.
	// The value returned is whatever the callback returns.

	// USHORT source, byte destination.
	// To be used with the getConvFromUnicode method of class CharSet.
	ULONG convert(ULONG a,
				  const USHORT* b,
				  ULONG c,
				  UCHAR* d,
				  USHORT* e,
				  ULONG* f)
	{
		const UCHAR* const srcBytes = reinterpret_cast<const UCHAR*>(b);
		return convert(a, srcBytes, c, d, e, f);
	}

	// Byte source, USHORT destination.
	// To be used with the getConvToUnicode method of class CharSet.
	ULONG convert(ULONG a,
				  const UCHAR* b,
				  ULONG c,
				  USHORT* d,
				  USHORT* e,
				  ULONG* f)
	{
		UCHAR* const dstBytes = reinterpret_cast<UCHAR*>(d);
		return convert(a, b, c, dstBytes, e, f);
	}

	// Byte source, byte destination: to be used for arbitrary conversions.
	// This is the overload that actually invokes the callback.
	ULONG convert(ULONG a,
				  const UCHAR* b,
				  ULONG c,
				  UCHAR* d,
				  USHORT* e,
				  ULONG* f)
	{
		fb_assert(cnvt != NULL);
		return cnvt->csconvert_fn_convert(cnvt, a, b, c, d, e, f);
	}

	// To be used for measuring the length of a conversion: the callback is
	// invoked with NULL buffers and a zero destination capacity. The error
	// code and position it reports are discarded.
	ULONG convertLength(ULONG srcLen)
	{
		fb_assert(cnvt != NULL);

		USHORT ignoredCode;
		ULONG ignoredPos;
		return cnvt->csconvert_fn_convert(cnvt, srcLen, NULL, 0, NULL, &ignoredCode, &ignoredPos);
	}

	// Name recorded in the descriptor.
	const char* getName() const
	{
		fb_assert(cnvt);
		return cnvt->csconvert_name;
	}

	// Raw descriptor pointer this handle was built from.
	csconvert* getStruct() const
	{
		return cnvt;
	}

private:
	csconvert* cnvt;
};
// Two handles are equal when they refer to the very same csconvert
// descriptor (pointer identity, not content).
static inline bool operator ==(const CsConvert& cv1, const CsConvert& cv2)
{
	const csconvert* const lhs = cv1.getStruct();
	const csconvert* const rhs = cv2.getStruct();
	return lhs == rhs;
}
// Two handles differ when they refer to different csconvert descriptors
// (pointer identity, not content).
static inline bool operator !=(const CsConvert& cv1, const CsConvert& cv2)
{
	const csconvert* const lhs = cv1.getStruct();
	const csconvert* const rhs = cv2.getStruct();
	return !(lhs == rhs);
}
// Pairs a character set id with its charset descriptor and exposes the
// descriptor's fields and callbacks through accessors. Copying is disabled
// by the private copy constructor. length() and substring() are left to
// derived classes.
class CharSet
{
public:
	// Stores the id and descriptor, then converts the SQL_MATCH_ANY_CHARS and
	// SQL_MATCH_1_CHAR constants with the from-Unicode converter and caches
	// the resulting bytes and their lengths.
	// NOTE(review): the conversion error code/position are ignored and the
	// ULONG result is narrowed to BYTE - confirm a failed conversion cannot
	// leave a bogus length here.
	CharSet(CHARSET_ID _id, charset* _cs)
		: id(_id), cs(_cs)
	{
		USHORT err_code;
		ULONG err_pos;

		sqlMatchAnyLength = getConvFromUnicode().convert(sizeof(SQL_MATCH_ANY_CHARS), &SQL_MATCH_ANY_CHARS, sizeof(sqlMatchAny), sqlMatchAny, &err_code, &err_pos);
		sqlMatchOneLength = getConvFromUnicode().convert(sizeof(SQL_MATCH_1_CHAR), &SQL_MATCH_1_CHAR, sizeof(sqlMatchOne), sqlMatchOne, &err_code, &err_pos);
	}

private:
	// Private on purpose: objects of this class are not meant to be copied.
	CharSet(const CharSet&) {}

public:
	virtual ~CharSet() {}

	// Plain accessors for the id and the descriptor fields.
	CHARSET_ID getId() const { fb_assert(cs); return id; }
	const char* getName() const { fb_assert(cs); return cs->charset_name; }
	UCHAR minBytesPerChar() const { fb_assert(cs); return cs->charset_min_bytes_per_char; }
	UCHAR maxBytesPerChar() const { fb_assert(cs); return cs->charset_max_bytes_per_char; }
	UCHAR getSpaceLength() const { fb_assert(cs); return cs->charset_space_length; }
	const UCHAR* getSpace() const { fb_assert(cs); return cs->charset_space_character; }
	USHORT getFlags() const { fb_assert(cs); return cs->charset_flags; }

	// A charset counts as multi-byte when its minimum and maximum bytes per
	// character differ.
	bool isMultiByte() const
	{
		fb_assert(cs);
		return cs->charset_min_bytes_per_char != cs->charset_max_bytes_per_char;
	}

	// Runs the descriptor's well-formedness callback on the len bytes at str.
	// When the descriptor has no such callback, every string is reported as
	// well formed. offendingPos is optional: when it is NULL a local dummy
	// is handed to the callback instead.
	bool wellFormed(ULONG len, const UCHAR* str, ULONG* offendingPos = NULL) const
	{
		ULONG offendingPos2;

		if (offendingPos == NULL)
			offendingPos = &offendingPos2;

		fb_assert(cs);

		if (cs->charset_fn_well_formed)
			return cs->charset_fn_well_formed(cs, len, str, offendingPos);
		else
			return true;
	}

	// Converter handles built from the descriptor's embedded csconvert
	// structures (implicit csconvert* -> CsConvert conversion).
	CsConvert getConvToUnicode() const { fb_assert(cs); return &cs->charset_to_unicode; }
	CsConvert getConvFromUnicode() const { fb_assert(cs); return &cs->charset_from_unicode; }

	// Invokes the descriptor's destroy callback, if it has one.
	void destroy()
	{
		fb_assert(cs);

		if (cs->charset_fn_destroy)
			cs->charset_fn_destroy(cs);
	}

	// Bytes and lengths cached by the constructor.
	const UCHAR* getSqlMatchAny() const { return sqlMatchAny; }
	const UCHAR* getSqlMatchOne() const { return sqlMatchOne; }
	const BYTE getSqlMatchAnyLength() const { return sqlMatchAnyLength; }
	const BYTE getSqlMatchOneLength() const { return sqlMatchOneLength; }

	charset* getStruct() const { return cs; }

	// Returns the length of src (srcLen bytes) once every trailing
	// occurrence of the charset's space sequence has been dropped. Only
	// whole space sequences are removed, stepping back from the end in
	// units of the space length.
	//
	// The scan is index based so that no pointer before the start of src is
	// ever formed (the previous pointer-walking version did that when
	// srcLen was shorter than the space or the whole string was spaces).
	// A zero space length returns srcLen unchanged instead of looping
	// forever.
	ULONG removeTrailingSpaces(ULONG srcLen, const UCHAR* src) const
	{
		const ULONG spaceLen = getSpaceLength();

		if (spaceLen == 0)
			return srcLen;

		const UCHAR* const space = getSpace();
		ULONG len = srcLen;

		while (len >= spaceLen && memcmp(src + len - spaceLen, space, spaceLen) == 0)
			len -= spaceLen;

		return len;
	}

	// Implemented by derived classes.
	virtual ULONG length(thread_db* tdbb, ULONG srcLen, const UCHAR* src, bool countTrailingSpaces) const = 0;
	virtual ULONG substring(thread_db* tdbb, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst, ULONG startPos, ULONG length) const = 0;

private:
	CHARSET_ID id;
	UCHAR sqlMatchAny[sizeof(ULONG)];	// SQL_MATCH_ANY_CHARS in this charset
	UCHAR sqlMatchOne[sizeof(ULONG)];	// SQL_MATCH_1_CHAR in this charset
	BYTE sqlMatchAnyLength;
	BYTE sqlMatchOneLength;
	charset* cs;
};
// Wrapper around a texttype descriptor together with the CharSet it works
// on. Every operation first looks for the matching callback in the
// descriptor; when the callback is absent a default implementation is used,
// which for multi-byte charsets goes through UTF-16 and UnicodeUtil and for
// other charsets works on the raw bytes. Copying is disabled by the private
// copy constructor. The pattern-matching entry points are left to derived
// classes.
class TextType
{
public:
	// Stores the descriptor and charset, then pre-computes the canonical
	// form of the SQL match characters, the GDML_* characters and the
	// letters 'S' and 's'; the results are exposed by the get*Canonic()
	// accessors.
	TextType(TTYPE_ID _type, texttype *_tt, CharSet* _cs)
		: type(_type), tt(_tt), cs(_cs)
	{
		canonical(cs->getSqlMatchAnyLength(), cs->getSqlMatchAny(), sizeof(sqlMatchAnyCanonic), sqlMatchAnyCanonic);
		canonical(cs->getSqlMatchOneLength(), cs->getSqlMatchOne(), sizeof(sqlMatchOneCanonic), sqlMatchOneCanonic);

		struct Conversion
		{
			USHORT ch;		// character to convert
			UCHAR* ptr;		// member buffer receiving its canonical form
		};

		Conversion conversions[] =
		{
			{GDML_MATCH_ONE, gdmlMatchOneCanonic},
			{GDML_MATCH_ANY, gdmlMatchAnyCanonic},
			{GDML_QUOTE, gdmlQuoteCanonic},
			{GDML_NOT, gdmlNotCanonic},
			{GDML_RANGE, gdmlRangeCanonic},
			{GDML_CLASS_START, gdmlClassStartCanonic},
			{GDML_CLASS_END, gdmlClassEndCanonic},
			{GDML_SUBSTITUTE, gdmlSubstituteCanonic},
			{GDML_FLAG_SET, gdmlFlagSetCanonic},
			{GDML_FLAG_CLEAR, gdmlFlagClearCanonic},
			{GDML_COMMA, gdmlCommaCanonic},
			{GDML_LPAREN, gdmlLParenCanonic},
			{GDML_RPAREN, gdmlRParenCanonic},
			{'S', gdmlUpperSCanonic},
			{'s', gdmlLowerSCanonic}
		};

		// size_t index: avoids comparing a signed int with the element count
		for (size_t i = 0; i < FB_NELEM(conversions); i++)
		{
			USHORT err_code;
			ULONG err_position;
			UCHAR temp[sizeof(ULONG)];

			// character -> charset bytes -> canonical form
			ULONG length = getCharSet()->getConvFromUnicode().convert(sizeof(USHORT), &conversions[i].ch, sizeof(temp), temp, &err_code, &err_position);
			canonical(length, temp, sizeof(ULONG), conversions[i].ptr);
		}
	}

private:
	// Private on purpose: objects of this class are not meant to be copied.
	TextType(const TextType&) {}

public:
	virtual ~TextType() {}

	// Returns the descriptor callback's answer for len; without a callback
	// the default is UnicodeUtil::utf16KeyLength(len) for multi-byte
	// charsets and len itself otherwise.
	USHORT key_length(USHORT len)
	{
		fb_assert(tt);

		if (tt->texttype_fn_key_length)
			return (*tt->texttype_fn_key_length)(tt, len);
		else
		{
			if (getCharSet()->isMultiByte())
				return UnicodeUtil::utf16KeyLength(len);
			else
				return len;
		}
	}

	// Builds a key for src (srcLen bytes) in dst (capacity dstLen) and
	// returns the callback's / default implementation's result.
	//
	// Default implementation (no descriptor callback):
	//  - multi-byte charsets: src and the charset space are converted to
	//    UTF-16, trailing pad is optionally stripped, and the key comes from
	//    UnicodeUtil::utf16ToKey();
	//  - other charsets: trailing pad is optionally stripped and the bytes
	//    are copied verbatim; INTL_BAD_KEY_LENGTH is returned when dst is
	//    too small.
	// Pad stripping happens only when texttype_pad_option is set.
	USHORT string_to_key(USHORT srcLen,
						 const UCHAR* src,
						 USHORT dstLen,
						 UCHAR* dst,
						 USHORT key_type)
	{
		fb_assert(tt);

		if (tt->texttype_fn_string_to_key)
			return (*tt->texttype_fn_string_to_key)(tt, srcLen, src, dstLen, dst, key_type);
		else
		{
			const UCHAR* space = getCharSet()->getSpace();
			BYTE spaceLength = getCharSet()->getSpaceLength();
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str;
			UCHAR utf16Space[sizeof(ULONG)];

			if (getCharSet()->isMultiByte())
			{
				USHORT err_code;
				ULONG err_position;

				// convert src to UTF-16
				// NOTE(review): the ULONG result is narrowed to the USHORT
				// srcLen - confirm the UTF-16 length always fits.
				ULONG utf16Length = getCharSet()->getConvToUnicode().convertLength(srcLen);
				srcLen = getCharSet()->getConvToUnicode().convert(srcLen, src,
					utf16Length, utf16Str.getBuffer(utf16Length), &err_code, &err_position);
				src = utf16Str.begin();

				// convert charset space to UTF-16
				spaceLength = getCharSet()->getConvToUnicode().convert(spaceLength, space,
					sizeof(utf16Space), utf16Space, &err_code, &err_position);
				fb_assert(spaceLength == 2);	// space character can't be surrogate for default string_to_key
				space = utf16Space;
			}

			if (tt->texttype_pad_option)
			{
				// the trimmed length never exceeds srcLen, so it fits in USHORT
				srcLen = static_cast<USHORT>(trimTrailingPad(srcLen, src, space, spaceLength));
			}

			if (getCharSet()->isMultiByte())
			{
				dstLen = UnicodeUtil::utf16ToKey(srcLen, reinterpret_cast<const USHORT*>(src),
					dstLen, dst, key_type);
			}
			else
			{
				if (dstLen >= srcLen)
				{
					memcpy(dst, src, srcLen);
					dstLen = srcLen;
				}
				else
					dstLen = INTL_BAD_KEY_LENGTH;
			}

			return dstLen;
		}
	}

	// Compares str1 (len1 bytes) with str2 (len2 bytes) and returns a
	// negative, zero or positive value.
	//
	// Default implementation (no descriptor callback):
	//  - multi-byte charsets: both strings and the charset space are
	//    converted to UTF-16, trailing pad is optionally stripped, and the
	//    result comes from UnicodeUtil::utf16Compare();
	//  - other charsets: trailing pad is optionally stripped, the common
	//    prefix is compared bytewise and, on a tie, the shorter string sorts
	//    first. This path returns exactly -1, 0 or 1.
	// Pad stripping happens only when texttype_pad_option is set. The error
	// flags filled by the callback / utf16Compare() are not reported.
	SSHORT compare(ULONG len1,
				   const UCHAR* str1,
				   ULONG len2,
				   const UCHAR* str2)
	{
		INTL_BOOL error = false;

		fb_assert(tt);

		if (tt->texttype_fn_compare)
			return (*tt->texttype_fn_compare)(tt, len1, str1, len2, str2, &error);
		else
		{
			const UCHAR* space = getCharSet()->getSpace();
			BYTE spaceLength = getCharSet()->getSpaceLength();
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str1;
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16Str2;
			UCHAR utf16Space[sizeof(ULONG)];

			if (getCharSet()->isMultiByte())
			{
				USHORT err_code;
				ULONG err_position;

				// convert str1 to UTF-16
				ULONG utf16Length = getCharSet()->getConvToUnicode().convertLength(len1);
				len1 = getCharSet()->getConvToUnicode().convert(len1, str1,
					utf16Length, utf16Str1.getBuffer(utf16Length), &err_code, &err_position);
				str1 = utf16Str1.begin();

				// convert str2 to UTF-16
				utf16Length = getCharSet()->getConvToUnicode().convertLength(len2);
				len2 = getCharSet()->getConvToUnicode().convert(len2, str2,
					utf16Length, utf16Str2.getBuffer(utf16Length), &err_code, &err_position);
				str2 = utf16Str2.begin();

				// convert charset space to UTF-16
				spaceLength = getCharSet()->getConvToUnicode().convert(spaceLength, space,
					sizeof(utf16Space), utf16Space, &err_code, &err_position);
				fb_assert(spaceLength == 2);	// space character can't be surrogate for default compare
				space = utf16Space;
			}

			if (tt->texttype_pad_option)
			{
				len1 = trimTrailingPad(len1, str1, space, spaceLength);
				len2 = trimTrailingPad(len2, str2, space, spaceLength);
			}

			if (getCharSet()->isMultiByte())
			{
				INTL_BOOL error_flag = false;
				return UnicodeUtil::utf16Compare(len1, reinterpret_cast<const USHORT*>(str1),
					len2, reinterpret_cast<const USHORT*>(str2), &error_flag);
			}
			else
			{
				// memcmp() only guarantees the sign of its int result.
				// Narrowing that int straight to SSHORT could turn a
				// non-zero value into zero or flip its sign, so reduce it
				// to -1 / 1 before returning.
				const int diff = memcmp(str1, str2, MIN(len1, len2));

				if (diff != 0)
					return diff < 0 ? -1 : 1;

				// equal common prefix: the shorter string sorts first
				return len1 < len2 ? -1 : (len1 > len2 ? 1 : 0);
			}
		}
	}

	// Upper-cases src (srcLen bytes) into dst (capacity dstLen) and returns
	// the value produced by the callback or, in the default path, by the
	// final from-Unicode conversion.
	//
	// Default implementation: charset -> UTF-16 -> utf16UpperCase() ->
	// charset. Conversion error codes are not reported.
	ULONG str_to_upper(ULONG srcLen,
					   const UCHAR* src,
					   ULONG dstLen,
					   UCHAR* dst)
	{
		fb_assert(tt);

		if (tt->texttype_fn_str_to_upper)
			return (*tt->texttype_fn_str_to_upper)(tt, srcLen, src, dstLen, dst);
		else
		{
			USHORT err_code;
			ULONG err_position;
			ULONG utf16_length = getCharSet()->getConvToUnicode().convertLength(srcLen);
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16_str;
			UCHAR* utf16_ptr;

			if (dstLen >= utf16_length)	// if dst buffer is sufficient large, use it as intermediate
				utf16_ptr = dst;
			else
				utf16_ptr = utf16_str.getBuffer(utf16_length);

			// convert to UTF-16
			srcLen = getCharSet()->getConvToUnicode().convert(srcLen, src,
				utf16_length, utf16_ptr, &err_code, &err_position);

			// convert to uppercase
			// NOTE(review): utf16_ptr may be the caller's dst reinterpreted
			// as USHORT* - assumes it is suitably aligned; confirm.
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> upper_str;
			srcLen = UnicodeUtil::utf16UpperCase(srcLen, reinterpret_cast<USHORT*>(utf16_ptr),
				utf16_length, reinterpret_cast<USHORT*>(upper_str.getBuffer(utf16_length)));

			// convert to original character set
			return getCharSet()->getConvFromUnicode().convert(srcLen, upper_str.begin(),
				dstLen, dst, &err_code, &err_position);
		}
	}

	// Lower-cases src (srcLen bytes) into dst (capacity dstLen) and returns
	// the value produced by the callback or, in the default path, by the
	// final from-Unicode conversion.
	//
	// Default implementation: charset -> UTF-16 -> utf16LowerCase() ->
	// charset. Conversion error codes are not reported.
	ULONG str_to_lower(ULONG srcLen,
					   const UCHAR* src,
					   ULONG dstLen,
					   UCHAR* dst)
	{
		fb_assert(tt);

		if (tt->texttype_fn_str_to_lower)
			return (*tt->texttype_fn_str_to_lower)(tt, srcLen, src, dstLen, dst);
		else
		{
			USHORT err_code;
			ULONG err_position;
			ULONG utf16_length = getCharSet()->getConvToUnicode().convertLength(srcLen);
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16_str;
			UCHAR* utf16_ptr;

			if (dstLen >= utf16_length)	// if dst buffer is sufficient large, use it as intermediate
				utf16_ptr = dst;
			else
				utf16_ptr = utf16_str.getBuffer(utf16_length);

			// convert to UTF-16
			srcLen = getCharSet()->getConvToUnicode().convert(srcLen, src,
				utf16_length, utf16_ptr, &err_code, &err_position);

			// convert to lowercase
			// NOTE(review): utf16_ptr may be the caller's dst reinterpreted
			// as USHORT* - assumes it is suitably aligned; confirm.
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> lower_str;
			srcLen = UnicodeUtil::utf16LowerCase(srcLen, reinterpret_cast<USHORT*>(utf16_ptr),
				utf16_length, reinterpret_cast<USHORT*>(lower_str.getBuffer(utf16_length)));

			// convert to original character set
			return getCharSet()->getConvFromUnicode().convert(srcLen, lower_str.begin(),
				dstLen, dst, &err_code, &err_position);
		}
	}

	// Writes the canonical form of src (srcLen bytes) to dst (capacity
	// dstLen).
	//
	// Default implementation (no descriptor callback):
	//  - multi-byte charsets: charset -> UTF-16 -> UTF-32; the
	//    utf16ToUtf32() result divided by sizeof(ULONG) is returned;
	//  - other charsets: the bytes are copied as they are and
	//    srcLen / minBytesPerChar() is returned.
	// NOTE(review): in the byte-copy path dstLen >= srcLen is only checked
	// by fb_assert; if asserts are compiled out a short dst is overrun -
	// confirm callers always pass a large enough buffer.
	ULONG canonical(ULONG srcLen,
					const UCHAR* src,
					ULONG dstLen,
					UCHAR* dst)
	{
		fb_assert(tt);

		if (tt->texttype_fn_canonical)
			return (*tt->texttype_fn_canonical)(tt, srcLen, src, dstLen, dst);
		else if (getCharSet()->isMultiByte())
		{
			fb_assert(tt->texttype_canonical_width == sizeof(ULONG));

			USHORT err_code;
			ULONG err_position;
			Firebird::HalfStaticArray<UCHAR, BUFFER_SMALL> utf16_str;
			ULONG utf16_len = getCharSet()->getConvToUnicode().convertLength(srcLen);

			// convert to UTF-16
			utf16_len = getCharSet()->getConvToUnicode().convert(srcLen, src,
				utf16_len, utf16_str.getBuffer(utf16_len), &err_code, &err_position);

			// convert UTF-16 to UTF-32
			return UnicodeUtil::utf16ToUtf32(utf16_len, reinterpret_cast<const USHORT*>(utf16_str.begin()),
				dstLen, reinterpret_cast<ULONG*>(dst), &err_code, &err_position) / sizeof(ULONG);
		}
		else
		{
			fb_assert(tt->texttype_canonical_width == getCharSet()->minBytesPerChar());
			fb_assert(dstLen >= srcLen);

			memcpy(dst, src, srcLen);

			return srcLen / getCharSet()->minBytesPerChar();
		}
	}

	// Canonical forms computed by the constructor.
	const UCHAR* getSqlMatchAnyCanonic() const { return sqlMatchAnyCanonic; }
	const UCHAR* getSqlMatchOneCanonic() const { return sqlMatchOneCanonic; }
	const UCHAR* getGdmlMatchOneCanonic() const { return gdmlMatchOneCanonic; }
	const UCHAR* getGdmlMatchAnyCanonic() const { return gdmlMatchAnyCanonic; }
	const UCHAR* getGdmlQuoteCanonic() const { return gdmlQuoteCanonic; }
	const UCHAR* getGdmlNotCanonic() const { return gdmlNotCanonic; }
	const UCHAR* getGdmlRangeCanonic() const { return gdmlRangeCanonic; }
	const UCHAR* getGdmlClassStartCanonic() const { return gdmlClassStartCanonic; }
	const UCHAR* getGdmlClassEndCanonic() const { return gdmlClassEndCanonic; }
	const UCHAR* getGdmlSubstituteCanonic() const { return gdmlSubstituteCanonic; }
	const UCHAR* getGdmlFlagSetCanonic() const { return gdmlFlagSetCanonic; }
	const UCHAR* getGdmlFlagClearCanonic() const { return gdmlFlagClearCanonic; }
	const UCHAR* getGdmlCommaCanonic() const { return gdmlCommaCanonic; }
	const UCHAR* getGdmlLParenCanonic() const { return gdmlLParenCanonic; }
	const UCHAR* getGdmlRParenCanonic() const { return gdmlRParenCanonic; }
	const UCHAR* getGdmlUpperSCanonic() const { return gdmlUpperSCanonic; }
	const UCHAR* getGdmlLowerSCanonic() const { return gdmlLowerSCanonic; }

	// Text type id given to the constructor.
	USHORT getType() const
	{
		return type;
	}

	// Charset given to the constructor.
	CharSet* getCharSet() const
	{
		return cs;
	}

	// texttype_canonical_width field of the descriptor.
	BYTE getCanonicalWidth() const
	{
		fb_assert(tt);
		return tt->texttype_canonical_width;
	}

	// texttype_flags field of the descriptor.
	USHORT getFlags() const
	{
		fb_assert(tt);
		return tt->texttype_flags;
	}

	// Invokes the descriptor's destroy callback, if it has one.
	void destroy()
	{
		fb_assert(tt);

		if (tt->texttype_fn_destroy)
			tt->texttype_fn_destroy(tt);
	}

	// Pattern-matching entry points, implemented by derived classes.
	virtual bool matches(thread_db* tdbb, const UCHAR* a, SLONG b, const UCHAR* c, SLONG d) = 0;
	virtual bool sleuth_check(thread_db* tdbb, USHORT a, const UCHAR* b, SLONG c, const UCHAR* d, SLONG e) = 0;
	virtual ULONG sleuth_merge(thread_db* tdbb, const UCHAR* a, SLONG b, const UCHAR* c, SLONG d, UCHAR* e, SLONG f) = 0;
	virtual bool like(thread_db* tdbb, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length) = 0;
	virtual LikeObject *like_create(thread_db* tdbb, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length) = 0;
	virtual bool contains(thread_db* tdbb, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl) = 0;
	virtual ContainsObject *contains_create(thread_db* tdbb, const UCHAR* p, SLONG pl) = 0;

private:
	// Returns len reduced by every trailing occurrence of the padLen-byte
	// sequence pad at the end of str; only whole sequences are removed,
	// stepping back from the end in units of padLen.
	//
	// Index based on purpose: the earlier pointer-walking loops formed a
	// pointer before the start of str when len was shorter than the pad or
	// the whole string was pad. A zero padLen returns len unchanged instead
	// of looping forever.
	static ULONG trimTrailingPad(ULONG len, const UCHAR* str, const UCHAR* pad, ULONG padLen)
	{
		if (padLen == 0)
			return len;

		while (len >= padLen && memcmp(str + len - padLen, pad, padLen) == 0)
			len -= padLen;

		return len;
	}

	TTYPE_ID type;
	texttype* tt;
	CharSet* cs;

	// Canonical forms filled in by the constructor.
	UCHAR sqlMatchAnyCanonic[sizeof(ULONG)];
	UCHAR sqlMatchOneCanonic[sizeof(ULONG)];
	UCHAR gdmlMatchOneCanonic[sizeof(ULONG)];
	UCHAR gdmlMatchAnyCanonic[sizeof(ULONG)];
	UCHAR gdmlQuoteCanonic[sizeof(ULONG)];
	UCHAR gdmlNotCanonic[sizeof(ULONG)];
	UCHAR gdmlRangeCanonic[sizeof(ULONG)];
	UCHAR gdmlClassStartCanonic[sizeof(ULONG)];
	UCHAR gdmlClassEndCanonic[sizeof(ULONG)];
	UCHAR gdmlSubstituteCanonic[sizeof(ULONG)];
	UCHAR gdmlFlagSetCanonic[sizeof(ULONG)];
	UCHAR gdmlFlagClearCanonic[sizeof(ULONG)];
	UCHAR gdmlCommaCanonic[sizeof(ULONG)];
	UCHAR gdmlLParenCanonic[sizeof(ULONG)];
	UCHAR gdmlRParenCanonic[sizeof(ULONG)];
	UCHAR gdmlUpperSCanonic[sizeof(ULONG)];
	UCHAR gdmlLowerSCanonic[sizeof(ULONG)];
};
} //namespace Jrd
#endif /* JRD_INTL_CLASSES_H */