/* * PROGRAM: JRD International support * MODULE: intl_classes.h * DESCRIPTION: International text handling definitions * * The contents of this file are subject to the Initial * Developer's Public License Version 1.0 (the "License"); * you may not use this file except in compliance with the * License. You may obtain a copy of the License at * http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl. * * Software distributed under the License is distributed AS IS, * WITHOUT WARRANTY OF ANY KIND, either express or implied. * See the License for the specific language governing rights * and limitations under the License. * * The Original Code was created by Nickolay Samofatov * for the Firebird Open Source RDBMS project. * * Copyright (c) 2004 Nickolay Samofatov * and all contributors signed below. * * All Rights Reserved. * Contributor(s): ______________________________________. * */ #ifndef JRD_INTL_CLASSES_H #define JRD_INTL_CLASSES_H #include "firebird.h" #include "../jrd/jrd.h" #include "../jrd/intlobj_new.h" #include "../jrd/constants.h" #include "../jrd/unicode_util.h" typedef SSHORT CHARSET_ID; typedef SSHORT COLLATE_ID; typedef USHORT TTYPE_ID; namespace Jrd { class LikeObject { public: virtual void reset() = 0; virtual bool process(thread_db*, TextType*, const UCHAR*, SLONG) = 0; virtual bool result() = 0; virtual ~LikeObject() {} }; class ContainsObject { public: virtual void reset() = 0; virtual bool process(Jrd::thread_db*, Jrd::TextType*, const UCHAR*, SLONG) = 0; virtual bool result() = 0; virtual ~ContainsObject() {} }; class CsConvert { public: CsConvert(csconvert* _cnvt) : cnvt(_cnvt) {} CsConvert(const CsConvert& obj) : cnvt(obj.cnvt) {} // CVC: Beware of this can of worms: csconvert_convert gets assigned // different functions that not necessarily take the same argument. Typically, // the src pointer and the dest pointer use different types. // How does this work without crashing is a miracle of IT. // To be used with getConvFromUnicode method of CharSet class ULONG convert(ULONG a, const USHORT* b, ULONG c, UCHAR* d, USHORT* e, ULONG* f) { fb_assert(cnvt != NULL); return (*cnvt->csconvert_fn_convert)(cnvt, a, reinterpret_cast(b), c, d, e, f); } // To be used with getConvToUnicode method of CharSet class ULONG convert(ULONG a, const UCHAR* b, ULONG c, USHORT* d, USHORT* e, ULONG* f) { fb_assert(cnvt != NULL); return (*cnvt->csconvert_fn_convert)(cnvt, a, b, c, reinterpret_cast(d), e, f); } // To be used for arbitrary conversions ULONG convert(ULONG a, const UCHAR* b, ULONG c, UCHAR* d, USHORT* e, ULONG* f) { fb_assert(cnvt != NULL); return (*cnvt->csconvert_fn_convert)(cnvt, a, b, c, d, e, f); } // To be used for measure length of conversion ULONG convertLength(ULONG srcLen) { fb_assert(cnvt != NULL); USHORT errCode; ULONG errPos; return (*cnvt->csconvert_fn_convert)(cnvt, srcLen, NULL, 0, NULL, &errCode, &errPos); } const char* getName() const { fb_assert(cnvt); return cnvt->csconvert_name; } csconvert* getStruct() const { return cnvt; } private: csconvert* cnvt; }; static inline bool operator ==(const CsConvert& cv1, const CsConvert& cv2) { return cv1.getStruct() == cv2.getStruct(); } static inline bool operator !=(const CsConvert& cv1, const CsConvert& cv2) { return cv1.getStruct() != cv2.getStruct(); } class CharSet { public: CharSet(CHARSET_ID _id, charset* _cs) : id(_id), cs(_cs) { USHORT err_code; ULONG err_pos; sqlMatchAnyLength = getConvFromUnicode().convert(sizeof(SQL_MATCH_ANY_CHARS), &SQL_MATCH_ANY_CHARS, sizeof(sqlMatchAny), sqlMatchAny, &err_code, &err_pos); sqlMatchOneLength = getConvFromUnicode().convert(sizeof(SQL_MATCH_1_CHAR), &SQL_MATCH_1_CHAR, sizeof(sqlMatchOne), sqlMatchOne, &err_code, &err_pos); } private: CharSet(const CharSet&) {} public: virtual ~CharSet() {} CHARSET_ID getId() const { fb_assert(cs); return id; } const char* getName() const { fb_assert(cs); return cs->charset_name; } UCHAR minBytesPerChar() const { fb_assert(cs); return cs->charset_min_bytes_per_char; } UCHAR maxBytesPerChar() const { fb_assert(cs); return cs->charset_max_bytes_per_char; } UCHAR getSpaceLength() const { fb_assert(cs); return cs->charset_space_length; } const UCHAR* getSpace() const { fb_assert(cs); return cs->charset_space_character; } USHORT getFlags() const { fb_assert(cs); return cs->charset_flags; } bool isMultiByte() const { fb_assert(cs); return cs->charset_min_bytes_per_char != cs->charset_max_bytes_per_char; } bool wellFormed(ULONG len, const UCHAR* str, ULONG* offendingPos = NULL) const { ULONG offendingPos2; if (offendingPos == NULL) offendingPos = &offendingPos2; fb_assert(cs); if (cs->charset_fn_well_formed) return cs->charset_fn_well_formed(cs, len, str, offendingPos); else return true; } CsConvert getConvToUnicode() const { fb_assert(cs); return &cs->charset_to_unicode; } CsConvert getConvFromUnicode() const { fb_assert(cs); return &cs->charset_from_unicode; } void destroy() { fb_assert(cs); if (cs->charset_fn_destroy) cs->charset_fn_destroy(cs); } const UCHAR* getSqlMatchAny() const { return sqlMatchAny; } const UCHAR* getSqlMatchOne() const { return sqlMatchOne; } const BYTE getSqlMatchAnyLength() const { return sqlMatchAnyLength; } const BYTE getSqlMatchOneLength() const { return sqlMatchOneLength; } charset* getStruct() const { return cs; } ULONG removeTrailingSpaces(ULONG srcLen, const UCHAR* src) const { const UCHAR* p = src + srcLen - getSpaceLength(); while (p >= src && memcmp(p, getSpace(), getSpaceLength()) == 0) p -= getSpaceLength(); p += getSpaceLength(); return p - src; } virtual ULONG length(thread_db* tdbb, ULONG srcLen, const UCHAR* src, bool countTrailingSpaces) const = 0; virtual ULONG substring(thread_db* tdbb, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst, ULONG startPos, ULONG length) const = 0; private: CHARSET_ID id; UCHAR sqlMatchAny[sizeof(ULONG)]; UCHAR sqlMatchOne[sizeof(ULONG)]; BYTE sqlMatchAnyLength; BYTE sqlMatchOneLength; charset* cs; }; class TextType { public: TextType(TTYPE_ID _type, texttype *_tt, CharSet* _cs) : type(_type), tt(_tt), cs(_cs) { canonical(cs->getSqlMatchAnyLength(), cs->getSqlMatchAny(), sizeof(sqlMatchAnyCanonic), sqlMatchAnyCanonic); canonical(cs->getSqlMatchOneLength(), cs->getSqlMatchOne(), sizeof(sqlMatchOneCanonic), sqlMatchOneCanonic); struct Conversion { USHORT ch; UCHAR* ptr; }; Conversion conversions[] = { {GDML_MATCH_ONE, gdmlMatchOneCanonic}, {GDML_MATCH_ANY, gdmlMatchAnyCanonic}, {GDML_QUOTE, gdmlQuoteCanonic}, {GDML_NOT, gdmlNotCanonic}, {GDML_RANGE, gdmlRangeCanonic}, {GDML_CLASS_START, gdmlClassStartCanonic}, {GDML_CLASS_END, gdmlClassEndCanonic}, {GDML_SUBSTITUTE, gdmlSubstituteCanonic}, {GDML_FLAG_SET, gdmlFlagSetCanonic}, {GDML_FLAG_CLEAR, gdmlFlagClearCanonic}, {GDML_COMMA, gdmlCommaCanonic}, {GDML_LPAREN, gdmlLParenCanonic}, {GDML_RPAREN, gdmlRParenCanonic}, {'S', gdmlUpperSCanonic}, {'s', gdmlLowerSCanonic} }; for (int i = 0; i < FB_NELEM(conversions); i++) { USHORT err_code; ULONG err_position; UCHAR temp[sizeof(ULONG)]; ULONG length = getCharSet()->getConvFromUnicode().convert(sizeof(USHORT), &conversions[i].ch, sizeof(temp), temp, &err_code, &err_position); canonical(length, temp, sizeof(ULONG), conversions[i].ptr); } } private: TextType(const TextType&) {} public: virtual ~TextType() {} USHORT key_length(USHORT len) { fb_assert(tt); if (tt->texttype_fn_key_length) return (*tt->texttype_fn_key_length)(tt, len); else { if (getCharSet()->isMultiByte()) return UnicodeUtil::utf16KeyLength(len); else return len; } } USHORT string_to_key(USHORT srcLen, const UCHAR* src, USHORT dstLen, UCHAR* dst, USHORT key_type) { fb_assert(tt); if (tt->texttype_fn_string_to_key) return (*tt->texttype_fn_string_to_key)(tt, srcLen, src, dstLen, dst, key_type); else { const UCHAR* space = getCharSet()->getSpace(); BYTE spaceLength = getCharSet()->getSpaceLength(); Firebird::HalfStaticArray utf16Str; UCHAR utf16Space[sizeof(ULONG)]; if (getCharSet()->isMultiByte()) { USHORT err_code; ULONG err_position; // convert src to UTF-16 ULONG utf16Length = getCharSet()->getConvToUnicode().convertLength(srcLen); srcLen = getCharSet()->getConvToUnicode().convert(srcLen, src, utf16Length, utf16Str.getBuffer(utf16Length), &err_code, &err_position); src = utf16Str.begin(); // convert charset space to UTF-16 spaceLength = getCharSet()->getConvToUnicode().convert(spaceLength, space, sizeof(utf16Space), utf16Space, &err_code, &err_position); fb_assert(spaceLength == 2); // space character can't be surrogate for default string_to_key space = utf16Space; } if (tt->texttype_pad_option) { const UCHAR* pad; for (pad = src + srcLen - spaceLength; pad >= src; pad -= spaceLength) { if (memcmp(pad, space, spaceLength) != 0) break; } srcLen = pad - src + spaceLength; } if (getCharSet()->isMultiByte()) { dstLen = UnicodeUtil::utf16ToKey(srcLen, reinterpret_cast(src), dstLen, dst, key_type); } else { if (dstLen >= srcLen) { memcpy(dst, src, srcLen); dstLen = srcLen; } else dstLen = INTL_BAD_KEY_LENGTH; } return dstLen; } } SSHORT compare(ULONG len1, const UCHAR* str1, ULONG len2, const UCHAR* str2) { INTL_BOOL error = false; fb_assert(tt); if (tt->texttype_fn_compare) return (*tt->texttype_fn_compare)(tt, len1, str1, len2, str2, &error); else { const UCHAR* space = getCharSet()->getSpace(); BYTE spaceLength = getCharSet()->getSpaceLength(); Firebird::HalfStaticArray utf16Str1; Firebird::HalfStaticArray utf16Str2; UCHAR utf16Space[sizeof(ULONG)]; if (getCharSet()->isMultiByte()) { USHORT err_code; ULONG err_position; // convert str1 to UTF-16 ULONG utf16Length = getCharSet()->getConvToUnicode().convertLength(len1); len1 = getCharSet()->getConvToUnicode().convert(len1, str1, utf16Length, utf16Str1.getBuffer(utf16Length), &err_code, &err_position); str1 = utf16Str1.begin(); // convert str2 to UTF-16 utf16Length = getCharSet()->getConvToUnicode().convertLength(len2); len2 = getCharSet()->getConvToUnicode().convert(len2, str2, utf16Length, utf16Str2.getBuffer(utf16Length), &err_code, &err_position); str2 = utf16Str2.begin(); // convert charset space to UTF-16 spaceLength = getCharSet()->getConvToUnicode().convert(spaceLength, space, sizeof(utf16Space), utf16Space, &err_code, &err_position); fb_assert(spaceLength == 2); // space character can't be surrogate for default compare space = utf16Space; } if (tt->texttype_pad_option) { const UCHAR* pad; for (pad = str1 + len1 - spaceLength; pad >= str1; pad -= spaceLength) { if (memcmp(pad, space, spaceLength) != 0) break; } len1 = pad - str1 + spaceLength; for (pad = str2 + len2 - spaceLength; pad >= str2; pad -= spaceLength) { if (memcmp(pad, space, spaceLength) != 0) break; } len2 = pad - str2 + spaceLength; } if (getCharSet()->isMultiByte()) { INTL_BOOL error_flag; return UnicodeUtil::utf16Compare(len1, reinterpret_cast(str1), len2, reinterpret_cast(str2), &error_flag); } else { SSHORT cmp = memcmp(str1, str2, MIN(len1, len2)); if (cmp == 0) cmp = (len1 < len2 ? -1 : (len1 > len2 ? 1 : 0)); return cmp; } } } ULONG str_to_upper(ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst) { fb_assert(tt); if (tt->texttype_fn_str_to_upper) return (*tt->texttype_fn_str_to_upper)(tt, srcLen, src, dstLen, dst); else { USHORT err_code; ULONG err_position; ULONG utf16_length = getCharSet()->getConvToUnicode().convertLength(srcLen); Firebird::HalfStaticArray utf16_str; UCHAR* utf16_ptr; if (dstLen >= utf16_length) // if dst buffer is sufficient large, use it as intermediate utf16_ptr = dst; else utf16_ptr = utf16_str.getBuffer(utf16_length); // convert to UTF-16 srcLen = getCharSet()->getConvToUnicode().convert(srcLen, src, utf16_length, utf16_ptr, &err_code, &err_position); // convert to uppercase Firebird::HalfStaticArray upper_str; srcLen = UnicodeUtil::utf16UpperCase(srcLen, reinterpret_cast(utf16_ptr), utf16_length, reinterpret_cast(upper_str.getBuffer(utf16_length))); // convert to original character set return getCharSet()->getConvFromUnicode().convert(srcLen, upper_str.begin(), dstLen, dst, &err_code, &err_position); } } ULONG str_to_lower(ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst) { fb_assert(tt); if (tt->texttype_fn_str_to_lower) return (*tt->texttype_fn_str_to_lower)(tt, srcLen, src, dstLen, dst); else { USHORT err_code; ULONG err_position; ULONG utf16_length = getCharSet()->getConvToUnicode().convertLength(srcLen); Firebird::HalfStaticArray utf16_str; UCHAR* utf16_ptr; if (dstLen >= utf16_length) // if dst buffer is sufficient large, use it as intermediate utf16_ptr = dst; else utf16_ptr = utf16_str.getBuffer(utf16_length); // convert to UTF-16 srcLen = getCharSet()->getConvToUnicode().convert(srcLen, src, utf16_length, utf16_ptr, &err_code, &err_position); // convert to lowercase Firebird::HalfStaticArray lower_str; srcLen = UnicodeUtil::utf16LowerCase(srcLen, reinterpret_cast(utf16_ptr), utf16_length, reinterpret_cast(lower_str.getBuffer(utf16_length))); // convert to original character set return getCharSet()->getConvFromUnicode().convert(srcLen, lower_str.begin(), dstLen, dst, &err_code, &err_position); } } ULONG canonical(ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst) { fb_assert(tt); if (tt->texttype_fn_canonical) return (*tt->texttype_fn_canonical)(tt, srcLen, src, dstLen, dst); else if (getCharSet()->isMultiByte()) { fb_assert(tt->texttype_canonical_width == sizeof(ULONG)); USHORT err_code; ULONG err_position; Firebird::HalfStaticArray utf16_str; ULONG utf16_len = getCharSet()->getConvToUnicode().convertLength(srcLen); // convert to UTF-16 utf16_len = getCharSet()->getConvToUnicode().convert(srcLen, src, utf16_len, utf16_str.getBuffer(utf16_len), &err_code, &err_position); // convert UTF-16 to UTF-32 return UnicodeUtil::utf16ToUtf32(utf16_len, reinterpret_cast(utf16_str.begin()), dstLen, reinterpret_cast(dst), &err_code, &err_position) / sizeof(ULONG); } else { fb_assert(tt->texttype_canonical_width == getCharSet()->minBytesPerChar()); fb_assert(dstLen >= srcLen); memcpy(dst, src, srcLen); return srcLen / getCharSet()->minBytesPerChar(); } } const UCHAR* getSqlMatchAnyCanonic() const { return sqlMatchAnyCanonic; } const UCHAR* getSqlMatchOneCanonic() const { return sqlMatchOneCanonic; } const UCHAR* getGdmlMatchOneCanonic() const { return gdmlMatchOneCanonic; } const UCHAR* getGdmlMatchAnyCanonic() const { return gdmlMatchAnyCanonic; } const UCHAR* getGdmlQuoteCanonic() const { return gdmlQuoteCanonic; } const UCHAR* getGdmlNotCanonic() const { return gdmlNotCanonic; } const UCHAR* getGdmlRangeCanonic() const { return gdmlRangeCanonic; } const UCHAR* getGdmlClassStartCanonic() const { return gdmlClassStartCanonic; } const UCHAR* getGdmlClassEndCanonic() const { return gdmlClassEndCanonic; } const UCHAR* getGdmlSubstituteCanonic() const { return gdmlSubstituteCanonic; } const UCHAR* getGdmlFlagSetCanonic() const { return gdmlFlagSetCanonic; } const UCHAR* getGdmlFlagClearCanonic() const { return gdmlFlagClearCanonic; } const UCHAR* getGdmlCommaCanonic() const { return gdmlCommaCanonic; } const UCHAR* getGdmlLParenCanonic() const { return gdmlLParenCanonic; } const UCHAR* getGdmlRParenCanonic() const { return gdmlRParenCanonic; } const UCHAR* getGdmlUpperSCanonic() const { return gdmlUpperSCanonic; } const UCHAR* getGdmlLowerSCanonic() const { return gdmlLowerSCanonic; } USHORT getType() const { return type; } CharSet* getCharSet() const { return cs; } BYTE getCanonicalWidth() const { fb_assert(tt); return tt->texttype_canonical_width; } USHORT getFlags() const { fb_assert(tt); return tt->texttype_flags; } void destroy() { fb_assert(tt); if (tt->texttype_fn_destroy) tt->texttype_fn_destroy(tt); } virtual bool matches(thread_db* tdbb, const UCHAR* a, SLONG b, const UCHAR* c, SLONG d) = 0; virtual bool sleuth_check(thread_db* tdbb, USHORT a, const UCHAR* b, SLONG c, const UCHAR* d, SLONG e) = 0; virtual ULONG sleuth_merge(thread_db* tdbb, const UCHAR* a, SLONG b, const UCHAR* c, SLONG d, UCHAR* e, SLONG f) = 0; virtual bool like(thread_db* tdbb, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length) = 0; virtual LikeObject *like_create(thread_db* tdbb, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length) = 0; virtual bool contains(thread_db* tdbb, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl) = 0; virtual ContainsObject *contains_create(thread_db* tdbb, const UCHAR* p, SLONG pl) = 0; private: TTYPE_ID type; texttype* tt; CharSet* cs; UCHAR sqlMatchAnyCanonic[sizeof(ULONG)]; UCHAR sqlMatchOneCanonic[sizeof(ULONG)]; UCHAR gdmlMatchOneCanonic[sizeof(ULONG)]; UCHAR gdmlMatchAnyCanonic[sizeof(ULONG)]; UCHAR gdmlQuoteCanonic[sizeof(ULONG)]; UCHAR gdmlNotCanonic[sizeof(ULONG)]; UCHAR gdmlRangeCanonic[sizeof(ULONG)]; UCHAR gdmlClassStartCanonic[sizeof(ULONG)]; UCHAR gdmlClassEndCanonic[sizeof(ULONG)]; UCHAR gdmlSubstituteCanonic[sizeof(ULONG)]; UCHAR gdmlFlagSetCanonic[sizeof(ULONG)]; UCHAR gdmlFlagClearCanonic[sizeof(ULONG)]; UCHAR gdmlCommaCanonic[sizeof(ULONG)]; UCHAR gdmlLParenCanonic[sizeof(ULONG)]; UCHAR gdmlRParenCanonic[sizeof(ULONG)]; UCHAR gdmlUpperSCanonic[sizeof(ULONG)]; UCHAR gdmlLowerSCanonic[sizeof(ULONG)]; }; } //namespace Jrd #endif /* JRD_INTL_CLASSES_H */