/************* history ************ * * COMPONENT: JRD MODULE: INTL.CPP * generated by Marion V2.5 2/6/90 * from dev db on 4-JAN-1995 ***************************************************************** * * PR 2002-06-02 Added ugly c hack in * intl_back_compat_alloc_func_lookup. * When someone has time we need to change the references to * return (void*) function to something more C++ like * * 42 4711 3 11 17 tamlin 2001 * Added silly numbers before my name, and converted it to C++. * * 18850 daves 4-JAN-1995 * Fix gds__alloc usage * * 18837 deej 31-DEC-1994 * fixing up HARBOR_MERGE * * 18821 deej 27-DEC-1994 * HARBOR MERGE * * 18789 jdavid 19-DEC-1994 * Cast some functions * * 17508 jdavid 15-JUL-1994 * Bring it up to date * * 17500 daves 13-JUL-1994 * Bug 6645: Different calculation of partial keys * * 17202 katz 24-MAY-1994 * PC_PLATFORM requires the .dll extension * * 17191 katz 23-MAY-1994 * OS/2 requires the .dll extension * * 17180 katz 23-MAY-1994 * Define location of DLL on OS/2 * * 17149 katz 20-MAY-1994 * In JRD, isc_arg_number arguments are SLONG's not int's * * 16633 daves 19-APR-1994 * Bug 6202: International licensing uses INTERNATIONAL product code * * 16555 katz 17-APR-1994 * The last argument of calls to ERR_post should be 0 * * 16521 katz 14-APR-1994 * Borland C needs a decorated symbol to lookup * * 16403 daves 8-APR-1994 * Bug 6441: Emit an error whenever transliteration from ttype_binary attempted * * 16141 katz 28-MAR-1994 * Don't declare return value from ISC_lookup_entrypoint as API_ROUTINE * * The contents of this file are subject to the Interbase Public * License Version 1.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy * of the License at http://www.Inprise.com/IPL.html * * Software distributed under the License is distributed on an * "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express * or implied. 
See the License for the specific language governing * rights and limitations under the License. * * The Original Code was created by Inprise Corporation * and its predecessors. Portions created by Inprise Corporation are * Copyright (C) Inprise Corporation. * * All Rights Reserved. * Contributor(s): ______________________________________. * * 2002.10.29 Sean Leyne - Removed obsolete "Netware" port * * 2002.10.30 Sean Leyne - Removed support for obsolete "PC_PLATFORM" define * */ /* * PROGRAM: JRD Intl * MODULE: intl.cpp * DESCRIPTION: International text support routines * * copyright (c) 1992, 1993 by Borland International */ #include "firebird.h" #include #include "../jrd/common.h" #include #include "../jrd/jrd.h" #include "../jrd/req.h" #include "../jrd/val.h" #include "gen/iberror.h" #include "../jrd/intl.h" #include "../jrd/intl_classes.h" #include "../jrd/ods.h" #include "../jrd/btr.h" #include "../intl/charsets.h" #include "../intl/country_codes.h" #include "../jrd/gdsassert.h" #include "../jrd/license.h" #ifdef INTL_BUILTIN #include "../intl/ld_proto.h" #endif #include "../jrd/all_proto.h" #include "../jrd/cvt_proto.h" #include "../jrd/err_proto.h" #include "../jrd/fun_proto.h" #include "../jrd/gds_proto.h" #include "../jrd/iberr_proto.h" #include "../jrd/intl_proto.h" #include "../jrd/isc_proto.h" #include "../jrd/met_proto.h" #include "../jrd/thd.h" #include "../jrd/evl_string.h" #include "../jrd/jrd.h" #include "../jrd/evl_like.h" #include "../jrd/mov_proto.h" #include "../jrd/IntlManager.h" #include "../common/classes/init.h" using namespace Jrd; #define IS_TEXT(x) (((x)->dsc_dtype == dtype_text) ||\ ((x)->dsc_dtype == dtype_varying)||\ ((x)->dsc_dtype == dtype_cstring)) #define TTYPE_TO_CHARSET(tt) ((SSHORT)((tt) & 0x00FF)) #define TTYPE_TO_COLLATION(tt) ((SSHORT)((tt) >> 8)) static bool all_spaces(thread_db*, CHARSET_ID, const BYTE*, ULONG, ULONG); static void pad_spaces(thread_db*, CHARSET_ID, BYTE *, ULONG); static INTL_BOOL lookup_charset(charset* 
cs, const SubtypeInfo* info); static INTL_BOOL lookup_texttype(texttype* tt, const SubtypeInfo* info); // We need all the structure definitions from the old interface #define INTL_ENGINE_INTERNAL #include "../jrd/intlobj_new.h" // Classes and structures used internally to this file and intl implementation class CharSetContainer { public: CharSetContainer(MemoryPool& p, USHORT cs_id, const SubtypeInfo* info); void destroy() { cs->destroy(); for (size_t i = 0; i < charset_collations.getCount(); i++) if (charset_collations[i]) charset_collations[i]->destroy(); } CharSet* getCharSet() { return cs; } TextType* lookupCollation(thread_db* tdbb, USHORT tt_id); CsConvert lookupConverter(thread_db* tdbb, CHARSET_ID to_cs); static CharSetContainer* lookupCharset(thread_db* tdbb, SSHORT ttype, ISC_STATUS *status); private: Firebird::Array charset_collations; CharSet* cs; }; /* Below are templates for functions used in TextType implementation */ class NullStrConverter { public: NullStrConverter(thread_db* tdbb, const TextType* obj, const UCHAR *str, SLONG len) { } }; template class UpcaseConverter : public PrevConverter { public: UpcaseConverter(thread_db* tdbb, TextType* obj, const UCHAR* &str, SLONG &len) : PrevConverter(tdbb, obj, str, len) { if (len > (int) sizeof(tempBuffer)) out_str = FB_NEW(*tdbb->getDefaultPool()) UCHAR[len]; else out_str = tempBuffer; obj->str_to_upper(len, str, len, out_str); str = out_str; } ~UpcaseConverter() { if (out_str != tempBuffer) delete[] out_str; } private: UCHAR tempBuffer[100], *out_str; }; template class CanonicalConverter : public PrevConverter { public: CanonicalConverter(thread_db* tdbb, TextType* obj, const UCHAR* &str, SLONG &len) : PrevConverter(tdbb, obj, str, len) { SLONG out_len = len / obj->getCharSet()->minBytesPerChar() * obj->getCanonicalWidth(); if (out_len > (int) sizeof(tempBuffer)) out_str = FB_NEW(*tdbb->getDefaultPool()) UCHAR[out_len]; else out_str = tempBuffer; if (str) { len = obj->canonical(len, str, out_len, 
out_str) * obj->getCanonicalWidth(); str = out_str; } else len = 0; } ~CanonicalConverter() { if (out_str != tempBuffer) delete[] out_str; } private: UCHAR tempBuffer[100], *out_str; }; template class LikeObjectImpl : public LikeObject { public: LikeObjectImpl(MemoryPool& pool, const CharType* str, SLONG str_len, CharType escape, bool use_escape, CharType sql_match_any, CharType sql_match_one) : evaluator(pool, str, str_len, escape, use_escape, sql_match_any, sql_match_one) { } void reset() { evaluator.reset(); } bool result() { return evaluator.getResult(); } bool process(thread_db* tdbb, Jrd::TextType* ttype, const UCHAR* str, SLONG length) { StrConverter cvt(tdbb, ttype, str, length); fb_assert(length % sizeof(CharType) == 0); return evaluator.processNextChunk( reinterpret_cast(str), length / sizeof(CharType)); } ~LikeObjectImpl() {} static LikeObject* create(thread_db* tdbb, TextType* ttype, const UCHAR* str, SLONG length, const UCHAR* escape, SLONG escape_length, const UCHAR* sql_match_any, SLONG match_any_length, const UCHAR* sql_match_one, SLONG match_one_length) { StrConverter cvt(tdbb, ttype, str, length), cvt_escape(tdbb, ttype, escape, escape_length), cvt_match_any(tdbb, ttype, sql_match_any, match_any_length), cvt_match_one(tdbb, ttype, sql_match_one, match_one_length); fb_assert(length % sizeof(CharType) == 0); return FB_NEW(*tdbb->getDefaultPool()) LikeObjectImpl(*tdbb->getDefaultPool(), reinterpret_cast(str), length / sizeof(CharType), (escape ? 
*reinterpret_cast(escape) : 0), escape_length != 0, *reinterpret_cast(sql_match_any), *reinterpret_cast(sql_match_one)); } static bool evaluate(thread_db* tdbb, TextType* ttype, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length, const UCHAR* sql_match_any, SLONG match_any_length, const UCHAR* sql_match_one, SLONG match_one_length) { StrConverter cvt1(tdbb, ttype, p, pl), cvt2(tdbb, ttype, s, sl), cvt_escape(tdbb, ttype, escape, escape_length), cvt_match_any(tdbb, ttype, sql_match_any, match_any_length), cvt_match_one(tdbb, ttype, sql_match_one, match_one_length); fb_assert(pl % sizeof(CharType) == 0); fb_assert(sl % sizeof(CharType) == 0); Firebird::LikeEvaluator evaluator(*tdbb->getDefaultPool(), reinterpret_cast(p), pl / sizeof(CharType), (escape ? *reinterpret_cast(escape) : 0), escape_length != 0, *reinterpret_cast(sql_match_any), *reinterpret_cast(sql_match_one)); evaluator.processNextChunk(reinterpret_cast(s), sl / sizeof(CharType)); return evaluator.getResult(); } private: Firebird::LikeEvaluator evaluator; }; template class ContainsObjectImpl : public ContainsObject { public: ContainsObjectImpl(MemoryPool& pool, const CharType* str, SLONG str_len) : evaluator(pool, str, str_len) { } void reset() { evaluator.reset(); } bool result() { return evaluator.getResult(); } bool process(thread_db* tdbb, Jrd::TextType* ttype, const UCHAR* str, SLONG length) { StrConverter cvt(tdbb, ttype, str, length); fb_assert(length % sizeof(CharType) == 0); return evaluator.processNextChunk( reinterpret_cast(str), length / sizeof(CharType)); } ~ContainsObjectImpl() {} static ContainsObject* create(thread_db* tdbb, TextType* ttype, const UCHAR* str, SLONG length) { StrConverter cvt(tdbb, ttype, str, length); fb_assert(length % sizeof(CharType) == 0); return FB_NEW(*tdbb->getDefaultPool()) ContainsObjectImpl(*tdbb->getDefaultPool(), reinterpret_cast(str), length / sizeof(CharType)); } static bool evaluate(thread_db* tdbb, TextType* ttype, 
const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl) { StrConverter cvt1(tdbb, ttype, p, pl), cvt2(tdbb, ttype, s, sl); fb_assert(pl % sizeof(CharType) == 0); fb_assert(sl % sizeof(CharType) == 0); Firebird::ContainsEvaluator evaluator(*tdbb->getDefaultPool(), reinterpret_cast(p), pl / sizeof(CharType)); evaluator.processNextChunk(reinterpret_cast(s), sl / sizeof(CharType)); return evaluator.getResult(); } private: Firebird::ContainsEvaluator evaluator; }; template class MatchesObjectImpl { public: static bool evaluate(thread_db* tdbb, TextType* ttype, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl) { StrConverter cvt1(tdbb, ttype, p, pl), cvt2(tdbb, ttype, s, sl); fb_assert(pl % sizeof(CharType) == 0); fb_assert(sl % sizeof(CharType) == 0); return MATCHESNAME(tdbb, ttype, reinterpret_cast(s), sl, reinterpret_cast(p), pl); } }; template class SleuthObjectImpl { public: static bool check(thread_db* tdbb, TextType* ttype, USHORT flags, const UCHAR* search, SLONG search_len, const UCHAR* match, SLONG match_len) { StrConverter cvt1(tdbb, ttype, search, search_len), cvt2(tdbb, ttype, match, match_len); fb_assert(search_len % sizeof(CharType) == 0); fb_assert(match_len % sizeof(CharType) == 0); return SLEUTHNAME(tdbb, ttype, flags, reinterpret_cast(search), search_len, reinterpret_cast(match), match_len); } static bool merge(thread_db* tdbb, TextType* ttype, const UCHAR* match, SLONG match_bytes, const UCHAR* control, SLONG control_bytes, UCHAR* combined, SLONG combined_bytes) { StrConverter cvt1(tdbb, ttype, match, match_bytes), cvt2(tdbb, ttype, control, control_bytes); fb_assert(match_bytes % sizeof(CharType) == 0); fb_assert(control_bytes % sizeof(CharType) == 0); return SLEUTH_MERGE_NAME(tdbb, ttype, reinterpret_cast(match), match_bytes, reinterpret_cast(control), control_bytes, reinterpret_cast(combined), combined_bytes); } }; class FixedWidthCharSet : public CharSet { public: FixedWidthCharSet(CHARSET_ID _id, charset* _cs) : CharSet(_id, _cs) {} virtual ULONG 
length(thread_db* tdbb, ULONG srcLen, const UCHAR* src, bool countTrailingSpaces) const { fb_assert(getStruct()); if (!countTrailingSpaces) srcLen = removeTrailingSpaces(srcLen, src); if (getStruct()->charset_fn_length) return getStruct()->charset_fn_length(getStruct(), srcLen, src); else return srcLen / minBytesPerChar(); } virtual ULONG substring(thread_db* tdbb, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst, ULONG startPos, ULONG length) const { fb_assert(getStruct()); if (getStruct()->charset_fn_substring) return getStruct()->charset_fn_substring(getStruct(), srcLen, src, dstLen, dst, startPos, length); else { fb_assert(src != NULL && dst != NULL); if (dstLen < length * minBytesPerChar()) return INTL_BAD_STR_LENGTH; else if (startPos * minBytesPerChar() > srcLen) return 0; length = MIN(srcLen / minBytesPerChar() - startPos, length) * minBytesPerChar(); memcpy(dst, src + startPos * minBytesPerChar(), length); return length; } } }; class MultiByteCharSet : public CharSet { public: MultiByteCharSet(CHARSET_ID _id, charset* _cs) : CharSet(_id, _cs) {} virtual ULONG length(thread_db* tdbb, ULONG srcLen, const UCHAR* src, bool countTrailingSpaces) const { fb_assert(getStruct()); if (!countTrailingSpaces) srcLen = removeTrailingSpaces(srcLen, src); if (getStruct()->charset_fn_length) return getStruct()->charset_fn_length(getStruct(), srcLen, src); else { USHORT errCode; ULONG errPos; ULONG length = getConvToUnicode().convertLength(srcLen); // convert to UTF16 Firebird::HalfStaticArray str; length = getConvToUnicode().convert(srcLen, src, length, str.getBuffer(length / sizeof(USHORT)), &errCode, &errPos); // calculate length of UTF16 return UnicodeUtil::utf16Length(length, str.begin()); } } virtual ULONG substring(thread_db* tdbb, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst, ULONG startPos, ULONG length) const { fb_assert(getStruct()); if (getStruct()->charset_fn_substring) return getStruct()->charset_fn_substring(getStruct(), srcLen, src, 
dstLen, dst, startPos, length); else { fb_assert(src != NULL && dst != NULL); if (length == 0 || startPos >= srcLen) return 0; USHORT errCode; ULONG errPos; // convert to UTF16 Firebird::HalfStaticArray str; ULONG length = getConvToUnicode().convertLength(srcLen); length = getConvToUnicode().convert(srcLen, src, length, reinterpret_cast(str.getBuffer(length)), &errCode, &errPos); // generate substring of UTF16 Firebird::HalfStaticArray substr; length = UnicodeUtil::utf16Substring(length, reinterpret_cast(str.begin()), length, reinterpret_cast(substr.getBuffer(length)), startPos, length); // convert generated substring to original charset return getConvFromUnicode().convert(length, substr.begin(), dstLen, dst, &errCode, &errPos); } } }; template class CollationImpl : public TextType { public: CollationImpl(TTYPE_ID type, TEXTTYPE tt, CharSet* cs) : TextType(type, tt, cs) {} virtual bool matches(thread_db* tdbb, const UCHAR* a, SLONG b, const UCHAR* c, SLONG d) { return pMatchesObjectImpl::evaluate(tdbb, this, a, b, c, d); } virtual bool sleuth_check(thread_db* tdbb, USHORT a, const UCHAR* b, SLONG c, const UCHAR* d, SLONG e) { return pSleuthObjectImpl::check(tdbb, this, a, b, c, d, e); } virtual ULONG sleuth_merge(thread_db* tdbb, const UCHAR* a, SLONG b, const UCHAR* c, SLONG d, UCHAR* e, SLONG f) { return pSleuthObjectImpl::merge(tdbb, this, a, b, c, d, e, f); } virtual bool like(thread_db* tdbb, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length) { return pLikeObjectImpl::evaluate(tdbb, this, s, sl, p, pl, escape, escape_length, getCharSet()->getSqlMatchAny(), getCharSet()->getSqlMatchAnyLength(), getCharSet()->getSqlMatchOne(), getCharSet()->getSqlMatchOneLength()); } virtual LikeObject *like_create(thread_db* tdbb, const UCHAR* p, SLONG pl, const UCHAR* escape, SLONG escape_length) { return pLikeObjectImpl::create(tdbb, this, p, pl, escape, escape_length, getCharSet()->getSqlMatchAny(), 
getCharSet()->getSqlMatchAnyLength(), getCharSet()->getSqlMatchOne(), getCharSet()->getSqlMatchOneLength()); } virtual bool contains(thread_db* tdbb, const UCHAR* s, SLONG sl, const UCHAR* p, SLONG pl) { return pContainsObjectImpl::evaluate(tdbb, this, s, sl, p, pl); } virtual ContainsObject *contains_create(thread_db* tdbb, const UCHAR* p, SLONG pl) { return pContainsObjectImpl::create(tdbb, this, p, pl); } }; typedef ContainsObjectImpl, UCHAR> uchar_contains_direct; typedef ContainsObjectImpl, USHORT> ushort_contains_direct; typedef ContainsObjectImpl, ULONG> ulong_contains_direct; typedef MatchesObjectImpl, UCHAR> uchar_matches_canonical; typedef SleuthObjectImpl, UCHAR> uchar_sleuth_canonical; typedef LikeObjectImpl, UCHAR> uchar_like_canonical; typedef ContainsObjectImpl >, UCHAR> uchar_contains_canonical; typedef MatchesObjectImpl, USHORT> ushort_matches_canonical; typedef SleuthObjectImpl, USHORT> ushort_sleuth_canonical; typedef LikeObjectImpl, USHORT> ushort_like_canonical; typedef ContainsObjectImpl >, USHORT> ushort_contains_canonical; typedef MatchesObjectImpl, ULONG> ulong_matches_canonical; typedef SleuthObjectImpl, ULONG> ulong_sleuth_canonical; typedef LikeObjectImpl, ULONG> ulong_like_canonical; typedef ContainsObjectImpl >, ULONG> ulong_contains_canonical; CharSetContainer* CharSetContainer::lookupCharset(thread_db* tdbb, SSHORT ttype, ISC_STATUS *status) { /************************************** * * l o o k u p C h a r s e t * ************************************** * * Functional description * * Lookup a character set descriptor. * * First, search the appropriate vector that hangs * off the dbb. If not found, then call the lower * level lookup routine to allocate it, or return * null if we don't know about the charset. 
* * Returns: * *charset - if no errors; * - if error & err non NULL * NULL - if error & err NULL * **************************************/ CharSetContainer *cs = NULL; SET_TDBB(tdbb); Database* dbb = tdbb->tdbb_database; USHORT id = TTYPE_TO_CHARSET(ttype); if (id == CS_dynamic) id = tdbb->tdbb_attachment->att_charset; if (id >= dbb->dbb_charsets.size()) dbb->dbb_charsets.resize(id + 10); else cs = dbb->dbb_charsets[id]; // allocate a new character set object if we couldn't find one. if (!cs) { SubtypeInfo info; if (id == CS_UTF16) info.charsetName = "UTF16"; if ((id == CS_UTF16) || MET_get_char_coll_subtype_info(tdbb, id, &info)) { cs = FB_NEW(*dbb->dbb_permanent) CharSetContainer(*dbb->dbb_permanent, id, &info); if (cs->getCharSet() == NULL) { delete cs; return NULL; } dbb->dbb_charsets[id] = cs; } } return cs; } CharSetContainer::CharSetContainer(MemoryPool& p, USHORT cs_id, const SubtypeInfo* info) : charset_collations(p), cs(NULL) { charset* csL = FB_NEW(p) charset; memset(csL, 0, sizeof(charset)); if (lookup_charset(csL, info) && (csL->charset_flags & CHARSET_ASCII_BASED)) { if (csL->charset_min_bytes_per_char != csL->charset_max_bytes_per_char) this->cs = FB_NEW(p) MultiByteCharSet(cs_id, csL); else this->cs = FB_NEW(p) FixedWidthCharSet(cs_id, csL); } else { delete csL; csL = NULL; } } CsConvert CharSetContainer::lookupConverter(thread_db* tdbb, CHARSET_ID to_cs) { if (to_cs == CS_UTF16) { return cs->getConvToUnicode(); } if (cs->getId() == CS_UTF16) { CharSet* to_charset = INTL_charset_lookup(tdbb, to_cs, NULL); if (to_charset == NULL) return NULL; return to_charset->getConvFromUnicode(); } //// TODO: converters return NULL; } TextType* CharSetContainer::lookupCollation(thread_db* tdbb, USHORT tt_id) { const USHORT id = TTYPE_TO_COLLATION(tt_id); if (id < charset_collations.getCount() && charset_collations[id] != NULL) return charset_collations[id]; SubtypeInfo info; if (MET_get_char_coll_subtype_info(tdbb, tt_id, &info)) { CharSet* charset = 
INTL_charset_lookup(tdbb, TTYPE_TO_CHARSET(tt_id), NULL); if (TTYPE_TO_CHARSET(tt_id) != CS_METADATA) { Firebird::HalfStaticArray specificAttributes; ULONG size = info.specificAttributes.getCount() * charset->maxBytesPerChar(); size = INTL_convert_bytes(tdbb, TTYPE_TO_CHARSET(tt_id), specificAttributes.getBuffer(size), size, CS_METADATA, info.specificAttributes.begin(), info.specificAttributes.getCount(), ERR_post); specificAttributes.shrink(size); info.specificAttributes = specificAttributes; } TEXTTYPE tt = FB_NEW(*tdbb->tdbb_database->dbb_permanent) texttype; memset(tt, 0, sizeof(texttype)); if (!lookup_texttype(tt, &info)) { delete tt; return NULL; } if (charset_collations.getCount() <= id) charset_collations.grow(id + 1); if (charset_collations[id] == NULL) { fb_assert((tt->texttype_canonical_width == 0 && tt->texttype_fn_canonical == NULL) || (tt->texttype_canonical_width != 0 && tt->texttype_fn_canonical != NULL)); if (tt->texttype_canonical_width == 0) { if (charset->isMultiByte()) tt->texttype_canonical_width = sizeof(ULONG); // UTF-32 else { tt->texttype_canonical_width = charset->minBytesPerChar(); // canonical is equal to string, then TEXTTYPE_DIRECT_MATCH can be turned on tt->texttype_flags |= TEXTTYPE_DIRECT_MATCH; } } fb_assert(tt->texttype_canonical_width == 1 || tt->texttype_canonical_width == 2 || tt->texttype_canonical_width == 4); switch (tt->texttype_canonical_width) { case 1: if (tt->texttype_flags & TEXTTYPE_DIRECT_MATCH) { charset_collations[id] = FB_NEW(*tdbb->tdbb_database->dbb_permanent) CollationImpl(tt_id, tt, charset); } else { charset_collations[id] = FB_NEW(*tdbb->tdbb_database->dbb_permanent) CollationImpl(tt_id, tt, charset); } break; case 2: if (tt->texttype_flags & TEXTTYPE_DIRECT_MATCH) { charset_collations[id] = FB_NEW(*tdbb->tdbb_database->dbb_permanent) CollationImpl(tt_id, tt, charset); } else { charset_collations[id] = FB_NEW(*tdbb->tdbb_database->dbb_permanent) CollationImpl(tt_id, tt, charset); } break; case 4: if 
(tt->texttype_flags & TEXTTYPE_DIRECT_MATCH)
				{
					/* NOTE(review): CollationImpl is declared as a template in this file, yet
					   no template argument list is visible on the two allocations below -
					   confirm against a pristine copy of the file. */
					charset_collations[id] = FB_NEW(*tdbb->tdbb_database->dbb_permanent)
						CollationImpl(tt_id, tt, charset);
				}
				else
				{
					charset_collations[id] = FB_NEW(*tdbb->tdbb_database->dbb_permanent)
						CollationImpl(tt_id, tt, charset);
				}
				break;

			default:
				/* The assert just before the switch only admits widths 1, 2 and 4. */
				fb_assert(false);
				return NULL;
			}
		}

		return charset_collations[id];
	}
	else
		return NULL;
}


/* Fill *cs for the character set named by info->charsetName.
   Forwards to IntlManager::lookupCharSet and returns its result unchanged. */
static INTL_BOOL lookup_charset(charset* cs, const SubtypeInfo* info)
{
	return IntlManager::lookupCharSet(info->charsetName, cs);
}


/* Fill *tt for the collation described by info.
   Forwards the base collation name, charset name, attributes, the
   specific-attributes buffer (pointer and count) and the ignoreAttributes
   field to IntlManager::lookupCollation and returns its result unchanged. */
static INTL_BOOL lookup_texttype(texttype* tt, const SubtypeInfo* info)
{
	return IntlManager::lookupCollation(info->baseCollationName, info->charsetName,
										info->attributes,
										info->specificAttributes.begin(),
										info->specificAttributes.getCount(),
										info->ignoreAttributes, tt);
}


/* Call destroy() on every non-null entry of dbb_charsets.
   The entries themselves are neither deleted nor reset to NULL here. */
void Database::destroyIntlObjects()
{
	for (size_t i = 0; i < dbb_charsets.size(); i++)
		if (dbb_charsets[i])
			dbb_charsets[i]->destroy();
}


CHARSET_ID INTL_charset(thread_db* tdbb, USHORT ttype, FPTR_ERROR err)
{
/**************************************
 *
 *	I N T L _ c h a r s e t
 *
 **************************************
 *
 * Functional description
 *	Return the character set ID for a piece of text.
 *
 *	The predefined text types (none, ascii, unicode_fss, binary)
 *	map to their fixed charset IDs.  ttype_dynamic yields the
 *	charset of the current attachment.  Any other value is
 *	reduced with TTYPE_TO_CHARSET.
 *
 *	tdbb is only used (after SET_TDBB) for ttype_dynamic.
 *	err is not referenced in the body.
 *
 **************************************/

	switch (ttype) {
	case ttype_none:
		return (CS_NONE);
	case ttype_ascii:
		return (CS_ASCII);
	case ttype_unicode_fss:
		return (CS_UNICODE_FSS);
	case ttype_binary:
		return (CS_BINARY);
	case ttype_dynamic:
		SET_TDBB(tdbb);
		return (tdbb->tdbb_attachment->att_charset);
	default:
		return (TTYPE_TO_CHARSET(ttype));
	}
}


int INTL_compare(thread_db* tdbb, const dsc* pText1, const dsc* pText2, FPTR_ERROR err)
{
/**************************************
 *
 *	I N T L _ c o m p a r e
 *
 **************************************
 *
 * Functional description
 *	Compare two pieces of international text.
*
**************************************/
	SET_TDBB(tdbb);

	fb_assert(pText1 != NULL);
	fb_assert(pText2 != NULL);
	fb_assert(IS_TEXT(pText1) && IS_TEXT(pText2));
	fb_assert(INTL_data_or_binary(pText1) || INTL_data_or_binary(pText2));
	fb_assert(err);

	/* normal compare routine from CVT_compare */
	/* trailing spaces in strings are ignored for comparison */

	/* Fetch pointer, byte length and text type of each operand. */
	UCHAR* p1;
	USHORT t1;
	USHORT length1 = CVT_get_string_ptr(pText1, &t1, &p1, NULL, 0, err);

	UCHAR* p2;
	USHORT t2;
	USHORT length2 = CVT_get_string_ptr(pText2, &t2, &p2, NULL, 0, err);

	/* YYY - by SQL II compare_type must be explicit in the
	   SQL statement if there is any doubt */

	/* The numerically larger text type decides how the comparison is done. */
	SSHORT compare_type = MAX(t1, t2);	/* YYY */

	/* Scratch space for at most one transliterated operand (MAX_KEY bytes). */
	UCHAR buffer[MAX_KEY];
	if (t1 != t2) {
		CHARSET_ID cs1 = INTL_charset(tdbb, t1, err);
		CHARSET_ID cs2 = INTL_charset(tdbb, t2, err);
		if (cs1 != cs2) {
			if (compare_type != t2) {
				/* convert pText2 to pText1's type, if possible */
				/* YYY - should failure to convert really return
				   an error here?
				   Support joining a 437 & Latin1 Column, and we
				   pick the compare_type as 437, still only want the
				   equal values....
				   But then, what about < operations, which make no
				   sense if the string cannot be expressed... */

				length2 = INTL_convert_bytes(tdbb, cs1, buffer, sizeof(buffer),
											 cs2, p2, length2, err);
				p2 = buffer;
			}
			else {
				/* convert pText1 to pText2's type, if possible */

				length1 = INTL_convert_bytes(tdbb, cs2, buffer, sizeof(buffer),
											 cs1, p1, length1, err);
				p1 = buffer;
			}
		}
	}

	/* NOTE(review): obj is dereferenced without a NULL test; presumably
	   INTL_texttype_lookup reports failure through err - confirm. */
	TextType* obj = INTL_texttype_lookup(tdbb, compare_type, err, NULL);

	return obj->compare(length1, p1, length2, p2);
}


ULONG INTL_convert_bytes(thread_db* tdbb,
						 CHARSET_ID dest_type,
						 BYTE* dest_ptr,
						 ULONG dest_len,
						 CHARSET_ID src_type,
						 const BYTE* src_ptr,
						 ULONG src_len,
						 FPTR_ERROR err)
{
/**************************************
 *
 *	I N T L _ c o n v e r t _ b y t e s
 *
 **************************************
 *
 * Functional description
 *	Given a string of bytes in one character set, convert it to another
 *	character set.
 *
 *	If (dest_ptr) is NULL, return the count of bytes needed to convert
 *	the string.  This does not guarantee the string can be converted,
 *	the purpose of this is to allocate a large enough buffer.
 *
 *	Paths taken, in order:
 *	  - destination is CS_BINARY or CS_NONE: plain byte copy of at most
 *	    dest_len bytes; any source bytes left over must be spaces.
 *	  - empty source: returns 0.
 *	  - source is CS_BINARY: reported as a transliteration failure.
 *	  - otherwise: use a direct converter if INTL_convert_lookup finds
 *	    one, else convert source -> Unicode -> destination through a
 *	    temporary buffer.
 *
 * RETURNS:
 *	Length of resulting string, in bytes.
 *	calls (err) if conversion error occurs.
 *
 *	NOTE(review): several statements following an (*err)(...) call would
 *	dereference NULL or free tmp_buffer twice if err ever returned;
 *	the code appears to assume err does not return - confirm.
 *
 **************************************/
	ULONG len;
	ULONG len2;
	USHORT err_code = 0;
	ULONG err_position;

	SET_TDBB(tdbb);

	fb_assert(src_ptr != NULL);
	fb_assert(src_type != dest_type);
	fb_assert(err != NULL);

	/* Remember where the output starts so the copied length can be computed. */
	const UCHAR* const start_dest_ptr = dest_ptr;

	if ((dest_type == CS_BINARY) || (dest_type == CS_NONE)) {
		/* See if we just need a length estimate */
		if (dest_ptr == NULL)
			return (src_len);

		len = MIN(dest_len, src_len);
		if (len)
			do {
				*dest_ptr++ = *src_ptr++;
			} while (--len);

		/* See if only space characters are remaining */
		/* src_ptr now points at the first source byte that did not fit. */
		len = src_len - MIN(dest_len, src_len);
		if (!len || all_spaces(tdbb, src_type, src_ptr, len, 0))
			return (dest_ptr - start_dest_ptr);
		else
			(*err) (isc_arith_except, 0);
	}
	else if (src_len == 0)
		return (0);
	else if (src_type == CS_BINARY)
		(*err)(isc_arith_except, isc_arg_gds, isc_transliteration_failed, 0);
	else /* character sets are known to be different */
	{
		/* Do we know an object from cs1 to cs2? */

		CsConvert cs_obj = INTL_convert_lookup(tdbb, dest_type, src_type);
		if (cs_obj != NULL) {
			len = cs_obj.convert(src_len, src_ptr, dest_len, dest_ptr,
								 &err_code, &err_position);
			/* Truncation is tolerated when everything from err_position on is spaces. */
			if (!err_code ||
				((err_code == CS_TRUNCATION_ERROR) &&
				 all_spaces(tdbb, src_type, src_ptr, src_len, err_position)))
			{
				return (len);
			}
			else if (err_code == CS_TRUNCATION_ERROR)
				(*err) (isc_arith_except, 0);
			else
				(*err) (isc_arith_except, isc_arg_gds, isc_transliteration_failed, 0);
		}

		/* Find a CS1 to UNICODE object */

		CharSet* from_cs = INTL_charset_lookup(tdbb, src_type, NULL);
		if (from_cs == NULL)
			(*err)(isc_arith_except, isc_arg_gds, isc_text_subtype,
				   isc_arg_number, (ISC_STATUS) src_type, 0);

		/*
		** allocate a temporary buffer that is large enough.
		*/
		/* Sized at sizeof(ULONG) bytes per source byte. */
		BYTE* tmp_buffer = (BYTE *) FB_NEW(*tdbb->getDefaultPool()) char[(SLONG) src_len * sizeof(ULONG)];

		cs_obj = from_cs->getConvToUnicode();
		fb_assert(cs_obj != NULL);
		len = cs_obj.convert(src_len, src_ptr, src_len * sizeof(ULONG), tmp_buffer,
							 &err_code, &err_position);
		if (err_code &&
			!((err_code == CS_TRUNCATION_ERROR) &&
			  all_spaces(tdbb, src_type, src_ptr, src_len, err_position)))
		{
			/* Release the scratch buffer before reporting the error. */
			delete [] tmp_buffer;
			if (err_code == CS_TRUNCATION_ERROR)
				(*err) (isc_arith_except, 0);
			else
				(*err) (isc_arith_except, isc_arg_gds, isc_transliteration_failed, 0);
		}

		/* Find a UNICODE to CS2 object */

		CharSet* to_cs = INTL_charset_lookup(tdbb, dest_type, NULL);
		if (to_cs == NULL) {
			delete [] tmp_buffer;
			(*err) (isc_arith_except, isc_arg_gds, isc_text_subtype,
					isc_arg_number, (ISC_STATUS) dest_type, 0);
		}

		cs_obj = to_cs->getConvFromUnicode();
		fb_assert(cs_obj != NULL);
		len2 = cs_obj.convert(len, tmp_buffer, dest_len, dest_ptr,
							  &err_code, &err_position);
		/* The intermediate text is checked for trailing spaces as CS_UTF16. */
		if (err_code &&
			!((err_code == CS_TRUNCATION_ERROR) &&
			  all_spaces(tdbb, CS_UTF16, tmp_buffer, len, err_position)))
		{
			delete [] tmp_buffer;
			if (err_code == CS_TRUNCATION_ERROR)
				(*err) (isc_arith_except, 0);
			else
				(*err) (isc_arith_except, isc_arg_gds, isc_transliteration_failed, 0);
		}

		delete [] tmp_buffer;
		return (len2);
	}

	return (0);	/* to remove compiler errors. This should never be executed */
}


CsConvert INTL_convert_lookup(thread_db* tdbb, CHARSET_ID to_cs, CHARSET_ID from_cs)
{
/**************************************
 *
 *	I N T L _ c o n v e r t _ l o o k u p
 *
 **************************************
 *
 * Functional description
 *	Return the converter from (from_cs) to (to_cs), or NULL when
 *	the source character set cannot be found.  CS_dynamic on either
 *	side is replaced by the attachment's character set first.
 *
 **************************************/
	CharSetContainer *charset;

	SET_TDBB(tdbb);
	Database* dbb = tdbb->tdbb_database;
	CHECK_DBB(dbb);

	if (from_cs == CS_dynamic)
		from_cs = tdbb->tdbb_attachment->att_charset;

	if (to_cs == CS_dynamic)
		to_cs = tdbb->tdbb_attachment->att_charset;

	/* Should from_cs == to_cs? be handled better?
YYY */ fb_assert(from_cs != CS_dynamic); fb_assert(to_cs != CS_dynamic); charset = CharSetContainer::lookupCharset(tdbb, from_cs, NULL); if (charset == NULL) return NULL; return charset->lookupConverter(tdbb, to_cs); } int INTL_convert_string(dsc* to, const dsc* from, FPTR_ERROR err) { /************************************** * * I N T L _ c o n v e r t _ s t r i n g * ************************************** * * Functional description * Convert a string from one type to another * * RETURNS: * 0 if no error in conversion * non-zero otherwise. * CVC: Unfortunately, this function puts the source in the 2nd param, * as opposed to the CVT routines, so const help mitigating coding mistakes. * **************************************/ /* Note: This function is called from outside the engine as well as inside - we likely can't get rid of JRD_get_thread_data here */ thread_db* tdbb = JRD_get_thread_data(); if (tdbb == NULL) /* are we in the Engine? */ return (1); /* no, then can't access intl gah */ fb_assert(to != NULL); fb_assert(from != NULL); fb_assert(IS_TEXT(to) && IS_TEXT(from)); CHARSET_ID from_cs = INTL_charset(tdbb, INTL_TTYPE(from), err); CHARSET_ID to_cs = INTL_charset(tdbb, INTL_TTYPE(to), err); UCHAR* start = to->dsc_address; UCHAR* p = start; /* Must convert dtype(cstring,text,vary) and ttype(ascii,binary,..intl..) 
*/ UCHAR* from_ptr; USHORT from_type; const USHORT from_len = CVT_get_string_ptr(from, &from_type, &from_ptr, NULL, 0, err); ULONG to_size, to_len, to_fill; to_size = to_len = TEXT_LEN(to); ULONG from_fill; const UCHAR* q = from_ptr; CharSet* toCharSet = INTL_charset_lookup(tdbb, to_cs, NULL); fb_assert(toCharSet != NULL); ULONG toLength; switch (to->dsc_dtype) { case dtype_text: if ((from_cs != to_cs) && (to_cs != CS_BINARY) && (to_cs != CS_NONE) && (from_cs != CS_NONE)) { to_len = INTL_convert_bytes(tdbb, to_cs, to->dsc_address, to_size, from_cs, from_ptr, from_len, err); toLength = to_len; to_fill = to_size - to_len; from_fill = 0; /* Convert_bytes handles source truncation */ p += to_len; } else { /* binary string can always be converted TO by byte-copy */ to_len = MIN(from_len, to_size); if (!toCharSet->wellFormed(to_len, q)) (*err)(isc_malformed_string, 0); toLength = to_len; from_fill = from_len - to_len; to_fill = to_size - to_len; if (to_len) do *p++ = *q++; while (--to_len); } if (to_fill > 0) pad_spaces(tdbb, to_cs, p, to_fill); break; case dtype_cstring: if ((from_cs != to_cs) && (to_cs != CS_BINARY) && (to_cs != CS_NONE) && (from_cs != CS_NONE)) { to_len = INTL_convert_bytes(tdbb, to_cs, to->dsc_address, to_size, from_cs, from_ptr, from_len, err); toLength = to_len; to->dsc_address[to_len] = 0; from_fill = 0; /* Convert_bytes handles source truncation */ } else { /* binary string can always be converted TO by byte-copy */ to_len = MIN(from_len, to_size); if (!toCharSet->wellFormed(to_len, q)) (*err)(isc_malformed_string, 0); toLength = to_len; from_fill = from_len - to_len; if (to_len) do *p++ = *q++; while (--to_len); *p = 0; } break; case dtype_varying: if ((from_cs != to_cs) && (to_cs != CS_BINARY) && (to_cs != CS_NONE) && (from_cs != CS_NONE)) { to_len = INTL_convert_bytes(tdbb, to_cs, (start = reinterpret_cast(((vary*) to->dsc_address)->vary_string)), to_size, from_cs, from_ptr, from_len, err); toLength = to_len; ((vary*) 
to->dsc_address)->vary_length = to_len; from_fill = 0; /* Convert_bytes handles source truncation */ } else { /* binary string can always be converted TO by byte-copy */ to_len = MIN(from_len, to_size); if (!toCharSet->wellFormed(to_len, q)) (*err)(isc_malformed_string, 0); toLength = to_len; from_fill = from_len - to_len; ((vary*) p)->vary_length = to_len; start = p = reinterpret_cast(((vary*) p)->vary_string); if (to_len) do *p++ = *q++; while (--to_len); } break; } if (toCharSet->isMultiByte() && !(toCharSet->getFlags() & CHARSET_LEGACY_SEMANTICS) && toLength != 31 && /* allow non CHARSET_LEGACY_SEMANTICS to be used as connection charset */ toCharSet->length(tdbb, toLength, start, false) > to_size / toCharSet->maxBytesPerChar()) { (*err)(isc_arith_except, 0); } if (from_fill) /* Make sure remaining characters on From string are spaces */ if (!all_spaces(tdbb, from_cs, q, from_fill, 0)) (*err) (isc_arith_except, 0); return 0; } int INTL_data(const dsc* pText) { /************************************** * * I N T L _ d a t a * ************************************** * * Functional description * Given an input text descriptor, * return TRUE if the data pointed to represents * international text (subject to user defined or non-binary * collation or comparison). * **************************************/ fb_assert(pText != NULL); if (!IS_TEXT(pText)) return FALSE; if (!INTERNAL_TTYPE(pText)) return TRUE; return FALSE; } int INTL_data_or_binary(const dsc* pText) { /************************************** * * I N T L _ d a t a _ o r _ b i n a r y * ************************************** * * Functional description * **************************************/ return (INTL_data(pText) || (pText->dsc_ttype() == ttype_binary)); } bool INTL_defined_type(thread_db* tdbb, ISC_STATUS * status, SSHORT t_type) { /************************************** * * I N T L _ d e f i n e d _ t y p e * ************************************** * * Functional description * Is (t_type) a known text type? 
* Return: * false type is not defined. * true type is defined * status set to gds_status codes to describe any error. * * Note: * Due to cleanup that must happen in DFW, this routine * must return, and not call ERR directly. * **************************************/ SET_TDBB(tdbb); if (status) status[0] = isc_arg_end; TextType* obj = INTL_texttype_lookup(tdbb, t_type, NULL, status); if (obj == NULL) return false; return true; } void INTL_init(thread_db* tdbb) { /************************************** * * I N T L _ i n i t * ************************************** * * Functional description * **************************************/ } USHORT INTL_key_length(thread_db* tdbb, USHORT idxType, USHORT iLength) { /************************************** * * I N T L _ k e y _ l e n g t h * ************************************** * * Functional description * Given an index type, and a maximum length (iLength) * return the length of the byte string key descriptor to * use when collating text of this type. * **************************************/ SET_TDBB(tdbb); fb_assert(idxType >= idx_first_intl_string); const SSHORT ttype = INTL_INDEX_TO_TEXT(idxType); USHORT key_length; if (ttype >= 0 && ttype <= ttype_last_internal) key_length = iLength; else { TextType* obj = INTL_texttype_lookup(tdbb, ttype, ERR_post, NULL); key_length = obj->key_length(iLength); } /* Validity checks on the computed key_length */ if (key_length > MAX_KEY) key_length = MAX_KEY; if (key_length < iLength) key_length = iLength; return (key_length); } CharSet* INTL_charset_lookup(thread_db* tdbb, SSHORT parm1, ISC_STATUS* status) { /************************************** * * I N T L _ c h a r s e t _ l o o k u p * ************************************** * * Functional description * * Lookup a character set descriptor. * * First, search the appropriate vector that hangs * off the dbb. If not found, then call the lower * level lookup routine to allocate it, or return * null if we don't know about the charset. 
* * Returns: * *charset - if no errors; * - if error & err non NULL * NULL - if error & err NULL * **************************************/ CharSetContainer *cs = CharSetContainer::lookupCharset(tdbb, parm1, status); if (!cs) return NULL; return cs->getCharSet(); } TextType* INTL_texttype_lookup(thread_db* tdbb, SSHORT parm1, FPTR_ERROR err, ISC_STATUS* status) { /************************************** * * I N T L _ t e x t t y p e _ l o o k u p * ************************************** * * Functional description * * Lookup either a character set descriptor or * texttype descriptor object. * * First, search the appropriate vector that hangs * off the dbb. If not found, then call the lower * level lookup routine to find it in the libraries. * * Returns: * *object - if no errors; * - if error & err non NULL * NULL - if error & err NULL * **************************************/ SET_TDBB(tdbb); Database* dbb = tdbb->tdbb_database; if (parm1 == ttype_dynamic) parm1 = MAP_CHARSET_TO_TTYPE(tdbb->tdbb_attachment->att_charset); CharSetContainer* csc = CharSetContainer::lookupCharset(tdbb, parm1, status); if (!csc) return NULL; return csc->lookupCollation(tdbb, parm1); } bool INTL_texttype_validate(Jrd::thread_db* tdbb, const SubtypeInfo* info) { texttype tt; memset(&tt, 0, sizeof(tt)); bool ret = lookup_texttype(&tt, info); if (ret && tt.texttype_fn_destroy) tt.texttype_fn_destroy(&tt); return ret; } void INTL_pad_spaces(thread_db* tdbb, DSC * type, UCHAR * string, ULONG length) { /************************************** * * I N T L _ p a d _ s p a c e s * ************************************** * * Functional description * Pad a buffer with spaces, using the character * set's defined space character. 
* **************************************/ SET_TDBB(tdbb); fb_assert(type != NULL); fb_assert(IS_TEXT(type)); fb_assert(string != NULL); const USHORT charset = INTL_charset(tdbb, type->dsc_ttype(), NULL); pad_spaces(tdbb, charset, string, length); } USHORT INTL_string_to_key(thread_db* tdbb, USHORT idxType, const dsc* pString, DSC* pByte, USHORT key_type) { /************************************** * * I N T L _ s t r i n g _ t o _ k e y * ************************************** * * Functional description * Given an input string, convert it to a byte string * that will collate naturally (byte order). * * Return the length of the resulting byte string. * **************************************/ UCHAR pad_char; SSHORT ttype; SET_TDBB(tdbb); fb_assert(idxType >= idx_first_intl_string || idxType == idx_string || idxType == idx_byte_array || idxType == idx_metadata); fb_assert(pString != NULL); fb_assert(pByte != NULL); fb_assert(pString->dsc_address != NULL); fb_assert(pByte->dsc_address != NULL); fb_assert(pByte->dsc_dtype == dtype_text); switch (idxType) { case idx_string: pad_char = ' '; ttype = ttype_none; break; case idx_byte_array: pad_char = 0; ttype = ttype_binary; break; case idx_metadata: pad_char = ' '; ttype = ttype_metadata; break; default: pad_char = 0; ttype = INTL_INDEX_TO_TEXT(idxType); break; } /* Make a string into the proper type of text */ MoveBuffer temp; UCHAR* src; USHORT len = MOV_make_string2(pString, ttype, &src, temp); USHORT outlen; char* dest = reinterpret_cast(pByte->dsc_address); USHORT destLen = pByte->dsc_length; switch (ttype) { case ttype_metadata: case ttype_binary: case ttype_ascii: case ttype_none: while (len-- && destLen-- > 0) *dest++ = *src++; /* strip off ending pad characters */ while (dest > (const char*)pByte->dsc_address) { if (*(dest - 1) == pad_char) dest--; else break; } outlen = (dest - (const char*)pByte->dsc_address); break; default: TextType* obj = INTL_texttype_lookup(tdbb, ttype, ERR_post, NULL); outlen = 
obj->string_to_key(len, reinterpret_cast(src), pByte->dsc_length, reinterpret_cast(dest), key_type); break; } return (outlen); } int INTL_str_to_upper(thread_db* tdbb, DSC * pString) { /************************************** * * I N T L _ s t r _ t o _ u p p e r * ************************************** * * Functional description * Given an input string, convert it to uppercase * **************************************/ SET_TDBB(tdbb); fb_assert(pString != NULL); fb_assert(pString->dsc_address != NULL); UCHAR* src; UCHAR buffer[MAX_KEY]; USHORT ttype; USHORT len = CVT_get_string_ptr(pString, &ttype, &src, reinterpret_cast(buffer), sizeof(buffer), ERR_post); UCHAR* dest; switch (ttype) { case ttype_binary: /* cannot uppercase binary strings */ break; case ttype_none: case ttype_ascii: dest = src; while (len--) { *dest++ = UPPER7(*src); src++; } break; default: TextType* obj = INTL_texttype_lookup(tdbb, ttype, ERR_post, NULL); obj->str_to_upper(len, src, len, src); // ASF: this works for all cases? (src and dst buffers are the same) break; } /* * Added to remove compiler errors. Callers are not checking * the return code from this function 4/5/95. 
*/ return (0); } int INTL_str_to_lower(thread_db* tdbb, DSC * pString) { /************************************** * * I N T L _ s t r _ t o _ l o w e r * ************************************** * * Functional description * Given an input string, convert it to lowercase * **************************************/ SET_TDBB(tdbb); fb_assert(pString != NULL); fb_assert(pString->dsc_address != NULL); UCHAR* src; UCHAR buffer[MAX_KEY]; USHORT ttype; USHORT len = CVT_get_string_ptr(pString, &ttype, &src, reinterpret_cast(buffer), sizeof(buffer), ERR_post); UCHAR* dest; switch (ttype) { case ttype_binary: /* cannot lowercase binary strings */ break; case ttype_none: case ttype_ascii: dest = src; while (len--) { *dest++ = LOWWER7(*src); src++; } break; default: TextType* obj = INTL_texttype_lookup(tdbb, ttype, ERR_post, NULL); obj->str_to_lower(len, src, len, src); // ASF: this works for all cases? (src and dst buffers are the same) break; } /* * Added to remove compiler errors. Callers are not checking * the return code from this function 4/5/95. */ return (0); } static bool all_spaces( thread_db* tdbb, CHARSET_ID charset, const BYTE* ptr, ULONG len, ULONG offset) { /************************************** * * a l l _ s p a c e s * ************************************** * * Functional description * determine if the string at ptr[offset] ... ptr[len] is entirely * spaces, as per the space definition of (charset). * The binary representation of a Space is character-set dependent. * (0x20 for Ascii, 0x0020 for Unicode, 0x20 for SJIS, but must watch for * 0x??20, which is NOT a space. **************************************/ SET_TDBB(tdbb); fb_assert(ptr != NULL); CharSet* obj = INTL_charset_lookup(tdbb, charset, NULL); fb_assert(obj != NULL); /* * We are assuming offset points to the first byte which was not * consumed in a conversion. 
And that offset is pointing * to a character boundary */ // Single-octet character sets are optimized here if (obj->getSpaceLength() == 1) { const BYTE* p = &ptr[offset]; const BYTE* const end = &ptr[len]; while (p < end) { if (*p++ != *obj->getSpace()) return false; } return true; } else { const BYTE* p = &ptr[offset]; const BYTE* const end = &ptr[len]; const unsigned char* space = obj->getSpace(); const unsigned char* const end_space = &space[obj->getSpaceLength()]; while (p < end) { space = obj->getSpace(); while (p < end && space < end_space) { if (*p++ != *space++) return false; } } return true; } } static void pad_spaces(thread_db* tdbb, CHARSET_ID charset, BYTE* ptr, ULONG len) { /* byte count */ /************************************** * * p a d _ s p a c e s * ************************************** * * Functional description * Pad a buffer with the character set defined space character. * **************************************/ SET_TDBB(tdbb); fb_assert(ptr != NULL); CharSet* obj = INTL_charset_lookup(tdbb, charset, NULL); fb_assert(obj != NULL); /* Single-octet character sets are optimized here */ if (obj->getSpaceLength() == 1) { const BYTE* const end = &ptr[len]; while (ptr < end) *ptr++ = *obj->getSpace(); } else { const BYTE* const end = &ptr[len]; const UCHAR* space = obj->getSpace(); const UCHAR* const end_space = &space[obj->getSpaceLength()]; while (ptr < end) { space = obj->getSpace(); while (ptr < end && space < end_space) { *ptr++ = *space++; } /* This fb_assert is checking that we didn't have a buffer-end * in the middle of a space character */ fb_assert(!(ptr == end) || (space == end_space)); } } }