8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-27 17:23:03 +01:00
firebird-mirror/src/jrd/unicode_util.cpp
alexpeshkoff 4ea635c01c Style
2009-07-21 07:48:21 +00:00

1310 lines
31 KiB
C++

/*
* PROGRAM: JRD International support
* MODULE: unicode_util.h
* DESCRIPTION: Unicode functions
*
* The contents of this file are subject to the Initial
* Developer's Public License Version 1.0 (the "License");
* you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
* http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl.
*
* Software distributed under the License is distributed AS IS,
* WITHOUT WARRANTY OF ANY KIND, either express or implied.
* See the License for the specific language governing rights
* and limitations under the License.
*
* The Original Code was created by Adriano dos Santos Fernandes
* for the Firebird Open Source RDBMS project.
*
* Copyright (c) 2004 Adriano dos Santos Fernandes <adrianosf@uol.com.br>
* and all contributors signed below.
*
* All Rights Reserved.
* Contributor(s): ______________________________________.
*/
#include "firebird.h"
#include "../common/classes/alloc.h"
#include "../jrd/constants.h"
#include "../jrd/unicode_util.h"
#include "../jrd/CharSet.h"
#include "../jrd/IntlUtil.h"
#include "../jrd/gdsassert.h"
#include "../common/classes/auto.h"
#include "../common/classes/GenericMap.h"
#include "../common/classes/init.h"
#include "../common/classes/objects_array.h"
#include "../common/classes/rwlock.h"
#include "unicode/ustring.h"
#include "unicode/utrans.h"
#include "unicode/uchar.h"
#include "unicode/ucnv.h"
#include "unicode/ucol.h"
using namespace Firebird;
namespace Jrd {
const char* const UnicodeUtil::DEFAULT_ICU_VERSION =
STRINGIZE(U_ICU_VERSION_MAJOR_NUM)"."STRINGIZE(U_ICU_VERSION_MINOR_NUM);
// encapsulate ICU collations libraries
struct UnicodeUtil::ICU
{
private:
ICU(const ICU&); // not implemented
ICU& operator =(const ICU&); // not implemented
public:
ICU()
: inModule(NULL), ucModule(NULL)
{
}
~ICU()
{
delete ucModule;
delete inModule;
}
ModuleLoader::Module* inModule;
ModuleLoader::Module* ucModule;
UVersionInfo collVersion;
void (U_EXPORT2 *uVersionToString)(UVersionInfo versionArray, char *versionString);
int32_t (U_EXPORT2 *ulocCountAvailable)();
const char* (U_EXPORT2 *ulocGetAvailable)(int32_t n);
void (U_EXPORT2 *usetClose)(USet* set);
int32_t (U_EXPORT2 *usetGetItem)(const USet* set, int32_t itemIndex,
UChar32* start, UChar32* end, UChar* str, int32_t strCapacity, UErrorCode* ec);
int32_t (U_EXPORT2 *usetGetItemCount)(const USet* set);
USet* (U_EXPORT2 *usetOpen)(UChar32 start, UChar32 end);
void (U_EXPORT2 *ucolClose)(UCollator* coll);
int32_t (U_EXPORT2 *ucolGetContractions)(const UCollator* coll, USet* conts, UErrorCode* status);
int32_t (U_EXPORT2 *ucolGetSortKey)(const UCollator* coll, const UChar* source,
int32_t sourceLength, uint8_t* result, int32_t resultLength);
UCollator* (U_EXPORT2 *ucolOpen)(const char* loc, UErrorCode* status);
void (U_EXPORT2 *ucolSetAttribute)(UCollator* coll, UColAttribute attr,
UColAttributeValue value, UErrorCode* status);
UCollationResult (U_EXPORT2 *ucolStrColl)(const UCollator* coll, const UChar* source,
int32_t sourceLength, const UChar* target, int32_t targetLength);
void (U_EXPORT2 *ucolGetVersion)(const UCollator* coll, UVersionInfo info);
void (U_EXPORT2 *utransClose)(UTransliterator* trans);
UTransliterator* (U_EXPORT2 *utransOpen)(
const char* id,
UTransDirection dir,
const UChar* rules, /* may be Null */
int32_t rulesLength, /* -1 if null-terminated */
UParseError* parseError, /* may be Null */
UErrorCode* status);
void (U_EXPORT2 *utransTransUChars)(
const UTransliterator* trans,
UChar* text,
int32_t* textLength,
int32_t textCapacity,
int32_t start,
int32_t* limit,
UErrorCode* status);
};
// cache ICU module instances to not load and unload many times
class UnicodeUtil::ICUModules
{
typedef GenericMap<Pair<Left<string, ICU*> > > ModulesMap;
public:
explicit ICUModules(MemoryPool&)
{
}
~ICUModules()
{
ModulesMap::Accessor modulesAccessor(&modules());
for (bool found = modulesAccessor.getFirst(); found; found = modulesAccessor.getNext())
delete modulesAccessor.current()->second;
}
InitInstance<ModulesMap> modules;
RWLock lock;
};
namespace {
GlobalPtr<UnicodeUtil::ICUModules> icuModules;
}
static const char* const COLL_30_VERSION = "41.128.4.4"; // ICU 3.0 collator version
static void getVersions(const string& configInfo, ObjectsArray<string>& versions)
{
charset cs;
IntlUtil::initAsciiCharset(&cs);
AutoPtr<CharSet> ascii(Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, &cs));
IntlUtil::SpecificAttributesMap config;
IntlUtil::parseSpecificAttributes(ascii, configInfo.length(),
(const UCHAR*) configInfo.c_str(), &config);
string versionsStr;
if (config.get("icu_versions", versionsStr))
versionsStr.trim();
else
versionsStr = "default";
versions.clear();
size_t start = 0;
size_t n;
for (size_t i = versionsStr.find(' '); i != versionsStr.npos;
start = i + 1, i = versionsStr.find(' ', start))
{
if ((n = versionsStr.find_first_not_of(' ', start)) != versionsStr.npos)
start = n;
versions.add(versionsStr.substr(start, i - start));
}
if ((n = versionsStr.find_first_not_of(' ', start)) != versionsStr.npos)
start = n;
versions.add(versionsStr.substr(start));
}
// BOCU-1
USHORT UnicodeUtil::utf16KeyLength(USHORT len)
{
return (len / 2) * 4;
}
// BOCU-1
USHORT UnicodeUtil::utf16ToKey(USHORT srcLen, const USHORT* src, USHORT dstLen, UCHAR* dst)
{
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL && dst != NULL);
if (dstLen < srcLen / sizeof(*src) * 4)
return INTL_BAD_KEY_LENGTH;
UErrorCode status = U_ZERO_ERROR;
UConverter* conv = ucnv_open("BOCU-1", &status);
fb_assert(U_SUCCESS(status));
const int32_t len = ucnv_fromUChars(conv, reinterpret_cast<char*>(dst), dstLen,
// safe cast - alignment not changed
reinterpret_cast<const UChar*>(src), srcLen / sizeof(*src), &status);
fb_assert(U_SUCCESS(status));
ucnv_close(conv);
return len;
}
ULONG UnicodeUtil::utf16LowerCase(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
const ULONG* exceptions)
{
// this is more correct but we don't support completely yet
/***
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL && dst != NULL);
memcpy(dst, src, srcLen);
UErrorCode errorCode = U_ZERO_ERROR;
UTransliterator* trans = utrans_open("Any-Lower", UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
//// TODO: add exceptions in this way: Any-Lower[^\\u03BC] - for U+03BC
if (errorCode <= 0)
{
int32_t capacity = dstLen;
int32_t len = srcLen / sizeof(USHORT);
int32_t limit = len;
utrans_transUChars(trans, reinterpret_cast<UChar*>(dst), &len, capacity, 0, &limit, &errorCode);
utrans_close(trans);
len *= sizeof(USHORT);
if (len > dstLen)
len = INTL_BAD_STR_LENGTH;
return len;
}
else
return INTL_BAD_STR_LENGTH;
***/
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL && dst != NULL);
srcLen /= sizeof(*src);
dstLen /= sizeof(*dst);
ULONG n = 0;
for (ULONG i = 0; i < srcLen;)
{
uint32_t c;
U16_NEXT(src, i, srcLen, c);
if (!exceptions)
c = u_tolower(c);
else
{
const ULONG* p = exceptions;
while (*p && *p != c)
++p;
if (*p == 0)
c = u_tolower(c);
}
bool error;
U16_APPEND(dst, n, dstLen, c, error);
}
return n * sizeof(*dst);
}
ULONG UnicodeUtil::utf16UpperCase(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
const ULONG* exceptions)
{
// this is more correct but we don't support completely yet
/***
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL && dst != NULL);
memcpy(dst, src, srcLen);
UErrorCode errorCode = U_ZERO_ERROR;
UTransliterator* trans = utrans_open("Any-Upper", UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
//// TODO: add exceptions in this way: Any-Upper[^\\u03BC] - for U+03BC
if (errorCode <= 0)
{
int32_t capacity = dstLen;
int32_t len = srcLen / sizeof(USHORT);
int32_t limit = len;
utrans_transUChars(trans, reinterpret_cast<UChar*>(dst), &len, capacity, 0, &limit, &errorCode);
utrans_close(trans);
len *= sizeof(USHORT);
if (len > dstLen)
len = INTL_BAD_STR_LENGTH;
return len;
}
else
return INTL_BAD_STR_LENGTH;
***/
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL && dst != NULL);
srcLen /= sizeof(*src);
dstLen /= sizeof(*dst);
ULONG n = 0;
for (ULONG i = 0; i < srcLen;)
{
uint32_t c;
U16_NEXT(src, i, srcLen, c);
if (!exceptions)
c = u_toupper(c);
else
{
const ULONG* p = exceptions;
while (*p && *p != c)
++p;
if (*p == 0)
c = u_toupper(c);
}
bool error;
U16_APPEND(dst, n, dstLen, c, error);
}
return n * sizeof(*dst);
}
ULONG UnicodeUtil::utf16ToUtf8(ULONG srcLen, const USHORT* src, ULONG dstLen, UCHAR* dst,
USHORT* err_code, ULONG* err_position)
{
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL || dst == NULL);
fb_assert(err_code != NULL);
fb_assert(err_position != NULL);
*err_code = 0;
if (dst == NULL)
return srcLen / sizeof(*src) * 4;
srcLen /= sizeof(*src);
const UCHAR* const dstStart = dst;
const UCHAR* const dstEnd = dst + dstLen;
for (ULONG i = 0; i < srcLen; )
{
if (dstEnd - dst == 0)
{
*err_code = CS_TRUNCATION_ERROR;
*err_position = i * sizeof(*src);
break;
}
UChar32 c = src[i++];
if (c <= 0x7F)
*dst++ = c;
else
{
*err_position = (i - 1) * sizeof(*src);
if (UTF_IS_SURROGATE(c))
{
UChar32 c2;
if (UTF_IS_SURROGATE_FIRST(c) && i < srcLen && UTF_IS_TRAIL(c2 = src[i]))
{
++i;
c = UTF16_GET_PAIR_VALUE(c, c2);
}
else
{
*err_code = CS_BAD_INPUT;
break;
}
}
if (U8_LENGTH(c) <= dstEnd - dst)
{
int j = 0;
U8_APPEND_UNSAFE(dst, j, c);
dst += j;
}
else
{
*err_code = CS_TRUNCATION_ERROR;
break;
}
}
}
return (dst - dstStart) * sizeof(*dst);
}
ULONG UnicodeUtil::utf8ToUtf16(ULONG srcLen, const UCHAR* src, ULONG dstLen, USHORT* dst,
USHORT* err_code, ULONG* err_position)
{
fb_assert(src != NULL || dst == NULL);
fb_assert(err_code != NULL);
fb_assert(err_position != NULL);
*err_code = 0;
if (dst == NULL)
return srcLen * sizeof(*dst);
const USHORT* const dstStart = dst;
const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);
for (ULONG i = 0; i < srcLen; )
{
if (dstEnd - dst == 0)
{
*err_code = CS_TRUNCATION_ERROR;
*err_position = i;
break;
}
UChar32 c = src[i++];
if (c <= 0x7F)
*dst++ = c;
else
{
*err_position = i - 1;
c = utf8_nextCharSafeBody(src, reinterpret_cast<int32_t*>(&i), srcLen, c, -1);
if (c < 0)
{
*err_code = CS_BAD_INPUT;
break;
}
else if (c <= 0xFFFF)
*dst++ = c;
else
{
if (dstEnd - dst > 1)
{
*dst++ = UTF16_LEAD(c);
*dst++ = UTF16_TRAIL(c);
}
else
{
*err_code = CS_TRUNCATION_ERROR;
break;
}
}
}
}
return (dst - dstStart) * sizeof(*dst);
}
ULONG UnicodeUtil::utf16ToUtf32(ULONG srcLen, const USHORT* src, ULONG dstLen, ULONG* dst,
USHORT* err_code, ULONG* err_position)
{
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL || dst == NULL);
fb_assert(err_code != NULL);
fb_assert(err_position != NULL);
*err_code = 0;
if (dst == NULL)
return srcLen / sizeof(*src) * sizeof(*dst);
// based on u_strToUTF32 from ICU
const USHORT* const srcStart = src;
const ULONG* const dstStart = dst;
const USHORT* const srcEnd = src + srcLen / sizeof(*src);
const ULONG* const dstEnd = dst + dstLen / sizeof(*dst);
while (src < srcEnd && dst < dstEnd)
{
ULONG ch = *src++;
if (UTF_IS_LEAD(ch))
{
ULONG ch2;
if (src < srcEnd && UTF_IS_TRAIL(ch2 = *src))
{
ch = UTF16_GET_PAIR_VALUE(ch, ch2);
++src;
}
else
{
*err_code = CS_BAD_INPUT;
--src;
break;
}
}
*(dst++) = ch;
}
*err_position = (src - srcStart) * sizeof(*src);
if (*err_code == 0 && src < srcEnd)
*err_code = CS_TRUNCATION_ERROR;
return (dst - dstStart) * sizeof(*dst);
}
ULONG UnicodeUtil::utf32ToUtf16(ULONG srcLen, const ULONG* src, ULONG dstLen, USHORT* dst,
USHORT* err_code, ULONG* err_position)
{
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL || dst == NULL);
fb_assert(err_code != NULL);
fb_assert(err_position != NULL);
*err_code = 0;
if (dst == NULL)
return srcLen;
// based on u_strFromUTF32 from ICU
const ULONG* const srcStart = src;
const USHORT* const dstStart = dst;
const ULONG* const srcEnd = src + srcLen / sizeof(*src);
const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);
while (src < srcEnd && dst < dstEnd)
{
const ULONG ch = *src++;
if (ch <= 0xFFFF)
*(dst++) = ch;
else if (ch <= 0x10FFFF)
{
*(dst++) = UTF16_LEAD(ch);
if (dst < dstEnd)
*(dst++) = UTF16_TRAIL(ch);
else
{
*err_code = CS_TRUNCATION_ERROR;
--dst;
break;
}
}
else
{
*err_code = CS_BAD_INPUT;
--src;
break;
}
}
*err_position = (src - srcStart) * sizeof(*src);
if (*err_code == 0 && src < srcEnd)
*err_code = CS_TRUNCATION_ERROR;
return (dst - dstStart) * sizeof(*dst);
}
SSHORT UnicodeUtil::utf16Compare(ULONG len1, const USHORT* str1, ULONG len2, const USHORT* str2,
INTL_BOOL* error_flag)
{
fb_assert(len1 % sizeof(*str1) == 0);
fb_assert(len2 % sizeof(*str2) == 0);
fb_assert(str1 != NULL);
fb_assert(str2 != NULL);
fb_assert(error_flag != NULL);
*error_flag = false;
// safe casts - alignment not changed
int32_t cmp = u_strCompare(reinterpret_cast<const UChar*>(str1), len1 / sizeof(*str1),
reinterpret_cast<const UChar*>(str2), len2 / sizeof(*str2), true);
return (cmp < 0 ? -1 : (cmp > 0 ? 1 : 0));
}
ULONG UnicodeUtil::utf16Length(ULONG len, const USHORT* str)
{
fb_assert(len % sizeof(*str) == 0);
// safe cast - alignment not changed
return u_countChar32(reinterpret_cast<const UChar*>(str), len / sizeof(*str));
}
ULONG UnicodeUtil::utf16Substring(ULONG srcLen, const USHORT* src, ULONG dstLen, USHORT* dst,
ULONG startPos, ULONG length)
{
fb_assert(srcLen % sizeof(*src) == 0);
fb_assert(src != NULL && dst != NULL);
if (length == 0)
return 0;
const USHORT* const dstStart = dst;
const USHORT* const srcEnd = src + srcLen / sizeof(*src);
const USHORT* const dstEnd = dst + dstLen / sizeof(*dst);
ULONG pos = 0;
while (src < srcEnd && dst < dstEnd && pos < startPos)
{
const ULONG ch = *src++;
if (UTF_IS_LEAD(ch))
{
if (src < srcEnd && UTF_IS_TRAIL(*src))
++src;
}
++pos;
}
while (src < srcEnd && dst < dstEnd && pos < startPos + length)
{
const ULONG ch = *src++;
*(dst++) = ch;
if (UTF_IS_LEAD(ch))
{
ULONG ch2;
if (src < srcEnd && UTF_IS_TRAIL(ch2 = *src))
{
*(dst++) = ch2;
++src;
}
}
++pos;
}
return (dst - dstStart) * sizeof(*dst);
}
INTL_BOOL UnicodeUtil::utf8WellFormed(ULONG len, const UCHAR* str, ULONG* offending_position)
{
fb_assert(str != NULL);
for (ULONG i = 0; i < len; )
{
UChar32 c = str[i++];
if (c > 0x7F)
{
const ULONG save_i = i - 1;
c = utf8_nextCharSafeBody(str, reinterpret_cast<int32_t*>(&i), len, c, -1);
if (c < 0)
{
if (offending_position)
*offending_position = save_i;
return false; // malformed
}
}
}
return true; // well-formed
}
INTL_BOOL UnicodeUtil::utf16WellFormed(ULONG len, const USHORT* str, ULONG* offending_position)
{
fb_assert(str != NULL);
fb_assert(len % sizeof(*str) == 0);
len /= sizeof(*str);
for (ULONG i = 0; i < len;)
{
const ULONG save_i = i;
uint32_t c;
U16_NEXT(str, i, len, c);
if (!U_IS_SUPPLEMENTARY(c) && (U16_IS_LEAD(c) || U16_IS_TRAIL(c)))
{
if (offending_position)
*offending_position = save_i * sizeof(*str);
return false; // malformed
}
}
return true; // well-formed
}
INTL_BOOL UnicodeUtil::utf32WellFormed(ULONG len, const ULONG* str, ULONG* offending_position)
{
fb_assert(str != NULL);
fb_assert(len % sizeof(*str) == 0);
const ULONG* strStart = str;
while (len)
{
if (!U_IS_UNICODE_CHAR(*str))
{
if (offending_position)
*offending_position = (str - strStart) * sizeof(*str);
return false; // malformed
}
++str;
len -= sizeof(*str);
}
return true; // well-formed
}
UnicodeUtil::ICU* UnicodeUtil::loadICU(const Firebird::string& icuVersion,
const Firebird::string& configInfo)
{
#if defined(WIN_NT)
const char* const inTemplate = "icuin%s%s.dll";
const char* const ucTemplate = "icuuc%s%s.dll";
#elif defined(DARWIN)
const char* const inTemplate = "/Library/Frameworks/Firebird.framework/Versions/A/Libraries/libicui18n.dylib";
const char* const ucTemplate = "/Library/Frameworks/Firebird.framework/versions/A/Libraries/libicuuc.dylib";
#elif defined(HPUX)
const char* const inTemplate = "libicui18n.sl.%s%s";
const char* const ucTemplate = "libicuuc.sl.%s%s";
#else
const char* const inTemplate = "libicui18n.so.%s%s";
const char* const ucTemplate = "libicuuc.so.%s%s";
#endif
ObjectsArray<string> versions;
getVersions(configInfo, versions);
string version = icuVersion.isEmpty() ? versions[0] : icuVersion;
if (version == "default")
{
version.printf("%d.%d", U_ICU_VERSION_MAJOR_NUM, U_ICU_VERSION_MINOR_NUM);
}
for (ObjectsArray<string>::const_iterator i(versions.begin()); i != versions.end(); ++i)
{
string majorVersion;
string minorVersion;
if (*i == "default")
{
majorVersion = STRINGIZE(U_ICU_VERSION_MAJOR_NUM);
minorVersion = STRINGIZE(U_ICU_VERSION_MINOR_NUM);
}
else
{
const size_t pos = i->find('.');
if (pos == i->npos)
continue;
majorVersion = i->substr(0, pos);
minorVersion = i->substr(pos + 1);
}
if (version != majorVersion + "." + minorVersion)
{
continue;
}
ReadLockGuard readGuard(icuModules->lock);
ICU* icu;
if (icuModules->modules().get(version, icu))
{
return icu;
}
PathName filename;
filename.printf(ucTemplate, majorVersion.c_str(), minorVersion.c_str());
icu = FB_NEW(*getDefaultMemoryPool()) ICU();
icu->ucModule = ModuleLoader::loadModule(filename);
if (!icu->ucModule)
{
ModuleLoader::doctorModuleExtention(filename);
icu->ucModule = ModuleLoader::loadModule(filename);
}
if (!icu->ucModule)
{
delete icu;
continue;
}
filename.printf(inTemplate, majorVersion.c_str(), minorVersion.c_str());
icu->inModule = ModuleLoader::loadModule(filename);
if (!icu->inModule)
{
ModuleLoader::doctorModuleExtention(filename);
icu->inModule = ModuleLoader::loadModule(filename);
}
if (!icu->inModule)
{
delete icu;
continue;
}
string symbol;
symbol.printf("u_versionToString_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->uVersionToString);
symbol.printf("uloc_countAvailable_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->ulocCountAvailable);
symbol.printf("uloc_getAvailable_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->ulocGetAvailable);
symbol.printf("uset_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->usetClose);
symbol.printf("uset_getItem_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->usetGetItem);
symbol.printf("uset_getItemCount_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->usetGetItemCount);
symbol.printf("uset_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->ucModule->findSymbol(symbol, icu->usetOpen);
symbol.printf("ucol_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolClose);
symbol.printf("ucol_getContractions_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolGetContractions);
symbol.printf("ucol_getSortKey_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolGetSortKey);
symbol.printf("ucol_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolOpen);
symbol.printf("ucol_setAttribute_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolSetAttribute);
symbol.printf("ucol_strcoll_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolStrColl);
symbol.printf("ucol_getVersion_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->ucolGetVersion);
symbol.printf("utrans_open_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->utransOpen);
symbol.printf("utrans_close_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->utransClose);
symbol.printf("utrans_transUChars_%s_%s", majorVersion.c_str(), minorVersion.c_str());
icu->inModule->findSymbol(symbol, icu->utransTransUChars);
if (!icu->uVersionToString || !icu->ulocCountAvailable || !icu->ulocGetAvailable ||
!icu->usetClose || !icu->usetGetItem || !icu->usetGetItemCount || !icu->usetOpen ||
!icu->ucolClose || !icu->ucolGetContractions || !icu->ucolGetSortKey || !icu->ucolOpen ||
!icu->ucolSetAttribute || !icu->ucolStrColl || !icu->ucolGetVersion)
{
delete icu;
continue;
}
UErrorCode status = U_ZERO_ERROR;
UCollator* collator = icu->ucolOpen("", &status);
if (!collator)
{
delete icu;
continue;
}
icu->ucolGetVersion(collator, icu->collVersion);
icu->ucolClose(collator);
// RWLock don't allow lock upgrade (read->write) so we
// release read and acquire a write lock.
readGuard.release();
WriteLockGuard writeGuard(icuModules->lock);
// In this small amount of time, one may already loaded the
// same version, so within the write lock we verify again.
ICU* icu2;
if (icuModules->modules().get(version, icu2))
{
delete icu;
return icu2;
}
icuModules->modules().put(version, icu);
return icu;
}
return NULL;
}
bool UnicodeUtil::getCollVersion(const Firebird::string& icuVersion,
const Firebird::string& configInfo, Firebird::string& collVersion)
{
ICU* icu = loadICU(icuVersion, configInfo);
if (!icu)
return false;
char version[U_MAX_VERSION_STRING_LENGTH];
icu->uVersionToString(icu->collVersion, version);
if (string(COLL_30_VERSION) == version)
collVersion = "";
else
collVersion = version;
return true;
}
UnicodeUtil::Utf16Collation* UnicodeUtil::Utf16Collation::create(
texttype* tt, USHORT attributes,
Firebird::IntlUtil::SpecificAttributesMap& specificAttributes, const Firebird::string& configInfo)
{
int attributeCount = 0;
bool error;
string locale;
if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("LOCALE"), locale))
++attributeCount;
string collVersion;
if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("COLL-VERSION"), collVersion))
{
++attributeCount;
collVersion = IntlUtil::convertUtf16ToAscii(collVersion, &error);
if (error)
return NULL;
}
string numericSort;
if (specificAttributes.get(IntlUtil::convertAsciiToUtf16("NUMERIC-SORT"), numericSort))
{
++attributeCount;
numericSort = IntlUtil::convertUtf16ToAscii(numericSort, &error);
if (error || !(numericSort == "0" || numericSort == "1"))
return NULL;
}
locale = IntlUtil::convertUtf16ToAscii(locale, &error);
if (error)
return NULL;
if ((attributes & ~(TEXTTYPE_ATTR_PAD_SPACE | TEXTTYPE_ATTR_CASE_INSENSITIVE |
TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ||
((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
TEXTTYPE_ATTR_ACCENT_INSENSITIVE) ||
(specificAttributes.count() - attributeCount) != 0)
{
return NULL;
}
if (collVersion.isEmpty())
collVersion = COLL_30_VERSION;
tt->texttype_pad_option = (attributes & TEXTTYPE_ATTR_PAD_SPACE) ? true : false;
ICU* icu = loadICU(collVersion, locale, configInfo);
if (!icu)
return NULL;
UErrorCode status = U_ZERO_ERROR;
UCollator* compareCollator = icu->ucolOpen(locale.c_str(), &status);
if (!compareCollator)
return NULL;
UCollator* partialCollator = icu->ucolOpen(locale.c_str(), &status);
if (!partialCollator)
{
icu->ucolClose(compareCollator);
return NULL;
}
UCollator* sortCollator = icu->ucolOpen(locale.c_str(), &status);
if (!sortCollator)
{
icu->ucolClose(compareCollator);
icu->ucolClose(partialCollator);
return NULL;
}
icu->ucolSetAttribute(partialCollator, UCOL_STRENGTH, UCOL_PRIMARY, &status);
if ((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
(TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE))
{
icu->ucolSetAttribute(compareCollator, UCOL_STRENGTH, UCOL_PRIMARY, &status);
tt->texttype_flags |= TEXTTYPE_SEPARATE_UNIQUE;
tt->texttype_canonical_width = 4; // UTF-32
}
else if (attributes & TEXTTYPE_ATTR_CASE_INSENSITIVE)
{
icu->ucolSetAttribute(compareCollator, UCOL_STRENGTH, UCOL_SECONDARY, &status);
tt->texttype_flags |= TEXTTYPE_SEPARATE_UNIQUE;
tt->texttype_canonical_width = 4; // UTF-32
}
else
tt->texttype_flags = TEXTTYPE_DIRECT_MATCH;
const bool isNumericSort = numericSort == "1";
if (isNumericSort)
{
icu->ucolSetAttribute(compareCollator, UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
icu->ucolSetAttribute(partialCollator, UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
icu->ucolSetAttribute(sortCollator, UCOL_NUMERIC_COLLATION, UCOL_ON, &status);
}
USet* contractions = icu->usetOpen(0, 0);
// status not verified here.
icu->ucolGetContractions(partialCollator, contractions, &status);
Utf16Collation* obj = new Utf16Collation();
obj->icu = icu;
obj->tt = tt;
obj->attributes = attributes;
obj->compareCollator = compareCollator;
obj->partialCollator = partialCollator;
obj->sortCollator = sortCollator;
obj->contractions = contractions;
obj->contractionsCount = icu->usetGetItemCount(contractions);
obj->numericSort = isNumericSort;
return obj;
}
UnicodeUtil::Utf16Collation::~Utf16Collation()
{
icu->usetClose(contractions);
icu->ucolClose(compareCollator);
icu->ucolClose(partialCollator);
icu->ucolClose(sortCollator);
// ASF: we should not "delete icu"
}
USHORT UnicodeUtil::Utf16Collation::keyLength(USHORT len) const
{
return (len / 4) * 6;
}
USHORT UnicodeUtil::Utf16Collation::stringToKey(USHORT srcLen, const USHORT* src,
USHORT dstLen, UCHAR* dst,
USHORT key_type) const
{
fb_assert(src != NULL && dst != NULL);
fb_assert(srcLen % sizeof(*src) == 0);
if (dstLen < keyLength(srcLen))
{
fb_assert(false);
return INTL_BAD_KEY_LENGTH;
}
srcLen /= sizeof(*src);
if (tt->texttype_pad_option)
{
const USHORT* pad;
for (pad = src + srcLen - 1; pad >= src; --pad)
{
if (*pad != 32)
break;
}
srcLen = pad - src + 1;
}
const UCollator* coll = NULL;
switch (key_type)
{
case INTL_KEY_PARTIAL:
{
coll = partialCollator;
// Remove last bytes of key if they are start of a contraction
// to correctly find in the index.
for (int i = 0; i < contractionsCount; ++i)
{
UChar str[10];
UErrorCode status = U_ZERO_ERROR;
int len = icu->usetGetItem(contractions, i, NULL, NULL, str, sizeof(str), &status);
if (len > srcLen)
len = srcLen;
else
--len;
// safe cast - alignment not changed
if (u_strCompare(str, len, reinterpret_cast<const UChar*>(src) + srcLen - len, len, true) == 0)
{
srcLen -= len;
break;
}
}
if (numericSort)
{
// ASF: Wee need to remove trailing numbers to return sub key that
// matches full key. Example: "abc1" becomes "abc" to match "abc10".
const USHORT* p = src + srcLen - 1;
for (; p >= src; --p)
{
if (!(*p >= '0' && *p <= '9'))
break;
}
srcLen = p - src + 1;
}
break;
}
case INTL_KEY_UNIQUE:
coll = compareCollator;
break;
case INTL_KEY_SORT:
coll = sortCollator;
break;
default:
fb_assert(false);
return INTL_BAD_KEY_LENGTH;
}
if (srcLen == 0)
return 0;
return icu->ucolGetSortKey(coll,
reinterpret_cast<const UChar*>(src), srcLen, dst, dstLen);
}
SSHORT UnicodeUtil::Utf16Collation::compare(ULONG len1, const USHORT* str1,
ULONG len2, const USHORT* str2,
INTL_BOOL* error_flag) const
{
fb_assert(len1 % sizeof(*str1) == 0 && len2 % sizeof(*str2) == 0);
fb_assert(str1 != NULL && str2 != NULL);
fb_assert(error_flag != NULL);
*error_flag = false;
len1 /= sizeof(*str1);
len2 /= sizeof(*str2);
if (tt->texttype_pad_option)
{
const USHORT* pad;
for (pad = str1 + len1 - 1; pad >= str1; --pad)
{
if (*pad != 32)
break;
}
len1 = pad - str1 + 1;
for (pad = str2 + len2 - 1; pad >= str2; --pad)
{
if (*pad != 32)
break;
}
len2 = pad - str2 + 1;
}
return (SSHORT)icu->ucolStrColl(compareCollator,
// safe casts - alignment not changed
reinterpret_cast<const UChar*>(str1), len1,
reinterpret_cast<const UChar*>(str2), len2);
}
ULONG UnicodeUtil::Utf16Collation::canonical(ULONG srcLen, const USHORT* src, ULONG dstLen, ULONG* dst,
const ULONG* exceptions)
{
HalfStaticArray<USHORT, BUFFER_SMALL / 2> upperStr;
if ((attributes & (TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE)) ==
(TEXTTYPE_ATTR_CASE_INSENSITIVE | TEXTTYPE_ATTR_ACCENT_INSENSITIVE))
{
fb_assert(srcLen % sizeof(*src) == 0);
memcpy(upperStr.getBuffer(srcLen / sizeof(USHORT)), src, srcLen);
UErrorCode errorCode = U_ZERO_ERROR;
UTransliterator* trans = icu->utransOpen("Any-Upper; NFD; [:Nonspacing Mark:] Remove; NFC",
UTRANS_FORWARD, NULL, 0, NULL, &errorCode);
if (errorCode <= 0)
{
const int32_t capacity = dstLen;
int32_t len = srcLen / sizeof(USHORT);
int32_t limit = len;
icu->utransTransUChars(trans, reinterpret_cast<UChar*>(upperStr.begin()),
&len, capacity, 0, &limit, &errorCode);
icu->utransClose(trans);
len *= sizeof(USHORT);
if (ULONG(len) > dstLen)
len = INTL_BAD_STR_LENGTH;
srcLen = len;
src = upperStr.begin();
}
else
return INTL_BAD_STR_LENGTH;
}
else if (attributes & TEXTTYPE_ATTR_CASE_INSENSITIVE)
{
srcLen = utf16UpperCase(srcLen, src,
srcLen, upperStr.getBuffer(srcLen / sizeof(USHORT)), exceptions);
src = upperStr.begin();
}
// convert UTF-16 to UTF-32
USHORT errCode;
ULONG errPosition;
return utf16ToUtf32(srcLen, src, dstLen, dst, &errCode, &errPosition) / sizeof(ULONG);
}
UnicodeUtil::ICU* UnicodeUtil::Utf16Collation::loadICU(
const Firebird::string& collVersion, const Firebird::string& locale,
const Firebird::string& configInfo)
{
ObjectsArray<string> versions;
getVersions(configInfo, versions);
for (ObjectsArray<string>::const_iterator i(versions.begin()); i != versions.end(); ++i)
{
ICU* icu = UnicodeUtil::loadICU(*i, configInfo);
if (!icu)
continue;
if (locale.hasData())
{
int avail = icu->ulocCountAvailable();
while (--avail >= 0)
{
if (locale == icu->ulocGetAvailable(avail))
break;
}
if (avail < 0)
continue;
}
char version[U_MAX_VERSION_STRING_LENGTH];
icu->uVersionToString(icu->collVersion, version);
if (collVersion != version)
continue;
return icu;
}
return NULL;
}
} // namespace Jrd