/* * PROGRAM: JRD International support * MODULE: IntlUtil.cpp * DESCRIPTION: INTL Utility functions * * The contents of this file are subject to the Initial * Developer's Public License Version 1.0 (the "License"); * you may not use this file except in compliance with the * License. You may obtain a copy of the License at * http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_idpl. * * Software distributed under the License is distributed AS IS, * WITHOUT WARRANTY OF ANY KIND, either express or implied. * See the License for the specific language governing rights * and limitations under the License. * * The Original Code was created by Adriano dos Santos Fernandes * for the Firebird Open Source RDBMS project. * * Copyright (c) 2006 Adriano dos Santos Fernandes * and all contributors signed below. * * All Rights Reserved. * Contributor(s): ______________________________________. */ #include "firebird.h" #include "../jrd/IntlUtil.h" #include "../jrd/unicode_util.h" #include "../jrd/intl_classes.h" #include "../intl/country_codes.h" #include "../common/classes/auto.h" #include "../common/classes/Aligner.h" using Jrd::UnicodeUtil; namespace { struct TextTypeImpl { TextTypeImpl(charset* a_cs, UnicodeUtil::Utf16Collation* a_collation) : cs(a_cs), collation(a_collation) { } ~TextTypeImpl() { if (cs->charset_fn_destroy) cs->charset_fn_destroy(cs); delete cs; delete collation; } charset* cs; UnicodeUtil::Utf16Collation* collation; }; } namespace Firebird { static void unicodeDestroy(texttype* tt); static USHORT unicodeKeyLength(texttype* tt, USHORT len); static USHORT unicodeStrToKey(texttype* tt, USHORT srcLen, const UCHAR* src, USHORT dstLen, UCHAR* dst, USHORT keyType); static SSHORT unicodeCompare(texttype* tt, ULONG len1, const UCHAR* str1, ULONG len2, const UCHAR* str2, INTL_BOOL* errorFlag); static ULONG unicodeCanonical(texttype* tt, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst); string IntlUtil::generateSpecificAttributes(Jrd::CharSet* cs, SpecificAttributesMap& map) { SpecificAttributesMap::Accessor accessor(&map); bool found = accessor.getFirst(); string s; while (found) { UCHAR c[sizeof(ULONG)]; ULONG size; SpecificAttribute* attribute = accessor.current(); s += escapeAttribute(cs, attribute->first); const USHORT equalChar = '='; size = cs->getConvFromUnicode().convert( sizeof(equalChar), (const UCHAR*) &equalChar, sizeof(c), c); s += string((const char*) &c, size); s += escapeAttribute(cs, attribute->second); found = accessor.getNext(); if (found) { const USHORT semiColonChar = ';'; size = cs->getConvFromUnicode().convert( sizeof(semiColonChar), (const UCHAR*) &semiColonChar, sizeof(c), c); s += string((const char*) &c, size); } } return s; } bool IntlUtil::parseSpecificAttributes(Jrd::CharSet* cs, ULONG len, const UCHAR* s, SpecificAttributesMap* map) { // Note that the map isn't cleared. // Old attributes will be combined with the new ones. const UCHAR* p = s; const UCHAR* const end = s + len; ULONG size = 0; readAttributeChar(cs, &p, end, &size, true); while (p < end) { while (p < end && size == cs->getSpaceLength() && memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0) { if (!readAttributeChar(cs, &p, end, &size, true)) return true; } const UCHAR* start = p; UCHAR uc[sizeof(ULONG)]; ULONG uSize; while (p < end) { uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc); if (uSize == 2 && ((*(USHORT*) uc >= 'A' && *(USHORT*) uc <= 'Z') || (*(USHORT*) uc >= 'a' && *(USHORT*) uc <= 'z') || *(USHORT*) uc == '-' || *(USHORT*) uc == '_')) { if (!readAttributeChar(cs, &p, end, &size, true)) return false; } else break; } if (p - start == 0) return false; string name = string((const char*)start, p - start); name = unescapeAttribute(cs, name); while (p < end && size == cs->getSpaceLength() && memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0) { if (!readAttributeChar(cs, &p, end, &size, true)) return false; } uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc); if (uSize != 2 || *(USHORT*)uc != '=') return false; string value; if (readAttributeChar(cs, &p, end, &size, true)) { while (p < end && size == cs->getSpaceLength() && memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0) { if (!readAttributeChar(cs, &p, end, &size, true)) return false; } const UCHAR* endNoSpace = start = p; while (p < end) { uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc); if (uSize != 2 || *(USHORT*)uc != ';') { if (!(size == cs->getSpaceLength() && memcmp(p, cs->getSpace(), cs->getSpaceLength()) == 0)) { endNoSpace = p + size; } if (!readAttributeChar(cs, &p, end, &size, true)) break; } else break; } value = unescapeAttribute(cs, string((const char*)start, endNoSpace - start)); if (p < end) readAttributeChar(cs, &p, end, &size, true); // skip the semicolon } if (value.isEmpty()) map->remove(name); else map->put(name, value); } return true; } string IntlUtil::convertAsciiToUtf16(const string& ascii) { string s; const char* end = ascii.c_str() + ascii.length(); for (const char* p = ascii.c_str(); p < end; ++p) { USHORT c = *(UCHAR*) p; s.append((char*) &c, sizeof(c)); } return s; } string IntlUtil::convertUtf16ToAscii(const string& utf16, bool* error) { fb_assert(utf16.length() % sizeof(USHORT) == 0); string s; const USHORT* end = (const USHORT*) (utf16.c_str() + utf16.length()); for (const USHORT* p = (const USHORT*) utf16.c_str(); p < end; ++p) { if (*p <= 0xFF) s.append(1, (UCHAR) *p); else { *error = true; return ""; } } *error = false; return s; } ULONG IntlUtil::cvtAsciiToUtf16(csconvert* obj, ULONG nSrc, const UCHAR* pSrc, ULONG nDest, UCHAR* ppDest, USHORT* err_code, ULONG* err_position) { /************************************** * * c v t A s c i i T o U t f 1 6 * ************************************** * * Functional description * Convert CHARACTER SET ASCII to UTF-16. * Byte values below 128 treated as ASCII. * Byte values >= 128 create BAD_INPUT * *************************************/ fb_assert(obj != NULL); fb_assert((pSrc != NULL) || (ppDest == NULL)); fb_assert(err_code != NULL); *err_code = 0; if (ppDest == NULL) /* length estimate needed? */ return (2 * nSrc); Firebird::OutAligner d(ppDest, nDest); USHORT* pDest = d; const USHORT* const pStart = pDest; const UCHAR* const pStart_src = pSrc; while (nDest >= sizeof(*pDest) && nSrc >= sizeof(*pSrc)) { if (*pSrc > 127) { *err_code = CS_BAD_INPUT; break; } *pDest++ = *pSrc++; nDest -= sizeof(*pDest); nSrc -= sizeof(*pSrc); } if (!*err_code && nSrc) { *err_code = CS_TRUNCATION_ERROR; } *err_position = (pSrc - pStart_src) * sizeof(*pSrc); return ((pDest - pStart) * sizeof(*pDest)); } ULONG IntlUtil::cvtUtf16ToAscii(csconvert* obj, ULONG nSrc, const UCHAR* ppSrc, ULONG nDest, UCHAR* pDest, USHORT* err_code, ULONG* err_position) { /************************************** * * c v t U t f 1 6 T o A s c i i * ************************************** * * Functional description * Convert UTF16 to CHARACTER SET ASCII. * Byte values below 128 treated as ASCII. * Byte values >= 128 create CONVERT_ERROR * *************************************/ fb_assert(obj != NULL); fb_assert((ppSrc != NULL) || (pDest == NULL)); fb_assert(err_code != NULL); *err_code = 0; if (pDest == NULL) /* length estimate needed? */ return (nSrc / 2); Firebird::Aligner s(ppSrc, nSrc); const USHORT* pSrc = s; const UCHAR* const pStart = pDest; const USHORT* const pStart_src = pSrc; while (nDest >= sizeof(*pDest) && nSrc >= sizeof(*pSrc)) { if (*pSrc > 127) { *err_code = CS_CONVERT_ERROR; break; } *pDest++ = *pSrc++; nDest -= sizeof(*pDest); nSrc -= sizeof(*pSrc); } if (!*err_code && nSrc) { *err_code = CS_TRUNCATION_ERROR; } *err_position = (pSrc - pStart_src) * sizeof(*pSrc); return ((pDest - pStart) * sizeof(*pDest)); } void IntlUtil::initAsciiCharset(charset* cs) { initNarrowCharset(cs, "ASCII"); initConvert(&cs->charset_to_unicode, cvtAsciiToUtf16); initConvert(&cs->charset_from_unicode, cvtUtf16ToAscii); } void IntlUtil::initConvert(csconvert* cvt, pfn_INTL_convert func) { memset(cvt, 0, sizeof(*cvt)); cvt->csconvert_version = CSCONVERT_VERSION_1; cvt->csconvert_name = (const ASCII*) "DIRECT"; cvt->csconvert_fn_convert = func; } void IntlUtil::initNarrowCharset(charset* cs, const ASCII* name) { memset(cs, 0, sizeof(*cs)); cs->charset_version = CHARSET_VERSION_1; cs->charset_name = name; cs->charset_flags |= CHARSET_ASCII_BASED; cs->charset_min_bytes_per_char = 1; cs->charset_max_bytes_per_char = 1; cs->charset_space_length = 1; cs->charset_space_character = (const BYTE*) " "; cs->charset_fn_well_formed = NULL; } bool IntlUtil::initUnicodeCollation(texttype* tt, charset* cs, const ASCII* name, USHORT attributes, const UCharBuffer& specificAttributes, const string& configInfo) { // name comes from stack. Copy it. ASCII* nameCopy = new ASCII[strlen(name) + 1]; strcpy(nameCopy, name); tt->texttype_name = nameCopy; tt->texttype_version = TEXTTYPE_VERSION_1; tt->texttype_country = CC_INTL; tt->texttype_fn_destroy = unicodeDestroy; tt->texttype_fn_compare = unicodeCompare; tt->texttype_fn_key_length = unicodeKeyLength; tt->texttype_fn_string_to_key = unicodeStrToKey; IntlUtil::SpecificAttributesMap map; Jrd::CharSet* charSet = NULL; try { charSet = Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, cs); IntlUtil::parseSpecificAttributes(charSet, specificAttributes.getCount(), specificAttributes.begin(), &map); delete charSet; } catch (...) { delete charSet; return false; } IntlUtil::SpecificAttributesMap map16; SpecificAttributesMap::Accessor accessor(&map); bool found = accessor.getFirst(); while (found) { UCharBuffer s1, s2; USHORT errCode; ULONG errPosition; s1.resize(cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, accessor.current()->first.length(), NULL, 0, NULL, &errCode, &errPosition)); s1.resize(cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, accessor.current()->first.length(), (UCHAR*) accessor.current()->first.c_str(), s1.getCapacity(), s1.begin(), &errCode, &errPosition)); s2.resize(cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, accessor.current()->second.length(), NULL, 0, NULL, &errCode, &errPosition)); s2.resize(cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, accessor.current()->second.length(), (UCHAR*) accessor.current()->second.c_str(), s2.getCapacity(), s2.begin(), &errCode, &errPosition)); map16.put(string((char*) s1.begin(), s1.getCount()), string((char*) s2.begin(), s2.getCount())); found = accessor.getNext(); } UnicodeUtil::Utf16Collation* collation = UnicodeUtil::Utf16Collation::create(tt, attributes, map16, configInfo); if (!collation) return false; tt->texttype_impl = new TextTypeImpl(cs, collation); if (tt->texttype_canonical_width != 0) tt->texttype_fn_canonical = unicodeCanonical; return true; } ULONG IntlUtil::toLower(Jrd::CharSet* cs, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst, const ULONG* exceptions) { const ULONG utf16_length = cs->getConvToUnicode().convertLength(srcLen); Firebird::HalfStaticArray utf16_str; UCHAR* utf16_ptr; if (dstLen >= utf16_length) // if dst buffer is sufficient large, use it as intermediate utf16_ptr = dst; else utf16_ptr = utf16_str.getBuffer(utf16_length); // convert to UTF-16 srcLen = cs->getConvToUnicode().convert(srcLen, src, utf16_length, utf16_ptr); // convert to lowercase Firebird::HalfStaticArray lower_str; srcLen = UnicodeUtil::utf16LowerCase(srcLen, Firebird::Aligner(utf16_ptr, srcLen), utf16_length, Firebird::OutAligner(lower_str.getBuffer(utf16_length), utf16_length), exceptions); // convert to original character set return cs->getConvFromUnicode().convert(srcLen, lower_str.begin(), dstLen, dst); } ULONG IntlUtil::toUpper(Jrd::CharSet* cs, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst, const ULONG* exceptions) { const ULONG utf16_length = cs->getConvToUnicode().convertLength(srcLen); Firebird::HalfStaticArray utf16_str; UCHAR* utf16_ptr; if (dstLen >= utf16_length) // if dst buffer is sufficient large, use it as intermediate utf16_ptr = dst; else utf16_ptr = utf16_str.getBuffer(utf16_length); // convert to UTF-16 srcLen = cs->getConvToUnicode().convert(srcLen, src, utf16_length, utf16_ptr); // convert to uppercase Firebird::HalfStaticArray upper_str; srcLen = UnicodeUtil::utf16UpperCase(srcLen, Firebird::Aligner(utf16_ptr, srcLen), utf16_length, Firebird::OutAligner(upper_str.getBuffer(utf16_length), utf16_length), exceptions); // convert to original character set return cs->getConvFromUnicode().convert(srcLen, upper_str.begin(), dstLen, dst); } bool IntlUtil::readOneChar(Jrd::CharSet* cs, const UCHAR** s, const UCHAR* end, ULONG* size) { (*s) += *size; if (*s >= end) { (*s) = end; *size = 0; return false; } UCHAR c[sizeof(ULONG)]; *size = cs->substring(end - *s, *s, sizeof(c), c, 0, 1); return true; } // Transform ICU-VERSION attribute (given by the user) in COLL-VERSION (to be stored). bool IntlUtil::setupIcuAttributes(charset* cs, const string& specificAttributes, const string& configInfo, string& newSpecificAttributes) { AutoPtr charSet(Jrd::CharSet::createInstance(*getDefaultMemoryPool(), 0, cs)); IntlUtil::SpecificAttributesMap map; if (!IntlUtil::parseSpecificAttributes(charSet, specificAttributes.length(), (const UCHAR*) specificAttributes.begin(), &map)) { return false; } string icuVersion; map.get("ICU-VERSION", icuVersion); string collVersion; if (!UnicodeUtil::getCollVersion(icuVersion, configInfo, collVersion)) return false; map.remove("ICU-VERSION"); map.remove("COLL-VERSION"); if (collVersion.hasData()) map.put("COLL-VERSION", collVersion); newSpecificAttributes = IntlUtil::generateSpecificAttributes(charSet, map); return true; } string IntlUtil::escapeAttribute(Jrd::CharSet* cs, const string& s) { string ret; const UCHAR* p = (const UCHAR*)s.begin(); const UCHAR* end = (const UCHAR*)s.end(); ULONG size = 0; while (readOneChar(cs, &p, end, &size)) { UCHAR uc[sizeof(ULONG)]; const ULONG uSize = cs->getConvToUnicode().convert(size, p, sizeof(uc), uc); if (uSize == 2) { if (*(USHORT*) uc == '\\' || *(USHORT*) uc == '=' || *(USHORT*) uc == ';') { *(USHORT*) uc = '\\'; UCHAR bytes[sizeof(ULONG)]; ULONG bytesSize = cs->getConvFromUnicode().convert( sizeof(USHORT), uc, sizeof(bytes), bytes); ret.append(string((const char*)bytes, bytesSize)); } } ret.append(string((const char*)p, size)); } return ret; } string IntlUtil::unescapeAttribute(Jrd::CharSet* cs, const string& s) { string ret; const UCHAR* p = (const UCHAR*)s.begin(); const UCHAR* end = (const UCHAR*)s.end(); ULONG size = 0; while (readAttributeChar(cs, &p, end, &size, false)) ret.append(string((const char*)p, size)); return ret; } bool IntlUtil::isAttributeEscape(Jrd::CharSet* cs, const UCHAR* s, ULONG size) { UCHAR uc[sizeof(ULONG)]; const ULONG uSize = cs->getConvToUnicode().convert(size, s, sizeof(uc), uc); return (uSize == 2 && *(USHORT*) uc == '\\'); } bool IntlUtil::readAttributeChar(Jrd::CharSet* cs, const UCHAR** s, const UCHAR* end, ULONG* size, bool returnEscape) { if (readOneChar(cs, s, end, size)) { if (isAttributeEscape(cs, *s, *size)) { const UCHAR* p = *s; ULONG firstSize = *size; if (readOneChar(cs, s, end, size)) { if (returnEscape) { *s = p; *size += firstSize; } } else return false; } return true; } return false; } static void unicodeDestroy(texttype* tt) { delete[] const_cast(tt->texttype_name); delete static_cast(tt->texttype_impl); } static USHORT unicodeKeyLength(texttype* tt, USHORT len) { TextTypeImpl* impl = static_cast(tt->texttype_impl); return impl->collation->keyLength(len / impl->cs->charset_max_bytes_per_char * 4); } static USHORT unicodeStrToKey(texttype* tt, USHORT srcLen, const UCHAR* src, USHORT dstLen, UCHAR* dst, USHORT keyType) { TextTypeImpl* impl = static_cast(tt->texttype_impl); try { charset* cs = impl->cs; HalfStaticArray utf16Str; USHORT errorCode; ULONG offendingPos; utf16Str.getBuffer( cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, srcLen, src, 0, NULL, &errorCode, &offendingPos)); ULONG utf16Len = cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, srcLen, src, utf16Str.getCapacity(), utf16Str.begin(), &errorCode, &offendingPos); return impl->collation->stringToKey(utf16Len, (USHORT*)utf16Str.begin(), dstLen, dst, keyType); } catch (BadAlloc) { fb_assert(false); return INTL_BAD_KEY_LENGTH; } } static SSHORT unicodeCompare(texttype* tt, ULONG len1, const UCHAR* str1, ULONG len2, const UCHAR* str2, INTL_BOOL* errorFlag) { TextTypeImpl* impl = static_cast(tt->texttype_impl); try { *errorFlag = false; charset* cs = impl->cs; HalfStaticArray utf16Str1; HalfStaticArray utf16Str2; USHORT errorCode; ULONG offendingPos; utf16Str1.getBuffer( cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, len1, str1, 0, NULL, &errorCode, &offendingPos)); const ULONG utf16Len1 = cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, len1, str1, utf16Str1.getCapacity(), utf16Str1.begin(), &errorCode, &offendingPos); utf16Str2.getBuffer( cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, len2, str2, 0, NULL, &errorCode, &offendingPos)); const ULONG utf16Len2 = cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, len2, str2, utf16Str2.getCapacity(), utf16Str2.begin(), &errorCode, &offendingPos); return impl->collation->compare(utf16Len1, (USHORT*)utf16Str1.begin(), utf16Len2, (USHORT*)utf16Str2.begin(), errorFlag); } catch (BadAlloc) { fb_assert(false); return 0; } } static ULONG unicodeCanonical(texttype* tt, ULONG srcLen, const UCHAR* src, ULONG dstLen, UCHAR* dst) { TextTypeImpl* impl = static_cast(tt->texttype_impl); try { charset* cs = impl->cs; HalfStaticArray utf16Str; USHORT errorCode; ULONG offendingPos; utf16Str.getBuffer( cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, srcLen, src, 0, NULL, &errorCode, &offendingPos)); const ULONG utf16Len = cs->charset_to_unicode.csconvert_fn_convert( &cs->charset_to_unicode, srcLen, src, utf16Str.getCapacity(), utf16Str.begin(), &errorCode, &offendingPos); return impl->collation->canonical( utf16Len, Firebird::Aligner(utf16Str.begin(), utf16Len), dstLen, Firebird::OutAligner(dst, dstLen), NULL); } catch (BadAlloc) { fb_assert(false); return INTL_BAD_KEY_LENGTH; } } } // namespace Firebird