2001-05-23 15:26:42 +02:00
|
|
|
/*
|
|
|
|
* PROGRAM: InterBase International support
|
2003-12-11 11:33:30 +01:00
|
|
|
* MODULE: cv_gb2312.cpp
|
2001-05-23 15:26:42 +02:00
|
|
|
* DESCRIPTION: Codeset conversion for GB2312 family codesets
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the Interbase Public
|
|
|
|
* License Version 1.0 (the "License"); you may not use this file
|
|
|
|
* except in compliance with the License. You may obtain a copy
|
|
|
|
* of the License at http://www.Inprise.com/IPL.html
|
|
|
|
*
|
|
|
|
* Software distributed under the License is distributed on an
|
|
|
|
* "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
|
|
|
|
* or implied. See the License for the specific language governing
|
|
|
|
* rights and limitations under the License.
|
|
|
|
*
|
|
|
|
* The Original Code was created by Inprise Corporation
|
|
|
|
* and its predecessors. Portions created by Inprise Corporation are
|
|
|
|
* Copyright (C) Inprise Corporation.
|
|
|
|
*
|
|
|
|
* All Rights Reserved.
|
|
|
|
* Contributor(s): ______________________________________.
|
|
|
|
*/
|
|
|
|
|
2003-02-17 11:37:42 +01:00
|
|
|
#include "firebird.h"
|
2001-05-23 15:26:42 +02:00
|
|
|
#include "../intl/ldcommon.h"
|
2003-02-20 16:47:23 +01:00
|
|
|
#include "../intl/cv_gb2312.h"
|
2005-05-28 00:45:31 +02:00
|
|
|
#include "../intl/cv_narrow.h"
|
2003-09-21 01:33:36 +02:00
|
|
|
#include "ld_proto.h"
|
2001-05-23 15:26:42 +02:00
|
|
|
|
2009-05-02 15:00:09 +02:00
|
|
|
// GB2312 1st-byte test: the low eight bits of `uc` must fall in 0xa1..0xfe
// (inclusive). Note that the argument is evaluated twice.
#define GB1(uc)	(0xa1 <= (UCHAR) ((uc) & 0xff) && (UCHAR) ((uc) & 0xff) <= 0xfe)

// GB2312 2nd-byte test: same inclusive 0xa1..0xfe window as the 1st byte.
// Note that the argument is evaluated twice.
#define GB2(uc)	(0xa1 <= (UCHAR) ((uc) & 0xff) && (UCHAR) ((uc) & 0xff) <= 0xfe)
|
|
|
|
|
|
|
|
|
2005-05-28 00:45:31 +02:00
|
|
|
ULONG CVGB_gb2312_to_unicode(csconvert* obj,
							 ULONG src_len,
							 const UCHAR* src_ptr,
							 ULONG dest_len,
							 UCHAR* p_dest_ptr,
							 USHORT* err_code,
							 ULONG* err_position)
{
/**************************************
 *
 * Functional description
 *	Convert a GB2312 byte string into 16-bit units using the
 *	two-level lookup tables attached to the converter object.
 *
 *	src_len / src_ptr	source bytes.
 *	dest_len / p_dest_ptr	destination buffer and its size in bytes.
 *				When p_dest_ptr is NULL nothing is converted
 *				and src_len * sizeof(USHORT) is returned as a
 *				length estimate.
 *	err_code		set to 0 on success, otherwise:
 *				CS_BAD_INPUT        - a byte with the high bit set
 *				                      fails GB1, a lead byte is the
 *				                      last source byte, or the byte
 *				                      after a lead byte fails GB2;
 *				CS_CONVERT_ERROR    - the table yields CS_CANT_MAP
 *				                      for a code other than
 *				                      CS_CANT_MAP;
 *				CS_TRUNCATION_ERROR - the loop ended with source
 *				                      bytes left over and no other
 *				                      error recorded.
 *	err_position		number of source bytes consumed before
 *				conversion stopped.
 *
 *	Returns the number of bytes stored into the destination.
 *
 **************************************/
	fb_assert(obj != NULL);

	CsConvertImpl* impl = obj->csconvert_impl;

	fb_assert(src_ptr != NULL || p_dest_ptr == NULL);
	fb_assert(err_code != NULL);
	fb_assert(err_position != NULL);
	fb_assert(obj->csconvert_fn_convert == CVGB_gb2312_to_unicode);
	fb_assert(impl->csconvert_datatable != NULL);
	fb_assert(impl->csconvert_misc != NULL);

	const ULONG total_src = src_len;
	*err_code = 0;

	// A NULL destination means the caller only wants a size estimate
	if (p_dest_ptr == NULL)
		return (src_len * sizeof(USHORT));

	// NOTE(review): OutAligner presumably gives us a USHORT-aligned view of
	// the caller's buffer - confirm against its definition.
	Firebird::OutAligner<USHORT> aligned_out(p_dest_ptr, dest_len);
	USHORT* dest_ptr = aligned_out;
	const USHORT* const dest_begin = dest_ptr;

	// Keep going while there is input left and room for one more 16-bit unit
	while (src_len != 0 && dest_len >= 2)
	{
		USHORT wide;		// source code point (1 or 2 bytes folded together)
		USHORT consumed;	// how many source bytes that code point occupied

		const UCHAR lead = *src_ptr++;

		if (!(lead & 0x80))
		{
			// Plain ASCII: a single byte is the whole character
			wide = lead;
			consumed = 1;
		}
		else
		{
			// High bit set: must be a valid lead byte with a byte following it
			if (!GB1(lead) || src_len == 1)
			{
				*err_code = CS_BAD_INPUT;
				break;
			}

			const UCHAR trail = *src_ptr++;
			if (!GB2(trail))
			{
				// Bad second byte
				*err_code = CS_BAD_INPUT;
				break;
			}

			wide = (lead << 8) + trail;
			consumed = 2;
		}

		// Two-level lookup: csconvert_misc is indexed by the high byte and
		// supplies a base offset into csconvert_datatable; the low byte is
		// added to that base.
		const USHORT* const page_base = (const USHORT*) impl->csconvert_misc;
		const USHORT* const map_table = (const USHORT*) impl->csconvert_datatable;
		const USHORT ch = map_table[page_base[wide / 256] + (wide % 256)];

		// CS_CANT_MAP is only a legitimate result for CS_CANT_MAP itself
		if (ch == CS_CANT_MAP && wide != CS_CANT_MAP)
		{
			*err_code = CS_CONVERT_ERROR;
			break;
		}

		*dest_ptr++ = ch;
		dest_len -= sizeof(*dest_ptr);
		src_len -= consumed;
	}

	// Input left over without an explicit error: the destination ran out
	if (src_len && !*err_code)
		*err_code = CS_TRUNCATION_ERROR;

	*err_position = total_src - src_len;

	return ((dest_ptr - dest_begin) * sizeof(*dest_ptr));
}
|
|
|
|
|
|
|
|
|
2005-05-28 00:45:31 +02:00
|
|
|
ULONG CVGB_unicode_to_gb2312(csconvert* obj,
							 ULONG unicode_len,
							 const UCHAR* p_unicode_str,
							 ULONG gb_len,
							 UCHAR* gb_str,
							 USHORT* err_code,
							 ULONG* err_position)
{
/**************************************
 *
 * Functional description
 *	Convert a string of 16-bit units into GB2312 bytes using the
 *	two-level lookup tables attached to the converter object.
 *
 *	unicode_len / p_unicode_str	source buffer; the length is in bytes.
 *	gb_len / gb_str			destination buffer and its size in
 *					bytes. When gb_str is NULL nothing is
 *					converted and unicode_len is returned
 *					as a length estimate.
 *	err_code			set to 0 on success, otherwise:
 *					CS_CONVERT_ERROR    - the table yields
 *					    CS_CANT_MAP for a unit other than
 *					    CS_CANT_MAP;
 *					CS_TRUNCATION_ERROR - a two-byte result
 *					    does not fit, or the loop ended with
 *					    source bytes left over and no other
 *					    error recorded.
 *	err_position			number of source bytes consumed
 *					before conversion stopped.
 *
 *	Returns the number of bytes stored into gb_str.
 *
 **************************************/
	fb_assert(obj != NULL);

	CsConvertImpl* impl = obj->csconvert_impl;

	fb_assert(p_unicode_str != NULL || gb_str == NULL);
	fb_assert(err_code != NULL);
	fb_assert(err_position != NULL);
	fb_assert(obj->csconvert_fn_convert == CVGB_unicode_to_gb2312);
	fb_assert(impl->csconvert_datatable != NULL);
	fb_assert(impl->csconvert_misc != NULL);

	const ULONG total_src = unicode_len;
	*err_code = 0;

	// A NULL destination means the caller only wants a size estimate;
	// worst case is all han character input (2 bytes in, 2 bytes out).
	if (gb_str == NULL)
		return unicode_len;

	// NOTE(review): Aligner presumably gives us a USHORT-aligned view of the
	// caller's bytes - confirm against its definition.
	Firebird::Aligner<USHORT> aligned_in(p_unicode_str, unicode_len);
	const USHORT* unicode_str = aligned_in;

	const UCHAR* const out_begin = gb_str;

	// Keep going while there is output room and at least one whole unit left
	while (gb_len != 0 && unicode_len >= 2)
	{
		const USHORT wide = *unicode_str++;

		// Two-level lookup: csconvert_misc is indexed by the high byte and
		// supplies a base offset into csconvert_datatable; the low byte is
		// added to that base.
		const USHORT* const page_base = (const USHORT*) impl->csconvert_misc;
		const USHORT* const map_table = (const USHORT*) impl->csconvert_datatable;
		const USHORT gb_ch = map_table[page_base[wide / 256] + (wide % 256)];

		// CS_CANT_MAP is only a legitimate result for CS_CANT_MAP itself
		if (gb_ch == CS_CANT_MAP && wide != CS_CANT_MAP)
		{
			*err_code = CS_CONVERT_ERROR;
			break;
		}

		const int high_byte = gb_ch / 256;
		const int low_byte = gb_ch % 256;

		if (high_byte == 0)
		{
			// ASCII character: one output byte (room is guaranteed by the
			// loop condition)
			fb_assert((UCHAR(low_byte) & 0x80) == 0);

			*gb_str++ = low_byte;
			--gb_len;
		}
		else
		{
			// Double-byte character: needs two output bytes
			if (gb_len < 2)
			{
				*err_code = CS_TRUNCATION_ERROR;
				break;
			}

			fb_assert(GB1(high_byte));
			fb_assert(GB2(low_byte));

			*gb_str++ = high_byte;
			*gb_str++ = low_byte;
			gb_len -= 2;
		}

		// One source unit has been fully converted
		unicode_len -= sizeof(*unicode_str);
	}

	// Input left over without an explicit error: the destination ran out
	if (unicode_len && !*err_code)
		*err_code = CS_TRUNCATION_ERROR;

	*err_position = total_src - unicode_len;

	return ((gb_str - out_begin) * sizeof(*gb_str));
}
|
|
|
|
|
|
|
|
|
2009-05-01 19:21:36 +02:00
|
|
|
INTL_BOOL CVGB_check_gb2312(charset* /*cs*/, ULONG gb_len, const UCHAR *gb_str, ULONG* offending_position)
{
/**************************************
 *
 * Functional description
 *	Walk a GB2312 byte string and verify that it is well formed:
 *	every byte with the high bit set must pass GB1 and be followed,
 *	inside the string, by a byte that passes GB2.
 *
 *	Returns true when the whole string passes.
 *	Returns false at the first bad or truncated character; in that
 *	case, if offending_position is not NULL, it receives the byte
 *	offset of that character's first byte.
 *
 **************************************/
	const UCHAR* const text_begin = gb_str;

	// gb_len is decremented by the loop test, so inside the body it holds
	// the number of bytes that FOLLOW the current one.
	while (gb_len--)
	{
		const UCHAR lead = *gb_str;

		if (!(lead & 0x80))
		{
			// ASCII char: single byte, always acceptable
			++gb_str;
			continue;
		}

		// Non-ASCII: accept only a valid first byte that is not the last
		// byte of the string and is followed by a valid second byte.
		// (Short-circuiting keeps gb_str[1] from being read past the end.)
		const bool valid_pair = GB1(lead) && gb_len != 0 && GB2(gb_str[1]);

		if (!valid_pair)
		{
			if (offending_position)
				*offending_position = gb_str - text_begin;

			return false;
		}

		// Skip both bytes; the loop test only accounted for the first one
		gb_str += 2;
		--gb_len;
	}

	return true;
}
|