mirror of
https://github.com/FirebirdSQL/firebird.git
synced 2025-01-27 20:03:03 +01:00
490 lines
11 KiB
C++
490 lines
11 KiB
C++
/*
 *	PROGRAM:	JRD Access Method
 *	MODULE:		kanji.c
 *	DESCRIPTION:
 *
 * The contents of this file are subject to the Interbase Public
 * License Version 1.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy
 * of the License at http://www.Inprise.com/IPL.html
 *
 * Software distributed under the License is distributed on an
 * "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code was created by Inprise Corporation
 * and its predecessors. Portions created by Inprise Corporation are
 * Copyright (C) Inprise Corporation.
 *
 * All Rights Reserved.
 * Contributor(s): ______________________________________.
 */
|
|
|
|
#include "firebird.h"
|
|
#include "../jrd/ib_stdio.h"
|
|
#include "../jrd/common.h"
|
|
#include "kanji.h"
|
|
#include "kanji_proto.h"
|
|
|
|
|
|
/*
 * S2E(s1, s2, j1, j2)
 *
 * Convert one 2-byte Shift-JIS kanji to its 2-byte EUC form.
 *   s1, s2  lead and trail byte of the SJIS character (read only)
 *   j1, j2  lvalues that receive the EUC lead and trail byte
 *
 * When s2 >= 0x9f the lead byte becomes s1*2 - 0xe0 (s1 >= 0xe0) or
 * s1*2 - 0x60, and the trail byte becomes s2 + 2.  Otherwise the lead
 * byte becomes s1*2 - 0xe1 (s1 >= 0xe0) or s1*2 - 0x61, and the trail
 * byte becomes s2 + 0x60 (s2 >= 0x7f) or s2 + 0x61.
 *
 * Every argument is parenthesized so expression arguments expand
 * correctly, and the body is wrapped in do { } while (0) so the macro
 * behaves as a single statement (safe in an unbraced if/else).
 * Arguments are still evaluated more than once: pass expressions
 * without side effects.
 */
#define S2E(s1, s2, j1, j2) \
	do { \
		if ((s2) >= 0x9f) { \
			if ((s1) >= 0xe0) (j1) = ((s1) * 2 - 0xe0); \
			else (j1) = ((s1) * 2 - 0x60); \
			(j2) = ((s2) + 2); \
		} else { \
			if ((s1) >= 0xe0) (j1) = ((s1) * 2 - 0xe1); \
			else (j1) = ((s1) * 2 - 0x61); \
			if ((s2) >= 0x7f) (j2) = ((s2) + 0x60); \
			else (j2) = ((s2) + 0x61); \
		} \
	} while (0)
|
|
|
|
|
|
USHORT KANJI_check_euc(UCHAR * euc_str, USHORT euc_len)
{
/**************************************
 *
 *	K A N J I _ c h e c k _ e u c
 *
 **************************************
 *
 * Functional description
 *	Companion of KANJI_check_sjis.
 *	Scan euc_len bytes of euc_str and report whether the string
 *	ends in the middle of a 2-byte character.  Any byte with the
 *	0x80 bit set is taken as the first byte of a 2-byte character;
 *	its second byte is skipped without being examined.
 *
 *	Returns 1 if the final character is truncated, 0 otherwise
 *	(including for an empty string).
 *
 **************************************/

	while (euc_len--) {
		if (!(*euc_str & 0x80)) {
			/* plain ASCII byte */
			euc_str++;
			continue;
		}

		/* 2-byte character: euc_len already excludes the first byte */
		if (euc_len == 0)
			return (1);			/* no room left for the second byte */

		euc_str += 2;
		euc_len -= 1;
	}

	return (0);
}
|
|
|
|
|
|
USHORT KANJI_check_sjis(UCHAR * sjis_str, USHORT sjis_len)
{
/**************************************
 *
 *	K A N J I _ c h e c k _ s j i s
 *
 **************************************
 *
 * Functional description
 *	Companion of KANJI_check_euc.
 *	Scan sjis_len bytes of sjis_str and report whether the string
 *	ends in the middle of a 2-byte kanji.  A byte starts a 2-byte
 *	kanji when it has the 0x80 bit set and satisfies SJIS1(); every
 *	other byte (ASCII or single-byte kana) is one character.  The
 *	second byte of a kanji is skipped without being examined.
 *
 *	Returns 1 if the final kanji is truncated, 0 otherwise.
 *
 *	SJIS1() is not defined in this file (presumably kanji.h).  The
 *	byte is copied to a local first so the macro receives a plain
 *	variable rather than an assignment expression.
 *
 **************************************/
	UCHAR c1;

	while (sjis_len--) {
		c1 = *sjis_str;

		if ((c1 & 0x80) && (SJIS1(c1))) {
			/* first byte of a kanji; sjis_len already excludes it */
			if (sjis_len == 0)
				return (1);		/* truncated KANJI */

			sjis_str += 2;
			sjis_len -= 1;
		}
		else {
			/* ASCII or single-byte kana */
			sjis_str++;
		}
	}

	return (0);
}
|
|
|
|
|
|
|
|
USHORT KANJI_euc2sjis(UCHAR * euc_str,
					  USHORT euc_len,
					  UCHAR * sjis_str,
					  USHORT sjis_buf_len, USHORT * sjis_len)
{
/**************************************
 *
 *	K A N J I _ e u c 2 s j i s
 *
 **************************************
 *
 * Functional description
 *	Convert euc_len bytes of euc_str into Shift-JIS in sjis_str.
 *
 *	euc_str       input bytes
 *	euc_len       number of input bytes
 *	sjis_str      output buffer
 *	sjis_buf_len  capacity of sjis_str in bytes
 *	sjis_len      receives the running output byte count
 *
 *	Returns 0 on success.  Returns 1 when the output buffer is
 *	full, when a high-bit byte fails EUC1(), when the input ends
 *	after a lead byte, or when the second byte fails EUC2().
 *	On the "kanji does not fit" error *sjis_len has already been
 *	advanced by 2 although nothing was written for that character.
 *
 *	The buffer-full check runs before every character, ASCII
 *	included, so no branch can write past sjis_buf_len.
 *
 *	EUC1()/EUC2() are not defined in this file (presumably kanji.h).
 *
 **************************************/
	UCHAR c1, c2;

	*sjis_len = 0;
	while (euc_len) {
		if (*sjis_len >= sjis_buf_len)	/* buffer full */
			return (1);

		c1 = *euc_str++;
		euc_len--;

		if (!(c1 & 0x80)) {
			/* ASCII: copied through unchanged */
			*sjis_len += 1;
			*sjis_str++ = c1;
			continue;
		}

		if (!(EUC1(c1)))
			return (1);			/* It is some bad character */
		if (euc_len == 0)
			return (1);			/* truncated EUC */

		c2 = *euc_str++;
		euc_len--;
		if (!(EUC2(c2)))
			return (1);			/* Bad EUC */

		if (c1 == 0x8e) {
			/* Kana: lead byte 0x8e is dropped, second byte is kept */
			*sjis_len += 1;
			*sjis_str++ = c2;
		}
		else {
			/* Kanji: needs two output bytes */
			*sjis_len += 2;
			if (*sjis_len > sjis_buf_len)	/* buffer full */
				return (1);

			/* clear the high bits before applying the SJIS arithmetic */
			c1 ^= 0x80;
			c2 ^= 0x80;
			*sjis_str++ =
				(c1 - 0x21) / 2 + ((c1 <= 0x5e) ? 0x81 : 0xc1);
			if (c1 & 1)			/* odd */
				*sjis_str++ = c2 + ((c2 <= 0x5f) ? 0x1f : 0x20);
			else
				*sjis_str++ = c2 + 0x7e;
		}
	}

	return (0);
}
|
|
|
|
|
|
|
|
USHORT KANJI_euc_byte2short(UCHAR * src, USHORT * dst, USHORT len)
{
/**************************************
 *
 *	K A N J I _ e u c _ b y t e 2 s h o r t
 *
 **************************************
 *
 * Functional description
 *	Pack len bytes of the EUC string in src (byte buffer) into
 *	dst (USHORT buffer), one output element per character:
 *	    a byte that does not satisfy EUC1() -> stored as is,
 *	    a byte that satisfies EUC1()        -> (byte << 8) | next byte.
 *	Returns the number of elements written to dst.
 *
 *	If the input ends right after an EUC1() byte, that byte is
 *	stored on its own.  Without this guard the unsigned length
 *	counter would wrap past zero and the loop would run far
 *	beyond both buffers.
 *
 *	EUC1() is not defined in this file (presumably kanji.h).
 *
 **************************************/
	USHORT l, x;
	UCHAR lead;

	for (l = 0; len > 0; l++) {
		lead = *src++;
		len--;

		x = lead;
		if ((EUC1(lead)) && len > 0) {
			/* lead byte goes to the high half, next byte to the low half */
			x = (USHORT) ((lead << 8) | *src++);
			len--;
		}
		*dst++ = x;
	}

	return l;
}
|
|
|
|
|
|
|
|
USHORT KANJI_euc_len(UCHAR * sjis_str, USHORT sjis_len, USHORT * euc_len)
{
/**************************************
 *
 *	K A N J I _ e u c _ l e n
 *
 **************************************
 *
 * Functional description
 *	Compute in *euc_len how many EUC bytes the given SJIS string
 *	would need: 1 per ASCII byte, 2 per SJIS_SINGLE() byte (kana),
 *	2 per SJIS1()/SJIS2() byte pair (kanji).
 *
 *	Returns 0 on success, 1 if the string ends after a kanji lead
 *	byte, the second byte fails SJIS2(), or a high-bit byte is
 *	neither SJIS1() nor SJIS_SINGLE().
 *
 **************************************/
	UCHAR lead, trail;

	for (*euc_len = 0; sjis_len;) {
		lead = *sjis_str++;
		sjis_len--;

		if (!(lead & 0x80)) {
			/* ASCII */
			*euc_len += 1;
			continue;
		}

		if (SJIS1(lead)) {
			/* kanji: a second byte must follow and be valid */
			if (sjis_len == 0)
				return (1);
			trail = *sjis_str++;
			sjis_len--;
			if (!(SJIS2(trail)))
				return (1);
			*euc_len += 2;
		}
		else if (SJIS_SINGLE(lead)) {
			/* kana */
			*euc_len += 2;
		}
		else {
			/* unrecognised high-bit byte */
			return (1);
		}
	}

	return (0);
}
|
|
|
|
|
|
|
|
USHORT KANJI_sjis2euc(UCHAR * sjis_str,
					  USHORT sjis_len,
					  UCHAR * euc_str, USHORT euc_buf_len, USHORT * euc_len)
{
/**************************************
 *
 *	K A N J I _ s j i s 2 e u c
 *
 **************************************
 *
 * Functional description
 *	Convert sjis_len bytes of sjis_str into EUC in euc_str, whose
 *	capacity is euc_buf_len bytes.  *euc_len receives the running
 *	output byte count.
 *
 *	Returns 0 on success and 1 on any failure: output buffer full,
 *	input ending after a kanji lead byte, second byte failing
 *	SJIS2(), or a high-bit byte that is neither SJIS1() nor
 *	SJIS_SINGLE().  When a 2-byte result does not fit, *euc_len
 *	has already been advanced by 2.
 *
 **************************************/
	UCHAR lead, trail;

	for (*euc_len = 0; sjis_len;) {
		if (*euc_len >= euc_buf_len)
			return (1);			/* no room for even one more byte */

		lead = *sjis_str++;
		sjis_len--;

		if (!(lead & 0x80)) {
			/* ASCII is copied through */
			*euc_len += 1;
			*euc_str++ = lead;
			continue;
		}

		if (SJIS1(lead)) {
			/* 2-byte kanji */
			if (sjis_len == 0)
				return (1);		/* truncated */
			trail = *sjis_str++;
			sjis_len--;
			if (!(SJIS2(trail)))
				return (1);		/* bad second byte */

			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);		/* pair does not fit */
			S2E(lead, trail, euc_str[0], euc_str[1]);
			euc_str += 2;
		}
		else if (SJIS_SINGLE(lead)) {
			/* single-byte kana becomes 0x8e followed by the byte */
			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);		/* pair does not fit */
			euc_str[0] = 0x8e;
			euc_str[1] = lead;
			euc_str += 2;
		}
		else {
			return (1);			/* unrecognised high-bit byte */
		}
	}

	return (0);
}
|
|
|
|
|
|
|
|
USHORT KANJI_sjis_byte2short(UCHAR * src,
							 USHORT * dst, USHORT len)
{
/**************************************
 *
 *	K A N J I _ s j i s _ b y t e 2 s h o r t
 *
 **************************************
 *
 * Functional description
 *	Pack len bytes of the SJIS string in src (byte buffer) into
 *	dst (USHORT buffer), one output element per character:
 *	    a byte that does not satisfy SJIS1() (ASCII, kana)
 *	                                  -> stored as is,
 *	    a byte that satisfies SJIS1() -> (byte << 8) | next byte.
 *	Returns the number of elements written to dst.
 *
 *	If the input ends right after an SJIS1() byte, that byte is
 *	stored on its own.  Without this guard the unsigned length
 *	counter would wrap past zero and the loop would run far
 *	beyond both buffers.
 *
 *	SJIS1() is not defined in this file (presumably kanji.h).
 *
 **************************************/
	USHORT l, x;
	UCHAR lead;

	for (l = 0; len > 0; l++) {
		lead = *src++;
		len--;

		x = lead;
		if ((SJIS1(lead)) && len > 0) {
			/* lead byte goes to the high half, next byte to the low half */
			x = (USHORT) ((lead << 8) | *src++);
			len--;
		}
		*dst++ = x;
	}

	return l;
}
|
|
|
|
|
|
|
|
USHORT KANJI_sjis2euc5(UCHAR * sjis_str,
					   USHORT sjis_len,
					   UCHAR * euc_str,
					   USHORT euc_buf_len,
					   USHORT * euc_len, USHORT * ib_sjis, USHORT * ib_euc)
{
/**************************************
 *
 *	K A N J I _ s j i s 2 e u c 5
 *
 **************************************
 *
 * Functional description
 *	Variant of KANJI_sjis2euc() with distinct error codes and two
 *	extra progress counters.
 *
 *	Returns:
 *	    0  whole input converted
 *	    1  output buffer full
 *	    2  invalid input (truncated kanji, bad second byte, or an
 *	       unrecognised high-bit byte)
 *
 *	Outputs:
 *	    euc_len  running output byte count; advanced by 2 before
 *	             the fit check for 2-byte results
 *	    ib_sjis  SJIS bytes belonging to fully converted characters
 *	    ib_euc   EUC bytes belonging to fully converted characters
 *
 *	The original description says this was designed for converting
 *	blob segments from SJIS to EUC on retrieval, with ib_sjis and
 *	ib_euc used to track fragments split from segments.
 *
 **************************************/
	UCHAR lead, trail;

	*euc_len = 0;
	*ib_euc = 0;
	*ib_sjis = 0;

	while (sjis_len) {
		if (*euc_len >= euc_buf_len)
			return (1);			/* no room for even one more byte */

		lead = *sjis_str++;
		sjis_len--;

		if (!(lead & 0x80)) {
			/* ASCII is copied through */
			*euc_len += 1;
			*euc_str++ = lead;
			(*ib_sjis)++;
			(*ib_euc)++;
			continue;
		}

		if (SJIS1(lead)) {
			/* 2-byte kanji */
			if (sjis_len == 0)
				return (2);		/* truncated */
			trail = *sjis_str++;
			sjis_len--;
			if (!(SJIS2(trail)))
				return (2);		/* bad second byte */

			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);		/* pair does not fit */
			S2E(lead, trail, euc_str[0], euc_str[1]);
			euc_str += 2;
			*ib_sjis += 2;
			*ib_euc += 2;
		}
		else if (SJIS_SINGLE(lead)) {
			/* single-byte kana becomes 0x8e followed by the byte */
			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);		/* pair does not fit */
			euc_str[0] = 0x8e;
			euc_str[1] = lead;
			euc_str += 2;
			(*ib_sjis)++;
			*ib_euc += 2;
		}
		else {
			return (2);			/* unrecognised high-bit byte */
		}
	}

	return (0);
}
|
|
|
|
|
|
|
|
USHORT KANJI_sjis_len(UCHAR * euc_str, USHORT euc_len, USHORT * sjis_len)
{
/**************************************
 *
 *	K A N J I _ s j i s _ l e n
 *
 **************************************
 *
 * Functional description
 *	Compute in *sjis_len how many SJIS bytes the given EUC string
 *	would need: 1 per ASCII byte, 1 per pair whose lead byte is
 *	0x8e (kana), 2 per other EUC1()/EUC2() pair (kanji).
 *
 *	Returns 0 on success, 1 if a high-bit byte fails EUC1(), the
 *	string ends after a lead byte, or the second byte fails EUC2().
 *
 **************************************/
	UCHAR lead, trail;

	for (*sjis_len = 0; euc_len;) {
		lead = *euc_str++;
		euc_len--;

		if (!(lead & 0x80)) {
			/* ASCII */
			*sjis_len += 1;
			continue;
		}

		if (!(EUC1(lead)))
			return (1);			/* unrecognised high-bit byte */
		if (euc_len == 0)
			return (1);			/* truncated EUC */

		trail = *euc_str++;
		euc_len--;
		if (!(EUC2(trail)))
			return (1);			/* bad second byte */

		/* kana shrinks to one byte, kanji stays at two */
		*sjis_len += (lead == 0x8e) ? 1 : 2;
	}

	return (0);
}
|