8
0
mirror of https://github.com/FirebirdSQL/firebird.git synced 2025-01-27 20:03:03 +01:00
firebird-mirror/src/intl/kanji.cpp

490 lines
11 KiB
C++

/*
* PROGRAM: JRD Access Method
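* MODULE: kanji.cpp
* DESCRIPTION: Validation, length calculation and conversion routines
*              for Shift-JIS (SJIS) and EUC encoded Japanese strings.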
*
* The contents of this file are subject to the Interbase Public
* License Version 1.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy
* of the License at http://www.Inprise.com/IPL.html
*
* Software distributed under the License is distributed on an
* "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
* or implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code was created by Inprise Corporation
* and its predecessors. Portions created by Inprise Corporation are
* Copyright (C) Inprise Corporation.
*
* All Rights Reserved.
* Contributor(s): ______________________________________.
*/
#include "firebird.h"
#include "../jrd/ib_stdio.h"
#include "../jrd/common.h"
#include "kanji.h"
#include "kanji_proto.h"
/*
 * S2E - convert one double-byte SJIS character to its two EUC bytes.
 *
 *   s1, s2   first and second SJIS byte (read only)
 *   j1, j2   lvalues receiving the first and second EUC byte
 *
 * A second byte >= 0x9f selects the "even" formula (j1 = s1*2 - base,
 * j2 = s2 + 2); anything lower selects the "odd" formula (base + 1, and
 * j2 = s2 + 0x60 or s2 + 0x61 depending on whether s2 is >= 0x7f).
 * The base is 0xe0 for first bytes >= 0xe0 and 0x60 otherwise.
 *
 * Every argument is parenthesized so expressions can be passed safely,
 * and the body is wrapped in do { } while (0) so that "S2E(...);" is a
 * single statement (usable in an unbraced if/else).  Arguments are
 * evaluated more than once, so they must not have side effects.
 */
#define S2E(s1, s2, j1, j2) \
do { \
    if ((s2) >= 0x9f) { \
        if ((s1) >= 0xe0) (j1) = ((s1) * 2 - 0xe0); \
        else (j1) = ((s1) * 2 - 0x60); \
        (j2) = ((s2) + 2); \
    } else { \
        if ((s1) >= 0xe0) (j1) = ((s1) * 2 - 0xe1); \
        else (j1) = ((s1) * 2 - 0x61); \
        if ((s2) >= 0x7f) (j2) = ((s2) + 0x60); \
        else (j2) = ((s2) + 0x61); \
    } \
} while (0)
USHORT KANJI_check_euc(UCHAR * euc_str, USHORT euc_len)
{
/**************************************
 *
 *	K A N J I _ c h e c k _ e u c
 *
 **************************************
 *
 * Functional description
 *	Counterpart of KANJI_check_sjis for EUC strings.
 *	Walk the first euc_len bytes of euc_str and report whether
 *	the string ends in the middle of a 2 byte character.
 *	Any byte with the high bit set is taken as the first byte of
 *	a 2 byte character; its second byte is skipped unexamined.
 *
 *	Returns 1 if the last character is truncated, 0 otherwise.
 *
 **************************************/
    const UCHAR *cursor = euc_str;
    USHORT remaining = euc_len;

    while (remaining > 0) {
        if (!(*cursor & 0x80)) {
            /* plain ASCII occupies a single byte */
            cursor++;
            remaining--;
            continue;
        }

        /* high bit set: a second byte must follow */
        if (remaining < 2)
            return (1);         /* truncated kanji */

        cursor += 2;
        remaining -= 2;
    }

    return (0);
}
USHORT KANJI_check_sjis(UCHAR * sjis_str, USHORT sjis_len)
{
/**************************************
 *
 *	K A N J I _ c h e c k _ s j i s
 *
 **************************************
 *
 * Functional description
 *	Counterpart of KANJI_check_euc for SJIS strings.
 *	Walk the first sjis_len bytes of sjis_str and report whether
 *	the string ends in the middle of a 2 byte character.
 *	Only bytes that have the high bit set AND satisfy SJIS1()
 *	start a 2 byte character; every other byte (ASCII or single
 *	byte kana) stands alone.  The second byte is not examined.
 *
 *	Returns 1 if the last character is truncated, 0 otherwise.
 *
 **************************************/
    while (sjis_len > 0) {
        UCHAR lead = *sjis_str;

        if ((lead & 0x80) && (SJIS1(lead))) {
            /* 2 byte kanji: needs one more byte after the lead */
            if (sjis_len < 2)
                return (1);     /* truncated KANJI */
            sjis_str += 2;
            sjis_len -= 2;
        }
        else {
            /* ASCII or single byte kana */
            sjis_str++;
            sjis_len--;
        }
    }

    return (0);
}
USHORT KANJI_euc2sjis(UCHAR * euc_str,
                      USHORT euc_len,
                      UCHAR * sjis_str,
                      USHORT sjis_buf_len, USHORT * sjis_len)
{
/**************************************
 *
 *	K A N J I _ e u c 2 s j i s
 *
 **************************************
 *
 * Functional description
 *	Convert euc_len bytes of euc_str to SJIS in sjis_str.
 *	sjis_buf_len is the capacity of the sjis buffer in bytes.
 *	*sjis_len is set to the number of bytes produced.
 *
 *	Returns 0 on success.  Returns 1 if the output buffer is
 *	full, the input ends inside a 2 byte character, or a byte
 *	fails the EUC1()/EUC2() checks.  When a kanji does not fit,
 *	*sjis_len has already been advanced by 2 at the point of
 *	return, so it is only meaningful on success.
 *
 **************************************/
    UCHAR c1, c2;

    *sjis_len = 0;
    while (euc_len) {
        /* Every input character produces at least one output byte,
           so the room check must guard ALL paths, the ASCII copy
           included; otherwise plain ASCII input longer than the
           buffer is written past its end. */
        if (*sjis_len >= sjis_buf_len)  /* buffer full */
            return (1);

        if (*euc_str & 0x80) {  /* Non-Ascii - High bit set */
            c1 = *euc_str++;
            euc_len--;
            if (!(EUC1(c1)))
                return (1);     /* It is some bad character */
            if (euc_len == 0)
                return (1);     /* truncated EUC */
            c2 = *euc_str++;
            euc_len--;
            if (!(EUC2(c2)))
                return (1);     /* Bad EUC */

            if (c1 == 0x8e) {   /* Kana: SJIS keeps only the 2nd byte */
                *sjis_len += 1;
                *sjis_str++ = c2;
            }
            else {              /* Kanji: two bytes out */
                *sjis_len += 2;
                if (*sjis_len > sjis_buf_len)   /* buffer full */
                    return (1);
                /* strip the high bits before remapping the pair */
                c1 ^= 0x80;
                c2 ^= 0x80;
                *sjis_str++ =
                    (c1 - 0x21) / 2 + ((c1 <= 0x5e) ? 0x81 : 0xc1);
                if (c1 & 1)     /* odd */
                    *sjis_str++ = c2 + ((c2 <= 0x5f) ? 0x1f : 0x20);
                else
                    *sjis_str++ = c2 + 0x7e;
            }
        }
        else {                  /* ASCII */
            euc_len--;
            *sjis_len += 1;
            *sjis_str++ = *euc_str++;
        }
    }

    return (0);
}
USHORT KANJI_euc_byte2short(UCHAR * src, USHORT * dst, USHORT len)
{
/**************************************
 *
 *	K A N J I _ e u c _ b y t e 2 s h o r t
 *
 **************************************
 *
 * Functional description
 *	Convert len bytes of EUC string in src (byte based buffer)
 *	into dst (short based buffer).
 *	This routine merges:
 *		1-byte ASCII into 1 short, and
 *		2-byte EUC kanji into 1 short (first byte in the
 *		high 8 bits, second byte in the low 8 bits).
 *	A byte satisfying EUC1() that is the very last input byte
 *	has no partner; it is stored on its own like a 1-byte
 *	character instead of reading beyond the input.
 *
 *	Returns the number of "characters" stored in dst.
 *
 **************************************/
    USHORT count = 0;

    while (len > 0) {
        USHORT x = 0;

        /* Only merge when the second byte really exists.  Blindly
           decrementing len here would wrap the unsigned counter
           on truncated input and run far past the buffer. */
        if ((EUC1(*src)) && len >= 2) {
            x = (USHORT) (*src++ << 8);
            len--;
        }

        x |= *src++;
        len--;

        *dst++ = x;
        count++;
    }

    return count;
}
USHORT KANJI_euc_len(UCHAR * sjis_str, USHORT sjis_len, USHORT * euc_len)
{
/**************************************
 *
 *	K A N J I _ e u c _ l e n
 *
 **************************************
 *
 * Functional description
 *	Compute in *euc_len how many EUC bytes the given SJIS
 *	string (sjis_len bytes at sjis_str) would occupy:
 *		ASCII			1 byte
 *		single byte kana	2 bytes
 *		2 byte kanji		2 bytes
 *
 *	Returns 0 on success, 1 as soon as a truncated or invalid
 *	character is met (*euc_len then holds the count so far).
 *
 **************************************/
    UCHAR *in = sjis_str;
    USHORT left = sjis_len;

    *euc_len = 0;

    while (left != 0) {
        UCHAR lead = *in++;
        left--;

        if (!(lead & 0x80)) {
            /* ASCII keeps its size */
            *euc_len += 1;
            continue;
        }

        if (SJIS1(lead)) {
            UCHAR trail;

            if (left == 0)
                return (1);     /* truncated KANJI */
            trail = *in++;
            left--;
            if (!(SJIS2(trail)))
                return (1);     /* Bad second byte */
            *euc_len += 2;      /* Good Kanji */
            continue;
        }

        if (!(SJIS_SINGLE(lead)))
            return (1);         /* It is some bad character */

        *euc_len += 2;          /* Kana */
    }

    return (0);
}
USHORT KANJI_sjis2euc(UCHAR * sjis_str,
                      USHORT sjis_len,
                      UCHAR * euc_str, USHORT euc_buf_len, USHORT * euc_len)
{
/**************************************
 *
 *	K A N J I _ s j i s 2 e u c
 *
 **************************************
 *
 * Functional description
 *	Convert sjis_len bytes of sjis_str to EUC in euc_str, which
 *	can hold euc_buf_len bytes.  *euc_len is set to the number
 *	of bytes produced.
 *
 *	Returns 0 on success.  Returns 1 if the output buffer is
 *	full, the input ends inside a 2 byte character, or a byte
 *	is not valid SJIS.  For 2 byte output *euc_len is advanced
 *	before the room check, so it is only meaningful on success.
 *
 **************************************/
    *euc_len = 0;

    while (sjis_len > 0) {
        UCHAR lead;

        if (*euc_len >= euc_buf_len)    /* buffer full */
            return (1);

        if (!(*sjis_str & 0x80)) {
            /* ASCII is copied through unchanged */
            *euc_len += 1;
            sjis_len--;
            *euc_str++ = *sjis_str++;
            continue;
        }

        /* Non-Ascii - High bit set */
        lead = *sjis_str++;
        sjis_len--;

        if (SJIS1(lead)) {      /* First byte is a KANJI */
            UCHAR trail;

            if (sjis_len == 0)
                return (1);     /* truncated KANJI */
            trail = *sjis_str++;
            sjis_len--;
            if (!(SJIS2(trail)))
                return (1);     /* Bad second byte */

            *euc_len += 2;      /* Good Kanji */
            if (*euc_len > euc_buf_len) /* buffer full */
                return (1);
            S2E(lead, trail, euc_str[0], euc_str[1]);
            euc_str += 2;
        }
        else if (SJIS_SINGLE(lead)) {
            *euc_len += 2;      /* Kana */
            if (*euc_len > euc_buf_len) /* buffer full */
                return (1);
            /* EUC kana = 0x8e prefix + the SJIS byte itself */
            euc_str[0] = 0x8e;
            euc_str[1] = lead;
            euc_str += 2;
        }
        else {
            return (1);         /* It is some bad character */
        }
    }

    return (0);
}
USHORT KANJI_sjis_byte2short(UCHAR * src, USHORT * dst, USHORT len)
{
/**************************************
 *
 *	K A N J I _ s j i s _ b y t e 2 s h o r t
 *
 **************************************
 *
 * Functional description
 *	Convert len bytes of SJIS string in src (byte based buffer)
 *	into dst (short based buffer).
 *	This routine merges:
 *		1-byte ASCII into 1 short,
 *		1-byte SJIS kana into 1 short, and
 *		2-byte SJIS kanji into 1 short (first byte in the
 *		high 8 bits, second byte in the low 8 bits).
 *	A byte satisfying SJIS1() that is the very last input byte
 *	has no partner; it is stored on its own like a 1-byte
 *	character instead of reading beyond the input.
 *
 *	Returns the number of "characters" stored in dst.
 *
 **************************************/
    USHORT count = 0;

    while (len > 0) {
        USHORT x = 0;

        /* Only merge when the second byte really exists.  Blindly
           decrementing len here would wrap the unsigned counter
           on truncated input and run far past the buffer. */
        if ((SJIS1(*src)) && len >= 2) {
            x = (USHORT) (*src++ << 8);
            len--;
        }

        x |= *src++;
        len--;

        *dst++ = x;
        count++;
    }

    return count;
}
USHORT KANJI_sjis2euc5(UCHAR * sjis_str,
                       USHORT sjis_len,
                       UCHAR * euc_str,
                       USHORT euc_buf_len,
                       USHORT * euc_len, USHORT * ib_sjis, USHORT * ib_euc)
{
/**************************************
 *
 *	K A N J I _ s j i s 2 e u c 5
 *
 **************************************
 *
 * Functional description
 *	Variant of KANJI_sjis2euc() with distinct error codes and
 *	progress counters:
 *		returns 0	everything converted
 *		returns 1	output buffer is full
 *		returns 2	truncated or invalid SJIS input
 *	*ib_sjis and *ib_euc receive the number of SJIS bytes
 *	consumed and EUC bytes produced for the characters that
 *	were converted completely; they are updated only after a
 *	character has been written.  *euc_len, by contrast, is
 *	advanced before the room check for 2 byte output.
 *
 *	Intended for converting blob segments from SJIS to EUC on
 *	retrieval; the in-bound counters let the caller keep track
 *	of fragments split from segments.
 *
 **************************************/
    *euc_len = 0;
    *ib_euc = 0;
    *ib_sjis = 0;

    while (sjis_len > 0) {
        UCHAR lead;

        if (*euc_len >= euc_buf_len)    /* buffer full */
            return (1);

        if (!(*sjis_str & 0x80)) {
            /* ASCII is copied through unchanged */
            *euc_len += 1;
            sjis_len--;
            *euc_str++ = *sjis_str++;
            (*ib_sjis)++;
            (*ib_euc)++;
            continue;
        }

        /* Non-Ascii - High bit set */
        lead = *sjis_str++;
        sjis_len--;

        if (SJIS1(lead)) {      /* First byte is a KANJI */
            UCHAR trail;

            if (sjis_len == 0)
                return (2);     /* truncated KANJI */
            trail = *sjis_str++;
            sjis_len--;
            if (!(SJIS2(trail)))
                return (2);     /* Bad second byte */

            *euc_len += 2;      /* Good Kanji */
            if (*euc_len > euc_buf_len) /* buffer full */
                return (1);
            S2E(lead, trail, euc_str[0], euc_str[1]);
            euc_str += 2;
            *ib_sjis += 2;
            *ib_euc += 2;
        }
        else if (SJIS_SINGLE(lead)) {
            *euc_len += 2;      /* Kana */
            if (*euc_len > euc_buf_len) /* buffer full */
                return (1);
            /* EUC kana = 0x8e prefix + the SJIS byte itself */
            euc_str[0] = 0x8e;
            euc_str[1] = lead;
            euc_str += 2;
            (*ib_sjis)++;
            *ib_euc += 2;
        }
        else {
            return (2);         /* It is some bad character */
        }
    }

    return (0);
}
USHORT KANJI_sjis_len(UCHAR * euc_str, USHORT euc_len, USHORT * sjis_len)
{
/**************************************
 *
 *	K A N J I _ s j i s _ l e n
 *
 **************************************
 *
 * Functional description
 *	Compute in *sjis_len how many SJIS bytes the given EUC
 *	string (euc_len bytes at euc_str) would occupy:
 *		ASCII				1 byte
 *		kana (first byte 0x8e)		1 byte
 *		any other 2 byte character	2 bytes
 *
 *	Returns 0 on success, 1 as soon as a truncated or invalid
 *	character is met (*sjis_len then holds the count so far).
 *
 **************************************/
    *sjis_len = 0;

    while (euc_len > 0) {
        UCHAR lead = *euc_str++;
        UCHAR trail;

        euc_len--;

        if (!(lead & 0x80)) {
            /* ASCII keeps its size */
            *sjis_len += 1;
            continue;
        }

        /* Non-Ascii - High bit set */
        if (!(EUC1(lead)))
            return (1);         /* It is some bad character */
        if (euc_len == 0)
            return (1);         /* truncated EUC */

        trail = *euc_str++;
        euc_len--;
        if (!(EUC2(trail)))
            return (1);         /* Bad EUC */

        /* Kana shrinks to one byte, Kanji stays at two */
        *sjis_len += (lead == 0x8e) ? 1 : 2;
    }

    return (0);
}