firebird-mirror/src/intl/kanji.cpp

/*
 *	PROGRAM:	JRD Access Method
 *	MODULE:		kanji.c
 *	DESCRIPTION:
 *
 * The contents of this file are subject to the Interbase Public
 * License Version 1.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy
 * of the License at http://www.Inprise.com/IPL.html
 *
 * Software distributed under the License is distributed on an
 * "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code was created by Inprise Corporation
 * and its predecessors. Portions created by Inprise Corporation are
 * Copyright (C) Inprise Corporation.
 *
 * All Rights Reserved.
 * Contributor(s): ______________________________________.
 */

#include "firebird.h"
#include "../jrd/ib_stdio.h"
#include "../jrd/common.h"
#include "kanji.h"
#include "kanji_proto.h"


/*
 * S2E - turn the two byte values of one Shift-JIS kanji into two EUC bytes.
 *
 *	s1, s2	SJIS first and second byte values.  They are read several
 *		times, so pass expressions without side effects.
 *	j1, j2	lvalues that receive the EUC first and second byte.
 *
 * When s2 is 0x9f or above the first byte becomes 2*s1 - 0x60 (2*s1 - 0xe0
 * for s1 >= 0xe0) and the second byte is s2 + 2.  Otherwise the first byte
 * is one less than that and the second byte is s2 + 0x61, or s2 + 0x60 once
 * s2 has reached 0x7f.  No validation is performed; callers check the
 * bytes before invoking the macro.
 *
 * Every argument is parenthesized so that compound expressions expand
 * correctly, and the body is wrapped in do/while(0) so that "S2E(...);"
 * behaves as one statement, including inside an unbraced if/else.
 */
#define S2E(s1, s2, j1, j2) \
do { \
  if ((s2) >= 0x9f) { \
    if ((s1) >= 0xe0) (j1) = ((s1) * 2 - 0xe0); \
    else (j1) = ((s1) * 2 - 0x60); \
    (j2) = ((s2) + 2); \
  } else { \
    if ((s1) >= 0xe0) (j1) = ((s1) * 2 - 0xe1); \
    else (j1) = ((s1) * 2 - 0x61); \
    if ((s2) >= 0x7f) (j2) = ((s2) + 0x60); \
    else (j2) = ((s2) + 0x61); \
  } \
} while (0)


USHORT KANJI_check_euc(UCHAR * euc_str, USHORT euc_len)
{
/**************************************
 *
 *      K A N J I _ c h e c k _ e u c
 *
 **************************************
 *
 * Functional description
 *	Companion of KANJI_check_sjis for EUC text.
 *	Scan the euc_len bytes at euc_str and report whether the
 *	string stops in the middle of a two-byte character.
 *
 *	Any byte with the high bit set is taken as the start of a
 *	two-byte character; the byte following it is skipped without
 *	being inspected.  Every other byte counts as one ASCII
 *	character.
 *
 *	Returns 1 when the final character is truncated,
 *	0 otherwise (including for an empty string).
 *
 **************************************/
	USHORT pos = 0;

	while (pos < euc_len) {
		if (!(euc_str[pos] & 0x80)) {
			/* plain ASCII byte */
			pos++;
			continue;
		}

		/* two-byte character: both bytes must be inside the string */
		if (euc_len - pos < 2)
			return (1);
		pos += 2;
	}

	return (0);
}


USHORT KANJI_check_sjis(UCHAR * sjis_str, USHORT sjis_len)
{
/**************************************
 *
 *      K A N J I _ c h e c k _ s j i s
 *
 **************************************
 *
 * Functional description
 *	Companion of KANJI_check_euc for Shift-JIS text.
 *	Walk the sjis_len bytes at sjis_str and report whether the
 *	string stops in the middle of a two-byte character.
 *
 *	A high-bit byte for which SJIS1() holds starts a two-byte
 *	kanji; the byte following it is skipped without being
 *	inspected.  Any other high-bit byte is treated as a one-byte
 *	kana, and bytes without the high bit as one-byte ASCII.
 *
 *	Returns 1 when the final character is truncated,
 *	0 otherwise (including for an empty string).
 *
 **************************************/

	while (sjis_len--) {
		/* SJIS1 is handed a side-effect free operand and the test is
		   wrapped in its own parentheses, so the code does not depend
		   on how the macro expands or how often it reads its argument */
		if ((*sjis_str & 0x80) && (SJIS1(*sjis_str))) {
			if (sjis_len == 0)	/* truncated KANJI */
				return (1);
			sjis_str += 2;
			sjis_len -= 1;
		}
		else {					/* one-byte kana or ASCII */
			sjis_str++;
		}
	}
	return (0);
}


USHORT KANJI_euc2sjis(UCHAR * euc_str,
					  USHORT euc_len,
					  UCHAR * sjis_str,
					  USHORT sjis_buf_len, USHORT * sjis_len)
{
/**************************************
 *
 *      K A N J I _ e u c 2 s j i s
 *
 **************************************
 *
 * Functional description
 *      Convert euc_len bytes of euc_str to Shift-JIS in sjis_str.
 *	sjis_buf_len is the capacity of the sjis buffer in bytes.
 *	*sjis_len is set to the number of bytes put in sjis_str.
 *
 *	Bytes without the high bit are copied unchanged.  A first
 *	byte accepted by EUC1() must be followed by a byte accepted
 *	by EUC2(); the pair becomes one kana byte when the first
 *	byte is 0x8e and a two-byte kanji otherwise.
 *
 *	Returns 0 on success.  Returns 1 when the output buffer is
 *	full, the input ends inside a two-byte character, or a byte
 *	is rejected by EUC1()/EUC2().  When the buffer is too small
 *	for a kanji, *sjis_len has already been advanced by 2
 *	although nothing was stored for that character.
 *
 **************************************/
	UCHAR c1, c2;

	*sjis_len = 0;
	while (euc_len) {
		/* Every iteration stores at least one byte, so the capacity
		   check has to guard the ASCII path as well as the
		   multi-byte paths. */
		if (*sjis_len >= sjis_buf_len)	/*buffer full */
			return (1);

		c1 = *euc_str++;
		euc_len--;

		if (!(c1 & 0x80)) {		/* ASCII */
			*sjis_len += 1;
			*sjis_str++ = c1;
			continue;
		}

		if (!(EUC1(c1)))		/* It is some bad character */
			return (1);
		if (euc_len == 0)
			return (1);			/* truncated EUC */
		c2 = *euc_str++;
		euc_len--;
		if (!(EUC2(c2)))
			return (1);			/* Bad EUC */

		if (c1 == 0x8e) {		/* Kana: only the second byte is kept */
			*sjis_len += 1;
			*sjis_str++ = c2;
		}
		else {					/* Kanji */
			*sjis_len += 2;
			if (*sjis_len > sjis_buf_len)	/*buffer full */
				return (1);
			/* toggle bit 7 of both bytes before the arithmetic below */
			c1 ^= 0x80;
			c2 ^= 0x80;
			/* two consecutive first-byte values share one SJIS first
			   byte; values above 0x5e use the 0xc1 base instead of 0x81 */
			*sjis_str++ =
				(c1 - 0x21) / 2 + ((c1 <= 0x5e) ? 0x81 : 0xc1);
			if (c1 & 1)	/* odd: second byte offset steps up past 0x5f */
				*sjis_str++ = c2 + ((c2 <= 0x5f) ? 0x1f : 0x20);
			else		/* even */
				*sjis_str++ = c2 + 0x7e;
		}
	}
	return (0);
}


USHORT KANJI_euc_byte2short(UCHAR * src, USHORT * dst, USHORT len)
{
/**************************************
 *
 *      K A N J I _ e u c _ b y t e 2 s h o r t
 *
 **************************************
 *
 * Functional description
 *      Repack len bytes of EUC text from src (byte buffer)
 *	into dst (USHORT buffer), one element per character:
 *		a byte rejected by EUC1() is stored as is, and
 *		a byte accepted by EUC1() is stored in the high
 *		half of the element with the next byte in the
 *		low half.
 *	If the input ends right after a byte accepted by EUC1(),
 *	that byte is stored alone; the routine never reads past
 *	src + len.
 *	dst must have room for one element per input character;
 *	its capacity is not checked here.
 *	Returns the number of elements written to dst.
 *
 **************************************/
	USHORT count = 0;

	while (len > 0) {
		const UCHAR first = *src++;
		USHORT packed = first;

		len--;
		/* Take the second byte only if it exists: decrementing len
		   below zero would wrap the unsigned counter and keep the
		   loop running far beyond the end of the input. */
		if ((EUC1(first)) && len > 0) {
			packed = (USHORT) ((first << 8) | *src++);
			len--;
		}
		*dst++ = packed;
		count++;
	}
	return count;
}


USHORT KANJI_euc_len(UCHAR * sjis_str, USHORT sjis_len, USHORT * euc_len)
{
/**************************************
 *
 *      K A N J I _ e u c _ l e n
 *
 **************************************
 *
 * Functional description
 *      Compute in *euc_len how many EUC bytes the sjis_len bytes
 *	at sjis_str would occupy after conversion:
 *		ASCII byte			1 byte
 *		one-byte kana (SJIS_SINGLE)	2 bytes
 *		two-byte kanji (SJIS1 + SJIS2)	2 bytes
 *	Returns 0 on success.  Returns 1 on a truncated kanji, a
 *	second byte rejected by SJIS2(), or a high-bit byte that is
 *	neither a kanji first byte nor a kana; *euc_len then holds
 *	the total for the characters accepted so far.
 *
 **************************************/

	UCHAR lead, trail;

	*euc_len = 0;
	while (sjis_len > 0) {
		lead = *sjis_str++;
		sjis_len--;

		if (!(lead & 0x80)) {
			/* ASCII maps to a single byte */
			*euc_len += 1;
			continue;
		}

		if (SJIS1(lead)) {
			/* kanji: needs a valid second byte */
			if (sjis_len == 0)
				return (1);
			trail = *sjis_str++;
			sjis_len--;
			if (!(SJIS2(trail)))
				return (1);
			*euc_len += 2;
		}
		else if (SJIS_SINGLE(lead)) {
			/* kana grows to two bytes in EUC */
			*euc_len += 2;
		}
		else {
			/* unrecognised high-bit byte */
			return (1);
		}
	}
	return (0);
}


USHORT KANJI_sjis2euc(UCHAR * sjis_str,
					  USHORT sjis_len,
					  UCHAR * euc_str, USHORT euc_buf_len, USHORT * euc_len)
{
/**************************************
 *
 *      K A N J I _ s j i s 2 e u c
 *
 **************************************
 *
 * Functional description
 *      Convert sjis_len bytes of sjis_str to EUC in euc_str, which
 *	can hold euc_buf_len bytes.  *euc_len is set to the number
 *	of bytes put in euc_str.
 *
 *	ASCII bytes are copied unchanged, a one-byte kana is written
 *	as 0x8e followed by the kana byte, and a two-byte kanji is
 *	translated with S2E.
 *
 *	Returns 0 on success.  Returns 1 when the output buffer is
 *	full or the input holds a truncated or invalid character.
 *	When the buffer is too small for a two-byte result,
 *	*euc_len has already been advanced by 2 although nothing
 *	was stored for that character.
 *
 **************************************/

	UCHAR lead, trail;

	*euc_len = 0;
	while (sjis_len > 0) {
		/* each character needs at least one free output byte */
		if (*euc_len >= euc_buf_len)
			return (1);

		lead = *sjis_str++;
		sjis_len--;

		if (!(lead & 0x80)) {
			/* ASCII */
			*euc_len += 1;
			*euc_str++ = lead;
			continue;
		}

		if (SJIS1(lead)) {
			/* kanji: fetch and validate the second byte */
			if (sjis_len == 0)
				return (1);
			trail = *sjis_str++;
			sjis_len--;
			if (!(SJIS2(trail)))
				return (1);

			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);
			S2E(lead, trail, euc_str[0], euc_str[1]);
			euc_str += 2;
		}
		else if (SJIS_SINGLE(lead)) {
			/* kana: prefixed with 0x8e on the EUC side */
			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);
			*euc_str++ = 0x8e;
			*euc_str++ = lead;
		}
		else {
			/* unrecognised high-bit byte */
			return (1);
		}
	}
	return (0);
}


USHORT KANJI_sjis_byte2short(UCHAR * src,
							 USHORT * dst, USHORT len)
{
/**************************************
 *
 *      K A N J I _ s j i s _ b y t e 2 s h o r t
 *
 **************************************
 *
 * Functional description
 *      Repack len bytes of SJIS text from src (byte buffer)
 *	into dst (USHORT buffer), one element per character:
 *		a byte rejected by SJIS1() (ASCII or one-byte
 *		kana) is stored as is, and
 *		a byte accepted by SJIS1() is stored in the high
 *		half of the element with the next byte in the
 *		low half.
 *	If the input ends right after a byte accepted by SJIS1(),
 *	that byte is stored alone; the routine never reads past
 *	src + len.
 *	dst must have room for one element per input character;
 *	its capacity is not checked here.
 *	Returns the number of elements written to dst.
 *
 **************************************/
	USHORT count = 0;

	while (len > 0) {
		const UCHAR first = *src++;
		USHORT packed = first;

		len--;
		/* Take the second byte only if it exists: decrementing len
		   below zero would wrap the unsigned counter and keep the
		   loop running far beyond the end of the input. */
		if ((SJIS1(first)) && len > 0) {
			packed = (USHORT) ((first << 8) | *src++);
			len--;
		}
		*dst++ = packed;
		count++;
	}
	return count;
}


USHORT KANJI_sjis2euc5(UCHAR * sjis_str,
					   USHORT sjis_len,
					   UCHAR * euc_str,
					   USHORT euc_buf_len,
					   USHORT * euc_len, USHORT * ib_sjis, USHORT * ib_euc)
{
/**************************************
 *
 *      K A N J I _ s j i s 2 e u c 5
 *
 **************************************
 *
 * Functional description
 *	Variant of KANJI_sjis2euc() with distinct error codes and
 *	progress counters:
 *	(1) returns 1 when the output buffer is full,
 *	(2) returns 2 on a truncated or invalid character,
 *	(3) returns 0 when the whole input was converted.
 *
 *	*ib_sjis receives the number of input bytes belonging to
 *	characters that were completely converted, and *ib_euc the
 *	number of output bytes stored for them.  Both are updated
 *	only after a character has been written, whereas *euc_len
 *	is advanced before the capacity check for two-byte results
 *	and can therefore be 2 ahead of *ib_euc on a buffer-full
 *	return.
 *
 *	Intended for converting blob segments from SJIS to EUC on
 *	retrieval; the counters let the caller keep track of
 *	fragments split from segments.
 *
 **************************************/
	UCHAR lead, trail;

	*euc_len = 0;
	*ib_euc = 0;
	*ib_sjis = *ib_euc;

	while (sjis_len > 0) {
		/* each character needs at least one free output byte */
		if (*euc_len >= euc_buf_len)
			return (1);

		lead = *sjis_str++;
		sjis_len--;

		if (!(lead & 0x80)) {
			/* ASCII: one byte in, one byte out */
			*euc_len += 1;
			*euc_str++ = lead;
			(*ib_sjis)++;
			(*ib_euc)++;
			continue;
		}

		if (SJIS1(lead)) {
			/* kanji: fetch and validate the second byte */
			if (sjis_len == 0)
				return (2);
			trail = *sjis_str++;
			sjis_len--;
			if (!(SJIS2(trail)))
				return (2);

			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);
			S2E(lead, trail, euc_str[0], euc_str[1]);
			euc_str += 2;
			*ib_sjis += 2;
			*ib_euc += 2;
		}
		else if (SJIS_SINGLE(lead)) {
			/* kana: one byte in, 0x8e plus the kana byte out */
			*euc_len += 2;
			if (*euc_len > euc_buf_len)
				return (1);
			*euc_str++ = 0x8e;
			*euc_str++ = lead;
			(*ib_sjis)++;
			*ib_euc += 2;
		}
		else {
			/* unrecognised high-bit byte */
			return (2);
		}
	}
	return (0);
}


USHORT KANJI_sjis_len(UCHAR * euc_str, USHORT euc_len, USHORT * sjis_len)
{
/**************************************
 *
 *      K A N J I _ s j i s _ l e n
 *
 **************************************
 *
 * Functional description
 *      Compute in *sjis_len how many SJIS bytes the euc_len bytes
 *	at euc_str would occupy after conversion:
 *		ASCII byte			1 byte
 *		0x8e + kana byte		1 byte
 *		any other two-byte character	2 bytes
 *	Returns 0 on success.  Returns 1 on a truncated two-byte
 *	character or a byte rejected by EUC1()/EUC2(); *sjis_len
 *	then holds the total for the characters accepted so far.
 *
 **************************************/

	UCHAR lead, trail;

	*sjis_len = 0;
	while (euc_len > 0) {
		lead = *euc_str++;
		euc_len--;

		if (!(lead & 0x80)) {
			/* ASCII stays one byte */
			*sjis_len += 1;
			continue;
		}

		if (!(EUC1(lead)))
			return (1);			/* unrecognised high-bit byte */
		if (euc_len == 0)
			return (1);			/* second byte is missing */

		trail = *euc_str++;
		euc_len--;
		if (!(EUC2(trail)))
			return (1);			/* invalid second byte */

		/* kana shrinks to one byte, kanji keeps two */
		*sjis_len += (lead == 0x8e) ? 1 : 2;
	}
	return (0);
}