firebird-mirror/src/intl/lc_narrow.c

/*
 *	PROGRAM:	InterBase International support
 *	MODULE:		lc_narrow.c
 *	DESCRIPTION:	Common base for Narrow language drivers
 *			(full International collation)
 *
 * The contents of this file are subject to the Interbase Public
 * License Version 1.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy
 * of the License at http://www.Inprise.com/IPL.html
 *
 * Software distributed under the License is distributed on an
 * "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code was created by Inprise Corporation
 * and its predecessors. Portions created by Inprise Corporation are
 * Copyright (C) Inprise Corporation.
 *
 * All Rights Reserved.
 * Contributor(s): ______________________________________.
 */


#include "../intl/ldcommon.h"

USHORT LC_NARROW_key_length();
USHORT LC_NARROW_string_to_key();
SSHORT LC_NARROW_compare();


/*
 * Generic base for InterBase 4.0 Language Driver
 */


#define LANGFAM2_MAX_KEY     	(MAX_KEY)

#define ASCII_SPACE	32
#define	NULL_WEIGHT	0
#define	NULL_SECONDARY	0
#define	NULL_TERTIARY	0


/*
 * key_length (obj, inLen)
 *
 * For an input string of (inLen) bytes, return the maximum
 * key buffer length.
 *
 * This is used for index buffer allocation within the
 * Engine.
 *
 *  Note:
 *	Strings containing (m) expand chars need 2*(m)*3
 *	bytes for key values of the expansion.  This is
 *	offset by string values that don't have corresponding
 *	secondary or tertiary key values.
 *	Let:
 *		n = length of input string
 *		np = count of "simple" bytes in the string, alphabetic
 *		     no secondary and no tertiary.
 *		ns = Has secondary or tertiary, but not both
 *		nt = Has tertiary and secondary.
 *		nc = is a COMPRESSED value.
 *		ne = Has an EXPAND value.
 *		nsp = Is a special value.
 *
 *		n = np + ns + nt + nc + ne + nsp
 *
 *		Key_length(n) =
 *			np
 *			+ 2 * ns
 *			+ 3 * nt
 *			+ 3 * (nc/2)
 *			+ 3 * 2 * ne
 *			+ 2 * nsp
 *			+ 1 (if nsp > 0, for separating keys from special keys)
 *
 *	Clearly this is maximized when the string consists solely of
 *	EXPAND characters.  This degenerate case doesn't occur in
 *	standard text usage, except for short strings (1-2 characters).
 *
 *	Therefore, we compute the keylength based on the "normal" case
 *	of the (nt) term.  It is likely we could choose a probabilistic value
 *	(such as 2.5 * n) for the length of the key.
 *
 *	The degenerate case of short strings is handled by a minimal key
 *	length.
 */
USHORT LC_NARROW_key_length(obj, inLen)
	 TEXTTYPE obj;
	 USHORT inLen;
{
	USHORT len;
/* assert (inLen <= LANGFAM2_MAX_KEY); *//* almost certainly an error */
	len = 3 * MAX(inLen, 2);
	return (MIN(len, LANGFAM2_MAX_KEY));
}


#ifdef DEBUG
#include "../jrd/ib_stdio.h"
static ULONG do_debug = 0;
#endif

/*
 *
 *  Convert a user string to a sequence that will collate bytewise.
 *
 * RETURN:
 *		Length, in bytes, of returned key
 */
USHORT LC_NARROW_string_to_key(obj, iInLen, pInChar, iOutLen, pOutChar,
							   partial)
	 TEXTTYPE obj;
	 USHORT iInLen;
	 BYTE *pInChar;
	 USHORT iOutLen;
	 BYTE *pOutChar;
	 USHORT partial;
{
	USHORT lprimary;
	BYTE secondary[LANGFAM2_MAX_KEY];
	USHORT lsecondary;
	BYTE tertiary[LANGFAM2_MAX_KEY];
	USHORT ltertiary;
	BYTE special[LANGFAM2_MAX_KEY * 2];
	USHORT lspecial;

	USHORT i, j;

	BYTE *outbuff;
	BYTE *inbuff;
	struct SortOrderTblEntry *coll;

	assert(pOutChar != NULL);
	assert(pInChar != NULL);
/* assert (iInLen   <= LANGFAM2_MAX_KEY); */
	assert(iOutLen <= LANGFAM2_MAX_KEY);
	assert(iOutLen >= LC_NARROW_key_length(obj, iInLen));

#ifdef DEBUG
/* Dump out the input string */
	if (do_debug) {
		ib_printf("string: (%02d) '%*s'\n", iInLen, iInLen, pInChar);
		ib_fflush(ib_stdout);
	};
#endif	/* DEBUG */

	outbuff = pOutChar;
	lprimary = 0;
	lsecondary = 0;
	ltertiary = 0;
	lspecial = 0;

/* point inbuff at last character */
	inbuff = pInChar + iInLen - 1;

/* skip backwards over all spaces & reset input length */
	while ((inbuff >= pInChar) && (*inbuff == ASCII_SPACE))
		inbuff--;
	iInLen = (inbuff - pInChar + 1);

	for (i = 0; i < iInLen; i++, pInChar++) {
		assert(lprimary < iOutLen);
		assert(lsecondary < sizeof(secondary));
		assert(ltertiary < sizeof(tertiary));
		assert(lspecial < sizeof(special));

		coll =
			&((struct SortOrderTblEntry *) obj->
			  texttype_collation_table)[*pInChar];
		if (!(coll->IsExpand || coll->IsCompress)) {
			if (coll->Primary != NULL_WEIGHT)
				outbuff[lprimary++] = coll->Primary;
			if (coll->Secondary != NULL_SECONDARY)
				secondary[lsecondary++] = coll->Secondary;
			if (coll->Tertiary != NULL_TERTIARY)
				tertiary[ltertiary++] = coll->Tertiary;
		}
		else if (coll->IsExpand && coll->IsCompress) {
			/* Both flags set indicate a special value */
			if ((coll->Primary != NULL_WEIGHT) &&
				!(obj->texttype_flags & TEXTTYPE_ignore_specials)) {
				special[lspecial++] = (i + 1);	/* position */
				special[lspecial++] = coll->Primary;
			};
		}
		else if (coll->IsExpand) {
			struct ExpandChar *exp;
			exp = &((struct ExpandChar *) obj->texttype_expand_table)[0];
			while (exp->Ch && exp->Ch != *pInChar)
				exp++;
			assert(exp->Ch == *pInChar);
			for (j = 0; j < 2; j++) {
				if (j)
					coll =
						&((struct SortOrderTblEntry *) obj->
						  texttype_collation_table)[exp->ExpCh2];
				if (coll->Primary != NULL_WEIGHT)
					outbuff[lprimary++] = coll->Primary;
				if (coll->Secondary != NULL_SECONDARY)
					secondary[lsecondary++] = coll->Secondary;
				if (coll->Tertiary != NULL_TERTIARY)
					tertiary[ltertiary++] = coll->Tertiary;
			}
		}
		else {					/* (col->IsCompress) */

			struct CompressPair *cmp;
			if ((USHORT) (i + 1) < iInLen) {
				cmp =
					&((struct CompressPair *) obj->
					  texttype_compress_table)[0];
				while (cmp->CharPair[0]) {
					if ((cmp->CharPair[0] == *pInChar) &&
						(cmp->CharPair[1] == *(pInChar + 1))) {

						/* Gobble the two-to-1 entry */
						coll = &cmp->NoCaseWeight;
						pInChar++;
						i++;
						break;
					}
					cmp++;
				}
			}
			if (coll->Primary != NULL_WEIGHT)
				outbuff[lprimary++] = coll->Primary;
			if (coll->Secondary != NULL_SECONDARY)
				secondary[lsecondary++] = coll->Secondary;
			if (coll->Tertiary != NULL_TERTIARY)
				tertiary[ltertiary++] = coll->Tertiary;
		}
	};

/* primary keys are already in output key */

	outbuff += lprimary;
	iOutLen -= lprimary;

	if (partial)
		/* return length of key */
		return (outbuff - pOutChar);

/* put secondary keys into output key */
	if (obj->texttype_flags & TEXTTYPE_reverse_secondary) {
		for (i = 0; i < lsecondary && iOutLen; i++) {
			*outbuff++ = secondary[lsecondary - i - 1];
			iOutLen--;
		}
	}
	else {
		for (i = 0; i < lsecondary && iOutLen; i++) {
			*outbuff++ = secondary[i];
			iOutLen--;
		};
	};

/* put tertiary keys into output key */
	for (i = 0; i < ltertiary && iOutLen; i++) {
		*outbuff++ = tertiary[i];
		iOutLen--;
	};

/* put special keys into output key */
	if ((lspecial && iOutLen) &&
		!(obj->texttype_flags & TEXTTYPE_ignore_specials)) {
		/* Insert the marker-byte */
		*outbuff++ = 0;
		iOutLen--;
		for (i = 0; i < lspecial && iOutLen; i++) {
			*outbuff++ = special[i];
			iOutLen--;
		};
	};

#ifdef DEBUG
/* Dump out the computed key */
	if (do_debug) {
		UCHAR *p;
		ib_printf("   key: (%02d) ", (outbuff - pOutChar));
		for (p = pOutChar; p < outbuff; p++)
			ib_printf("%2x ", *p);
		ib_printf("\n");
		ib_fflush(ib_stdout);
	};
#endif

/* return length of key */
	return (outbuff - pOutChar);
}


#define	HAVE_WAITING	1
#define HAVE_SPECIAL	2

/* Expansion characters go before their expansion. */
/* e.g.: S-set collates before ss */

/* Scan state for one string, carried between calls to get_coltab_entry() */
typedef struct coltab_status {
	USHORT stat_flags;	/* HAVE_WAITING and/or HAVE_SPECIAL bits */
	struct SortOrderTblEntry *stat_waiting;	/* entry handed out by the next call while HAVE_WAITING is set */
} *COLSTAT;


/*
 * Compare two strings on their special characters only (collation table
 * entries with both IsExpand and IsCompress set).
 *
 * Both strings are walked in step from one special character to the next.
 * Returns 0 when neither string has any special left, +/-1000 when only
 * one of them still has one, +/-2000 when the next specials sit at
 * different offsets (the earlier one sorts first), and otherwise the
 * difference of the primary weights of the first specials that differ.
 */
static SSHORT special_scan(obj, l1, s1, l2, s2)
	 TEXTTYPE obj;
	 USHORT l1;
	 BYTE *s1;
	 USHORT l2;
	 BYTE *s2;
{
	struct SortOrderTblEntry *table;
	struct SortOrderTblEntry *entry1;
	struct SortOrderTblEntry *entry2;
	USHORT pos1 = 0;
	USHORT pos2 = 0;

	table = (struct SortOrderTblEntry *) obj->texttype_collation_table;

	for (;;) {
		/* Advance string 1 to its next special character, if any */
		for (; l1; l1--, s1++, pos1++) {
			entry1 = &table[*s1];
			if (entry1->IsExpand && entry1->IsCompress)
				break;
		}

		/* Advance string 2 to its next special character, if any */
		for (; l2; l2--, s2++, pos2++) {
			entry2 = &table[*s2];
			if (entry2->IsExpand && entry2->IsCompress)
				break;
		}

		if (!l1)				/* string 1 has no specials left */
			return (l2 ? -1000 : 0);
		if (!l2)				/* only string 2 ran out */
			return 1000;

		if (pos1 != pos2)		/* the earlier special sorts first */
			return ((pos1 < pos2) ? -2000 : 2000);

		if (entry1->Primary != entry2->Primary)
			return (entry1->Primary - entry2->Primary);

		/* Same special at the same offset: step past it in both strings */
		l1--;
		s1++;
		pos1++;
		l2--;
		s2++;
		pos2++;
	}
}


/*
 * Fetch the next collation entry from the string at (*p), which has (*l)
 * bytes remaining, advancing both as input is consumed.
 *
 *  - Special characters (IsExpand and IsCompress both set) are skipped;
 *    HAVE_SPECIAL is recorded in stat->stat_flags.
 *  - An expanding character returns its own entry first WITHOUT consuming
 *    the input byte and sets HAVE_WAITING; the following call consumes the
 *    byte and returns the entry of the second expansion character.
 *  - A compressing character that starts a known pair consumes two bytes
 *    and returns the pair's entry; otherwise it is treated like a plain
 *    character.
 *
 * Returns NULL once the string is exhausted.
 */
struct SortOrderTblEntry *get_coltab_entry(obj, p, l, stat)
	 TEXTTYPE obj;
	 UCHAR **p;
	 USHORT *l;
	 COLSTAT stat;
{
	struct SortOrderTblEntry *table;
	struct SortOrderTblEntry *entry;
	struct ExpandChar *expansion;
	struct CompressPair *pair;

	/* Second half of an expansion left over from the previous call */
	if (stat->stat_flags & HAVE_WAITING) {
		*l -= 1;
		*p += 1;
		stat->stat_flags &= ~HAVE_WAITING;
		assert(stat->stat_waiting);
		return stat->stat_waiting;
	}

	stat->stat_waiting = NULL;
	table = (struct SortOrderTblEntry *) obj->texttype_collation_table;

	while (*l) {
		entry = &table[**p];

		if (entry->IsExpand && entry->IsCompress) {
			/* Special value: note it, step over it and look again */
			stat->stat_flags |= HAVE_SPECIAL;
			*l -= 1;
			*p += 1;
			continue;
		}

		if (entry->IsExpand) {
			for (expansion = (struct ExpandChar *) obj->texttype_expand_table;
				 expansion->Ch && expansion->Ch != **p; expansion++)
				;
			assert(expansion->Ch == **p);

			/* Queue the second entry; the input byte is consumed when
			   that entry is handed out */
			stat->stat_waiting = &table[expansion->ExpCh2];
			stat->stat_flags |= HAVE_WAITING;
			return entry;
		}

		if (entry->IsCompress && *l > 1) {
			for (pair = (struct CompressPair *) obj->texttype_compress_table;
				 pair->CharPair[0]; pair++) {
				if ((pair->CharPair[0] == **p) &&
					(pair->CharPair[1] == (*p)[1])) {
					/* Two input bytes collapse into one entry */
					*l -= 2;
					*p += 2;
					return &pair->NoCaseWeight;
				}
			}
		}

		/* Plain character, or a compress character without its partner */
		*l -= 1;
		*p += 1;
		return entry;
	}

	return NULL;
}


#define XOR	^					/* C bitwise XOR operator - defined for readability */

/*
 * Compare two length-counted strings under the collation of (obj),
 * ignoring trailing spaces.
 *
 * Returns a negative value when s1 sorts before s2, zero when they are
 * equal, and a positive value when s1 sorts after s2.
 *
 * Primary weights decide immediately.  Otherwise the first secondary
 * difference (the last one when TEXTTYPE_reverse_secondary is set), then
 * the first tertiary difference, then the first "expansion versus plain"
 * difference is remembered and used only if the strings turn out to be
 * equal at the primary level.  As a final tie-break the special characters
 * are compared, unless TEXTTYPE_ignore_specials is set.
 */
SSHORT LC_NARROW_compare(obj, l1, s1, l2, s2)
	 TEXTTYPE obj;
	 USHORT l1;
	 BYTE *s1;
	 USHORT l2;
	 BYTE *s2;
{
	struct SortOrderTblEntry *col1, *col2;
	struct coltab_status stat1, stat2;
	SSHORT save_secondary = 0;
	SSHORT save_tertiary = 0;
	SSHORT save_quandary = 0;
	USHORT save_l1, save_l2;
	BYTE *save_s1, *save_s2;

	stat1.stat_flags = 0;
	stat1.stat_waiting = NULL;
	stat2.stat_flags = 0;
	stat2.stat_waiting = NULL;

/* Strip trailing spaces.  The strings are indexed from the front so that
   no pointer in front of the buffer is ever formed for an empty string. */
	while (l1 && s1[l1 - 1] == ASCII_SPACE)
		l1--;
	while (l2 && s2[l2 - 1] == ASCII_SPACE)
		l2--;

/* Remember the trimmed strings for the special-character pass */
	save_l1 = l1;
	save_l2 = l2;
	save_s1 = s1;
	save_s2 = s2;

	while (1) {
		col1 = get_coltab_entry(obj, &s1, &l1, &stat1);
		col2 = get_coltab_entry(obj, &s2, &l2, &stat2);
		if (!col1 || !col2)
			break;
		if (col1->Primary != col2->Primary)
			return (col1->Primary - col2->Primary);
		if (col1->Secondary != col2->Secondary) {
			/* reverse_secondary keeps overwriting, so the LAST
			   difference wins; otherwise the first one is kept */
			if ((obj->texttype_flags & TEXTTYPE_reverse_secondary) ||
				!save_secondary)
					save_secondary = (col1->Secondary - col2->Secondary);
		}
		else if (col1->Tertiary != col2->Tertiary) {
			if (!save_tertiary)
				save_tertiary = (col1->Tertiary - col2->Tertiary);
		}
		else if (((stat1.stat_flags & HAVE_WAITING) XOR
				  (stat2.stat_flags & HAVE_WAITING)) && !save_quandary) {
			/* Exactly one side is in the middle of an expansion */
			if (obj->texttype_flags & TEXTTYPE_expand_before)
				save_quandary = (stat1.stat_flags & HAVE_WAITING) ? -1 : 1;
			else
				save_quandary = (stat1.stat_flags & HAVE_WAITING) ? 1 : -1;
		}
	}

/* One of the strings ended */
	assert(l1 == 0 || l2 == 0);
	assert(col1 == NULL || col2 == NULL);

	if (col1 && !col2)			/* s2 ran out first */
		return 500;

	if (!col1 && col2)			/* s1 ran out first */
		return -500;

	if (l1 == 0 && l2 == 0) {
		if (save_secondary)
			return save_secondary;
		if (save_tertiary)
			return save_tertiary;
		if (save_quandary)
			return save_quandary;
		if (((stat1.stat_flags & HAVE_SPECIAL) ||
			 (stat2.stat_flags & HAVE_SPECIAL)) &&
			!(obj->texttype_flags & TEXTTYPE_ignore_specials))
			return special_scan(obj, save_l1, save_s1, save_l2, save_s2);
		return 0;
	}

/* Fallback: get_coltab_entry() returns NULL only once its length has
   reached zero, so this point is not expected to be reached. */
	if (l1)
		return 600;
	return -600;
}


#ifdef DEBUG_COMPARE

/*
 * Debugging only
 * Reference comparison: builds the full key of each string with
 * LC_NARROW_string_to_key() and compares the two keys bytewise.
 * Used to check results from the comparison algorithm against
 * results from the key creation algorithm.
 * Returns -1, 0 or 1.
 */
STATIC SSHORT old_fam2_compare(obj, l1, s1, l2, s2)
	 TEXTTYPE obj;
	 USHORT l1;
	 BYTE *s1;
	 USHORT l2;
	 BYTE *s2;
{
	BYTE key1[LANGFAM2_MAX_KEY];
	BYTE key2[LANGFAM2_MAX_KEY];
	USHORT len1;
	USHORT len2;
	USHORT shorter;
	USHORT pos;

	assert(obj != NULL);
	assert(s1 != NULL);
	assert(s2 != NULL);

	len1 = LC_NARROW_string_to_key(obj, l1, s1, sizeof(key1), key1, FALSE);
	len2 = LC_NARROW_string_to_key(obj, l2, s2, sizeof(key2), key2, FALSE);

	/* The first differing byte within the common prefix decides */
	shorter = MIN(len1, len2);
	for (pos = 0; pos < shorter; pos++) {
		if (key1[pos] != key2[pos])
			return ((key1[pos] < key2[pos]) ? -1 : 1);
	}

	/* Common prefix is identical: the shorter key sorts first */
	if (len1 == len2)
		return (0);
	return ((len1 < len2) ? -1 : 1);
}
#endif	/* DEBUG_COMPARE */


#ifdef DEBUG_COMPARE

#define	SIGN(x)	(((x) < 0)?-1:(((x) == 0)?0:1))

/*
 * Debugging only
 * Runs both the key-based comparison (old_fam2_compare) and
 * LC_NARROW_compare on the same pair of strings.  When the signs of the
 * two results disagree, the inputs and both results are printed and both
 * comparisons are repeated with do_debug set, so that the key dumps in
 * LC_NARROW_string_to_key are produced.
 * Returns the result of LC_NARROW_compare.
 *
 * NOTE(review): do_debug and the ib_stdio include appear only under
 * #ifdef DEBUG in this file, so this routine presumably needs DEBUG
 * defined along with DEBUG_COMPARE -- confirm.
 */
STATIC SSHORT fam2_compare(obj, l1, s1, l2, s2)
	 TEXTTYPE obj;
	 USHORT l1;
	 BYTE *s1;
	 USHORT l2;
	 BYTE *s2;
{
	SSHORT res1, res2;

	res1 = old_fam2_compare(obj, l1, s1, l2, s2);
	res2 = LC_NARROW_compare(obj, l1, s1, l2, s2);

	if (SIGN(res1) != SIGN(res2)) {
		/* The strings are length counted, not NUL terminated: bound each
		   %s by its length so the print cannot run past the buffer */
		ib_printf("different compares:\n%d %.*s\n%d %.*s\nold = %d new = %d\n",
				  l1, l1, (char *) s1, l2, l2, (char *) s2, res1, res2);
		ib_fflush(ib_stdout);
		do_debug = 1;
		res1 = old_fam2_compare(obj, l1, s1, l2, s2);
		res2 = LC_NARROW_compare(obj, l1, s1, l2, s2);
		do_debug = 0;
	}
	return res2;
}

#endif	/* DEBUG_COMPARE */


#undef LANGFAM2_MAX_KEY

#undef ASCII_SPACE
#undef NULL_WEIGHT
#undef NULL_SECONDARY
#undef NULL_TERTIARY