Doc.

2025-01-22 20:43:02 +01:00 · 2005-07-05 01:19:47 +00:00 · 2005-07-05 01:19:47 +00:00 · 3bb7c2ee92
commit 3bb7c2ee92
parent 7d6faaa52d
1 changed files with 255 additions and 0 deletions
--- a/doc/README.intl
+++ b/doc/README.intl
@@ -0,0 +1,255 @@
+Firebird INTL
+=============
+
+Author: Adriano dos Santos Fernandes <adrianosf at uol.com.br>
+
+
+Architecture
+------------
+
+Firebird allows you to specify character sets and collations in every field/variable declaration.
+You can also specify the default character set at database creation time, and every CHAR/VARCHAR declaration that omits the character set will use it.
+
+At attachment time you can specify the character set in which the client wants to read all strings.
+If you don't specify one, NONE is assumed.
+
+There are two special character sets: NONE and OCTETS.
+Both can be used in declarations, but OCTETS can't be used as the attachment character set.
+They are very similar, except that the space character of NONE is ASCII 0x20 and the space character of OCTETS is 0x00.
+They are special because they don't follow the rules of the other character sets regarding conversions.
+With other character sets, conversion is performed as CHARSET1->UNICODE->CHARSET2. With NONE/OCTETS the bytes are just copied: NONE/OCTETS->CHARSET2 and CHARSET1->NONE/OCTETS.
+
+
+Enhancements
+------------
+
+
+	Well-formedness checks
+	----------------------
+
+	Some character sets (especially multi-byte ones) don't accept every byte sequence.
+	Now the engine verifies that strings are well-formed when they are assigned from NONE/OCTETS and when they are sent by the client (the statement string and parameters).
+
+
+	Uppercase
+	---------
+	
+	In FB 1.5.X only ASCII characters are uppercased in a character set's default collation order (that is, when no collation is specified). Ex:
+	
+	isql -q -ch dos850
+	SQL> create database 'test.fdb';
+	SQL> create table t (c char(1) character set dos850);
+	SQL> insert into t values ('a');
+	SQL> insert into t values ('e');
+	SQL> insert into t values ('á');
+	SQL> insert into t values ('é');
+	SQL> 
+	SQL> select c, upper(c) from t;
+
+	C      UPPER
+	====== ======
+	a      A
+	e      E
+	á      á
+	é      é
+
+	In FB 2.0 the result is:
+	
+	C      UPPER
+	====== ======
+	a      A
+	e      E
+	á      Á
+	é      É
+
+
+	Maximum string length
+	---------------------
+
+	In FB 1.5.X the engine doesn't verify the logical length of MBCS strings.
+	Hence a UNICODE_FSS field can accept three times as many characters as declared in the field size (three bytes being the maximum length of one UNICODE_FSS character).
+	For compatibility purposes this behavior was maintained for legacy character sets, but new character sets (UTF8, for example) don't suffer from this problem.
+
+
+	NONE as attachment character set
+	--------------------------------
+	
+	When NONE is used as the attachment character set, the sqlsubtype member of XSQLVAR contains the character set number of the field being read, instead of always 0 as in previous versions.
+	
+	
+	BLOBs and collations
+	--------------------
+	
+	The DML COLLATE clause can now be used with BLOBs. Ex:
+	select blob_column from table where blob_column collate unicode = 'foo';
+	
+	
+New character sets and collations
+---------------------------------
+
+
+	UTF8 character set
+	------------------
+
+	The UNICODE_FSS character set has a number of problems: it's an old version of UTF8, it accepts malformed strings and it doesn't enforce the correct maximum string length. In FB 1.5.X UTF8 is an alias for UNICODE_FSS.
+	Now UTF8 is a new character set, without these problems of UNICODE_FSS.
+
+
+	UNICODE collations (for UTF8)
+	-----------------------------
+	
+	UCS_BASIC works identically to UTF8 with no collation specified (it sorts in UNICODE code-point order).
+	UNICODE sorts using UCA (Unicode Collation Algorithm).
+	Sort order sample:
+	
+	isql -q -ch dos850
+	SQL> create database 'test.fdb';
+	SQL> create table t (c char(1) character set utf8);
+	SQL> insert into t values ('a');
+	SQL> insert into t values ('A');
+	SQL> insert into t values ('á');
+	SQL> insert into t values ('b');
+	SQL> insert into t values ('B');
+	SQL> select * from t order by c collate ucs_basic;
+	
+	C
+	======
+	A
+	B
+	a
+	b
+	á
+	
+	SQL> select * from t order by c collate unicode;
+	
+	C
+	======
+	a
+	A
+	á
+	b
+	B
+	
+	
+	Brazilian collations
+	--------------------
+
+	Two case-insensitive/accent-insensitive collations were created for Brazil: PT_BR/WIN_PTBR (for WIN1252) and PT_BR (for ISO8859_1).
+	Sort order and equality sample:
+
+	isql -q -ch dos850
+	SQL> create database 'test.fdb';
+	SQL> create table t (c char(1) character set iso8859_1 collate pt_br);
+	SQL> insert into t values ('a');
+	SQL> insert into t values ('A');
+	SQL> insert into t values ('á');
+	SQL> insert into t values ('b');
+	SQL> select * from t order by c;
+	
+	C
+	======
+	A
+	a
+	á
+	b
+	
+	SQL> select * from t where c = 'â';
+	
+	C
+	======
+	a
+	A
+	á
+
+
+Drivers
+-------
+
+New character sets and collations are implemented through dynamic libraries and installed in the server with a manifest file in the intl subdirectory. For an example, see fbintl.conf.
+Not all implemented character sets and collations need to be listed in the manifest file. Only those listed are available, and duplicates are not loaded.
+
+After being installed in the server, they should be registered in the database's system tables (rdb$character_sets and rdb$collations). The following stored procedures do the job:
+
+	set term !;
+	
+	create or alter procedure sp_register_character_set /* Inserts one row into rdb$character_sets and one same-named row into rdb$collations. */
+	(
+		name char(31) character set unicode_fss, /* character set name, upper-cased before it is stored */
+		max_bytes_per_character smallint /* stored in rdb$bytes_per_character */
+	)
+	as
+		declare variable id smallint; /* candidate rdb$character_set_id for the new row */
+		declare variable temp_id smallint; /* existing id fetched by the scan below */
+	begin
+		name = upper(name);
+		id = 255; /* the candidate starts at 255 and only moves downwards */
+		
+		for select rdb$character_set_id
+				from rdb$character_sets
+				order by rdb$character_set_id desc /* existing ids are visited from highest to lowest */
+			into temp_id do
+		begin
+			if (temp_id = id) then /* candidate is already in use, try the next lower value */
+				id = id - 1;
+			else
+				break; /* first existing id that differs from the candidate ends the scan */
+		end
+		
+		insert into rdb$character_sets /* rdb$system_flag is written as 0 */
+			(rdb$character_set_name, rdb$character_set_id, rdb$system_flag, rdb$bytes_per_character)
+			values (:name, :id, 0, :max_bytes_per_character);
+
+		insert into rdb$collations /* collation row with the same name, collation id 0, tied to the new character set id */
+			(rdb$collation_name, rdb$collation_id, rdb$character_set_id, rdb$system_flag)
+			values (:name, 0, :id, 0);
+	end!
+
+	create or alter procedure sp_register_collation /* Inserts one row into rdb$collations for the character set named by character_set. */
+	(
+		character_set char(31) character set unicode_fss, /* character set name, upper-cased before the lookup */
+		name char(31) character set unicode_fss, /* collation name, upper-cased before it is stored */
+		base_name char(31) character set unicode_fss = null, /* optional, replaced by name when null */
+		attributes smallint = null, /* optional, stored in rdb$collation_attributes */
+		specific_attributes blob sub_type text character set unicode_fss = null /* optional, stored in rdb$specific_attributes */
+	)
+	as
+		declare variable id smallint; /* candidate rdb$collation_id for the new row */
+		declare variable temp_id smallint; /* existing collation id fetched by the scan below */
+		declare variable charset_id smallint; /* rdb$character_set_id looked up from character_set */
+	begin
+		character_set = upper(character_set);
+		name = upper(name);
+		base_name = coalesce(upper(base_name), name); /* upper(null) is null, so a missing base_name becomes name */
+		id = 126; /* the candidate starts at 126 and only moves downwards */
+
+		select rdb$character_set_id
+			from rdb$character_sets
+			where rdb$character_set_name = :character_set into charset_id; /* NOTE(review): presumably charset_id stays null when no row matches - confirm the character set is always registered first */
+		
+		for select rdb$collation_id
+				from rdb$collations
+				where rdb$character_set_id = :charset_id /* only collations of the target character set are scanned */
+				order by rdb$collation_id desc
+			into temp_id do
+		begin
+			if (temp_id = id) then /* candidate is already in use, try the next lower value */
+				id = id - 1;
+			else
+				break; /* first existing id that differs from the candidate ends the scan */
+		end
+		
+		insert into rdb$collations /* rdb$system_flag is written as 0 */
+			(rdb$collation_name, rdb$collation_id, rdb$character_set_id, rdb$system_flag,
+			 rdb$base_collation_name, rdb$collation_attributes, rdb$specific_attributes)
+			values (:name, :id, :charset_id, 0, :base_name, :attributes, :specific_attributes);
+	end!
+	
+	set term ;!
+	commit;
+
+Usage example:
+	execute procedure sp_register_character_set ('CHARSET_NAME', 1);
+	commit;
+	
+	execute procedure sp_register_collation ('ISO8859_1', 'COLLATION_NAME');
+	commit;