mirror of
https://github.com/FirebirdSQL/firebird.git
synced 2025-01-22 20:43:02 +01:00
256 lines
7.0 KiB
Plaintext
256 lines
7.0 KiB
Plaintext
Firebird INTL
|
|
=============
|
|
|
|
Author: Adriano dos Santos Fernandes <adrianosf at uol.com.br>
|
|
|
|
|
|
Architecture
|
|
------------
|
|
|
|
Firebird allows you to specify character sets and collations in every field/variable declaration.
|
|
You can also specify the default character set at database creation time; every CHAR/VARCHAR declaration that omits a character set will use it.
|
|
|
|
At attachment time you can specify the character set in which the client wants to read all strings.
|
|
If you don't specify one, NONE is assumed.
|
|
|
|
There are two special character sets: NONE and OCTETS.
|
|
Both can be used in declarations, but OCTETS can't be used as the attachment character set.
|
|
They are very similar, except that the space character of NONE is ASCII 0x20 and the space character of OCTETS is 0x00.
|
|
They are special because they don't follow the rules of other character sets regarding conversions.
|
|
With other character sets, conversion is performed as CHARSET1->UNICODE->CHARSET2. With NONE/OCTETS the bytes are just copied: NONE/OCTETS->CHARSET2 and CHARSET1->NONE/OCTETS.
|
|
|
|
|
|
Enhancements
|
|
------------
|
|
|
|
|
|
Well-formedness checks
|
|
----------------------
|
|
|
|
Some character sets (especially multi-byte ones) don't accept every byte sequence.
|
|
Now the engine verifies that strings are well-formed when assigning from NONE/OCTETS and when receiving strings sent by the client (the statement string and parameters).
|
|
|
|
|
|
Uppercase
|
|
---------
|
|
|
|
In FB 1.5.X, only ASCII characters are uppercased in a character set's default collation order (that is, when no collation is specified). Example:
|
|
|
|
isql -q -ch dos850
|
|
SQL> create database 'test.fdb';
|
|
SQL> create table t (c char(1) character set dos850);
|
|
SQL> insert into t values ('a');
|
|
SQL> insert into t values ('e');
|
|
SQL> insert into t values ('á');
|
|
SQL> insert into t values ('é');
|
|
SQL>
|
|
SQL> select c, upper(c) from t;
|
|
|
|
C UPPER
|
|
====== ======
|
|
a A
|
|
e E
|
|
á á
|
|
é é
|
|
|
|
In FB 2.0 the result is:
|
|
|
|
C UPPER
|
|
====== ======
|
|
a A
|
|
e E
|
|
á Á
|
|
é É
|
|
|
|
|
|
Maximum string length
|
|
---------------------
|
|
|
|
In FB 1.5.X the engine doesn't verify the logical length of MBCS strings.
|
|
Hence a UNICODE_FSS field can accept up to three times as many characters as its declared size (three bytes being the maximum length of one UNICODE_FSS character).
|
|
For compatibility, this behavior was kept for legacy character sets, but new character sets (UTF8, for example) don't suffer from this problem.
|
|
|
|
|
|
NONE as attachment character set
|
|
--------------------------------
|
|
|
|
When NONE is used as the attachment character set, the sqlsubtype member of XSQLVAR holds the character set number of the field being read, instead of always 0 as in previous versions.
|
|
|
|
|
|
BLOBs and collations
|
|
--------------------
|
|
|
|
The DML COLLATE clause can now be used with BLOBs. Example:
|
|
select blob_column from table where blob_column collate unicode = 'foo';
|
|
|
|
|
|
New character sets and collations
|
|
---------------------------------
|
|
|
|
|
|
UTF8 character set
|
|
------------------
|
|
|
|
The UNICODE_FSS character set has a number of problems: it's an old version of UTF8, it accepts malformed strings, and it doesn't enforce the correct maximum string length. In FB 1.5.X UTF8 is an alias for UNICODE_FSS.
|
|
Now UTF8 is a new character set in its own right, without these problems of UNICODE_FSS.
|
|
|
|
|
|
UNICODE collations (for UTF8)
|
|
-----------------------------
|
|
|
|
UCS_BASIC works identically to UTF8 with no collation specified (it sorts in UNICODE code-point order).
|
|
UNICODE sorts using UCA (Unicode Collation Algorithm).
|
|
Sort order sample:
|
|
|
|
isql -q -ch dos850
|
|
SQL> create database 'test.fdb';
|
|
SQL> create table t (c char(1) character set utf8);
|
|
SQL> insert into t values ('a');
|
|
SQL> insert into t values ('A');
|
|
SQL> insert into t values ('á');
|
|
SQL> insert into t values ('b');
|
|
SQL> insert into t values ('B');
|
|
SQL> select * from t order by c collate ucs_basic;
|
|
|
|
C
|
|
======
|
|
A
|
|
B
|
|
a
|
|
b
|
|
á
|
|
|
|
SQL> select * from t order by c collate unicode;
|
|
|
|
C
|
|
======
|
|
a
|
|
A
|
|
á
|
|
b
|
|
B
|
|
|
|
|
|
Brazilian collations
|
|
--------------------
|
|
|
|
Two case-insensitive/accent-insensitive collations were created for Brazil: PT_BR/WIN_PTBR (for WIN1252) and PT_BR (for ISO8859_1).
|
|
Sort order and equality sample:
|
|
|
|
isql -q -ch dos850
|
|
SQL> create database 'test.fdb';
|
|
SQL> create table t (c char(1) character set iso8859_1 collate pt_br);
|
|
SQL> insert into t values ('a');
|
|
SQL> insert into t values ('A');
|
|
SQL> insert into t values ('á');
|
|
SQL> insert into t values ('b');
|
|
SQL> select * from t order by c;
|
|
|
|
C
|
|
======
|
|
A
|
|
a
|
|
á
|
|
b
|
|
|
|
SQL> select * from t where c = 'â';
|
|
|
|
C
|
|
======
|
|
a
|
|
A
|
|
á
|
|
|
|
|
|
Drivers
|
|
-------
|
|
|
|
New character sets and collations are implemented through dynamic libraries and installed in the server with a manifest file in the intl subdirectory. For an example, see fbintl.conf.
|
|
Not all implemented character sets and collations need to be listed in the manifest file. Only those listed are available, and duplicates are not loaded.
|
|
|
|
After being installed in the server, they should be registered in the database's system tables (rdb$character_sets and rdb$collations). The following stored procedures do the job:
|
|
|
|
set term !;
|
|
|
|
create or alter procedure sp_register_character_set
(
    name char(31) character set unicode_fss,
    max_bytes_per_character smallint
)
as
    /*
     * Registers a character set and a same-named collation (collation id 0)
     * in the system tables rdb$character_sets and rdb$collations.
     *
     * Parameters:
     *   name                     character set name; upper-cased before use
     *   max_bytes_per_character  value stored in rdb$bytes_per_character
     *
     * Both rows are inserted with rdb$system_flag = 0.
     */
    declare variable free_id smallint;
    declare variable used_id smallint;
begin
    name = upper(name);

    /*
     * Pick the character set id: start at 255 and, walking the existing ids
     * from the highest downwards, step down by one for each id that equals
     * the current candidate. Stop at the first id that does not match.
     */
    free_id = 255;

    for select rdb$character_set_id
        from rdb$character_sets
        order by rdb$character_set_id desc
        into :used_id
    do
    begin
        if (used_id = free_id) then
            free_id = free_id - 1;
        else
            leave;
    end

    insert into rdb$character_sets
    (
        rdb$character_set_name,
        rdb$character_set_id,
        rdb$system_flag,
        rdb$bytes_per_character
    )
    values
    (
        :name,
        :free_id,
        0,
        :max_bytes_per_character
    );

    /* Collation with id 0, named after the character set itself. */
    insert into rdb$collations
    (
        rdb$collation_name,
        rdb$collation_id,
        rdb$character_set_id,
        rdb$system_flag
    )
    values
    (
        :name,
        0,
        :free_id,
        0
    );
end!
|
|
|
|
create or alter procedure sp_register_collation
(
    character_set char(31) character set unicode_fss,
    name char(31) character set unicode_fss,
    base_name char(31) character set unicode_fss = null,
    attributes smallint = null,
    specific_attributes blob sub_type text character set unicode_fss = null
)
as
    /*
     * Registers a collation for an existing character set in the system
     * table rdb$collations.
     *
     * Parameters:
     *   character_set        name of the owning character set; upper-cased
     *   name                 collation name; upper-cased
     *   base_name            base collation name; upper-cased, and defaults
     *                        to the (upper-cased) collation name when null
     *   attributes           stored in rdb$collation_attributes
     *   specific_attributes  stored in rdb$specific_attributes
     *
     * The row is inserted with rdb$system_flag = 0.
     */
    declare variable free_id smallint;
    declare variable used_id smallint;
    declare variable cs_id smallint;
begin
    character_set = upper(character_set);
    name = upper(name);

    if (base_name is null) then
        base_name = name;
    else
        base_name = upper(base_name);

    /*
     * Resolve the character set id from its name.
     * NOTE(review): if no character set matches, cs_id is never assigned and
     * the insert below appears to store a null rdb$character_set_id.
     * Confirm whether a guard is wanted here.
     */
    select rdb$character_set_id
    from rdb$character_sets
    where rdb$character_set_name = :character_set
    into :cs_id;

    /*
     * Pick the collation id: start at 126 and, walking this character set's
     * existing collation ids from the highest downwards, step down by one
     * for each id that equals the current candidate. Stop at the first id
     * that does not match.
     */
    free_id = 126;

    for select rdb$collation_id
        from rdb$collations
        where rdb$character_set_id = :cs_id
        order by rdb$collation_id desc
        into :used_id
    do
    begin
        if (used_id = free_id) then
            free_id = free_id - 1;
        else
            leave;
    end

    insert into rdb$collations
    (
        rdb$collation_name,
        rdb$collation_id,
        rdb$character_set_id,
        rdb$system_flag,
        rdb$base_collation_name,
        rdb$collation_attributes,
        rdb$specific_attributes
    )
    values
    (
        :name,
        :free_id,
        :cs_id,
        0,
        :base_name,
        :attributes,
        :specific_attributes
    );
end!
|
|
|
|
set term ;!
|
|
commit;
|
|
|
|
Usage example:
|
|
execute procedure sp_register_character_set ('CHARSET_NAME', 1);
|
|
commit;
|
|
|
|
execute procedure sp_register_collation ('ISO8859_1', 'COLLATION_NAME');
|
|
commit;
|