mirror of
https://github.com/FirebirdSQL/firebird-qa.git
synced 2025-01-22 21:43:06 +01:00
270 lines
9.2 KiB
Python
270 lines
9.2 KiB
Python
#coding:utf-8
|
|
|
|
"""
|
|
ID: issue-1440
|
|
ISSUE: 1440
|
|
TITLE: Estonian collation in UTF8 charset
|
|
DESCRIPTION:
|
|
Original ticket subject: Estonian collations for WIN1252 charset
|
|
|
|
http://www.eki.ee/itstandard/2000/FDCC.shtml.en
|
|
Estonian sort order changes the default positions for one base character (<z>), š and ž ('s' and 'z' with caron)
|
|
and four accented vowels in the Estonian alphabet:
|
|
* <z> and all its modifications are ordered after <s>;
|
|
* 'š' and 'ž' are separate letters and follow the unaccented <s> and <z> respectively.
|
|
* all four Estonian vowels with diacritics - 'ä', 'ö', 'õ' and 'ü' - are also sorted as separate letters after 'w';
|
|
* 'w' is generally sorted as a separate letter, except in Estonian personal names.
|
|
Thus the Estonian alphabet ends with:
|
|
####################################
|
|
... r s š z ž t u v w õ ä ö ü x y
|
|
####################################
|
|
NOTES:
|
|
[1]
|
|
collation = WIN1252_UNICODE (defined for charset WIN1252 in %FB_HOME%\\intl\\fbintl.conf) sorts
|
|
estonian letters NOT as expected: a,A,ä,Ä,b...,o,O,ö,Ö,õ,Õ,...,s,S,š,Š,t,T,u,U,ü,Ü,...,z,Z,ž,Ž
|
|
|
|
[2]
|
|
collation WIN1257_EE (defined for charset WIN1257) sorts estonian letters properly but this collation
|
|
is not what author asked about ("CP 1257 (Baltic) is suggested as second when 1252 is not available.")
|
|
Because of this, UTF8 collation for LOCALE=et_EE is tested here.
|
|
|
|
[3]
|
|
lowercase letters are sorted BEFORE uppercase ones ('a' < 'A') when we use most of collations,
|
|
including utf8 'LOCALE=et_EE'. This is defined by current ICU implementation.
|
|
Current syntax of the CREATE COLLATION statement does not allow changing the priority of uppercase/lowercase
|
|
letters which denote the same character when they are sorted.
|
|
|
|
Result of sorting such letters will be opposite to "ascii-style" where uppercase letters ('A')
|
|
are always considered as less than lowercase ones ('a').
|
|
|
|
Quote from https://unicode.org/reports/tr10/#Case_Comparisons
|
|
=====
|
|
"In some languages, it is common to sort lowercase before uppercase; in other languages this is reversed.
|
|
Often this is more dependent on the individual concerned, and is not standard across a single language.
|
|
It is strongly recommended that implementations provide parameterization that allows uppercase to be
|
|
sorted before lowercase, and provides information as to the standard (if any) for particular countries".
|
|
======
|
|
See also:
|
|
http://userguide.icu-project.org/collation/concepts
|
|
JIRA: CORE-1026
|
|
FBTEST: bugs.core_1026_utf8
|
|
"""
|
|
|
|
import pytest
|
|
from firebird.qa import *
|
|
|
|
# Database definition for this test: the factory is asked for a database
# whose character set is UTF8, which the collations created by the script
# below are declared for.
db = db_factory(
    charset='UTF8',
)
|
|
|
|
# ISQL script executed by the test.
#
# It creates two UTF8 collations for LOCALE=et_EE (a case/accent-sensitive one
# and a case/accent-insensitive one), fills table TEST with 60 single-letter
# rows in the order prescribed by the Estonian standard (so that the generated
# ID equals the expected sort position), and then runs two checks:
#   test-1: row_number() over(order by s) must be equal to ID for every row;
#   test-2: with the CI/AI collation every letter must match exactly one other
#           row (its own upper/lower-case counterpart), 30 pairs in total.
test_script = """
    set bail on;

    -- NB: 'et' in 'LOCALE=...' must be specified in lowercase!
    create collation estonian_coll_cs_as for utf8 from unicode 'LOCALE=et_EE';
    create collation estonian_coll_ci_ai for utf8 from unicode case insensitive accent insensitive 'LOCALE=et_EE';

    create table test(
        id smallint generated by default as identity
        ,s varchar(1) character set utf8 collate estonian_coll_cs_as
    );
    commit;


    -- Fill records according to SORT order that is declared by Estonian Standardization Board
    -- http://www.eki.ee/itstandard/2000/FDCC.shtml.en

    insert into test(s) values( 'a' );
    insert into test(s) values( 'A' );
    insert into test(s) values( 'b' );
    insert into test(s) values( 'B' );
    insert into test(s) values( 'c' );
    insert into test(s) values( 'C' );
    insert into test(s) values( 'd' );
    insert into test(s) values( 'D' );
    insert into test(s) values( 'e' );
    insert into test(s) values( 'E' );
    insert into test(s) values( 'f' );
    insert into test(s) values( 'F' );
    insert into test(s) values( 'g' );
    insert into test(s) values( 'G' );
    insert into test(s) values( 'h' );
    insert into test(s) values( 'H' );
    insert into test(s) values( 'i' );
    insert into test(s) values( 'I' );
    insert into test(s) values( 'j' );
    insert into test(s) values( 'J' );
    insert into test(s) values( 'k' );
    insert into test(s) values( 'K' );
    insert into test(s) values( 'l' );
    insert into test(s) values( 'L' );
    insert into test(s) values( 'm' );
    insert into test(s) values( 'M' );
    insert into test(s) values( 'n' );
    insert into test(s) values( 'N' );
    insert into test(s) values( 'o' );
    insert into test(s) values( 'O' );
    insert into test(s) values( 'p' );
    insert into test(s) values( 'P' );
    insert into test(s) values( 'r' );
    insert into test(s) values( 'R' );
    insert into test(s) values( 's' );
    insert into test(s) values( 'S' );

    insert into test(s) values( 'š' );
    insert into test(s) values( 'Š' );
    insert into test(s) values( 'z' );
    insert into test(s) values( 'Z' );
    insert into test(s) values( 'ž' );
    insert into test(s) values( 'Ž' );
    insert into test(s) values( 't' );
    insert into test(s) values( 'T' );
    insert into test(s) values( 'u' );
    insert into test(s) values( 'U' );
    insert into test(s) values( 'v' );
    insert into test(s) values( 'V' );
    insert into test(s) values( 'õ' );
    insert into test(s) values( 'Õ' );
    insert into test(s) values( 'ä' );
    insert into test(s) values( 'Ä' );
    insert into test(s) values( 'ö' );
    insert into test(s) values( 'Ö' );
    insert into test(s) values( 'ü' );
    insert into test(s) values( 'Ü' );
    insert into test(s) values( 'x' );
    insert into test(s) values( 'X' );
    insert into test(s) values( 'y' );
    insert into test(s) values( 'Y' );

    commit;

    set heading off;

    -- test-1:
    -- check whether letters are sorted properly when case- and accent-sensitive collation is in use:
    -- values of row_number()over(order by a.s) must be equal to ID values.
    -- The outer ORDER BY makes the row order of the output explicit instead of
    -- relying on the order in which the window sort happens to deliver rows.
    select a.id, a.s, row_number() over(order by a.s) rn
    from test a
    order by a.id;


    -- test-2:
    -- check that every character matches to some another (and single) with respect
    -- to requirement: "case insensitive, accent insensitive".
    -- This means that for 'a' we must find 'A' (but NOT 'ä' or 'Ä') etc.
    -- NOTE.
    -- 'š', 'ž', 'õ', 'ä', 'ö' and 'ü' - must be considered and sorted as *separate* letters.
    -- upper() / lower() for each of them must find match to only *one* character with another ID in the 'test' table

    set count on;
    select a.id, a.s, b.id, b.s
    from test a
    join test b on
        a.s collate estonian_coll_ci_ai = b.s collate estonian_coll_ci_ai
        and a.id < b.id
    order by a.id, b.id
    ;
"""
|
|
|
|
# ISQL action for this test: it is given the name 'db' (presumably resolving
# to the database defined above - confirm against firebird-qa) and the SQL
# script. The single substitution pair maps every run of spaces/tabs to one
# space, so column padding in the ISQL output does not have to match exactly.
act = isql_act(
    'db',
    test_script,
    substitutions=[('[ \t]+', ' ')],
)
|
|
|
|
# Expected ISQL output (headings are switched off by the script).
#
# First section  - test-1: columns are ID, letter, row_number(); the first and
#                  the last column must be equal in each of the 60 rows.
# Second section - test-2: each letter paired with its single case-insensitive
#                  match (lower-case ID, letter, upper-case ID, letter),
#                  followed by the row count printed because of SET COUNT ON.
expected_stdout = """
    1 a 1
    2 A 2
    3 b 3
    4 B 4
    5 c 5
    6 C 6
    7 d 7
    8 D 8
    9 e 9
    10 E 10
    11 f 11
    12 F 12
    13 g 13
    14 G 14
    15 h 15
    16 H 16
    17 i 17
    18 I 18
    19 j 19
    20 J 20
    21 k 21
    22 K 22
    23 l 23
    24 L 24
    25 m 25
    26 M 26
    27 n 27
    28 N 28
    29 o 29
    30 O 30
    31 p 31
    32 P 32
    33 r 33
    34 R 34
    35 s 35
    36 S 36
    37 š 37
    38 Š 38
    39 z 39
    40 Z 40
    41 ž 41
    42 Ž 42
    43 t 43
    44 T 44
    45 u 45
    46 U 46
    47 v 47
    48 V 48
    49 õ 49
    50 Õ 50
    51 ä 51
    52 Ä 52
    53 ö 53
    54 Ö 54
    55 ü 55
    56 Ü 56
    57 x 57
    58 X 58
    59 y 59
    60 Y 60


    1 a 2 A
    3 b 4 B
    5 c 6 C
    7 d 8 D
    9 e 10 E
    11 f 12 F
    13 g 14 G
    15 h 16 H
    17 i 18 I
    19 j 20 J
    21 k 22 K
    23 l 24 L
    25 m 26 M
    27 n 28 N
    29 o 30 O
    31 p 32 P
    33 r 34 R
    35 s 36 S
    37 š 38 Š
    39 z 40 Z
    41 ž 42 Ž
    43 t 44 T
    45 u 46 U
    47 v 48 V
    49 õ 50 Õ
    51 ä 52 Ä
    53 ö 54 Ö
    55 ü 56 Ü
    57 x 58 X
    59 y 60 Y

    Records affected: 30
"""
|
|
|
|
@pytest.mark.version('>=4.0')
def test_1(act: Action):
    """Run the ISQL script and verify the Estonian collation results.

    The expected listing is attached to the action, the action is executed,
    and the cleaned actual output must be equal to the cleaned expected output.

    Args:
        act: the ISQL action for this test (presumably the fixture produced by
            the module-level ``isql_act`` call - confirm against firebird-qa).

    Raises:
        AssertionError: if the cleaned actual and expected outputs differ.
    """
    act.expected_stdout = expected_stdout
    act.execute()
    assert act.clean_stdout == act.clean_expected_stdout
|
|
|