6
0
mirror of https://github.com/FirebirdSQL/firebird-qa.git synced 2025-01-22 21:43:06 +01:00
firebird-qa/tests/bugs/core_1026_utf8_test.py

271 lines
9.2 KiB
Python

#coding:utf-8
"""
ID: issue-1440
ISSUE: 1440
TITLE: Estonian collation in UTF8 charset
DESCRIPTION:
Original ticket subj: Estonian collations for WIN1252 charset
http://www.eki.ee/itstandard/2000/FDCC.shtml.en
Estonian sort order changes the default positions for one base character (<z>), š and ž ('s' and 'z' with caron)
and four accented vowels in the Estonian alphabet:
* <z> and all its modifications are ordered after <s>;
* 'š' and 'ž' are separate letters and follow the unaccented <s> and <z> respectively.
* all four Estonian vowels with diacritics - 'ä', 'ö', 'õ' and 'ü' - are also sorted as separate letters after 'w';
* 'w' is generally sorted as a separate letter, except in Estonian personal names.
Thus the Estonian alphabet ends with:
####################################
... r s š z ž t u v w õ ä ö ü x y
####################################
NOTES:
[1]
collation = WIN1252_UNICODE (defined for charset WIN1252 in %FB_HOME%\\intl\\fbintl.conf) sorts
estonian letters NOT as expected: a,A,ä,Ä,b...,o,O,ö,Ö,õ,Õ,...,s,S,š,Š,t,T,u,U,ü,Ü,...,z,Z,ž,Ž
[2]
collation WIN1257_EE (defined for charset WIN1257) sorts Estonian letters properly, but this collation
is not what the author asked about ("CP 1257 (Baltic) is suggested as second when 1252 is not available.")
Because of this, UTF8 collation for LOCALE=et_EE is tested here.
[3]
lowercase letters are sorted BEFORE uppercase ones ('a' < 'A') with most collations,
including utf8 'LOCALE=et_EE'. This is defined by the current ICU implementation.
The current syntax of the CREATE COLLATION statement does not allow changing the priority of
uppercase/lowercase letters which denote the same character when they are sorted.
The result of sorting such letters will be the opposite of "ascii-style", where uppercase letters ('A')
are always considered less than lowercase ones ('a').
Quote from https://unicode.org/reports/tr10/#Case_Comparisons
=====
"In some languages, it is common to sort lowercase before uppercase; in other languages this is reversed.
Often this is more dependent on the individual concerned, and is not standard across a single language.
It is strongly recommended that implementations provide parameterization that allows uppercase to be
sorted before lowercase, and provides information as to the standard (if any) for particular countries".
======
See also:
http://userguide.icu-project.org/collation/concepts
JIRA: CORE-1026
FBTEST: bugs.core_1026_utf8
"""
import pytest
from firebird.qa import *
# Test database fixture, requested with charset='UTF8' so that the UTF8-based
# collations created by the test script can be defined and used.
db = db_factory(charset='UTF8')
# isql script executed by the test. It:
#   * creates two UTF8 collations derived from UNICODE with 'LOCALE=et_EE':
#     estonian_coll_cs_as (no CI/AI attributes) and estonian_coll_ci_ai
#     (case insensitive, accent insensitive);
#   * fills table TEST with 60 single letters, inserted in the sort order the
#     Estonian standard prescribes, so the identity ID reflects the expected rank;
#   * test-1: selects id, letter and row_number() over (order by letter);
#   * test-2: self-joins TEST on equality under the CI/AI collation (a.id < b.id)
#     to list which letters are treated as equal.
# The text below is sent to isql as-is; do not reformat it.
test_script = """
set bail on;
-- NB: 'et' in 'LOCALE=...' must be specified in lowercase!
create collation estonian_coll_cs_as for utf8 from unicode 'LOCALE=et_EE';
create collation estonian_coll_ci_ai for utf8 from unicode case insensitive accent insensitive 'LOCALE=et_EE';
create table test(
id smallint generated by default as identity
,s varchar(1) character set utf8 collate estonian_coll_cs_as
);
commit;
-- Fill records according to SORT order that is declared by Estonian Standardization Board
-- http://www.eki.ee/itstandard/2000/FDCC.shtml.en
insert into test(s) values( 'a' );
insert into test(s) values( 'A' );
insert into test(s) values( 'b' );
insert into test(s) values( 'B' );
insert into test(s) values( 'c' );
insert into test(s) values( 'C' );
insert into test(s) values( 'd' );
insert into test(s) values( 'D' );
insert into test(s) values( 'e' );
insert into test(s) values( 'E' );
insert into test(s) values( 'f' );
insert into test(s) values( 'F' );
insert into test(s) values( 'g' );
insert into test(s) values( 'G' );
insert into test(s) values( 'h' );
insert into test(s) values( 'H' );
insert into test(s) values( 'i' );
insert into test(s) values( 'I' );
insert into test(s) values( 'j' );
insert into test(s) values( 'J' );
insert into test(s) values( 'k' );
insert into test(s) values( 'K' );
insert into test(s) values( 'l' );
insert into test(s) values( 'L' );
insert into test(s) values( 'm' );
insert into test(s) values( 'M' );
insert into test(s) values( 'n' );
insert into test(s) values( 'N' );
insert into test(s) values( 'o' );
insert into test(s) values( 'O' );
insert into test(s) values( 'p' );
insert into test(s) values( 'P' );
insert into test(s) values( 'r' );
insert into test(s) values( 'R' );
insert into test(s) values( 's' );
insert into test(s) values( 'S' );
insert into test(s) values( 'š' );
insert into test(s) values( 'Š' );
insert into test(s) values( 'z' );
insert into test(s) values( 'Z' );
insert into test(s) values( 'ž' );
insert into test(s) values( 'Ž' );
insert into test(s) values( 't' );
insert into test(s) values( 'T' );
insert into test(s) values( 'u' );
insert into test(s) values( 'U' );
insert into test(s) values( 'v' );
insert into test(s) values( 'V' );
insert into test(s) values( 'õ' );
insert into test(s) values( 'Õ' );
insert into test(s) values( 'ä' );
insert into test(s) values( 'Ä' );
insert into test(s) values( 'ö' );
insert into test(s) values( 'Ö' );
insert into test(s) values( 'ü' );
insert into test(s) values( 'Ü' );
insert into test(s) values( 'x' );
insert into test(s) values( 'X' );
insert into test(s) values( 'y' );
insert into test(s) values( 'Y' );
commit;
set heading off;
-- test-1:
-- check whether letters are sorted properly when case- and accent-sensitive collation is in use:
-- values of row_number()over(order by a.s) must be equal to ID values.
select a.id,a.s,row_number()over(order by a.s) rn from test a;
-- test-2:
-- check that every character matches to some another (and single) with respect
-- to requirement: "case insensitive, accent insensitive".
-- This means that for 'a' we must find 'A' (but NOT 'ä' or 'Ä') etc.
-- NOTE.
-- 'š', 'ž', 'õ', 'ä', 'ö' and 'ü' - must be considered and sorted as *separate* letters.
-- upper() / lower() for each of them must find match to only *one* character with another ID in the 'test' table
set count on;
select a.id, a.s, b.id, b.s
from test a
join test b on
a.s collate estonian_coll_ci_ai = b.s collate estonian_coll_ci_ai
and a.id < b.id
order by a.id, b.id
;
"""
# Action fixture that runs `test_script` against the 'db' fixture.
# The substitution collapses every run of spaces/tabs into a single space.
# NOTE(review): presumably applied to both actual and expected output before
# they are compared - confirm against the firebird-qa plugin.
act = isql_act(
    'db',
    test_script,
    substitutions=[('[ \t]+', ' ')],
)
# Expected isql output (column headings are off, see `set heading off` in the script).
#   * First 60 rows (test-1): "id letter rn". ID and row_number() must coincide,
#     i.e. the collation sorts the letters in the order they were inserted.
#   * Next 30 rows (test-2): "a.id a.s b.id b.s". Each lowercase letter pairs
#     only with its own uppercase form, never with an accented variant.
#   * "Records affected: 30" is the row count of the test-2 query, which runs
#     after `set count on`.
expected_stdout = """
1 a 1
2 A 2
3 b 3
4 B 4
5 c 5
6 C 6
7 d 7
8 D 8
9 e 9
10 E 10
11 f 11
12 F 12
13 g 13
14 G 14
15 h 15
16 H 16
17 i 17
18 I 18
19 j 19
20 J 20
21 k 21
22 K 22
23 l 23
24 L 24
25 m 25
26 M 26
27 n 27
28 N 28
29 o 29
30 O 30
31 p 31
32 P 32
33 r 33
34 R 34
35 s 35
36 S 36
37 š 37
38 Š 38
39 z 39
40 Z 40
41 ž 41
42 Ž 42
43 t 43
44 T 44
45 u 45
46 U 46
47 v 47
48 V 48
49 õ 49
50 Õ 50
51 ä 51
52 Ä 52
53 ö 53
54 Ö 54
55 ü 55
56 Ü 56
57 x 57
58 X 58
59 y 59
60 Y 60
1 a 2 A
3 b 4 B
5 c 6 C
7 d 8 D
9 e 10 E
11 f 12 F
13 g 14 G
15 h 16 H
17 i 18 I
19 j 20 J
21 k 22 K
23 l 24 L
25 m 26 M
27 n 28 N
29 o 30 O
31 p 32 P
33 r 34 R
35 s 36 S
37 š 38 Š
39 z 40 Z
41 ž 42 Ž
43 t 44 T
45 u 46 U
47 v 48 V
49 õ 50 Õ
51 ä 52 Ä
53 ö 54 Ö
55 ü 56 Ü
57 x 58 X
59 y 60 Y
Records affected: 30
"""
@pytest.mark.intl
@pytest.mark.version('>=4.0')
def test_1(act: Action):
    """Run the Estonian-collation isql script and verify its output.

    The script bound to ``act`` creates UTF8 collations for ``LOCALE=et_EE``
    and prints (1) every letter with its rank under the case/accent-sensitive
    collation and (2) the pairs of letters considered equal under the
    case/accent-insensitive one. Both listings must match ``expected_stdout``.

    Args:
        act: Action fixture built by ``isql_act`` for this module's script.
    """
    act.expected_stdout = expected_stdout
    act.execute()
    # Compare the "clean" variants rather than raw text.
    # NOTE(review): presumably these have the fixture's whitespace
    # substitution applied - confirm against the firebird-qa plugin.
    assert act.clean_stdout == act.clean_expected_stdout