6
0
mirror of https://github.com/FirebirdSQL/firebird-qa.git synced 2025-01-22 21:43:06 +01:00
firebird-qa/tests/bugs/core_1026_utf8_test.py

281 lines
10 KiB
Python

#coding:utf-8
#
# id: bugs.core_1026_utf8
# title: Estonian collation in UTF8 charset
# decription:
# Original ticket subj: Estonian collations for WIN1252 charset
#
# http://www.eki.ee/itstandard/2000/FDCC.shtml.en
# Estonian sort order changes the default positions for one base character (<z>), š and ž ('s' and 'z' with caron)
# and four accented vowels in the Estonian alphabet:
# * <z> and all its modifications are ordered after <s>;
# * 'š' and 'ž' are separate letters and follow the unaccented <s> and <z> respectively.
# * all four Estonian vowels with diacritics - 'ä', 'ö', 'õ' and 'ü' - are also sorted as separate
# letters after 'w';
# * 'w' is generally sorted as a separate letter, except in Estonian personal names.
# Thus the Estonian alphabet ends with:
# ####################################
# ... r s š z ž t u v w õ ä ö ü x y
# ####################################
#
# NOTE-1:
# collation WIN1252_UNICODE (defined for charset WIN1252 in %FB_HOME%\intl\fbintl.conf) sorts
# Estonian letters NOT as expected: a,A,ä,Ä,b...,o,O,ö,Ö,õ,Õ,...,s,S,š,Š,t,T,u,U,ü,Ü,...,z,Z,ž,Ž
#
# NOTE-2:
# collation WIN1257_EE (defined for charset WIN1257) sorts estonian letters properly but this collation
# is not what author asked about ("CP 1257 (Baltic) is suggested as second when 1252 is not available.")
# Because of this, UTF8 collation for LOCALE=et_EE is tested here.
#
# NOTE-3:
# lowercase letters are sorted BEFORE uppercase ones ('a' < 'A') when we use most of collations,
# including utf8 'LOCALE=et_EE'. This is defined by current ICU implementation.
# Current syntax of CREATE COLLATION statement does not allow changing the priority of uppercase/lowercase
# letters which denote the same character when they are sorted.
#
# Result of sorting such letters will be opposite to "ascii-style" where uppercase letters ('A')
# are always considered as less than lowercase ones ('a').
#
# Quote from https://unicode.org/reports/tr10/#Case_Comparisons
# =====
# "In some languages, it is common to sort lowercase before uppercase; in other languages this is reversed.
# Often this is more dependent on the individual concerned, and is not standard across a single language.
# It is strongly recommended that implementations provide parameterization that allows uppercase to be
# sorted before lowercase, and provides information as to the standard (if any) for particular countries".
# ======
# See also:
# http://userguide.icu-project.org/collation/concepts
#
# Checked on 4.0.0.2214
#
# tracker_id: CORE-1026
# min_versions: ['4.0']
# versions: 4.0
# qmid: None
import pytest
from firebird.qa import db_factory, isql_act, Action
# version: 4.0
# resources: None
# (regex, replacement) pairs handed to isql_act; this one targets runs of
# spaces/tabs -- presumably so that column padding does not affect comparison.
substitutions_1 = [
    ('[ \t]+', ' '),
]

# No separate initialisation SQL: the test script creates every object it needs.
init_script_1 = ''

# Test database is requested with UTF8 charset and SQL dialect 3.
db_1 = db_factory(
    charset='UTF8',
    sql_dialect=3,
    init=init_script_1,
)
# ISQL script executed by the test.
# It creates two UTF8 collations based on 'LOCALE=et_EE' (a default one and a
# case-insensitive/accent-insensitive one), fills table TEST with 60 single
# letters inserted in the order declared by the Estonian standard, and then
# runs two queries:
#   test-1: id, letter and row_number() over (order by letter);
#   test-2: self-join on the case/accent-insensitive collation, listing
#           pairs of rows with a.id < b.id (with 'set count on').
# NOTE: the text inside the literal is the runtime payload; do not reformat it.
test_script_1 = """
set bail on;
-- NB: 'et' in 'LOCALE=...' must be specified in lowercase!
create collation estonian_coll_cs_as for utf8 from unicode 'LOCALE=et_EE';
create collation estonian_coll_ci_ai for utf8 from unicode case insensitive accent insensitive 'LOCALE=et_EE';
create table test(
id smallint generated by default as identity
,s varchar(1) character set utf8 collate estonian_coll_cs_as
);
commit;
-- Fill records according to SORT order that is declared by Estonian Standardization Board
-- http://www.eki.ee/itstandard/2000/FDCC.shtml.en
insert into test(s) values( 'a' );
insert into test(s) values( 'A' );
insert into test(s) values( 'b' );
insert into test(s) values( 'B' );
insert into test(s) values( 'c' );
insert into test(s) values( 'C' );
insert into test(s) values( 'd' );
insert into test(s) values( 'D' );
insert into test(s) values( 'e' );
insert into test(s) values( 'E' );
insert into test(s) values( 'f' );
insert into test(s) values( 'F' );
insert into test(s) values( 'g' );
insert into test(s) values( 'G' );
insert into test(s) values( 'h' );
insert into test(s) values( 'H' );
insert into test(s) values( 'i' );
insert into test(s) values( 'I' );
insert into test(s) values( 'j' );
insert into test(s) values( 'J' );
insert into test(s) values( 'k' );
insert into test(s) values( 'K' );
insert into test(s) values( 'l' );
insert into test(s) values( 'L' );
insert into test(s) values( 'm' );
insert into test(s) values( 'M' );
insert into test(s) values( 'n' );
insert into test(s) values( 'N' );
insert into test(s) values( 'o' );
insert into test(s) values( 'O' );
insert into test(s) values( 'p' );
insert into test(s) values( 'P' );
insert into test(s) values( 'r' );
insert into test(s) values( 'R' );
insert into test(s) values( 's' );
insert into test(s) values( 'S' );
insert into test(s) values( 'š' );
insert into test(s) values( 'Š' );
insert into test(s) values( 'z' );
insert into test(s) values( 'Z' );
insert into test(s) values( 'ž' );
insert into test(s) values( 'Ž' );
insert into test(s) values( 't' );
insert into test(s) values( 'T' );
insert into test(s) values( 'u' );
insert into test(s) values( 'U' );
insert into test(s) values( 'v' );
insert into test(s) values( 'V' );
insert into test(s) values( 'õ' );
insert into test(s) values( 'Õ' );
insert into test(s) values( 'ä' );
insert into test(s) values( 'Ä' );
insert into test(s) values( 'ö' );
insert into test(s) values( 'Ö' );
insert into test(s) values( 'ü' );
insert into test(s) values( 'Ü' );
insert into test(s) values( 'x' );
insert into test(s) values( 'X' );
insert into test(s) values( 'y' );
insert into test(s) values( 'Y' );
commit;
set heading off;
-- test-1:
-- check whether letters are sorted properly when case- and accent-sensitive collation is in use:
-- values of row_number()over(order by a.s) must be equal to ID values.
select a.id,a.s,row_number()over(order by a.s) rn from test a;
-- test-2:
-- check that every character matches to some another (and single) with respect
-- to requirement: "case insensitive, accent insensitive".
-- This means that for 'a' we must find 'A' (but NOT 'ä' or 'Ä') etc.
-- NOTE.
-- 'š', 'ž', 'õ', 'ä', 'ö' and 'ü' - must be considered and sorted as *separate* letters.
-- upper() / lower() for each of them must find match to only *one* character with another ID in the 'test' table
set count on;
select a.id, a.s, b.id, b.s
from test a
join test b on
a.s collate estonian_coll_ci_ai = b.s collate estonian_coll_ci_ai
and a.id < b.id
order by a.id, b.id
;
"""
# Build the ISQL action: it is given the database fixture name, the SQL script
# and the output substitutions defined in this module.
act_1 = isql_act(
    'db_1',
    test_script_1,
    substitutions=substitutions_1,
)
# Expected ISQL output for the script.
# First 60 rows (test-1): id, letter, row_number() -- the row number equals id,
# i.e. sorting by the collation reproduces the insertion order.
# Next 30 rows (test-2): every lowercase letter is paired only with its own
# uppercase counterpart (a.id, a.s, b.id, b.s), followed by the
# 'Records affected' line for that query.
# NOTE: the text inside the literal is compared against real output; do not reformat it.
expected_stdout_1 = """
1 a 1
2 A 2
3 b 3
4 B 4
5 c 5
6 C 6
7 d 7
8 D 8
9 e 9
10 E 10
11 f 11
12 F 12
13 g 13
14 G 14
15 h 15
16 H 16
17 i 17
18 I 18
19 j 19
20 J 20
21 k 21
22 K 22
23 l 23
24 L 24
25 m 25
26 M 26
27 n 27
28 N 28
29 o 29
30 O 30
31 p 31
32 P 32
33 r 33
34 R 34
35 s 35
36 S 36
37 š 37
38 Š 38
39 z 39
40 Z 40
41 ž 41
42 Ž 42
43 t 43
44 T 44
45 u 45
46 U 46
47 v 47
48 V 48
49 õ 49
50 Õ 50
51 ä 51
52 Ä 52
53 ö 53
54 Ö 54
55 ü 55
56 Ü 56
57 x 57
58 X 58
59 y 59
60 Y 60
1 a 2 A
3 b 4 B
5 c 6 C
7 d 8 D
9 e 10 E
11 f 12 F
13 g 14 G
15 h 16 H
17 i 18 I
19 j 20 J
21 k 22 K
23 l 24 L
25 m 26 M
27 n 28 N
29 o 30 O
31 p 32 P
33 r 34 R
35 s 36 S
37 š 38 Š
39 z 40 Z
41 ž 42 Ž
43 t 44 T
45 u 46 U
47 v 48 V
49 õ 50 Õ
51 ä 52 Ä
53 ö 54 Ö
55 ü 56 Ü
57 x 58 X
59 y 60 Y
Records affected: 30
"""
@pytest.mark.version('>=4.0')
def test_1(act_1: Action):
    """Run the Estonian-collation script and compare its output with the expected text.

    The expected output is assigned to the action before execution; afterwards
    the action's cleaned expected and cleaned actual stdout must be equal.
    (Presumably "cleaned" means the configured substitutions were applied --
    confirm against the firebird-qa plugin.)
    """
    act_1.expected_stdout = expected_stdout_1
    act_1.execute()

    # Read both sides in the same order as the comparison: expected first, then actual.
    expected_text = act_1.clean_expected_stdout
    actual_text = act_1.clean_stdout
    assert expected_text == actual_text