firebird-qa/tests/bugs/core_1026_utf8_test.py

#coding:utf-8
#
# id:           bugs.core_1026_utf8
# title:        Estonian collation in UTF8 charset
# decription:
#                    Original ticket subject: Estonian collations for WIN1252 charset
#
#                   http://www.eki.ee/itstandard/2000/FDCC.shtml.en
#               	Estonian sort order changes the default positions for one base character (<z>), š and ž ('s' and 'z' with caron)
#               	and four accented vowels in the Estonian alphabet:
#               		* <z> and all its modifications are ordered after <s>;
#               		* 'š' and 'ž' are separate letters and follow the unaccented <s> and <z> respectively.
#               		* all four Estonian vowels with diacritics - 'ä', 'ö', 'õ' and 'ü' - are also sorted as separate
#               		letters after 'w';
#               		* 'w' is generally sorted as a separate letter except in Estonian personal names.
#               	Thus the Estonian alphabet ends with:
#               	####################################
#               	... r s š z ž t u v w õ ä ö ü x y
#               	####################################
#
#               	 NOTE-1:
#               	 collation = WIN1252_UNICODE (defined for charset WIN1252 in %FB_HOME%\intl\fbintl.conf) sorts
#               	 estonian letters NOT as expected: a,A,ä,Ä,b...,o,O,ö,Ö,õ,Õ,...,s,S,š,Š,t,T,u,U,ü,Ü,...,z,Z,ž,Ž
#
#               	 NOTE-2:
#               	 collation WIN1257_EE (defined for charset WIN1257) sorts estonian letters properly but this collation
#               	 is not what author asked about ("CP 1257 (Baltic) is suggested as second when 1252 is not available.")
#               	 Because of this, UTF8 collation for LOCALE=et_EE is tested here.
#
#               	 NOTE-3:
#               	 lowercase letters are sorted BEFORE uppercase ones ('a' < 'A') when we use most of collations,
#                    including utf8 'LOCALE=et_EE'. This is defined by current ICU implementation.
#               	 Current syntax of CREATE COLLATION statement does not allow to change priority of uppercase/lowercase
#               	 letters which denote the same character when they are sorted.
#
#               	 Result of sorting such letters will be opposite to "ascii-style" where uppercase letters ('A')
#               	 are always considered as less than lowercase ones ('a').
#
#               	 Quote from https://unicode.org/reports/tr10/#Case_Comparisons
#               	 =====
#               	 "In some languages, it is common to sort lowercase before uppercase; in other languages this is reversed.
#               	 Often this is more dependent on the individual concerned, and is not standard across a single language.
#               	 It is strongly recommended that implementations provide parameterization that allows uppercase to be
#               	 sorted before lowercase, and provides information as to the standard (if any) for particular countries".
#               	 ======
#               	 See also:
#               	 http://userguide.icu-project.org/collation/concepts
#
#               	 Checked on 4.0.0.2214
#
# tracker_id:   CORE-1026
# min_versions: ['4.0']
# versions:     4.0
# qmid:         None

import pytest
from firebird.qa import db_factory, isql_act, Action

# version: 4.0
# resources: None

# (pattern, replacement) pairs handed to isql_act below. The single pair replaces
# every run of spaces/tabs with one space -- presumably applied by the framework
# to both actual and expected output so that column padding is ignored (confirm
# against the firebird-qa plugin).
substitutions_1 = [('[ \t]+', ' ')]

# Empty init script: all DDL and data are created by test_script_1 itself.
init_script_1 = """"""

# Database fixture created with charset UTF8 and SQL dialect 3.
db_1 = db_factory(charset='UTF8', sql_dialect=3, init=init_script_1)

# SQL script passed to isql_act. It:
#   * creates two UTF8 collations based on UNICODE with LOCALE=et_EE: a default
#     (case/accent sensitive) one and a "case insensitive accent insensitive" one;
#   * creates table TEST whose column S uses the sensitive collation and fills it
#     with single letters in the expected Estonian order, lowercase before uppercase,
#     so the identity column ID records the expected sort position;
#   * test-1: selects id, letter and row_number() over (order by s); rn must equal id;
#   * test-2: self-joins TEST under the CI/AI collation with a.id < b.id; every letter
#     must match exactly one other row (its other-case twin), 30 pairs in total.
# NOTE(review): the test-1 query has no outer ORDER BY, so the row order of its output
# relies on the order produced by the window sort -- confirm this is guaranteed.
test_script_1 = """
	set bail on;

	-- NB: 'et' in 'LOCALE=...' must be specified in lowercase!
	create collation estonian_coll_cs_as for utf8 from unicode 'LOCALE=et_EE';
	create collation estonian_coll_ci_ai for utf8 from unicode case insensitive accent insensitive 'LOCALE=et_EE';

	create table test(
		id smallint generated by default as identity
		,s varchar(1) character set utf8 collate estonian_coll_cs_as
	);
	commit;


	-- Fill records according to SORT order that is declared by Estonian Standardization Board
	-- http://www.eki.ee/itstandard/2000/FDCC.shtml.en

	insert into test(s) values( 'a' );
	insert into test(s) values( 'A' );
	insert into test(s) values( 'b' );
	insert into test(s) values( 'B' );
	insert into test(s) values( 'c' );
	insert into test(s) values( 'C' );
	insert into test(s) values( 'd' );
	insert into test(s) values( 'D' );
	insert into test(s) values( 'e' );
	insert into test(s) values( 'E' );
	insert into test(s) values( 'f' );
	insert into test(s) values( 'F' );
	insert into test(s) values( 'g' );
	insert into test(s) values( 'G' );
	insert into test(s) values( 'h' );
	insert into test(s) values( 'H' );
	insert into test(s) values( 'i' );
	insert into test(s) values( 'I' );
	insert into test(s) values( 'j' );
	insert into test(s) values( 'J' );
	insert into test(s) values( 'k' );
	insert into test(s) values( 'K' );
	insert into test(s) values( 'l' );
	insert into test(s) values( 'L' );
	insert into test(s) values( 'm' );
	insert into test(s) values( 'M' );
	insert into test(s) values( 'n' );
	insert into test(s) values( 'N' );
	insert into test(s) values( 'o' );
	insert into test(s) values( 'O' );
	insert into test(s) values( 'p' );
	insert into test(s) values( 'P' );
	insert into test(s) values( 'r' );
	insert into test(s) values( 'R' );
	insert into test(s) values( 's' );
	insert into test(s) values( 'S' );

	insert into test(s) values( 'š' );
	insert into test(s) values( 'Š' );
	insert into test(s) values( 'z' );
	insert into test(s) values( 'Z' );
	insert into test(s) values( 'ž' );
	insert into test(s) values( 'Ž' );
	insert into test(s) values( 't' );
	insert into test(s) values( 'T' );
	insert into test(s) values( 'u' );
	insert into test(s) values( 'U' );
	insert into test(s) values( 'v' );
	insert into test(s) values( 'V' );
	insert into test(s) values( 'õ' );
	insert into test(s) values( 'Õ' );
	insert into test(s) values( 'ä' );
	insert into test(s) values( 'Ä' );
	insert into test(s) values( 'ö' );
	insert into test(s) values( 'Ö' );
	insert into test(s) values( 'ü' );
	insert into test(s) values( 'Ü' );
	insert into test(s) values( 'x' );
	insert into test(s) values( 'X' );
	insert into test(s) values( 'y' );
	insert into test(s) values( 'Y' );

	commit;

	set heading off;

	-- test-1:
	-- check whether letters are sorted properly when case- and accent-sensitive collation is in use:
	-- values of row_number()over(order by a.s) must be equal to ID values.
	select a.id,a.s,row_number()over(order by a.s) rn from test a;


	-- test-2:
	-- check that every character matches to some another (and single) with respect
	-- to requirement: "case insensitive, accent insensitive".
	-- This means that for  'a' we must find 'A' (but NOT 'ä' or 'Ä') etc.
	-- NOTE.
	-- 'š', 'ž', 'õ', 'ä', 'ö' and 'ü' - must be considered and sorted as *separate* letters.
	-- upper() / lower() for each of them must find match to only *one* character with another ID in the 'test' table

	set count on;
	select a.id, a.s, b.id, b.s
	from test a
	join test b on
		a.s collate estonian_coll_ci_ai = b.s collate estonian_coll_ci_ai
		and a.id < b.id
	order by a.id, b.id
	;
  """

# Action built by isql_act from the database name 'db_1', the SQL script above and the
# whitespace substitutions; it is consumed by test_1 through its `act_1` parameter.
act_1 = isql_act('db_1', test_script_1, substitutions=substitutions_1)

# Expected isql output (headings are switched off by the script):
#   * 60 rows from test-1 -- id, letter, row number; the row number equals id on every
#     row, i.e. the collation sorts letters exactly in the order they were inserted;
#   * 30 rows from test-2 -- each lowercase letter paired with its uppercase twin only;
#   * the "Records affected" line printed because SET COUNT ON precedes the second query.
expected_stdout_1 = """
		  1 a                          1
		  2 A                          2
		  3 b                          3
		  4 B                          4
		  5 c                          5
		  6 C                          6
		  7 d                          7
		  8 D                          8
		  9 e                          9
		 10 E                         10
		 11 f                         11
		 12 F                         12
		 13 g                         13
		 14 G                         14
		 15 h                         15
		 16 H                         16
		 17 i                         17
		 18 I                         18
		 19 j                         19
		 20 J                         20
		 21 k                         21
		 22 K                         22
		 23 l                         23
		 24 L                         24
		 25 m                         25
		 26 M                         26
		 27 n                         27
		 28 N                         28
		 29 o                         29
		 30 O                         30
		 31 p                         31
		 32 P                         32
		 33 r                         33
		 34 R                         34
		 35 s                         35
		 36 S                         36
		 37 š                         37
		 38 Š                         38
		 39 z                         39
		 40 Z                         40
		 41 ž                         41
		 42 Ž                         42
		 43 t                         43
		 44 T                         44
		 45 u                         45
		 46 U                         46
		 47 v                         47
		 48 V                         48
		 49 õ                         49
		 50 Õ                         50
		 51 ä                         51
		 52 Ä                         52
		 53 ö                         53
		 54 Ö                         54
		 55 ü                         55
		 56 Ü                         56
		 57 x                         57
		 58 X                         58
		 59 y                         59
		 60 Y                         60


		  1 a            2 A
		  3 b            4 B
		  5 c            6 C
		  7 d            8 D
		  9 e           10 E
		 11 f           12 F
		 13 g           14 G
		 15 h           16 H
		 17 i           18 I
		 19 j           20 J
		 21 k           22 K
		 23 l           24 L
		 25 m           26 M
		 27 n           28 N
		 29 o           30 O
		 31 p           32 P
		 33 r           34 R
		 35 s           36 S
		 37 š           38 Š
		 39 z           40 Z
		 41 ž           42 Ž
		 43 t           44 T
		 45 u           46 U
		 47 v           48 V
		 49 õ           50 Õ
		 51 ä           52 Ä
		 53 ö           54 Ö
		 55 ü           56 Ü
		 57 x           58 X
		 59 y           60 Y

	Records affected: 30
  """

@pytest.mark.version('>=4.0')
def test_1(act_1: Action):
    """Run the Estonian-collation script and compare its output with the reference.

    The expected text is registered on the action before execution; afterwards
    the cleaned expected and cleaned actual outputs must be equal.
    """
    act_1.expected_stdout = expected_stdout_1
    act_1.execute()
    # Read expected first, then actual -- same evaluation order as the comparison operands.
    wanted = act_1.clean_expected_stdout
    produced = act_1.clean_stdout
    assert wanted == produced