Added/Updated tests\bugs\core_0214_test.py: fully re-implemented. See notes.
This commit is contained in:
parent de2237e06e
commit 3ea708d076
@@ -2,195 +2,275 @@
"""
ID: issue-542
ISSUE: 542
TITLE: Count ( DISTINCT ... ) is too slow
ISSUE: https://github.com/FirebirdSQL/firebird/issues/542
TITLE: Count (DISTINCT ...) is too slow
DESCRIPTION:
    This test does the following:
    1. Creates several tables with different numbers of unique values in field ID.
    2. Measures for each table the time for two statements:
        2.1. select count(*) from ( select distinct id from ... )
             vs
        2.2. select count(distinct id) from ...
    3. If the time for 2.1 exceeds the time for 2.2 by more than <X> times - outputs a message
       about a possible regression. After multiple runs it was found that the ratio for
       2.1 vs 2.2 is about 1.05 ... 1.10. Constant <X> (threshold) was selected
       to be large enough not to be violated.
JIRA: CORE-214
FBTEST: bugs.core_0214
"""
NOTES:
    [29.11.2024] pzotov
    1. Fix was 31-may-2015, https://sourceforge.net/p/firebird/code/61671/ (3.0.0.31846)
       Query 'select count(distinct id) from <table_with_200000_rows>' (hereafter "q1") elapsed time:
       * before fix: ~840 ms
       * after fix: ~248 ms
       (and this value is close to the time for query 'select count(*) from (select distinct id from ...)', hereafter "q2")
    2. Test was fully re-implemented: we have to measure the difference between cpu.user_time values instead of datediff().
       Each query for each datatype is run <N_MEASURES> times, with the cpu.user_time difference stored as an array element.
       Then we evaluate the median value for this array and save it for further comparison with <MAX_RATIO>.
       (A minimal illustrative sketch of this measurement pattern is shown right after this docstring.)
    3. It was encountered that for several datatypes the ratio between CPU time for q1 and q2 remains unusually high:
       * for DECFLOAT it can be about 3...4;
       * for VARCHAR it depends on the column length and the number of bytes per character in the charset:
         ** for a single-byte charset (e.g. win1251 etc) the ratio is ~2 for field size ~100 and more than 10 for size 1k ... 4k;
         ** for multi-byte (utf8) the ratio is 5...7 for field size ~100 and 35...60 for size 1k ... 4k (depends on FB version)
       For all other datatypes the ratio is about 0.9 ... 1.2.
       Time ratio for LIST(distinct ...) was also measured. Results are the same as for COUNT.
       Test is considered as passed if the ratios for all datatypes are less than <MAX_RATIO>.
    4. Because of the necessity to measure the ratio for datatypes that are absent in FB 3.x, it was decided to increase min_version to 4.0
       (plus, there won't be any noticeable changes in FB 3.x code).
    5. A new ticket has been created to describe the problem with DECFLOAT and VARCHAR datatypes:
       https://github.com/FirebirdSQL/firebird/issues/8330
       (it contains two Excel files with comparison for misc datatypes and different declared lengths of the varchar column).
       Tests for these datatypes will be added after the fix of this ticket.

    Checked on Windows (SS/CS): 6.0.0.535; 5.0.2.1569; 4.0.6.3169.
"""

import psutil
import time
import pytest
from firebird.qa import *

db = db_factory()

#--------------------------------------------------------------------
def median(lst):
    n = len(lst)
    s = sorted(lst)
    return (sum(s[n//2-1:n//2+1])/2.0, s[n//2])[n % 2] if n else None
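The one-liner above returns the middle element for odd-length input, the mean of the two middle elements for even-length input, and None for an empty list; a quick illustration with arbitrary values:

# median([3, 1, 2])    -> 2
# median([4, 1, 3, 2]) -> 2.5
# median([])           -> None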
#--------------------------------------------------------------------

MAX_DIFF = 3.0
###########################
### S E T T I N G S ###
###########################
N_MEASURES = 11
PAGE_SIZE = 8192
N_ROWS_CNT = 100000
MAX_RATIO = 2.0

test_script = f"""
|
||||
recreate table test1e1(id int); -- 10^1 distinct values
|
||||
recreate table test1e2(id int); -- 10^2 distinct values
|
||||
recreate table test1e3(id int); -- 10^3 distinct values
|
||||
recreate table test1e4(id int); -- 10^4 distinct values
|
||||
recreate table test1e5(id int); -- 10^5 distinct values
|
||||
#----------------------------------------------------
|
||||
# NOT yet used. See #8330:
|
||||
TXT_WIDTH_SINGLE_BYTE_ENCODING = int(PAGE_SIZE/2 + 1)
|
||||
TXT_WIDTH_MULTI_BYTE_ENCODING = int(PAGE_SIZE/2 + 1)
|
||||
SINGLE_BYTE_TEXT = 'u' * 4097
|
||||
MULTI_BYTE_TEXT = 'λ' * 4097
|
||||
#----------------------------------------------------
|
||||
|
||||
init_sql = f"""
|
||||
set bail on;
|
||||
recreate table test_one_unique_value(
|
||||
f_sml smallint -- 0
|
||||
,f_int int -- 1
|
||||
,f_big bigint -- 2
|
||||
,f_i128 int128 -- 3
|
||||
,f_bool boolean -- 4
|
||||
,f_dt date -- 5
|
||||
,f_tm time -- 6
|
||||
,f_ts timestamp -- 7
|
||||
,f_tmtz time with time zone -- 8
|
||||
,f_tstz timestamp with time zone -- 9
|
||||
,f_num numeric(2,2) -- 10
|
||||
,f_dec decimal(2,2) -- 11
|
||||
,f_dbl double precision -- 12
|
||||
|
||||
-- commented out until #8330 remains unfixed:
|
||||
---------------------------------------------
|
||||
--,f_decf decfloat -- 13
|
||||
--,f_txt_1251 varchar({TXT_WIDTH_SINGLE_BYTE_ENCODING}) character set win1251 -- 14
|
||||
--,f_txt_utf8 varchar({TXT_WIDTH_MULTI_BYTE_ENCODING}) character set utf8 -- 15
|
||||
---------------------------------------------
|
||||
);

    recreate table test_null_in_all_rows(
    nul_sml smallint
    ,nul_int int
    ,nul_big bigint
    ,nul_i128 int128
    ,nul_bool boolean
    ,nul_dt date
    ,nul_tm time
    ,nul_ts timestamp
    ,nul_tmtz time with time zone
    ,nul_tstz timestamp with time zone
    ,nul_num numeric(2,2)
    ,nul_dec decimal(2,2)
    ,nul_dbl double precision
    --,nul_decf decfloat
    --,nul_txt_1251 varchar({TXT_WIDTH_SINGLE_BYTE_ENCODING}) character set win1251
    --,nul_txt_utf8 varchar({TXT_WIDTH_MULTI_BYTE_ENCODING}) character set utf8
    );
    commit;

    create or alter view v_fill as
    with recursive
    r as(select 0 i from rdb$database union all select r.i+1 from r where r.i<9)
    select r4.i * 10000 + r3.i * 1000 + r2.i * 100 + r1.i * 10 + r0.i as id
    from r r4, r r3, r r2, r r1, r r0;
    commit;

    insert into test1e1 select mod(id, 10) from v_fill;
    insert into test1e2 select mod(id, 100) from v_fill;
    insert into test1e3 select mod(id, 1000) from v_fill;
    insert into test1e4 select mod(id, 10000) from v_fill;
    insert into test1e5 select mod(id, 100000) from v_fill;
    commit;

    set list on;

    set term ^;

    execute block returns (
    ratio_for_1e1 varchar(150)
    ,ratio_for_1e2 varchar(150)
    ,ratio_for_1e3 varchar(150)
    ,ratio_for_1e4 varchar(150)
    ,ratio_for_1e5 varchar(150)
    )
    as
    -- ############################################
    -- ############ T H R E S H O L D ########

    -- Before 28.10.2015: 1.85 (changed after letter by dimitr).
    -- Probably random disturbance was caused by other (concurrent) processes on test host.
    -- Check with new threshold was done on: WI-V2.5.5.26942 (SC) and WI-V3.0.0.32134 (CS/SC/SS).

    declare max_diff_threshold numeric(10,4) = {MAX_DIFF};

    -- ############################################

    declare ratio_select_vs_count_1e1 numeric(10,4);
    declare ratio_select_vs_count_1e2 numeric(10,4);
    declare ratio_select_vs_count_1e3 numeric(10,4);
    declare ratio_select_vs_count_1e4 numeric(10,4);
    declare ratio_select_vs_count_1e5 numeric(10,4);
    declare sel_distinct_1e1_ms int;
    declare cnt_distinct_1e1_ms int;
    declare sel_distinct_1e2_ms int;
    declare cnt_distinct_1e2_ms int;
    declare sel_distinct_1e3_ms int;
    declare cnt_distinct_1e3_ms int;
    declare sel_distinct_1e4_ms int;
    declare cnt_distinct_1e4_ms int;
    declare sel_distinct_1e5_ms int;
    declare cnt_distinct_1e5_ms int;
    declare n int;
    declare t0 timestamp;
    create or alter procedure sp_fill(a_cnt int) returns(id int) as
    begin
    t0='now';
    select count(*) from ( select distinct id from test1e1 ) into n;
    sel_distinct_1e1_ms = datediff(millisecond from t0 to cast('now' as timestamp));
    id = 0;
    while (id < a_cnt) do
    begin
    suspend;
    id = id + 1;
    end
    end ^
    set term ;^
    commit;

    t0='now';
    select count(distinct id) from test1e1 into n;
    cnt_distinct_1e1_ms = datediff(millisecond from t0 to cast('now' as timestamp));
    insert into test_one_unique_value (
    f_sml -- 0
    ,f_int -- 1
    ,f_big -- 2
    ,f_i128 -- 3
    ,f_bool -- 4
    ,f_dt -- 5
    ,f_tm -- 6
    ,f_ts -- 7
    ,f_tmtz -- 8
    ,f_tstz -- 9
    ,f_num -- 10
    ,f_dec -- 11
    ,f_dbl -- 12
    --,f_decf -- 13
    --,f_txt_1251 -- 14
    --,f_txt_utf8 -- 15
    )
    select
    -32768 -- 0
    ,-2147483648 -- 1
    ,-9223372036854775808 -- 2
    ,-170141183460469231731687303715884105728 -- 3
    ,true -- 4
    ,date '19.12.2023' -- 5
    ,time '23:59:59' -- 6
    ,timestamp '19.12.2023 23:59:59' -- 7
    ,time '11:11:11.111 Indian/Cocos' -- 8
    ,timestamp '2018-12-31 12:31:42.543 Pacific/Fiji' -- 9
    ,-327.68 -- 10
    ,-327.68 -- 11
    ,pi() -- 12
    --,exp(1) -- cast(-9.999999999999999999999999999999999E6144 as decfloat(34)) -- 13
    --,lpad('', {TXT_WIDTH_SINGLE_BYTE_ENCODING}, '{SINGLE_BYTE_TEXT}') -- 14
    --,lpad('', {TXT_WIDTH_MULTI_BYTE_ENCODING}, '{MULTI_BYTE_TEXT}') -- 15
    from sp_fill({N_ROWS_CNT}) as p;

    ratio_select_vs_count_1e1 = 1.0000 * sel_distinct_1e1_ms / cnt_distinct_1e1_ms;

    ------------

    t0='now';
    select count(*) from ( select distinct id from test1e2 ) into n;
    sel_distinct_1e2_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    t0='now';
    select count(distinct id) from test1e2 into n;
    cnt_distinct_1e2_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    ratio_select_vs_count_1e2 = 1.0000 * sel_distinct_1e2_ms / cnt_distinct_1e2_ms;

    ------------

    t0='now';
    select count(*) from ( select distinct id from test1e3 ) into n;
    sel_distinct_1e3_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    t0='now';
    select count(distinct id) from test1e3 into n;
    cnt_distinct_1e3_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    ratio_select_vs_count_1e3 = 1.0000 * sel_distinct_1e3_ms / cnt_distinct_1e3_ms;

    ------------

    t0='now';
    select count(*) from ( select distinct id from test1e4 ) into n;
    sel_distinct_1e4_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    t0='now';
    select count(distinct id) from test1e4 into n;
    cnt_distinct_1e4_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    ratio_select_vs_count_1e4 = 1.0000 * sel_distinct_1e4_ms / cnt_distinct_1e4_ms;

    ------------

    t0='now';
    select count(*) from ( select distinct id from test1e5 ) into n;
    sel_distinct_1e5_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    t0='now';
    select count(distinct id) from test1e5 into n;
    cnt_distinct_1e5_ms = datediff(millisecond from t0 to cast('now' as timestamp));

    ratio_select_vs_count_1e5 = 1.0000 * sel_distinct_1e5_ms / cnt_distinct_1e5_ms;

    ------------

    ratio_for_1e1 = 'Acceptable';
    ratio_for_1e2 = 'Acceptable';
    ratio_for_1e3 = 'Acceptable';
    ratio_for_1e4 = 'Acceptable';
    ratio_for_1e5 = 'Acceptable';

    if (1=0 or ratio_select_vs_count_1e1 > max_diff_threshold) then
    -- Example: RATIO_FOR_1E1 Regression /* perf_issue_tag */: ratio = 3.3695 > 3.0000
    ratio_for_1e1 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e1||' > '||max_diff_threshold;

    if (1=0 or ratio_select_vs_count_1e2 > max_diff_threshold) then
    ratio_for_1e2 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e2||' > '||max_diff_threshold;

    if (1=0 or ratio_select_vs_count_1e3 > max_diff_threshold) then
    ratio_for_1e3 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e3||' > '||max_diff_threshold;

    if (1=0 or ratio_select_vs_count_1e4 > max_diff_threshold) then
    ratio_for_1e4 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e4||' > '||max_diff_threshold;

    if (1=0 or ratio_select_vs_count_1e5 > max_diff_threshold) then
    ratio_for_1e5 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e5||' > '||max_diff_threshold;


    suspend;

    end
    ^ set term ;^
    insert into test_null_in_all_rows
    select
    null -- 0
    ,null -- 1
    ,null -- 2
    ,null -- 3
    ,null -- 4
    ,null -- 5
    ,null -- 6
    ,null -- 7
    ,null -- 8
    ,null -- 9
    ,null -- 10
    ,null -- 11
    ,null -- 12
    --,null -- 13
    --,null -- 14
    --,null -- 15
    from test_one_unique_value;
    commit;
"""
db = db_factory(init = init_sql, page_size = PAGE_SIZE)

act = isql_act('db', test_script, substitutions = [('[ \t]+', ' ')])
act = python_act('db', substitutions = [('[ \t]+', ' ')])

expected_stdout = f"""
    RATIO_FOR_1E1 Acceptable
    RATIO_FOR_1E2 Acceptable
    RATIO_FOR_1E3 Acceptable
    RATIO_FOR_1E4 Acceptable
    RATIO_FOR_1E5 Acceptable
"""
@pytest.mark.version('>=4')
def test_1(act: Action, capsys):

@pytest.mark.version('>=3')
def test_1(act: Action):
    act.expected_stdout = expected_stdout
    act.execute(combine_output = True)
    with act.db.connect() as con:
        cur1=con.cursor()
        cur2=con.cursor()

        cur1.execute('select mon$server_pid as p from mon$attachments where mon$attachment_id = current_connection')
        fb_pid = int(cur1.fetchone()[0])

        run_cpu_map = {}
        for t_name in ('test_one_unique_value', 'test_null_in_all_rows'):

            fields_qry = f"""
                select rf.rdb$field_name
                from rdb$relation_fields rf
                where rf.rdb$relation_name = upper('{t_name}')
                order by rf.rdb$field_position
            """
            cur1.execute(fields_qry)
            fields_lst = [x[0].strip() for x in cur1.fetchall()]

            for f_name in fields_lst:
                query1 = f'select count(*) from (select distinct {f_name} from {t_name})'
                query2 = f'select count(distinct {f_name}) from {t_name}'
                ps1 = cur1.prepare(query1)
                ps2 = cur2.prepare(query2)
                for c in (cur1, cur2):
                    cpu_usage_values = []
                    psc = ps1 if c == cur1 else ps2
                    for i in range(0, N_MEASURES):
                        fb_info_init = psutil.Process(fb_pid).cpu_times()
                        rs = c.execute(psc)
                        c.fetchall()
                        rs.close()
                        fb_info_curr = psutil.Process(fb_pid).cpu_times()
                        cpu_usage_values.append( max(fb_info_curr.user - fb_info_init.user, 0.000001) )

                    v = run_cpu_map.get( (t_name,f_name), [0,0,0, '',''])
                    if psc == ps1:
                        v[0] = median(cpu_usage_values) # 'select count(*) from (select distinct ...)'
                    else:
                        v[1] = median(cpu_usage_values) # 'select count(distinct ...) from ...'
                        v[2] = v[0] / v[1]
                        v[3] = query1
                        v[4] = query2
                    run_cpu_map[ (t_name,f_name) ] = v

                ps1.free()
                ps2.free()

        poor_ratios_lst = []

        #for k,v in run_cpu_map.items():
        #    print(':::',k,':::')
        #    cpu_median_1, cpu_median_2, cpu_medians_ratio, query_1, query_2 = v
        #    # f'{ra=:12.4f}'
        #    msg = '\n'.join(
        #            ( f'{query_1=}'
        #             ,f'{query_2=}'
        #             ,f'{cpu_median_1=:12.4f} {cpu_median_2=:12.4f} {cpu_medians_ratio=:12.6f}'
        #            )
        #          )
        #    print(msg)
        #    print('-------------------------------------------------------------------------------------')


        msg_prefix = 'CPU time medians ratio: '
        msg_expected = msg_prefix + 'EXPECTED.'
        for k,v in run_cpu_map.items():
            if v[2] > MAX_RATIO:
                poor_ratios_lst.append( '\n'.join(
                        ( 'query_1: ' + v[3]
                         ,'query_2: ' + v[4]
                         ,f'cpu_median_1: {v[0]:12.6f}'
                         ,f'cpu_median_2: {v[1]:12.6f}'
                         ,f'cpu_median_1 / cpu_median_2: {v[0]/v[1]:12.6f}'
                        )
                    )
                )

        if poor_ratios_lst:
            print(f'{msg_prefix} /* perf_issue_tag */ UNEXPECTED. Following ratio(s) exceeds MAX_RATIO={MAX_RATIO}:')
            for x in poor_ratios_lst:
                print(x)
        else:
            print(msg_expected)


    act.expected_stdout = msg_expected
    act.stdout = capsys.readouterr().out
    assert act.clean_stdout == act.clean_expected_stdout