diff --git a/tests/bugs/core_0214_test.py b/tests/bugs/core_0214_test.py index 0f7ddf5c..3e0e48d1 100644 --- a/tests/bugs/core_0214_test.py +++ b/tests/bugs/core_0214_test.py @@ -2,195 +2,275 @@ """ ID: issue-542 -ISSUE: 542 -TITLE: Count ( DISTINCT ... ) is too slow +ISSUE: https://github.com/FirebirdSQL/firebird/issues/542 +TITLE: Count (DISTINCT ...) is too slow DESCRIPTION: - This test does following: - 1. Creates several tables with different number of unique values in field ID. - 2. Measures for each table time for two statements: - 2.1. select count(*) from ( select distinct id from ... ) - vs - 2.2. select count(distinct id) from ... - 3. If time for 2.1 exceeds time for 2.2 more than times - output message - about possible regression. After multiple runs it was found that ratio for - 2.1 vs 2.2 is about 1.05 ... 1.10. Constant (threshold) was selected - to be enough for not to be "violated". JIRA: CORE-214 FBTEST: bugs.core_0214 -""" +NOTES: + [29.11.2024] pzotov + 1. Fix was 31-may-2015, https://sourceforge.net/p/firebird/code/61671/ (3.0.0.31846) + Query 'select count(distinct id) from ' (hereafter "q1") elapsed time: + * before fix: ~840 ms + * after fix: ~248 ms + (and this value is close to the time for query 'select count(*) from (select distinct id from ...)', hereafter "q2") + 2. Test was fully re-implemented: we have to measure difference between cpu.user_time values instead of datediff(). + Each query for each datatype is done times, with storing cpu.user_time difference as array element. + Then we evaluate median value for this array and save it for further comparison with . + 3. It was encountered that for several datatypes ratio between CPU time for q1 and q2 remains unusually high: + * for DECFLOAT it can be about 3...4; + * for VARCHAR it depends on the column length and number of bytes in the charset: + ** for single-byte charset (e.g. win1251 etc) ratio is ~2 for field size ~100 and more than 10 for size 1k ... 4k; + ** for mylti-byte (utf8) ratio is 5...7 for field size ~100 and 35...60 for size 1k ...4k (depends on FB version) + For all other datatypes ratio is about 0.9 ... 1.2. + Time ratio for LIST(distinct ...) was also measured. Results are the same as for COUNT. + Test is considered as passed if ratios for all datatypes are less than . + 4. Because of necessity to measure ratio for datatypes that absent in FB 3.x, it was decided to increase min_version to 4.0 + (plus, there won't be any niticeable changes in FB 3.x code). + 5. A new ticket has been created to describe problem with DECFLOAT and VARCHAR datatypes: + https://github.com/FirebirdSQL/firebird/issues/8330 + (it contains two excel files with comparison for misc datatypes and different declared length of varchar column). + Test for these datatypes will be added after fix of this ticked. + Checked on Windiws (SS/CS): 6.0.0.535; 5.0.2.1569; 4.0.6.3169. +""" +import psutil +import time import pytest from firebird.qa import * -db = db_factory() +#-------------------------------------------------------------------- +def median(lst): + n = len(lst) + s = sorted(lst) + return (sum(s[n//2-1:n//2+1])/2.0, s[n//2])[n % 2] if n else None +#-------------------------------------------------------------------- -MAX_DIFF = 3.0 +########################### +### S E T T I N G S ### +########################### +N_MEASURES = 11 +PAGE_SIZE = 8192 +N_ROWS_CNT= 100000 +MAX_RATIO = 2.0 -test_script = f""" - recreate table test1e1(id int); -- 10^1 distinct values - recreate table test1e2(id int); -- 10^2 distinct values - recreate table test1e3(id int); -- 10^3 distinct values - recreate table test1e4(id int); -- 10^4 distinct values - recreate table test1e5(id int); -- 10^5 distinct values +#---------------------------------------------------- +# NOT yet used. See #8330: +TXT_WIDTH_SINGLE_BYTE_ENCODING = int(PAGE_SIZE/2 + 1) +TXT_WIDTH_MULTI_BYTE_ENCODING = int(PAGE_SIZE/2 + 1) +SINGLE_BYTE_TEXT = 'u' * 4097 +MULTI_BYTE_TEXT = 'λ' * 4097 +#---------------------------------------------------- + +init_sql = f""" + set bail on; + recreate table test_one_unique_value( + f_sml smallint -- 0 + ,f_int int -- 1 + ,f_big bigint -- 2 + ,f_i128 int128 -- 3 + ,f_bool boolean -- 4 + ,f_dt date -- 5 + ,f_tm time -- 6 + ,f_ts timestamp -- 7 + ,f_tmtz time with time zone -- 8 + ,f_tstz timestamp with time zone -- 9 + ,f_num numeric(2,2) -- 10 + ,f_dec decimal(2,2) -- 11 + ,f_dbl double precision -- 12 + + -- commented out until #8330 remains unfixed: + --------------------------------------------- + --,f_decf decfloat -- 13 + --,f_txt_1251 varchar({TXT_WIDTH_SINGLE_BYTE_ENCODING}) character set win1251 -- 14 + --,f_txt_utf8 varchar({TXT_WIDTH_MULTI_BYTE_ENCODING}) character set utf8 -- 15 + --------------------------------------------- + ); + + recreate table test_null_in_all_rows( + nul_sml smallint + ,nul_int int + ,nul_big bigint + ,nul_i128 int128 + ,nul_bool boolean + ,nul_dt date + ,nul_tm time + ,nul_ts timestamp + ,nul_tmtz time with time zone + ,nul_tstz timestamp with time zone + ,nul_num numeric(2,2) + ,nul_dec decimal(2,2) + ,nul_dbl double precision + --,nul_decf decfloat + --,nul_txt_1251 varchar({TXT_WIDTH_SINGLE_BYTE_ENCODING}) character set win1251 + --,nul_txt_utf8 varchar({TXT_WIDTH_MULTI_BYTE_ENCODING}) character set utf8 + ); commit; - create or alter view v_fill as - with recursive - r as(select 0 i from rdb$database union all select r.i+1 from r where r.i<9) - select r4.i * 10000 + r3.i * 1000 + r2.i * 100 + r1.i * 10 + r0.i as id - from r r4, r r3, r r2, r r1, r r0; - commit; - - insert into test1e1 select mod(id, 10) from v_fill; - insert into test1e2 select mod(id, 100) from v_fill; - insert into test1e3 select mod(id, 1000) from v_fill; - insert into test1e4 select mod(id, 10000) from v_fill; - insert into test1e5 select mod(id, 100000) from v_fill; - commit; - - set list on; - set term ^; - - execute block returns ( - ratio_for_1e1 varchar(150) - ,ratio_for_1e2 varchar(150) - ,ratio_for_1e3 varchar(150) - ,ratio_for_1e4 varchar(150) - ,ratio_for_1e5 varchar(150) - ) - as - -- ############################################ - -- ############ T H R E S H O L D ######## - - -- Before 28.10.2015: 1.85 (changed after letter by dimitr). - -- Probably random disturbance was caused by other (concurrent) processes on test host. - -- Check with new threshold was done on: WI-V2.5.5.26942 (SC) and WI-V3.0.0.32134 (CS/SC/SS). - - declare max_diff_threshold numeric(10,4) = {MAX_DIFF}; - - -- ############################################ - - declare ratio_select_vs_count_1e1 numeric(10,4); - declare ratio_select_vs_count_1e2 numeric(10,4); - declare ratio_select_vs_count_1e3 numeric(10,4); - declare ratio_select_vs_count_1e4 numeric(10,4); - declare ratio_select_vs_count_1e5 numeric(10,4); - declare sel_distinct_1e1_ms int; - declare cnt_distinct_1e1_ms int; - declare sel_distinct_1e2_ms int; - declare cnt_distinct_1e2_ms int; - declare sel_distinct_1e3_ms int; - declare cnt_distinct_1e3_ms int; - declare sel_distinct_1e4_ms int; - declare cnt_distinct_1e4_ms int; - declare sel_distinct_1e5_ms int; - declare cnt_distinct_1e5_ms int; - declare n int; - declare t0 timestamp; + create or alter procedure sp_fill(a_cnt int) returns(id int) as begin - t0='now'; - select count(*) from ( select distinct id from test1e1 ) into n; - sel_distinct_1e1_ms = datediff(millisecond from t0 to cast('now' as timestamp)); + id = 0; + while (id < a_cnt) do + begin + suspend; + id = id + 1; + end + end ^ + set term ;^ + commit; - t0='now'; - select count(distinct id) from test1e1 into n; - cnt_distinct_1e1_ms = datediff(millisecond from t0 to cast('now' as timestamp)); + insert into test_one_unique_value ( + f_sml -- 0 + ,f_int -- 1 + ,f_big -- 2 + ,f_i128 -- 3 + ,f_bool -- 4 + ,f_dt -- 5 + ,f_tm -- 6 + ,f_ts -- 7 + ,f_tmtz -- 8 + ,f_tstz -- 9 + ,f_num -- 10 + ,f_dec -- 11 + ,f_dbl -- 12 + --,f_decf -- 13 + --,f_txt_1251 -- 14 + --,f_txt_utf8 -- 15 + ) + select + -32768 -- 0 + ,-2147483648 -- 1 + ,-9223372036854775808 -- 2 + ,-170141183460469231731687303715884105728 -- 3 + ,true -- 4 + ,date '19.12.2023' -- 5 + ,time '23:59:59' -- 6 + ,timestamp '19.12.2023 23:59:59' -- 7 + ,time '11:11:11.111 Indian/Cocos' -- 8 + ,timestamp '2018-12-31 12:31:42.543 Pacific/Fiji' -- 9 + ,-327.68 -- 10 + ,-327.68 -- 11 + ,pi() -- 12 + --,exp(1) -- cast(-9.999999999999999999999999999999999E6144 as decfloat(34)) -- 13 + --,lpad('', {TXT_WIDTH_SINGLE_BYTE_ENCODING}, '{SINGLE_BYTE_TEXT}') -- 14 + --,lpad('', {TXT_WIDTH_MULTI_BYTE_ENCODING}, '{MULTI_BYTE_TEXT}') -- 15 + from sp_fill({N_ROWS_CNT}) as p; - ratio_select_vs_count_1e1 = 1.0000 * sel_distinct_1e1_ms / cnt_distinct_1e1_ms; - - ------------ - - t0='now'; - select count(*) from ( select distinct id from test1e2 ) into n; - sel_distinct_1e2_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - t0='now'; - select count(distinct id) from test1e2 into n; - cnt_distinct_1e2_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - ratio_select_vs_count_1e2 = 1.0000 * sel_distinct_1e2_ms / cnt_distinct_1e2_ms; - - ------------ - - t0='now'; - select count(*) from ( select distinct id from test1e3 ) into n; - sel_distinct_1e3_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - t0='now'; - select count(distinct id) from test1e3 into n; - cnt_distinct_1e3_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - ratio_select_vs_count_1e3 = 1.0000 * sel_distinct_1e3_ms / cnt_distinct_1e3_ms; - - ------------ - - t0='now'; - select count(*) from ( select distinct id from test1e4 ) into n; - sel_distinct_1e4_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - t0='now'; - select count(distinct id) from test1e4 into n; - cnt_distinct_1e4_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - ratio_select_vs_count_1e4 = 1.0000 * sel_distinct_1e4_ms / cnt_distinct_1e4_ms; - - ------------ - - t0='now'; - select count(*) from ( select distinct id from test1e5 ) into n; - sel_distinct_1e5_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - t0='now'; - select count(distinct id) from test1e5 into n; - cnt_distinct_1e5_ms = datediff(millisecond from t0 to cast('now' as timestamp)); - - ratio_select_vs_count_1e5 = 1.0000 * sel_distinct_1e5_ms / cnt_distinct_1e5_ms; - - ------------ - - ratio_for_1e1 = 'Acceptable'; - ratio_for_1e2 = 'Acceptable'; - ratio_for_1e3 = 'Acceptable'; - ratio_for_1e4 = 'Acceptable'; - ratio_for_1e5 = 'Acceptable'; - - if (1=0 or ratio_select_vs_count_1e1 > max_diff_threshold) then - -- Example: RATIO_FOR_1E1 Regression /* perf_issue_tag */: ratio = 3.3695 > 3.0000 - ratio_for_1e1 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e1||' > '||max_diff_threshold; - - if (1=0 or ratio_select_vs_count_1e2 > max_diff_threshold) then - ratio_for_1e2 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e2||' > '||max_diff_threshold; - - if (1=0 or ratio_select_vs_count_1e3 > max_diff_threshold) then - ratio_for_1e3 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e3||' > '||max_diff_threshold; - - if (1=0 or ratio_select_vs_count_1e4 > max_diff_threshold) then - ratio_for_1e4 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e4||' > '||max_diff_threshold; - - if (1=0 or ratio_select_vs_count_1e5 > max_diff_threshold) then - ratio_for_1e5 = 'Regression /* perf_issue_tag */: ratio = '||ratio_select_vs_count_1e5||' > '||max_diff_threshold; - - - suspend; - - end - ^ set term ;^ + insert into test_null_in_all_rows + select + null -- 0 + ,null -- 1 + ,null -- 2 + ,null -- 3 + ,null -- 4 + ,null -- 5 + ,null -- 6 + ,null -- 7 + ,null -- 8 + ,null -- 9 + ,null -- 10 + ,null -- 11 + ,null -- 12 + --,null -- 13 + --,null -- 14 + --,null -- 15 + from test_one_unique_value; + commit; """ +db = db_factory(init = init_sql, page_size = PAGE_SIZE) -act = isql_act('db', test_script,substitutions = [('[ \t]+', ' ')]) +act = python_act('db', substitutions = [('[ \t]+', ' ')]) -expected_stdout = f""" - RATIO_FOR_1E1 Acceptable - RATIO_FOR_1E2 Acceptable - RATIO_FOR_1E3 Acceptable - RATIO_FOR_1E4 Acceptable - RATIO_FOR_1E5 Acceptable -""" +@pytest.mark.version('>=4') +def test_1(act: Action, capsys): -@pytest.mark.version('>=3') -def test_1(act: Action): - act.expected_stdout = expected_stdout - act.execute(combine_output = True) + with act.db.connect() as con: + cur1=con.cursor() + cur2=con.cursor() + + cur1.execute('select mon$server_pid as p from mon$attachments where mon$attachment_id = current_connection') + fb_pid = int(cur1.fetchone()[0]) + + run_cpu_map = {} + for t_name in ('test_one_unique_value', 'test_null_in_all_rows'): + + fields_qry = f""" + select rf.rdb$field_name + from rdb$relation_fields rf + where rf.rdb$relation_name = upper('{t_name}') + order by rf.rdb$field_position + """ + cur1.execute(fields_qry) + fields_lst = [x[0].strip() for x in cur1.fetchall()] + + for f_name in fields_lst: + query1 = f'select count(*) from (select distinct {f_name} from {t_name})' + query2 = f'select count(distinct {f_name}) from {t_name}' + ps1 = cur1.prepare(query1) + ps2 = cur2.prepare(query2) + for c in (cur1, cur2): + cpu_usage_values = [] + psc = ps1 if c == cur1 else ps2 + for i in range(0, N_MEASURES): + fb_info_init = psutil.Process(fb_pid).cpu_times() + rs = c.execute(psc) + c.fetchall() + rs.close() + fb_info_curr = psutil.Process(fb_pid).cpu_times() + cpu_usage_values.append( max(fb_info_curr.user - fb_info_init.user, 0.000001) ) + + v = run_cpu_map.get( (t_name,f_name), [0,0,0, '','']) + if psc == ps1: + v[0] = median(cpu_usage_values) # 'select count(*) from (select distinct ...)' + else: + v[1] = median(cpu_usage_values) # 'select count(distinct ...) from ...' + v[2] = v[0] / v[1] + v[3] = query1 + v[4] = query2 + run_cpu_map[ (t_name,f_name) ] = v + + ps1.free() + ps2.free() + + poor_ratios_lst = [] + + #for k,v in run_cpu_map.items(): + # print(':::',k,':::') + # cpu_median_1, cpu_median_2, cpu_medians_ratio, query_1, query_2 = v + # # f'{ra=:12.4f}' + # msg = '\n'.join( + # ( f'{query_1=}' + # ,f'{query_2=}' + # ,f'{cpu_median_1=:12.4f} {cpu_median_2=:12.4f} {cpu_medians_ratio=:12.6f}' + # ) + # ) + # print(msg) + # print('-------------------------------------------------------------------------------------') + + + msg_prefix = 'CPU time medians ratio: ' + msg_expected = msg_prefix + 'EXPECTED.' + for k,v in run_cpu_map.items(): + if v[2] > MAX_RATIO: + poor_ratios_lst.append( '\n'.join( + ( 'query_1: ' + v[3] + ,'query_2: ' + v[4] + ,f'cpu_median_1: {v[0]:12.6f}' + ,f'cpu_median_2: {v[1]:12.6f}' + ,f'cpu_median_1 / cpu_median_2: {v[0]/v[1]:12.6f}' + ) + ) + ) + + if poor_ratios_lst: + print(f'{msg_prefix} /* perf_issue_tag */ UNEXPECTED. Following ratio(s) exceeds MAX_RATIO={MAX_RATIO}:') + for x in poor_ratios_lst: + print(x) + else: + print(msg_expected) + + + act.expected_stdout = msg_expected + act.stdout = capsys.readouterr().out assert act.clean_stdout == act.clean_expected_stdout -