From 4554ee24f547128ca5102a48b1516b61c86b9266 Mon Sep 17 00:00:00 2001 From: asfernandes Date: Wed, 25 Mar 2015 17:51:54 +0000 Subject: [PATCH] Feature CORE-4722 - Aggregate linear regression functions. --- doc/WhatsNew | 8 + doc/sql.extensions/README.regr_functions.txt | 38 +++ src/dsql/AggNodes.cpp | 320 +++++++++++++++++++ src/dsql/AggNodes.h | 71 ++++ src/dsql/parse.y | 36 +++ src/jrd/blp.h | 9 + src/jrd/blr.h | 9 + src/yvalve/keywords.cpp | 9 + 8 files changed, 500 insertions(+) create mode 100644 doc/sql.extensions/README.regr_functions.txt diff --git a/doc/WhatsNew b/doc/WhatsNew index 33b44cd2cd..97ba755c94 100644 --- a/doc/WhatsNew +++ b/doc/WhatsNew @@ -291,6 +291,14 @@ Contributor(s): Alex Peshkov + * New feature CORE-4722 + Aggregate linear regression functions + See also: + /doc/sql.extensions/README.regr_functions.txt + Contributor(s): + Hajime Nakagami + Adriano dos Santos Fernandes + * New feature CORE-4717 Aggregate statistical functions COVAR_SAMP, COVAR_POP and CORR See also: diff --git a/doc/sql.extensions/README.regr_functions.txt b/doc/sql.extensions/README.regr_functions.txt new file mode 100644 index 0000000000..47857394cf --- /dev/null +++ b/doc/sql.extensions/README.regr_functions.txt @@ -0,0 +1,38 @@ +--------------------------- +Linear Regression Functions +--------------------------- + +REGR_* functions analyze the relationship between 2 numeric sets of data. +These functions only take into account records where both values are not NULL. + +Syntax: + + <regr function> ::= <function name>(<expr1>, <expr2>) + <function name> := { REGR_AVGX | REGR_AVGY | REGR_COUNT | REGR_INTERCEPT | + REGR_R2 | REGR_SLOPE | REGR_SXX | REGR_SXY | REGR_SYY } + +The formulas use the variables below. + +Y: <expr1> (<expr1> IS NOT NULL AND <expr2> IS NOT NULL). +X: <expr2> (<expr1> IS NOT NULL AND <expr2> IS NOT NULL). +N: COUNT of records, excluding those where <expr1> IS NULL OR <expr2> IS NULL. 
+ +Formula: + + REGR_AVGX(Y, X) = SUM(X) / N + REGR_AVGY(Y, X) = SUM(Y) / N + REGR_COUNT(Y, X) = N + REGR_INTERCEPT(Y, X) = REGR_AVGY(Y, X) - REGR_SLOPE(Y, X) * REGR_AVGX(Y, X) + REGR_R2(Y, X) = POWER(CORR(Y, X), 2) + REGR_SLOPE(Y, X) = COVAR_POP(Y, X) / VAR_POP(X) + REGR_SXX(Y, X) = N * VAR_POP(X) + REGR_SXY(Y, X) = N * COVAR_POP(Y, X) + REGR_SYY(Y, X) = N * VAR_POP(Y) + +Author: + + Hajime Nakagami + +Note: + + The functions return NULL if N = 0, except for REGR_COUNT(). diff --git a/src/dsql/AggNodes.cpp b/src/dsql/AggNodes.cpp index ab11d5a8b8..269c355298 100644 --- a/src/dsql/AggNodes.cpp +++ b/src/dsql/AggNodes.cpp @@ -1220,6 +1220,7 @@ DmlNode* StdDevAggNode::parse(thread_db* tdbb, MemoryPool& pool, CompilerScratch default: fb_assert(false); + return NULL; } return FB_NEW(pool) StdDevAggNode(pool, type, PAR_parse_value(tdbb, csb)); @@ -1359,6 +1360,7 @@ DmlNode* CorrAggNode::parse(thread_db* tdbb, MemoryPool& pool, CompilerScratch* default: fb_assert(false); + return NULL; } ValueExprNode* a1 = PAR_parse_value(tdbb, csb); @@ -1488,4 +1490,322 @@ AggNode* CorrAggNode::dsqlCopy(DsqlCompilerScratch* dsqlScratch) /*const*/ } +//-------------------- + +static AggNode::Register regrAvgxAggInfo("REGR_AVGX", blr_agg_regr_avgx); +static AggNode::Register regrAvgyAggInfo("REGR_AVGY", blr_agg_regr_avgy); +static AggNode::Register regrInterceptAggInfo("REGR_INTERCEPT", blr_agg_regr_intercept); +static AggNode::Register regrR2AggInfo("REGR_R2", blr_agg_regr_r2); +static AggNode::Register regrSlopeAggInfo("REGR_SLOPE", blr_agg_regr_slope); +static AggNode::Register regrSxxAggInfo("REGR_SXX", blr_agg_regr_sxx); +static AggNode::Register regrSxyAggInfo("REGR_SXY", blr_agg_regr_sxy); +static AggNode::Register regrSyyAggInfo("REGR_SYY", blr_agg_regr_syy); + +RegrAggNode::RegrAggNode(MemoryPool& pool, RegrType aType, ValueExprNode* aArg, ValueExprNode* aArg2) + : AggNode(pool, + (aType == RegrAggNode::TYPE_REGR_AVGX ? regrAvgxAggInfo : + aType == RegrAggNode::TYPE_REGR_AVGY ? 
regrAvgyAggInfo : + aType == RegrAggNode::TYPE_REGR_INTERCEPT ? regrInterceptAggInfo : + aType == RegrAggNode::TYPE_REGR_R2 ? regrR2AggInfo : + aType == RegrAggNode::TYPE_REGR_SLOPE ? regrSlopeAggInfo : + aType == RegrAggNode::TYPE_REGR_SXX ? regrSxxAggInfo : + aType == RegrAggNode::TYPE_REGR_SXY ? regrSxyAggInfo : + aType == RegrAggNode::TYPE_REGR_SYY ? regrSyyAggInfo : + regrSyyAggInfo), + false, false, aArg), + type(aType), + arg2(aArg2), + impure2Offset(0) +{ + addChildNode(arg2, arg2); +} + +void RegrAggNode::aggPostRse(thread_db* tdbb, CompilerScratch* csb) +{ + AggNode::aggPostRse(tdbb, csb); + impure2Offset = CMP_impure(csb, sizeof(RegrImpure)); +} + +DmlNode* RegrAggNode::parse(thread_db* tdbb, MemoryPool& pool, CompilerScratch* csb, const UCHAR blrOp) +{ + RegrType type; + + switch (blrOp) + { + case blr_agg_regr_avgx: + type = TYPE_REGR_AVGX; + break; + + case blr_agg_regr_avgy: + type = TYPE_REGR_AVGY; + break; + + case blr_agg_regr_intercept: + type = TYPE_REGR_INTERCEPT; + break; + + case blr_agg_regr_r2: + type = TYPE_REGR_R2; + break; + + case blr_agg_regr_slope: + type = TYPE_REGR_SLOPE; + break; + + case blr_agg_regr_sxx: + type = TYPE_REGR_SXX; + break; + + case blr_agg_regr_sxy: + type = TYPE_REGR_SXY; + break; + + case blr_agg_regr_syy: + type = TYPE_REGR_SYY; + break; + + default: + fb_assert(false); + return NULL; + } + + ValueExprNode* a1 = PAR_parse_value(tdbb, csb); + ValueExprNode* a2 = PAR_parse_value(tdbb, csb); + return FB_NEW(pool) RegrAggNode(pool, type, a1, a2); +} + +void RegrAggNode::make(DsqlCompilerScratch* dsqlScratch, dsc* desc) +{ + desc->makeDouble(); + desc->setNullable(true); +} + +void RegrAggNode::getDesc(thread_db* tdbb, CompilerScratch* csb, dsc* desc) +{ + desc->makeDouble(); +} + +ValueExprNode* RegrAggNode::copy(thread_db* tdbb, NodeCopier& copier) const +{ + RegrAggNode* node = FB_NEW(*tdbb->getDefaultPool()) RegrAggNode(*tdbb->getDefaultPool(), type); + node->nodScale = nodScale; + node->arg = copier.copy(tdbb, 
arg); + node->arg2 = copier.copy(tdbb, arg2); + return node; +} + +void RegrAggNode::aggInit(thread_db* tdbb, jrd_req* request) const +{ + AggNode::aggInit(tdbb, request); + + impure_value_ex* impure = request->getImpure(impureOffset); + impure->make_double(0); + + RegrImpure* impure2 = request->getImpure(impure2Offset); + impure2->x = impure2->x2 = impure2->y = impure2->y2 = impure2->xy = 0.0; +} + +bool RegrAggNode::aggPass(thread_db* tdbb, jrd_req* request) const +{ + dsc* desc = NULL; + dsc* desc2 = NULL; + + desc = EVL_expr(tdbb, request, arg); + if (request->req_flags & req_null) + return false; + + desc2 = EVL_expr(tdbb, request, arg2); + if (request->req_flags & req_null) + return false; + + impure_value_ex* impure = request->getImpure(impureOffset); + ++impure->vlux_count; + + const double y = MOV_get_double(desc); + const double x = MOV_get_double(desc2); + + RegrImpure* impure2 = request->getImpure(impure2Offset); + impure2->x += x; + impure2->x2 += x * x; + impure2->y += y; + impure2->y2 += y * y; + impure2->xy += x * y; + + return true; +} + +void RegrAggNode::aggPass(thread_db* /*tdbb*/, jrd_req* /*request*/, dsc* /*desc*/) const +{ + fb_assert(false); +} + +dsc* RegrAggNode::aggExecute(thread_db* tdbb, jrd_req* request) const +{ + impure_value_ex* impure = request->getImpure(impureOffset); + RegrImpure* impure2 = request->getImpure(impure2Offset); + + if (impure->vlux_count == 0) + return NULL; + + const double varPopX = (impure2->x2 - impure2->x * impure2->x / impure->vlux_count) / impure->vlux_count; + const double varPopY = (impure2->y2 - impure2->y * impure2->y / impure->vlux_count) / impure->vlux_count; + const double covarPop = (impure2->xy - impure2->y * impure2->x / impure->vlux_count) / impure->vlux_count; + const double avgX = impure2->x / impure->vlux_count; + const double avgY = impure2->y / impure->vlux_count; + const double slope = covarPop / varPopX; + const double sq = sqrt(varPopX) * sqrt(varPopY); + const double corr = covarPop / 
sq; + + double d; + + switch (type) + { + case TYPE_REGR_AVGX: + d = avgX; + break; + + case TYPE_REGR_AVGY: + d = avgY; + break; + + case TYPE_REGR_INTERCEPT: + if (varPopX == 0.0) + return NULL; + else + d = avgY - slope * avgX; + break; + + case TYPE_REGR_R2: + if (varPopX == 0.0) + return NULL; + else if (varPopY == 0.0) + d = 1.0; + else if (sq == 0.0) + return NULL; + else + d = corr * corr; + break; + + case TYPE_REGR_SLOPE: + if (varPopX == 0.0) + return NULL; + else + d = covarPop / varPopX; + break; + + case TYPE_REGR_SXX: + d = impure->vlux_count * varPopX; + break; + + case TYPE_REGR_SXY: + d = impure->vlux_count * covarPop; + break; + + case TYPE_REGR_SYY: + d = impure->vlux_count * varPopY; + break; + } + + dsc temp; + temp.makeDouble(&d); + + EVL_make_value(tdbb, &temp, impure); + + return &impure->vlu_desc; +} + +AggNode* RegrAggNode::dsqlCopy(DsqlCompilerScratch* dsqlScratch) /*const*/ +{ + return FB_NEW(getPool()) RegrAggNode(getPool(), type, + doDsqlPass(dsqlScratch, arg), doDsqlPass(dsqlScratch, arg2)); +} + + +//-------------------- + + +static AggNode::Register regrCountAggInfo("REGR_COUNT", blr_agg_regr_count); + +RegrCountAggNode::RegrCountAggNode(MemoryPool& pool, ValueExprNode* aArg, ValueExprNode* aArg2) + : AggNode(pool, regrCountAggInfo, false, false, aArg), + arg2(aArg2) +{ + addChildNode(arg2, arg2); +} + +DmlNode* RegrCountAggNode::parse(thread_db* tdbb, MemoryPool& pool, CompilerScratch* csb, const UCHAR blrOp) +{ + ValueExprNode* a1 = PAR_parse_value(tdbb, csb); + ValueExprNode* a2 = PAR_parse_value(tdbb, csb); + return FB_NEW(pool) RegrCountAggNode(pool, a1, a2); +} + +void RegrCountAggNode::make(DsqlCompilerScratch* dsqlScratch, dsc* desc) +{ + desc->makeInt64(0); +} + +void RegrCountAggNode::getDesc(thread_db* tdbb, CompilerScratch* csb, dsc* desc) +{ + desc->makeInt64(0); +} + +ValueExprNode* RegrCountAggNode::copy(thread_db* tdbb, NodeCopier& copier) const +{ + RegrCountAggNode* node = FB_NEW(*tdbb->getDefaultPool()) 
RegrCountAggNode(*tdbb->getDefaultPool()); + node->nodScale = nodScale; + node->arg = copier.copy(tdbb, arg); + node->arg2 = copier.copy(tdbb, arg2); + return node; +} + +void RegrCountAggNode::aggInit(thread_db* tdbb, jrd_req* request) const +{ + AggNode::aggInit(tdbb, request); + + impure_value_ex* impure = request->getImpure(impureOffset); + impure->make_int64(0); +} + +bool RegrCountAggNode::aggPass(thread_db* tdbb, jrd_req* request) const +{ + dsc* desc = NULL; + dsc* desc2 = NULL; + + desc = EVL_expr(tdbb, request, arg); + if (request->req_flags & req_null) + return false; + + desc2 = EVL_expr(tdbb, request, arg2); + if (request->req_flags & req_null) + return false; + + impure_value_ex* impure = request->getImpure(impureOffset); + ++impure->vlu_misc.vlu_int64; + + return true; +} + +void RegrCountAggNode::aggPass(thread_db* /*tdbb*/, jrd_req* /*request*/, dsc* /*desc*/) const +{ + fb_assert(false); +} + +dsc* RegrCountAggNode::aggExecute(thread_db* tdbb, jrd_req* request) const +{ + impure_value_ex* impure = request->getImpure(impureOffset); + + if (!impure->vlu_desc.dsc_dtype) + return NULL; + + return &impure->vlu_desc; +} + +AggNode* RegrCountAggNode::dsqlCopy(DsqlCompilerScratch* dsqlScratch) /*const*/ +{ + return FB_NEW(getPool()) RegrCountAggNode(getPool(), + doDsqlPass(dsqlScratch, arg), doDsqlPass(dsqlScratch, arg2)); +} + + } // namespace Jrd diff --git a/src/dsql/AggNodes.h b/src/dsql/AggNodes.h index dcd3193b84..d5db1f8524 100644 --- a/src/dsql/AggNodes.h +++ b/src/dsql/AggNodes.h @@ -234,6 +234,77 @@ private: ULONG impure2Offset; }; +class RegrAggNode : public AggNode +{ +public: + enum RegrType + { + TYPE_REGR_AVGX, + TYPE_REGR_AVGY, + TYPE_REGR_INTERCEPT, + TYPE_REGR_R2, + TYPE_REGR_SLOPE, + TYPE_REGR_SXX, + TYPE_REGR_SXY, + TYPE_REGR_SYY + }; + + struct RegrImpure + { + double x, x2, y, y2, xy; + }; + + explicit RegrAggNode(MemoryPool& pool, RegrType aType, + ValueExprNode* aArg = NULL, ValueExprNode* aArg2 = NULL); + + virtual void 
aggPostRse(thread_db* tdbb, CompilerScratch* csb); + + static DmlNode* parse(thread_db* tdbb, MemoryPool& pool, CompilerScratch* csb, const UCHAR blrOp); + + virtual void make(DsqlCompilerScratch* dsqlScratch, dsc* desc); + virtual void getDesc(thread_db* tdbb, CompilerScratch* csb, dsc* desc); + virtual ValueExprNode* copy(thread_db* tdbb, NodeCopier& copier) const; + + virtual void aggInit(thread_db* tdbb, jrd_req* request) const; + virtual bool aggPass(thread_db* tdbb, jrd_req* request) const; + virtual void aggPass(thread_db* tdbb, jrd_req* request, dsc* desc) const; + virtual dsc* aggExecute(thread_db* tdbb, jrd_req* request) const; + +protected: + virtual AggNode* dsqlCopy(DsqlCompilerScratch* dsqlScratch) /*const*/; + +public: + const RegrType type; + NestConst arg2; + +private: + ULONG impure2Offset; +}; + +class RegrCountAggNode : public AggNode +{ +public: + explicit RegrCountAggNode(MemoryPool& pool, + ValueExprNode* aArg = NULL, ValueExprNode* aArg2 = NULL); + + static DmlNode* parse(thread_db* tdbb, MemoryPool& pool, CompilerScratch* csb, const UCHAR blrOp); + + virtual void make(DsqlCompilerScratch* dsqlScratch, dsc* desc); + virtual void getDesc(thread_db* tdbb, CompilerScratch* csb, dsc* desc); + virtual ValueExprNode* copy(thread_db* tdbb, NodeCopier& copier) const; + + virtual void aggInit(thread_db* tdbb, jrd_req* request) const; + virtual bool aggPass(thread_db* tdbb, jrd_req* request) const; + virtual void aggPass(thread_db* tdbb, jrd_req* request, dsc* desc) const; + virtual dsc* aggExecute(thread_db* tdbb, jrd_req* request) const; + +protected: + virtual AggNode* dsqlCopy(DsqlCompilerScratch* dsqlScratch) /*const*/; + +public: + NestConst arg2; +}; + } // namespace #endif // DSQL_AGG_NODES_H diff --git a/src/dsql/parse.y b/src/dsql/parse.y index c703b046bd..f3449b8319 100644 --- a/src/dsql/parse.y +++ b/src/dsql/parse.y @@ -579,6 +579,15 @@ using namespace Firebird; %token COVAR_SAMP %token COVAR_POP %token CORR +%token REGR_AVGX +%token 
REGR_AVGY +%token REGR_COUNT +%token REGR_INTERCEPT +%token REGR_R2 +%token REGR_SLOPE +%token REGR_SXX +%token REGR_SXY +%token REGR_SYY // precedence declarations for expression evaluation @@ -3824,6 +3833,15 @@ keyword_or_column | KW_FALSE | OFFSET | OVER + | REGR_AVGX + | REGR_AVGY + | REGR_COUNT + | REGR_INTERCEPT + | REGR_R2 + | REGR_SLOPE + | REGR_SXX + | REGR_SXY + | REGR_SYY | RETURN | RDB_RECORD_VERSION | ROW @@ -6817,6 +6835,24 @@ aggregate_function { $$ = newNode(CorrAggNode::TYPE_COVAR_POP, $3, $5); } | CORR '(' value ',' value ')' { $$ = newNode(CorrAggNode::TYPE_CORR, $3, $5); } + | REGR_AVGX '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_AVGX, $3, $5); } + | REGR_AVGY '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_AVGY, $3, $5); } + | REGR_COUNT '(' value ',' value ')' + { $$ = newNode($3, $5); } + | REGR_INTERCEPT '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_INTERCEPT, $3, $5); } + | REGR_R2 '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_R2, $3, $5); } + | REGR_SLOPE '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_SLOPE, $3, $5); } + | REGR_SXX '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_SXX, $3, $5); } + | REGR_SXY '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_SXY, $3, $5); } + | REGR_SYY '(' value ',' value ')' + { $$ = newNode(RegrAggNode::TYPE_REGR_SYY, $3, $5); } ; %type window_function diff --git a/src/jrd/blp.h b/src/jrd/blp.h index adfeb75417..4c0e4c8630 100644 --- a/src/jrd/blp.h +++ b/src/jrd/blp.h @@ -248,5 +248,14 @@ static const struct {"agg_covar_samp", two}, {"agg_covar_pop", two}, {"agg_corr", two}, + {"blr_agg_regr_avgx", two}, + {"blr_agg_regr_avgy", two}, + {"blr_agg_regr_count", two}, + {"blr_agg_regr_intercept", two}, + {"blr_agg_regr_r2", two}, + {"blr_agg_regr_slope", two}, + {"blr_agg_regr_sxx", two}, + {"blr_agg_regr_sxy", two}, + {"blr_agg_regr_syy", two}, {0, 0} }; diff --git a/src/jrd/blr.h 
b/src/jrd/blr.h index 94e71d31a3..35c9153774 100644 --- a/src/jrd/blr.h +++ b/src/jrd/blr.h @@ -414,5 +414,14 @@ #define blr_agg_covar_samp (unsigned char) 215 #define blr_agg_covar_pop (unsigned char) 216 #define blr_agg_corr (unsigned char) 217 +#define blr_agg_regr_avgx (unsigned char) 218 +#define blr_agg_regr_avgy (unsigned char) 219 +#define blr_agg_regr_count (unsigned char) 220 +#define blr_agg_regr_intercept (unsigned char) 221 +#define blr_agg_regr_r2 (unsigned char) 222 +#define blr_agg_regr_slope (unsigned char) 223 +#define blr_agg_regr_sxx (unsigned char) 224 +#define blr_agg_regr_sxy (unsigned char) 225 +#define blr_agg_regr_syy (unsigned char) 226 #endif // JRD_BLR_H diff --git a/src/yvalve/keywords.cpp b/src/yvalve/keywords.cpp index 4e4c6736ec..474ea391ad 100644 --- a/src/yvalve/keywords.cpp +++ b/src/yvalve/keywords.cpp @@ -333,6 +333,15 @@ static const TOK tokens[] = {RECREATE, "RECREATE", 2, false}, {RECURSIVE, "RECURSIVE", 2, false}, {REFERENCES, "REFERENCES", 1, false}, + {REGR_AVGX, "REGR_AVGX", 2, false}, + {REGR_AVGY, "REGR_AVGY", 2, false}, + {REGR_COUNT, "REGR_COUNT", 2, false}, + {REGR_INTERCEPT, "REGR_INTERCEPT", 2, false}, + {REGR_R2, "REGR_R2", 2, false}, + {REGR_SLOPE, "REGR_SLOPE", 2, false}, + {REGR_SXX, "REGR_SXX", 2, false}, + {REGR_SXY, "REGR_SXY", 2, false}, + {REGR_SYY, "REGR_SYY", 2, false}, {KW_RELATIVE, "RELATIVE", 2, true}, {RELEASE, "RELEASE", 2, false}, {REPLACE, "REPLACE", 2, false},