percentile.c 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. /*
  2. ** 2013-05-28
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. ******************************************************************************
  12. **
  13. ** This file contains code to implement the percentile(Y,P) SQL function
  14. ** as described below:
  15. **
  16. ** (1) The percentile(Y,P) function is an aggregate function taking
  17. ** exactly two arguments.
  18. **
  19. ** (2) If the P argument to percentile(Y,P) is not the same for every
  20. ** row in the aggregate then an error is thrown. The word "same"
  21. ** in the previous sentence means that the value differ by less
  22. ** than 0.001.
  23. **
  24. ** (3) If the P argument to percentile(Y,P) evaluates to anything other
  25. ** than a number in the range of 0.0 to 100.0 inclusive then an
  26. ** error is thrown.
  27. **
  28. ** (4) If any Y argument to percentile(Y,P) evaluates to a value that
  29. ** is not NULL and is not numeric then an error is thrown.
  30. **
  31. ** (5) If any Y argument to percentile(Y,P) evaluates to plus or minus
  32. ** infinity then an error is thrown. (SQLite always interprets NaN
  33. ** values as NULL.)
  34. **
  35. ** (6) Both Y and P in percentile(Y,P) can be arbitrary expressions,
  36. ** including CASE WHEN expressions.
  37. **
  38. ** (7) The percentile(Y,P) aggregate is able to handle inputs of at least
  39. ** one million (1,000,000) rows.
  40. **
  41. ** (8) If there are no non-NULL values for Y, then percentile(Y,P)
  42. ** returns NULL.
  43. **
  44. ** (9) If there is exactly one non-NULL value for Y, the percentile(Y,P)
  45. ** returns the one Y value.
  46. **
  47. ** (10) If there N non-NULL values of Y where N is two or more and
  48. ** the Y values are ordered from least to greatest and a graph is
  49. ** drawn from 0 to N-1 such that the height of the graph at J is
  50. ** the J-th Y value and such that straight lines are drawn between
  51. ** adjacent Y values, then the percentile(Y,P) function returns
  52. ** the height of the graph at P*(N-1)/100.
  53. **
  54. ** (11) The percentile(Y,P) function always returns either a floating
  55. ** point number or NULL.
  56. **
  57. ** (12) The percentile(Y,P) is implemented as a single C99 source-code
  58. ** file that compiles into a shared-library or DLL that can be loaded
  59. ** into SQLite using the sqlite3_load_extension() interface.
  60. */
  61. #include "sqlite3ext.h"
  62. SQLITE_EXTENSION_INIT1
  63. #include <assert.h>
  64. #include <string.h>
  65. #include <stdlib.h>
  66. /* The following object is the session context for a single percentile()
  67. ** function. We have to remember all input Y values until the very end.
  68. ** Those values are accumulated in the Percentile.a[] array.
  69. */
  70. typedef struct Percentile Percentile;
  71. struct Percentile {
  72. unsigned nAlloc; /* Number of slots allocated for a[] */
  73. unsigned nUsed; /* Number of slots actually used in a[] */
  74. double rPct; /* 1.0 more than the value for P */
  75. double *a; /* Array of Y values */
  76. };
  77. /*
  78. ** Return TRUE if the input floating-point number is an infinity.
  79. */
  80. static int isInfinity(double r){
  81. sqlite3_uint64 u;
  82. assert( sizeof(u)==sizeof(r) );
  83. memcpy(&u, &r, sizeof(u));
  84. return ((u>>52)&0x7ff)==0x7ff;
  85. }
  86. /*
  87. ** Return TRUE if two doubles differ by 0.001 or less
  88. */
  89. static int sameValue(double a, double b){
  90. a -= b;
  91. return a>=-0.001 && a<=0.001;
  92. }
  93. /*
  94. ** The "step" function for percentile(Y,P) is called once for each
  95. ** input row.
  96. */
  97. static void percentStep(sqlite3_context *pCtx, int argc, sqlite3_value **argv){
  98. Percentile *p;
  99. double rPct;
  100. int eType;
  101. double y;
  102. assert( argc==2 );
  103. /* Requirement 3: P must be a number between 0 and 100 */
  104. eType = sqlite3_value_numeric_type(argv[1]);
  105. rPct = sqlite3_value_double(argv[1]);
  106. if( (eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT) ||
  107. ((rPct = sqlite3_value_double(argv[1]))<0.0 || rPct>100.0) ){
  108. sqlite3_result_error(pCtx, "2nd argument to percentile() is not "
  109. "a number between 0.0 and 100.0", -1);
  110. return;
  111. }
  112. /* Allocate the session context. */
  113. p = (Percentile*)sqlite3_aggregate_context(pCtx, sizeof(*p));
  114. if( p==0 ) return;
  115. /* Remember the P value. Throw an error if the P value is different
  116. ** from any prior row, per Requirement (2). */
  117. if( p->rPct==0.0 ){
  118. p->rPct = rPct+1.0;
  119. }else if( !sameValue(p->rPct,rPct+1.0) ){
  120. sqlite3_result_error(pCtx, "2nd argument to percentile() is not the "
  121. "same for all input rows", -1);
  122. return;
  123. }
  124. /* Ignore rows for which Y is NULL */
  125. eType = sqlite3_value_type(argv[0]);
  126. if( eType==SQLITE_NULL ) return;
  127. /* If not NULL, then Y must be numeric. Otherwise throw an error.
  128. ** Requirement 4 */
  129. if( eType!=SQLITE_INTEGER && eType!=SQLITE_FLOAT ){
  130. sqlite3_result_error(pCtx, "1st argument to percentile() is not "
  131. "numeric", -1);
  132. return;
  133. }
  134. /* Throw an error if the Y value is infinity or NaN */
  135. y = sqlite3_value_double(argv[0]);
  136. if( isInfinity(y) ){
  137. sqlite3_result_error(pCtx, "Inf input to percentile()", -1);
  138. return;
  139. }
  140. /* Allocate and store the Y */
  141. if( p->nUsed>=p->nAlloc ){
  142. unsigned n = p->nAlloc*2 + 250;
  143. double *a = sqlite3_realloc(p->a, sizeof(double)*n);
  144. if( a==0 ){
  145. sqlite3_free(p->a);
  146. memset(p, 0, sizeof(*p));
  147. sqlite3_result_error_nomem(pCtx);
  148. return;
  149. }
  150. p->nAlloc = n;
  151. p->a = a;
  152. }
  153. p->a[p->nUsed++] = y;
  154. }
  155. /*
  156. ** Compare to doubles for sorting using qsort()
  157. */
  158. static int doubleCmp(const void *pA, const void *pB){
  159. double a = *(double*)pA;
  160. double b = *(double*)pB;
  161. if( a==b ) return 0;
  162. if( a<b ) return -1;
  163. return +1;
  164. }
  165. /*
  166. ** Called to compute the final output of percentile() and to clean
  167. ** up all allocated memory.
  168. */
  169. static void percentFinal(sqlite3_context *pCtx){
  170. Percentile *p;
  171. unsigned i1, i2;
  172. double v1, v2;
  173. double ix, vx;
  174. p = (Percentile*)sqlite3_aggregate_context(pCtx, 0);
  175. if( p==0 ) return;
  176. if( p->a==0 ) return;
  177. if( p->nUsed ){
  178. qsort(p->a, p->nUsed, sizeof(double), doubleCmp);
  179. ix = (p->rPct-1.0)*(p->nUsed-1)*0.01;
  180. i1 = (unsigned)ix;
  181. i2 = ix==(double)i1 || i1==p->nUsed-1 ? i1 : i1+1;
  182. v1 = p->a[i1];
  183. v2 = p->a[i2];
  184. vx = v1 + (v2-v1)*(ix-i1);
  185. sqlite3_result_double(pCtx, vx);
  186. }
  187. sqlite3_free(p->a);
  188. memset(p, 0, sizeof(*p));
  189. }
  190. #ifdef _WIN32
  191. __declspec(dllexport)
  192. #endif
  193. int sqlite3_percentile_init(
  194. sqlite3 *db,
  195. char **pzErrMsg,
  196. const sqlite3_api_routines *pApi
  197. ){
  198. int rc = SQLITE_OK;
  199. SQLITE_EXTENSION_INIT2(pApi);
  200. (void)pzErrMsg; /* Unused parameter */
  201. rc = sqlite3_create_function(db, "percentile", 2, SQLITE_UTF8, 0,
  202. 0, percentStep, percentFinal);
  203. return rc;
  204. }