|
@@ -69,11 +69,70 @@ about `c * 8` bytes.
|
|
|
|
|
|
The following chart shows how the error varies before and after the threshold:
|
|
|
|
|
|
+////
|
|
|
+To generate this chart, use this gnuplot script:
|
|
|
+-------
|
|
|
+#!/usr/bin/gnuplot
|
|
|
+reset
|
|
|
+set terminal png size 1000,400
|
|
|
+
|
|
|
+set xlabel "Actual cardinality"
|
|
|
+set logscale x
|
|
|
+
|
|
|
+set ylabel "Relative error (%)"
|
|
|
+set yrange [0:8]
|
|
|
+
|
|
|
+set title "Cardinality error"
|
|
|
+set grid
|
|
|
+
|
|
|
+set style data lines
|
|
|
+
|
|
|
+plot "test.dat" using 1:2 title "threshold=100", \
|
|
|
+"" using 1:3 title "threshold=1000", \
|
|
|
+"" using 1:4 title "threshold=10000"
|
|
|
+#
|
|
|
+-------
|
|
|
+
|
|
|
+and generate the data by redirecting the output of the Java code below into a 'test.dat' file:
|
|
|
+
|
|
|
+-------
|
|
|
+private static double error(HyperLogLogPlusPlus h, long expected) {
|
|
|
+ double actual = h.cardinality(0);
|
|
|
+ return Math.abs(expected - actual) / expected;
|
|
|
+}
|
|
|
+
|
|
|
+public static void main(String[] args) {
|
|
|
+ HyperLogLogPlusPlus h100 = new HyperLogLogPlusPlus(precisionFromThreshold(100), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
|
|
+ HyperLogLogPlusPlus h1000 = new HyperLogLogPlusPlus(precisionFromThreshold(1000), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
|
|
+ HyperLogLogPlusPlus h10000 = new HyperLogLogPlusPlus(precisionFromThreshold(10000), BigArrays.NON_RECYCLING_INSTANCE, 1);
|
|
|
+
|
|
|
+ int next = 100;
|
|
|
+ int step = 10;
|
|
|
+
|
|
|
+ for (int i = 1; i <= 10000000; ++i) {
|
|
|
+ long h = BitMixer.mix64(i);
|
|
|
+ h100.collect(0, h);
|
|
|
+ h1000.collect(0, h);
|
|
|
+ h10000.collect(0, h);
|
|
|
+
|
|
|
+ if (i == next) {
|
|
|
+ System.out.println(i + " " + error(h100, i)*100 + " " + error(h1000, i)*100 + " " + error(h10000, i)*100);
|
|
|
+ next += step;
|
|
|
+ if (next >= 100 * step) {
|
|
|
+ step *= 10;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+-------
|
|
|
+
|
|
|
+////
|
|
|
+
|
|
|
image:images/cardinality_error.png[]
|
|
|
|
|
|
For all 3 thresholds, counts have been accurate up to the configured threshold
|
|
|
(although not guaranteed, this is likely to be the case). Please also note that
|
|
|
-even with a threshold as low as 100, the error remains under 5%, even when
|
|
|
+even with a threshold as low as 100, the error remains very low, even when
|
|
|
counting millions of items.
|
|
|
|
|
|
==== Pre-computed hashes
|