Skip to content

Commit 8ed654b

Browse files
authored
added range correction and more specific bias constants
1 parent 86cbb18 commit 8ed654b

File tree

1 file changed

+25
-4
lines changed

1 file changed

+25
-4
lines changed

HyperLogLogEstimator.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,32 @@ def HyperLogLog(hashes, k):
3535
#calculate estimate
3636
estimate = (m ** 2) * mean
3737

38-
#bias can be approximated with the formula 0.7213 / (1 + (1.079/2^k)) for k > 5
39-
#since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this as an estimate of the bias
40-
BIAS = 0.7213 / (1 + (1.079 / m))
38+
#bias can be approximated with the formula 0.7213 / (1 + (1.079/2^k)) for k >= 6
39+
#or for values of k < 6 we can use pre-calculated bias factors
40+
if k <= 4:
41+
BIAS = 0.673
42+
elif k == 5:
43+
BIAS == 0.697
44+
else:
45+
BIAS = 0.7213 / (1 + (1.079 / m))
4146

4247
#correct for bias
4348
estimate = BIAS * estimate
4449

45-
return estimate
50+
#small range correction
51+
if estimate < ((5 / 2) * m):
52+
#get count of registers with rank of 0
53+
zeros = 0
54+
for i in buckets:
55+
if buckets[i] == 0:
56+
zeros += 1
57+
#apply small range correction
58+
if not zeros == 0:
59+
estimate = m * math.log(estimate, 2)
60+
61+
#large range correction
62+
elif estimate > ((2 ** 32) / 30):
63+
estimate = -1 * (2 ** 32) * math.log(1 - (estimate / (2 ** 32)))
64+
65+
return estimate
66+

0 commit comments

Comments
 (0)