Skip to content

Commit 4cf92f1

Browse files
committed
added FM Estimation and reworked output
1 parent 37aa2af commit 4cf92f1

File tree

4 files changed

+116
-87
lines changed

4 files changed

+116
-87
lines changed

FlajoletMartinEstimator.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
2+
# COS226 University of Maine
3+
4+
#returns an estimate of the cardinality of the given hashes
5+
def FlajoletMartin(hashes):
    """Estimate the number of distinct values in *hashes* using the
    Flajolet-Martin probabilistic counting algorithm.

    Each element is treated (via str()) as a binary string; the estimator
    tracks, across all hashes, which trailing-zero counts ("ranks") have
    been observed, then uses the lowest unobserved rank r to form the
    estimate 2**r / phi, where phi ~= 0.77351 corrects the algorithm's
    known average-case bias.

    Parameters:
        hashes: iterable of binary-string hashes (or objects whose str()
                is a binary string). Assumed at most 32 significant bits;
                larger ranks are clamped -- TODO confirm against caller.

    Returns:
        float estimate of the cardinality (>= 1/0.77351 even for empty input).
    """
    # bitmap[i] is True once some hash with exactly i trailing zeros was seen
    bitmap = [False] * 32

    for h in hashes:
        # rank = number of trailing zeros in the binary string
        rank = 0
        for bit in reversed(str(h)):
            if bit != "0":
                break
            rank += 1
        # clamp: an all-zero string of length >= 32 would otherwise
        # index past the end of the bitmap (IndexError in the original)
        bitmap[min(rank, 31)] = True

    # r = lowest rank never observed
    r = 0
    while r < len(bitmap) and bitmap[r]:
        r += 1

    # there is a predictable bias in the average case, corrected by this factor
    BIAS = 0.77351
    return (2 ** r) / BIAS

HyperLogLogEstimator.py

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,22 @@
11
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
22
# COS226 University of Maine
33

4-
import hashlib
5-
6-
#returns an estimate of the cardinality of values using k bits for the buckets
7-
def HyperLogLog(hashes, off, k):
4+
#returns an estimate of the cardinality of given hashes using k bits for the buckets
5+
def HyperLogLog(hashes, k):
86
#number of buckets
97
m = 2 ** k
10-
#initialize buckets to 0
8+
#initialize buckets to 0
119
buckets = [0] * m
1210

1311
#loop through all hashes
14-
for hash in hashes:
15-
#encode the hash in binary
16-
hash = '{:256b}'.format(int(hash, 16)).replace(' ','0')
17-
18-
temp = hash[len(hash)-(32+off)+1:len(hash)-off]
19-
12+
for i in range(0, len(hashes)):
2013
#get bucket{j} of the hash (first k bits)
21-
j = temp[:k]
22-
j = int(j, 2)
14+
j = str(hashes[i])[:k]
15+
j = int(j, 2)
2316

24-
#get remaining bits after bucket
25-
data = temp[k:32]
26-
#get rank of the first 1 in the remaining bits (number of trailing zeros)
17+
#get remaining bits (remaining bits after bucket)
18+
data = hashes[i][k:]
19+
#get rank of the first 1 in the remaining bits (number of trailing zeros)
2720
rank = 1
2821
for c in reversed(data):
2922
if c == "0":
@@ -35,14 +28,12 @@ def HyperLogLog(hashes, off, k):
3528
buckets[j] = max(buckets[j], rank)
3629

3730
# get the harmonic mean of the buckets
38-
# harmonic mean{z} = 1 / sum(2 ^ -buckets)
3931
total = 0
4032
for bucket in buckets:
4133
total += 2 ** (-1 * bucket)
42-
z = total ** -1
43-
34+
mean = total ** -1
4435
#calculate estimate
45-
estimate = (m ** 2) * z
36+
estimate = (m ** 2) * mean
4637

4738
#bias can be approximated with the formula 0.7213 / (1 + (1.079/2^k)) for k > 5
4839
#since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this as an estimate of the bias
@@ -51,11 +42,4 @@ def HyperLogLog(hashes, off, k):
5142
#correct for bias
5243
estimate = BIAS * estimate
5344

54-
return estimate
55-
56-
#find the average estimate for n different offsets of the hash with k bits for buckets
57-
def average_with_different_hashes(hashes, k, n):
58-
total = 0
59-
for i in range(n):
60-
total += HyperLogLog(hashes, i*32, k)
61-
return total/n
45+
return estimate

LogLogEstimator.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,21 @@
11
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
22
# COS226 University of Maine
33

4-
import hashlib
5-
6-
#returns an estimate of the cardinality of values using k bits for the buckets
7-
def LogLog(hashes, off, k):
4+
#returns an estimate of the cardinality of given hashes using k bits for the buckets
5+
def LogLog(hashes, k):
86
#number of buckets
9-
m = 2**k
7+
m = 2 ** k
108
#initialize buckets to 0
119
buckets = [0] * m
1210

1311
#loop through all hashes
14-
for hash in hashes:
15-
#encode the hash in binary
16-
hash = '{:256b}'.format(int(hash, 16)).replace(' ','0')
17-
18-
temp = hash[len(hash)-(32+off)+1:len(hash)-off]
19-
12+
for i in range(0, len(hashes)):
2013
#get bucket{j} of the hash (first k bits)
21-
j = temp[:k]
22-
j = int(j, 2)
14+
j = str(hashes[i])[:k]
15+
j = int(j, 2)
2316

2417
#get remaining bits (remaining bits after bucket)
25-
data = temp[k:32]
18+
data = hashes[i][k:]
2619
#get rank of the first 1 in the remaining bits (number of trailing zeros)
2720
rank = 1
2821
for c in reversed(data):
@@ -37,19 +30,12 @@ def LogLog(hashes, off, k):
3730
#get bucket average
3831
average = sum(buckets) / m
3932
#calculate estimate
40-
estimate = m * 2 ** average
33+
estimate = m * (2 ** average)
4134

42-
#bias converges to 0.79402 for larger numbers of buckets (k > 5)
35+
#bias converges to 0.397011808 for larger numbers of buckets (k > 5)
4336
#since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this convergence as an estimate of the bias
4437
BIAS = 0.397011808
4538
#correct for bias
4639
estimate = BIAS * estimate
4740

48-
return estimate
49-
50-
#find the average estimate for n different offsets of the hash with k bits for buckets
51-
def average_with_different_hashes(hashes, k, n):
52-
total = 0
53-
for i in range(n):
54-
total += LogLog(hashes, i*32, k)
55-
return total/n
41+
return estimate

main.py

Lines changed: 57 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
33
# COS226 University of Maine
44

5-
import os.path, time, random
6-
import csv_parser, ActualCount, LogLogEstimator, SuperLogLogEstimator, HyperLogLogEstimator
5+
import os.path, time, sys
6+
import csv_parser, ActualCount, FlajoletMartinEstimator, LogLogEstimator, HyperLogLogEstimator
77

88
# the file to parse for unique plates
99
filename = "nyc-dataset.csv"
@@ -18,51 +18,73 @@
1818
print ("LogLog, SuperLogLog, and HyperLogLog Demo")
1919
print ("by Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott\n")
2020

21-
# check to make sure the dataset has been downloaded and named properly
22-
if not os.path.exists(filename):
23-
print("Error: Please download the dataset from https://data.cityofnewyork.us/api/views/faiq-9dfq/rows.csv?accessType=DOWNLOAD and place it in this directory as "+filename)
24-
exit()
25-
2621
plate_hashes = []
2722
if os.path.exists("hashes.csv"):
2823
print("Loading plate hashes from file..")
2924
plate_hashes = csv_parser.load_hashes()
3025
else:
26+
# check to make sure the dataset has been downloaded and named properly
27+
if not os.path.exists(filename):
28+
print("Error: Please download the dataset from https://data.cityofnewyork.us/api/views/faiq-9dfq/rows.csv?accessType=DOWNLOAD and place it in this directory as "+filename)
29+
exit()
30+
3131
print("Creating plate hashes..")
3232
plate_hashes = csv_parser.create_hashes(filename)
3333
print(str(len(plate_hashes))+" hashes loaded.\n")
3434

3535

3636
# directly tally the number of unique plates
37-
print("Calculating exact amount of unique plates..")
38-
start = time.time()
37+
print("Calculating exact amount of unique plates..\n")
3938
exact_unique = ActualCount.get_exact_unique_using_set(filename)
40-
time_used = time.time() - start
41-
print ("There are exactly " + str(exact_unique)+ " unique plates, counted in "+str(time_used)+" seconds\n")
39+
40+
#run tests 8 times
41+
fm = []
42+
ll = []
43+
hll = []
44+
for i in range(0, 8):
45+
#print loading message
46+
sys.stdout.write("Running test " + str(i + 1) + " of 8...")
47+
sys.stdout.flush()
48+
49+
#Generate test hashes
50+
hashes = []
51+
for hash in plate_hashes:
52+
#encode the hash in binary
53+
hash = '{:256b}'.format(int(hash, 16)).replace(' ','0')
54+
55+
off = i*32
56+
temp = hash[len(hash)-(32+off)+1:len(hash)-off]
57+
hashes.append(temp)
58+
59+
#add to results array
60+
fm.append(FlajoletMartinEstimator.FlajoletMartin(hashes))
61+
ll.append(LogLogEstimator.LogLog(hashes, ex_buckets))
62+
hll.append(HyperLogLogEstimator.HyperLogLog(hashes, ex_buckets))
63+
64+
#reset line
65+
sys.stdout.write('\r')
4266

43-
# approximate number of plates using LogLog Estimation
44-
print("Approximating amount of unique plates using LogLog..")
45-
start = time.time()
46-
loglog_appx = LogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
47-
time_used = time.time() - start
48-
percent_error = abs(loglog_appx - exact_unique) / exact_unique * 100
49-
print ("LogLog Estimation: " + str(loglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
50-
print ("Percent Error: " + str(percent_error) + "% \n")
67+
#test formatter
68+
def printTest(label, expected, actual):
69+
print('*'*50)
70+
print('{s:{c}^{n}}'.format(s=f' {label} ', n=50, c='*'))
71+
print('*'*50)
72+
print()
5173

52-
# approximate number of plates using SuperLogLog Estimation
53-
print("Approximating amount of unique plates using SuperLogLog..")
54-
start = time.time()
55-
sloglog_appx = SuperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
56-
time_used = time.time() - start
57-
percent_error = abs(sloglog_appx - exact_unique) / exact_unique * 100
58-
print ("SuperLogLog Estimation: " + str(sloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
59-
print ("Percent Error: " + str(percent_error) + "% \n")
74+
print('{s:{c}^{n}}'.format(s=' Expected result ', n=50, c='-'))
75+
print(expected)
76+
print('{s:{c}^{n}}'.format(s=' Actual results ', n=50, c='-'))
77+
print("\n".join("{0:.0f}".format(x) for x in actual))
78+
print('{s:{c}^{n}}'.format(s=' Average result ', n=50, c='-'))
79+
average = sum(actual) / len(actual)
80+
print("{0:.0f}".format(average))
81+
print('{s:{c}^{n}}'.format(s=' Difference ', n=50, c='-'))
82+
print("{0:.0f}".format(expected - average))
83+
print('{s:{c}^{n}}'.format(s=' Percent Error ', n=50, c='-'))
84+
print("{0:.2%}".format(abs(expected - average) / expected))
85+
print()
6086

61-
# approximate number of plates using HyperLogLog Estimation
62-
print("Approximating amount of unique plates using HyperLogLog..")
63-
start = time.time()
64-
hloglog_appx = HyperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
65-
time_used = time.time() - start
66-
percent_error = abs(hloglog_appx - exact_unique) / exact_unique * 100
67-
print ("HyperLogLog Estimation: " + str(hloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
68-
print ("Percent Error: " + str(percent_error) + "% \n")
87+
#print results
88+
printTest("Flajolet-Martin", exact_unique, fm)
89+
printTest("LogLog", exact_unique, ll)
90+
printTest("HyperLogLog", exact_unique, hll)

0 commit comments

Comments
 (0)