
Commit 48dde3e

bug fixes and average implementation
- fixed bug: couldn't handle hashes greater than 32 bits
- added SHA256 hashing
- added averaging with more than one hash value for more accurate results
1 parent a48eee3 commit 48dde3e

File tree: 5 files changed (+93 / -36 lines)


HyperLogLogEstimator.py

Lines changed: 23 additions & 11 deletions
@@ -1,17 +1,21 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
+import hashlib
+
 #returns an estimate of the cardinality of values using k bits for the buckets
-def HyperLogLog(values, k):
+def HyperLogLog(hashes, off, k):
     #number of buckets
-    m = 2**k
+    m = 2 ** k
     #initialize buckets to 0
     buckets = [0] * m
 
-    #loop through all values
-    for value in values:
-        #get 32 bit hash of the value
-        temp = '{:032b}'.format(hash(value))
+    #loop through all hashes
+    for hash in hashes:
+        #encode the hash in binary
+        hash = bin(int(hash, 16))
+
+        temp = hash[len(hash)-(32+off):len(hash)-off]
 
         #get bucket{j} of the hash (first k bits)
         j = temp[:k]
@@ -36,14 +40,22 @@ def HyperLogLog(values, k):
     for bucket in buckets:
         total += 2 ** (-1 * bucket)
     z = total ** -1
-
+
     #calculate estimate
     estimate = (m ** 2) * z
 
-    #this is an algebraic approximation for the bias dependant on the number of buckets
-    bias = 0.7213/(1+1.079/m)
+    #bias can be approximated with the formula 0.7213 / (1 + (1.079/2^k)) for k > 5
+    #since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this as an estimate of the bias
+    BIAS = 0.7213 / (1 + (1.079 / m))
 
     #correct for bias
-    estimate = bias * estimate
+    estimate = BIAS * estimate
 
-    return estimate
+    return estimate
+
+#find the average estimate for n different offsets of the hash with k bits for buckets
+def average_with_different_hashes(hashes, k, n):
+    total = 0
+    for i in range(n):
+        total += HyperLogLog(hashes, i*32, k)
+    return total/n
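The new off parameter is what fixes the 32-bit limitation: a SHA-256 digest is 256 bits, so each offset selects a different 32-bit window of the same hash. A quick illustration of the slice the estimators use, on a hypothetical plate value (not part of the commit):

import hashlib

digest = hashlib.sha256(b"ABC1234").hexdigest()  # 64 hex chars = 256 bits
bits = bin(int(digest, 16))                      # '0b' prefix; leading zeros dropped

# the 32-bit window that starts `off` bits in from the right-hand end
for off in (0, 32, 64):
    window = bits[len(bits)-(32+off):len(bits)-off]
    print(off, window)

One caveat: bin() strips leading zero bits, so for digests that happen to begin with zeros the highest-offset windows can come up shorter than 32 bits.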

LogLogEstimator.py

Lines changed: 17 additions & 6 deletions
@@ -1,17 +1,21 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
+import hashlib
+
 #returns an estimate of the cardinality of values using k bits for the buckets
-def LogLog(values, k):
+def LogLog(hashes, off, k):
     #number of buckets
     m = 2**k
     #initialize buckets to 0
     buckets = [0] * m
 
-    #loop through all values
-    for value in values:
-        #get 32 bit hash of the value
-        temp = '{:032b}'.format(hash(value))
+    #loop through all hashes
+    for hash in hashes:
+        #encode the hash in binary
+        hash = bin(int(hash, 16))
+
+        temp = hash[len(hash)-(32+off):len(hash)-off]
 
         #get bucket{j} of the hash (first k bits)
         j = temp[:k]
@@ -41,4 +45,11 @@ def LogLog(values, k):
     #correct for bias
     estimate = BIAS * estimate
 
-    return estimate
+    return estimate
+
+#find the average estimate for n different offsets of the hash with k bits for buckets
+def average_with_different_hashes(hashes, k, n):
+    total = 0
+    for i in range(n):
+        total += LogLog(hashes, i*32, k)
+    return total/n
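Because bin() drops leading zeros, the binary string is not always a fixed 258 characters, and the high-offset windows can shrink or come back empty for digests that start with zero bits. A hedged alternative (an illustration, not what this commit does) is to zero-pad to the digest's full bit width before slicing:

def hash_windows(hex_digest, width=32):
    # zero-pad to the full bit width so every window is exactly
    # `width` bits regardless of leading zeros
    bits = bin(int(hex_digest, 16))[2:].zfill(len(hex_digest) * 4)
    return [bits[i:i+width] for i in range(0, len(bits), width)]

For a 64-character SHA-256 digest this always yields eight full 32-bit windows.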

SuperLogLogEstimator.py

Lines changed: 16 additions & 7 deletions
@@ -1,19 +1,21 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
-import math
+import math, hashlib
 
 #returns an estimate of the cardinality of values using k bits for the buckets
-def SuperLogLog(values, k):
+def SuperLogLog(hashes, off, k):
     #number of buckets
     m = 2**k
     #initialize buckets to 0
     buckets = [0] * m
 
-    #loop through all values
-    for value in values:
-        #get 32 bit hash of the value
-        temp = '{:032b}'.format(hash(value))
+    #loop through all hashes
+    for hash in hashes:
+        #encode the hash in binary
+        hash = bin(int(hash, 16))
+
+        temp = hash[len(hash)-(32+off):len(hash)-off]
 
         #get bucket{j} of the hash (first k bits)
         j = temp[:k]
@@ -53,4 +55,11 @@ def SuperLogLog(values, k):
     #correct for bias
     estimate = BIAS * estimate
 
-    return estimate
+    return estimate
+
+#find the average estimate for n different offsets of the hash with k bits for buckets
+def average_with_different_hashes(hashes, k, n):
+    total = 0
+    for i in range(n):
+        total += SuperLogLog(hashes, i*32, k)
+    return total/n
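All three estimators gain the same average_with_different_hashes helper. The idea is standard variance reduction: if the runs at different offsets behave like roughly independent estimates, averaging n of them shrinks the standard error by about a factor of the square root of n. A back-of-the-envelope check, using HyperLogLog's well-known relative error of about 1.04 divided by the square root of m (an assumption about typical behavior, not a figure from this repo):

import math

m = 2 ** 8   # buckets, matching ex_buckets = 8 in main.py
n = 8        # the most 32-bit windows one 256-bit digest can supply

single_run = 1.04 / math.sqrt(m)      # typical HyperLogLog relative error
averaged = single_run / math.sqrt(n)  # if the n runs were independent
print(f"one run: {single_run:.1%}, averaged over {n}: {averaged:.1%}")

With m = 256 that is roughly 6.5% for a single run versus about 2.3% averaged over eight windows.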

csv_parser.py

Lines changed: 22 additions & 2 deletions
@@ -1,7 +1,7 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
-import csv
+import csv, hashlib
 
 # this variable allows us to limit the number of rows we parse
 # (for convenience in debugging)
@@ -25,4 +25,24 @@ def get_plates(filename):
             yield row[1]
             count += 1
         else:
-            return
+            return
+
+#creates the SHA256 hashes of the plates and saves them to hashes.csv
+def create_hashes(filename):
+    hashes = []
+    with open('hashes.csv', 'w') as file:
+        for plate in get_plates(filename):
+            #get the sha256 hash of the value
+            hash = hashlib.sha256(str(plate).encode('utf-8'))
+            #store the hash in the list
+            hashes.append(hash.hexdigest())
+            #write the hash to the file
+            file.write(hash.hexdigest()+"\n")
+    return hashes
+
+def load_hashes():
+    f = open("hashes.csv", "r")
+    if (row_limit > 0):
+        return f.readlines(row_limit*65)
+    else:
+        return f.readlines()
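The row_limit*65 size hint in load_hashes works because each line of hashes.csv is exactly 65 bytes: 64 hex characters plus a newline. readlines() treats its argument as an approximate byte budget rather than a line count, though, so an exact-count variant (a sketch, not in the commit) could use itertools.islice instead:

from itertools import islice

def load_hashes_exact(path="hashes.csv", limit=0):
    # read exactly `limit` digests (or all of them when limit <= 0),
    # stripping the trailing newlines as we go
    with open(path) as f:
        lines = f if limit <= 0 else islice(f, limit)
        return [line.strip() for line in lines]

The trailing newlines that readlines() keeps are harmless downstream, since int(hash, 16) in the estimators ignores surrounding whitespace.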

main.py

Lines changed: 15 additions & 10 deletions
@@ -2,7 +2,7 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
-import os.path, time
+import os.path, time, random
 import csv_parser, ActualCount, LogLogEstimator, SuperLogLogEstimator, HyperLogLogEstimator
 
 # the file to parse for unique plates
@@ -12,6 +12,9 @@
 # 2**ex_buckets = number of buckets
 ex_buckets = 8
 
+# number of runs to average with different hashes
+loops_to_run = 1
+
 print ("LogLog, SuperLogLog, and HyperLogLog Demo")
 print ("by Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott\n")
 
@@ -20,13 +23,15 @@
     print("Error: Please download the dataset from https://data.cityofnewyork.us/api/views/faiq-9dfq/rows.csv?accessType=DOWNLOAD and place it in this directory as "+filename)
     exit()
 
+plate_hashes = []
+if os.path.exists("hashes.csv"):
+    print("Loading plate hashes from file..")
+    plate_hashes = csv_parser.load_hashes()
+else:
+    print("Creating plate hashes..")
+    plate_hashes = csv_parser.create_hashes(filename)
+print(str(len(plate_hashes))+" hashes loaded.\n")
 
-# count the total number of traffic violations
-print("Counting the total number of traffic violations..")
-start = time.time()
-total_entries = csv_parser.count_lines(filename)
-time_used = time.time() - start
-print("There are " + str(total_entries) + " entries in " + filename + ", counted in "+str(time_used) +" seconds\n")
 
 # directly tally the number of unique plates
 print("Calculating exact amount of unique plates..")
@@ -38,7 +43,7 @@
 # approximate number of plates using LogLog Estimation
 print("Approximating amount of unique plates using LogLog..")
 start = time.time()
-loglog_appx = LogLogEstimator.LogLog(csv_parser.get_plates(filename), ex_buckets)
+loglog_appx = LogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
 time_used = time.time() - start
 percent_error = abs(loglog_appx - exact_unique) / exact_unique * 100
 print ("LogLog Estimation: " + str(loglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
@@ -47,7 +52,7 @@
 # approximate number of plates using SuperLogLog Estimation
 print("Approximating amount of unique plates using SuperLogLog..")
 start = time.time()
-sloglog_appx = SuperLogLogEstimator.SuperLogLog(csv_parser.get_plates(filename), ex_buckets)
+sloglog_appx = SuperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
 time_used = time.time() - start
 percent_error = abs(sloglog_appx - exact_unique) / exact_unique * 100
 print ("SuperLogLog Estimation: " + str(sloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
@@ -56,7 +61,7 @@
 # approximate number of plates using HyperLogLog Estimation
 print("Approximating amount of unique plates using HyperLogLog..")
 start = time.time()
-hloglog_appx = HyperLogLogEstimator.HyperLogLog(csv_parser.get_plates(filename), ex_buckets)
+hloglog_appx = HyperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
 time_used = time.time() - start
 percent_error = abs(hloglog_appx - exact_unique) / exact_unique * 100
 print ("HyperLogLog Estimation: " + str(hloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
