
Commit 48dde3e

bug fixes and average implementation
- fixed bug: couldn't handle hashes greater than 32 bits
- added SHA256 hashing
- added averaging with more than one hash value for more accurate results
1 parent a48eee3 commit 48dde3e

File tree: 5 files changed (+93 / -36 lines)


HyperLogLogEstimator.py

Lines changed: 23 additions & 11 deletions
@@ -1,17 +1,21 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
+import hashlib
+
 #returns an estimate of the cardinality of values using k bits for the buckets
-def HyperLogLog(values, k):
+def HyperLogLog(hashes, off, k):
     #number of buckets
-    m = 2**k
+    m = 2 ** k
     #initialize buckets to 0
     buckets = [0] * m
 
-    #loop through all values
-    for value in values:
-        #get 32 bit hash of the value
-        temp = '{:032b}'.format(hash(value))
+    #loop through all hashes
+    for hash in hashes:
+        #encode the hash in binary
+        hash = bin(int(hash, 16))
+
+        temp = hash[len(hash)-(32+off):len(hash)-off]
 
         #get bucket{j} of the hash (first k bits)
         j = temp[:k]
@@ -36,14 +40,22 @@ def HyperLogLog(values, k):
     for bucket in buckets:
         total += 2 ** (-1 * bucket)
     z = total ** -1
-
+
     #calculate estimate
     estimate = (m ** 2) * z
 
-    #this is an algebraic approximation for the bias dependant on the number of buckets
-    bias = 0.7213/(1+1.079/m)
+    #bias can be approximated with the formula 0.7213 / (1 + (1.079/2^k)) for k > 5
+    #since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this as an estimate of the bias
+    BIAS = 0.7213 / (1 + (1.079 / m))
 
     #correct for bias
-    estimate = bias * estimate
+    estimate = BIAS * estimate
 
-    return estimate
+    return estimate
+
+#find the average estimate for n different offsets of the hash with k bits for buckets
+def average_with_different_hashes(hashes, k, n):
+    total = 0
+    for i in range(n):
+        total += HyperLogLog(hashes, i*32, k)
+    return total/n
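The new off parameter is what fixes the 32-bit limitation: a SHA-256 digest is 256 bits, so each offset selects a different 32-bit window of the same hash. A quick illustration of the slice the estimators use, on a hypothetical plate value (not part of the commit):

import hashlib

digest = hashlib.sha256(b"ABC1234").hexdigest()  # 64 hex chars = 256 bits
bits = bin(int(digest, 16))                      # '0b' prefix; leading zeros dropped

# the 32-bit window that starts `off` bits in from the right-hand end
for off in (0, 32, 64):
    window = bits[len(bits)-(32+off):len(bits)-off]
    print(off, window)

One caveat: bin() strips leading zero bits, so for digests that happen to begin with zeros the highest-offset windows can come up shorter than 32 bits.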

LogLogEstimator.py

Lines changed: 17 additions & 6 deletions
@@ -1,17 +1,21 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
+import hashlib
+
 #returns an estimate of the cardinality of values using k bits for the buckets
-def LogLog(values, k):
+def LogLog(hashes, off, k):
     #number of buckets
     m = 2**k
     #initialize buckets to 0
     buckets = [0] * m
 
-    #loop through all values
-    for value in values:
-        #get 32 bit hash of the value
-        temp = '{:032b}'.format(hash(value))
+    #loop through all hashes
+    for hash in hashes:
+        #encode the hash in binary
+        hash = bin(int(hash, 16))
+
+        temp = hash[len(hash)-(32+off):len(hash)-off]
 
         #get bucket{j} of the hash (first k bits)
         j = temp[:k]
@@ -41,4 +45,11 @@ def LogLog(values, k):
     #correct for bias
     estimate = BIAS * estimate
 
-    return estimate
+    return estimate
+
+#find the average estimate for n different offsets of the hash with k bits for buckets
+def average_with_different_hashes(hashes, k, n):
+    total = 0
+    for i in range(n):
+        total += LogLog(hashes, i*32, k)
+    return total/n
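Because bin() drops leading zeros, the binary string is not always a fixed 258 characters, and the high-offset windows can shrink or come back empty for digests that start with zero bits. A hedged alternative (an illustration, not what this commit does) is to zero-pad to the digest's full bit width before slicing:

def hash_windows(hex_digest, width=32):
    # zero-pad to the full bit width so every window is exactly
    # `width` bits regardless of leading zeros
    bits = bin(int(hex_digest, 16))[2:].zfill(len(hex_digest) * 4)
    return [bits[i:i+width] for i in range(0, len(bits), width)]

For a 64-character SHA-256 digest this always yields eight full 32-bit windows.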

SuperLogLogEstimator.py

Lines changed: 16 additions & 7 deletions
@@ -1,19 +1,21 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
-import math
+import math, hashlib
 
 #returns an estimate of the cardinality of values using k bits for the buckets
-def SuperLogLog(values, k):
+def SuperLogLog(hashes, off, k):
     #number of buckets
     m = 2**k
     #initialize buckets to 0
     buckets = [0] * m
 
-    #loop through all values
-    for value in values:
-        #get 32 bit hash of the value
-        temp = '{:032b}'.format(hash(value))
+    #loop through all hashes
+    for hash in hashes:
+        #encode the hash in binary
+        hash = bin(int(hash, 16))
+
+        temp = hash[len(hash)-(32+off):len(hash)-off]
 
         #get bucket{j} of the hash (first k bits)
         j = temp[:k]
@@ -53,4 +55,11 @@ def SuperLogLog(values, k):
     #correct for bias
     estimate = BIAS * estimate
 
-    return estimate
+    return estimate
+
+#find the average estimate for n different offsets of the hash with k bits for buckets
+def average_with_different_hashes(hashes, k, n):
+    total = 0
+    for i in range(n):
+        total += SuperLogLog(hashes, i*32, k)
+    return total/n
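All three estimators gain the same average_with_different_hashes helper. The idea is standard variance reduction: if the runs at different offsets behave like roughly independent estimates, averaging n of them shrinks the standard error by about a factor of the square root of n. A back-of-the-envelope check, using HyperLogLog's well-known relative error of about 1.04 divided by the square root of m (an assumption about typical behavior, not a figure from this repo):

import math

m = 2 ** 8   # buckets, matching ex_buckets = 8 in main.py
n = 8        # the most 32-bit windows one 256-bit digest can supply

single_run = 1.04 / math.sqrt(m)      # typical HyperLogLog relative error
averaged = single_run / math.sqrt(n)  # if the n runs were independent
print(f"one run: {single_run:.1%}, averaged over {n}: {averaged:.1%}")

With m = 256 that is roughly 6.5% for a single run versus about 2.3% averaged over eight windows.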

csv_parser.py

Lines changed: 22 additions & 2 deletions
@@ -1,7 +1,7 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
-import csv
+import csv, hashlib
 
 # this variable allows us to limit the number of rows we parse
 # (for convenience in debugging)
@@ -25,4 +25,24 @@ def get_plates(filename):
             yield row[1]
             count += 1
         else:
-            return
+            return
+
+#creates the SHA256 hashes of the plates and saves them to hashes.csv
+def create_hashes(filename):
+    hashes = []
+    with open('hashes.csv', 'w') as file:
+        for plate in get_plates(filename):
+            #get the sha256 hash of the value
+            hash = hashlib.sha256(str(plate).encode('utf-8'))
+            #store the hash in the list
+            hashes.append(hash.hexdigest())
+            #write the hash to the file
+            file.write(hash.hexdigest()+"\n")
+    return hashes
+
+def load_hashes():
+    f = open("hashes.csv", "r")
+    if (row_limit > 0):
+        return f.readlines(row_limit*65)
+    else:
+        return f.readlines()
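The row_limit*65 size hint in load_hashes works because each line of hashes.csv is exactly 65 bytes: 64 hex characters plus a newline. readlines() treats its argument as an approximate byte budget rather than a line count, though, so an exact-count variant (a sketch, not in the commit) could use itertools.islice instead:

from itertools import islice

def load_hashes_exact(path="hashes.csv", limit=0):
    # read exactly `limit` digests (or all of them when limit <= 0),
    # stripping the trailing newlines as we go
    with open(path) as f:
        lines = f if limit <= 0 else islice(f, limit)
        return [line.strip() for line in lines]

The trailing newlines that readlines() keeps are harmless downstream, since int(hash, 16) in the estimators ignores surrounding whitespace.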

main.py

Lines changed: 15 additions & 10 deletions
@@ -2,7 +2,7 @@
 # Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
 # COS226 University of Maine
 
-import os.path, time
+import os.path, time, random
 import csv_parser, ActualCount, LogLogEstimator, SuperLogLogEstimator, HyperLogLogEstimator
 
 # the file to parse for unique plates
@@ -12,6 +12,9 @@
 # 2**ex_buckets = number of buckets
 ex_buckets = 8
 
+# number of runs to average with different hashes
+loops_to_run = 1
+
 print ("LogLog, SuperLogLog, and HyperLogLog Demo")
 print ("by Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott\n")
 
@@ -20,13 +23,15 @@
     print("Error: Please download the dataset from https://data.cityofnewyork.us/api/views/faiq-9dfq/rows.csv?accessType=DOWNLOAD and place it in this directory as "+filename)
     exit()
 
+plate_hashes = []
+if os.path.exists("hashes.csv"):
+    print("Loading plate hashes from file..")
+    plate_hashes = csv_parser.load_hashes()
+else:
+    print("Creating plate hashes..")
+    plate_hashes = csv_parser.create_hashes(filename)
+print(str(len(plate_hashes))+" hashes loaded.\n")
 
-# count the total number of traffic violations
-print("Counting the total number of traffic violations..")
-start = time.time()
-total_entries = csv_parser.count_lines(filename)
-time_used = time.time() - start
-print("There are " + str(total_entries) + " entries in " + filename + ", counted in "+str(time_used) +" seconds\n")
 
 # directly tally the number of unique plates
 print("Calculating exact amount of unique plates..")
@@ -38,7 +43,7 @@
 # approximate number of plates using LogLog Estimation
 print("Approximating amount of unique plates using LogLog..")
 start = time.time()
-loglog_appx = LogLogEstimator.LogLog(csv_parser.get_plates(filename), ex_buckets)
+loglog_appx = LogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
 time_used = time.time() - start
 percent_error = abs(loglog_appx - exact_unique) / exact_unique * 100
 print ("LogLog Estimation: " + str(loglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
@@ -47,7 +52,7 @@
 # approximate number of plates using SuperLogLog Estimation
 print("Approximating amount of unique plates using SuperLogLog..")
 start = time.time()
-sloglog_appx = SuperLogLogEstimator.SuperLogLog(csv_parser.get_plates(filename), ex_buckets)
+sloglog_appx = SuperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
 time_used = time.time() - start
 percent_error = abs(sloglog_appx - exact_unique) / exact_unique * 100
 print ("SuperLogLog Estimation: " + str(sloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
@@ -56,7 +61,7 @@
 # approximate number of plates using HyperLogLog Estimation
 print("Approximating amount of unique plates using HyperLogLog..")
 start = time.time()
-hloglog_appx = HyperLogLogEstimator.HyperLogLog(csv_parser.get_plates(filename), ex_buckets)
+hloglog_appx = HyperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
 time_used = time.time() - start
 percent_error = abs(hloglog_appx - exact_unique) / exact_unique * 100
 print ("HyperLogLog Estimation: " + str(hloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
