Skip to content

Commit 4cf92f1

Browse files
committed
added FM Estimation and reworked output
1 parent 37aa2af commit 4cf92f1

File tree

4 files changed

+116
-87
lines changed

4 files changed

+116
-87
lines changed

FlajoletMartinEstimator.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
2+
# COS226 University of Maine
3+
4+
#returns an estimate of the cardinality of the given hashes
5+
def FlajoletMartin(hashes):
    """Estimate the number of distinct values in *hashes* using the
    Flajolet-Martin probabilistic counting algorithm.

    Each element is treated (via str()) as a binary string; the estimator
    tracks, across all hashes, which trailing-zero counts ("ranks") have
    been observed, then uses the lowest unobserved rank r to form the
    estimate 2**r / phi, where phi ~= 0.77351 corrects the algorithm's
    known average-case bias.

    Parameters:
        hashes: iterable of binary-string hashes (or objects whose str()
                is a binary string). Assumed at most 32 significant bits;
                larger ranks are clamped -- TODO confirm against caller.

    Returns:
        float estimate of the cardinality (>= 1/0.77351 even for empty input).
    """
    # bitmap[i] is True once some hash with exactly i trailing zeros was seen
    bitmap = [False] * 32

    for h in hashes:
        # rank = number of trailing zeros in the binary string
        rank = 0
        for bit in reversed(str(h)):
            if bit != "0":
                break
            rank += 1
        # clamp: an all-zero string of length >= 32 would otherwise
        # index past the end of the bitmap (IndexError in the original)
        bitmap[min(rank, 31)] = True

    # r = lowest rank never observed
    r = 0
    while r < len(bitmap) and bitmap[r]:
        r += 1

    # there is a predictable bias in the average case, corrected by this factor
    BIAS = 0.77351
    return (2 ** r) / BIAS

HyperLogLogEstimator.py

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,22 @@
11
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
22
# COS226 University of Maine
33

4-
import hashlib
5-
6-
#returns an estimate of the cardinality of values using k bits for the buckets
7-
def HyperLogLog(hashes, off, k):
4+
#returns an estimate of the cardinality of given hashes using k bits for the buckets
5+
def HyperLogLog(hashes, k):
86
#number of buckets
97
m = 2 ** k
10-
#initialize buckets to 0
8+
#initialize buckets to 0
119
buckets = [0] * m
1210

1311
#loop through all hashes
14-
for hash in hashes:
15-
#encode the hash in binary
16-
hash = '{:256b}'.format(int(hash, 16)).replace(' ','0')
17-
18-
temp = hash[len(hash)-(32+off)+1:len(hash)-off]
19-
12+
for i in range(0, len(hashes)):
2013
#get bucket{j} of the hash (first k bits)
21-
j = temp[:k]
22-
j = int(j, 2)
14+
j = str(hashes[i])[:k]
15+
j = int(j, 2)
2316

24-
#get remaining bits after bucket
25-
data = temp[k:32]
26-
#get rank of the first 1 in the remaining bits (number of trailing zeros)
17+
#get remaining bits (remaining bits after bucket)
18+
data = hashes[i][k:]
19+
#get rank of the first 1 in the remaining bits (number of trailing zeros)
2720
rank = 1
2821
for c in reversed(data):
2922
if c == "0":
@@ -35,14 +28,12 @@ def HyperLogLog(hashes, off, k):
3528
buckets[j] = max(buckets[j], rank)
3629

3730
# get the harmonic mean of the buckets
38-
# harmonic mean{z} = 1 / sum(2 ^ -buckets)
3931
total = 0
4032
for bucket in buckets:
4133
total += 2 ** (-1 * bucket)
42-
z = total ** -1
43-
34+
mean = total ** -1
4435
#calculate estimate
45-
estimate = (m ** 2) * z
36+
estimate = (m ** 2) * mean
4637

4738
#bias can be approximated with the formula 0.7213 / (1 + (1.079/2^k)) for k > 5
4839
#since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this as an estimate of the bias
@@ -51,11 +42,4 @@ def HyperLogLog(hashes, off, k):
5142
#correct for bias
5243
estimate = BIAS * estimate
5344

54-
return estimate
55-
56-
#find the average estimate for n different offsets of the hash with k bits for buckets
57-
def average_with_different_hashes(hashes, k, n):
58-
total = 0
59-
for i in range(n):
60-
total += HyperLogLog(hashes, i*32, k)
61-
return total/n
45+
return estimate

LogLogEstimator.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,21 @@
11
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
22
# COS226 University of Maine
33

4-
import hashlib
5-
6-
#returns an estimate of the cardinality of values using k bits for the buckets
7-
def LogLog(hashes, off, k):
4+
#returns an estimate of the cardinality of given hashes using k bits for the buckets
5+
def LogLog(hashes, k):
86
#number of buckets
9-
m = 2**k
7+
m = 2 ** k
108
#initialize buckets to 0
119
buckets = [0] * m
1210

1311
#loop through all hashes
14-
for hash in hashes:
15-
#encode the hash in binary
16-
hash = '{:256b}'.format(int(hash, 16)).replace(' ','0')
17-
18-
temp = hash[len(hash)-(32+off)+1:len(hash)-off]
19-
12+
for i in range(0, len(hashes)):
2013
#get bucket{j} of the hash (first k bits)
21-
j = temp[:k]
22-
j = int(j, 2)
14+
j = str(hashes[i])[:k]
15+
j = int(j, 2)
2316

2417
#get remaining bits (remaining bits after bucket)
25-
data = temp[k:32]
18+
data = hashes[i][k:]
2619
#get rank of the first 1 in the remaining bits (number of trailing zeros)
2720
rank = 1
2821
for c in reversed(data):
@@ -37,19 +30,12 @@ def LogLog(hashes, off, k):
3730
#get bucket average
3831
average = sum(buckets) / m
3932
#calculate estimate
40-
estimate = m * 2 ** average
33+
estimate = m * (2 ** average)
4134

42-
#bias converges to 0.79402 for larger numbers of buckets (k > 5)
35+
#bias converges to 0.397011808 for larger numbers of buckets (k > 5)
4336
#since we want to be using more buckets anyway, and the precise value of the bias is costly to calculate, we will use this convergence as an estimate of the bias
4437
BIAS = 0.397011808
4538
#correct for bias
4639
estimate = BIAS * estimate
4740

48-
return estimate
49-
50-
#find the average estimate for n different offsets of the hash with k bits for buckets
51-
def average_with_different_hashes(hashes, k, n):
52-
total = 0
53-
for i in range(n):
54-
total += LogLog(hashes, i*32, k)
55-
return total/n
41+
return estimate

main.py

Lines changed: 57 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott
33
# COS226 University of Maine
44

5-
import os.path, time, random
6-
import csv_parser, ActualCount, LogLogEstimator, SuperLogLogEstimator, HyperLogLogEstimator
5+
import os.path, time, sys
6+
import csv_parser, ActualCount, FlajoletMartinEstimator, LogLogEstimator, HyperLogLogEstimator
77

88
# the file to parse for unique plates
99
filename = "nyc-dataset.csv"
@@ -18,51 +18,73 @@
1818
print ("LogLog, SuperLogLog, and HyperLogLog Demo")
1919
print ("by Dawsin Blanchard, Sam Braga, Brian Couture, and Ethan Trott\n")
2020

21-
# check to make sure the dataset has been downloaded and named properly
22-
if not os.path.exists(filename):
23-
print("Error: Please download the dataset from https://data.cityofnewyork.us/api/views/faiq-9dfq/rows.csv?accessType=DOWNLOAD and place it in this directory as "+filename)
24-
exit()
25-
2621
plate_hashes = []
2722
if os.path.exists("hashes.csv"):
2823
print("Loading plate hashes from file..")
2924
plate_hashes = csv_parser.load_hashes()
3025
else:
26+
# check to make sure the dataset has been downloaded and named properly
27+
if not os.path.exists(filename):
28+
print("Error: Please download the dataset from https://data.cityofnewyork.us/api/views/faiq-9dfq/rows.csv?accessType=DOWNLOAD and place it in this directory as "+filename)
29+
exit()
30+
3131
print("Creating plate hashes..")
3232
plate_hashes = csv_parser.create_hashes(filename)
3333
print(str(len(plate_hashes))+" hashes loaded.\n")
3434

3535

3636
# directly tally the number of unique plates
37-
print("Calculating exact amount of unique plates..")
38-
start = time.time()
37+
print("Calculating exact amount of unique plates..\n")
3938
exact_unique = ActualCount.get_exact_unique_using_set(filename)
40-
time_used = time.time() - start
41-
print ("There are exactly " + str(exact_unique)+ " unique plates, counted in "+str(time_used)+" seconds\n")
39+
40+
#run tests 8 times
41+
fm = []
42+
ll = []
43+
hll = []
44+
for i in range(0, 8):
45+
#print loading message
46+
sys.stdout.write("Running test " + str(i + 1) + " of 8...")
47+
sys.stdout.flush()
48+
49+
#Generate test hashes
50+
hashes = []
51+
for hash in plate_hashes:
52+
#encode the hash in binary
53+
hash = '{:256b}'.format(int(hash, 16)).replace(' ','0')
54+
55+
off = i*32
56+
temp = hash[len(hash)-(32+off)+1:len(hash)-off]
57+
hashes.append(temp)
58+
59+
#add to results array
60+
fm.append(FlajoletMartinEstimator.FlajoletMartin(hashes))
61+
ll.append(LogLogEstimator.LogLog(hashes, ex_buckets))
62+
hll.append(HyperLogLogEstimator.HyperLogLog(hashes, ex_buckets))
63+
64+
#reset line
65+
sys.stdout.write('\r')
4266

43-
# approximate number of plates using LogLog Estimation
44-
print("Approximating amount of unique plates using LogLog..")
45-
start = time.time()
46-
loglog_appx = LogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
47-
time_used = time.time() - start
48-
percent_error = abs(loglog_appx - exact_unique) / exact_unique * 100
49-
print ("LogLog Estimation: " + str(loglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
50-
print ("Percent Error: " + str(percent_error) + "% \n")
67+
#test formatter
68+
def printTest(label, expected, actual):
69+
print('*'*50)
70+
print('{s:{c}^{n}}'.format(s=f' {label} ', n=50, c='*'))
71+
print('*'*50)
72+
print()
5173

52-
# approximate number of plates using SuperLogLog Estimation
53-
print("Approximating amount of unique plates using SuperLogLog..")
54-
start = time.time()
55-
sloglog_appx = SuperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
56-
time_used = time.time() - start
57-
percent_error = abs(sloglog_appx - exact_unique) / exact_unique * 100
58-
print ("SuperLogLog Estimation: " + str(sloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
59-
print ("Percent Error: " + str(percent_error) + "% \n")
74+
print('{s:{c}^{n}}'.format(s=' Expected result ', n=50, c='-'))
75+
print(expected)
76+
print('{s:{c}^{n}}'.format(s=' Actual results ', n=50, c='-'))
77+
print("\n".join("{0:.0f}".format(x) for x in actual))
78+
print('{s:{c}^{n}}'.format(s=' Average result ', n=50, c='-'))
79+
average = sum(actual) / len(actual)
80+
print("{0:.0f}".format(average))
81+
print('{s:{c}^{n}}'.format(s=' Difference ', n=50, c='-'))
82+
print("{0:.0f}".format(expected - average))
83+
print('{s:{c}^{n}}'.format(s=' Percent Error ', n=50, c='-'))
84+
print("{0:.2%}".format(abs(expected - average) / expected))
85+
print()
6086

61-
# approximate number of plates using HyperLogLog Estimation
62-
print("Approximating amount of unique plates using HyperLogLog..")
63-
start = time.time()
64-
hloglog_appx = HyperLogLogEstimator.average_with_different_hashes(plate_hashes, ex_buckets, loops_to_run)
65-
time_used = time.time() - start
66-
percent_error = abs(hloglog_appx - exact_unique) / exact_unique * 100
67-
print ("HyperLogLog Estimation: " + str(hloglog_appx)+ " unique plates, approximated in "+str(time_used)+" seconds")
68-
print ("Percent Error: " + str(percent_error) + "% \n")
87+
#print results
88+
printTest("Flajolet-Martin", exact_unique, fm)
89+
printTest("LogLog", exact_unique, ll)
90+
printTest("HyperLogLog", exact_unique, hll)

0 commit comments

Comments
 (0)