#!/usr/bin/env python # gpf: testing how often cuckoo_insert goes into infinite loop and the releationship between # success_rate and hash table size. # comments: # This implementation uses 2 hash tables and 2 respective hash functions. # The size of a hash table is the next mprime number of the total amount of elements. # So for 100 elements we will have 2 * 101 = 202 cells. # observations: # This way, success rate is around ~85%. # If we increate the size of the tables by 10%, success rate goes to ~96% # In this case, success rates increase as the number of total elements n increase. # TODO: # Next step is to test with actual real life data (pefs namemacs) and maybe some # modification of the vanilla cuckoo algorithm import random import gmpy from gmpy import mpz def hash1(elem, hts): pos = elem % hts return pos def hash2(elem, hts): pos = (elem / hts) % hts return pos def cuckoo_insert(elem, ht1, ht2, hts): max_tries = hts for i in xrange(0, max_tries): pos1 = hash1(elem, hts) elem1 = ht1[pos1] # do the cuckoo ht1[pos1] = elem if elem1 == 0: return 0 pos2 = hash2(elem1, hts) elem2 = ht2[pos2] # do the cuckoo ht2[pos2] = elem1 if elem2 == 0: return 0 else: elem = elem2 return 1 def build_cuckoo_table(R, hts): ht1 = [ 0 for x in xrange(0, hts) ] ht2 = [ 0 for x in xrange(0, hts) ] for i in xrange(0, len(R)): r = R[i] inf = cuckoo_insert(r, ht1, ht2, hts) if inf == 1: return 1 return 0 def generate_ht(n, tries, extra_space): nextra = int(n * extra_space / 100) hts = int(gmpy.next_prime(mpz(n + nextra))) print 'Items: %s' % (n,) print 'Hash Table Sizes: ', hts print 'Extra Space for each table: %.02f%%' % extra_space successful = 0 for j in xrange(0, tries): R = [ random.getrandbits(64) for i in xrange(0, n) ] while len(R) != len(set(R)): print 'Initial hash collisions, regenerating: %s/%s', len(set(R)), n R = [ random.getrandbits(64) for i in xrange(0, n) ] res = build_cuckoo_table(R, hts) if res == 0: successful += 1 #done = float(100) * j / tries #if (done % 25) == 0: #print 'Done: %.02f%%' % (done) succ_rate = float(100) * successful / tries print 'Tries: ', tries print 'Successful %d, Success rate %.02f%%' % (successful, succ_rate) print '' # XXXgpf: better to lower the 2nd argument by 1/10 if you want results in a couple of mins generate_ht(100, 10000, 0) generate_ht(1000, 10000, 0) generate_ht(10000, 10000, 0) generate_ht(100000, 1000, 0) generate_ht(100, 10000, 10) generate_ht(1000, 10000, 10) generate_ht(10000, 10000, 10) generate_ht(100000, 1000, 10)