#!/usr/bin/env python # gleb's script to try out hash table stuff import random import gmpy from gmpy import mpz def count_coll(R, p, embed): s = 0 c = [ 0 for x in xrange(0, p) ] for i in xrange(0, len(R)): r = R[i] c[r % p] += 1 m = max(c) coll = [ c.count(i) for i in xrange(m, 0, -1) ] slots = p * embed fast = sum([ coll[-i] * i for i in xrange(1, embed + 1)]) fast = float(100) * fast / len(R) unused0 = c.count(0) * embed unused = sum([ coll[-i] * (embed - i) for i in xrange(1, embed)]) unusedp = float(100) * (unused0 + unused) / slots return "Max collisions: %s, slots: %d/%d, unused: %.02f%% %d+%d, embedded %d: %.02f%%, %s" % \ (m, p, slots, unusedp, unused0, unused, embed, fast, coll) def clp2(x): x = x - 1 x = x | (x >> 1) x = x | (x >> 2) x = x | (x >> 4) x = x | (x >> 8) x = x | (x >>16) return x + 1 def generate_ph(n): # pz = int(gmpy.next_prime(mpz(n*2/5))) pz = int(gmpy.next_prime(mpz(n*3/5))) pz1 = int(gmpy.next_prime(mpz(n))) p2 = int(clp2(n) / 2) print 'Items: %s' % (n,) R = [ random.getrandbits(64) for i in xrange(0, n) ] while len(R) != len(set(R)): print 'Initial hash collisions, regenerating: %s/%s', len(set(R)), n R = [ random.getrandbits(64) for i in xrange(0, n) ] print 'Prime ', count_coll(R, pz, 4) print 'Power2', count_coll(R, p2, 4) print 'Prime ', count_coll(R, pz, 2) print 'Power2', count_coll(R, p2, 2) print 'Prime ', count_coll(R, pz1, 1) print 'Power2', count_coll(R, p2 * 2, 1) print '' generate_ph(1000) generate_ph(11000) generate_ph(22000) generate_ph(140000)