#!/usr/bin/env python
#
#  pycdb.py - Python implementation of cdb
#
#   * public domain *
#

import sys, os
from struct import pack, unpack


# Compute the cdb hash value of a given key.
def hash(s):
  """Return the 32-bit cdb hash of the key `s`.

  The hash starts at 5381 and folds in every character code `c` of the key
  as ``h = ((h * 33) ^ c) & 0xffffffff``, so the result always fits in an
  unsigned 32-bit integer.  An empty key hashes to 5381.

  s: the key.  For a byte string (`bytes` / `bytearray`) the byte values are
     used directly; for any other iterable of single characters, ``ord()``
     of each character is used.

  NOTE: the name shadows the builtin hash() within this module; it is kept
  because the other classes in this file call it by this name.
  """
  if isinstance(s, (bytes, bytearray)):
    # bytearray() yields integers on both Python 2 and Python 3, so the loop
    # below does not depend on how the interpreter iterates byte strings.
    codes = bytearray(s)
  else:
    codes = (ord(c) for c in s)
  h = 5381
  for c in codes:
    # (h << 5) + h == h * 33; mask after every step to stay within 32 bits.
    h = (((h << 5) + h) ^ c) & 0xffffffff
  return h


##  CDBIterator
##
class CDBIterator:
  """Sequential iterator over the records of a cdb file.

  Records are read one after another, starting at file offset 2048 (right
  after the 256 * 8-byte header) and stopping once the read position reaches
  `eod`.  Each record is laid out as two little-endian unsigned 32-bit
  lengths (key length, value length) followed by the key and the value.

  The iterator supports both the Python 2 (`next`) and the Python 3
  (`__next__`) iterator protocols.
  """

  def __init__(self, fp, eod, t):
    """Set up an iterator over the records readable from `fp`.

    fp:  file object opened for binary reading.  The iterator seeks it
         before every read, so the file position need not be preserved
         between calls.
    eod: file offset at which the record area ends.
    t:   what to return for each record: 1 for the key only, 2 for the
         value only, anything else for a (key, value) tuple.
    """
    self._fp = fp
    self._kloc = 2048  # offset of the next record to read
    self._eod = eod
    self._t = t
    return

  def __iter__(self):
    return self

  def next(self):
    """Return the next key, value or (key, value) tuple, as selected by `t`.

    Raises StopIteration once the read position has reached `eod`.
    """
    if self._eod <= self._kloc:
      raise StopIteration
    self._fp.seek(self._kloc)
    (klen, vlen) = unpack('<LL', self._fp.read(4+4))
    k = self._fp.read(klen)
    v = self._fp.read(vlen)
    # advance past the two length fields plus the key and value just read
    self._kloc += 4+4+klen+vlen
    if self._t == 1:
      return k
    elif self._t == 2:
      return v
    return (k,v)

  # Python 3 looks up __next__ instead of next; keep both spellings so that
  # explicit .next() callers and for-loops / next() all work.
  __next__ = next


##  CDBReader
##
class CDBReader:
  """Read-only, dictionary-like access to a cdb file.

  Lookups (`reader[key]`, `get`) go through the on-disk hash tables;
  `firstkey`/`nextkey`/`each` and the `iter*` methods walk the records
  sequentially.  Successful lookups are optionally memoised in memory.
  """

  def __init__(self, cdbname, docache=1):
    """Open the cdb file `cdbname` for reading.

    cdbname: path of the cdb file.
    docache: when true, values found by lookups are kept in an in-memory
             dictionary and served from there on later lookups.
    """
    self.name = cdbname
    self._fp = open(cdbname, "rb")
    try:
      # The first header field is the position of the first hash table,
      # which is where the record area ends.
      (self._eod,) = unpack('<L', self._fp.read(4))
    except Exception:
      # do not leak the file handle when the header cannot be read
      self._fp.close()
      raise
    self._docache = docache
    self._cache = {}
    self._keyiter = None   # iterator behind firstkey()/nextkey()
    self._eachiter = None  # iterator behind each()
    return

  def close(self):
    """Close the underlying file; the reader must not be used afterwards."""
    self._fp.close()
    return

  def __getitem__(self, k):
    """Return the value stored under key `k`.

    Byte-string keys are used as-is; any other object is converted with
    str() first.  Raises KeyError when the key is not present.
    """
    if not isinstance(k, bytes):
      k = str(k)
    if k in self._cache:
      return self._cache[k]
    h = hash(k)
    # The low 8 bits of the hash select one of the 256 header entries, each
    # holding (position of the hash table, number of slots in it).
    self._fp.seek((h % 256)*(4+4))
    (pos_bucket, ncells) = unpack('<LL', self._fp.read(4+4))
    if ncells == 0: raise KeyError(k)
    # Probe linearly, starting at the slot chosen by the remaining hash bits
    # and wrapping around at the end of the table.
    start = (h >> 8) % ncells
    for i in range(ncells):
      self._fp.seek(pos_bucket + ((start+i) % ncells)*(4+4))
      (h1, p1) = unpack('<LL', self._fp.read(4+4))
      # an empty slot ends the probe sequence: the key is absent
      if p1 == 0: raise KeyError(k)
      if h1 == h:
        self._fp.seek(p1)
        (klen, vlen) = unpack('<LL', self._fp.read(4+4))
        k1 = self._fp.read(klen)
        if k1 == k:
          # the value directly follows the key; read it only on a real match
          v1 = self._fp.read(vlen)
          if self._docache:
            self._cache[k] = v1
          return v1
    raise KeyError(k)

  def get(self, k, failed=None):
    """Return the value stored under `k`, or `failed` if the key is absent."""
    try:
      return self.__getitem__(k)
    except KeyError:
      return failed

  def firstkey(self):
    """Restart key iteration and return the first key (None if there is none)."""
    self._keyiter = None
    return self.nextkey()

  def nextkey(self):
    """Return the next key of the current key iteration, or None at the end.

    After the end has been reached, None is returned until firstkey() is
    called again.
    """
    if self._keyiter is None:
      self._keyiter = CDBIterator(self._fp, self._eod, 1)
    try:
      return self._keyiter.next()
    except StopIteration:
      return None

  def each(self):
    """Return the next (key, value) tuple, or None after the last record.

    Once None has been returned, the following call starts over at the first
    record.
    """
    if self._eachiter is None:
      self._eachiter = CDBIterator(self._fp, self._eod, 3)
    try:
      return self._eachiter.next()
    except StopIteration:
      # drop the exhausted iterator so that the next call wraps around
      self._eachiter = None
      return None

  def iterkeys(self):
    """Return a fresh iterator over all keys, in file order."""
    return CDBIterator(self._fp, self._eod, 1)

  def itervalues(self):
    """Return a fresh iterator over all values, in file order."""
    return CDBIterator(self._fp, self._eod, 2)

  def iteritems(self):
    """Return a fresh iterator over all (key, value) tuples, in file order."""
    return CDBIterator(self._fp, self._eod, 3)


##  CDBMaker
##
class CDBMaker:
  """Build a cdb file.

  Records are appended to a temporary file with add(); finish() then writes
  the hash tables and the header and renames the temporary file to its final
  name.
  """

  def __init__(self, cdbname, tmpname):
    """Start building a cdb.

    cdbname: final path of the cdb file (created by finish()).
    tmpname: path of the temporary file the data is written to; it is
             opened for writing immediately.
    """
    self.fn = cdbname
    self.fntmp = tmpname
    self.numentries = 0
    self._fp = open(tmpname, "wb")
    # Records start after the header: 256 entries of sizeof((h,p)) bytes.
    self._pos = (4+4)*256
    # One list of (hash, record position) pairs per header entry.
    self._bucket = [ [] for i in range(256) ]
    return

  def add(self, k, v):
    """Append the record (k, v) to the database.

    Byte strings are stored as-is; any other object is converted with str()
    first.
    """
    if not isinstance(k, bytes):
      k = str(k)
    if not isinstance(v, bytes):
      v = str(v)
    (klen, vlen) = (len(k), len(v))
    self._fp.seek(self._pos)
    self._fp.write(pack('<LL', klen, vlen))
    self._fp.write(k)
    self._fp.write(v)
    h = hash(k)
    # the low 8 bits of the hash pick the table this record is indexed in
    self._bucket[h % 256].append((h, self._pos))
    # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
    self._pos += 4+4+klen+vlen
    self.numentries += 1
    return

  def finish(self):
    """Write the hash tables and the header, then move the file into place.

    The temporary file is closed even if writing fails; it is renamed to the
    final name only after everything was written successfully.
    """
    try:
      self._fp.seek(self._pos)
      pos_hash = self._pos
      # write hashes: one open-addressing table per non-empty bucket, with
      # twice as many slots as entries
      for b1 in self._bucket:
        if not b1:
          continue
        ncells = len(b1)*2
        cell = [(0,0)] * ncells
        for (h,p) in b1:
          i = (h >> 8) % ncells
          while cell[i][1]:  # is cell[i] already occupied?
            i = (i+1) % ncells
          cell[i] = (h,p)
        for (h,p) in cell:
          self._fp.write(pack('<LL', h, p))
      # write header: (table position, number of slots) for each bucket
      self._fp.seek(0)
      for b1 in self._bucket:
        self._fp.write(pack('<LL', pos_hash, len(b1)*2))
        pos_hash += (len(b1)*2)*(4+4)
    finally:
      # close
      self._fp.close()
    os.rename(self.fntmp, self.fn)
    return


#
def cdbmake(cdbname, tmpname):
  """Return a new CDBMaker that builds `cdbname`, writing through `tmpname`."""
  maker = CDBMaker(cdbname, tmpname)
  return maker
def init(cdbname):
  """Open the cdb file `cdbname` and return a CDBReader for it."""
  reader = CDBReader(cdbname)
  return reader
