#!/usr/bin/env python
# -*- encoding: japanese.euc_jp -*-

try:
  import cdb
except ImportError:
  import pycdb as cdb
import sys, sexpr, glob
from abstfilter import AbstractFeeder, AbstractFilter, AbstractConsumer

from cforms import KFORM
from table import TABLE


# findlabels
def findlabels(base, pos, ktype, kform):
  """Generate the TABLE values that apply to one dictionary entry.

  Every field is tried both as given and as the wildcard "*".  For each
  combination a key of the form "pos,ktype,kform,base" is looked up in
  TABLE; combinations that are not in TABLE are skipped.
  """
  # Dict keys serve as a tiny set here, so a field that is already "*"
  # is tried only once.
  pos_opts = { pos:1, "*":1 }.keys()
  ktype_opts = { ktype:1, "*":1 }.keys()
  kform_opts = { kform:1, "*":1 }.keys()
  base_opts = { base:1, "*":1 }.keys()
  for p in pos_opts:
    for t in ktype_opts:
      for f in kform_opts:
        for b in base_opts:
          key = u"%s,%s,%s,%s" % (p, t, f, b)
          try:
            label = TABLE[key]
          except KeyError:
            continue
          yield label

# kexpand
def kexpand(base, yomi, ktype):
  """Generate (kform, string, reading) tuples for the forms of a word.

  KFORM[ktype] maps each form name to a list of suffix pairs.  The
  suffixes of one reference form are cut off base and yomi to obtain
  stems, and every listed pair is then appended to those stems.  If a
  KeyError occurs (e.g. ktype is not in KFORM), the word is yielded
  unchanged with "*" as its form name.
  """
  try:
    table = KFORM[ktype]
    # NOTE(review): this key looks mis-encoded in the copy under review;
    # presumably it is the EUC-JP name of the reference (dictionary) form
    # -- confirm against the keys used in cforms.KFORM.
    reference = table[u"ܷ"]
    assert len(reference) == 1
    # NOTE(review): this pair is unpacked as (yomi suffix, base suffix),
    # the reverse of the (base suffix, yomi suffix) order used in the loop
    # below.  Only the lengths are used here -- confirm against cforms.
    (ysuffix, ssuffix) = reference[0]
    ycut = len(yomi)-len(ysuffix)
    scut = len(base)-len(ssuffix)
    ystem = yomi[:ycut]
    sstem = base[:scut]
    for (kform, pairs) in table.iteritems():
      for (stail, ytail) in pairs:
        yield (kform, sstem+stail, ystem+ytail)
  except KeyError:
    # No usable conjugation table: emit the word as it is.
    yield ("*", base, yomi)


##  DictCompiler
##
class DictCompiler:
  """Collect dictionary entries in memory and write them to a cdb file.

  self.trie maps a string to a pair (links, infos):
    links -- dict whose keys are the lengths of longer registered
             strings that start with this string (filled by finish()).
    infos -- list of (kform, yomi, (base, pos, ktype, cost)) tuples
             registered for exactly this string.
  """

  def __init__(self):
    self.trie = {}
    return

  def intern(self, s):
    """Return the (links, infos) entry for s, creating it if missing."""
    try:
      return self.trie[s]
    except KeyError:
      ent = ({}, [])
      self.trie[s] = ent
      return ent

  # all strings should be unicode.
  def add(self, base, pos, yomi, ktype, cost):
    """Register a word under every string produced by kexpand().

    Returns self so that calls can be chained.
    """
    info = (base, pos, ktype, cost)
    for (kform,s,y) in kexpand(base, yomi, ktype):
      self.intern(s)[1].append((kform, y, info))
    return self

  def finish(self):
    """Link every registered string to its nearest registered prefix.

    For each key longer than one character, the longest proper prefix
    of length >= 2 that is already registered records the key's length
    in its links.  If there is none, the one-character prefix records
    it instead (and is created if needed).  Returns self.
    """
    # Iterate over a snapshot: intern() below may add one-character keys
    # while the loop is running.
    for k in list(self.trie):
      if len(k) <= 1:
        continue
      for i in range(len(k)-1, 1, -1):
        t = k[:i]
        if t in self.trie:
          self.trie[t][0][len(k)] = 1
          break
      else:
        self.intern(k[:1])[0][len(k)] = 1
    return self

  def _format_info(self, info):
    """Format one infos element as "base,yomi,pos,ktype,kform,cost:labels"."""
    (kform, yomi, (base, pos, ktype, cost)) = info
    labels = ",".join(map(str, findlabels(base, pos, ktype, kform)))
    return "%s,%s,%s,%s,%s,%s:%s" % (base, yomi, pos, ktype, kform, cost, labels)

  def export_cdb(self, cdbname, encout="japanese.euc_jp", dbg=None):
    """Write all entries to the cdb file cdbname.

    Each record's key is the string encoded with encout.  Its value is
    the sorted, comma-separated link lengths, a space, and the
    space-separated formatted infos.  If dbg is given, a "." is written
    to it every 1000 records and a newline at the end.  Returns self.
    """
    # cdbname+".tmp" is the second cdbmake argument (presumably the
    # temporary build file -- see the cdb module documentation).
    db = cdb.cdbmake(cdbname, cdbname+".tmp")
    i = 0
    for (s, (links, infos)) in self.trie.iteritems():
      lengths = list(links.keys())
      lengths.sort()
      n = ",".join(map(str, lengths))
      value = "%s %s" % (n, " ".join(map(self._format_info, infos)))
      # The value is unicode whenever an entry contains unicode text.
      # Encode it like the key, instead of depending on the interpreter's
      # default encoding when cdb converts it to bytes.
      if isinstance(value, unicode):
        value = value.encode(encout)
      db.add(s.encode(encout), value)
      i += 1
      if dbg and (i%1000)==0:
        dbg.write(".")
        dbg.flush()
    db.finish()
    if dbg:
      dbg.write("\n")
      dbg.flush()
    return self


##  IPADictReader
##
class IPADictReader(AbstractConsumer):
  """Consumer that feeds parsed IPA dictionary records to a dictmaker.

  feed() expects objects in alternating order: a part-of-speech record
  first, then the word record it applies to.  Word fields are decoded
  with encin and passed to dictmaker.add().

  NOTE(review): the non-ASCII keys compared against in feed() look
  mis-encoded in the copy under review; presumably they are the EUC-JP
  field names of the IPA dictionary -- confirm against the real file.
  """

  def __init__(self, dictmaker, encin="japanese.euc_jp"):
    AbstractConsumer.__init__(self)
    self.dictmaker = dictmaker
    self.encin = encin
    # 0: the next object is a part-of-speech record, 1: a word record.
    self.t = 0
    return

  def feed(self, x):
    """Consume one parsed object: a part-of-speech or a word record.

    A word record that cannot be turned into a dict is written to
    stdout and skipped.  A word whose fields cannot be decoded is
    reported on stderr and skipped.
    """
    def dec(s): return unicode(s, self.encin)
    if self.t == 0:
      # Field names are byte strings: they are compared with the
      # undecoded parser output.
      assert x[0] == "ʻ"           # not unicode!
      self.pos = "-".join(x[1])
    else:
      try:
        rec = dict(x)
      except ValueError:
        # Malformed record: show it and skip it.  Falling through would
        # only fail with a TypeError when indexing the raw sequence.
        sys.stdout.write("%s\n" % sexpr.sexpr2str(x))
        rec = None
      if rec is not None:
        (base, cost) = rec["Ф"]    # not unicode!
        yomi = rec.get("ɤ", base)      # not unicode!
        ktype = rec.get("ѷ", "*")    # not unicode!
        try:
          self.dictmaker.add(dec(base), dec(self.pos), dec(yomi), dec(ktype), dec(cost))
        except UnicodeError:
          sys.stderr.write('"%s"! ' % base)
    # Always advance, so that the POS/word alternation stays in step.
    self.t = (self.t+1) % 2
    return

  def read(self, fname, dbg=None):
    """Parse the file fname line by line, feeding this consumer.

    If dbg is given, a "." is written to it every 1000 lines and a
    newline at the end.  Returns self.
    """
    p = sexpr.SExprReader(self)
    f = open(fname)
    try:
      i = 0
      for line in f:
        p.feed(line)
        i += 1
        if dbg and (i%1000)==0:
          dbg.write(".")
          dbg.flush()
    finally:
      # Close the file even when parsing or feeding raises.
      f.close()
    if dbg:
      dbg.write("\n")
      dbg.flush()
    return self


# main: compile the dictionary files named on the command line into a cdb.
if __name__ == "__main__":
  argv = sys.argv
  if not (3 <= len(argv)):
    print >>sys.stderr, "usage: dictool.py cdbname [files ...]"
    sys.exit(2)
  cdbname = argv[1]
  patterns = argv[2:]
  compiler = DictCompiler()
  reader = IPADictReader(compiler)
  # Each remaining argument is expanded as a glob pattern.
  for pattern in patterns:
    for fname in glob.glob(pattern):
      # The trailing comma keeps the progress dots on the same line.
      print >>sys.stderr, "Reading:", fname,
      reader.read(fname, sys.stderr)
  reader.close()
  print >>sys.stderr, "Writing:", cdbname,
  compiler.finish().export_cdb(cdbname, dbg=sys.stderr)
