#!/usr/bin/env python2.3
# -*- encoding: japanese.euc_jp -*-
#
#  morph.py - a tiny morphological analyzer
#
#  by Yusuke Shinyama
#  * public domain *

try:
  import cdb
except ImportError:
  import pycdb as cdb


##  Morpheme
##
class Morph:
  # One morpheme candidate placed in the lattice.
  #
  # The constructor takes exactly nine positional arguments, in this order:
  #   loc    - position of the morpheme in the lattice
  #   s      - the matched string; its length gives the distance to the
  #            following lattice position
  #   base, yomi, pos, ktype, kform
  #          - dictionary fields stored as-is (presumably base form,
  #            reading, part of speech, conjugation type and form --
  #            confirm against the dictionary format)
  #   cost   - cost of the morpheme itself
  #   labels - list of integer labels used to look up connection costs
  #
  # Two more attributes hold the state of the best-path search:
  #   mincost - smallest accumulated cost found so far for reaching this
  #             morpheme (meaningful only once prev is set)
  #   prev    - the preceding morpheme on that cheapest path, or None

  def __init__(self, *args):
    # a wrong number of arguments fails here with ValueError on unpacking.
    (self.loc, self.s, self.base, self.yomi, self.pos,
     self.ktype, self.kform, self.cost, self.labels) = args
    self.mincost = 0
    self.prev = None
    return

  def __repr__(self):
    return '<Morph: "%s"(%s) %s %s>' % (self.s, self.pos, self.ktype, self.kform)

  # update the minimal cost if possible.
  # The first candidate is always accepted; later ones only replace it when
  # they are strictly cheaper, so on a tie the earlier predecessor is kept.
  def update(self, mincost, prev):
    # identity test: "no predecessor yet" must not depend on any __eq__.
    if self.prev is None or mincost < self.mincost:
      self.mincost = mincost
      self.prev = prev
    return


##  Matrix
##
class Matrix:
  # Connection-cost table backed by a cdb file.
  # Keys have the form "<label1>-<label2>"; values are integers in text form.

  def __init__(self, cdbname):
    # open the cdb file that holds the table.
    self.mat = cdb.init(cdbname)
    return

  # find the most specific entry that matches any pair of the given labels.
  # Every (label1, label2) combination is tried; among the pairs present in
  # the table, the one with the smallest label sum wins (first one found on
  # a tie).  Returns 10000 when no pair is defined.
  def lookup(self, labels1, labels2):
    best_cost = 10000                   # cost of an undefined link
    best_rank = 99999                   # lower number: more specific
    for left in labels1:
      for right in labels2:
        try:
          value = self.mat["%d-%d" % (left, right)]
        except KeyError:
          continue                      # no entry for this pair
        found = int(value)              # converted even if not selected
        rank = left+right
        if rank < best_rank:
          best_rank = rank
          best_cost = found
    return best_cost


##  Context
##
class Context:
  # Lattice of morpheme candidates for one sentence, plus the Viterbi
  # search over it.
  #
  #   mat  - object whose lookup(labels1, labels2) returns the cost of
  #          connecting two morphemes
  #   dics - any number of dictionaries; each must provide search(s, loc)
  #          yielding (t, base, yomi, pos, ktype, kform, cost, labels)
  #          tuples for the entries of s that start at offset loc
  #
  # self.lat maps a position to the list of morphemes starting there and
  # self.end is the total length of the text added so far.

  def __init__(self, mat, *dics):
    self.mat = mat
    self.dics = dics
    self.init()
    return

  # prepare the context for a new sentence.
  def init(self):
    # The start marker sits at -1 with a one-character string, so that the
    # position following it is 0, where the first real morphemes start.
    # NOTE(review): the pos literal looks like a mis-decoded EUC-JP string
    # in this copy; it is kept unchanged -- confirm against the original.
    self.lat = { -1: [Morph(-1," ","","",u"ʸƬ","","",0,[0])] }
    self.end = 0
    return self

  # append string at the end of the context.
  # May be called several times; each call is searched on its own, so an
  # entry cannot span the boundary between two appended strings.
  def addstr(self, s):
    s = unicode(s)
    start = self.end                    # lattice position of s[0]
    todo = { start: 1 }
    done = {}                           # positions already expanded
    while todo:
      (loc, dummy) = todo.popitem()
      done[loc] = 1
      if loc not in self.lat:
        self.lat[loc] = []
      for d in self.dics:
        # lattice positions are absolute, but the dictionary only sees s,
        # so it is searched at the offset relative to the start of s.
        for (t,base,yomi,pos,ktype,kform,cost,labels) in d.search(s, loc-start):
          # expand every position only once; otherwise its morphemes would
          # be appended again each time another entry ends there.
          if (loc+len(t)) not in done:
            todo[loc+len(t)] = 1
          self.lat[loc].append(Morph(loc,t,base,yomi,pos,ktype,kform,cost,labels))
    self.end += len(s)
    return self

  # start viterbi decoding.
  def decode(self):
    # the end marker replaces whatever was stored at the final position.
    self.lat[self.end] = [Morph(self.end,"","","",u"ʸ","","",0,[1])]
    keys = list(self.lat.keys())
    keys.sort()
    # positions are visited in ascending order, so every morpheme has its
    # final mincost before it is used as a predecessor.
    for i in keys[:-1]:
      for m1 in self.lat[i]:
        followers = self.lat.get(i+len(m1.s))
        if not followers:
          continue                      # nothing starts where m1 ends
        for m2 in followers:
          m2.update(m1.mincost+m1.cost+self.mat.lookup(m1.labels, m2.labels), m1)
    return self

  # obtain the best path.
  # Returns the morphemes from the start marker to the end marker, found by
  # following the prev links back from the end marker.  Call decode() first.
  # If nothing connects to the end marker, the list holds that marker only.
  def getpath(self):
    m = self.lat[self.end][0]
    r = []
    while m is not None:
      r.append(m)
      m = m.prev
    r.reverse()
    return r


##  Dictionary
##
class CompiledDictionary:
  # Dictionary stored in a mapping (such as an opened cdb file) whose keys
  # and values are strings encoded in `enc`.
  #
  # A value is a list of fields separated by single spaces:
  #   field 0    - comma-separated lengths of longer keys to try next
  #                (empty: nothing longer to try)
  #   fields 1.. - entries "base,yomi,pos,ktype,kform,cost:label,label,..."

  def __init__(self, dic, enc="japanese.euc_jp"):
    self.enc = enc
    self.dic = dic
    return

  # returns a list of entries which matches to the given string.
  # This is a generator: it yields one
  # (t, base, yomi, pos, ktype, kform, cost, labels) tuple per entry whose
  # key t is a prefix of s[loc:], starting from length 1 and then following
  # the lengths named in each record that was found.
  def search(self, s, loc):
    lengths = [1]
    while 1:
      for n in lengths:
        if len(s) < loc+n:
          return                        # ran past the end of the string
        t = s[loc:loc+n]
        try:
          record = self.dic[t.encode(self.enc)]
        except KeyError:
          continue                      # no such key: try the next length
        break                           # the first existing key is used
      else:
        return                          # none of the lengths matched
      fields = [ unicode(x, self.enc) for x in record.split(" ") ]
      if fields[1]:
        for item in fields[1:]:
          (posinfo, labels) = item.split(":")
          (base, yomi, pos, ktype, kform, cost) = posinfo.split(",")
          try:
            labels = map(int, labels.split(","))
          except ValueError:
            labels = []                 # entry without usable labels
          yield (t,base,yomi,pos,ktype,kform,int(cost),labels)
      if not fields[0]:
        return
      lengths = map(int, fields[0].split(","))


# main
if __name__ == "__main__":
  import fileinput
  mat = Matrix("matrix.cdb")
  dic = CompiledDictionary(cdb.init("chadic.cdb"))
  c = Context(mat, dic)
  for line in fileinput.input():
    s = unicode(line.strip())
    c.init().addstr(s).decode()
    for m in c.getpath()[1:-1]:
      if m.ktype == "*":
        (m.ktype, m.kform) = ("", "")
      print "%s\t%s\t%s\t%s\t%s\t%s" % \
            (m.s, m.yomi, m.base, m.pos, m.ktype, m.kform) #, m.labels, m.mincost
    print "EOS"
