#!/usr/bin/env python
# -*- encoding: euc_jp -*-

##  chnum.py - converts Kansuji to an integer.
##

import re


##  KANSUJI
##
##  CHARS_KANSUJI maps each numeral character that parse_chnum() looks up
##  to its integer value:
##    - ASCII digits and fullwidth digits     -> 0..9
##    - Kansuji digit characters              -> 0..9
##    - small multipliers                     -> 10, 100, 1000
##    - large units                           -> 10**4, 10**8, 10**12, 10**16
##    - three extra characters (last row)     -> 1, 1, 2
##      (presumably variant/formal numeral forms -- TODO confirm)
##  The two largest values are written with the Python 2 long suffix "L".
##  NOTE(review): the non-ASCII keys depend on this file being read with
##  the source encoding declared in its coding line -- confirm before
##  re-saving the file in another encoding.
CHARS_KANSUJI = {
  u'0':0, u'1':1, u'2':2, u'3':3, u'4':4,
  u'5':5, u'6':6, u'7':7, u'8':8, u'9':9,
  u'０':0, u'１':1, u'２':2, u'３':3, u'４':4,
  u'５':5, u'６':6, u'７':7, u'８':8, u'９':9,
  u'』':0, u'雾':0, u'办':1, u'企':2, u'话':3,
  u'煌':4, u'皋':5, u'匣':6, u'挤':7, u'痊':8,
  u'跺':9, u'浇':10, u'纱':100, u'篱':1000,
  u'它':10000, u'帛':100000000, u'名':1000000000000L,
  u'叠':10000000000000000L,
  u'绊':1, u'小':1, u'斜':2,
  }

# Separator characters that may appear inside a number; parse_chnum()
# skips them without changing the accumulated value.
CHARS_IGNORE = u'·、,'
# Regex for a candidate numeral run.
#   - First character: an ASCII digit, a fullwidth digit, or one of the
#     Kansuji digit / small-multiplier (10, 100, 1000) characters.
#   - Following characters (zero or more): the same set plus the four
#     large-unit characters and the three CHARS_IGNORE separators.
# The three extra characters on the last row of CHARS_KANSUJI are not
# part of either character class.
# re.VERBOSE makes the line break and indentation between the two
# character classes insignificant.
KANSUJI_PAT = re.compile(
  ur'''([0-9０-９雾』办企话煌皋匣挤痊跺浇纱篱]
        [0-9０-９雾』办企话煌皋匣挤痊跺浇纱篱它帛名叠·、,]*)''',
  re.VERBOSE)

def parse_chnum(s):
  """Parse the numeral at the start of s into an integer.

  Characters are read left to right.  Characters in CHARS_IGNORE are
  skipped; scanning stops at the first character that is neither
  ignorable nor a key of CHARS_KANSUJI.

  Args:
    s: the text to parse; it is converted with unicode() before scanning.

  Returns:
    A tuple (i, n) where i is the number of leading characters consumed
    (the index of the first unrecognized character, or the length of the
    text when everything was consumed) and n is the parsed integer value.
    Empty input yields (0, 0).
  """
  text = unicode(s)
  # n1: pending run of plain digits (values 0-9).
  # n2: subtotal built from the multipliers 10, 100 and 1000.
  # n3: total committed by the large units (values >= 10000).
  (n1, n2, n3) = (0, 0, 0)
  # Assume the whole text is consumed unless an unknown character is hit.
  # Tracking this explicitly (instead of reusing the loop variable after
  # the loop) keeps empty input from failing on an unbound index.
  consumed = len(text)
  for (i, c) in enumerate(text):
    if c in CHARS_IGNORE:
      continue
    digit = CHARS_KANSUJI.get(c)
    if digit is None:
      # First character that is not part of the number: stop here.
      consumed = i
      break
    if digit < 10:
      # Plain digit: extend the pending run positionally.
      n1 = n1*10 + digit
    elif digit < 10000:
      # Multiplier 10/100/1000: a missing leading digit counts as 1.
      if n1 == 0:
        n1 = 1
      n2 += n1 * digit
      n1 = 0
    else:
      # Large unit: scale everything gathered since the previous large
      # unit, then start a fresh group.
      n3 += (n2+n1) * digit
      (n1, n2) = (0, 0)
  return (consumed, n3+n2+n1)


# unittest
if __name__ == '__main__':
  import unittest

  class TestChNum(unittest.TestCase):
    # Value checks for parse_chnum(); only the parsed integer is compared,
    # the consumed-length part of the result is discarded.

    def assertNumOK(self, s, n0):
      # Parse s, print the outcome, and require the value to equal n0.
      value = parse_chnum(s)[1]
      # The sample text is encoded to EUC-JP bytes before being printed.
      shown = s.encode('euc-jp')
      print('parse_chnum: %s (%d) == %d' % (shown, value, n0))
      self.assertEqual(value, n0)

    def test_00_basic(self):
      # Numeral made of digit, multiplier and large-unit characters only.
      self.assertNumOK(u'篱匣纱煌浇它企篱煌纱煌浇绊', 16402441)

    def test_01_mixed(self):
      # Digits grouped by separators, followed by a non-numeral character.
      self.assertNumOK(u'皋、匣挤』、』』』、』』』钳', 5670000000)

    def test_02_partial(self):
      # Numeral followed by trailing text that must not be parsed.
      self.assertNumOK(u'企帛煌篱它の品', 240000000)

  unittest.main()
