#!/usr/bin/env python
##
##  swfparser.py, extract texts from swfs.
##  Version 0.1
##
##  Copyright (c) 2004  Yusuke Shinyama <yusuke at cs dot nyu dot edu>
##
##  Permission is hereby granted, free of charge, to any person
##  obtaining a copy of this software and associated documentation
##  files (the "Software"), to deal in the Software without
##  restriction, including without limitation the rights to use,
##  copy, modify, merge, publish, distribute, sublicense, and/or
##  sell copies of the Software, and to permit persons to whom the
##  Software is furnished to do so, subject to the following
##  conditions:
##
##  The above copyright notice and this permission notice shall be
##  included in all copies or substantial portions of the Software.
##
##  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
##  KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
##  WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
##  PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
##  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
##  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
##  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
##  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
##


##  usage:
##    $ ./swfparser.py [-d] [-c encoding] [-A autoblank_ratio] file ...
##
##  options:
##    -d: debugging
##    -c: set default encoding (ms932)
##    -A: automatically insert a blank after a glyph whose advance is
##        larger than its (height-scaled) width multiplied by this ratio.
##

import sys
stderr = sys.stderr
from struct import pack, unpack


class Font:
  """
  Database which includes glyphs and charmap.

  fontid  -- the font's character id as read from the SWF tag.
  shapes  -- one entry per glyph; whatever the caller passes is stored
             as-is and handed back by getglyph().
  charmap -- one text character per glyph, filled in by addcharmap();
             empty until then.
  """
  def __init__(self, fontid, shapes):
    self.fontid = fontid
    self.shapes = shapes
    self.charmap = []
    return
  
  def addcharmap(self, charmap, wide, encoding):
    """
    Replace the glyph-index -> character table.

    charmap  -- sequence of integer character codes, one per glyph.
    wide     -- true if the codes may occupy two bytes.
    encoding -- 'ucs2' if the codes are Unicode code points; otherwise
                the name of a codec used to decode each code's bytes.

    Raises UnicodeDecodeError if a code is not valid in `encoding`.
    """
    if encoding == 'ucs2':
      try:
        tochar = unichr                 # Python 2
      except NameError:
        tochar = chr                    # Python 3: chr() already returns text
      self.charmap = [ tochar(c) for c in charmap ]
      return

    def tobytes(c):
      # A wide code that fits in one byte is a single-byte character of
      # the encoding; emitting a leading zero byte for it would decode to
      # a spurious NUL in front of the real character.
      if wide and 0xff < c:
        return pack('>H', c)            # high byte first, then low byte
      return pack('B', c)

    self.charmap = [ tobytes(c).decode(encoding) for c in charmap ]
    return
  
  def getglyph(self, i):
    """
    Return (character, shape) for glyph index i.

    Raises IndexError if i is outside the charmap or the shape list
    (for instance when addcharmap() has not been called yet).
    """
    return (self.charmap[i], self.shapes[i])


class SWFParser:
  """
  Reads an SWF file and prints the text found in it.

  parse() reads the file header and then walks the tag stream.  A tag
  with code N is handed to the method named do_tag<N> if this class has
  one; any other tag is skipped.  Text found by the DefineText,
  DefineText2 and DefineEditText handlers is passed to outputtext().

  defaultencoding -- codec used for strings and non-UCS2 code tables;
                     replaced by 'utf-8' when the header reports SWF
                     version 6 or later.
  autoblank       -- if non-zero, a blank is appended after a glyph
                     whose advance is larger than its scaled width
                     multiplied by this value.
  debug           -- if true, progress messages are written to stderr.
  """

  def __init__(self, defaultencoding='iso-8859-1', autoblank=0, debug=False):
    self.fp = None
    # Bit reader state: `buff` is the current byte and `bpos` the number
    # of its bits already consumed (8 means "fetch a new byte first").
    self.buff = 0
    self.bpos = 8
    self.fonts = {}                     # font id -> Font
    self.fontinfo = {}
    self.glyphmap = {}
    self.encoding = defaultencoding
    self.autoblank = autoblank
    self.debug = debug
    return

  def _debuglog(self, *items):
    """Write the items, separated by blanks, as one line to stderr."""
    parts = []
    for item in items:
      try:
        parts.append(str(item))
      except UnicodeError:
        # Python 2 cannot str() non-ASCII unicode text; show its repr.
        parts.append(repr(item))
    sys.stderr.write(' '.join(parts) + '\n')
    return

  # fixed bytes read
  
  def read(self, n):
    """Read up to n raw bytes from the input."""
    return self.fp.read(n)
  
  def readui8(self):
    """Read an unsigned 8-bit integer."""
    return ord(self.fp.read(1))
  def readsi8(self):
    """Read a signed 8-bit integer."""
    return unpack('<b', self.fp.read(1))[0]
  
  def readui16(self):
    """Read an unsigned little-endian 16-bit integer."""
    return unpack('<H', self.fp.read(2))[0]
  def readsi16(self):
    """Read a signed little-endian 16-bit integer."""
    return unpack('<h', self.fp.read(2))[0]
  
  def readui32(self):
    """Read an unsigned little-endian 32-bit integer."""
    return unpack('<L', self.fp.read(4))[0]

  def readrgb(self):
    """Read three unsigned bytes and return them as a tuple."""
    return ( self.readui8(), self.readui8(), self.readui8() )
  def readrgba(self):
    """Read four unsigned bytes and return them as a tuple."""
    return ( self.readui8(), self.readui8(), self.readui8(), self.readui8() )

  # fixed bits read

  def setbuff(self, bpos=8, buff=0):
    """
    Reset the bit reader.  With the defaults the next readbits() call
    starts at the next byte of the input.
    """
    (self.bpos, self.buff) = (bpos, buff)
    return
  
  def readbits(self, bits, signed=False):
    """
    Read `bits` bits, most significant bit first, and return them as an
    integer.  If `signed` is true the value is interpreted as a two's
    complement number of that width.  Asking for zero bits returns 0
    without touching the input.
    """
    if bits == 0: return 0
    bits0 = bits
    v = 0
    while 1:
      r = 8-self.bpos # the number of remaining bits we can get from the current buffer.
      if bits <= r:
        # |-----8-bits-----|
        # |-bpos-|-bits-|  |
        # |      |----r----|
        v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
        self.bpos += bits
        break
      # |-----8-bits-----|
      # |-bpos-|---bits----...
      # |      |----r----|
      v = (v<<r) | (self.buff & ((1<<r)-1))
      bits -= r
      self.buff = ord(self.fp.read(1))
      self.bpos = 0
    if signed and (v>>(bits0-1)):
      # the top bit is set: the value is negative.
      v -= (1<<bits0)
    return v
  
  # variable length structure

  def readstring(self):
    """
    Read bytes up to (and consuming) a NUL byte and return them decoded
    with the current encoding.  The end of the input also terminates
    the string.
    """
    s = []
    while 1:
      c = self.read(1)
      # An empty read means end of input; without this check a missing
      # terminator would make the loop spin forever.
      if not c or c == b'\x00': break
      s.append(c)
    return b''.join(s).decode(self.encoding)
  
  def readrect(self):
    """
    Read a rectangle: a 5-bit field width followed by four signed
    values of that width.  Returns the four values as a tuple in the
    order they are stored.
    """
    x = ord(self.fp.read(1))
    bits = x>>3
    # the low 3 bits of the first byte already belong to the first value.
    self.setbuff(5, x)
    return ( self.readbits(bits,1), self.readbits(bits,1), self.readbits(bits,1), self.readbits(bits,1) )

  def readmatrix(self):
    """
    Read a matrix record and return
    (scalex, scaley, rot0, rot1, transx, transy) as raw signed integers.
    The scale and rotate pairs are None when their presence bit is 0.
    """
    self.setbuff()
    (scalex, scaley) = (None, None)
    if self.readbits(1):                # hasscale
      n = self.readbits(5)
      scalex = self.readbits(n,1)
      scaley = self.readbits(n,1)
    (rot0, rot1) = (None, None)
    if self.readbits(1):                # hasrotate
      n = self.readbits(5)
      rot0 = self.readbits(n,1)
      rot1 = self.readbits(n,1)
    n = self.readbits(5)
    transx = self.readbits(n,1)
    transy = self.readbits(n,1)
    return (scalex, scaley, rot0, rot1, transx, transy)

  def readshape(self, nfillbits, nlinebits):
    """
    Read shape records up to the end-of-shape record and return
    (max x, -min y) over all points visited, or (0,0) if the shape has
    no points.

    nfillbits, nlinebits -- widths of the fill style and line style
    indices in style change records.

    Raises AssertionError for a style change record with the
    "new styles" bit set, which is not supported.
    """
    self.setbuff()
    r = []
    (x0,y0) = (0,0)
    while 1:
      typeflag = self.readbits(1)
      if typeflag:
        # edge
        straightflag = self.readbits(1)
        if straightflag:
          # StraightEdgeRecord
          (dx,dy) = (0,0)
          n = self.readbits(4)+2
          if self.readbits(1):          # general line: both deltas
            dx = self.readbits(n,1)
            dy = self.readbits(n,1)
          elif self.readbits(1):        # vertical line
            dy = self.readbits(n,1)
          else:                         # horizontal line
            dx = self.readbits(n,1)
          x0 += dx
          y0 += dy
          r.append((x0,y0))
        else:
          # CurveEdgeRecord: control point delta, then anchor point delta.
          n = self.readbits(4)+2
          cx = self.readbits(n,1)
          cy = self.readbits(n,1)
          ax = self.readbits(n,1)
          ay = self.readbits(n,1)
          r.append((x0+cx,y0+cy))
          x0 += cx+ax
          y0 += cy+ay
          r.append((x0,y0))
      else:
        # style
        flags = self.readbits(5)
        if flags == 0: break
        # XXX: cannot handle StateNewStyles bit.
        assert (flags & 16) == 0, flags
        if flags & 1:                   # move to an absolute position
          n = self.readbits(5)
          x0 = self.readbits(n,1)
          y0 = self.readbits(n,1)
          r.append((x0, y0))
        # The style indices are not needed, but must still be consumed.
        if flags & 2:
          self.readbits(nfillbits)
        if flags & 4:
          self.readbits(nfillbits)
        if flags & 8:
          self.readbits(nlinebits)
    if not r: return (0,0)
    return (max([ x for (x,y) in r ]), -min([ y for (x,y) in r ]))

  def _readglyphshapes(self, origin, offsets):
    """
    Read one glyph shape at each of `offsets` (relative to the file
    position `origin`) and return the readshape() results as a list.
    """
    shapes = []
    for o in offsets:
      self.fp.seek(origin+o)
      self.setbuff()
      nfillbits = self.readbits(4)
      nlinebits = self.readbits(4)
      shapes.append(self.readshape(nfillbits, nlinebits))
    return shapes

  def parse_header(self):
    """
    Read the file header and set swfversion, totallen, rect, frate and
    fcount.  For a compressed file (signature starting with 'C') the
    rest of the input is inflated and self.fp is replaced by an
    in-memory buffer; totallen then counts from the start of that
    buffer.

    Raises ValueError if fewer than four signature bytes are available
    and AssertionError if the signature or the inflated length is wrong.
    """
    sig = self.read(4)
    if len(sig) != 4:
      raise ValueError('truncated SWF header')
    (F, W, S) = (sig[0:1], sig[1:2], sig[2:3])
    assert W == b'W'
    assert S == b'S'
    self.swfversion = ord(sig[3:4])
    if 6 <= self.swfversion:
      self.encoding = 'utf-8'
    self.totallen = self.readui32()
    if self.debug:
      self._debuglog(' '.join(sig[:3].decode('iso-8859-1')), self.swfversion, self.totallen)
    if F == b'C':
      # compressed
      import zlib
      from io import BytesIO
      x = zlib.decompress(self.fp.read())
      # the 8 header bytes read so far are not part of the inflated data.
      self.totallen -= 8
      assert len(x) == self.totallen, 'invalid tag'
      self.fp = BytesIO(x)
    self.rect = self.readrect()
    self.frate = self.readui16()
    self.fcount = self.readui16()
    return

  def parse_tag1(self):
    """
    Read one tag header, dispatch to do_tag<code> if such a method
    exists, and leave the input positioned at the end of the tag no
    matter how much of it the handler consumed.
    """
    x = self.readui16()
    tag = x>>6
    if (x & 63) == 63:
      # long form: the real length follows as a 32-bit value.
      length = self.readui32()
    else:
      length = x & 63
    pos0 = self.fp.tell()
    # branch to do_tag<N>
    handler = getattr(self, 'do_tag%d' % tag, None)
    if handler is not None:
      handler(tag, length)
    elif self.debug:
      self._debuglog('unknown tag:', tag, length)
    self.fp.seek(pos0+length)
    return

  # DefineFont
  def do_tag10(self, tag, length):
    """Register a Font with the glyph shapes of a DefineFont tag."""
    fontid = self.readui16()
    origin = self.fp.tell()
    # The offset table has one 16-bit entry per glyph, so the first
    # offset (where the table ends) also tells how many entries exist.
    offset1 = self.readui16()
    offsets = [offset1]
    for i in range(offset1//2-1):
      offsets.append(self.readui16())
    self.fonts[fontid] = Font(fontid, self._readglyphshapes(origin, offsets))
    if self.debug:
      self._debuglog('DefineFont:', fontid)
    return

  # DefineFont2
  def do_tag48(self, tag, length):
    """
    Register a Font with the glyph shapes and the code table of a
    DefineFont2 tag.
    """
    fontid = self.readui16()
    flags = self.readui8()
    self.readui8()                      # language code, not used
    self.read(self.readui8())           # font name, not used
    nglyphs = self.readui16()
    origin = self.fp.tell()
    shiftjis = flags & 64
    widecodes = flags & 4
    wideoffsets = flags & 8
    if wideoffsets:
      readoffset = self.readui32
    else:
      readoffset = self.readui16
    offsets = [ readoffset() for i in range(nglyphs) ]
    ctoffset = readoffset()
    # read shapes
    self.fonts[fontid] = Font(fontid, self._readglyphshapes(origin, offsets))
    if self.debug:
      self._debuglog('DefineFont2:', fontid)
    self.fp.seek(origin+ctoffset)
    # read charmaps
    if widecodes:
      readcode = self.readui16
    else:
      readcode = self.readui8
    codes = [ readcode() for i in range(nglyphs) ]
    encoding = 'ucs2'
    if shiftjis:
      encoding = self.encoding
    # Pass the real code width so 8-bit codes are not treated as 2-byte ones.
    self.fonts[fontid].addcharmap(codes, widecodes, encoding)
    return

  # DefineText, DefineText2
  def do_tag11(self, tag, length):
    """
    Decode the text records of a DefineText (tag 11) or DefineText2
    (tag 33) tag and pass each record's characters to outputtext().
    The only difference between the two tags handled here is the
    colour format (RGB versus RGBA).
    """
    self.readui16()                     # character id, not used
    self.readrect()                     # bounds, not used
    self.readmatrix()                   # matrix, not used
    glyphbits = self.readui8()
    advbits = self.readui8()
    curfont = None
    height = 0
    while 1:
      flags = self.readui8()
      if flags == 0: break              # end of records
      if flags & 8:
        curfont = self.fonts[self.readui16()]
      if flags & 4:
        if tag == 11:
          self.readrgb()                # tag11: DefineText
        else:
          self.readrgba()               # tag33: DefineText2
      if flags & 2: self.readsi16()     # x offset, not used
      if flags & 1: self.readsi16()     # y offset, not used
      if flags & 8: height = self.readui16()
      chars = []
      nchars = self.readui8()
      self.setbuff()
      ratio = height/1024.0
      for i in range(nchars):
        glyphindex = self.readbits(glyphbits)
        glyphadv = self.readbits(advbits, 1)
        (c,(w,h)) = curfont.getglyph(glyphindex)
        chars.append(c)
        # a gap much wider than the glyph itself is taken as a word break.
        if self.autoblank and w*ratio*self.autoblank < glyphadv:
          chars.append(' ')
      self.outputtext(''.join(chars))
    return
  do_tag33 = do_tag11

  # DefineFontInfo, DefineFontInfo2
  def do_tag13(self, tag, length):
    """
    Attach the code table of a DefineFontInfo (tag 13) or
    DefineFontInfo2 (tag 62) tag to the font it refers to.  The font
    must already be registered; otherwise KeyError is raised.
    """
    fontid = self.readui16()
    rawname = self.read(self.readui8())
    flags = self.readui8()
    if tag == 62: # DefineFontInfo2 - Langcode added.
      self.readui8()
    wide = flags & 1
    if wide:
      readcode = self.readui16
    else:
      readcode = self.readui8
    # the tag carries exactly one code per glyph of the font.
    codes = [ readcode() for i in range(len(self.fonts[fontid].shapes)) ]
    encoding = 'iso-8859-1'
    if flags & 16:                      # shift_jis?
      encoding = self.encoding
    if tag == 62:                       # DefineFontInfo2 - UCS2.
      encoding = 'ucs2'
    self.fonts[fontid].addcharmap(codes, wide, encoding)
    if self.debug:
      # decoded only for display, so a bad name cannot abort the parse.
      self._debuglog('FontInfo:', fontid, rawname.decode(self.encoding, 'replace'))
    return
  do_tag62 = do_tag13

  # DefineEditText
  def do_tag37(self, tag, length):
    """
    Skip the layout fields of a DefineEditText tag and pass its initial
    text, if it has one, to outputtext().
    """
    self.readui16()                     # character id
    self.readrect()                     # bounds
    flags1 = self.readui8()
    flags2 = self.readui8()
    if flags1 & 1: # hasFont
      self.readui16()                   # font id
      self.readui16()                   # font height
    if flags1 & 4: # hasTextColor
      self.readrgba()
    if flags1 & 2: # hasMaxLength
      self.readui16()
    if flags2 & 32: # hasLayout
      self.readui8()                    # align
      self.readui16()                   # left margin
      self.readui16()                   # right margin
      self.readui16()                   # indent
      self.readui16()                   # leading
    self.readstring()                   # variable name
    if flags1 & 128: # hasText
      self.outputtext(self.readstring())
    return

  def outputtext(self, s):
    """
    Write the text s and a newline to stdout, encoded with the
    interpreter's default encoding; characters that cannot be encoded
    are replaced.
    """
    encoding = sys.getdefaultencoding()
    line = s.encode(encoding, 'replace')
    if not isinstance(line, str):
      # Python 3: encode() returned bytes, but stdout expects text.
      line = line.decode(encoding)
    sys.stdout.write(line + '\n')
    return

  def parse(self, fname):
    """
    Parse the SWF file named fname, emitting its text via outputtext().
    The file is closed again even if parsing fails.
    """
    fp = open(fname, 'rb')
    self.fp = fp
    try:
      self.parse_header()
      while self.fp.tell() < self.totallen:
        self.parse_tag1()
    finally:
      # parse_header() may have replaced self.fp by an in-memory buffer;
      # the real file has to be closed as well in that case.
      if self.fp is not fp:
        self.fp.close()
      fp.close()
    return

# main
if __name__ == "__main__":
  # Command line driver: parse the options, then run one parser per file.
  import getopt
  import sys

  def usage():
    # Show the synopsis and stop with the conventional "bad usage" status.
    sys.stdout.write("usage: swfparser.py [-d] [-c encoding] [-A autoblank_ratio] file ..." + "\n")
    sys.exit(2)

  try:
    (opts, args) = getopt.getopt(sys.argv[1:], "dc:A:")
  except getopt.GetoptError:
    usage()

  # Defaults.  These stay module-level names on purpose, so that any code
  # reading the global `debug` flag keeps working.
  debug = False
  encoding = 'ms932'
  autoblank = 0
  for (flag, value) in opts:
    if flag == "-d":
      debug = True
    elif flag == "-c":
      encoding = value
    elif flag == "-A":
      autoblank = float(value)

  # A fresh parser per file, so no font state leaks from one file to the next.
  for fname in args:
    parser = SWFParser(encoding, autoblank, debug)
    parser.parse(fname)
