#!/usr/bin/env python
##
##  html2txt.py - extract printable text from HTML files.
##
##  by Yusuke Shinyama, May 2003, * public domain *
##

import sys, re
from sgmllib import SGMLParser, SGMLParseError


class HTMLConverter(SGMLParser):

  RE1 = re.compile(u"([\u3000-\uff5f])[\n\r]+([\u3000-\uff5f])")
  RE2 = re.compile(r"[\n\r\s]+")

  entitydefs = {
    'nbsp':u' ', 'thinsp':u' ', 'emsp':u' ', 'ensp':u' ',
    'amp':u'&', 'lt':u'<', 'gt':u'>', 'quot':u'"', 'apos':u"'"
    }
  
  def __init__(self):
    SGMLParser.__init__(self)
    self.ignore = 0
    self.pre = 0
    self.is_title = 0
    self.title = u""
    self.title_not_processed = 1
    self.s = u""
    return

  def close(self):
    SGMLParser.close(self)
    self.newline()
    return

  def convstr(self, s):
    # remove all newlines between two zenkaku characters.
    s = HTMLConverter.RE1.sub(r"\1\2", s)
    # replace all contiguous blanks into a single space.
    s = HTMLConverter.RE2.sub(r" ", s)
    return s.strip()
  
  def handle_data(self, x):
    try:
      x = unicode(x)
    except UnicodeError:
      return
    if not self.ignore:
      self.s += x
    return

  def newline(self, attrs=[]):
    if self.title_not_processed:
      self.process_title(self.convstr(self.title))
      self.title_not_processed = 0
    if self.s:
      if self.pre:
        self.process_text(self.s)
      else:
        s = self.convstr(self.s)
        if s: self.process_text(s)
      self.s = u""
    return

  def begin_ignore(self, attrs):
    self.ignore += 1
    return
  
  def end_ignore(self):
    self.ignore -= 1
    return

# uncomment if you want to extract formatted texts as they are.
#  def start_pre(self, attrs):
#    self.pre = 1
#    self.newline()
#    return
#  
#  def end_pre(self):
#    self.newline()
#    self.pre = 0
#    return
  
  def start_title(self, attrs):
    self.is_title = 1
    return
  
  def end_title(self):
    self.is_title = 0
    self.title = self.s
    self.s = ""
    return
  
  start_body = newline
  start_p = end_p = newline
  do_br = newline
  do_hr = newline
  start_th = end_th = newline
  start_td = end_td = newline
  start_li = end_li = newline
  start_dt = end_dt = newline
  start_dd = end_dd = newline
  start_h1 = end_h1 = newline
  start_h2 = end_h2 = newline
  start_h3 = end_h3 = newline
  start_h4 = end_h4 = newline
  start_h5 = end_h5 = newline
  start_h6 = end_h6 = newline
  start_pre = end_pre = newline
  start_div = end_div = newline
  start_center = end_center = newline
  start_blockquote = end_blockquote = newline
  start_caption = end_caption = newline
  start_form = end_form = newline
  start_button = end_button = newline
  
  start_style = begin_ignore; end_style = end_ignore
  start_script = begin_ignore; end_script = end_ignore
  start_applet = begin_ignore; end_applet = end_ignore
  start_object = begin_ignore; end_object = end_ignore
  start_select = begin_ignore; end_select = end_ignore
  
  ##
  def process_title(self, t):
    print t
    return

  def process_text(self, s):
    print s
    return


if __name__ == "__main__":
  import fileinput
  # Convert the files named on the command line (or standard input when
  # there are none), one line at a time.
  converter = HTMLConverter()
  try:
    for line in fileinput.input():
      try:
        # feed() stays inside this try: a UnicodeError raised anywhere while
        # the line is processed makes the line count as skipped.
        converter.feed(unicode(line))
      except UnicodeError:
        print >>sys.stderr, "warning: skipped:", repr(line)
    converter.close()
  except SGMLParseError:
    print >>sys.stderr, "fatal: sgml parser error"
