#!/usr/bin/env python
# -*- encoding: euc-jp -*-
# euc:京

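# usage:
#  ./rsstrans.py http://lwn.net/headlines/newrss /htdocs/lwn.html http://yourhost.example.com/lwn.html > /htdocs/lwn.xml
#
# arguments, in order: the URL of the source RSS feed, the local file that
# receives the intermediate HTML page, and the URL under which the translation
# service can fetch that page.  The resulting RSS is written to standard output.
#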

# Word table consulted by rss2html.  Keys are lower-cased words; the value is
# the exact text that should stand in for that word in the generated feed.
DIC = {
  # Names that stay in Latin script.
  'apache': 'Apache',
  'debian': 'Debian',
  'fedora': 'Fedora',
  'firefox': 'Firefox',
  'gnu': 'GNU',
  'kernel': 'Kernel',
  'linux': 'Linux',
  'python': 'Python',
  'samba': 'Samba',
  'spam': 'spam',
  'squid': 'Squid',
  # Words with a fixed Japanese rendering.
  'community': u'コミュニティ',
  'worm': u'ワーム',
  }

import sys, os, re, fileinput, urllib
from sgmllib import SGMLParser
from xml.sax import handler, parse


##  RSSReader
##
class RSSReader(handler.ContentHandler):
  """SAX handler that collects the channel and the items of an RSS feed.

  After parsing:
    ch_title, ch_link, ch_description -- stripped text of the <channel>
        fields ('' when the feed has no channel or the field is absent).
    repository -- list of (title, link, description) tuples, one per <item>
        that has both a non-empty title and a non-empty link.

  Only the flat layout is handled, where <item> elements are siblings of
  <channel>; an <item> nested inside <channel> trips the assertion in
  startElement.

  phase is the parser state:
    0 -- outside any <channel>/<item>
    1 -- inside a <channel>/<item>, between fields
    2 -- inside a title/link/description field, collecting its text
  """

  # Child elements whose text is recorded for a channel or an item.
  _FIELDS = ('title', 'link', 'description')
  # Elements that open a new record.
  _RECORDS = ('item', 'channel')

  def __init__(self):
    handler.ContentHandler.__init__(self)
    self.repository = []
    self.phase = 0
    self.s = []          # text chunks of the field being collected
    self.dic = {}        # field name -> text for the current record
    self.field = None    # name of the field being collected in phase 2
    # Defined up front so that a feed without <channel> still yields usable
    # (empty) channel attributes instead of an AttributeError later on.
    self.ch_title = ''
    self.ch_link = ''
    self.ch_description = ''
    return

  def startElement(self, name, attrs):
    """Enter a record on <item>/<channel>, or start collecting a field."""
    if name in self._RECORDS:
      assert self.phase == 0, 'unexpected nested <%s>' % name
      self.phase = 1
    elif self.phase == 1 and name in self._FIELDS:
      self.phase = 2
      self.field = name
      self.s = []
    return

  def endElement(self, name):
    """Store a finished field, or file the finished record."""
    if name in self._RECORDS:
      assert self.phase == 1, 'unexpected </%s>' % name
      (title, link, description) = [ self.dic.get(k, '').strip() for k in self._FIELDS ]
      if name == 'channel':
        self.ch_title = title
        self.ch_link = link
        self.ch_description = description
      elif title and link:
        self.repository.append((title, link, description))
      self.dic = {}
      self.phase = 0
    elif self.phase == 2 and name == self.field:
      # Only the end tag of the field itself closes it, so inline markup
      # inside a field neither truncates the text nor files it under the
      # inner element's name.
      self.dic[name] = ''.join(self.s)
      self.field = None
      self.phase = 1
    return

  def characters(self, ch):
    """Accumulate character data while inside a field."""
    if self.phase == 2:
      self.s.append(ch)
    return


##  rss2html
##
def rss2html(rssurl, tmp):
  """Fetch the RSS feed at `rssurl` and write it to `tmp` as an HTML page.

  The page declares charset=euc-jp and consists of one <div id=channel>
  followed by one <div id=item> per feed entry.  Each of them holds an
  <a href> carrying the link, a <div id=title> and, for items, a
  <div id=description> only when the description is non-empty.  Words found
  in DIC (case-insensitively) are written as <span id="REPLACEMENT">word</span>.

  rssurl -- URL of the feed; opened with urllib.urlopen.
  tmp    -- writable file-like object receiving the HTML.
  """
  WORD = re.compile(r'\w+')
  ENTITY = { '&':'&amp;', '<':'&lt;', '>':'&gt;', '"':'&quot;' }

  def htmlquote(s):
    # Escape the markup characters, then encode to EUC-JP; characters that
    # EUC-JP cannot represent become numeric character references.
    escaped = ''.join([ ENTITY.get(c, c) for c in s ])
    return escaped.encode('euc-jp', 'xmlcharrefreplace')

  def pretrans(s):
    # Every part of the text goes through htmlquote -- the words as well as
    # whatever lies between them -- so markup characters and non-ASCII text
    # from the feed can never reach the page unescaped.
    out = []
    pos = 0
    for m in WORD.finditer(s):
      out.append(htmlquote(s[pos:m.start()]))
      word = m.group(0)
      key = word.lower()
      if key in DIC:
        out.append('<span id="%s">%s</span>' % (htmlquote(DIC[key]), htmlquote(word)))
      else:
        out.append(htmlquote(word))
      pos = m.end()
    out.append(htmlquote(s[pos:]))
    return ''.join(out)

  def emit(line):
    tmp.write(line + '\n')

  reader = RSSReader()
  fp = urllib.urlopen(rssurl)
  try:
    parse(fp, reader)
  finally:
    # Release the connection even when the feed is not well-formed.
    fp.close()

  # getattr: the channel attributes may be missing when the feed contained
  # no <channel> element.
  emit('<html><head><meta http-equiv="Content-Type" content="text/html; charset=euc-jp"></head><body>')
  emit('<div id=channel><a href="%s">.</a>' % htmlquote(getattr(reader, 'ch_link', '')))
  emit('<div id=title>%s</div>' % pretrans(getattr(reader, 'ch_title', '')))
  emit('<div id=description>%s</div>' % pretrans(getattr(reader, 'ch_description', '')))
  emit('</div>')
  for (title, link, description) in reader.repository:
    emit('<div id=item><a href="%s">.</a>' % htmlquote(link))
    emit('<div id=title>%s</div>' % pretrans(title))
    if description:
      emit('<div id=description>%s</div>' % pretrans(description))
    emit('</div>')
  emit('</body></html>')


##  export_rss
##
def export_rss(channel, items, puburl):
  """Write an RSS 1.0 (RDF) document to standard output, encoded as UTF-8.

  channel -- (title, link, description) of the feed.
  items   -- sequence of (title, link, description) tuples, one per entry.
  puburl  -- value of the channel's rdf:about attribute.

  A <description> element is written only when the description is
  non-empty.  A field that is None is treated like an empty string.
  """
  (title, link, description) = channel
  ENTITY = { '&':'&amp;', '<':'&lt;', '>':'&gt;', '"':'&quot;' }

  def rssquote(s):
    # None can arrive for a field the HTML parser never saw.
    if not s:
      return ''
    escaped = ''.join([ ENTITY.get(c, c) for c in s ])
    return escaped.encode('utf-8', 'xmlcharrefreplace')

  out = []
  out.append('<?xml version="1.0" encoding="utf-8"?>')
  out.append('<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">')
  out.append(' <channel rdf:about="%s">' % rssquote(puburl))
  out.append('  <title>%s</title>' % rssquote(title))
  out.append('  <link>%s</link>' % rssquote(link))
  if description:
    out.append('  <description>%s</description>' % rssquote(description))
  # Table of contents: one rdf:li per item, in order.
  out.append('  <items><rdf:Seq>')
  for (item_title, item_link, item_description) in items:
    out.append('   <rdf:li resource="%s" />' % rssquote(item_link))
  out.append('  </rdf:Seq></items>')
  out.append(' </channel>')
  for (item_title, item_link, item_description) in items:
    out.append(' <item rdf:about="%s">' % rssquote(item_link))
    out.append('  <title>%s</title>' % rssquote(item_title))
    out.append('  <link>%s</link>' % rssquote(item_link))
    if item_description:
      out.append('  <description>%s</description>' % rssquote(item_description))
    out.append(' </item>')
  out.append('</rdf:RDF>')
  sys.stdout.write('\n'.join(out) + '\n')
  return


##  HTML parser
##
class html2rss(SGMLParser):
  """Recover channel and item records from the page written by rss2html.

  Recognised markup:
    <div id=channel> / <div id=item> -- a record; closing it files the fields
        gathered so far via doit().
    <div id=NAME>   -- a field; its stripped text is stored under NAME.
    <a href=URL>    -- sets the 'link' field; the text inside is ignored.
    <span id=TEXT>  -- contributes TEXT to the current field; the text inside
        is ignored.

  After feeding:
    channel -- (title, link, description) of the channel.
    items   -- list of (title, link, description) tuples.
  Missing fields are ''.
  """

  def __init__(self):
    SGMLParser.__init__(self)
    self.dic = {}        # fields of the record being read
    self.curtxt = []     # text chunks of the field being read
    self.curtag = []     # one entry per open <div>: its id, or None
    self.hold = 0        # > 0 while inside <a> or <span>: text is dropped
    self.channel = ('','','')
    self.items = []
    return

  def handle_data(self, data):
    if not self.hold:
      self.curtxt.append(data)
    return

  def doit(self, tag, dic):
    """File `dic` as the channel (tag == 'channel') or as a new item."""
    # '' rather than None for absent fields, matching the default channel.
    record = (dic.get('title', ''), dic.get('link', ''), dic.get('description', ''))
    if tag == 'channel':
      self.channel = record
    else:
      self.items.append(record)
    return

  def start_div(self, attrs):
    # Push for every <div>, with or without an id, so that each </div> pops
    # the entry belonging to its own start tag.
    self.curtag.append(dict(attrs).get('id'))
    return
  def end_div(self):
    if not self.curtag:
      return
    t = self.curtag.pop()
    if t is None:
      # Anonymous wrapper: leave the collected text for the enclosing div.
      return
    if t in ('item', 'channel'):
      self.doit(t, self.dic)
      self.dic = {}
    else:
      self.dic[t] = ''.join(self.curtxt).strip()
    self.curtxt = []
    return

  def start_a(self, attrs):
    self.hold += 1
    href = dict(attrs).get('href')
    # An <a> without href (e.g. a named anchor) carries no link.
    if href is not None:
      self.dic['link'] = href
    return
  def end_a(self):
    self.hold -= 1
    return

  def start_span(self, attrs):
    self.hold += 1
    # The id attribute stands in for the span's own text.
    self.curtxt.append(dict(attrs).get('id', ''))
    return
  def end_span(self):
    self.hold -= 1
    return
  

# main
if __name__ == "__main__":
  # Pipeline: feed -> local HTML page -> translation service -> RSS on stdout.
  if len(sys.argv) != 4:
    sys.stderr.write('usage: %s rssurl tmpfile tmpurl > output.xml\n' % sys.argv[0])
    sys.exit(2)
  (rssurl, tmpfile, tmpurl) = sys.argv[1:]

  # 1. Render the feed as HTML into tmpfile.
  fp = open(tmpfile, 'w')
  try:
    rss2html(rssurl, fp)
  finally:
    fp.close()

  # 2. Request the page at tmpurl through the translation service.
  #    NOTE(review): wb_lp=ENJA presumably selects English-to-Japanese, and
  #    the browser-like User-agent is presumably required by the service --
  #    neither can be confirmed from this file.
  opener = urllib.FancyURLopener()
  opener.addheaders = [('User-agent', 'Mozilla/5.0')]
  # tmpurl is a query-string value, so it is percent-encoded; otherwise a
  # '&', '#' or '?' inside it would be read as part of the service's own URL.
  fp = opener.open('http://www.excite.co.jp/world/english/web/body/?wb_url=%s&wb_lp=ENJA' % urllib.quote(tmpurl, safe=''))
  parser = html2rss()
  try:
    # The response is decoded as Shift_JIS; undecodable bytes are replaced.
    for line in fp:
      parser.feed(unicode(line, 'shift_jis', 'replace'))
  finally:
    fp.close()
  parser.close()

  # 3. Emit the recovered records as RSS.
  export_rss(parser.channel, parser.items, rssurl)