#!/usr/bin/env python
# -*- encoding: euc-jp -*-
#
# NOTE: this file is kept pure ASCII on purpose; the non-ASCII dictionary
# values below are spelled with \u escapes, so the encoding declaration
# above can never disagree with how the file is actually stored.
#
# usage:
#  ./rsstrans.py http://lwn.net/headlines/newrss /htdocs/lwn.html http://yourhost.example.com/lwn.html > /htdocs/lwn.xml
#
"""Translate an English RSS 1.0 feed through a web page translation service.

The pipeline has three steps:

1. ``rss2html`` downloads the feed and writes it as a small HTML page in
   which every field sits in a ``<div id="...">``.  The page must then be
   published at a URL the translation service can fetch.
2. The translation service is asked to translate that published page.
3. ``html2rss`` parses the translated page back into channel/item tuples and
   ``export_rss`` prints them as an RSS 1.0 document on standard output.

Words listed in ``DIC`` are protected from the translator: they are emitted
as ``<span id="REPLACEMENT">word</span>`` and ``html2rss`` later keeps the
``id`` value and discards the (possibly mistranslated) span text.

NOTE(review): the HTML and RSS markup literals in this file were rebuilt from
what ``html2rss`` consumes (div ids, ``a href``, ``span id``); the copy this
was restored from had all markup stripped.  Confirm against the upstream
script if exact output compatibility matters.
"""

import contextlib
import fileinput
import os
import re
import sys
import urllib
from xml.sax import handler, parse

try:  # Python 2
    from sgmllib import SGMLParser
    from urllib import urlopen
    from urllib2 import build_opener
except ImportError:  # Python 3: sgmllib is gone and urllib was reorganised
    from html.parser import HTMLParser
    from urllib.request import build_opener, urlopen

    class SGMLParser(HTMLParser):
        """Minimal stand-in for ``sgmllib.SGMLParser``.

        Dispatches every tag to ``start_<tag>(attrs)`` / ``end_<tag>()`` when
        the subclass defines such a method and ignores the tag otherwise.
        """

        def handle_starttag(self, tag, attrs):
            method = getattr(self, 'start_' + tag, None)
            if method is not None:
                method(attrs)

        def handle_endtag(self, tag):
            method = getattr(self, 'end_' + tag, None)
            if method is not None:
                method()


# Lower-cased English word -> text that must appear in the translated feed.
DIC = {
    'gnu': 'GNU',
    'spam': 'spam',
    'worm': u'\u30ef\u30fc\u30e0',  # katakana "waamu"
    'firefox': 'Firefox',
    'samba': 'Samba',
    'linux': 'Linux',
    'community': u'\u30b3\u30df\u30e5\u30cb\u30c6\u30a3',  # katakana "komyuniti"
    'kernel': 'Kernel',
    'python': 'Python',
    'debian': 'Debian',
    'fedora': 'Fedora',
    'apache': 'Apache',
    'squid': 'Squid',
}

WORD = re.compile(r'\w+')

# Encoding of the intermediate HTML page and of the final RSS document.
HTML_CODEC = 'euc-jp'
RSS_CODEC = 'utf-8'

# Translation endpoint; %s receives the public URL of the intermediate page.
TRANSLATE_URL = 'http://www.excite.co.jp/world/english/web/body/?wb_url=%s&wb_lp=ENJA'

# '&' has to be replaced first so the other entities are not escaped twice.
_ENTITY = (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'), ('"', '&quot;'))


def _escape(s):
    """Return *s* with ``& < > "`` replaced by entities (None becomes '')."""
    if not s:
        return ''
    for (char, entity) in _ENTITY:
        s = s.replace(char, entity)
    return s


def _binary_stdout():
    """Return a handle on standard output that accepts encoded bytes."""
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout itself takes byte strings.
    return getattr(sys.stdout, 'buffer', sys.stdout)


##  RSSReader
##
class RSSReader(handler.ContentHandler):
    """SAX handler collecting the channel and the items of an RSS 1.0 feed.

    After parsing, ``ch_title`` / ``ch_link`` / ``ch_description`` hold the
    channel fields ('' when absent) and ``repository`` is a list of
    ``(title, link, description)`` tuples, one per item that has both a
    title and a link.

    Only the RSS 1.0 layout, where ``<item>`` is a sibling of ``<channel>``,
    is supported; an item nested inside a channel trips the assertions.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        self.repository = []
        # 0: outside channel/item, 1: inside one, 2: inside one of its fields.
        self.phase = 0
        self.s = []    # character chunks of the field being read
        self.dic = {}  # fields of the channel/item being read
        # Defined up front so a feed without <channel> still renders.
        self.ch_title = ''
        self.ch_link = ''
        self.ch_description = ''

    def startElement(self, name, attrs):
        if name in ('item', 'channel'):
            assert self.phase == 0
            self.phase = 1
        elif self.phase == 1 and name in ('title', 'link', 'description'):
            self.phase = 2
            self.s = []

    def endElement(self, name):
        if name in ('item', 'channel'):
            assert self.phase == 1
            (title, link, description) = [
                self.dic.get(k, '').strip()
                for k in ('title', 'link', 'description')
            ]
            if name == 'channel':
                self.ch_title = title
                self.ch_link = link
                self.ch_description = description
            elif title and link:
                self.repository.append((title, link, description))
            self.dic = {}
            self.phase = 0
        elif self.phase == 2:
            self.dic[name] = ''.join(self.s)
            self.phase = 1

    def characters(self, ch):
        if self.phase == 2:
            self.s.append(ch)


##  rss2html
##
def _pretrans(s):
    """Escape *s* for HTML and wrap every ``DIC`` word in a marker span.

    The text between words is escaped as well, so punctuation such as ``&``
    cannot break the generated page.
    """
    s = s or ''
    pieces = []
    pos = 0
    for m in WORD.finditer(s):
        word = m.group(0)
        pieces.append(_escape(s[pos:m.start()]))
        replacement = DIC.get(word.lower())
        if replacement is None:
            pieces.append(_escape(word))
        else:
            # html2rss keeps the id and drops the span text.
            pieces.append('<span id="%s">%s</span>'
                          % (_escape(replacement), _escape(word)))
        pos = m.end()
    pieces.append(_escape(s[pos:]))
    return ''.join(pieces)


def _render_html(reader):
    """Return the intermediate HTML page for a parsed feed as text."""
    lines = [
        # Declares the charset the page is written in (see rss2html).
        '<html><head><meta http-equiv="Content-Type"'
        ' content="text/html; charset=%s"></head><body>' % HTML_CODEC,
        # The anchor text is a dummy; only the href is read back.
        '<div id="channel"><a href="%s">.</a>' % _escape(reader.ch_link),
        '<div id="title">%s</div>' % _pretrans(reader.ch_title),
        '<div id="description">%s</div>' % _pretrans(reader.ch_description),
        '</div>',
    ]
    for (title, link, description) in reader.repository:
        lines.append('<div id="item"><a href="%s">.</a>' % _escape(link))
        lines.append('<div id="title">%s</div>' % _pretrans(title))
        if description:
            lines.append('<div id="description">%s</div>'
                         % _pretrans(description))
        lines.append('</div>')
    lines.append('</body></html>')
    return '\n'.join(lines) + '\n'


def rss2html(rssurl, tmp):
    """Fetch the feed at *rssurl* and write it to *tmp* as an HTML page.

    *tmp* is a writable file object that accepts bytes (open it in binary
    mode on Python 3).  The page is encoded as EUC-JP; characters that
    encoding cannot represent are written as numeric character references.
    """
    reader = RSSReader()
    with contextlib.closing(urlopen(rssurl)) as fp:
        parse(fp, reader)
    tmp.write(_render_html(reader).encode(HTML_CODEC, 'xmlcharrefreplace'))


##  export_rss
##
def _rss_fields(title, link, description):
    """Return the title/link/description lines shared by channel and item."""
    lines = [
        '  <title>%s</title>' % _escape(title),
        '  <link>%s</link>' % _escape(link),
    ]
    if description:
        lines.append('  <description>%s</description>' % _escape(description))
    return lines


def export_rss(channel, items, puburl, out=None):
    """Write an RSS 1.0 document, encoded as UTF-8.

    channel -- ``(title, link, description)`` of the feed.
    items   -- iterable of ``(title, link, description)`` tuples.
    puburl  -- URL used as the ``rdf:about`` of the channel.
    out     -- byte-oriented file object; defaults to standard output.
    """
    (title, link, description) = channel
    items = list(items)  # iterated twice below
    lines = [
        '<?xml version="1.0" encoding="utf-8"?>',
        '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"'
        ' xmlns="http://purl.org/rss/1.0/">',
        ' <channel rdf:about="%s">' % _escape(puburl),
    ]
    lines.extend(_rss_fields(title, link, description))
    lines.append('  <items><rdf:Seq>')
    for item in items:
        lines.append('   <rdf:li rdf:resource="%s"/>' % _escape(item[1]))
    lines.append('  </rdf:Seq></items>')
    lines.append(' </channel>')
    for (title, link, description) in items:
        lines.append(' <item rdf:about="%s">' % _escape(link))
        lines.extend(_rss_fields(title, link, description))
        lines.append(' </item>')
    lines.append('</rdf:RDF>')
    data = ('\n'.join(lines) + '\n').encode(RSS_CODEC, 'xmlcharrefreplace')
    if out is None:
        out = _binary_stdout()
    out.write(data)
    return


##  HTML parser
##
class html2rss(SGMLParser):
    """Parse the (translated) page produced by ``rss2html``.

    After feeding, ``channel`` is a ``(title, link, description)`` tuple and
    ``items`` a list of such tuples; missing fields are ''.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        self.dic = {}     # fields of the channel/item being read
        self.curtxt = []  # text chunks of the field being read
        self.curtag = []  # ids of the open <div> elements (None: no id)
        self.hold = 0     # > 0 while inside <a> or <span>: text is dropped
        self.channel = ('', '', '')
        self.items = []
        return

    def handle_data(self, data):
        if not self.hold:
            self.curtxt.append(data)
        return

    def doit(self, tag, dic):
        """Store the collected fields as the channel or as a new item."""
        record = (dic.get('title', ''), dic.get('link', ''),
                  dic.get('description', ''))
        if tag == 'channel':
            self.channel = record
        else:
            self.items.append(record)
        return

    def start_div(self, attrs):
        # A div without an id is pushed as None so its closing tag pops its
        # own entry rather than the entry of an enclosing div.
        self.curtag.append(dict(attrs).get('id'))
        return

    def end_div(self):
        if not self.curtag:
            return
        t = self.curtag.pop()
        if t is None:
            return
        if t in ('item', 'channel'):
            self.doit(t, self.dic)
            self.dic = {}
        else:
            self.dic[t] = ''.join(self.curtxt).strip()
            self.curtxt = []
        return

    def start_a(self, attrs):
        self.hold += 1
        href = dict(attrs).get('href')
        if href:  # anchors without href carry no link
            self.dic['link'] = href
        return

    def end_a(self):
        # Never go below zero: a stray close tag must not mute all text.
        if self.hold > 0:
            self.hold -= 1
        return

    def start_span(self, attrs):
        self.hold += 1
        # The id carries the protected word; the span text is dropped.
        self.curtxt.append(dict(attrs).get('id') or '')
        return

    def end_span(self):
        if self.hold > 0:
            self.hold -= 1
        return


def main(argv=None):
    """Run the pipeline for ``rssurl tmpfile tmpurl``; return an exit status.

    *tmpfile* is where the intermediate page is written and *tmpurl* is the
    public URL under which the translation service can fetch that file.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 3:
        sys.stderr.write('usage: %s rssurl tmpfile tmpurl\n' % sys.argv[0])
        return 2
    (rssurl, tmpfile, tmpurl) = args

    with open(tmpfile, 'wb') as fp:
        rss2html(rssurl, fp)

    opener = build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    parser = html2rss()
    with contextlib.closing(opener.open(TRANSLATE_URL % tmpurl)) as fp:
        # The service answers in Shift_JIS; decode line by line.
        for line in iter(fp.readline, b''):
            parser.feed(line.decode('shift_jis', 'replace'))
    parser.close()

    export_rss(parser.channel, parser.items, rssurl)
    return 0


# main
if __name__ == "__main__":
    sys.exit(main())