#!/usr/bin/env python
import os
import re
import socket
import sys
import time
import urlparse
from cookielib import MozillaCookieJar
from gzip import GzipFile
from httplib import HTTPConnection, HTTPException
from robotparser import RobotFileParser
from sgmllib import SGMLParser, SGMLParseError
from StringIO import StringIO
from threading import Thread
from urllib2 import Request, URLError


##  TextCrawler
##
class TextCrawler(SGMLParser, Thread):

  EXTPAT = re.compile(r'\.([a-zA-Z0-9]+)$')
  REMOVENAME = re.compile(r'#.*$')
  REJECTS = dict.fromkeys('jpg jpeg gif png swf class'.split(' '))

  USER_AGENT = 'TextCrawler/0.0'
  HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive'
    }
  
  def __init__(self, base, cookiejar=None, maxlevel=1, reject=REJECTS):
    (proto, self.hostport, _w, _x, _y, _z) = urlparse.urlparse(base)
    assert proto == 'http'
    SGMLParser.__init__(self)
    Thread.__init__(self)
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url('http://%s/robots.txt' % self.hostport)
    self.robotstxt.read()
    self.conn = HTTPConnection(self.hostport)
    self.base = base
    self.cookiejar = cookiejar
    self.maxlevel = maxlevel
    self.reject = reject
    self.visited = {}
    self.paths = [('',0)]
    self.cururl = ''
    return

  def inject_url(self, url, base, level):
    if self.maxlevel <= level: return
    url = urlparse.urljoin(base, self.REMOVENAME.sub('',url))
    if not url.startswith(self.base): return
    if not self.robotstxt.can_fetch(self.USER_AGENT, url): return
    path = url[len(self.base):]
    if path.endswith('/'):
      path += 'index.html'
    m = self.EXTPAT.search(path)
    if m and (m.group(1).lower() in self.reject): return
    if path in self.visited: return
    print >>sys.stderr, 'INJECT: %r' % url
    self.paths.append((path, level+1))
    self.visited[path] = 1
    return

  def get1(self, url, retry=0):
    req = Request(url)
    if self.cookiejar:
      self.cookiejar.add_cookie_header(req)
      headers = req.unredirected_hdrs
      headers.update(self.HEADERS)
    else:
      headers = self.HEADERS
    self.conn.request('GET', req.get_selector(), '', headers)
    resp = self.conn.getresponse()
    if 'gzip' in resp.getheader('Content-Encoding', '').lower():
      fp = GzipFile(fileobj=StringIO(resp.read()))
    else:
      fp = StringIO(resp.read())
    if retry < 3 and (resp.status in (301, 302)):
      url = resp.getheader('Location', '')
      if url.startswith(self.base):
        print >>sys.stderr, 'Moved: %r' % url
        return self.get1(url, retry+1)
    if resp.status != 200:
      raise URLError(resp.status)
    return (fp, resp.getheader('Content-Type', 'text/plain'))

  # SGMLParser method
  def unknown_starttag(self, tag, attrs):
    attrs = dict(attrs)
    if tag in ('a','area') and 'href' in attrs:
      self.inject_url(attrs['href'], self.cururl, self.curlevel)
    return
  
  def crawl1(self):
    (path, level) = self.paths.pop()
    url = 'http://%s/%s' % (self.hostport, path)
    print >>sys.stderr, 'CRAWL: %r' % url
    try:
      (fp, type) = self.get1(url)
      self.curlevel = level
      self.cururl = url
      if type.startswith('text/'):
        body = fp.read()
        self.feed(body)
        self.close()
        print 'data=', len( body )
      fp.close()
    except (SGMLParseError, URLError), x:
      print >>sys.stderr, 'ERROR: %s' % x
    return

  def run(self):
    while self.paths:
      self.crawl1()
    return


# main
if __name__ == "__main__":
  # Load the cookie file once and share the jar between all crawlers;
  # one crawler thread is started per URL given on the command line.
  cookies = MozillaCookieJar('sites/cookies')
  cookies.load()
  targets = sys.argv[1:]
  for target in targets:
    crawler = TextCrawler(target, cookies)
    crawler.start()
