#!/usr/bin/env python2.5

""" pse - the personal search engine

    a customizable indexing and search system for the WWW (HTML pages)
"""

from BaseHTTPObjectPublisher import ObjectPublisherHandler, Collection,\
                                    run_server
import httplib
import urlparse
import time
import re
import thread

def expose(func):
    """ decorator that flags a callable as exposed

        sets ``func.exposed`` to True and hands the very same object back
    """
    setattr(func, 'exposed', True)
    return func

class PSEHandler(ObjectPublisherHandler):
    """ request handler that doubles as an indexing HTTP proxy

        requests whose path does not start with '/' are treated as proxy
        requests for an absolute URL: they are forwarded to the origin
        server and, for textual 200 responses with a registered splitter,
        handed to the indexer; everything else is delegated to
        ObjectPublisherHandler
    """
    # the attributes below are filled in by the start-up code
    # module-like object; only ``default_encoding`` is read in this class
    config = None
    # not read in this class - presumably used by the base class
    application = None
    # object with an ``index(url, data)`` method
    indexer = None
    # maps a content type (e.g. 'text/html') to a callable turning the
    # decoded body into the text that gets indexed
    splitters = {}

    def find_resource(self, path, query):
        """ dispatch a request: proxy absolute urls, publish the rest

            returns whatever proxy() or the base class find_resource()
            returns
        """
        if path and not path.startswith('/'):
            return self.proxy(path, query)
        return ObjectPublisherHandler.find_resource(self, path, query)

    def proxy(self, url, query):
        """ proxy and index a url

            forwards the current request (method, headers, body) to the
            host named in url and returns a callable that yields
            (status, headers, body); body is a string when the response
            was read for indexing, else the httplib response object

            raises AssertionError when the url is not an http url
        """
        hostname, port, path, fragment = self._parse_url(url)
        if query:
            path += '?' + query
        if fragment:
            # NOTE(review): fragments are normally not sent to servers;
            # kept because the original code forwarded them
            path += '#' + fragment
        body = self._get_body()
        headers = dict(self.headers)
        # ask for an unencoded response so the body can be indexed; the
        # client may not have sent the header at all, so don't insist
        headers.pop('accept-encoding', None)

        conn = httplib.HTTPConnection(hostname, port or 80)
        conn.request(self.command, path, body, headers)
        res = conn.getresponse()

        body = res # by default, let the system use res.read() for the data
        ct = res.getheader('content-type')
        if res.status == 200 and ct is not None:
            mimetype, charset = self._split_content_type(ct)
            if mimetype.startswith('text/'):
                # read() already honours the response's Content-Length
                body = res.read()
                if mimetype in self.splitters:
                    ubody = self._decode(body, charset)
                    data = self.splitters[mimetype](ubody)
                    if query:
                        url += '?' + query
                    # index in the background so the response isn't delayed
                    thread.start_new_thread(self.indexer.index, (url, data))
        headers = dict(('-'.join(t.capitalize() for t in k.split('-')), v) for
                       (k, v) in res.getheaders())
        def ret(*args, **kwargs):
            return (res.status, headers, body)
        return ret

    def _split_content_type(self, ct):
        """ split a Content-Type header value into (mimetype, charset)

            mimetype is stripped and lower-cased; charset is None when the
            header carries no charset parameter
        """
        parts = ct.split(';')
        mimetype = parts[0].strip().lower()
        charset = None
        for param in parts[1:]:
            name, sep, value = param.partition('=')
            if sep and name.strip().lower() == 'charset':
                charset = value.strip().strip('"\'')
        return mimetype, charset

    def _decode(self, body, charset):
        """ decode body to unicode, replacing undecodable bytes

            falls back to config.default_encoding when charset is missing
            or not known to Python
        """
        default = self.config.default_encoding
        try:
            return unicode(body, charset or default, 'replace')
        except LookupError:
            return unicode(body, default, 'replace')

    def _get_body(self):
        """ read the request body announced by Content-Length

            returns '' when the header is missing or zero
        """
        cl = self.headers.get('content-length')
        if cl:
            cl = int(cl)
        if cl:
            return self.rfile.read(cl)
        return ''

    def _parse_url(self, url):
        """ split an http url into (hostname, port, path, fragment)

            path includes the ';params' part when present; port is None
            when the url does not name one

            raises AssertionError for any scheme other than http
        """
        parsed = urlparse.urlparse(url)
        if parsed.scheme != 'http':
            # raised explicitly so the check survives python -O
            raise AssertionError('only http is supported, '
                                 'please reconfigure your browser')
        path = parsed.path
        if parsed.params:
            path += ';' + parsed.params
        return (parsed.hostname, parsed.port, path, parsed.fragment)

class PSE(Collection):
    """ the core of the application

        a proxy/web server that indexes viewed pages and provides some search
        views
    """
    def __init__(self, config, db):
        """ keep references to the configuration and the search backend

            db must provide a ``search(query)`` method returning an
            iterable of strings
        """
        self.config = config
        self.db = db

    # the decorator alone marks the method as exposed; no need to set the
    # flag a second time after the definition
    @expose
    def search(self, handler, path, query):
        """ full-text search on all the indexed data

            returns a (status, headers, body) tuple: one result per line
            when a query is given, a short usage hint otherwise
        """
        if query:
            body = '\n'.join(self.db.search(query))
        else:
            body = 'call this script with a GET query to get search results'
        return (200, {'Content-Type': 'text/plain'}, body)

    def serve(self):
        """ placeholder, does nothing
        """
        pass

# any single tag, possibly spanning several lines
reg_tag = re.compile(r'<[^>]+>', re.S)
reg_whitespace = re.compile(r'\s+')
# elements whose content is dropped along with the tags themselves
remove_entirely = ('script', 'style')
def striptags(s):
    """ reduce an HTML document to lower-cased, whitespace-normalized text

        elements listed in remove_entirely are removed including their
        content, all other tags are replaced by a space, runs of
        whitespace collapse to a single space and the result is stripped
    """
    s = s.lower()
    for tagname in remove_entirely:
        opener = '<%s' % (tagname,)
        closer = '</%s' % (tagname,)
        pieces = []
        pos = 0
        while 1:
            start = s.find(opener, pos)
            if start == -1:
                pieces.append(s[pos:])
                break
            pieces.append(s[pos:start])
            # look for the closing tag *after* the opening one, so that a
            # stray earlier closing tag can't stop the removal
            end = s.find(closer, start)
            if end != -1:
                end = s.find('>', end)
            if end == -1:
                # unterminated element: leave the remainder to the generic
                # tag stripping below rather than guessing where it ends
                pieces.append(s[start:])
                break
            pos = end + 1
        # joining once avoids rebuilding the string for every removal
        s = ' '.join(pieces)
    s = ' '.join(reg_tag.split(s))
    s = ' '.join(reg_whitespace.split(s))
    return s.strip()

if __name__ == '__main__':
    import config, solr_indexer

    # indexing backend, also used as the search db of the application
    indexer = solr_indexer.SolrIndexer('localhost', 8983)

    # content type -> function producing the text to index
    splitters = {}
    splitters['text/html'] = striptags
    splitters['text/plain'] = lambda s: s

    # wire everything into the handler class, then start serving
    PSEHandler.config = config
    PSEHandler.indexer = indexer
    PSEHandler.splitters = splitters
    PSEHandler.application = PSE(config, indexer)
    address = (config.host, config.port)
    run_server(address, PSEHandler)


