#!/usr/bin/env python2.5
""" pse - the personal search engine

    a customizable indexing and search system for the WWW (HTML pages)
"""

from BaseHTTPObjectPublisher import ObjectPublisherHandler, Collection,\
                                    run_server
import httplib
import urlparse
import time
import re
import thread


def expose(func):
    """ mark a function as exposed

        sets ``func.exposed = True`` and returns the same function object,
        so it can be used as a decorator
    """
    func.exposed = True
    return func


def _split_content_type(value):
    """ split a Content-Type header value into (media type, charset)

        the media type is stripped and lower-cased; charset is None when
        the header carries no (non-empty) ``charset`` parameter
    """
    parts = value.split(';')
    media_type = parts[0].strip().lower()
    charset = None
    for param in parts[1:]:
        name, sep, val = param.partition('=')
        if sep and name.strip().lower() == 'charset':
            # the value may be quoted, e.g. charset="utf-8"
            val = val.strip().strip('"\'')
            if val:
                charset = val
    return media_type, charset


class PSEHandler(ObjectPublisherHandler):
    """ request handler that proxies (and indexes) absolute-URL requests

        requests whose path does not start with '/' are treated as proxy
        requests, everything else is left to ObjectPublisherHandler
    """

    # all of these are assigned at startup (see the __main__ section)
    config = None       # proxy() reads `default_encoding` from it
    application = None  # not used in this module; presumably read by the
                        # base class - TODO confirm
    indexer = None      # proxy() runs `indexer.index(url, data)` in a thread
    splitters = {}      # media type -> callable(unicode body) -> index data

    def find_resource(self, path, query):
        """ return the resource for a request

            a non-empty path that does not start with '/' is handed to
            proxy(); any other path is resolved by the base class
        """
        if path and not path.startswith('/'):
            return self.proxy(path, query)
        return ObjectPublisherHandler.find_resource(self, path, query)

    def proxy(self, url, query):
        """ proxy and index a url

            forwards the current request (method, headers, body) to the host
            named in `url` and returns a callable that, whatever arguments it
            is called with, returns ``(status, headers, body)``

            for '200' responses with a 'text/*' content type the body is read
            into a string and, if a splitter is registered for the media
            type, indexed in a separate thread; in all other cases `body` is
            the response object itself, left unread

            raises AssertionError if the url's scheme is not 'http'
        """
        hostname, port, path, fragment = self._parse_url(url)
        if query:
            path += '?' + query
        if fragment:
            path += '#' + fragment

        request_body = self._get_body()
        request_headers = dict(self.headers)
        # presumably dropped so that upstream does not send a compressed
        # body; the header may be absent, which must not be an error
        request_headers.pop('accept-encoding', None)

        conn = httplib.HTTPConnection(hostname, port or 80)
        conn.request(self.command, path, request_body, request_headers)
        res = conn.getresponse()

        # NOTE(review): upstream headers are passed on unchanged, including
        # any Transfer-Encoding - confirm the base class copes with that
        headers = dict(('-'.join(t.capitalize() for t in k.split('-')), v)
                       for (k, v) in res.getheaders())

        body = res # by default, let the system use res.read() for the data
        ct = res.getheader('content-type')
        if res.status == 200 and ct is not None:
            ct, enc = _split_content_type(ct)
            if ct.startswith('text/'):
                cl = res.getheader('content-length')
                if cl:
                    body = res.read(int(cl))
                else:
                    body = res.read()
                # the body is fully in memory, release the connection
                conn.close()
                if ct in self.splitters:
                    default_enc = self.config.default_encoding
                    try:
                        ubody = unicode(body, enc or default_enc, 'replace')
                    except LookupError:
                        # unknown charset name announced by the server
                        ubody = unicode(body, default_enc, 'replace')
                    data = self.splitters[ct](ubody)
                    if query:
                        url += '?' + query
                    thread.start_new_thread(self.indexer.index, (url, data))

        def ret(*args, **kwargs):
            return (res.status, headers, body)
        return ret

    def _get_body(self):
        """ return the request body as a string

            reads 'content-length' bytes from self.rfile; returns '' when
            the header is missing, empty or zero
        """
        cl = self.headers.get('content-length')
        if cl:
            cl = int(cl)
            if cl:
                return self.rfile.read(cl)
        return ''

    def _parse_url(self, url):
        """ split `url` into (hostname, port, path, fragment)

            `path` includes the ';params' part (if any) and defaults to '/'
            when the url has no path; `port` is None when the url has none

            raises AssertionError if the scheme is not 'http'
        """
        parsed = urlparse.urlparse(url)
        # raised explicitly (rather than with an assert statement) so that
        # the check is still performed when running with -O
        if parsed.scheme != 'http':
            raise AssertionError('only http is supported, '
                                 'please reconfigure your browser')
        # an empty path would make the request line invalid
        path = parsed.path or '/'
        if parsed.params:
            path += ';' + parsed.params
        return (parsed.hostname, parsed.port, path, parsed.fragment)


class PSE(Collection):
    """ the core of the application

        a proxy/web server that indexes viewed pages and provides some
        search views
    """

    def __init__(self, config, db):
        """ store the configuration and the object used for searching

            `db` must provide ``search(query)`` returning an iterable of
            strings (see search())
        """
        self.config = config
        self.db = db

    @expose
    def search(self, handler, path, query):
        """ full-text search on all the indexed data

            returns a ``(status, headers, body)`` tuple; the body is the
            newline-joined result of ``self.db.search(query)``, or a short
            usage message when there is no query
        """
        if query:
            return (200, {'Content-Type': 'text/plain'},
                    '\n'.join(self.db.search(query)))
        return (200, {'Content-Type': 'text/plain'},
                'call this script with a GET query to get search results')

    def serve(self):
        """ placeholder, does nothing """
        pass


reg_tag = re.compile('<[^>]+>', re.S)
reg_whitespace = re.compile(r'\s+')
# elements that are dropped together with their content
remove_entirely = ('script', 'style')


def striptags(s):
    """ reduce an HTML string to lower-cased, whitespace-normalized text

        elements listed in `remove_entirely` are removed including their
        content, all other tags are replaced with a space, and runs of
        whitespace are collapsed into a single space
    """
    s = s.lower()
    for tagname in remove_entirely:
        opener = '<%s' % (tagname,)
        closer = '</%s' % (tagname,)
        while 1:
            start = s.find(opener)
            if start == -1:
                break
            # NOTE(review): the end-tag search was garbled in the file this
            # was recovered from and has been reconstructed - confirm
            end = s.find(closer, start)
            if end != -1:
                end = s.find('>', end)
            if end < start:
                # no (complete) closing tag; leave the rest to reg_tag
                break
            s = s[:start] + ' ' + s[end+1:]
    s = reg_tag.sub(' ', s)
    s = reg_whitespace.sub(' ', s)
    return s.strip()


if __name__ == '__main__':
    import config
    import solr_indexer

    i = solr_indexer.SolrIndexer('localhost', 8983)
    PSEHandler.config = config
    PSEHandler.indexer = i
    PSEHandler.splitters = {
        'text/html': striptags,
        'text/plain': lambda s: s,
    }
    PSEHandler.application = PSE(config, i)
    run_server((config.host, config.port), PSEHandler)