""" emulate a browser with js support using python-spidermonkey """ import os import sys import traceback import re import gc from lxml import etree import spidermonkey here = os.path.dirname(os.path.abspath(__file__)) def get_exception(): exc, e, tb = sys.exc_info() ret = JSPyError(str(exc), str(e), ''.join(traceback.format_tb(tb))) del tb return ret def format_stack(stack): ret = '' for line in stack: if '@' not in line: ret += line continue info, fpath = line.rsplit('@', 1) fpath = fpath.split('/')[-1] if len(info) > 75: info = info[:35] + '...' + info[-20:] ret += ' %s@%s\n' % (info, fpath) return ret class PyJSError(Exception): def __init__(self, name, message, stack): self.args = (name, message, stack) self.name = name self.message = message self.stack = stack def __str__(self): ret = '%s - %s\n' % (self.name, self.message) ret += format_stack(self.stack) return ret class JSPyError: def __init__(self, name, message, stack): self.args = (name, message, stack) self.name = name self.message = message self.stack = stack.split('\n') class XPathProcessor(object): def __init__(self, expr, xml, nodepath, resolvinfo): self.expr = expr self.xml = xml xml = unicode(xml, 'latin-1') # XXX ?!? self.resolvinfo = resolvinfo or None self.tree = etree.fromstring(xml) self.node = self._resolve_xml_path(nodepath) def paths(self): if self.resolvinfo: r = self.node.xpath(self.expr, namespaces=self.resolvinfo) else: r = self.node.xpath(self.expr) return [self._get_path(node) for node in r] def _get_path(self, node): current = node path = [] if isinstance(node, str) or isinstance(node, unicode): if node.is_text: path = [0] current = current.getparent() else: parent = node.getparent() pparent = parent.getparent() parenti = self._get_node_index(parent, parent.getparent()) path = [parenti + 1] current = parent while True: parent = current.getparent() if parent is None: break i = self._get_node_index(current, parent) path.insert(0, i) current = parent return path def _get_node_index(self, node, parent): i = 0 if parent.text: i = 1 for child in parent: if child == node: break if child.tail: i += 2 else: i += 1 else: raise AssertionError('node not found in tree!') return i def _resolve_xml_path(self, path): xpexpr = ''.join(['/node()[%s]' % (int(i) + 1,) for i in path.split('/')]) nodes = self.tree.xpath(xpexpr) assert len(nodes) == 1 return nodes[0] class XSLTProcessor(object): def __init__(self, stylesheet, xml): self.stylesheet = stylesheet self.xml = xml def process(self): xsltdoc = etree.fromstring(unicode(self.stylesheet, 'utf-8')) xmldoc = etree.fromstring(unicode(self.xml, 'latin-1')) open('/tmp/test.xsl', 'w').write(self.stylesheet) transform = etree.XSLT(xsltdoc) tree = transform(xmldoc) return etree.tostring(tree) class Browser: def __init__(self, server, webpath='/'): self.server = server self.webpath = webpath basepath = webpath if not basepath.endswith('/'): basepath = '/'.join(basepath.split('/')[:-1]) self.basepath = basepath def _init_runtime(self): self.runtime = spidermonkey.Runtime(maxbytes=10000000) self.context = self.runtime.new_context() self.context.bind_class(Browser, bind_constructor=False) self.context.bind_class(JSPyError, bind_constructor=True) self.context.bind_object('__fakebrowser__', self) self._xhrs = {} def initialize(self): self._init_runtime() html = self.server.load_url(self.webpath) title = self._get_title(html) self.load(os.path.join(here, 'fakebrowser.js'), True) self.eval('init_fakebrowser(%r, "http://localhost%s", %r);' % ( unicode(html, 'utf-8').encode('utf-8'), self.webpath, title)) htree = etree.fromstring(html) self.resolve_script_tags(htree) self.call_onload(htree) del htree def cleanup(self): del self.runtime del self.context del self._xhrs gc.collect() def _get_title(self, html): match = re.search('