"""Simple script to generate .pox files

    parses XML for i18n attrs and JS files for _() calls and generates an
    XML .pox template document (.poxt file)

    (c) Guido Wesdorp 2005

"""

from xml.dom.minidom import parseString, getDOMImplementation
import sys, re, os

class POX:
    """container for the results

    Builds an XML document with a ``catalog`` root element that holds one
    ``message`` element per unique message id. Each ``message`` carries a
    space-separated ``filenames`` attribute listing the files the id was
    found in, a ``msgid`` child and a ``msgstr`` child (pre-filled with the
    msgid text and marked with an empty ``i18n:translate`` attribute).
    """
    def __init__(self):
        impl = getDOMImplementation()
        self.doc = impl.createDocument(None, 'catalog', None)
        self.root = self.doc.documentElement
        self.processed = {} # mapping from mid to ([filenames], node)

    def add(self, msgid, filename):
        """Register ``msgid`` as found in ``filename``.

        The msgid is stripped, newlines and tabs are turned into spaces and
        runs of spaces are collapsed to one before it is used as the key.
        A msgid that is already known only gets ``filename`` appended to
        its ``filenames`` attribute (once per file); a new msgid gets a
        fresh ``message`` element appended to the catalog root.
        """
        # strip and reduce whitespace
        msgid = msgid.strip().replace('\n', ' ').replace('\t', ' ')
        while '  ' in msgid:
            # str.replace returns a new string, the result has to be
            # re-assigned or this loop never terminates
            msgid = msgid.replace('  ', ' ')
        if msgid in self.processed:
            filenames, node = self.processed[msgid]
            if filename not in filenames:
                filenames.append(filename)
                node.setAttribute('filenames',
                    '%s %s' % (node.getAttribute('filenames'), filename))
            return
        doc = self.doc
        # add the nodes
        msgnode = doc.createElement('message')
        msgnode.setAttribute('filenames', filename)
        msgidnode = doc.createElement('msgid')
        msgidnode.appendChild(doc.createTextNode(msgid))
        msgnode.appendChild(msgidnode)
        msgstrnode = doc.createElement('msgstr')
        # the untranslated text doubles as the default translation
        msgstrnode.appendChild(doc.createTextNode(msgid))
        msgstrnode.setAttribute('i18n:translate', '')
        msgnode.appendChild(msgstrnode)
        self.root.appendChild(msgnode)
        self.processed[msgid] = ([filename], msgnode)

    def get_result(self):
        """Return the catalog document serialized as pretty-printed XML."""
        return self.doc.toprettyxml()

class XMLParser:
    """scans XML files (or well-formed HTML files, obviously) for i18 attrs

    Every file is parsed on construction. For each element the value of an
    ``i18n:translate`` attribute (or, when that value is blank, the
    serialized element content) and the values of all attributes named in
    ``i18n:attributes`` are handed to ``pox.add()``.
    """
    def __init__(self, files, pox):
        self._current = None # traversal cursor used by next_node()
        for filename in files:
            self.parse_file(filename, pox)

    def parse_file(self, filename, pox):
        """Scan one file and add the message ids found in it to ``pox``.

        A file that cannot be read or parsed is reported on stderr and
        skipped. Raises AttributeError when ``i18n:attributes`` names an
        attribute the element does not have.
        """
        dom = self._load_dom(filename)
        if dom is None:
            return
        # walk through all the nodes and scan for i18n: stuff
        while 1:
            node = self.next_node(dom)
            if node is None:
                break
            if node.nodeType != 1:
                # only element nodes can carry attributes
                continue
            attrs = node.attributes
            translate = attrs.getNamedItem('i18n:translate')
            if translate is not None:
                msgid = translate.value
                if not msgid.strip():
                    # no explicit id: the element content is the id
                    msgid = self.extract_text(node)
                pox.add(msgid, filename)
            attributes = attrs.getNamedItem('i18n:attributes')
            if attributes is not None:
                for attr in attributes.value.split(';'):
                    attr = attr.strip()
                    if not attr:
                        # empty entry, e.g. from a trailing ';'
                        continue
                    attritem = attrs.getNamedItem(attr)
                    if attritem is None:
                        raise AttributeError(
                            'No %s on %s in %s' % (
                                attr, node.nodeName, filename))
                    pox.add(attritem.value, filename)

    def _load_dom(self, filename):
        """Return the parsed document for ``filename``, None if unparsable."""
        fp = open(filename)
        try:
            try:
                return parseString(fp.read())
            except Exception:
                exc, e, tb = sys.exc_info()
                del tb
                # report on stderr: stdout carries the generated catalog
                sys.stderr.write(
                    'Error parsing %s: %s - %s\n' % (filename, exc, e))
                return None
        finally:
            fp.close()

    def extract_text(self, node):
        """Return the serialized children of ``node`` as one line.

        Each child is serialized and stripped separately, newlines and
        tabs become spaces and runs of spaces are collapsed to one.
        """
        xml = ''.join([
            child.toxml().strip().replace('\n', ' ').replace('\t', ' ')
            for child in node.childNodes])
        while '  ' in xml:
            xml = xml.replace('  ', ' ')
        return xml

    def next_node(self, dom):
        """Return the node following the cursor in document order.

        The walk starts at the document element whenever the cursor is
        unset or belongs to another document; None is returned (and the
        cursor cleared) once the document is exhausted.
        """
        cur = self._current
        if cur is None or cur.ownerDocument is not dom:
            nxt = dom.documentElement
        elif cur.hasChildNodes():
            nxt = cur.childNodes[0]
        else:
            # climb until an ancestor (or the node itself) has a following
            # sibling; stopping after one level would skip everything that
            # comes after a deeply nested last child
            while cur is not None and cur.nextSibling is None:
                cur = cur.parentNode
            if cur is None:
                nxt = None
            else:
                nxt = cur.nextSibling
        self._current = nxt
        return nxt

class JSParser:
    """scans JS files for _() calls

    Every file is scanned on construction; the string literal passed to
    each ``_()`` call is handed to ``pox.add()``.
    """
    # a complete quoted JS string, backslash escapes included
    _dq_string = r'"(?:\\.|[^"\\])*"'
    _sq_string = r"'(?:\\.|[^'\\])*'"
    # a _( ... ) call; the argument part runs up to the first ')' that is
    # not inside a string literal, and the lookbehind (rather than a
    # consumed \W) lets calls at the very start of the data or directly
    # after another call match too
    _reg_func = re.compile(
        r'(?<!\w)_\(((?:%s|%s|[^)"\'])*)\)' % (_dq_string, _sq_string),
        re.S)

    def __init__(self, files, pox):
        for filename in files:
            self.parse_file(filename, pox)

    def parse_file(self, filename, pox):
        """Scan one JS file and add the literal of every _() call to ``pox``.

        Raises Exception when the argument of a call does not start with a
        non-empty, properly terminated string literal.
        """
        fp = open(filename)
        try:
            data = fp.read()
        finally:
            fp.close()
        for match in self._reg_func.finditer(data):
            func_content = match.group(1).strip()
            literal = self._extract_literal(func_content, filename)
            literal = literal.replace('\t', ' ').replace('\n', ' ')
            while '  ' in literal:
                literal = literal.replace('  ', ' ')
            pox.add(literal, filename)

    def _extract_literal(self, func_content, filename):
        """Return the body of the string literal ``func_content`` starts with.

        Literals concatenated with ``+`` (using the same quote character)
        are joined first; escape sequences are left in the result as
        written in the source.
        """
        quote = func_content[:1]
        if quote not in ('"', "'"):
            # also covers an empty argument list: _()
            raise Exception('Unrecognized function content: %s in %s' %
                            (func_content, filename))
        # glue 'a' + 'b' together into 'ab'
        reg_concat = re.compile(
                r'%(q)s[ \t\n]*\+[ \t\n]*%(q)s' %
                    {'q': quote})
        joined = func_content
        while reg_concat.search(joined):
            joined = reg_concat.sub('', joined)

        content = []
        escaped = False
        closed = False
        for current in joined[1:]:
            if escaped:
                # whatever follows a backslash never ends the literal
                escaped = False
            elif current == '\\':
                escaped = True
            elif current == quote:
                closed = True
                break
            content.append(current)
        literal = ''.join(content)
        if not closed or not literal:
            raise Exception('Unrecognized function content: %s in %s' %
                            (func_content, filename))
        return literal

if __name__ == '__main__':
    # command line entry point: every argument is a file to scan, files
    # ending in .js go to the JS scanner and everything else is treated as
    # XML; progress goes to stderr, the resulting catalog to stdout
    stderr = sys.stderr
    stderr.write('POX extract v0.1\n')
    stderr.write('(c) Guido Wesdorp 2004\n')
    files = sys.argv[1:]
    stderr.write('Going to parse files %s\n' % ', '.join(files))
    pox = POX()
    xml = []
    js = []
    for f in files:
        if f.endswith('.js'):
            js.append(f)
        else:
            xml.append(f)
    XMLParser(xml, pox)
    JSParser(js, pox)
    print(pox.get_result())
    stderr.write('Done\n')

