"""Parser for a small indentation-based XML shorthand.

A document consists of one node per line:

* ``[name attr='value' ns.attr="value"]`` opens an element (``ns.name``
  gives the element a namespace),
* a line starting with a quote is a text node written as a Python string
  literal (triple quotes start a text that may span several lines),
* ``#`` starts a comment unless it appears inside a quoted string,
* nesting is expressed purely through indentation.
"""

import re


class node(object):
    """Base class of all tree nodes; keyword arguments become attributes."""

    parent = None

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def __repr__(self):
        return '<%s node>' % (self.__class__.__name__,)


class element(node):
    """An element with a name, an optional namespace, children and attributes."""

    name = None
    namespace = None
    children = None

    def __init__(self, **kwargs):
        # Fresh lists per instance; set before kwargs so callers may override.
        self.children = []
        self.attributes = []
        super(element, self).__init__(**kwargs)

    def __repr__(self):
        return '<element %s:%s>' % (self.namespace or 'null', self.name)


class document(element):
    """The root container returned by ``exmlparser.parse()``."""

    root = None


class text(node):
    """A text node; ``value`` holds the string."""

    value = None


class attribute(node):
    """A single element attribute."""

    name = None
    namespace = None
    value = None

    def __repr__(self):
        return '<attribute %s:%s>' % (self.namespace or 'null', self.name)


class exmlparser(object):
    """Turn the source text of a document into a tree of node objects.

    Usage: ``tree = exmlparser(source).parse()``.

    Malformed input is reported by raising ``AssertionError`` whose message
    ends with the offending line number.
    """

    _reg_charset = re.compile('^#( -[*]- (en)?coding[:] (.*) -[*]-)?$')
    _reg_indent = re.compile('^( +)')

    # kinda complex, need to get unquoted hashes
    _reg_comment = re.compile(
        r'^[^\'"#]*'
        r'((\'([^\']|(\\\\)|(\\\'))*\')|("([^"]|(\\\\)|(\\"))*"))*'
        r'[^\'"#]*([#].*)$')

    _reg_elnode = re.compile(
        r'^\[(\w+(\.\w+)?)'
        r'( \w+(\.\w+)?='
        r'((\'([^\'\\]|(\\\\)|(\\\'))*\')|("([^"\\]|(\\\\)|(\\"))*")))*\]$')
    _reg_attr = re.compile(
        r'^ (\w+(\.\w+)?)='
        r'((\'([^\\\']|(\\\\)|(\\\'))*\')|("([^"\\]|(\\\\)|(\\"))*"))')

    def __init__(self, document):
        """Remember the source and detect its charset and indent width."""
        self.document = document
        self.charset = self.get_charset(document)
        self.indentsize = self.get_indent(document)

    def parse(self):
        """Parse the stored source and return the resulting ``document``."""
        self.currline = 0
        doc = document()
        # 'indent' is the depth of 'current'; the document itself sits at -1
        # so that top-level nodes are at depth 0.
        self.state = {'indent': -1, 'current': doc}
        for line in self.document.split('\n'):
            self._parse_line(line, doc)
        return doc

    def get_charset(self, document):
        """Return the charset named in the leading ``#`` lines, or 'UTF-8'.

        Only the uninterrupted run of header lines of the form ``#`` or
        ``# -*- coding: NAME -*-`` at the top of the document is examined.
        """
        for line in document.split('\n'):
            match = self._reg_charset.search(line)
            if not match:
                break
            charset = match.group(3)
            if charset is None:
                # still a hash, but no coding info
                continue
            return charset
        return 'UTF-8'

    def get_indent(self, document):
        """Return the number of leading spaces of the first indented line."""
        for line in document.split('\n'):
            match = self._reg_indent.match(line)
            if match:
                return len(match.group(1))
        return 1  # no indentation in document - must be very simple :)

    def _require(self, condition, message):
        """Raise AssertionError(message + line number) unless condition holds.

        An explicit raise is used instead of ``assert`` so that validation
        still happens when Python runs with ``-O``.
        """
        if not condition:
            raise AssertionError('%s (line %s)' % (message, self.currline))

    def _parse_line(self, line, doc):
        """Feed one source line through the handler chain.

        Blank lines are skipped before any handler runs (this includes blank
        lines inside a multi-line text).  Each handler returns the possibly
        changed current node and the part of the line still to be processed;
        the chain stops once nothing is left.
        """
        self.currline += 1
        if not line.strip():
            return
        current = self.state['current']
        for handler in (self.handle_multiline_comment,
                        self.handle_comment,
                        self.handle_indent_new):
            current, line = handler(current, line)
            if not line:
                break
        self.state['current'] = current

    def handle_multiline_comment(self, current, line):
        """Append a continuation line to an open multi-line text node.

        Does nothing unless a triple-quoted text is open.  Otherwise the line
        is consumed: a closing triple quote ends the text, a trailing
        backslash suppresses the newline, any other line gets one appended.
        """
        mc = self.state.get('multilinechar')
        if mc is None:
            return current, line
        if line.strip().endswith(mc):
            line = line.rstrip()[:-len(mc)]
            del self.state['multilinechar']
        elif line and line[-1] == '\\':
            line = line[:-1]
        else:
            line += '\n'
        current.value += line
        return current, ''

    def handle_comment(self, current, line):
        """Strip an unquoted ``#`` comment and trailing whitespace."""
        match = self._reg_comment.match(line)
        if match:
            # The comment is the pattern's last group.  Cut where it starts
            # rather than by its length, so whitespace after the comment
            # cannot leave part of it behind.
            line = line[:match.start(self._reg_comment.groups)].rstrip()
        return current, line

    def handle_indent_new(self, current, line):
        """Place the node described by ``line`` according to its indentation.

        Returns the new node (or the text node that was extended) as the
        current node, together with an empty remainder.

        Raises AssertionError for inconsistent or excessive indentation and
        for lines that are neither text nor an element.
        """
        previndent = self.state['indent']
        currspaces = len(line) - len(line.lstrip())
        self._require(
            currspaces % self.indentsize == 0,
            'unexpected amount of whitespace - use consistent indentation!')
        line = line[currspaces:]
        # Floor division keeps the depth an int on Python 3 as well.
        currindent = currspaces // self.indentsize
        self._require(currindent - previndent <= 1, 'too much indentation')

        # 'current' sits at depth previndent and the new node needs a parent
        # at depth currindent - 1, so a sibling closes one block, a node one
        # level up closes two, and a child (the range is empty) closes none.
        for _ in range(previndent - currindent + 1):
            current = current.parent

        if line[0] in '\'"':
            new_node = self.handle_new_text(current, line)
            siblings = current.children
            if siblings and isinstance(siblings[-1], text):
                # concatenate to previous child
                siblings[-1].value += new_node.value
                current = siblings[-1]
            else:
                new_node.parent = current
                siblings.append(new_node)
                current = new_node
        elif line[0] == '[':
            new_node = self.handle_new_element(current, line)
            current.children.append(new_node)
            current = new_node
        else:
            # XXX support some data types here
            self._require(False, 'unexpected data')
        self.state['indent'] = currindent
        return current, ''

    def handle_new_text(self, current, line):
        """Create a text node from a quoted line.

        A triple-quoted line without a closing triple quote opens a
        multi-line text; following lines are then collected by
        ``handle_multiline_comment``.

        Raises AssertionError when ``current`` cannot hold children.
        """
        self._require(isinstance(current, element),
                      'can not attach a node here')
        mc = 3 * line[0]
        # NOTE(security): eval() executes whatever expression the document
        # contains on this line.  Only parse trusted documents, or switch to
        # ast.literal_eval if plain string literals are all that is needed.
        if line.startswith(mc):
            body = line[3:]
            if body.rstrip().endswith(mc):
                body = body.rstrip()[:-3]
            else:
                self.state['multilinechar'] = mc
                if body and body[-1] == '\\':
                    body = body[:-1]
                else:
                    body += '\n'
            value = eval(mc + body + mc)
        else:
            value = eval(line)
        return text(value=value)

    def handle_new_element(self, current, line):
        """Create an element (with its attributes) from a ``[...]`` line.

        The returned element has ``parent`` set to ``current`` but is not yet
        added to the parent's children; the caller does that.

        Raises AssertionError when ``current`` cannot hold children or the
        tag is malformed.
        """
        self._require(isinstance(current, element),
                      'can not attach a node here')
        # Strip once, so the slicing below sees the same text the pattern
        # matched even when the line carries trailing whitespace.
        line = line.rstrip()
        match = self._reg_elnode.match(line)
        self._require(match, 'invalid opening tag')
        ns = None
        name = match.group(1)
        rest = line[len(name) + 1:-1]  # drop '[' + name and the closing ']'
        if '.' in name:
            self._require(name.count('.') == 1, 'invalid name')
            ns, name = name.split('.')
        new_element = element(name=name, namespace=ns)
        new_element.parent = current
        while rest:
            match = self._reg_attr.search(rest)
            if not match:
                break
            # Consume exactly the matched prefix; str.replace would also
            # remove identical text occurring later in the tag.
            rest = rest[match.end():]
            attr_ns = None
            attr_name = match.group(1)
            if '.' in attr_name:
                attr_ns, attr_name = attr_name.split('.', 1)
            # The pattern guarantees group 3 is a single quoted string
            # literal, so eval() only decodes its escapes here.
            new_element.attributes.append(attribute(
                name=attr_name,
                namespace=attr_ns,
                ns=attr_ns,  # kept for code that read the old attribute
                value=eval(match.group(3))))
        self._require(not rest, 'garbage in element node')
        return new_element


def printnode(node, depth=-1):
    """Print ``node`` and its subtree as indented XML-like text.

    The document node itself prints nothing; elements without children are
    printed as a self-closing tag.  Attributes are not printed.
    """
    indent = depth * 2 * ' '
    if not isinstance(node, element):
        print(indent + node.value)
        return
    is_document = isinstance(node, document)
    if not is_document:
        if node.namespace:
            nodename = node.namespace + ':' + node.name
        else:
            nodename = node.name
        if not node.children:
            print(indent + '<' + nodename + '/>')
            return
        print(indent + '<' + nodename + '>')
    for child in node.children:
        printnode(child, depth + 1)
    if not is_document:
        print(indent + '</%s>' % (nodename,))


def main(argv=None):
    """Parse the file named on the command line and print its tree."""
    import sys
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print('usage: %s <file>' % (argv[0],))
        sys.exit(1)
    with open(argv[1]) as source:
        contents = source.read()
    printnode(exmlparser(contents).parse())


if __name__ == '__main__':
    main()