Source code for dk.utidy

# -*- coding: utf-8 -*-

"""Micro tidy.

   Usage::

       >>> print utidy('''
       ... <form name="FirmaForm" id="FirmaForm" method="POST" autocomplete="off"
       ... action="." class="fForm"><input type="hidden" name="__cmd"
       ... value="FirmaForm"></form>hello
       ... ''')
       ...
       <form action="." autocomplete="off" class="fForm" id="FirmaForm" method="POST" name="FirmaForm">
           <input name="__cmd" type="hidden" value="FirmaForm">
       </form>
       hello

"""

import re

# Names of elements that are written without a closing tag; a tag whose
# name is listed here does not open a new nesting level.
self_closing_tags = [
    'area',
    'base',
    'br',
    'col',
    'command',
    'embed',
    'hr',
    'img',
    'input',
    'keygen',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
]


class HtmlTag(object):
    """A single HTML tag, parsed and normalized for stable re-rendering.

    Attributes set by the constructor:

    * ``orig`` -- the tag text exactly as passed in.
    * ``text`` -- ``orig`` with every whitespace run collapsed to one space.
    * ``closing`` -- True for an end tag (``</name>``).
    * ``name`` -- the tag name.
    * ``attrtxt`` -- the raw text following the tag name (may be empty).
    * ``selfclosing`` -- True when ``name`` is listed in
      ``self_closing_tags``.
    * ``attrs`` -- list of ``(name, value)`` pairs sorted by attribute name
      (always empty for end tags).
    * ``kind`` -- ``'tag-end'`` for end tags, ``'tag'`` for self-closing
      tags and ``'tag-start'`` for everything else.
    """

    # Matches one attribute: a bare name, ``name="value"`` / ``name='value'``
    # or ``name=value``.  ``findall`` yields ``(attr, quote, quoted value,
    # unquoted value)`` tuples.
    attre = re.compile(r"""
        (?P<attr>[-\w]+)                            # attribute
        (?:                                         # either = followed by..
            (?: = (?P<quote>['"])(.*?)(?P=quote))   # something in quotes
           |(?: = ([^\s]+))                         # something without quotes
        )?                                          # or a plain attribute
        """, re.VERBOSE)

    def __init__(self, txt):
        """Parse the tag text *txt* (from ``<`` through ``>``).

        Raises:
            ValueError: if *txt* does not look like ``<name ...>`` or
                ``</name>`` (for example comments and doctype declarations).
        """
        self.orig = txt
        # collapse multiple spaces
        self.text = re.sub(r'\s+', " ", txt)
        m = re.match(r'<\s*(/)?\s*([-\w]+)(\s.*)?>', self.text)
        if m is None:
            # Fail with a clear message instead of crashing later on
            # ``None.groups()``.
            raise ValueError("not a recognizable HTML tag: %r" % (txt,))

        slash, name, attrtxt = m.groups()
        self.closing = slash is not None
        self.name = name
        self.attrtxt = attrtxt or ""
        self.selfclosing = self.name in self_closing_tags

        if not self.closing and self.attrtxt.strip():
            self.attrs = self.normalize_attrs(
                HtmlTag.attre.findall(self.attrtxt)
            )
        else:
            self.attrs = []

        if self.closing:
            self.kind = 'tag-end'
        elif self.selfclosing:
            self.kind = 'tag'
        else:
            self.kind = 'tag-start'

    def normalize_class(self, val):
        """Return the class names in *val* sorted and joined by one space."""
        return ' '.join(sorted(val.split()))

    def normalize_style(self, val):
        """Return the ``;``-separated declarations in *val* sorted.

        Blank declarations are dropped, the remaining ones are kept exactly
        as written (they are not stripped), and the result always ends with
        ``;``.
        """
        return ';'.join(sorted([v for v in val.split(';') if v.strip()])) + ';'

    def normalize_attrs(self, attrs):
        """Turn ``attre.findall`` tuples into sorted ``(name, value)`` pairs.

        A bare attribute (``checked``) gets its own name as value.  An
        explicitly quoted value is kept even when it is empty, so
        ``alt=""`` stays empty.  ``class`` and ``style`` values are
        additionally normalized.
        """
        res = []
        for attrname, quote, qval, noqval in sorted(attrs):
            if quote:
                # A quoted value was given; respect it even if it is "".
                val = qval
            else:
                val = noqval or attrname
            if attrname == 'class':
                val = self.normalize_class(val)
            elif attrname == 'style':
                val = self.normalize_style(val)
            res.append((attrname, val))
        return res

    def __str__(self):
        """Render the tag with its normalized attributes, double-quoted."""
        if self.closing:
            return "</%s>" % self.name
        if not self.attrs:
            # Nothing was parsed as an attribute (e.g. ``<div >``), so do
            # not emit a dangling space before ``>``.
            return "<%s>" % self.name
        rendered = ' '.join(
            # Values are always written in double quotes, so a literal
            # double quote inside a value must be escaped.
            '%s="%s"' % (k, v.replace('"', '&quot;'))
            for k, v in self.attrs
        )
        return "<%s %s>" % (self.name, rendered)

    def __repr__(self):
        return "{{%s}}" % str(self)
def tokenize_html(html):
    """Split *html* into a flat list of ``(kind, token)`` pairs.

    Everything from a ``<`` up to the next ``>`` becomes an ``HtmlTag``
    token whose kind is the tag's ``kind`` attribute.  Text between tags
    becomes a ``('text', stripped_text)`` token; text that is empty after
    stripping (including whitespace-only trailing text) is skipped.

    Returns:
        list of ``(kind, token)`` tuples in document order.
    """
    tagre = re.compile(r'(<.*?>)', re.MULTILINE | re.DOTALL | re.UNICODE)
    tokens = []
    pos = 0
    for m in tagre.finditer(html):
        txt = html[pos:m.start()].strip()
        if txt:
            tokens.append(('text', txt))
        tag = HtmlTag(m.group(0))
        tokens.append((tag.kind, tag))
        pos = m.end()

    # Text after the last tag is filtered like text between tags, so a
    # whitespace-only tail does not produce an empty text token.
    tail = html[pos:].strip()
    if tail:
        tokens.append(('text', tail))
    return tokens
def simplify_simple_tags(html):
    """Put tags without any nested children on one line, i.e. turn::

        <h1>
            foo
        </h1>

    into::

        <h1>foo</h1>

    Only start tags written without attributes (``<name>``) whose content
    contains no ``<`` are rewritten; everything else is returned unchanged.
    """
    def replacement(m):
        name, content = m.groups()
        return "<%s>%s</%s>" % (name, content.strip(), name)

    # The pattern uses neither ``.`` nor ``^``/``$``, so no regex flags are
    # needed; ``[^<]*`` already spans newlines.
    return re.sub(r'<(\w+)>([^<]*)</\1>', replacement, html)
def utidy(html, level=0, indent=' ', simplify=False):
    """Micro-tidy: return *html* re-rendered with one token per line.

    The stripped input is tokenized with ``tokenize_html``.  Every text
    token and every tag is written on its own line, prefixed with
    ``indent`` repeated once per nesting level.  A start tag increases the
    level for the lines that follow it; an end tag decreases it before the
    end tag itself is written.  The level never renders below zero.

    Args:
        html: the markup to normalize.
        level: nesting level to start from.
        indent: string repeated once per nesting level.
        simplify: when true, the result is additionally passed through
            ``simplify_simple_tags``.

    Returns:
        The normalized markup as a single newline-joined string.

    NOTE(review): the module docstring example shows children indented by
    four spaces -- confirm the ``indent`` default has not lost whitespace.
    """
    depth = level
    lines = []
    for kind, token in tokenize_html(html.strip()):
        if kind == 'text':
            rendered = token
        elif kind in ('tag', 'tag-start', 'tag-end'):
            rendered = str(token)
        else:
            continue

        if kind == 'tag-end':
            depth -= 1
        lines.append(indent * max(0, depth) + rendered)
        if kind == 'tag-start':
            depth += 1

    result = '\n'.join(lines)
    return simplify_simple_tags(result) if simplify else result
# Example:
#
# print utidy('''
#     <div style="font-family:verdana;color:red" class="c b a">
#         <input type=checkbox data-toggle="#foo" checked>
#     </div>
# ''')

if __name__ == "__main__":  # pragma: nocover
    import sys

    # Read the file named by the first command-line argument; the ``with``
    # block closes it as soon as its content has been read.
    with open(sys.argv[1]) as fp:
        source = fp.read()
    # print() with a single argument behaves the same on Python 2 and 3.
    print(utidy(source, simplify=True))