Source code for dk.utidy

# -*- coding: utf-8 -*-

"""Micro tidy.

   Usage::

       >>> print utidy('''
       ... <form name="FirmaForm" id="FirmaForm" method="POST" autocomplete="off"
       ... action="." class="fForm"><input type="hidden" name="__cmd"
       ... value="FirmaForm"></form>hello
       ... ''')
       ...
       <form action="." autocomplete="off" class="fForm" id="FirmaForm" method="POST" name="FirmaForm">
           <input name="__cmd" type="hidden" value="FirmaForm">
       </form>>
       hello

"""

import re

self_closing_tags = """
    area
    base
    br
    col
    command
    embed
    hr
    img
    input
    keygen
    link
    meta
    param
    source
    track
    wbr
""".split()


[docs]class HtmlTag(object):
    attre = re.compile(r"""
        (?P<attr>[-\w]+)                            # attribute
        (?:                                         # either = followed by..
           (?: = (?P<quote>['"])(.*?)(?P=quote))    #  something in quotes
          |(?: = ([^\s]+))                          #  something without quotes
        )?                                          # or a plain attribute
        """, re.VERBOSE)  # "

    def __init__(self, txt):
        self.orig = txt
        # collapse multiple spaces
        self.text = re.subn(r'(\s+)', " ", txt)[0]
        m = re.match(r'<\s*(/)?\s*([-\w]+)(\s.*)?>', self.text)
        if not m:  # pragma:nocover
            print "NOT M:", txt
        g = m.groups()
        self.closing = g[0] is not None
        self.name = g[1]
        self.attrtxt = g[2] or ""
        self.selfclosing = self.name in self_closing_tags
        if not self.closing and self.attrtxt.strip():
            self.attrs = self.normalize_attrs(
                HtmlTag.attre.findall(self.attrtxt)
            )
        else:
            self.attrs = []
        self.kind = 'tag'
        if self.closing:
            self.kind += '-end'
        if not self.closing and not self.selfclosing:
            self.kind += '-start'

[docs]    def normalize_class(self, val):
        return ' '.join(sorted(val.split()))

[docs]    def normalize_style(self, val):
        return ';'.join(sorted([v for v in val.split(';') if v.strip()])) + ';'

[docs]    def normalize_attrs(self, attrs):
        res = []
        for attrname, _quote, qval, noqval in sorted(attrs):
            val = qval or noqval or attrname
            if attrname == 'class':
                res.append((attrname, self.normalize_class(val)))
            elif attrname == 'style':
                res.append((attrname, self.normalize_style(val)))
            else:
                res.append((attrname, val))
        return res

    def __str__(self):
        if self.closing:
            return "</%s>" % self.name
        res = "<%s" % self.name
        if self.attrtxt:
            res += ' '
        res += ' '.join(['%s="%s"' % (k, v) for k, v in self.attrs])
        res += ">"
        return res

    def __repr__(self):
        return "{{%s}}" % str(self)


[docs]def tokenize_html(html):
    tagre = re.compile(r'(<.*?>)', re.MULTILINE|re.DOTALL|re.UNICODE)
    tokens = []
    pos = 0
    while 1:
        m = tagre.search(html, pos)
        if not m:
            break

        txt = html[pos:m.start()]
        if txt.strip():
            tokens.append(('text', txt.strip()))

        tag = HtmlTag(html[m.start():m.end()])
        tokens.append((tag.kind, tag))

        pos = m.end()
    if pos < len(html):
        tokens.append(('text', html[pos:].strip()))
    return tokens


[docs]def simplify_simple_tags(html):
    """Put tags without any nested children on one line, i.e. turn::

           <h1>
               foo
           </h1>

       into::

           <h1>foo</h1>
           
    """
    def replacement(m):
        grps = m.groups()
        res = "<%s>%s</%s>" % (grps[0], grps[1].strip(), grps[0])
        # print "REPLS:", grps, res
        return res

    import time
    start = time.time()
    res = re.sub(
        r'<(\w+)>([^<]*)</\1>',
        replacement,
        html,
        flags=re.MULTILINE|re.DOTALL
    )
    import sys
    sys.stderr.write('done: %.3f\n' % (time.time() - start))
    return res


[docs]def utidy(html, level=0, indent='    ', simplify=False):
    """micro-tidy

       Normalizes the html.
    """
    tokens = tokenize_html(html.strip())
    res = []
    def _indent(n):
        return indent * max(0, n)
    i = level
    for kind, token in tokens:
        if kind == 'text':
            res.append(_indent(i) + token)
        elif kind == 'tag-start':
            res.append(_indent(i) + str(token))
            i += 1
        elif kind == 'tag-end':
            i -= 1
            res.append(_indent(i) + str(token))
        elif kind == 'tag':
            res.append(_indent(i) + str(token))
    html = '\n'.join(res)
    if simplify:
        html = simplify_simple_tags(html)
    return html


# print utidy('''
#     <div style="font-family:verdana;color:red" class="c b a">
#     <input type=checkbox data-toggle="#foo" checked>
#     </div>
#     ''')

if __name__ == "__main__":  # pragma: nocover
    import sys
    print utidy(open(sys.argv[1]).read(), simplify=True)