Source code for django_auxilium.utils.html

from __future__ import print_function, unicode_literals
import re

import six
from six.moves.html_entities import name2codepoint
from six.moves.html_parser import HTMLParser

from .string import text_onion


# Tag names used to build ``RE_EXCLUDE_TAGS``; ``simple_minify`` leaves
# everything matched by that pattern untouched.
EXCLUDE_TAGS = ('textarea', 'pre', 'code', 'script',)
# Matches a run of two or more whitespace characters, or a single newline.
# A lone space or tab does not match.
RE_WHITESPACE = re.compile(r'\s{2,}|\n')
# Same whitespace rule as above, but only when it is the sole content between
# a ``>`` and the following ``<``.
RE_SPACE_BETWEEN_TAGS = re.compile(r'>(?:\s{2,}|\n)<')
# Matches a whole excluded element, from its opening ``<tag`` through a
# closing ``</tag>``.  The pattern is wrapped in a capturing group so that
# ``re.split`` keeps the matched elements in its result list.
# ``re.DOTALL`` lets ``.`` span newlines; ``re.VERBOSE`` allows the layout
# and inline comments inside the pattern string.
# NOTE(review): the opening alternative has no word boundary, so ``<pre`` also
# matches the start of e.g. ``<preview>``; the closing tag may be any of the
# excluded names rather than the one that was opened; matching is
# case-sensitive (no ``re.IGNORECASE``).
RE_EXCLUDE_TAGS = re.compile(
    """(               # group for results to be included in re.split
          <(?:{0})     # match beginning of one of exclude tags
                       #   e.g. <pre or <textarea
          .*?          # non-greedy match anything inside the tag
                       # until the next group is matched
          </(?:{0})>   # match the closing tag   e.g. </pre>
    )""".format('|'.join(EXCLUDE_TAGS)),
    re.DOTALL | re.VERBOSE)


@text_onion
def simple_minify(html):
    """
    Minify HTML with a very simple algorithm.

    This function tries to minify HTML by collapsing whitespace between
    html tags: a newline, or a run of two or more whitespace characters,
    between ``</div>`` and ``<div>`` becomes a single space
    (``</div> <div>``). The space is not removed entirely since doing so
    can change the rendered HTML (e.g. ``<strong>Hello</strong> <i></i>``).
    In addition to that, this function replaces all redundant whitespace
    (two or more consecutive whitespace characters, or a new line) with a
    single space character, except inside excluded tags such as ``pre``
    or ``textarea``.

    **Thought process**:

    To minify everything except the content of excluded tags in one step
    requires a very complex regular expression. The disadvantage is that
    the regular expression would involve look-behinds and look-aheads.
    Those operations make regex much more resource-hungry, which eats
    precious server resources. In addition, complex regexes are hard to
    understand and can be hard to maintain. That is why this function
    splits the task into multiple steps.

    #. A regex which matches all excluded tags within the html is used to
       split the HTML into components. Since the regex is wrapped inside a
       group, the excluded tags themselves are also included in the
       resulting split list. Due to that it is guaranteed that every odd
       element (if there are any) is an excluded tag.
    #. All the split components are looped over and processed in order to
       construct the final minified HTML.
    #. All odd indexed elements are not processed and are simply appended
       to the final HTML since, as explained above, they are guaranteed to
       be excluded tags and hence must not be minified.
    #. All even indexed elements are stripped of leading and trailing
       whitespace (so whitespace directly adjacent to an excluded tag is
       removed), then whitespace between tags and redundant whitespace in
       general is collapsed via simple regexes.

    Note that the process does not involve parsing HTML since that usually
    adds some overhead (e.g. using beautiful soup). Using 2 regex passes
    achieves a very similar result while performing much better.

    Parameters
    ----------
    html : str
        HTML to minify

    Returns
    -------
    str
        Minified HTML
    """
    components = RE_EXCLUDE_TAGS.split(html)

    # Collect the pieces and join once at the end instead of growing a string
    # with ``+=`` on every iteration.
    minified = []
    for i, component in enumerate(components):
        # Even indexes are regular markup; odd indexes are the excluded tags
        # captured by the regex group and must be emitted verbatim.
        if i % 2 == 0:
            component = component.strip()
            component = RE_SPACE_BETWEEN_TAGS.sub('> <', component)
            component = RE_WHITESPACE.sub(' ', component)
        minified.append(component)

    return ''.join(minified)
class TextExtractorHTMLParser(HTMLParser):
    """
    Custom HTML parser which extracts only text while parsing HTML

    Once the parser parses the HTML, :py:meth:`.get_text`
    can be called which will return extracted text.

    References which cannot be decoded (unknown entity names or character
    references outside of the valid code point range) are kept in the
    extracted text in their literal form instead of aborting the parse.

    .. note::
        On Python versions where ``HTMLParser`` defaults to
        ``convert_charrefs=True``, references outside of ``script``/``style``
        elements are decoded by the base class and delivered through
        :py:meth:`.handle_data`; the reference handlers below are then
        not invoked by the parser for them.
    """

    def __init__(self):
        # Explicit base-class call rather than ``super()`` is kept as in the
        # original; the base class comes from ``six.moves``.
        HTMLParser.__init__(self)
        # Extracted text fragments in document order
        self.result = []

    def handle_data(self, d):
        """
        Handler for data/text in HTML

        This simply adds the data to the results list this class
        maintains of the extracted text.
        """
        self.result.append(d)

    def handle_charref(self, number):
        """
        Handler for processing character references.

        This method handles both decimal (e.g. ``'&gt;' == '&#62;'``)
        and hexadecimal (e.g. ``'&gt;' == '&#x3E;'``) references.
        It does that by simply converting the reference number to an integer
        with appropriate base and then converts that number to a character.

        If the reference cannot be converted to a character (e.g. the
        number is outside of the valid code point range), the reference is
        kept as literal text (``'&#<number>;'``).

        Parameters
        ----------
        number : str
            Reference body without the leading ``&#`` and trailing ``;``
            (e.g. ``'62'`` or ``'x3E'``)

        Returns
        -------
        str
            Text which was appended to the results
        """
        try:
            # ``[:1]`` instead of ``[0]`` so an empty reference ends up in the
            # fallback below instead of raising ``IndexError``
            if number[:1] in ('x', 'X'):
                codepoint = int(number[1:], 16)
            else:
                codepoint = int(number)
            text = six.unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#{0};'.format(number)
        self.result.append(text)
        return text

    def handle_entityref(self, name):
        """
        Handler for processing named entity references.

        This method handles processing HTML entities (e.g. ``'&gt;'``).
        It first maps the entity name to a codepoint which is a unicode
        character number and then converts that number to a unicode character.

        Names missing from ``name2codepoint`` are kept as literal text
        (``'&<name>;'``). The handler does not see how the reference was
        terminated in the input, so ``;`` is always used.

        Parameters
        ----------
        name : str
            Entity name without the leading ``&`` and trailing ``;``
            (e.g. ``'gt'``)

        Returns
        -------
        str
            Text which was appended to the results
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = '&{0};'.format(name)
        else:
            text = six.unichr(codepoint)
        self.result.append(text)
        return text

    def get_text(self):
        """
        Get extracted text after HTML is parsed.

        Returns
        -------
        str
            Extracted text from the HTML document
        """
        return ''.join(self.result)
def html_to_text(html):
    """
    Function to convert HTML text to plain text by stripping all HTML.

    Implementation is based from
    `<http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python>`_.

    Parameters
    ----------
    html : str
        HTML document to extract the text from

    Returns
    -------
    str
        Text content of the HTML with all markup removed
    """
    s = TextExtractorHTMLParser()
    s.feed(html)
    # ``feed`` may hold back trailing input it considers incomplete (for
    # example text ending in an unterminated ``&...`` reference); ``close``
    # forces that buffered data to be processed so it is not silently dropped.
    s.close()
    return s.get_text()