from __future__ import print_function, unicode_literals
import re
import six
from six.moves.html_entities import name2codepoint
from six.moves.html_parser import HTMLParser
from .string import text_onion
EXCLUDE_TAGS = ('textarea', 'pre', 'code', 'script',)
RE_WHITESPACE = re.compile(r'\s{2,}|\n')
RE_SPACE_BETWEEN_TAGS = re.compile(r'>(?:\s{2,}|\n)<')
RE_EXCLUDE_TAGS = re.compile(
"""( # group for results to be included in re.split
<(?:{0}) # match beginning of one of exclude tags
# e.g. <pre or <textarea
.*? # non-greedy match anything inside the tag
# until the next group is matched
</(?:{0})> # match the closing tag e.g. </pre>
)""".format('|'.join(EXCLUDE_TAGS)),
re.DOTALL | re.VERBOSE)
@text_onion
def simple_minify(html):
    """
    Minify HTML with very simple algorithm.

    This function tries to minify HTML by stripping most spaces between all
    html tags (e.g. ``</div>   <div>`` -> ``</div> <div>``). Note that not
    all spaces are removed since sometimes that can adjust rendered HTML
    (e.g. ``<strong>Hello</strong> <i></i>``).

    In addition to that, this function replaces all whitespace
    (two or more consecutive whitespace characters, or a new line)
    with a space character except inside excluded tags such as ``pre``
    or ``textarea``.

    **Thought process**:

    To minify everything except content of excluded tags in one step requires
    a very complex regular expression. The disadvantage is the regular
    expression will involve look-behinds and look-aheads. Those operations
    make regex much more resource-hungry which eats precious server
    resources. In addition, complex regex are hard to understand and can be
    hard to maintain. That is why this function splits the task into multiple
    sections.

    #. A regex expression which matches all exclude tags within the html is
       used to split the HTML into components. Since the regex expression is
       wrapped inside a group, the content of the exclude tags is also
       included inside the resulting split list. Due to that it is guaranteed
       that every odd element (if there are any) will be the excluded tags.
    #. All the split components are looped and processed in order to
       construct the final minified HTML.
    #. All odd indexed elements are not processed and are simply appended to
       the final HTML since, as explained above, they are guaranteed to be
       content of excluded tags hence do not require minification.
    #. All even indexed elements are minified by stripping whitespace between
       tags and redundant whitespace is stripped in general via simple regex.

    You can notice that the process does not involve parsing HTML since that
    usually adds some overhead (e.g. using beautiful soup). By using 2 regex
    passes this achieves very similar result which performs much better.

    :param html: HTML string to minify.
    :return: minified HTML string.
    """
    components = RE_EXCLUDE_TAGS.split(html)

    # Accumulate parts and join once at the end; repeated ``+=`` on a str
    # is quadratic in the worst case.
    minified = []
    for i, component in enumerate(components):
        if i % 2 == 0:
            # Even-indexed components are regular markup: safe to collapse.
            component = component.strip()
            component = RE_SPACE_BETWEEN_TAGS.sub('> <', component)
            component = RE_WHITESPACE.sub(' ', component)
        # Odd-indexed components are guaranteed to be excluded-tag content
        # (see regex group above) and are appended verbatim.
        minified.append(component)

    return ''.join(minified)
def html_to_text(html):
    """
    Convert HTML text to plain text by stripping all HTML tags.

    Implementation is based on
    `<http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python>`_.

    :param html: HTML string to strip tags from.
    :return: plain-text content as extracted by ``TextExtractorHTMLParser``.
    """
    parser = TextExtractorHTMLParser()
    parser.feed(html)
    return parser.get_text()