Counting words (etc.) in an HTML file with Python

In a previous post, I wrote about how to count words, characters, and Asian characters using Python.

In this post I want to pull that together with code to get a word count from an HTML file.

What needs counting

What needs counting depends to some extent on what you need the word count for, but here I'm going to assume that the word count will be used to count billable/localizable content.

In that scenario, you've got to count the text in the title tag, as well as the visible text in the body, and certain other localizable content: the alt attributes of img tags, the title attributes of a (anchor) tags, and the value attributes of input tags (am I missing any?).

The Code

The code for counting the actual text is in the post linked above. Here we need code to extract the text from the HTML file, and to accumulate the counts for all the chunks we've extracted.

Here's the Segment class for accumulating counts:

class Segment(object):
    """Represents a text segment.
    (For bookkeeping)

    Holds the character and word statistics of one piece of text, and can
    absorb the statistics of other segments to act as a running total.

    Relies on the helpers is_asian() and non_j_len(), which are not defined
    in this class and must be available in the enclosing module.
    """

    def __init__(self, text=""):
        """Calculate the statistics for text.

        Leave text empty if this will be a master count for a document.

        @param text: The text of the segment
        """

        # Total number of characters, whitespace included
        self.characters = len(text)

        # Characters that are not whitespace
        num_spaces = sum(1 for char in text if char.isspace())
        self.chars_no_spaces = self.characters - num_spaces

        # Characters for which is_asian() is true; each is counted as a word
        self.asian_chars = sum(1 for char in text if is_asian(char))

        # Word count as reported by non_j_len()
        self.non_asian_words = non_j_len(text)

        self.words = self.non_asian_words + self.asian_chars

    def accumulate(self, seg):
        """Add the stats from <seg> to this one.

        Use this to keep a count for the entire document;
        use another for the whole batch of documents.

        @param seg: The segment to accumulate

        >>> seg = Segment(u"")
        >>> seg2 = Segment(u"abc")
        >>> seg.accumulate(seg2)
        >>> seg.words
        1
        >>> seg.characters
        3
        """

        self.words += seg.words
        self.characters += seg.characters
        self.chars_no_spaces += seg.chars_no_spaces
        self.asian_chars += seg.asian_chars
        self.non_asian_words += seg.non_asian_words

Next, the code for extracting (segmenting) the text from an HTML file. For this, you'll need the excellent Beautiful Soup module.

#coding: UTF8
"""Html segmenter"""

from BeautifulSoup import BeautifulSoup as bsoup
from BeautifulSoup import BeautifulStoneSoup
import re

def normalize(text):
    """Normalize whitespace in C{text}.

    Leading and trailing whitespace is removed, and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.

    @param text: The text to normalize
    @return: The normalized text

    >>> normalize(u"   spam\\n\\tspam   SPAM") == u'spam spam SPAM'
    True
    """

    return u' '.join(text.split())

class Segmenter(object):
    """Html segmenter
    Retrieves the editable/translatable text from an HTML document:
    the document title, selected attribute values, and the body text
    split into segments at certain tag boundaries.
    """

    def __init__(self):
        """Set up various regular expressions for splitting the text"""

        # Strip out unsightly tags before heading to the splitter.
        # "\b[^>]*" lets an opening tag carry attributes, so that e.g.
        # <body class="x"> and <script type="text/javascript"> match too.
        # Script elements are removed together with their content; for the
        # other tags only the markup itself is removed.
        self.pre_parse_stripper = re.compile(
            u"|".join([r"</?body\b[^>]*>",
                       r"</?a\b[^>]*>",
                       r"</?img\b[^>]*>",
                       r"</?input\b[^>]*>",
                       r"<script\b[^>]*>[\s\S]*?</script\s*>",
                       r"</?form\b[^>]*>"]),
            re.I)

        # Split segments by certain tags (removing tags in bargain).
        # These tags indicate a segment boundary.  The "\b" keeps <p> from
        # matching <pre> and <br> from matching <b>; "[^>]*" accepts
        # attributes as well as the self-closing form <br />.
        # No capturing groups: split() would return them as extra chunks.
        self.splitter = re.compile(
            u"|".join([r"</?p\b[^>]*>",
                       r"</?div\b[^>]*>",
                       r"</?td\b[^>]*>",
                       r"</?li\b[^>]*>",
                       r"</?h\d\b[^>]*>",
                       r"</?dd\b[^>]*>",
                       r"</?dt\b[^>]*>",
                       r"<br\b[^>]*>"]),
            re.I)

        # Find the charset if necessary.  Meant to be used with search();
        # group 1 is the charset name without any surrounding quotes, taken
        # from inside a single <meta ...> tag.
        self.charset_finder = re.compile(
            r"<meta\b[^>]*?charset\s*=\s*[\"']?([\w.:+-]+)", re.I)

        # Parsed document; set by get_chunks()
        self.soup = None

    def __str__(self):
        """So we can tell which segger we have (assuming multiple segmenter classes)"""
        return "HTML"

    def get_chunks(self, html_text):
        """Extract the text from the HTML file.

        Yields, in order: the document title, the non-empty img alt,
        a title and input value attributes, and then each non-empty
        normalized text segment of the body.

        @param html_text: The content of the HTML document
        """

        self.soup = bsoup(html_text, fromEncoding=self.getEncoding(html_text))

        # document title
        if self.soup.head:
            title = self.soup.head.title
            if title:
                title_text = title.string
                # BeautifulSoup gives None for .string unless the tag holds
                # exactly one string child; skip that (and an empty title)
                # rather than hand consumers a None chunk.
                if title_text:
                    yield title_text

        # image alt attributes, anchor title attributes, input value attributes
        for tag, attr in ((u"img", u"alt"),
                          (u"a", u"title"),
                          (u"input", u"value")):
            for chunk in self.getAttributes(tag, attr):
                if chunk:
                    yield chunk

        # Parse the body text
        if self.soup.body:
            text = self.pre_parse_stripper.sub(u"", unicode(self.soup.body))
            for chunk in self.splitter.split(text):
                normal = normalize(html2plain(chunk))
                if normal:
                    yield normal

    def getAttributes(self, tagName, attrName):
        """Get all attrName values for tagName tags.

        @param tagName: Name of the tags to look for
        @param attrName: Name of the attribute to read from each tag
        @return: List of the non-empty attribute values, in document order
        """

        attrs = []

        for tag in self.soup.findAll(tagName):
            try:
                attr = tag[attrName]
            except KeyError:
                # This tag does not have the attribute; nothing to count
                continue
            if attr:
                attrs.append(attr)

        return attrs

    def getEncoding(self, text):
        """Retrieve the encoding META tag, if present.

        @param text: The content of the HTML document
        @return: The charset name, or None if no charset declaration is found
        """

        found = self.charset_finder.search(text)
        if found:
            return found.group(1)
        return None

# Matches one piece of markup.  The first alternative consumes a whole HTML
# comment, so that a ">" inside the comment does not end the match early and
# leave the rest of the comment behind as text.  The second alternative
# matches any other tag, closing tag or <!...> declaration.
TAG_STRIPPER = re.compile(r"<!--[\s\S]*?-->|<[!\w/][\s\S]*?>")

def strip_tags(line):
    """Strip the HTML tags (and comments) from the line.

    A "<" that is not followed by a letter, digit, underscore, "!" or "/"
    is not treated as the start of a tag and is left in place.

    @param line: The text to strip
    @return: The text with the markup removed

    >>> strip_tags(u"<b>spam</b>") == u'spam'
    True

    """

    return TAG_STRIPPER.sub(u"", line)

def html2plain(text):
    """Convert a fragment of HTML into plain text.

    The tags are removed with strip_tags(), and the result is run through
    BeautifulStoneSoup with HTML entity conversion switched on.  Two escaped
    sequences are then replaced with a literal ">" and "<".

    NOTE(review): the doctests published with this function, and the two
    sequences replaced below, look as if they were altered by HTML escaping
    when the listing was published -- confirm against the htmlseg module.

    @param text: The HTML fragment to convert
    @return: The plain text, as a unicode string
    """

    entity_table = BeautifulStoneSoup.HTML_ENTITIES
    without_tags = strip_tags(text)
    parsed = BeautifulStoneSoup(without_tags, convertEntities=entity_table)

    plain = unicode(parsed)
    for escaped, literal in ((u"&#38;gt;", ">"), (u"&#38;lt;", "<")):
        plain = plain.replace(escaped, literal)
    return plain

And here's some code to get the actual wordcount:

    wordcount = docstats.Segment()
    segger = htmlseg.Segmenter()

    for chunk in segger.get_chunks(open("thefile.html").read()):
        wordcount.accumulate(docstats.Segment(chunk))

Here are the docstats and htmlseg modules, and here is an online tool using the code for the HTML word counts.

2 comments to Counting words (etc.) in an HTML file with Python

  • Sasha

    Very nice work, exactly what I was looking for. I am interested in using this code in GPL/ZPL licensed code like the XLIFFMarshall for Plone. Is it ok and are there any special credits I should add?

  • @Sasha:

    I’d be very happy if you can find the code of use. No special dispensation needed.

Leave a Reply

 

 

 

You can use these HTML tags

<a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>