Creating a Blog Summary in Python?

问题

Is there any good library (or regex magic) which can convert a blog entry into a blog summary? I'd like the summary to display the first four sentences, first paragraph, or first X number of characters... not really sure what would be the best. Ideally, I would like it to keep html formatting tags such as <a>, <b>, <u> and <i>, but it could remove all other html tags, javascript and css.

More specifically, as input I'd give an html string representing an entire blog post. As output, I'd like an html string which contains the first few sentences, paragraph, or X number of characters. With all potentially unsafe html tags removed. In Python please.

回答1:

If you're looking at the HTML, you'll need to parse it. In addition to the aforementioned BeautifulSoup, lxml.html has some nice HTML handling tools.

However, if it's a blog, you may find it even easier to work with RSS/Atom feeds. Feedparser is fantastic and would make it easy. You'd gain compatibility and durability (because RSS is better defined, things will change less), but if the feed doesn't include what you need, it won't help you.

回答2:

I ended up using the gdata library and rolling my own blog summarizer, which uses the gdata library to fetch a Blogspot blog on Google App Engine (wouldn't be hard to port it to other platforms). The code is below. To use it, first set the constant blog_id_constant and then call get_blog_info to return a dictionary with the blog summaries.

I would not trust the code to create summaries of any random blog out there on the internet because it may not remove all unsafe html from the blog feed. However, for a simple blog that you write yourself, the code below should work.

Please feel free to copy but if you see any bugs or would like to make improvements, add them in the comments. (Sorry for the semicolons).

import sys
import os
import logging
import time
import urllib
from HTMLParser import HTMLParser
from django.core.cache import cache
# Import the Blogger API
sys.path.insert(0, 'gdata.zip')
from gdata import service

# Display names for the twelve months, in calendar order (index 0 is January).
Months = [
    "Jan.", "Feb.", "Mar.", "Apr.", "May", "June",
    "July", "Aug.", "Sept.", "Oct.", "Nov.", "Dec.",
]
# Identifier of the Blogger blog to fetch; -1 is a placeholder.
blog_id_constant = -1 # YOUR BLOG ID HERE
# Number of posts shown on one page of the paged summary view.
blog_pages_at_once = 5

# -----------------------------------------------------------------------------
#   Blogger 
class BlogHTMLSummarizer(HTMLParser):
    '''
    An HTML parser which only grabs X number of words and removes
    all tags except for certain safe ones.

    Usage: create an instance, pass the post to feed() and read the
    summary from out_html. Calling close() afterwards additionally closes
    any allowed tag that the input left open.

        max_words: word budget. Once it is reached the text is cut, "..."
            is appended and no further tags or text are emitted (open tags
            are still closed). Below 80 only inline tags are kept.

    Text and attribute values are HTML-escaped on output, event handler
    attributes (on*) are dropped and the contents of <script>/<style> are
    discarded. This is NOT a complete HTML sanitizer (URL schemes are not
    checked, for example); do not rely on it for hostile input.
    '''

    # Allowed elements that never take an end tag. They must not be pushed
    # onto tag_stack, otherwise they would sit on top of it forever and
    # prevent every later closing tag from matching.
    void_tags = ("br", "img")
    # Elements whose text content is code rather than prose.
    skipped_content_tags = ("script", "style")

    def __init__(self, max_words = 80):
        # Call the base initialiser explicitly (HTMLParser is an old-style
        # class on Python 2, so super() is not an option). It calls reset()
        # and, on Python 3, sets up state that reset() alone does not.
        HTMLParser.__init__(self)
        self.max_words = max_words
        self.allowed_tags = ["a", "b", "u", "i", "br", "div", "p", "img", "li", "ul", "ol"]
        if self.max_words < 80:
            # If it's really short, don't include layout tags
            self.allowed_tags = ["a", "b", "u", "i"]
        self.out_html = ""
        self.num_words = 0
        self.no_more_data = False
        self.no_more_tags = False
        self.tag_stack = []
        # Non-zero while inside <script>/<style>
        self.skip_depth = 0

    def _escape(self, text, quote = False):
        # Escape HTML metacharacters; '&' must be replaced first.
        text = text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        if quote:
            text = text.replace('"', "&quot;").replace("'", "&#39;")
        return text

    def _emit_reference(self, ref):
        # Pass an entity/character reference through verbatim.
        if not self.no_more_data and not self.skip_depth:
            self.out_html += ref

    def handle_starttag(self, tag, attrs):
        if tag in self.skipped_content_tags:
            self.skip_depth += 1
            return
        if self.no_more_data or self.skip_depth or tag not in self.allowed_tags:
            return
        parts = [tag]
        for (name, value) in attrs:
            if name.startswith("on"):
                # inline event handlers (onclick, onerror, ...) run script
                continue
            if value is None:
                # attribute written without a value, e.g. <img ismap>
                parts.append(self._escape(name, True))
            else:
                parts.append("%s='%s'"%(self._escape(name, True),
                    self._escape(value, True)))
        self.out_html += "<%s>"%" ".join(parts)
        if tag not in self.void_tags:
            self.tag_stack.append(tag)

    def handle_data(self, data):
        if self.no_more_data or self.skip_depth:
            return
        words = data.split()
        if not words:
            # Whitespace between tags is kept as one separator but is not
            # counted against the word budget.
            if data:
                self.out_html += " "
            return
        remaining = self.max_words - self.num_words
        truncated = len(words) >= remaining
        if truncated:
            words = words[:max(remaining, 0)]
            self.no_more_data = True
        self.num_words += len(words)
        pieces = [self._escape(w) for w in words]
        if truncated:
            pieces.append("...")
        text = " ".join(pieces)
        # Preserve the separation from neighbouring tags/text.
        if data[:1].isspace():
            text = " " + text
        if not truncated and data[-1:].isspace():
            text += " "
        self.out_html += text

    def handle_entityref(self, name):
        # Only called when the parser does not convert references itself
        # (Python 2); without this override "&amp;" etc. would vanish.
        self._emit_reference("&%s;"%name)

    def handle_charref(self, name):
        self._emit_reference("&#%s;"%name)

    def handle_endtag(self, tag):
        if tag in self.skipped_content_tags:
            if self.skip_depth:
                self.skip_depth -= 1
            return
        if self.no_more_data and not self.tag_stack:
            self.no_more_tags = True
        if self.no_more_tags or tag not in self.tag_stack:
            # nothing left to close, or a closing tag we never opened
            return
        # Close everything opened after `tag` as well, so mis-nested input
        # such as <b><i>x</b> still produces balanced output.
        while self.tag_stack:
            open_tag = self.tag_stack.pop()
            self.out_html += "</%s>"%open_tag
            if open_tag == tag:
                break
            logging.warning("mixed up blogger tags")

    def close(self):
        '''
        Finish parsing and close every allowed tag that is still open.
        '''
        try:
            HTMLParser.close(self)
        finally:
            while self.tag_stack:
                self.out_html += "</%s>"%self.tag_stack.pop()

def get_blog_info(short_summary = False, page = 1, year = "", month = "", day = "", post = None):
    '''
    Returns summaries of several recent blog posts to be displayed on the front page
        short_summary: if true, return the 3 newest posts cut to 18 words each.
        page: which page of blog posts to get. Starts at 1.
        year, month, day: when year and month (and optionally day) convert
            to non-zero integers and short_summary is false, return the
            full posts published in that month / on that day instead of a
            page of summaries.
        post: in the month view (no day), keep only the entries that have
            a link containing this string.

    Returns a dictionary with an 'entries' list plus, depending on the
    view, 'detail_view', 'newer_page' and 'older_page'. Returns {} when the
    feed cannot be fetched or the requested month is not a valid date.
    '''
    import calendar  # function scope: only the month view needs it

    blogger_service = service.GDataService()
    blogger_service.source = 'exampleCo-exampleApp-1.0'
    blogger_service.service = 'blogger'
    blogger_service.account_type = 'GOOGLE'
    blogger_service.server = 'www.blogger.com'
    blog_dict = {}

    # Do the common stuff first
    query = service.Query()
    # Format instead of concatenating: the blog id may be configured as an int.
    query.feed = '/feeds/%s/posts/default' % blog_id_constant
    query.order_by = "published"
    blog_dict['entries'] = []

    def get_common_entry_data(entry, summarize_len = None):
        '''
        Convert an entry to a dictionary object.
            summarize_len: word budget for the summary, or None to keep
                the full post content.
        '''
        is_summary = summarize_len is not None
        content = entry.content.text
        if is_summary:
            parser = BlogHTMLSummarizer(summarize_len)
            parser.feed(content or "")
            content = parser.out_html
        # The first 19 characters are 'YYYY-MM-DDTHH:MM:SS'; whatever follows
        # (fractional seconds, UTC offset) is ignored.
        published = time.strptime(entry.published.text[:19], '%Y-%m-%dT%H:%M:%S')
        # An untitled post has no title text; treat it as empty for the link.
        safe_title = (entry.title.text or "").replace(" ","_")
        for c in ":,.<>!@#$%^&*()+-=?/'[]{}\\\"":
            # remove nasty characters
            safe_title = safe_title.replace(c, "")
        link = "%d/%d/%d/%s/"%(published.tm_year, published.tm_mon, published.tm_mday,
            urllib.quote_plus(safe_title))
        return {
                'title':entry.title.text,
                'alllinks':[x.href for x in entry.link] + [link], #including blogger links
                'link':link,
                'content':content,
                'day':published.tm_mday,
                'month':Months[published.tm_mon-1],
                'summary': is_summary,
            }

    def get_blogger_feed(query):
        '''
        Return the feed for query, from the cache when possible.
        Returns None when the download fails.
        '''
        uri = query.ToUri()
        feed = cache.get(uri)
        if not feed:
            logging.info("GET Blogger Page: " + uri)
            try:
                feed = blogger_service.Get(uri)
            except Exception:
                # Deliberately broad: whatever the reason for the failed
                # download, log it with its traceback and report "no feed".
                logging.exception("Cant download blog, rate limited? %s", uri)
                return None
            cache.set(uri, feed, 3600)
        return feed

    def _in_one(a, allBs):
        # Return true if a is in one of allBs
        return any(a in b for b in allBs)

    def _get_int(i):
        # None (TypeError) and non-numeric strings (ValueError) both mean "not given"
        try:
            return int(i)
        except (TypeError, ValueError):
            return None
    (year, month, day) = (_get_int(year), _get_int(month), _get_int(day))

    if not short_summary and year and month:
        # Full posts of a single day, or of a whole month when no day is given
        if day:
            first_day = last_day = day
        else:
            first_day = 1
            try:
                # real length of the month, so we never ask for e.g. Feb 31
                last_day = calendar.monthrange(year, month)[1]
            except ValueError:
                return {}
        query.published_min = "%d-%02d-%02dT00:00:00-08:00"%(year, month, first_day)
        query.published_max = "%d-%02d-%02dT23:59:59-08:00"%(year, month, last_day)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['detail_view'] = True
        entries = [get_common_entry_data(e, None) for e in feed.entry]
        # NOTE(review): post is only honoured in the month view, as in the
        # original code - confirm whether the day view should filter too.
        if post and not day:
            entries = [e for e in entries if _in_one(post, e['alllinks'])]
        blog_dict['entries'] = entries
    elif short_summary:
        # Get a short summary of the newest posts
        query.max_results = str(3)
        query.start_index = str(1)
        feed = get_blogger_feed(query)
        if not feed:
            return {}
        blog_dict['entries'] = [get_common_entry_data(e, 18) for e in feed.entry[:3]]
    else:
        # Get a page of summaries
        try:
            page = int(page)
        except (TypeError, ValueError):
            page = 1
        # pages start at 1; anything lower would give a negative start index
        page = max(page, 1)

        # Get one more than we need so we can see if we have more
        query.max_results = str(blog_pages_at_once + 1)
        query.start_index = str((page - 1)* blog_pages_at_once + 1)
        feed = get_blogger_feed(query)
        if not feed:
            return {}

        has_older = len(feed.entry) > blog_pages_at_once
        if page > 1:
            blog_dict['newer_page'] = str(page-1)
        if has_older:
            blog_dict['older_page'] = str(page+1)
        # Slice into a local list rather than mutating the (cached) feed object
        blog_dict['entries'] = [get_common_entry_data(e, 80)
            for e in feed.entry[:blog_pages_at_once]]

    return blog_dict

回答3:

You will have to parse the HTML. A nice library for doing that is BeautifulSoup. It will allow you to remove specific tags and extract values (the text between tags). The text can then be cut down to four sentences relatively easily, though I'd go for a fixed number of characters, as sentence length might vary a lot.

来源：https://stackoverflow.com/questions/5234996/creating-a-blog-summary-in-python

标签

python

html

blogs