Requiring assistance in debugging a Python web crawler

Submitted by 孤街醉人 on 2019-12-11 15:07:29

Question


I haven't been able to get a crawler (named searchengine.py) to run despite my best efforts over the past couple of hours. It seems it cannot successfully index the pages as it goes. The full crawler code is included below. The kind of error I'm receiving looks like this:

Indexing http://www.4futureengineers.com/company.html
Could not parse page http://www.4futureengineers.com/company.html

I am calling searchengine.py by entering the following commands in my Python interactive session (shell).

>>> import searchengine
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages= \
... ['http://www.4futureengineers.com/company.html']
>>> crawler.crawl(pages)

It produces the errors, i.e. the unsuccessful parsing, immediately after the crawler.crawl(pages) command.

Here is the complete source code of searchengine.py

import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite


# Create a list of words to ignore
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}


class crawler:
  # Initialize the crawler with the name of database
  def __init__(self,dbname):
    self.con=sqlite.connect(dbname)

  def __del__(self):
    self.con.close()

  def dbcommit(self):
    self.con.commit()


  # Auxiliary function for getting an entry id and adding 
  # it if it's not present
  def getentryid(self,table,field,value,createnew=True):
    cur=self.con.execute(
    "select rowid from %s where %s='%s'" % (table,field,value))
    res=cur.fetchone()
    if res==None:
      cur=self.con.execute(
      "insert into %s (%s) values ('%s')" % (table,field,value))
      return cur.lastrowid
    else:
      return res[0]


  # Index an individual page
  def addtoindex(self,url,soup):
    if self.isindexed(url): return
    print 'Indexing '+url

    # Get the individual words
    text=self.gettextonly(soup)
    words=self.separatewords(text)

    # Get the URL id
    urlid=self.getentryid('urllist','url',url)

    # Link each word to this url
    for i in range(len(words)):
      word=words[i]
      if word in ignorewords: continue
      wordid=self.getentryid('wordlist','word',word)
      self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))


  # Extract the text from an HTML page (no tags)
  def gettextonly(self,soup):
    v=soup.string
    if v==Null:   
      c=soup.contents
      resulttext=''
      for t in c:
        subtext=self.gettextonly(t)
        resulttext+=subtext+'\n'
      return resulttext
    else:
      return v.strip()

  # Separate the words by any non-whitespace character
  def separatewords(self,text):
    splitter=re.compile('\\W*')
    return [s.lower() for s in splitter.split(text) if s!='']



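  # Return True if this URL is already in urllist and has at least
  # one entry in wordlocation (i.e. it has actually been indexed)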
  def isindexed(self,url):
    u=self.con.execute \
      ("select rowid from urllist where url='%s'" % url).fetchone()
    if u!=None:
      #Check if it has actually been crawled
      v=self.con.execute(
      'select * from wordlocation where urlid=%d' % u[0]).fetchone()
      if v!=None: return True
    return False



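  # Starting with a list of pages, do a breadth-first search to the
  # given depth, indexing pages as we go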
  def crawl(self,pages,depth=2):
    for i in range(depth):
      newpages={}
      for page in pages:
        try:
          c=urllib2.urlopen(page)
        except:
          print "Could not open %s" % page
          continue

        try:
          soup=BeautifulSoup(c.read())
          self.addtoindex(page,soup)

          links=soup('a')
          for link in links:
            if ('href' in dict(link.attrs)):
              url=urljoin(page,link['href'])
              if url.find("'")!=-1: continue
              url=url.split('#')[0]  # remove location portion
              if url[0:4]=='http' and not self.isindexed(url):
                newpages[url]=1
              linkText=self.gettextonly(link)
              self.addlinkref(page,url,linkText)

          self.dbcommit()
        except:
          print "Could not parse page %s" % page


      pages=newpages



  # Create the database tables
  def createindextables(self): 
    self.con.execute('create table urllist(url)')
    self.con.execute('create table wordlist(word)')
    self.con.execute('create table wordlocation(urlid,wordid,location)')
    self.con.execute('create table link(fromid integer,toid integer)')
    self.con.execute('create table linkwords(wordid,linkid)')
    self.con.execute('create index wordidx on wordlist(word)')
    self.con.execute('create index urlidx on urllist(url)')
    self.con.execute('create index wordurlidx on wordlocation(wordid)')
    self.con.execute('create index urltoidx on link(toid)')
    self.con.execute('create index urlfromidx on link(fromid)')
    self.dbcommit()

Answer 1:


The error handling in crawl has made debugging extremely difficult:

try:
    # too much stuff here
except: # bare except
    print "Could not parse page %s" % page # generic message

Although very stable (i.e. if anything goes wrong the program keeps running), this makes it impossible to figure out what is going wrong: all you know is that one of the thirteen lines in the try block failed somehow. Refactor this section of the code into shorter try blocks that test for specific errors (see "the evils of except"), as sketched below.
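For example, the loop body could be split so that each step sits in its own narrow try block and reports what actually failed. This is only a rough sketch along those lines, reusing the names from the question's code plus the standard traceback module; it is not a drop-in replacement (the linkText/addlinkref handling from the original is omitted):

  def crawl(self,pages,depth=2):
    for i in range(depth):
      newpages={}
      for page in pages:
        # Narrow try block: only the network fetch, caught as a specific error
        try:
          c=urllib2.urlopen(page)
        except urllib2.URLError,e:
          print "Could not open %s: %s" % (page,e)
          continue

        # Narrow try block: only the parse/index step; print the real
        # traceback rather than a generic message so the failing line shows up
        try:
          soup=BeautifulSoup(c.read())
          self.addtoindex(page,soup)
        except Exception:
          import traceback  # or add this import at the top of the module
          print "Could not parse page %s" % page
          traceback.print_exc()
          continue

        # Link extraction is deliberately left outside any try block while
        # debugging, so unexpected errors surface immediately
        for link in soup('a'):
          if ('href' in dict(link.attrs)):
            url=urljoin(page,link['href'])
            if url.find("'")!=-1: continue
            url=url.split('#')[0]  # remove location portion
            if url[0:4]=='http' and not self.isindexed(url):
              newpages[url]=1
            # (the linkText / addlinkref lines from the original would go here)

        self.dbcommit()
      pages=newpages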

You could also try running without any error handling at all (comment out the try:, except:, and print ... lines and dedent the lines currently in the try block), read the specific error tracebacks to work out what is failing, and then put appropriate error handling back in later.
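As a concrete sketch of that approach (again reusing the question's names), the temporary debugging version of crawl would simply be:

  def crawl(self,pages,depth=2):
    # Temporary debugging version: no try/except at all, so the first
    # failure stops the crawl with a full traceback for the exact line
    for i in range(depth):
      newpages={}
      for page in pages:
        c=urllib2.urlopen(page)
        soup=BeautifulSoup(c.read())
        self.addtoindex(page,soup)
        # ... the link-extraction lines from the original go here,
        # dedented one level out of the removed try block ...
        self.dbcommit()
      pages=newpages

With this version, crawler.crawl(pages) in the interactive session stops at the first real error and prints its traceback instead of the generic "Could not parse page" message.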



Source: https://stackoverflow.com/questions/21330438/requiring-assistance-in-debugging-a-python-web-crawler
