Question
I can't get a crawler (named searchengine.py) to run despite my best efforts over the past couple of hours. It seems it cannot successfully index pages as it goes. I will give you the full crawler code. The errors I'm receiving look like this:
Indexing http://www.4futureengineers.com/company.html
Could not parse page http://www.4futureengineers.com/company.html
I am calling searchengine.py by entering the following commands in my Python interactive session (shell).
>>> import searchengine
>>> crawler=searchengine.crawler('searchindex.db')
>>> pages= \
... ['http://www.4futureengineers.com/company.html']
>>> crawler.crawl(pages)
The errors, i.e. the unsuccessful parsing messages, start appearing right after the command crawler.crawl(pages).
Here is the complete source code of searchengine.py:
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite

# Create a list of words to ignore
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}

class crawler:
    # Initialize the crawler with the name of the database
    def __init__(self,dbname):
        self.con=sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Auxiliary function for getting an entry id and adding
    # it if it's not present
    def getentryid(self,table,field,value,createnew=True):
        cur=self.con.execute(
            "select rowid from %s where %s='%s'" % (table,field,value))
        res=cur.fetchone()
        if res==None:
            cur=self.con.execute(
                "insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]

    # Index an individual page
    def addtoindex(self,url,soup):
        if self.isindexed(url): return
        print 'Indexing '+url

        # Get the individual words
        text=self.gettextonly(soup)
        words=self.separatewords(text)

        # Get the URL id
        urlid=self.getentryid('urllist','url',url)

        # Link each word to this url
        for i in range(len(words)):
            word=words[i]
            if word in ignorewords: continue
            wordid=self.getentryid('wordlist','word',word)
            self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))

    # Extract the text from an HTML page (no tags)
    def gettextonly(self,soup):
        v=soup.string
        if v==Null:
            c=soup.contents
            resulttext=''
            for t in c:
                subtext=self.gettextonly(t)
                resulttext+=subtext+'\n'
            return resulttext
        else:
            return v.strip()

    # Separate the words by any non-word (non-alphanumeric) character
    def separatewords(self,text):
        splitter=re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s!='']

    def isindexed(self,url):
        u=self.con.execute \
            ("select rowid from urllist where url='%s'" % url).fetchone()
        if u!=None:
            # Check if it has actually been crawled
            v=self.con.execute(
                'select * from wordlocation where urlid=%d' % u[0]).fetchone()
            if v!=None: return True
        return False

    def crawl(self,pages,depth=2):
        for i in range(depth):
            newpages={}
            for page in pages:
                try:
                    c=urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                try:
                    soup=BeautifulSoup(c.read())
                    self.addtoindex(page,soup)

                    links=soup('a')
                    for link in links:
                        if ('href' in dict(link.attrs)):
                            url=urljoin(page,link['href'])
                            if url.find("'")!=-1: continue
                            url=url.split('#')[0]  # remove location portion
                            if url[0:4]=='http' and not self.isindexed(url):
                                newpages[url]=1
                            linkText=self.gettextonly(link)
                            self.addlinkref(page,url,linkText)

                    self.dbcommit()
                except:
                    print "Could not parse page %s" % page

            pages=newpages

    # Create the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
Answer 1:
The error handling in crawl has made debugging extremely difficult:
try:
    # too much stuff here
except:                                      # bare except
    print "Could not parse page %s" % page   # generic message
Although this is very stable (if anything goes wrong, the program just keeps running), it makes it impossible to figure out what is actually failing: all you know is that one of the thirteen lines in the try block went wrong somehow. Refactor this section of the code into shorter try blocks that test for specific errors (see "the evils of except").
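As a rough sketch of what that refactoring might look like (this is illustrative, not code from the book; the urllib2.URLError and HTMLParseError exception types are assumptions, and the right types depend on your BeautifulSoup and parser versions), crawl could be split so that each try block wraps only the call that can plausibly fail:

from HTMLParser import HTMLParseError   # assumption: parse failures surface as this type

# Inside the crawler class:
def crawl(self,pages,depth=2):
    for i in range(depth):
        newpages={}
        for page in pages:
            try:
                c=urllib2.urlopen(page)
            except urllib2.URLError:
                # Only network/URL problems are caught here
                print "Could not open %s" % page
                continue
            try:
                soup=BeautifulSoup(c.read())
            except HTMLParseError, e:
                # Only genuine parse failures are caught here
                print "Could not parse page %s: %s" % (page,e)
                continue
            # No blanket try/except around the indexing code: any bug in
            # addtoindex or its helpers now raises a full traceback.
            self.addtoindex(page,soup)
            for link in soup('a'):
                if 'href' not in dict(link.attrs): continue
                url=urljoin(page,link['href'])
                if url.find("'")!=-1: continue
                url=url.split('#')[0]   # remove location portion
                if url[0:4]=='http' and not self.isindexed(url):
                    newpages[url]=1
                linkText=self.gettextonly(link)
                self.addlinkref(page,url,linkText)
            self.dbcommit()
        pages=newpages

With that split, a genuine download or parse failure still produces a friendly message, while a bug anywhere in the indexing code raises a full traceback that names the failing line.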
You could also try running without any error handling at all, as sketched below: comment out the try:, except:, and print ... lines, dedent the lines currently in the try block, and read the resulting error tracebacks to help you along, then put appropriate error handling back in later.
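Concretely, as a temporary debugging sketch, the body of the second try block in crawl would be commented out and dedented like this (the statements themselves are unchanged from your code):

# Inside the "for page in pages:" loop of crawl(), with the try/except
# commented out and its body dedented one level:

# try:
soup=BeautifulSoup(c.read())
self.addtoindex(page,soup)
links=soup('a')
for link in links:
    if ('href' in dict(link.attrs)):
        url=urljoin(page,link['href'])
        if url.find("'")!=-1: continue
        url=url.split('#')[0]   # remove location portion
        if url[0:4]=='http' and not self.isindexed(url):
            newpages[url]=1
        linkText=self.gettextonly(link)
        self.addlinkref(page,url,linkText)
self.dbcommit()
# except:
#     print "Could not parse page %s" % page

Run crawler.crawl(pages) again and the next failure will print a full traceback naming the exact line and exception type (for example a NameError for an undefined name), which is far more useful than the generic "Could not parse page" message.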
Source: https://stackoverflow.com/questions/21330438/requiring-assistance-in-debugging-a-python-web-crawler