from bs4 import BeautifulSoup
import urllib.request

print('Hello world')

# Crawl the w3school SQL tutorial: fetch the index page, then download every
# page linked under /sql, saving each one under its link title.
header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
url = header + follower
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'  # last page of the tutorial (unused for now)
title = '教程'

# while url != end:  # TODO: follow "next page" links instead of scanning the index
print(url)
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')

# Save the prettified index page. utf-8 is explicit because prettify()
# returns a Unicode str and the platform default encoding may differ.
with open(title + '.html', 'w', encoding='utf-8') as fileHandle:
    fileHandle.write(soup.prettify())

for link in soup.find_all('a'):
    follower = link.get('href')
    # get('href') returns None for anchors without an href — guard before startswith.
    if follower and follower.startswith('/sql'):
        print(follower)
        title = link.get('title')
        if title is None:
            # No title attribute: fall back to the path so open() doesn't get None.
            title = follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        # Write raw bytes ('wb'): the page keeps its original encoding untouched.
        with open(title + '.html', 'wb') as fileHandle:
            fileHandle.write(html)
自己写来抓所有的SQL相关内容的东西,用了BeautifulSoup,感觉不错。确实可以开始学Python了,好玩=。=
应该再把html文件前面一段没什么用的东西截掉。然后是想办法把经过JS处理的html网页再扒下来,现在扒下来的都是原始的html网页只是刚好够用。
from bs4 import BeautifulSoup
import urllib.request

print('Hello world')

# Crawl the w3school SQL tutorial again, but this time keep only the
# <div id="maincontent"> of every page and write it back in the site's
# original gbk encoding so browsers render it correctly.
header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
url = header + follower
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'  # last page of the tutorial (unused for now)
title = '教程'

print(url)
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')

for link in soup.find_all('a'):
    follower = link.get('href')
    # get('href') returns None for anchors without an href — guard before startswith.
    if follower and follower.startswith('/sql'):
        print(follower)
        title = link.get('title')
        if title is None:
            # No title attribute: fall back to the path so open() doesn't get None.
            title = follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        tempSoup = BeautifulSoup(html, 'lxml')
        for tempLink in tempSoup.find_all('div'):
            if tempLink.get('id') == 'maincontent':
                # Gotcha: BeautifulSoup parses everything into Unicode, so
                # prettify('gbk') re-encodes to the site's original charset;
                # without it the saved file shows up garbled in a browser.
                # (Renamed from `str` — don't shadow the builtin.)
                content = tempLink.prettify('gbk')
                with open(title + '.html', 'wb') as fileHandle:
                    fileHandle.write(content)
改了一下内容,获取了所有网址主要的那部分。其中比较坑的是编码方式。BeautifulSoup会自动把html解析成Unicode编码,直接输出到文件里再用浏览器打开就是乱码
用prettify改成原来的编码方式就好了。