from bs4 import BeautifulSoup
import urllib.request

print('Hello world')

# Crawl the w3school SQL tutorial: fetch the index page, then download every
# page linked under /sql, saving each one under its link title.
header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
url = header + follower
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'  # last page of the tutorial (unused for now)
title = '教程'

# while url != end:  # TODO: follow "next page" links instead of scanning the index
print(url)
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')

# Save the prettified index page. utf-8 is explicit because prettify()
# returns a Unicode str and the platform default encoding may differ.
with open(title + '.html', 'w', encoding='utf-8') as fileHandle:
    fileHandle.write(soup.prettify())

for link in soup.find_all('a'):
    follower = link.get('href')
    # get('href') returns None for anchors without an href — guard before startswith.
    if follower and follower.startswith('/sql'):
        print(follower)
        title = link.get('title')
        if title is None:
            # No title attribute: fall back to the path so open() doesn't get None.
            title = follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        # Write raw bytes ('wb'): the page keeps its original encoding untouched.
        with open(title + '.html', 'wb') as fileHandle:
            fileHandle.write(html)
自己写来抓所有的SQL相关内容的东西,用了BeautifulSoup,感觉不错。确实可以开始学Python了,好玩=。=
应该再把html文件前面一段没什么用的东西截掉。然后是想办法把经过JS处理的html网页再扒下来,现在扒下来的都是原始的html网页只是刚好够用。
from bs4 import BeautifulSoup
import urllib.request

print('Hello world')

# Crawl the w3school SQL tutorial again, but this time keep only the
# <div id="maincontent"> of every page and write it back in the site's
# original gbk encoding so browsers render it correctly.
header = 'http://www.w3school.com.cn'
follower = '/sql/index.asp'
url = header + follower
end = 'http://www.w3school.com.cn/sql/sql_summary.asp'  # last page of the tutorial (unused for now)
title = '教程'

print(url)
response = urllib.request.urlopen(url)
html = response.read()
soup = BeautifulSoup(html, 'lxml')

for link in soup.find_all('a'):
    follower = link.get('href')
    # get('href') returns None for anchors without an href — guard before startswith.
    if follower and follower.startswith('/sql'):
        print(follower)
        title = link.get('title')
        if title is None:
            # No title attribute: fall back to the path so open() doesn't get None.
            title = follower.rsplit('/', 1)[-1]
        url = header + follower
        response = urllib.request.urlopen(url)
        html = response.read()
        tempSoup = BeautifulSoup(html, 'lxml')
        for tempLink in tempSoup.find_all('div'):
            if tempLink.get('id') == 'maincontent':
                # Gotcha: BeautifulSoup parses everything into Unicode, so
                # prettify('gbk') re-encodes to the site's original charset;
                # without it the saved file shows up garbled in a browser.
                # (Renamed from `str` — don't shadow the builtin.)
                content = tempLink.prettify('gbk')
                with open(title + '.html', 'wb') as fileHandle:
                    fileHandle.write(content)
改了一下内容,获取了所有网址主要的那部分。其中比较坑的是编码方式。BeautifulSoup会自动把html解析成Unicode编码,直接输出到文件里再用浏览器打开就是乱码
用prettify改成原来的编码方式就好了。