1. Crawl the danmaku (bullet comments) from Bilibili
2. Display the crawled content
Open the video page and locate the cid and request header using the method taught earlier.
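If you would rather grab the cid programmatically instead of digging through DevTools, here is a minimal sketch; the BV id is a placeholder, and the assumption that the video page source embeds a `"cid":<digits>` field is mine, not the author's, so verify it against the live page.

import re
import requests

# Sketch only: 'BVxxxxxxxxxx' is a placeholder video id, and the '"cid":<digits>'
# pattern is an assumption about the page source, which Bilibili may change.
page = requests.get('https://www.bilibili.com/video/BVxxxxxxxxxx',
                    headers={'User-Agent': 'Mozilla/5.0'})
m = re.search(r'"cid":(\d+)', page.text)
if m:
    print('cid =', m.group(1))  # plug this number into the comment.bilibili.com URL below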
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}  # copy Bilibili's request header from the Network tab of Chrome's developer tools
url = 'http://comment.bilibili.com/125507930.xml'  # example: the danmaku file for cid=125507930
response = requests.get(url=url, headers=header)   # fetch the danmaku XML
response.encoding = response.apparent_encoding     # avoid garbled Chinese text
data = response.text
soup = BeautifulSoup(data, 'lxml')  # parse the XML
dlist = soup.find_all('d')  # every danmaku sits in its own <d> tag
d_list = []
for i in dlist:
    danmu = {}
    danmu['弹幕'] = i.text
    d_list.append(danmu)  # collect each danmaku text into d_list
df = pd.DataFrame(d_list)  # df is a DataFrame with a single '弹幕' column
f = open('sign.txt', 'w', encoding='utf-8')
for p in df['弹幕'].values:
    pat = re.compile(r'[一-龥]+')  # matches runs of Chinese characters only
    filter_data = re.findall(pattern=pat, string=p)  # strip everything that is not Chinese
    f.write(" ".join(filter_data))  # save the cleaned text into sign.txt
f.close()
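For a quick look at what find_all('d') actually returned, you can run this right after the block above; treating the first comma-separated value of the p attribute as the playback time in seconds is a common reading of the undocumented format, not something the site guarantees.

# Runs after the code above; dlist comes from soup.find_all('d').
for d in dlist[:5]:
    p_attr = d.get('p', '')  # metadata string: time, mode, size, color, ... (assumed layout)
    print(p_attr.split(',')[0], d.text)  # assumed: first field is the time offset in seconds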
------------------------------------------------------------------------------------------------------------------------------------------------------
Now use the sign.txt file produced above to render the word cloud.
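Before rendering, a quick optional sanity check that the previous step actually wrote something to sign.txt:

# Optional: confirm the crawl step produced non-empty output.
with open('sign.txt', encoding='utf-8') as check:
    print(check.read()[:100])  # first 100 characters of the cleaned danmaku text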
import jieba
import matplotlib.pyplot as plt
from imageio import imread
from wordcloud import WordCloud

f = open('sign.txt', encoding='utf-8')
data = f.read()
result = " ".join(jieba.lcut(data))  # segment the Chinese text so WordCloud can split on spaces
f.close()
mask_color = imread('XXXX.jpg')  # any image in the project folder can serve as the mask
wc = WordCloud(
    font_path='xxxx',  # path to a font that supports Chinese characters
    mask=mask_color,
    width=xx,   # fill in the desired pixel dimensions
    height=xx,
)
wc.generate(result)
wc.to_file('xxxx.png')  # save the rendered word cloud
plt.imshow(wc)
plt.show()
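The jieba step matters because WordCloud tokenizes on whitespace and Chinese has no spaces between words, so jieba.lcut segments the text first. A tiny illustration (the exact split is dictionary-driven and may vary between jieba versions):

import jieba
print(jieba.lcut("弹幕词云小例子"))  # e.g. ['弹幕', '词云', '小', '例子']; output may differ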
---------------------------------------------------- Final version: the two scripts merged ----------------------------------------------------
To skip the intermediate file and go straight from crawling to the image, merge the two scripts as follows:
# coding: utf-8
import requests
import jieba
import pandas as pd
import re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from imageio import imread
from wordcloud import WordCloud

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
url = 'http://comment.bilibili.com/122593266.xml'  # just swap in the cid of the video you want
response = requests.get(url=url, headers=header)
response.encoding = response.apparent_encoding
data = response.text
soup = BeautifulSoup(data, 'lxml')
list_filter = soup.find_all('d')  # one <d> tag per danmaku
plist = []
for i in list_filter:
    danmu = {}
    danmu['弹幕'] = i.text
    plist.append(danmu)
df = pd.DataFrame(plist)
reslist = []
for p in df['弹幕'].values:
    txtfilter = re.compile(r'[一-龥]+')  # keep only Chinese characters
    res = re.findall(pattern=txtfilter, string=p)
    result = " ".join(res)
    reslist.append(result)
result = " ".join(reslist)
finalResult = " ".join(jieba.lcut(result))  # segment so WordCloud can tokenize on spaces
mask_color = imread('五角星.jpg')  # the mask image is interchangeable; just put it in the project folder
wc = WordCloud(
    font_path=r'C:\Windows\Fonts\STLITI.TTF',  # a font that supports Chinese
    mask=mask_color,
    width=1920,
    height=1080,
    background_color='white'
)
wc.generate(finalResult)
wc.to_file('hunt.png')
plt.imshow(wc)
plt.show()
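Two small hardening touches worth considering around the requests.get call above; the 10-second timeout is an arbitrary choice of mine:

response = requests.get(url=url, headers=header, timeout=10)  # fail fast instead of hanging forever
response.raise_for_status()  # raise immediately on HTTP errors such as 403/404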