微博爬虫及舆情分析-2.用户分析
import pandas as pd import re #读取数据 user_frame = pd . read_csv ( 'users.csv' , index_col = None ) user_frame . head ( 2 ) # 我用的是notebook所以这里只有截图发上来 # location字段整理,保留到省份 user_frame [ 'location' ] = user_frame . location . fillna ( '其他' ) def get_provience ( loc ) : location = str ( loc ) location = re . findall ( r "^[\u4e00-\u9fff][^' ']*" , location ) if not location : return '其他' return location [ 0 ] user_frame [ 'location' ] = user_frame [ 'location' ] . apply ( get_provience ) # 认证用户比例 import matplotlib . pyplot as plt % matplotlib inline plt . rcParams [ 'font.sans-serif' ] = [ 'SimHei'