# [Python3爬虫] 爬取用户信息和全部微博（pyquery+selenium+mongodb ）

微博爬虫爬取手机版微博会比较简单，而手机版里面微博彩版标准版比较适合爬取。

需要代理IP和代理账号。IP代理最好不要使用免费的（例如西刺）：免费代理多数挂得快、不稳定而且速度慢；速度慢就无法在超时设置的规定时间内加载出网页，导致出现异常，程序终止。

微博账号购买：

IP代理商：http://h.zhimaruanjian.com/getapi/#obtain_ip

数据存入MongoDB中，确保MongoDB已安装并且可以连接成功

主要步骤：
- 测试代理ip
- 模拟登陆
- 获取用户基本信息
- 获取用户所有微博

proxy = "http://58.218.200.248:3178"  # proxy address; replace with a working proxy of your own


def check_ip():
    """Check whether the configured proxy is usable; exit the script if not.

    Requests http://ip111.cn/ through ``proxy`` with a 3-second timeout and
    prints the text of the first card's header and first body paragraph.
    On any failure an error message is printed and ``exit(0)`` is called.
    """
    print(r'正在检查代理IP是否可用...')
    proxies = {
        'http': proxy,
        'https': proxy,
    }
    print('当前测试的代理IP为：' + proxy)
    print('...')
    print('测试结果：')
    try:
        # Short timeout so that slow, unstable proxies are rejected early.
        r = requests.get('http://ip111.cn/', proxies=proxies, timeout=3)
        r.encoding = 'utf-8'
        doc = pq(r.text)
        card = doc('body > div.container > div.card-deck.mb-3.text-center')
        ip = str(card.find('div:nth-child(1) > div.card-header').text()) + \
            " : " + str(card.find('div:nth-child(1) > div.card-body > p:nth-child(1)').text())
        print(ip)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C still
        # interrupts the check instead of being reported as a bad proxy.
        print("抱歉，此IP无法使用，请更换IP重试")
        exit(0)

这里IP肯定是不能直接使用的，需要自己找可用的代理IP

def log_in():
    """Start Chrome through the proxy and log in to the Weibo mobile site.

    Stores the driver in the module-level ``browser`` so the other functions
    can reuse the logged-in session. Account and password are placeholder
    strings that must be replaced before running. Errors during the login
    steps are reported with a message; nothing is raised to the caller.
    """
    global browser

    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--proxy-server={0}".format(proxy))  # route traffic through the proxy
    # Chrome only parses arguments that start with "--" as switches; without
    # the prefix the language and user-agent settings are not applied.
    chromeOptions.add_argument('--lang=zh_CN.UTF-8')
    chromeOptions.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    )
    browser = webdriver.Chrome(chrome_options=chromeOptions)

    try:
        print(u'正在登陆新浪微博手机端...')
        # Login page of the mobile site.
        url = 'https://passport.weibo.cn/signin/login'
        browser.get(url)
        time.sleep(3)

        # Locate the user-name field, clear it, and type the account.
        username = browser.find_element_by_css_selector('#loginName')
        time.sleep(2)
        username.clear()
        username.send_keys('输入自己的账号')  # replace with your own account

        # Locate the password field and type the password.
        password = browser.find_element_by_css_selector('#loginPassword')
        time.sleep(2)
        password.send_keys('输入自己的密码')  # replace with your own password

        # Submit the form.
        browser.find_element_by_css_selector('#loginAction').click()

        # The 15-second pause matters: after clicking, Weibo shows a captcha
        # that is solved by hand here rather than in code.
        time.sleep(15)
        print('完成登陆!')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C is not swallowed.
        print('登录出现错误，请检查网速或者代理IP是否稳定!!!')

 # Takes the user's uid, e.g. for https://weibo.cn/6518605376/info the uid is 6518605376.
def get_basic_info(id):
    """Scrape a user's profile, store it in MongoDB, and return the page count.

    Loads ``http://weibo.cn/<id>/info`` and the user's home page with the
    module-level ``browser``, collects the profile fields into one document
    (``_id`` is the constant '基本信息') and inserts it into ``collection``.
    Also sets the module-level ``url`` to the user's home page URL.

    Args:
        id: The user's uid as a string.

    Returns:
        The total number of weibo pages as a string; '1' when the home page
        has no pager text of the form "current/total".
    """
    global url

    profile = {
        '_id': '基本信息'
    }
    url = 'http://weibo.cn/' + id

    try:
        browser.get(url + '/info')  # the page may fail to load here
    except TimeoutError:
        # NOTE(review): Selenium's own TimeoutException is not a subclass of
        # the builtin TimeoutError, so it would reach the generic handler
        # below -- confirm which exception is intended.
        print("请求超时，节点可能不太稳定，请跟换节点")
        exit(0)
    except Exception:
        print("出现错误，错误uid")
        exit(0)

    doc = pq(browser.page_source, parser='html')
    info_lines = str(doc('body > div:nth-child(7)').text()).split('\n', 1)

    # The first line is "<label>:<nickname>"; split once so a nickname that
    # itself contains ':' is kept whole.
    nickname = info_lines[0]
    print(nickname)
    nickname_parts = nickname.split(':', 1)
    profile[nickname_parts[0]] = nickname_parts[1]
    profile['uid'] = id

    # Remaining lines, with the trailing five characters dropped.
    other_info = info_lines[1].strip()[:-5]

    # Store the complete avatar URL. Splitting the "头像:<url>" string on
    # every ':' would keep only the URL scheme.
    avatar = str(doc('body > div:nth-child(4) > img').attr('src'))
    print('头像:' + avatar)
    profile['头像'] = avatar

    rank = doc('body > div:nth-child(5)').text()
    rank = str(rank).split('\n', 1)[0].split('：')[1].strip()[:2]
    print('会员等级:' + rank)
    profile['会员等级'] = rank

    # Normalise full-width colons, then store each "key:value" line.
    other_info = other_info.replace('：', ':')
    print(other_info)
    for line in other_info.strip().split('\n'):
        key, sep, value = line.partition(':')
        if not sep:
            # A line without a separator carries no key/value pair.
            continue
        profile[key] = value

    browser.get(url)
    doc = pq(browser.page_source, parser='html')

    # Counter text before the first '分', made of tokens shaped "label[count]".
    follow_and_fans = str(doc('body > div.u > div').text()).strip().split('分')[0]
    for token in follow_and_fans.split():
        label, sep, rest = token.partition('[')
        if not sep:
            continue
        profile[label + '数'] = rest.split(']')[0]

    try:
        collection.insert_one(profile)
    except Exception:
        # Best effort: report the failure and still return the page count so
        # the caller can go on. Previously an ``exit(0)`` here was silently
        # discarded by a ``return`` inside ``finally``; the effective
        # behaviour of continuing is kept and made explicit.
        print('基本信息存储进mongodb出现错误')

    # Pager text has the form "current/total<suffix char>"; without a pager
    # there is a single page.
    pager_parts = str(doc('#pagelist > form > div').text()).split('/')
    if len(pager_parts) < 2:
        return '1'
    return pager_parts[1][:-1]

def get_weibo(tot_page):
    """Crawl every weibo page of the current user and store each post.

    Uses the module-level ``browser``, ``url`` and ``collection``. For each
    page from 1 to ``tot_page`` the ``.c`` blocks are read; the first block
    and the last two are skipped and the rest are stored as posts. A failed
    insert is reported and the crawl continues.

    Args:
        tot_page: Total number of pages (anything ``int()`` accepts).
    """
    for page_no in range(1, int(tot_page) + 1):
        browser.get(url + '?page=' + str(page_no))
        doc = pq(browser.page_source, parser='html')

        blocks = doc('.c')
        total = len(blocks)
        # Only total - 3 blocks are posts: drop the first block and the last
        # two. The max() keeps the slice empty when there are fewer than three.
        posts = list(blocks.items())[1:max(total - 2, 1)]

        for post_no, post in enumerate(posts, start=1):
            print("正在爬取第" + str(page_no) + "页，第" + str(post_no) + "条微博...")

            weibo = {}
            weibo['_id'] = post.find('div > span.ct').text()
            weibo['info'] = post.find('.ctt').text()
            weibo['img'] = post.find('div:nth-child(2) > a:nth-child(2)').attr('href')

            # The action links sit at different positions depending on
            # whether the image link was found.
            if weibo['img']:
                weibo['active'] = str(post.find('div:nth-child(2) > a:nth-child(4)').text()) + '，' + str(post.find('div:nth-child(2) > a:nth-child(5)').text()) + '，' + str(post.find('.cc').text())
            else:
                weibo['active'] = str(post.find('div > a:nth-child(3)').text()) + '，' + str(post.find('div > a:nth-child(4)').text()) + '，' + str(post.find('.cc').text())

            try:
                collection.insert_one(weibo)
                print("第" + str(page_no) + "页，第" + str(post_no) + "条微博存储成功...")
            except Exception:
                # ``Exception`` rather than a bare ``except`` so Ctrl-C can
                # still stop a long crawl.
                print("第" + str(page_no) + "页，第" + str(post_no) + "条微博存储失败!!!")

            print('')
            time.sleep(0.5)