Making subsequent POST request in session doesn't work - web scraping

没有蜡笔的小新 2021-01-05 16:44

Here's what I'm trying to do: go here, then hit "search". Grab the data, then hit "next", and keep hitting next until you're out of pages. Everything up to hitting "next" works; the subsequent POST request in the session doesn't.

1 Answer
  • 2021-01-05 17:05

    Well, this nearly drove me mental, but it is finally working. You have to make a GET request to get a new __EVENTVALIDATION token for each POST:

    import requests
    from bs4 import BeautifulSoup

    h = {"X-MicrosoftAjax": "Delta = true",
         "X-Requested-With": "XMLHttpRequest",
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
         }

    # Form data for the initial "Search" click.
    d = {
        "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
        "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
        "ctl00$MainContent$physicianSearchView$hfPrefetchUrl": "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
        "ctl00$MainContent$physicianSearchView$hfRemoveUrl": "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
        "__ASYNCPOST": "true"}

    # Form data for the pager: go to page 2 from page 1.
    nxt_d = {
        "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
        "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
        "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
        "__ASYNCPOST": "true",
        "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}

    url = "http://search.cpsa.ca/PhysicianSearch"
    with requests.Session() as s:
        # Initial GET to pull the first __EVENTVALIDATION/__VIEWSTATE pair.
        r = s.get(url, headers=h)
        soup = BeautifulSoup(r.content, "lxml")
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs
        # POST the search itself; this returns page 1 of the results.
        r = s.post(url, data=d, headers=h)
        # GET again for fresh tokens before the next POST.
        soup = BeautifulSoup(s.get(url).content, "lxml")
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        nxt_d["__EVENTVALIDATION"] = ev
        nxt_d["__VIEWSTATE"] = vs
        # POST the pager event to move to page 2.
        r = s.post(url, data=nxt_d, headers=h)
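
    Since every POST needs a fresh token pair, the extraction is worth factoring out. A minimal sketch, assuming the same BeautifulSoup setup as above (the helper name fresh_tokens is my own, not from the original page):

    def fresh_tokens(s, url, headers):
        # GET the page again and pull the current ASP.NET state tokens.
        soup = BeautifulSoup(s.get(url, headers=headers).content, "lxml")
        return (soup.select_one("#__EVENTVALIDATION")["value"],
                soup.select_one("#__VIEWSTATE")["value"])

    Each POST then becomes: refresh the pair, copy it into the form data, post.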
    

    If you open the source from the last POST you will see you have hit page 2. We need to add more logic to get through all the pages; I will add it in a bit.

    The params:

    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"
    

    are the page to go to and the page you are coming from, so after a GET for fresh tokens those two values should be all that needs to change.
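
    Advancing through the pages is then just incrementing both values each round. A minimal sketch (the keys mirror nxt_d above; the function name advance_pager is my own):

    def advance_pager(d):
        # 2/1 -> 3/2 -> 4/3 ...: target page and current page both move up one.
        to_key = "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"
        from_key = "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"
        d[to_key] = str(int(d[to_key]) + 1)
        d[from_key] = str(int(d[from_key]) + 1)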

    This will get all the pages, pulling most of the values programmatically. You could probably pull more, especially with the aid of a regex, but it pulls most without hard-coding values:

    from lxml.html import fromstring
    import requests


    class Crawler(object):
        def __init__(self, ua, url):
            self.user_agent = ua
            self.post_header = {"X-MicrosoftAjax": "Delta = true", "X-Requested-With": "XMLHttpRequest", "user-agent": ua}
            self.post_data2 = {"__ASYNCPOST": "true",
                               "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
            self.url = url
            self.post_data1 = {"__ASYNCPOST": "true"}

        def populate(self, xml):
            """Pulls form post data keys and values for the initial post."""
            k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
            k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
            self.post_data1[k1.get("name")] = k1.get("value")
            self.post_data1[k2.get("name")] = k2.get("value")
            self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
            self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]

        def populate2(self, xml):
            """Pulls form post data keys and values
               for all subsequent posts,
               setting initial page number values.
            """
            data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
            self.pge = data[0]
            self.ev = data[1]
            self.post_data2["__EVENTTARGET"] = self.ev
            self.post_data2[self.ev] = "1"
            self.post_data2[self.pge] = "2"

        @staticmethod
        def put_validation(xml, d):
            """Copies a fresh __EVENTVALIDATION/__VIEWSTATE pair into d;
               a new pair is needed for each post.
            """
            ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
            vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
            d["__EVENTVALIDATION"] = ev
            d["__VIEWSTATE"] = vs

        def next_page(self):
            """Increments both page number values by one per iteration."""
            e = self.post_data2[self.ev]
            v = self.post_data2[self.pge]
            self.post_data2[self.pge] = str(int(v) + 1)
            self.post_data2[self.ev] = str(int(e) + 1)

        def start(self):
            with requests.Session() as s:
                # get the initial page to pull __EVENTVALIDATION etc..
                req = s.get(self.url, headers={"user-agent": self.user_agent}).content
                xml = fromstring(req)
                # add __EVENTVALIDATION and __VIEWSTATE to the post data.
                self.put_validation(xml, self.post_data1)
                # populate the rest of the post data.
                self.populate(xml)
                resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
                # yield first page results.
                yield resp
                # fill post data for the next pages.
                self.populate2(resp)
                # when the "next" button is disabled, we have hit the last page.
                nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                while not nxt:
                    # update the __EVENTVALIDATION token and __VIEWSTATE.
                    self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                    # post to get the next page of results.
                    resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                    yield resp
                    # check the freshly fetched page, not the initial one.
                    nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                    self.next_page()


    ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
    url = "http://search.cpsa.ca/PhysicianSearch"
    c = Crawler(ua, url)
    for tree in c.start():
        pass  # use tree
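
    Each yielded tree is the lxml root of one page of results, so the loop above can pull rows with XPath. A sketch, with the caveat that the grid id and cell layout are inferred from the pager selectors above, not verified against the live page:

    for tree in c.start():
        # Assumed id: the same gvResults grid the pager selectors point at.
        rows = tree.xpath("//table[@id='MainContent_physicianSearchView_gvResults']//tr")
        for row in rows:
            cells = [cell.text_content().strip() for cell in row.xpath("./td")]
            if cells:
                print(cells)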
    