Making subsequent POST request in session doesn't work - web scraping

没有蜡笔的小新 2021-01-05 16:44

Here's what I'm trying to do: go here, then hit "search". Grab the data, then hit "next", and keep hitting next until you're out of pages. Everything up to hitting "next" works; the subsequent POST request in the session doesn't.

1 Answer
  • 2021-01-05 17:05

    Well, this nearly drove me mental, but it is finally working. You have to make a GET request to get a new __EVENTVALIDATION token for each POST:

    import requests
    from bs4 import BeautifulSoup

    h = {"X-MicrosoftAjax": "Delta = true",
         "X-Requested-With": "XMLHttpRequest",
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
         }

    # Form data for the initial "Search" click.
    d = {
        "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
        "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
        "ctl00$MainContent$physicianSearchView$hfPrefetchUrl": "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
        "ctl00$MainContent$physicianSearchView$hfRemoveUrl": "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
        "__ASYNCPOST": "true"}

    # Form data for the pager: go to page 2 from page 1.
    nxt_d = {
        "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
        "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
        "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
        "__ASYNCPOST": "true",
        "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}

    url = "http://search.cpsa.ca/PhysicianSearch"
    with requests.Session() as s:
        # Initial GET to pull the first __EVENTVALIDATION/__VIEWSTATE pair.
        r = s.get(url, headers=h)
        soup = BeautifulSoup(r.content, "lxml")
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs
        # POST the search itself; this returns page 1 of the results.
        r = s.post(url, data=d, headers=h)
        # GET again for fresh tokens before the next POST.
        soup = BeautifulSoup(s.get(url).content, "lxml")
        ev = soup.select("#__EVENTVALIDATION")[0]["value"]
        vs = soup.select("#__VIEWSTATE")[0]["value"]
        nxt_d["__EVENTVALIDATION"] = ev
        nxt_d["__VIEWSTATE"] = vs
        # POST the pager event to move to page 2.
        r = s.post(url, data=nxt_d, headers=h)
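
    Since every POST needs a fresh token pair, the extraction is worth factoring out. A minimal sketch, assuming the same BeautifulSoup setup as above (the helper name fresh_tokens is my own, not from the original page):

    def fresh_tokens(s, url, headers):
        # GET the page again and pull the current ASP.NET state tokens.
        soup = BeautifulSoup(s.get(url, headers=headers).content, "lxml")
        return (soup.select_one("#__EVENTVALIDATION")["value"],
                soup.select_one("#__VIEWSTATE")["value"])

    Each POST then becomes: refresh the pair, copy it into the form data, post.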
    

    If you open the source from the last POST you will see you have hit page 2. We need to add more logic to get through all the pages; I will add it in a bit.

    The params:

    "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
    "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"
    

    are the page to go to and the page you are coming from, so after a GET for fresh tokens those two values should be all that needs to change.
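
    Advancing through the pages is then just incrementing both values each round. A minimal sketch (the keys mirror nxt_d above; the function name advance_pager is my own):

    def advance_pager(d):
        # 2/1 -> 3/2 -> 4/3 ...: target page and current page both move up one.
        to_key = "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"
        from_key = "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"
        d[to_key] = str(int(d[to_key]) + 1)
        d[from_key] = str(int(d[from_key]) + 1)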

    This will get all the pages, pulling most of the values programmatically. You could probably pull more, especially with the aid of a regex, but it pulls most without hard-coding values:

    from lxml.html import fromstring
    import requests


    class Crawler(object):
        def __init__(self, ua, url):
            self.user_agent = ua
            self.post_header = {"X-MicrosoftAjax": "Delta = true", "X-Requested-With": "XMLHttpRequest", "user-agent": ua}
            self.post_data2 = {"__ASYNCPOST": "true",
                               "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
            self.url = url
            self.post_data1 = {"__ASYNCPOST": "true"}

        def populate(self, xml):
            """Pulls form post data keys and values for the initial post."""
            k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
            k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
            self.post_data1[k1.get("name")] = k1.get("value")
            self.post_data1[k2.get("name")] = k2.get("value")
            self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
            self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]

        def populate2(self, xml):
            """Pulls form post data keys and values
               for all subsequent posts,
               setting initial page number values.
            """
            data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
            self.pge = data[0]
            self.ev = data[1]
            self.post_data2["__EVENTTARGET"] = self.ev
            self.post_data2[self.ev] = "1"
            self.post_data2[self.pge] = "2"

        @staticmethod
        def put_validation(xml, d):
            """Copies a fresh __EVENTVALIDATION/__VIEWSTATE pair into d;
               a new pair is needed for each post.
            """
            ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
            vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
            d["__EVENTVALIDATION"] = ev
            d["__VIEWSTATE"] = vs

        def next_page(self):
            """Increments both page number values by one per iteration."""
            e = self.post_data2[self.ev]
            v = self.post_data2[self.pge]
            self.post_data2[self.pge] = str(int(v) + 1)
            self.post_data2[self.ev] = str(int(e) + 1)

        def start(self):
            with requests.Session() as s:
                # get the initial page to pull __EVENTVALIDATION etc..
                req = s.get(self.url, headers={"user-agent": self.user_agent}).content
                xml = fromstring(req)
                # add __EVENTVALIDATION and __VIEWSTATE to the post data.
                self.put_validation(xml, self.post_data1)
                # populate the rest of the post data.
                self.populate(xml)
                resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
                # yield first page results.
                yield resp
                # fill post data for the next pages.
                self.populate2(resp)
                # when the "next" button is disabled, we have hit the last page.
                nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                while not nxt:
                    # update the __EVENTVALIDATION token and __VIEWSTATE.
                    self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                    # post to get the next page of results.
                    resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                    yield resp
                    # check the freshly fetched page, not the initial one.
                    nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                    self.next_page()


    ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
    url = "http://search.cpsa.ca/PhysicianSearch"
    c = Crawler(ua, url)
    for tree in c.start():
        pass  # use tree
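
    Each yielded tree is the lxml root of one page of results, so the loop above can pull rows with XPath. A sketch, with the caveat that the grid id and cell layout are inferred from the pager selectors above, not verified against the live page:

    for tree in c.start():
        # Assumed id: the same gvResults grid the pager selectors point at.
        rows = tree.xpath("//table[@id='MainContent_physicianSearchView_gvResults']//tr")
        for row in rows:
            cells = [cell.text_content().strip() for cell in row.xpath("./td")]
            if cells:
                print(cells)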
    