Here's what I'm trying to do: go here, then hit "search". Grab the data, then hit "next", and keep hitting next until you're out of pages. Everything up to hitting "next" works.
Well, this nearly drove me mental, but it is finally working. You have to make a GET request to get a new __EVENTVALIDATION token for each POST:
import requests
from bs4 import BeautifulSoup
h = {"X-MicrosoftAjax": "Delta = true",
"X-Requested-With": "XMLHttpRequest",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
}
"ctl00$ctl13 | ctl00$MainContent$physicianSearchView$btnSearch"
# Post data for the initial search.
d = {"ctl00$ctl13": "ctl00$MainContent$physicianSearchView$btnSearch",
     "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$btnSearch",
     "ctl00$MainContent$physicianSearchView$hfPrefetchUrl": "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=",
     "ctl00$MainContent$physicianSearchView$hfRemoveUrl": "http://service.cpsa.ca/OnlineService/OnlineService.svc/Services/GetAlbertaCities?name=%QUERY",
     "__ASYNCPOST": "true"}
# Post data to move from page 1 to page 2 of the results.
nxt_d = {"ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager",
         "ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
         "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1",
         "__ASYNCPOST": "true",
         "__EVENTTARGET": "ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
url = "http://search.cpsa.ca/PhysicianSearch"
with requests.Session() as s:
    # Initial GET to pull the first __EVENTVALIDATION and __VIEWSTATE tokens.
    r = s.get(url, headers=h)
    soup = BeautifulSoup(r.content, "lxml")
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    d["__EVENTVALIDATION"] = ev
    d["__VIEWSTATE"] = vs
    # POST the search to get the first page of results.
    r = s.post(url, data=d, headers=h)
    # GET again for fresh tokens before the next POST.
    soup = BeautifulSoup(s.get(url).content, "lxml")
    ev = soup.select("#__EVENTVALIDATION")[0]["value"]
    vs = soup.select("#__VIEWSTATE")[0]["value"]
    nxt_d["__EVENTVALIDATION"] = ev
    nxt_d["__VIEWSTATE"] = vs
    # POST the pager data to move to page 2.
    r = s.post(url, data=nxt_d, headers=h)
If you open the source from the last POST, you will see you hit page 2. We need to add more logic to get through all the pages; I will add it in a bit.
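For a quick check, you can dump the response body to a file and search it for the grid markup (a minimal sketch; r is the response from the last POST above):
with open("page2.html", "wb") as f:
    # Save the raw async response so it can be inspected in an editor. The
    # MS AJAX "delta" response is pipe-delimited, but the updated grid's
    # HTML is embedded in it, so a plain text search for the pager works.
    f.write(r.content)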
The params:
"ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager": "2",
"ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager": "1"
are the page to go to and the page you are coming from, so after a GET those should be all that needs to change.
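For example, still inside the session from the snippet above, a loop along these lines should walk the pager (a rough sketch, assuming s, url, h and nxt_d are still in scope, and that the pager control names stay fixed across pages; the crawler below avoids that assumption by reading them from the page). The fixed range is purely for illustration:
for page in range(2, 7):  # assumption: walk a handful of pages
    # Refresh the tokens with a plain GET before every POST.
    soup = BeautifulSoup(s.get(url).content, "lxml")
    nxt_d["__EVENTVALIDATION"] = soup.select("#__EVENTVALIDATION")[0]["value"]
    nxt_d["__VIEWSTATE"] = soup.select("#__VIEWSTATE")[0]["value"]
    # The page to go to and the page we are coming from.
    nxt_d["ctl00$MainContent$physicianSearchView$gvResults$ctl01$ddlPager"] = str(page)
    nxt_d["ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"] = str(page - 1)
    r = s.post(url, data=nxt_d, headers=h)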
This will get all the pages, pulling most of the values programmatically. You could probably pull more, especially with the aid of a regex, but it pulls most without hard-coding values:
from lxml.html import fromstring
import requests


class Crawler(object):
    def __init__(self, ua, url):
        self.user_agent = ua
        self.post_header = {"X-MicrosoftAjax": "Delta = true",
                            "X-Requested-With": "XMLHttpRequest",
                            "user-agent": ua}
        self.post_data2 = {"__ASYNCPOST": "true",
                           "ctl00$ctl13": "ctl00$MainContent$physicianSearchView$ResultsPanel|ctl00$MainContent$physicianSearchView$gvResults$ctl14$ddlPager"}
        self.url = url
        self.post_data1 = {"__ASYNCPOST": "true"}

    def populate(self, xml):
        """Pulls form post data keys and values for the initial post."""
        k1 = xml.xpath("//*[@id='hfPrefetchUrl']")[0]
        k2 = xml.xpath("//*[@id='hfRemoveUrl']")[0]
        self.post_data1[k1.get("name")] = k1.get("value")
        self.post_data1[k2.get("name")] = k2.get("value")
        self.post_data1["ctl00$ctl13"] = xml.xpath("//input[@value='Search']/@name")[0]
        self.post_data1["__EVENTTARGET"] = self.post_data1["ctl00$ctl13"]
    def populate2(self, xml):
        """Pulls form post data keys and values
        for all subsequent posts,
        setting the initial page number values.
        """
        data = xml.xpath("//*[@id='MainContent_physicianSearchView_gvResults_ddlPager']/@name")
        self.pge = data[0]
        self.ev = data[1]
        self.post_data2["__EVENTTARGET"] = self.ev
        self.post_data2[self.ev] = "1"
        self.post_data2[self.pge] = "2"
    @staticmethod
    def put_validation(xml, d):
        """Pulls the fresh __EVENTVALIDATION and __VIEWSTATE needed for each post."""
        ev = xml.xpath("//*[@id='__EVENTVALIDATION']/@value")[0]
        vs = xml.xpath("//*[@id='__VIEWSTATE']/@value")[0]
        d["__EVENTVALIDATION"] = ev
        d["__VIEWSTATE"] = vs
    def next_page(self):
        """Increments both page number values by one per iteration."""
        e = self.post_data2[self.ev]
        v = self.post_data2[self.pge]
        self.post_data2[self.pge] = str(int(v) + 1)
        self.post_data2[self.ev] = str(int(e) + 1)
    def start(self):
        with requests.Session() as s:
            # Get the initial page to pull __EVENTVALIDATION etc.
            req = s.get(self.url, headers={"user-agent": self.user_agent}).content
            xml = fromstring(req)
            # Add __EVENTVALIDATION and __VIEWSTATE to the post data.
            self.put_validation(xml, self.post_data1)
            # Populate the rest of the post data.
            self.populate(xml)
            resp = fromstring(s.post(self.url, data=self.post_data1, headers=self.post_header).content)
            # Yield the first page of results.
            yield resp
            # Fill the post data for the next pages.
            self.populate2(resp)
            # When this list is non-empty, the "next" button is disabled
            # and we have hit the last page.
            nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
            while not nxt:
                # Update the __EVENTVALIDATION token and __VIEWSTATE.
                self.put_validation(fromstring(s.get(self.url).content), self.post_data2)
                # Post to get the next page of results.
                resp = fromstring(s.post(self.url, data=self.post_data2, headers=self.post_header).content)
                yield resp
                nxt = resp.xpath("//*[@id='MainContent_physicianSearchView_gvResults_btnNextPage']/@disabled")
                self.next_page()
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"
url = "http://search.cpsa.ca/PhysicianSearch"
c = Crawler(ua, url)
for tree in c.start():
# use tree
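For example, to pull the actual rows out of each page, here is a minimal sketch assuming the results live in the gvResults grid named in the xpaths above; the exact cell layout is an assumption to verify against the live page:
for tree in c.start():
    # Assumption: each result is a <tr> in the gvResults table.
    for row in tree.xpath("//table[@id='MainContent_physicianSearchView_gvResults']//tr"):
        cells = [td.text_content().strip() for td in row.xpath("./td")]
        if cells:
            print(cells)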