Question
For the last month or so, I've been trying to read a few pages from an ASPX site. I have no problem finding all the required items on the page, but my attempted solution still doesn't work. I read somewhere that all the header details must be present, so I added them. I also read that __EVENTTARGET must be set to tell ASP.NET which button was pressed, so I tried a few different values (see below). I also read that a session should be established to handle the cookies, so I implemented that as well. As of now, my code produces exactly the same request that I see when I analyse the POST with a browser's developer tools (the print lines have been commented out), yet it always gives me the first page. Does anyone know what is missing for this to work? I should also point out that Selenium or mechanize is not really an option for this project.
import requests
from bs4 import BeautifulSoup
import time
import collections
import json

def SPAIN_STK_LIST(numpage):
    payload = collections.OrderedDict()
    header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate',
              'Accept-Language': 'en-US,en;q=0.9',
              'Cache-Control': 'max-age=0',
              'Connection': 'keep-alive',
              'Content-Type': 'text/html; charset=utf-8',
              'Host': 'www.bolsamadrid.es',
              'Origin': 'null',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
              }
    for i in range(0, numpage):
        ses = requests.session()
        if i == 0:
            req = ses.get("http://www.bolsamadrid.es/ing/aspx/Empresas/Empresas.aspx", headers=header)
        else:
            req = ses.post("http://www.bolsamadrid.es/ing/aspx/Empresas/Empresas.aspx", headers=header, data=payload)
        # print(req.request.body)
        # print(req.request.headers)
        # print(req.request.url)
        page = req.text
        soup = BeautifulSoup(page, "lxml")
        # find __VIEWSTATE and __EVENTVALIDATION for the next page
        viewstate = soup.select("#__VIEWSTATE")[0]['value']
        # print("VIEWSTATE: ", viewstate)
        eventval = soup.select("#__EVENTVALIDATION")[0]['value']
        # print("EVENTVALIDATION: ", eventval)
        header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                  'Accept-Encoding': 'gzip, deflate',
                  'Accept-Language': 'en-US,en;q=0.9',
                  'Cache-Control': 'max-age=0',
                  'Connection': 'keep-alive',
                  'Content-Type': 'application/x-www-form-urlencoded',
                  'Host': 'www.bolsamadrid.es',
                  'Origin': 'null',
                  'Upgrade-Insecure-Requests': '1',
                  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
                  }
        target = "ct100$Contenido$GoPag{:=>2}"
        payload = collections.OrderedDict()
        payload['__EVENTTARGET'] = ""
        # payload['__EVENTTARGET'] = "GoPag"
        # payload['__EVENTTARGET'] = "ct100$Contenido$GoPag"
        # payload['__EVENTTARGET'] = target.format(i + 1)
        payload['__EVENTARGUMENT'] = ""
        payload['__VIEWSTATE'] = viewstate
        payload['__VIEWSTATEGENERATOR'] = "65A1DED9"
        payload['__EVENTVALIDATION'] = eventval
        payload['ct100$Contenido$GoPag'] = i + 1
        table = soup.find("table", {"id": "ctl00_Contenido_tblEmisoras"})
        for row in table.findAll("tr")[1:]:
            cells = row.findAll("td")
            print(cells[0].find("a").get_text().replace(",", "").replace("S.A.", ""))
        time.sleep(1)

SPAIN_STK_LIST(6)
Note that the Content-Type of the first request is set to "text/html", since it is a plain GET, while every subsequent request is sent with a Content-Type of "application/x-www-form-urlencoded". Any pointers as to what I should try next would be much appreciated. E.
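As a side note on the headers: requests sets the form Content-Type by itself whenever a dict is passed as data=, so the hand-written Content-Type above should be redundant. A minimal sketch to confirm this (the URL is a placeholder; the request is only prepared, never sent):

import requests

# Preparing (not sending) a POST with a dict body: requests encodes the dict
# and fills in the Content-Type header automatically.
prepared = requests.Request("POST", "http://example.com/post", data={"a": "1"}).prepare()
print(prepared.headers["Content-Type"])  # application/x-www-form-urlencoded
print(prepared.body)                     # a=1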
Answer 1:
The easiest way would be something like the following. Why hardcode __EVENTTARGET, __VIEWSTATE and so on? Let the script take care of those:
import requests
from bs4 import BeautifulSoup

url = "http://www.bolsamadrid.es/ing/aspx/Empresas/Empresas.aspx"

res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(res.text, "lxml")

for page in range(7):
    formdata = {}
    for item in soup.select("#aspnetForm input"):
        if "ctl00$Contenido$GoPag" in item.get("name"):
            formdata[item.get("name")] = page
        else:
            formdata[item.get("name")] = item.get("value")

    req = requests.post(url, data=formdata)
    soup = BeautifulSoup(req.text, "lxml")

    for items in soup.select("#ctl00_Contenido_tblEmisoras tr")[1:]:
        data = [item.get_text(strip=True) for item in items.select("td")]
        print(data)
Assuming that you need the tabular data spread across multiple pages.
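If the site ever insists on cookies, the same idea also runs through a session with no other changes; a sketch of that variation, assuming the form fields are exactly the ones the loop above already collects:

import requests
from bs4 import BeautifulSoup

url = "http://www.bolsamadrid.es/ing/aspx/Empresas/Empresas.aspx"

with requests.Session() as ses:  # cookies persist across the GET and every POST
    soup = BeautifulSoup(ses.get(url, headers={"User-Agent": "Mozilla/5.0"}).text, "lxml")
    for page in range(7):
        formdata = {}
        for inp in soup.select("#aspnetForm input"):
            name = inp.get("name")
            if not name:
                continue  # skip unnamed inputs
            # override only the page-number box, echo everything else back
            formdata[name] = page if "GoPag" in name else inp.get("value", "")
        soup = BeautifulSoup(ses.post(url, data=formdata).text, "lxml")
        for row in soup.select("#ctl00_Contenido_tblEmisoras tr")[1:]:
            print([td.get_text(strip=True) for td in row.select("td")])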
Answer 2:
You need to build your payload before sending the request that uses it:
import requests
from bs4 import BeautifulSoup
import time
import collections
import json

def SPAIN_STK_LIST(numpage):
    payload = collections.OrderedDict()
    header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate',
              'Accept-Language': 'en-US,en;q=0.9',
              'Cache-Control': 'max-age=0',
              'Connection': 'keep-alive',
              'Content-Type': 'text/html; charset=utf-8',
              'Host': 'www.bolsamadrid.es',
              'Origin': 'null',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
              }
    ses = requests.session()
    viewstate = ""
    eventval = ""
    for i in range(0, numpage):
        if i == 0:
            req = ses.get("http://www.bolsamadrid.es/ing/aspx/Empresas/Empresas.aspx", headers=header)
            page = req.text
            soup = BeautifulSoup(page, "lxml")
            # find __VIEWSTATE and __EVENTVALIDATION for the next page
            viewstate = soup.select("#__VIEWSTATE")[0]['value']
            # print("VIEWSTATE: ", viewstate)
            eventval = soup.select("#__EVENTVALIDATION")[0]['value']
        else:
            header = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                      'Accept-Encoding': 'gzip, deflate',
                      'Accept-Language': 'en-US,en;q=0.9',
                      'Cache-Control': 'max-age=0',
                      'Connection': 'keep-alive',
                      'Content-Type': 'application/x-www-form-urlencoded',
                      'Host': 'www.bolsamadrid.es',
                      'Origin': 'null',
                      'Upgrade-Insecure-Requests': '1',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
                      }
            target = "ct100$Contenido$GoPag{:=>2}"
            payload = collections.OrderedDict()
            payload['__EVENTTARGET'] = "ctl00$Contenido$SiguientesArr"
            # payload['__EVENTTARGET'] = "GoPag"
            # payload['__EVENTTARGET'] = "ct100$Contenido$GoPag"
            # payload['__EVENTTARGET'] = target.format(i + 1)
            payload['__EVENTARGUMENT'] = ""
            payload['__VIEWSTATE'] = viewstate
            payload['__VIEWSTATEGENERATOR'] = "65A1DED9"
            payload['__EVENTVALIDATION'] = eventval
            # payload['ct100$Contenido$GoPag'] = i + 1
            payload['ct100$Contenido$GoPag'] = ""
            req = ses.post("http://www.bolsamadrid.es/ing/aspx/Empresas/Empresas.aspx", headers=header, data=payload)
            page = req.text
            soup = BeautifulSoup(page, "lxml")
            # find __VIEWSTATE and __EVENTVALIDATION for the next page
            viewstate = soup.select("#__VIEWSTATE")[0]['value']
            # print("VIEWSTATE: ", viewstate)
            eventval = soup.select("#__EVENTVALIDATION")[0]['value']
        # print(req.request.body)
        # print(req.request.headers)
        # print(req.request.url)
        # print("EVENTVALIDATION: ", eventval)
        table = soup.find("table", {"id": "ctl00_Contenido_tblEmisoras"})
        for row in table.findAll("tr")[1:]:
            cells = row.findAll("td")
            print(cells[0].find("a").get_text().replace(",", "").replace("S.A.", "").encode('utf-8'))
        time.sleep(1)

SPAIN_STK_LIST(6)
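What makes this version work is that __EVENTTARGET is set to ctl00$Contenido$SiguientesArr, presumably the control name the page's "next" arrow passes to WebForms' __doPostBack(), and that __VIEWSTATE and __EVENTVALIDATION are re-read from each response before the following POST. That re-reading step could be factored into a small helper; a sketch (hidden_fields is a name invented here, not part of the answer's code):

def hidden_fields(soup):
    """Collect the ASP.NET hidden inputs (__VIEWSTATE, __EVENTVALIDATION,
    __VIEWSTATEGENERATOR, ...) so they can be echoed back in the next POST."""
    return {inp.get("name"): inp.get("value", "")
            for inp in soup.select("input[type=hidden]")
            if inp.get("name", "").startswith("__")}

With that, payload.update(hidden_fields(soup)) would replace the hand-written __* assignments, including the hardcoded __VIEWSTATEGENERATOR.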
Source: https://stackoverflow.com/questions/51391833/scrapping-through-pages-of-aspx-website