Unable to load ASP.NET page using Python urllib2

Submitted by 三世轮回 on 2020-01-11 14:32:10

Question


I am trying to do a POST request to https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx in order to scrape data.

Here is my current code:

from urllib import urlencode
import urllib2

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'
headers = {
    'HTTP_USER_AGENT': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) Gecko/2009073022 Firefox/3.0.13',
    'HTTP_ACCEPT': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded'
}
formFields = [(r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber','003-00013'), 
              (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB','003-00013'),
              (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS','0'),
              (r'ctl00%24MainContent%24WellDetailsCriteria1%24ViewDataButton','View Data'),
              (r'__VIEWSTATE', r'/wEPDwUJOTc2MzI0NTk4D2QWAmYPDxYEHglQYWdlVGl0bGUFDFdlbGwgRGV0YWlscx4SUmVxdWlyZXNKYXZhU2NyaXB0Z2QWAgIDD2QWCGYPFgIeBFRleHQF1hA8ZGl2IHN0eWxlPSJoZWlnaHQ6IDE0OXB4OyB3aWR0aDogOTUycHg7IGJhY2tncm91bmQtcmVwZWF0OiBuby1yZXBlYXQ7IGJhY2tncm91bmQtaW1hZ2U6dXJsKGh0dHBzOi8vd3d3LmFoczIuZGVwLnN0YXRlLnBhLnVzL2ltYWdlcy9kZXBfZXh0ZXJuYWxfb ... YWRlciRIZWFkZXJWaWV3D2dkrp784OTosLLEOFxy/mWBtsit I6kjKRlZ/ 1IBCkZNk='),
              (r'__EVENTVALIDATION', r'/wEWBALn79faCwK+qZJIAqXY04cBAorCkdMKL5VEAnd1IIQ3cnIHRxZAluFo5G5Y5ffyRXRdtmBiGCc='),
              (r'__EVENTTARGET', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
              (r'__EVENTARGUMENT', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber')
             ]

# Load page
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = urllib2.urlopen(req)

# Handle results
print r.read()

The page that is returned says "Sorry, we are having technical difficulties. Please try your request again later", so I know I must be messing something up. I am not sending a cookie, but I'm not sure whether that is necessary. If it is, can I just add "Cookie: ASP.NET_SessionId=whatever" to my headers, or do I need to use cookielib?
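For reference, this is the minimal cookielib pattern I had in mind (a rough sketch; the session cookie is whatever the server sets, e.g. ASP.NET_SessionId):

import urllib2
import cookielib

uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.open(uri)  # the first GET stores the ASP.NET_SessionId cookie in the jar
# later opener.open() calls re-send the stored cookies automatically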

Any thoughts on what is going wrong would be most appreciated!

EDIT: Here is an updated version of the code that pulls the __VIEWSTATE and __EVENTVALIDATION values from the page directly (so I don't need to copy and paste them or worry about them having expired):

from urllib import urlencode
import urllib2
from BeautifulSoup import BeautifulSoup
import cookielib

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'
# Create headers
headers = {
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.paoilandgasreporting.state.pa.us',
    'Origin': 'https://www.paoilandgasreporting.state.pa.us',
    'Referer': 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16',
}

# Set up cookie jar
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPSHandler(debuglevel=1))

# Grab information that we need to pass along with our requests
#r = urllib2.urlopen(uri)
req = urllib2.Request(uri,urlencode([]),headers)
cj.add_cookie_header(req)
r = opener.open(req)
print cj


soup = BeautifulSoup(r.read())
eventvalidation = soup.find('input', id='__EVENTVALIDATION')['value']
viewstate = soup.find('input', id='__VIEWSTATE')['value']
formFields = [  ('__EVENTVALIDATION',eventvalidation),
                ('__VIEWSTATE',viewstate),
                ('__EVENTTARGET', ''),
                ('__EVENTARGUMENT', ''),
                ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
                ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB','003-00013'),
                ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS','0'), # TODO what value to pass?
                ('ctl00$MainContent$WellDetailsCriteria1$ViewDataButton','View Data'), # do we need this?
             ]


# Load page
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
cj.add_cookie_header(req)
r = opener.open(req)

# Handle results
print r.read()

Answer 1:


I had the same problem. The only solution I've found (probably not the best one) is to parse the hidden fields out of the page on the first visit (in my code this happens inside a "while first_visit is True:" block) and then submit your form with those values.
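Here the_page is a BeautifulSoup object built from the initial GET of the form page, roughly like this (a sketch, assuming BeautifulSoup 3 imported as soup, with the URL from my own case):

import urllib2
from BeautifulSoup import BeautifulSoup as soup  # BeautifulSoup 3

# Initial GET of the form page; the loops below pull the hidden fields out of it
first_response = urllib2.urlopen('http://bandscore.ielts.org/search.aspx')
the_page = soup(first_response.read())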

        for result in the_page.findAll('input', attrs={'name' : '__VIEWSTATE'}):
            view_state =  result['value']

        for result_1 in the_page.findAll('input', attrs={'name' : '__EVENTVALIDATION'}):
            event_validation =  result_1['value']

        for result_2 in the_page.findAll('input', attrs={'name' : '__PREVIOUSPAGE'}):
            previous_page =  result_2['value']

        for result in the_page.findAll('input', attrs={'name' : '__EVENTTARGET'}):
            event_target =  result['value']

And after that:

        url = 'http://bandscore.ielts.org/search.aspx'
        values = {
            '__EVENTTARGET': 'gdvSearchResults',
            '__EVENTARGUMENT': page,
            '__VIEWSTATE': view_state,
            '__PREVIOUSPAGE': previous_page,
            '__EVENTVALIDATION': event_validation,
            'DropDownList1': Country,
            #'txtSearchInstitution': '',
            #'hdnSearchText': '',
            #'rdoFilter': '%25',
        }
        user_agent = 'Mozilla/5 (Solaris 10) Gecko'
        headers = { 'User-Agent' : user_agent }
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data, headers)
        response = urllib2.urlopen(req)
        thePage = response.read()
        the_page = soup(thePage)

I am quite a n00b in Python, but that worked for me, so I hope it helps!




Answer 2:


You must not be submitting the data the server is expecting, which is what generates that error. You could find out exactly what your browser submits and then replicate that in your script. There are various Firefox extensions that will help you do this, such as TamperData, Firebug, and Live HTTP Headers.

However, your easiest option would probably be to use mechanize.
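For example, a minimal sketch of the mechanize approach (the form and field names are taken from the question above and may need checking against the live page):

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)  # skip robots.txt handling
br.addheaders = [('User-Agent', 'Mozilla/5.0')]

# GET the form page; mechanize stores the session cookie automatically
br.open('https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx')

# Assumption: the well-details form is the first form on the page
br.select_form(nr=0)

# Field name copied from the question; adjust if the page differs
br['ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB'] = '003-00013'

# Submitting re-sends __VIEWSTATE, __EVENTVALIDATION and the other hidden fields for you
response = br.submit()
print response.read()

mechanize keeps the cookies and resubmits the hidden ASP.NET fields automatically, which is exactly the bookkeeping the urllib2 version does by hand.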



Source: https://stackoverflow.com/questions/5380638/unable-to-load-asp-net-page-using-python-urllib2
