Unable to load ASP.NET page using Python urllib2

Submitted by 三世轮回 on 2020-01-11 14:32:10

Question


I am trying to do a POST request to https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx in order to scrape data.

Here is my current code:

from urllib import urlencode
import urllib2

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'
headers = {
    'HTTP_USER_AGENT': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) Gecko/2009073022 Firefox/3.0.13',
    'HTTP_ACCEPT': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded'
}
formFields = [(r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber','003-00013'), 
              (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB','003-00013'),
              (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS','0'),
              (r'ctl00%24MainContent%24WellDetailsCriteria1%24ViewDataButton','View Data'),
              (r'__VIEWSTATE', r'/wEPDwUJOTc2MzI0NTk4D2QWAmYPDxYEHglQYWdlVGl0bGUFDFdlbGwgRGV0YWlscx4SUmVxdWlyZXNKYXZhU2NyaXB0Z2QWAgIDD2QWCGYPFgIeBFRleHQF1hA8ZGl2IHN0eWxlPSJoZWlnaHQ6IDE0OXB4OyB3aWR0aDogOTUycHg7IGJhY2tncm91bmQtcmVwZWF0OiBuby1yZXBlYXQ7IGJhY2tncm91bmQtaW1hZ2U6dXJsKGh0dHBzOi8vd3d3LmFoczIuZGVwLnN0YXRlLnBhLnVzL2ltYWdlcy9kZXBfZXh0ZXJuYWxfb ... YWRlciRIZWFkZXJWaWV3D2dkrp784OTosLLEOFxy/mWBtsit I6kjKRlZ/ 1IBCkZNk='),
              (r'__EVENTVALIDATION', r'/wEWBALn79faCwK+qZJIAqXY04cBAorCkdMKL5VEAnd1IIQ3cnIHRxZAluFo5G5Y5ffyRXRdtmBiGCc='),
              (r'__EVENTTARGET', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
              (r'__EVENTARGUMENT', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber')
             ]

# Load page
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = urllib2.urlopen(req)

# Handle results
print r.read()

The page that is returned says "Sorry, we are having technical difficulties. Please try your request again later", so I know I must be messing something up. I am not sending a cookie, but I'm not sure whether that is necessary. If it is, can I just add "Cookie: ASP.NET_SessionId=whatever" to my headers, or do I need to use cookielib?
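For reference, this is the minimal cookielib pattern I had in mind (a rough sketch; the session cookie is whatever the server sets, e.g. ASP.NET_SessionId):

import urllib2
import cookielib

uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.open(uri)  # the first GET stores the ASP.NET_SessionId cookie in the jar
# later opener.open() calls re-send the stored cookies automatically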

Any thoughts on what is going wrong would be most appreciated!

EDIT: Here is an updated version of the code that pulls the __VIEWSTATE and __EVENTVALIDATION values from the page directly (so I don't need to copy and paste them or worry about them having expired):

from urllib import urlencode
import urllib2
from BeautifulSoup import BeautifulSoup
import cookielib

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'
# Create headers
headers = {
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.paoilandgasreporting.state.pa.us',
    'Origin': 'https://www.paoilandgasreporting.state.pa.us',
    'Referer': 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16',
}

# Set up cookie jar
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPSHandler(debuglevel=1))

# Grab information that we need to pass along with our requests
#r = urllib2.urlopen(uri)
req = urllib2.Request(uri,urlencode([]),headers)
cj.add_cookie_header(req)
r = opener.open(req)
print cj


soup = BeautifulSoup(r.read())
eventvalidation = soup.find('input', id='__EVENTVALIDATION')['value']
viewstate = soup.find('input', id='__VIEWSTATE')['value']
formFields = [  ('__EVENTVALIDATION',eventvalidation),
                ('__VIEWSTATE',viewstate),
                ('__EVENTTARGET', ''),
                ('__EVENTARGUMENT', ''),
                ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
                ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB','003-00013'),
                ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS','0'), # TODO what value to pass?
                ('ctl00$MainContent$WellDetailsCriteria1$ViewDataButton','View Data'), # do we need this?
             ]


# Load page
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
cj.add_cookie_header(req)
r = opener.open(req)

# Handle results
print r.read()

Answer 1:


I had the same problem. The only solution I've found (probably not the best one) is to parse the hidden fields out of the page on the first visit (in my code this happens inside a "while first_visit is True:" block) and then submit your form with those values.
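Here the_page is a BeautifulSoup object built from the initial GET of the form page, roughly like this (a sketch, assuming BeautifulSoup 3 imported as soup, with the URL from my own case):

import urllib2
from BeautifulSoup import BeautifulSoup as soup  # BeautifulSoup 3

# Initial GET of the form page; the loops below pull the hidden fields out of it
first_response = urllib2.urlopen('http://bandscore.ielts.org/search.aspx')
the_page = soup(first_response.read())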

        for result in the_page.findAll('input', attrs={'name' : '__VIEWSTATE'}):
            view_state =  result['value']

        for result_1 in the_page.findAll('input', attrs={'name' : '__EVENTVALIDATION'}):
            event_validation =  result_1['value']

        for result_2 in the_page.findAll('input', attrs={'name' : '__PREVIOUSPAGE'}):
            previous_page =  result_2['value']

        for result in the_page.findAll('input', attrs={'name' : '__EVENTTARGET'}):
            event_target =  result['value']

And after that:

        url = 'http://bandscore.ielts.org/search.aspx'
        values = {
            '__EVENTTARGET': 'gdvSearchResults',
            '__EVENTARGUMENT': page,
            '__VIEWSTATE': view_state,
            '__PREVIOUSPAGE': previous_page,
            '__EVENTVALIDATION': event_validation,
            'DropDownList1': Country,
            #'txtSearchInstitution': '',
            #'hdnSearchText': '',
            #'rdoFilter': '%25',
        }
        user_agent = 'Mozilla/5 (Solaris 10) Gecko'
        headers = { 'User-Agent' : user_agent }
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data, headers)
        response = urllib2.urlopen(req)
        thePage = response.read()
        the_page = soup(thePage)

I am quite a n00b in Python, but that worked for me, so I hope it helps!




Answer 2:


You must not be submitting the data the server is expecting, which is what generates that error. You could find out exactly what your browser submits and then replicate that in your script. There are various Firefox extensions that will help you do this, such as TamperData, Firebug, and Live HTTP Headers.

However, your easiest option would probably be to use mechanize.
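For example, a minimal sketch of the mechanize approach (the form and field names are taken from the question above and may need checking against the live page):

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)  # skip robots.txt handling
br.addheaders = [('User-Agent', 'Mozilla/5.0')]

# GET the form page; mechanize stores the session cookie automatically
br.open('https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx')

# Assumption: the well-details form is the first form on the page
br.select_form(nr=0)

# Field name copied from the question; adjust if the page differs
br['ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB'] = '003-00013'

# Submitting re-sends __VIEWSTATE, __EVENTVALIDATION and the other hidden fields for you
response = br.submit()
print response.read()

mechanize keeps the cookies and resubmits the hidden ASP.NET fields automatically, which is exactly the bookkeeping the urllib2 version does by hand.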



Source: https://stackoverflow.com/questions/5380638/unable-to-load-asp-net-page-using-python-urllib2
