Question
I'm trying to grab data from every page of the returned results for this site:
https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?def=false&securitysys=on
It's hard to verify whether I've grabbed everything, because hitting the next-page button throws the results out of order. The only page sorted by year is the first one; subsequent pages contain data outside the range I originally selected. For instance, if I enter 01/01/2020 on the search page, the first page returned has only entries from January 2020 and later, but once I hit next page I get entries from 2016, 2018, and so on.
I just want to enter Jan 1, 2020 and get all data in that date range (Jan 2020 through today). I tried entering an end date as well, and that did not help. I understand I'll need to do more to get the data I ultimately want, but for now I just need to make sure I'm grabbing every entry from all returned pages. The site shows there are ~134 results for Jan 1, 2020 through today; my output has only ~50 unique values.
I'm very new to web scraping, so I'd appreciate it if you could keep the suggestions simple. Thanks.
# Imports
from bs4 import BeautifulSoup
from requests import Session
# Session Object
session = Session()
# Add a user agent so the request looks more human-like.
session.headers.update({
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
})
# Initial request: you need to fetch the url first so the authenticity
# token can be parsed out of the html
init_session = session.get(url="https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?def=false")
# Beautiful soup object, used for HTML parsing
soup = BeautifulSoup(init_session.content, "html.parser")
# Get all of the input tags
inputs = soup.find_all('input')
# Upon running, we see that the authenticity token is the first element in the list.
authenticity_token = inputs[0]['value']
# Now we can make our request!
data = {
"authenticity_token" : authenticty_token,
"coname": "",
"coName_ADAdefault": "",
"coName_verify_char[0|50]": "The value you have supplied for Company Name is too long.",
"city": "",
"city_ADAdefault": "",
"city_verify_char[0|45]": "The value you have supplied for City is too long.",
"zip": "",
"zip_ADAdefault": "",
"zip_verify_char[0|10]": "The value you have supplied for Zip/Postal Code is too long.",
"sda": "",
"startdate": "01/01/2020",
"startDate_ADAdefault": "mm/dd/yyyy",
"startDate_verify_date4": "",
"startDate_verify_char[0|45]": "The value you have supplied for Start Date is too long.",
"enddate": "mm/dd/yyyy",
"endDate_ADAdefault": "mm/dd/yyyy",
"endDate_verify_date4": "",
"endDate_verify_char[0|45]": "The value you have supplied for End Date is too long.",
"layoffType": "y",
"search": "Search",
"old_choice": 1,
"ZIP_prev": "",
"def_prev": "false",
"CITY_prev": "",
"SDA_prev": "",
"STARTDATE_prev": "",
"CONAME_prev": "",
"ENDDATE_prev": "",
"FormName": "Form0",
}
# Get the data
get_warn_data = session.post("https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?securitysys=on&start_row=1&max_rows=25&FormID=0", data=data)
soup = BeautifulSoup(get_warn_data.content, "html.parser")
# These are all the hash links you need to visit to get data, plus the links
# for the other pages you may need to visit to get data.
targets = soup.find_all('a', href = True)
import re
regex = re.compile("(?=\").*(?<=\")")
targets2 = [re.search(regex, str(a)).group(0) for a in targets]
# These are the url parts to which you need to append the url_head and then run a
# request on the full url; also, the parsed html shows &amp; where it should just be
# &, so that substitution is made here
# FIRST SET OF IDs to pull data on; will append businesses gathered from
# the other pages
bus1 = [a for a in targets2 if 'mn_warn_dsp' in a and 'hash' in a]
bus1 = [re.sub(r"\"", "", a) for a in bus1]
bus1 = [re.sub(r"&", "&", a) for a in bus1]
# Most queries will return multiple pages of businesses; need to loop through the pages to get
# all of the businesses; businesses from here will be combined with businesses from the first
# page above
more_pages = [a for a in targets2 if 'start_row' in a and 'max_row' in a and 'orderby' in a]
more_pages = [re.sub(r"\"", "", a) for a in more_pages]
more_pages = [re.sub(r"&", "&", a) for a in more_pages]
# Getting rid of the /ada/mn_warn_dsp.cfm part from all additional page url parts;
# a prefix will be attached to each below
more_pages = [re.sub(r"/ada/mn_warn_dsp.cfm", "", a) for a in more_pages]
# The business url parts already include the mn_warn_dsp part; the additional
# page url parts do not (it was stripped above); for url parts in "more_pages",
# append url_head_pages; for businesses, append url_head
url_head = "https://www.azjobconnection.gov/ada/"
url_head_pages = "https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm"
# Going to additional pages and getting all the ids/hash values
# Here, I'm just repeating on subsequent pages what I did on the first page; no
# need to check for additional pages here. Just going through each page
# and grabbing the hash values
hash_hold = []
for page in more_pages:
    test123 = url_head_pages+page # url of the page with additional businesses
    work_now = session.get(url = test123) # getting html to parse
    soup = BeautifulSoup(work_now.content, "html.parser")
    targets = soup.find_all('a', href = True) # finding all ids/hash values
    regex = re.compile("(?=\").*(?<=\")") # getting stuff between double quotes
    targets2 = [re.search(regex, str(a)).group(0) for a in targets]
    bus2 = [a for a in targets2 if 'mn_warn_dsp' in a and 'hash' in a]
    bus2 = [re.sub(r"\"", "", a) for a in bus1]
    bus2 = [re.sub(r"&amp;", "&", a) for a in bus1]
    hash_hold.append(bus2)
# hash_hold has hash/ids from subsequent pages and the bus1 has hash/ids
# from the first page; joining them all together here to get all hash/ids
# we need
hash_hold.append(bus1)
# These are all of the hash/ids/businesses I captured; notice it is much smaller than the number of returned results if you search from Jan. 1, 2020 to today
from pandas.core.common import flatten
businesses_use = list(flatten(hash_hold))
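A quick way to sanity-check coverage is to compare the total number of collected links against the number of unique ones; a minimal sketch, assuming the businesses_use list built above:

# Sanity check on the scraped links (uses businesses_use from the script above)
total = len(businesses_use)
unique = len(set(businesses_use))
print(f"collected {total} links, {unique} unique")
# If unique is far below the ~134 results the site reports, pages are being
# re-scraped with the same data (or skipped) and the paging loop needs review.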
Answer 1:
This script goes from 1st January 2020 onward (~136 records). The rows are not in chronological order (they are sorted by employer name), but you can sort them by date easily; a sketch of that is shown after the output below:
import requests
from bs4 import BeautifulSoup
url = 'https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?def=false&securitysys=on'
page_url = 'https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?securitysys=on&start_row={p}&max_rows=50&orderby=employer&choice=1'
with requests.session() as s:
    soup = BeautifulSoup(s.get(url).content, 'html.parser')

    data = {}
    for i in soup.select('input'):
        data[i['name']] = i.get('value', '')

    del data['clear']
    data['startdate'] = '01/01/2020'
    data['layoffType'] = 'y'

    p = 1
    while True:
        soup = BeautifulSoup(s.post(page_url.format(p=p), data=data).content, 'html.parser')

        for i, tr in enumerate(soup.select('tr.cfOutputTableRow'), p):
            tds = [td.get_text(strip=True) for td in tr.select('td')]
            print(i, tds)

        if i % 50:
            break

        p += 50
Prints:
1 ['Aecom', 'Glendale', '85310', '7', '01/17/2020']
2 ['Ahern Rentals Inc.', 'Phoenix', '85006', '5', '03/28/2020']
3 ['Alsco', 'Yuma', '85365', '9', '04/07/2020']
4 ['Amentum', 'Yuma', '85364', '9', '03/13/2020']
5 ['AmSafe', 'Phoenix', '85043', '5', '05/12/2020']
6 ['Ares Collective Restaurants', 'Tucson', '85715', '6', '03/23/2020']
7 ['Arizona Grand Resort', 'Phoenix', '85044', '5', '05/04/2020']
8 ['Atrium Hospitality', 'Glendale', '85305', '7', '03/26/2020']
9 ['Avis Budget', 'Phoenix', '85034', '5', '04/08/2020']
10 ['Bella Fresh', 'Phoenix', '85043', '5', '02/05/2020']
11 ['Benihana Ahwatukee', 'Phoenix', '85044', '7', '04/05/2020']
12 ['Benihana Chandler', 'Chandler', '85226', '7', '04/05/2020']
13 ['Benihana Mid town', 'Scottsdale', '85251', '7', '04/05/2020']
14 ['Benihana Scottsdale', 'Scottsdale', '85254', '7', '04/05/2020']
15 ['Best Western Hotels & Resorts', 'Phoenix', '85016', '5', '03/25/2020']
16 ['Black Bear Diner', 'Laveen', '85339', '7', '04/09/2020']
17 ['Camby Hotel', 'Phoenix', '85021', '5', '05/07/2020']
18 ['Camelback Inn Resort & Spa (JW Marriott)', 'Scottsdale', '85253', '7', '06/03/2020']
19 ['Cameron Mitchell Restaurants, LLC', 'Columbus', '85054', '5', '03/24/2020']
20 ['Cinemark', 'Tucson', '85713', '6', '04/01/2020']
21 ['civana', 'Carefree', '85377', '7', '04/03/2020']
22 ['civana', 'Carefree', '85377', '7', '04/03/2020']
23 ['CMA CGM (America) LLC', 'Scottsdale', '85254', '7', '01/03/2020']
24 ['Cocopah Bend RV Resort and Golf', 'Yuma', '85364', '9', '04/07/2020']
25 ['Cocopah Casino and Resort', 'Somerton', '85350', '9', '04/07/2020']
26 ['Cocopah indian Tribe', 'Somerton', '85350', '9', '04/07/2020']
27 ['COX Automotive', 'Phoenix', '85040', '5', '05/08/2020']
28 ["Denny's (Beshay Enterprises)", 'Murrieta', '92562', '2003', '03/23/2020']
29 ["Denny's (Beshay Enterprises)", 'Murrieta', '92562', '5', '03/23/2020']
30 ["Denny's (Beshay Enterprises)", 'Murrieta', '92562', '5', '03/23/2020']
31 ["Denny's (Beshay Enterprises)", 'Murrieta', '92562', '5', '03/23/2020']
32 ["Denny's (Beshay Enterprises)", 'Murrieta', '92562', '5', '03/23/2020']
33 ["Denny's (Beshay Enterprises)", 'Murrieta', '92562', '5', '03/23/2020']
34 ['Doubletree Suites by Hilton Phoenix', 'Phoenix', '85008', '5', '04/15/2020']
35 ['Drive Time', 'Phoenix', '85040', '5', '03/27/2020']
36 ['Drive Time', 'Phoenix', '85040', '5', '03/27/2020']
37 ['Dyncorp', 'Tucson', '85704', '6', '01/15/2020']
38 ['Embassy Suites Tempe', 'Tempe', '85282', '5', '03/18/2020']
39 ['Estrellita Child Care center', 'San Luis', '85336', '9', '04/07/2020']
40 ['Evolution Hospitality', 'sedona', '85325', '10', '04/03/2020']
41 ['Fairmont Scottsdale Princess', 'Scottsdale', '85255', '7', '06/08/2020']
42 ['FEAST American Diners', 'Phoenix', '85010', '7', '03/23/2020']
43 ['Flagstaff DoubleTree', 'Flagstaff', '86001', '10', '04/02/2020']
44 ['Flying Food Group, LLC', 'Phoenix', '85003', '5', '04/10/2020']
45 ['Four Seasons Resort', 'Scottsdale', '85262', '7', '03/20/2020']
46 ['Fruit Growers Supply', 'Yuma', '85365', '9', '05/29/2020']
47 ['GBT US LLC', 'Scottsdale', '85254', '7', '04/13/2020']
48 ['Go Rentals', 'Newport', '92660', '7', '03/23/2020']
49 ['Great Wolf Lodge', 'Scottsdale', '85258', '7', '03/30/2020']
50 ['Guess?, Inc', 'Glendale', '85305', '7', '04/13/2020']
51 ['Hertz', 'Phoenix', '85034', '7', '04/29/2020']
52 ['Hexcel', 'Casa Grande', '85122', '2003', '04/20/2020']
53 ['Holiday Inn Hotels', 'Yuma', '85364', '9', '03/31/2020']
54 ['HotChalk', 'Phoenix', '85034', '5', '02/25/2020']
55 ['Huhtamaki', 'Googyear', '85338', '7', '04/16/2020']
56 ['Hyatt Regency Scottsdale Resort & Spa at Gainey Ra', 'Scottsdale', '85258', '7', '06/12/2020']
57 ['IHG-Army Hotels, candlewood Suites', 'Yuma Proving grounds', '85365', '9', '04/07/2020']
58 ['International Cruise & Excursion Gallery', 'Scottsdale', '85256', '7', '03/24/2020']
59 ['Islands Restaurants', 'Phoenix', '85050', '7', '03/23/2020']
60 ['James River Insurance Company', 'Scottsdale', '85254', '7', '05/15/2020']
61 ['Katerra', 'Scottsdale', '85258', '7', '04/02/2020']
62 ['KDC Construction', 'Irvine', '92606', '7', '03/20/2020']
63 ["King's Seafood Company LLC", 'Tempe', '85281', '7', '03/24/2020']
64 ["L'Auberge de Sedona", 'Sedona', '85600', '10', '04/03/2020']
65 ['LM Industries', 'Chandler', '85226', '7', '05/01/2020']
66 ['Loews Ventana Canyon Resort', 'Tucson', '85750', '6', '05/29/2020']
67 ['Lucille’s Smokehouse Bar-B-Que', 'Tempe', '85282', '7', '04/20/2020']
68 ["Macy's Credit and Customer Services, Inc.", 'Tempe', '85281', '7', '01/06/2020']
69 ['MAPFRE Insurance - Enterprise Contact Center', 'Gilbert', '85234', '7', '01/13/2020']
70 ['Massage Envy', 'Scottsdale', '85260', '7', '04/15/2020']
71 ['McCormick Scottsdale', 'Scottsdale', '85253', '7', '03/30/2020']
72 ['Medieval Times Dinner & Tournament', 'Scottsdale', '85258', '7', '04/08/2020']
73 ['Mind Body', 'Scottsdale', '85257', '7', '04/07/2020']
74 ['Movement for Life Inc.', 'San Obispo', '93401', '6', '03/25/2020']
75 ['Northrop Grumman', 'Falls Church', '22042', '1', '03/09/2020']
76 ['old spagetti Factory', 'Chandler', '85226', '7', '03/24/2020']
77 ['Onni Properties', 'Phoenix', '85019', '5', '03/25/2020']
78 ['Open Door', 'Scottsdale', '85251', '7', '04/15/2020']
79 ['PAE Government Services', 'Yuma', '85365', '9', '05/28/2020']
80 ['Page Elks Lodge 2498', 'Page', '86040', '10', '04/06/2020']
81 ['Papersource', 'Pheonix', '85016', '5', '03/27/2020']
82 ['Pappas Restaurants', 'Phoenix', '85003', '5', '03/25/2020']
83 ['Passport Health', 'Scottsdale', '85262', '7', '03/23/2020']
84 ['Phoenix Desert Ridege Resort & Spa (JW Marriott)', 'Phoenix', '85054', '5', '06/03/2020']
85 ['Phoenix Glendale Renaissance', 'Glendale', '85305', '7', '03/26/2020']
86 ['Pima Valve', 'Chandler', '85226', '7', '03/18/2020']
87 ['Pink Adventure Tours', 'Sedona', '86336', '10', '04/14/2020']
88 ['Prospect', 'Phoenix', '85034', '7', '04/09/2020']
89 ['Punch Bowl social', 'Phoenix', '85004', '5', '03/18/2020']
90 ['RA Kierland Restaurant', 'Scottsdale', '85254', '6', '04/05/2020']
91 ['RA Mesa Corp', 'Mesa', '85204', '6', '04/05/2020']
92 ['Renaissance Phoenix Downtown Hotel', 'Phoenix, Arizona', '85004', '5', '06/01/2020']
93 ['Residence Inn/Courtyard Phoenix Downtown', 'Phoenix', '85004', '5', '03/27/2020']
94 ['Roadhouse cinemas', 'Tucson', '85712', '6', '03/18/2020']
95 ['Saddle Ranch Chop House', 'Glendale', '85305', '7', '03/27/2020']
96 ['Sam Levits', 'Tucson', '85705', '6', '03/25/2020']
97 ['Sam Levitz furniture', 'Tucson', '85705', '6', '03/25/2020']
98 ['Sanctuary Camelback', 'Phoenix', '85014', '5', '04/02/2020']
99 ["SAS Restaurant Ventures (Denny's)", 'Phoenix', '85022', '5', '03/31/2020']
100 ['Scottsdale Marriott at McDowell Mountains', 'Scottsdale', '85260', '7', '06/02/2020']
101 ['Scottsdale Marriott Old Yown', 'Scottsdale', '85251', '7', '06/04/2020']
102 ['Shamrock Farms', 'Phoenix', '85009', '5', '04/02/2020']
103 ['Sheraton Phoenix Downtown', 'Phoenix', '85022', '5', '06/02/2020']
104 ['Specialty Textile', 'Phoenix', '85007', '5', '03/30/2020']
105 ['Starr Pass Resort & Spa (JW Marriott)', 'Tucson', '85745', '6', '06/01/2020']
106 ['Sub-Zero Group Inc', 'Goodyear', '85340', '7', '03/20/2020']
107 ['Suit Supply', 'Scottsdale', '85254', '7', '04/08/2020']
108 ['Surprise Honda', 'Surprise', '85388', '7', '03/24/2020']
109 ['Surprise Honda', 'Surprise', '85388', '7', '03/25/2020']
110 ['Sushi Tucson', 'Tucson', '85717', '6', '04/05/2020']
111 ['SW Hotels and Resorts WW llc', 'Scottsdale', '85251', '7', '06/03/2020']
112 ['Tanque Verde Ranch', 'Tucson', '85748', '6', '03/27/2020']
113 ['Taylor Farms', 'Yuma', '85666', '9', '03/27/2020']
114 ['Taylor farms', 'Salinas', '93902', '9', '04/07/2020']
115 ['The Antiqua Group', 'Peoria', '85382', '7', '04/24/2020']
116 ['The Orchards', 'Sedona', '85600', '10', '04/03/2020']
117 ['The Phoenician', 'Phoenix', '85251', '5', '06/03/2020']
118 ['The Ritz-Carlton', 'Marana', '85658', '6', '06/05/2020']
119 ['The Royal Palms Resort and Spa', 'Phoenix', '85018', '5', '06/08/2020']
120 ['The Scott Resort and Spa', 'Scottsdale', '85251', '7', '05/04/2020']
121 ['The Scottsdale Resort at McCormick Ranch', 'Scottsdale', '85258', '7', '06/05/2020']
122 ['The Sheraton Grand at Wild Horse Pass', 'Chandler', '85226', '7', '06/03/2020']
123 ['The Westin Kierland Resort and Spa', 'Scottsdale', '85254', '7', '06/05/2020']
124 ['The Westin Kierland Villas', 'Scottsdale', '85254', '7', '06/05/2020']
125 ['The Westin Phoenix Downtown', 'Phoenix', '85004', '5', '06/05/2020']
126 ['TMI Acquisitions LLC', 'Tucson', '85713', '6', '01/10/2020']
127 ['Transportation Brokerage Specialists Inc (TBS)', 'Costa Mesa', '92626', '7', '02/20/2020']
128 ['Transportation Brokerage Specialists Inc (TBS)', 'Costa Mesa', '92626', '7', '02/20/2020']
129 ['Tucson Marriott University park', 'Tucson', '85719', '6', '03/26/2020']
130 ['Tuesday Morning, Inc.', 'Phoenix', '85006', '5', '04/22/2020']
131 ['Tufesa USA, LLC', 'Phoenix', '85009', '5', '04/15/2020']
132 ['Uber Technologies', 'Phoenix', '85004', '5', '05/07/2020']
133 ['Vision Works', 'Chandler', '85226', '7', '04/23/2020']
134 ['Wild River Family Entertainment Center', 'Somerton', '85350', '9', '04/07/2020']
135 ['Yelp', 'Scottsdale', '85251', '7', '04/09/2020']
136 ['Zip Recruiter', 'Santa Monica', '90401', '7', '03/27/2020']
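Since the rows come back ordered by employer name, below is a minimal sketch of collecting them into a list and sorting by the date column instead. It reuses the same URLs, form fields, and selectors as the script above; the page-size stopping check and the datetime parsing are assumptions added for illustration:

import requests
from bs4 import BeautifulSoup
from datetime import datetime

url = 'https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?def=false&securitysys=on'
page_url = 'https://www.azjobconnection.gov/ada/mn_warn_dsp.cfm?securitysys=on&start_row={p}&max_rows=50&orderby=employer&choice=1'

rows = []
with requests.session() as s:
    # Fetch the search form once so its hidden fields (including the token) can be reused.
    soup = BeautifulSoup(s.get(url).content, 'html.parser')
    data = {i['name']: i.get('value', '') for i in soup.select('input')}
    del data['clear']
    data['startdate'] = '01/01/2020'
    data['layoffType'] = 'y'

    p = 1
    while True:
        soup = BeautifulSoup(s.post(page_url.format(p=p), data=data).content, 'html.parser')
        page_rows = [[td.get_text(strip=True) for td in tr.select('td')]
                     for tr in soup.select('tr.cfOutputTableRow')]
        rows.extend(page_rows)
        if len(page_rows) < 50:  # a short page means it was the last one
            break
        p += 50

# Sort by the date column (last field, mm/dd/yyyy) instead of by employer name.
rows.sort(key=lambda r: datetime.strptime(r[-1], '%m/%d/%Y'))
for r in rows:
    print(r)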
Source: https://stackoverflow.com/questions/62416246/grabbing-data-from-subsequent-pages-of-a-website