Question
I'm sending a request to https://cri.nbb.be/bc9/web/catalog?lang=E&companyNr=0456597806 and getting a response. Then I want to select more documents in the filter, for example 100, but the filter does not work. I can only find the first 10 documents in the response, not all of them.
import requests

url = "https://cri.nbb.be/bc9/web/catalog?lang=E&companyNr=0456597806"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
}

with requests.Session() as session:
    r = session.get('https://cri.nbb.be/bc9/web/catalog?lang=E&companyNr=0456597806', headers=headers)
    r = session.post(r.url, headers=headers, data='javax.faces.partial.ajax=true&javax.faces.source=j_idt131%3Aj_idt165&javax.faces.partial.execute=j_idt131&javax.faces.partial.render=j_idt131&j_idt131%3Aj_idt165=j_idt131%3Aj_idt165&j_idt131=j_idt131&j_idt131%3Aj_idt135_stateholder=panel_param_visible%3B&j_idt131%3Aj_idt165_selection=&j_idt131%3Aj_idt165_openDirect=&j_idt131%3Aj_idt165_pagerAction=action_number_of_rows_changed&j_idt131%3Aj_idt165_numberOfRows=100&j_idt131%3Aj_idt165_sortColumn=&j_idt131%3Aj_idt165_sortOrder=ascending&j_idt131%3Aj_idt164_stateholder=panel_param_visible%3B&javax.faces.ViewState=e2s1')
    print(r.content)
Answer 1:
Changing the pagination is a little bit tricky on this page: the form's j_idt… identifiers and hidden values are generated by the page itself, so the payload can't simply be hard-coded. You need to collect all <input> names and values from the live page and then set "action_number_of_rows_changed" and "100" on the right ones:
import requests
from bs4 import BeautifulSoup

url = 'https://cri.nbb.be/bc9/web/catalog?lang=E&companyNr=0456597806'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

with requests.Session() as s:
    soup = BeautifulSoup(s.get(url, headers=headers).content, 'html.parser')

    # the last <form> on the page is the catalog form; its action is the AJAX endpoint
    ajax_url = 'https://cri.nbb.be' + soup.select('form')[-1]['action']

    # set the pager action and the desired page size on the hidden inputs
    soup.select_one('input[name$="_pagerAction"]')['value'] = 'action_number_of_rows_changed'
    soup.select_one('input[name$="_numberOfRows"]')['value'] = '100'  # <--- change to desired value

    # collect all <input> names and values from the form
    data = {}
    for i in soup.select('form')[-1].select('input'):
        data[i['name']] = i.get('value', '')

    soup = BeautifulSoup(s.post(ajax_url, headers=headers, data=data).content, 'html.parser')

    # print new content:
    for i, tr in enumerate(soup.select('.baseDataTable tr:has(td)'), 1):
        print(i, [td.get_text(strip=True) for td in tr.select('td')])
Prints:
1 ['', '31/12/2018', 'Annual accounts', 'F', '26/06/2019', '2019-25200267', 'Download (37 KB)', 'Download']
2 ['', '31/12/2017', 'Annual accounts', 'F', '21/06/2018', '2018-24700578', 'Download (26 KB)', 'Download']
3 ['', '31/12/2016', 'Annual accounts', 'F', '25/08/2017', '2017-48600262', 'Download (27 KB)', 'Download']
4 ['', '31/12/2015', 'Annual accounts', 'F', '16/06/2016', '2016-18600096', 'Download (29 KB)', 'Download']
5 ['', '31/12/2014', 'Annual accounts', 'F', '16/07/2015', '2015-31900048', 'Download (34 KB)', 'Download']
6 ['', '31/12/2013', 'Annual accounts', 'F', '08/07/2014', '2014-28600526', 'Download (34 KB)', 'Download']
7 ['', '31/12/2012', 'Annual accounts', 'F', '17/06/2013', '2013-18300220', 'Download (36 KB)', 'Download']
8 ['', '31/12/2011', 'Annual accounts', 'F', '21/05/2012', '2012-12200223', 'Download (35 KB)', 'Download']
9 ['', '31/12/2010', 'Annual accounts', 'F', '30/05/2011', '2011-13600404', 'Download (35 KB)', 'Download']
10 ['', '31/12/2009', 'Annual accounts', 'F', '27/05/2010', '2010-13200289', 'Download (33 KB)', 'Download']
11 ['', '31/12/2008', 'Annual accounts', 'F', '24/06/2009', '2009-28600165', 'Download (33 KB)', 'Download']
12 ['', '31/12/2007', 'Annual accounts', 'F', '15/05/2008', '2008-14100115', 'Download (30 KB)', 'Download']
13 ['', '31/12/2006', 'Annual accounts', 'F', '03/07/2007', '2007-34400058', 'Download (28 KB)', 'Download']
14 ['', '31/12/2005', 'Annual accounts', 'F', '21/08/2006', '2006-64703392', 'Download (12 KB)', '']
15 ['', '31/12/2004', 'Annual accounts', 'F', '29/09/2005', '2005-75204389', 'Download (12 KB)', '']
16 ['', '31/12/2003', 'Annual accounts', 'F', '02/08/2004', '2004-57502594', 'Download (16 KB)', '']
17 ['', '31/12/2002', 'Annual accounts', 'F', '04/11/2003', '2003-78000638', 'Download (17 KB)', '']
18 ['', '31/12/2001', 'Annual accounts', 'F', '01/10/2002', '2002-61102242', 'Download (17 KB)', '']
19 ['', '31/12/2000', 'Annual accounts', 'F', '09/10/2001', '2001-63001711', 'Download (17 KB)', '']
20 ['', '31/12/1999', 'Annual accounts', 'F', '05/09/2000', '2000-54700887', 'Download (21 KB)', '']
21 ['', '31/12/1998', 'Annual accounts', 'F', '29/10/1999', '1999-56100291', 'Download (21 KB)', '']
22 ['', '31/12/1997', 'Annual accounts', 'F', '07/05/1998', '1998-08701537', '', '']
23 ['', '31/12/1996', 'Annual accounts', 'F', '16/10/1997', '1997-49304441', '', '']
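If you want to keep the listing instead of just printing it, the same rows can be written to a CSV file. A minimal sketch, assuming the soup variable from the snippet above and Python's standard csv module (the header names are guesses inferred from the printed columns, not labels taken from the page):

import csv

rows = [[td.get_text(strip=True) for td in tr.select('td')]
        for tr in soup.select('.baseDataTable tr:has(td)')]

with open('documents.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # hypothetical header, inferred from the printed columns above
    writer.writerow(['', 'Fiscal year end', 'Type', 'Language', 'Filing date',
                     'Reference', 'PDF', 'XBRL'])
    writer.writerows(rows)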
EDIT: To download all *.pdf and *.xbrl files you can use this example:
import requests
from bs4 import BeautifulSoup

url = 'https://cri.nbb.be/bc9/web/catalog?lang=E&companyNr=0456597806'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

with requests.Session() as s:
    soup = BeautifulSoup(s.get(url, headers=headers).content, 'html.parser')

    ajax_url = 'https://cri.nbb.be' + soup.select('form')[-1]['action']

    # first show all rows, as in the snippet above
    soup.select_one('input[name$="_pagerAction"]')['value'] = 'action_number_of_rows_changed'
    soup.select_one('input[name$="_numberOfRows"]')['value'] = '100'  # <--- change to desired value

    data = {}
    for i in soup.select('form')[-1].select('input'):
        data[i['name']] = i.get('value', '')

    soup = BeautifulSoup(s.post(ajax_url, headers=headers, data=data).content, 'html.parser')

    # the AJAX response contains a fresh form, so read its action again
    ajax_url = 'https://cri.nbb.be' + soup.select('form')[-1]['action']

    for n, a in enumerate(soup.select('a[id*="pdfDownload"]'), 1):
        if 'Download' not in a.text:
            continue
        print('Downloading {}.pdf'.format(n))

        # re-submit the form with this link's id set, which triggers the download
        data = {}
        for i in soup.select('form')[-1].select('input'):
            data[i['name']] = i.get('value', '')
        data[a['id']] = a['id']

        local_filename = '{}.pdf'.format(n)
        with s.post(ajax_url, headers=headers, data=data, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

    for n, a in enumerate(soup.select('a[id*="xbrlDownload"]'), 1):
        if 'Download' not in a.text:
            continue
        print('Downloading {}.xbrl'.format(n))

        data = {}
        for i in soup.select('form')[-1].select('input'):
            data[i['name']] = i.get('value', '')
        data[a['id']] = a['id']

        local_filename = '{}.xbrl'.format(n)
        with s.post(ajax_url, headers=headers, data=data, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
Prints:
Downloading 1.pdf
Downloading 2.pdf
Downloading 3.pdf
Downloading 4.pdf
Downloading 5.pdf
Downloading 6.pdf
Downloading 7.pdf
Downloading 8.pdf
Downloading 9.pdf
Downloading 10.pdf
Downloading 11.pdf
Downloading 12.pdf
Downloading 13.pdf
Downloading 14.pdf
Downloading 15.pdf
Downloading 16.pdf
Downloading 17.pdf
Downloading 18.pdf
Downloading 19.pdf
Downloading 20.pdf
Downloading 21.pdf
Downloading 1.xbrl
Downloading 2.xbrl
Downloading 3.xbrl
Downloading 4.xbrl
Downloading 5.xbrl
Downloading 6.xbrl
Downloading 7.xbrl
Downloading 8.xbrl
Downloading 9.xbrl
Downloading 10.xbrl
Downloading 11.xbrl
Downloading 12.xbrl
Downloading 13.xbrl
And saves them to disk.
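Fewer XBRL files are saved than PDFs because the oldest filings only offer a PDF version, as the empty last column in the table above shows. Also, since the two download loops differ only in the link selector and the file extension, they could be folded into one helper. A minimal sketch, assuming the session s, the parsed soup, ajax_url and headers from the script above are still in scope:

def download_links(selector, ext):
    # re-submit the form once per matching link, flagging that link as clicked
    for n, a in enumerate(soup.select(selector), 1):
        if 'Download' not in a.text:
            continue  # this row has no document of this type
        data = {i['name']: i.get('value', '') for i in soup.select('form')[-1].select('input')}
        data[a['id']] = a['id']  # mark this link as the AJAX source
        local_filename = '{}.{}'.format(n, ext)
        print('Downloading', local_filename)
        with s.post(ajax_url, headers=headers, data=data, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

download_links('a[id*="pdfDownload"]', 'pdf')
download_links('a[id*="xbrlDownload"]', 'xbrl')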
Source: https://stackoverflow.com/questions/62251937/im-sending-request-to-page-get-response-then-i-want-select-more-documents-in-fi