python beautifulsoup extracting text

旧街凉风 提交于 2019-12-02 04:42:52

Make sure you understand what is going on here:

import urllib2
import datetime

from bs4 import BeautifulSoup as soup


url = "http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours"
# Download the page and hand the response straight to BeautifulSoup.
web_soup = soup(urllib2.urlopen(url))

# Take the first <table> inside the third <div> found under <div class="c1">.
# These positional indexes are tied to the page layout at the time of writing.
table = web_soup.find(name="div", attrs={'class': 'c1'}).find_all(name="div")[2].find_all('table')[0]

# Maps an hour slot (a datetime counted up from 12AM) to the stripped cell text.
data = {}
# Hour slot of the cell that contains a <strong> tag; None until one is seen.
# A None sentinel (rather than '') lets the code below detect "nothing bold"
# instead of failing with a TypeError on `'' - timedelta`.
bold_time = None
cur_time = datetime.datetime.strptime("12AM", "%I%p")
one_hour = datetime.timedelta(hours=1)

for tr in table.find_all('tr'):
    # Rows whose text contains 'Time' are skipped; only the other rows are read.
    if 'Time' in tr.text:
        continue
    # The first cell of each remaining row is skipped (presumably the row
    # label); every following cell is assigned the next consecutive hour.
    for td in tr.find_all('td')[1:]:
        data[cur_time] = td.text.strip()
        if td.find('strong'):
            bold_time = cur_time
        cur_time += one_hour

# Single-argument print(...) produces the same output on Python 2 and 3.
if bold_time is None:
    print("No bold reading found in the table")
else:
    # data.get() yields None for hour slots that were never filled in.
    print(data.get(bold_time))  # bold
    print(data.get(bold_time - one_hour))  # before bold
    print(data.get(bold_time - 2 * one_hour))  # before before bold

This will print the 3-hr PSI value that is marked in bold and the two values before it (if they exist).

Hope that helps.

This code (see the lines marked with `# changed`)

from pprint import pprint
import urllib2
from bs4 import BeautifulSoup as soup


url = "http://app2.nea.gov.sg/anti-pollution-radiation-protection/air-pollution/psi/psi-readings-over-the-last-24-hours"
# Fetch the page and parse the response with BeautifulSoup.
web_soup = soup(urllib2.urlopen(url))

# Locate <div class="c1">, then take the first <table> of the third <div> under it.
c1_div = web_soup.find(name="div", attrs={'class': 'c1'})
table = c1_div.find_all(name="div")[2].find_all('table')[0]

# One inner list per <tr>, holding the stripped text of each of its <td> cells.
table_rows = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in table.find_all('tr')
]

# Walk the even-indexed rows and pair each of their cells with the cell in the
# same column of the row directly below, giving a flat list of two-item lists.
data = [  # changed
    [label, table_rows[row_no + 1][col_no]]  # changed
    for row_no in range(0, len(table_rows), 2)
    for col_no, label in enumerate(table_rows[row_no])
]

pprint(data)

gives you

[[u'Time', u'3-hr PSI'],
 [u'12AM', u'57'],
 [u'1AM', u'-'],
 [u'2AM', u'-'],
 [u'3AM', u'-'],
 [u'4AM', u'-'],
 [u'5AM', u'-'],
 [u'6AM', u'-'],
 [u'7AM', u'-'],
 [u'8AM', u'-'],
 [u'9AM', u'-'],
 [u'10AM', u'-'],
 [u'11AM', u'-'],
 [u'Time', u'3-hr PSI'],
 [u'12PM', u'-'],
 [u'1PM', u'-'],
 [u'2PM', u'-'],
 [u'3PM', u'-'],
 [u'4PM', u'-'],
 [u'5PM', u'-'],
 [u'6PM', u'-'],
 [u'7PM', u'-'],
 [u'8PM', u'-'],
 [u'9PM', u'-'],
 [u'10PM', u'-'],
 [u'11PM', u'-']]

and `print data[4:7]` gives you

[[u'3AM', u'-'], [u'4AM', u'-'], [u'5AM', u'-']]
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!