Getting table values from nowgoal raises an IndexError

问题

I am quite new to scraping. I am trying to collect match links from nowgoal; below is how I navigate to the page. I do not want the links for all matches: instead, I have an input txt file (attached here) that specifies the leagues and the date to use. The following code reads that input and initialises the script:

# Initialisation
league_index = []
final_list = []     # one entry per match collected for the selected leagues
j = 0               # set to 1 by the row loop while the current league is a selected one

# Load the user-supplied configuration
config = RawConfigParser()
configFilePath = r'.\config.txt'
config.read(configFilePath)
date = config.get('database_config', 'date')                 # provided by the user in YYYY-MM-DD format
leagues = config.get('database_config', 'leagues')           # provided by the user as comma-separated league names
                                                             # (original note: "provide in windows format" - meaning unclear, confirm)
headless_param = config.get('database_config', 'headless')   # 'True' runs Chrome headless, i.e. no visible browser window
# Strip surrounding whitespace so an entry written as "A, B" still matches
# the league names compared against this list later on.
leagues_list = [name.strip() for name in leagues.split(',')]
print(leagues_list)

After initialising the preferred date and leagues, I set up the Chrome driver as follows:

options = webdriver.ChromeOptions()         # initialise webdriver options
#options.binary_location = brave_path       # enable this when running the script on Brave
if headless_param == 'True':
    print('headless')
    options.headless = True                 # headless: the Chrome window does not appear in the foreground
options.add_argument('start-maximized')     # start Chrome maximised
options.add_argument('disable-infobars')    # disable infobars
options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Both preferences must go into ONE "prefs" dict: experimental options are
# stored by name, so a second add_experimental_option("prefs", ...) call
# replaces the first one instead of extending it.
options.add_experimental_option("prefs", {
    "profile.default_content_setting_values.cookies": 2,
    "profile.block_third_party_cookies": True,
})
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--incognito")         # incognito mode


# Initiate the driver
driver = webdriver.Chrome(resource_path('./drivers/chromedriver.exe'), options=options)

# Format the url for the requested date
url = 'http://www.nowgoal3.com/football/fixture/?f=ft0&date=' + date

# Load the page
driver.get(url)
# wait for some time so the page can finish loading
time.sleep(3)

driver.find_element_by_xpath('//*[@id="li_league"]').click()
time.sleep(5)
# click on the team ranking checkbox
driver.find_element_by_xpath('//*[@id="TeamOrderCheck"]').click()

After this, you will be brought to the following page

I have also added a snapshot below

I try to get the data from the table by looping over its rows; the code is as follows:

# Get the leagues name from page
htmlSource = driver.page_source
# Pass the htmlsource into soup
soup = bs4.BeautifulSoup(htmlSource, 'html.parser')
# Table
table = soup.select('table[id="table_live"]')
# Rows of table
all_rows = table[0].select('tr')
# Loop through each row (the first two rows are skipped)
for i, row in enumerate(all_rows[2:]):
    try:
        key_word = row['class'][0]          # raises KeyError when the row has no class attribute
        print(key_word)
        if 'Leaguestitle' in key_word:      # if leagues got changed
            league = row.a.text
            print(row.a.text)
            # j flags whether the rows that follow belong to a selected league
            if row.a.text in leagues_list:
                j = 1
            else:
                j = 0
        elif j == 1:
            home_team = row.findAll('a')[0].text                                    # home team
            print(home_team)
            away_team = row.findAll('a')[1].text                                    # away team
            match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip()))    # match_number
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'   # link for 3 in 1 odds from the match code
            home_ranking = row.findAll('span')[0].text.strip('[]')                  # home team ranking
            away_ranking = row.findAll('span')[1].text.strip('[]')                  # away team ranking
            final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
    except KeyError:
        # Row without a class attribute: fall back to its style attribute
        try:
            if row['style'] == 'display:none':
                continue
            elif j == 1:
                home_team = row.findAll('a')[0].text                                # home team
                away_team = row.findAll('a')[1].text                                # away team
                home_ranking = row.findAll('span')[0].text.strip('[]')              # home team ranking
                away_ranking = row.findAll('span')[1].text.strip('[]')              # away team ranking
                match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip()))    # match_code associated with each match
                link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'   # link for 3 in 1 odds from the match code
                final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
        except KeyError:
            print('KeyError')

    except IndexError:
        # NOTE(review): this handler repeats the very lookups that just raised,
        # so a row with fewer than two <a> tags raises IndexError again here.
        if j == 1:
            home_team = row.findAll('a')[0].text                                    # home team
            away_team = row.findAll('a')[1].text                                    # away team
            home_ranking = row.findAll('span')[0].text.strip('[]')                  # home team ranking
            away_ranking = row.findAll('span')[1].text.strip('[]')                  # away team ranking
            match_number = ''.join(filter(str.isdigit, row.findAll('a')[2]['href'].strip()))    # match_code associated with each match
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'   # link for 3 in 1 odds from the match code
            final_list.append([home_team, home_ranking, away_team, away_ranking, league, match_number, link])
            print('IndexError-captured')

print(final_list)   # show the final result
driver.quit()       # close the browser

Then I print out the home team and get the following result:

Chelsea adtext-bg QC: MAY88.COM - NHÀ CÁI HỢP PHÁP NA UY - THƯỞNG NẠP 100% - HOÀN TRẢ 100TR - HỖ TRỢ 24/7

Then it threw an IndexError as follows:

Traceback (most recent call last):
  File "D:/Football matters/Sttratagem data access/Games By Numbers/Nowgoal scraping project/codes/NOWGOAL-20200721T024808Z-001/NOWGOAL/PYFILES/Link_extractor_v1.3.py", line 124, in <module>
    away_team = row.findAll('a')[1].text                                                #away team
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/Football matters/Sttratagem data access/Games By Numbers/Nowgoal scraping project/codes/NOWGOAL-20200721T024808Z-001/NOWGOAL/PYFILES/Link_extractor_v1.3.py", line 149, in <module>
    away_team = row.findAll('a')[1].text                                            #away team
IndexError: list index out of range

I am seeking your kind advice on this and would greatly appreciate your help. Thanks, Zep.

回答1:

Prints all the information of each row.

wait = WebDriverWait(driver, 5)
driver.get('http://www.nowgoal3.com/football/fixture/?type=&f=sc1&date=2021-01-29')
league_list = ["English Premier League", 'Italian Serie A', 'England Championship', 'Spanish La Liga', 'Swedish Allsvenskan', 'USA Major League Soccer']
# Wait (up to 5 s) until the league filter is clickable, then click it
wait.until(EC.element_to_be_clickable((By.ID, "li_league"))).click()
# Click on the team ranking checkbox label
wait.until(EC.element_to_be_clickable((By.XPATH, "//label[@for='TeamOrderCheck']/span"))).click()
for league in league_list:
    try:
        # Header link of the league; find_element raises when the league is not on the page
        header = driver.find_element_by_xpath("//tr[@class='Leaguestitle fbHead']/td[2]/span/a[text()='"+league+"']")
        print(header.text)
        # NOTE(review): the `following::` axis matches every later matching <tr> in the
        # document, not only this league's rows - confirm whether rows that belong
        # to subsequent leagues should be excluded.
        rows = header.find_elements_by_xpath("./following::tr[contains(@id,'tr1')]")
        for row in rows:
            home = row.find_element_by_css_selector("td:nth-child(5) > a").text
            homeRank = row.find_element_by_css_selector("td:nth-child(5)  span.team-hg").text.strip('[]')
            away = row.find_element_by_css_selector("td:nth-child(7) > a").text
            awayRank = row.find_element_by_css_selector("td:nth-child(7)  span.team-hg").text.strip('[]')
            href = row.find_element_by_css_selector("td.toolimg >a:nth-child(3)").get_attribute('href')
            # Keep only the digits of the href and use them as the match number
            match_number = ''.join(filter(str.isdigit, href))
            link = 'http://data.nowgoal.group/3in1odds/' + match_number + '.html'
            # Print inside the loop so every row is reported, not just the last one
            print(home, homeRank, away, awayRank, link)
    except Exception:
        # Best effort: give up on this league when any lookup fails and move on
        # to the next one (KeyboardInterrupt / SystemExit are no longer swallowed).
        continue

来源：https://stackoverflow.com/questions/65929976/getting-table-value-from-nowgoal-has-got-an-index-error

标签

python-3.x

selenium

web-scraping

beautifulsoup