问题
I have this assignment for a python class where I have to start from a specific link at a specific position, then follow that link for a specific number of times. Supposedly the first link has the position 1. This is the link: http://python-data.dr-chuck.net/known_by_Fikret.html
(See the attached traceback error picture.) I have trouble with locating the link — the error "index out of range" comes up. Can anyone help with figuring out how to locate the link/position? This is my code:
import urllib
from BeautifulSoup import *

# NOTE(review): Python 2 script (raw_input / print statement, BeautifulSoup 3 import).
url = raw_input('Enter - ')
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
count = int(raw_input('Enter count: '))+1
position = int(raw_input('Enter position: '))
tags = soup('a')
tags_lst = list()
# Collect the href of every anchor tag on the starting page.
for tag in tags:
    needed_tag = tag.get('href', None)
    tags_lst.append(needed_tag)
# BUG (the subject of the question): this prints the SAME href `count` times
# and never fetches that link to repeat the lookup on the next page.
# `tags_lst[position]` raises IndexError when position >= len(tags_lst),
# which is the "index out of range" error reported above.
for i in range(0,count):
    print 'retrieving: ',tags_lst[position]
OK I wrote this code and it kind of works:
import urllib
from BeautifulSoup import *

# NOTE(review): Python 2 script, same setup as the first attempt.
url = raw_input('Enter - ')
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
count = int(raw_input('Enter count: '))+1
position = int(raw_input('Enter position: '))
tags = soup('a')
tags_lst = list()
# Collect the href of every anchor tag on the starting page.
for tag in tags:
    needed_tag = tag.get('href', None)
    tags_lst.append(needed_tag)
# BUG: this walks FORWARD through the anchors of the first page only
# (position, position+1, ...). The assignment instead requires fetching the
# page behind the link at `position` and repeating the lookup on each newly
# retrieved page — which is why the printed links differ from the example.
for i in range(0,count):
    print 'retrieving: ',tags_lst[position]
    position = position + 1
I'm still getting other links than the ones in the example however when I print the whole list of links the positions match so I don't know. Very weird.
回答1:
[Edit: Cut+pasted this line from comments] Hi! I had to work on a similar exercise, and because I had some doubts I found your question. Here is my code, and I think it works. I hope it will be helpful for you.
import urllib
from bs4 import BeautifulSoup

# Starting page and assignment parameters, hard-coded for this exercise.
url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
count = 8
position = 18

tags_lst = []
# Each round: take the anchor at the 1-based `position`, remember its href,
# then load that page so the next round searches the new page.
for _round in xrange(count - 1):
    anchors = soup('a')
    chosen = anchors[position - 1]
    href = chosen.get('href', None)
    tags_lst.append(href)
    url = str(href)
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
回答2:
Your BeautifulSoup import was wrong. I don't think it works with the code you show. Also your lower loop was confusing. You can get the list of urls you want by slicing the completely retrieved list.
I've hardcoded your url in my code because it was easier than typing it in each run.
Try this:
import urllib
from bs4 import BeautifulSoup
#url = raw_input('Enter - ')
url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# print soup
count = int(raw_input('Enter count: '))+1
position = int(raw_input('Enter position: '))
tags = soup('a')
# next line gets count tags starting from position
my_tags = tags[position: position+count]
tags_lst = []
for tag in my_tags:
needed_tag = tag.get('href', None)
tags_lst.append(needed_tag)
print tags_lst
回答3:
Almost all solutions to this assignment have two sections to load the urls. Instead, I defined a function that prints the relevant link for any given url.
Initially, the function will use the Fikret.html url as input. Subsequent inputs rely on refreshed urls that appear on the required position.
The important line of code is this one: url = allerretour(url)[position-1] This gets the new url that feeds the loop another round.
import urllib
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html' # raw_input('Enter URL : ')
position = 3 # int(raw_input('Enter position : '))
count = 4 #int(raw_input('Enter count : '))

def allerretour(url):
    """Announce and fetch *url*; return the href of every anchor on the page."""
    print('Retrieving: ' + url)
    soup = BeautifulSoup(urllib.urlopen(url).read())
    return [anchor.get('href', None) for anchor in soup('a')]

# Each round replaces `url` with the link found at the required position,
# feeding the next round.
for _round in range(1, count + 2):
    url = allerretour(url)[position - 1]
回答4:
This is my solution:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors so https pages load without a local cert store.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter: ')
# FIX: "relative to first link" was bare prose in the original paste — a
# SyntaxError. It is now part of this comment: link_line is the 0-based
# index of the wanted anchor, relative to the first link.
link_line = int(input("Enter position: ")) - 1
count = int(input("Enter count: "))

# NOTE: the original fetched `url` once here and immediately refetched it in
# the loop; the redundant pre-loop fetch has been removed.

# Runs count + 1 times: prints the start URL, then each followed link.
while count >= 0:
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    print(url)
    url = tags[link_line].get("href", None)
    count = count - 1
回答5:
This is my answer that worked for me in Python 2.7:
import urllib
from BeautifulSoup import *
URL = raw_input("Enter the URL:") #Enter main URL
link_line = int(raw_input("Enter position:")) - 1 #The position of link relative to first link
count = int(raw_input("Enter count:")) #The number of times to be repeated
while count >= 0:
html = urllib.urlopen(URL).read()
soup = BeautifulSoup(html)
tags = soup('a')
print URL
URL = tags[link_line].get("href", None)
count = count - 1
回答6:
Here is the working code giving the desired output
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

n=1
url = input('Enter - ')
count= int(input('Enter count'))+1
pos=int(input('Enter position'))
new=url  # current page URL; starts at the user-supplied one
while n<count:
    # First pass only (new == url): echo the starting URL.
    # NOTE(review): indentation was lost in the paste — it is unclear whether
    # the print belongs inside this if; placed inside here, which matches the
    # apparent intent (print the start URL once). The fetch result in this
    # branch is discarded (overwritten just below), so the fetch is redundant.
    if new == url:
        html = urllib.request.urlopen(url, context=ctx).read()
        print('Retrieving', url)
    html = urllib.request.urlopen(new, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    my_tags=tags[pos-1]  # anchor at the requested 1-based position
    new=my_tags.get('href', None)  # link followed on the next pass
    print('Retrieving' , new)
    n=n+1
回答7:
I put the solution below, tested and working well as of today.
# importing the required modules
# FIX: in the original paste the three section headings ("importing the
# require modules", "accessing websites", "Retrieve all of the anchor tags")
# were bare prose lines — SyntaxErrors; they are now comments.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re  # NOTE(review): imported but never used in this snippet

# accessing the website
url = "http://py4e-data.dr-chuck.net/known_by_Vairi.html"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

link_position = 18   # 1-based position of the link to follow on each page
Process_repeat = 7   # number of hops to make
# NOTE: the original also created `all_num_list`, which was never used; removed.

# Retrieve all of the anchor tags
tags = soup('a')

# Follow the link at `link_position` Process_repeat times, printing progress.
while Process_repeat - 1 >= 0 :
    print("Process round", Process_repeat)
    target = tags[link_position - 1]
    print("target:", target)
    # NOTE(review): default of 2 (rather than None) looks accidental but is
    # harmless when an href attribute is present — confirm intent.
    url = target.get('href', 2)
    print("Current url", url)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    Process_repeat = Process_repeat - 1
回答8:
import urllib.error, urllib.request
from bs4 import BeautifulSoup

def _nth_anchor(page_url, pos):
    """Fetch *page_url*; return (href, link_text) of its pos-th anchor (1-based)."""
    page = BeautifulSoup(urllib.request.urlopen(page_url).read(), 'html.parser')
    anchor = page('a')[pos - 1]
    return anchor.get('href', None), anchor.contents[0]

#url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
url = input('Enter link - ')
count = int(input('Enter count - '))
position = int(input('position - ') )

# Print the name at `position` on the start page, then follow that link
# count - 1 more times, printing the name found on each new page.
needed_tag, name = _nth_anchor(url, position)
print("------ : ", name)
for _hop in range(count - 1):
    needed_tag, name = _nth_anchor(str(needed_tag), position)
    print("------ : ", name)
回答9:
Try this. You can skip entering the URL; a sample of your former link is used in the code. Good luck!
import urllib.request
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter url ')
cn = input('Enter count: ')
cnint = int(cn)
pos = input('Enter position: ')
posint = int(pos)

# FIX: the original passed
#   'http://py4e-data.dr-chuck.net/known_by_Fikret.html''''url'''
# to urlopen — adjacent string literals concatenate, so it requested the
# sample address with the literal text "url" appended (a broken URL) and
# ignored the user's input. Use the entered URL, falling back to the sample
# page when the user just presses Enter.
start = url or 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
html = urllib.request.urlopen(start, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')

# Follow the link at the 1-based position `cnint` times, printing each href.
# NOTE: the original also created an unused `tags_lst`; removed.
for x in range(0, cnint):
    tags = soup('a')
    my_tags = tags[posint - 1]
    needed_tag = my_tags.get('href', None)
    url = str(needed_tag)
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    print(my_tags.get('href', None))
来源:https://stackoverflow.com/questions/38267954/following-links-in-python-assignment-using-beautifulsoup