Following links in python assignment using Beautifulsoup

非 Y 不嫁゛ 提交于 2019-12-26 13:33:08

问题


I have this assignment for a python class where I have to start from a specific link at a specific position, then follow that link for a specific number of times. Supposedly the first link has the position 1. This is the link: http://python-data.dr-chuck.net/known_by_Fikret.html

[traceback error picture] I am having trouble locating the link: an "index out of range" error is raised. Can anyone help me figure out how to locate the link/position? This is my code:

import urllib
from BeautifulSoup import *

# Python 2 script: read a start URL, a repeat count and a link position,
# then collect the href of every anchor found on that one page.
url = raw_input('Enter - ')
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# One is added to the count typed by the user.
count = int(raw_input('Enter count: '))+1
position = int(raw_input('Enter position: '))


tags = soup('a')
tags_lst = list()
for tag in tags:
    needed_tag = tag.get('href', None)
    tags_lst.append(needed_tag)
    # NOTE(review): this loop is nested inside the per-tag loop, so
    # tags_lst[position] is read while the list is still being filled.
    # On early iterations the list is shorter than `position`, which is
    # presumably the source of the reported "index out of range" error.
    for i in range(0,count):
        print 'retrieving: ',tags_lst[position]

OK I wrote this code and it kind of works:

import urllib
from BeautifulSoup import *

# Python 2 script: second attempt. Reads a start URL, a repeat count and a
# link position, then collects the href of every anchor on that one page.
url = raw_input('Enter - ')
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# One is added to the count typed by the user.
count = int(raw_input('Enter count: '))+1
position = int(raw_input('Enter position: '))


tags = soup('a')
tags_lst = list()
for tag in tags:
    needed_tag = tag.get('href', None)
    tags_lst.append(needed_tag)
# NOTE(review): the page is fetched only once above. This loop prints
# consecutive entries of that same page's link list (position is bumped by
# one each pass) instead of opening the selected link and re-reading it.
# `position` is also used as a 0-based index here.
for i in range(0,count):    
    print 'retrieving: ',tags_lst[position]
    position = position + 1

I'm still getting links other than the ones in the example; however, when I print the whole list of links, the positions match, so I don't know what is wrong. Very weird.


回答1:


[Edit: Cut+pasted this line from comments] Hi! I had to work on a similar exercise, and because I had some doubts I found your question. Here is my code; I think it works. I hope it will be helpful for you.

import urllib
from bs4 import BeautifulSoup

# Python 2 script: start at a fixed page and repeatedly hop to the link
# found at a fixed 1-based position, recording every href that is followed.
count = 8
position = 18
url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
tags_lst = []

soup = BeautifulSoup(urllib.urlopen(url).read(), 'html.parser')

hops_left = count - 1
while hops_left > 0:
    # `position` is 1-based, list indexes are 0-based.
    anchor = soup('a')[position - 1]
    href = anchor.get('href', None)
    tags_lst.append(href)
    # Load the page behind the chosen link so the next pass reads its anchors.
    url = str(href)
    soup = BeautifulSoup(urllib.urlopen(url).read(), 'html.parser')
    hops_left -= 1



回答2:


Your BeautifulSoup import was wrong. I don't think it works with the code you show. Also your lower loop was confusing. You can get the list of urls you want by slicing the completely retrieved list.

I've hardcoded your url in my code because it was easier than typing it in each run.

Try this:

import urllib
from bs4 import BeautifulSoup

#url = raw_input('Enter - ')
url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# print soup
count = int(raw_input('Enter count: '))+1
position = int(raw_input('Enter position: '))


tags = soup('a')
# next line gets count tags starting from position
my_tags = tags[position: position+count]
tags_lst = []
for tag in my_tags:
    needed_tag = tag.get('href', None)
    tags_lst.append(needed_tag)
print tags_lst



回答3:


Almost all solutions to this assignment have two sections to load the urls. Instead, I defined a function that prints the relevant link for any given url.

Initially, the function uses the Fikret.html url as input. Subsequent inputs rely on the refreshed urls that appear at the required position. The important line of code is this one: url = allerretour(url)[position-1]. It gets the new url that feeds the loop for another round.

import urllib
from bs4 import BeautifulSoup
# Start page, 1-based link position and number of links to follow. The
# interactive prompts they replace are kept in the trailing comments.
url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'  # raw_input('Enter URL : ')
position = 3  # int(raw_input('Enter position : '))
count = 4  # int(raw_input('Enter count : '))


def allerretour(url):
    """Announce `url`, download it and return the href of each anchor.

    Anchors without an href contribute None to the returned list.
    """
    print('Retrieving: ' + url)
    page = urllib.urlopen(url).read()
    return [anchor.get('href', None) for anchor in BeautifulSoup(page)('a')]


# count + 1 passes: the start page plus `count` followed links.
for _ in range(count + 1):
    # `position` is 1-based, list indexes are 0-based.
    url = allerretour(url)[position - 1]



回答4:


This is my solution:

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors so pages with invalid certificates still load.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter: ')
# The position typed by the user is 1-based (relative to the first link);
# subtract one to get a 0-based list index.
link_line = int(input("Enter position: ")) - 1
count = int(input("Enter count: "))

# count + 1 passes: print the start URL, then each of the `count` links that
# are followed. Every pass downloads the current page, so no separate fetch
# is needed before the loop.
for _ in range(count + 1):
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    print(url)
    url = tags[link_line].get("href", None)



回答5:


This is my answer that worked for me in Python 2.7:

import urllib
from BeautifulSoup import *

URL = raw_input("Enter the URL:") #Enter main URL
link_line = int(raw_input("Enter position:")) - 1 #The position of link relative to first link
count = int(raw_input("Enter count:")) #The number of times to be repeated

while count >= 0:
    html = urllib.urlopen(URL).read()
    soup = BeautifulSoup(html)
    tags = soup('a')
    print URL
    URL = tags[link_line].get("href", None)
    count = count - 1



回答6:


Here is the working code giving the desired output

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
count = int(input('Enter count'))
pos = int(input('Enter position'))  # 1-based position of the link to follow

# Announce the starting page once, up front. The original re-downloaded it
# inside the loop whenever the current link equalled the start URL and then
# threw that first download away.
print('Retrieving', url)
new = url
for _ in range(count):
    html = urllib.request.urlopen(new, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    # `pos` is 1-based, list indexes are 0-based.
    new = soup('a')[pos - 1].get('href', None)
    print('Retrieving', new)



回答7:


I put the solution below, tested and working well as of today.

# Import the required modules

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import re

# Access the starting page
url = "http://py4e-data.dr-chuck.net/known_by_Vairi.html"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
link_position = 18  # 1-based position of the link to follow on each page
Process_repeat = 7  # number of links to follow

# Retrieve all of the anchor tags
tags = soup('a')

# Counts down from Process_repeat to 1, following one link per round.
while Process_repeat > 0:
    print("Process round", Process_repeat)
    # link_position is 1-based, list indexes are 0-based.
    target = tags[link_position - 1]
    print("target:", target)
    # Default to None, not an arbitrary integer, when the anchor has no href.
    url = target.get('href', None)
    print("Current url", url)
    # Load the page behind the chosen link so the next round reads its anchors.
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    Process_repeat = Process_repeat - 1



回答8:


import urllib.error, urllib.request
from bs4 import BeautifulSoup

#url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
url = input('Enter link - ')
count = int(input('Enter count - '))
position = int(input('position - '))  # 1-based position of the link to follow

# One pass per link followed. Each pass downloads the current page, prints
# the text of the anchor at `position`, and moves on to that anchor's href.
# A single loop replaces the copy of this logic that used to sit in front of
# it, so a count of 0 now follows no link at all.
for _ in range(count):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')

    # `position` is 1-based, list indexes are 0-based.
    my_tags = soup('a')[position - 1]
    print("------ : ", my_tags.contents[0])
    url = str(my_tags.get('href', None))



回答9:


Try this. You can leave the URL prompt blank; a sample of your former link is used instead. Good luck!

import urllib.request
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Sample start page, used when nothing is typed at the URL prompt.
DEFAULT_URL = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'

# The original glued the sample address and the literal text "url" into one
# string, which produced a broken address and ignored what the user typed.
url = input('Enter url ').strip() or DEFAULT_URL
cnint = int(input('Enter count: '))
posint = int(input('Enter position: '))  # 1-based position of the link

# One pass per link followed: download the current page, pick the anchor at
# the requested position, print its href and continue from there.
for _ in range(cnint):
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    # `posint` is 1-based, list indexes are 0-based.
    my_tags = soup('a')[posint - 1]
    url = str(my_tags.get('href', None))
    print(url)


来源:https://stackoverflow.com/questions/38267954/following-links-in-python-assignment-using-beautifulsoup

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!