I want to extract:
image tag anddiv class data
All the above answers really help me to construct my answer, because of this I voted for all the answers that other users put it out: But I finally put together my own answer to exact problem I was dealing with:
As question clearly defined I had to access some of the siblings and its children in a dom structure: This solution will iterate over the images in the dom structure and construct image name using product title and save the image to the local directory.
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
from BeautifulSoup import BeautifulSoup as bs
import requests
def getImages(url):
#Download the images
r = requests.get(url)
html = r.text
soup = bs(html)
output_folder = '~/amazon'
#extracting the images that in div(s)
for div in soup.findAll('div', attrs={'class':'image'}):
modified_file_name = None
try:
#getting the data div using findNext
nextDiv = div.findNext('div', attrs={'class':'data'})
#use findNext again on previous object to get to the anchor tag
fileName = nextDiv.findNext('a').text
modified_file_name = fileName.replace(' ','-') + '.jpg'
except TypeError:
print 'skip'
imageUrl = div.find('img')['src']
outputPath = os.path.join(output_folder, modified_file_name)
urlretrieve(imageUrl, outputPath)
if __name__=='__main__':
url = r'http://www.amazon.com/s/ref=sr_pg_1?rh=n%3A172282%2Ck%3Adigital+camera&keywords=digital+camera&ie=UTF8&qid=1343600585'
getImages(url)