Scraping Google Images using Selenium in Python

风格不统一 提交于 2021-02-11 08:44:29

问题


I have been trying to scrape Google Images using the following code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
import os
import time
import requests
import re
import urllib2
import re
from threading import Thread
import json
#Assuming I have a folder named Pictures1, the images are downloaded there. 
def threaded_func(url,i):
    """Download the image at ``url`` and save it as ``Pictures1/<image_type>_<i>``.

    url -- address of the image to fetch.
    i   -- index of the image; becomes the numeric suffix of the file name.

    Reads the module-level ``image_type`` for the file-name prefix and expects
    the ``Pictures1`` directory to exist already.
    """
    response = urllib2.urlopen(url)
    try:
        raw_img = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    # Build the name from the index argument rather than a module-level
    # counter, so the result does not depend on global state at call time.
    with open("Pictures1/" + image_type + "_" + str(i), 'wb') as f:
        f.write(raw_img)
# Launch Firefox and open the Google Images landing page.
driver = webdriver.Firefox()
driver.get("https://images.google.com/")
# Locate the search input through an absolute XPath, then type the query
# and submit it with the Return key.
elem = driver.find_element_by_xpath('/html/body/div/div[3]/div[3]/form/div[2]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/div/input[1]')
elem.clear()
elem.send_keys("parrot")
elem.send_keys(Keys.RETURN)
# File-name prefix for the downloaded images (read as a global by threaded_func).
image_type = "parrot_defG"
# Not referenced again in this script.
images=[]
# Counter passed to threaded_func as its second argument; incremented once per result.
total=0
# Fixed 10-second pause before querying the page -- presumably to let the
# results finish loading.
time.sleep(10)
for a in driver.find_elements_by_class_name('rg_meta'):
     # NOTE(review): the accompanying text reports a JSON decode error from
     # this script; a.text is the value being parsed here.
     link =json.loads(a.text)["ou"]
     # The thread is joined immediately after it is started, so the
     # downloads run one at a time.
     thread = Thread(target = threaded_func, args = (link,total))
     thread.start()
     thread.join()
     total+=1

I opened the Google Images results page using Selenium and noticed that every result has a div with class 'rg_meta' that contains JSON.

I tried to access it using .text. The 'ou' key of the JSON holds the source URL of the image I am trying to download. I am trying to get all such divs with class 'rg_meta' and download the images, but it fails with the error "No JSON object could be decoded" and I have no idea what to do.

EDIT: This is what I am talking about :

    <div class="rg_meta">{"cl":3,"id":"FqCGaup9noXlMM:","isu":"kids.britannica.com","itg":false,"ity":"jpg","oh":600,"ou":"http://media.web.britannica.com/eb-media/89/89689-004-4C85E0F0.jpg","ow":380,"pt":"grain weevil -- Kids Encyclopedia | Children\u0026#39;s Homework Help ...","rid":"EusB0pk_sLg7vM","ru":"http://kids.britannica.com/comptons/art-143712/grain-or-granary-weevil","s":"grain weevil","sc":1,"st":"Kids Britannica","th":282,"tu":"https://encrypted-tbn2.gstatic.com/images?q\u003dtbn:ANd9GcQPbgXbRVzOicvPfBRtAkLOpJwy_wDQEC6a2q0BuTsUx-s0-h4b","tw":179}</div>

Check the "ou" index of the JSON. Please help me extract it.

Forgive me for my ignorance.

I solved it by making the following update:

    for meta_div in driver.find_elements_by_xpath('//div[@class="rg_meta"]'):
        # Parse the div's innerHTML (the JSON payload) instead of .text,
        # which is what produced the decode error described above.
        payload = json.loads(meta_div.get_attribute('innerHTML'))
        link = payload["ou"]
        print(link)
        # Start the download and wait for it before moving to the next result.
        worker = Thread(target=threaded_func, args=(link, total))
        worker.start()
        worker.join()
        total += 1

回答1:


Replacing:

driver.find_elements_by_class_name('rg_meta') with driver.find_element_by_xpath('//div[@class="rg_meta"]/text()')

and a.text with a

will resolve your issue.

The resultant code :

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
import os
import time
import requests
import re
import urllib2
import re
from threading import Thread
import json
#Assuming I have a folder named Pictures1, the images are downloaded there. 
def threaded_func(url,i):
    """Download the image at ``url`` and save it as ``Pictures1/<image_type>_<i>``.

    url -- address of the image to fetch.
    i   -- index of the image; becomes the numeric suffix of the file name.

    Reads the module-level ``image_type`` for the file-name prefix and expects
    the ``Pictures1`` directory to exist already.
    """
    response = urllib2.urlopen(url)
    try:
        raw_img = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    # Build the name from the index argument rather than a module-level
    # counter, so the result does not depend on global state at call time.
    with open("Pictures1/" + image_type + "_" + str(i), 'wb') as f:
        f.write(raw_img)
# Open Google Images in Firefox and submit the search query.
driver = webdriver.Firefox()
driver.get("https://images.google.com/")
elem = driver.find_element_by_xpath('/html/body/div/div[3]/div[3]/form/div[2]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/div/input[1]')
elem.clear()
elem.send_keys("parrot")
elem.send_keys(Keys.RETURN)
# File-name prefix read by threaded_func.
image_type = "parrot_defG"
images=[]
total=0
# Fixed pause before reading the result elements.
time.sleep(10)
# Select the rg_meta <div> elements themselves: find_element_by_xpath returns
# a single element (not an iterable), and an XPath ending in /text() yields a
# text node, which WebDriver cannot hand back as an element.
for a in driver.find_elements_by_xpath('//div[@class="rg_meta"]'):
    # The JSON payload is the div's inner markup; "ou" holds the source image URL.
    link = json.loads(a.get_attribute('innerHTML'))["ou"]
    # Start the download and wait for it before moving to the next result.
    thread = Thread(target=threaded_func, args=(link, total))
    thread.start()
    thread.join()
    total += 1

Printing link results in :

http://media.web.britannica.com/eb-media/89/89689-004-4C85E0F0.jpg


来源:https://stackoverflow.com/questions/42893382/scraping-google-images-using-selenium-in-python

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!