问题
I am working on a web-scraping script. It reads company names from a text file line by line, searches each name on Google, and scrapes the address from the results. I want to store each result next to its respective name. This is my text file a.txt:
0.5BN FINHEALTH PRIVATE LIMITED
01 SYNERGY CO.
1 BY 0 SOLUTIONS
and this is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape an address and a phone number from the Google results page for each
# company name listed in a.txt and write them to an Excel workbook.
# NOTE(review): the indentation of this snippet was lost in this copy, so the
# nesting mentioned in the comments below is presumed from statement order --
# confirm against the original post.
# Browser-like User-Agent string sent with every request.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# Input file; each line is used as one search query.
out_fl = open('a.txt','r')
for line in out_fl:
# Build the query from the raw line by replacing spaces with '+'.
# The line's trailing newline is not removed here (it is only stripped
# later, when the "company" value is built).
query = line
query = query.replace(' ', '+')
# print(line)
URL = f"https://google.com/search?q={query}"
# print(URL)
headers = {"user-agent": USER_AGENT}
resp = requests.get(URL, headers=headers)
# The page is parsed only when the request returned HTTP 200.
if resp.status_code == 200:
soup = BeautifulSoup(resp.content, "html.parser")
# NOTE(review): `results` is re-created as an empty list at this point. If
# this statement sits inside the per-line loop, rows collected for earlier
# companies are discarded -- presumably the overwrite the post asks about.
results = []
for g in soup.find_all('div', class_="i4J0ge"):
# NOTE(review): the lookups below search the whole `soup`, not the matched
# div `g`, so `g` itself is never used.
address = soup.find('span', class_="LrzXr")
if address:
address = (address.text)
else:
# Only prints a message; `address` keeps the falsy result of soup.find.
print("Not found")
phone = soup.find('span',class_="LrzXr zdqRlf kno-fv")
if phone:
phone = (phone.text)
else:
# Only prints a message; `phone` keeps the falsy result of soup.find.
print("None")
# `company` is assigned here but not read anywhere below in this snippet.
company = line
# One output row: the company name without its newline, plus the scraped
# address and phone values.
item = {"company": line.replace('\n',''),"Address" : address,"Phone" : phone}
# print(item)
results.append(item)
print(results)
# Build a DataFrame from the current `results` and write it to filename.xlsx.
# NOTE(review): if these two statements run once per input line, the workbook
# is rewritten on every iteration and only the last company's rows remain --
# confirm nesting.
df = pd.DataFrame(results, columns=["company", "Address", "Phone"])
df.to_excel('filename.xlsx', sheet_name='sheet name', index=False)
out_fl.close()
The output file ends up containing only the last result, and I don't know where it gets overwritten. Please help me out. Thanks.
回答1:
This script will produce CSV with companies/phones from your input file a.txt:
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Look up every company listed in a.txt on Google and save the scraped
# address and phone number of each one to data.csv.

# Browser-like User-Agent header sent with every search request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0'}

# Read the company names, one per line, dropping surrounding whitespace
# (including the trailing newline) and skipping blank lines.
with open('a.txt','r') as f_in:
    companies = [line.strip() for line in f_in if line.strip()]

# One row per company, accumulated across the WHOLE run. The list is created
# once, before the loop, so earlier rows are never discarded.
all_data = []
for company in companies:
    print(company)

    # Passing the name through `params` lets requests URL-encode it. The
    # timeout keeps one stalled connection from hanging the entire run;
    # requests raises an exception if the limit is exceeded.
    response = requests.get(
        'https://google.com/search',
        params={'q': company, 'hl': 'en'},
        headers=headers,
        timeout=30,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    # First element carrying the address class, or a placeholder when the
    # results page has none.
    address = soup.select_one('.LrzXr')
    address = address.text if address else 'Not Found'

    # The phone element carries all three classes at once.
    phone = soup.select_one('.LrzXr.zdqRlf.kno-fv')
    phone = phone.text if phone else 'Not Found'

    all_data.append({"Company": company, "Address": address, "Phone": phone})

# Build the table and write the file once, after every company was processed.
df = pd.DataFrame(all_data)
df.to_csv('data.csv')
Produces data.csv (screenshot from LibreOffice):
回答2:
Try this, but you will have to modify the results as per your needs:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Search Google for every company name in a.txt and write one row per company
# (its name plus the list of scraped address/phone items) to results.xlsx.

# Browser-like User-Agent string sent with every search request.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
headers = {"user-agent": USER_AGENT}

# Rows are accumulated in a plain list created once, before the loop, and the
# DataFrame is built at the end. This avoids referencing `results` before it
# exists and avoids label-based row assignment, which would silently replace
# an earlier row whenever the same name appears twice in the input.
rows = []

# The context manager closes the input file even if a request fails.
with open('a.txt', 'r') as out_fl:
    for line in out_fl:
        # Drop the trailing newline so it does not leak into the query or
        # into the output; skip blank lines entirely.
        company = line.strip()
        if not company:
            continue

        # `params` lets requests URL-encode the name. The timeout keeps one
        # stalled connection from hanging the run; requests raises an
        # exception if the limit is exceeded.
        resp = requests.get(
            "https://google.com/search",
            params={"q": company},
            headers=headers,
            timeout=30,
        )

        # Start from an empty list for every company so that a failed request
        # still yields a row instead of reusing the previous company's items.
        results = []
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.content, "html.parser")
            for g in soup.find_all('div', class_="i4J0ge"):
                # NOTE(review): the lookups search the whole page rather than
                # the matched div `g`, so every matched div yields the same
                # item -- confirm whether a per-div lookup was intended.
                address = soup.find('span', class_="LrzXr")
                if address:
                    address = address.text
                else:
                    print("Not found")
                phone = soup.find('span', class_="LrzXr zdqRlf kno-fv")
                if phone:
                    phone = phone.text
                else:
                    print("None")
                results.append({"company": company, "Address": address, "Phone": phone})
        print(results)

        rows.append({"company": company, "result": results})

# Build the table and write the workbook once, after all companies are done.
df = pd.DataFrame(rows, columns=["company", "result"])
df.to_excel("results.xlsx", sheet_name="result", index=False)
来源:https://stackoverflow.com/questions/62870492/pandas-is-not-writing-all-the-results-it-overwrites-and-gives-only-the-last-res