问题
I am trying to parse thru a list of URLs saved in CSV format to scrape email addresses. However, the below code only managed to fetch email addresses from a single website. Need advice on how to modify the code to loop thru the list and save the outcome (the list of emails) to csv file.
import requests
import re
import csv
from bs4 import BeautifulSoup
allLinks = [];mails=[]
with open(r'url.csv', newline='') as csvfile:
urls = csv.reader(csvfile, delimiter=' ', quotechar='|')
links = []
for url in urls:
response = requests.get(url)
soup=BeautifulSoup(response.text,'html.parser')
links = [a.attrs.get('href') for a in soup.select('a[href]') ]
allLinks=set(links)
def findMails(soup):
for name in soup.find_all('a'):
if(name is not None):
emailText=name.text
match=bool(re.match('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$',emailText))
if('@' in emailText and match==True):
emailText=emailText.replace(" ",'').replace('\r','')
emailText=emailText.replace('\n','').replace('\t','')
if(len(mails)==0)or(emailText not in mails):
print(emailText)
mails.append(emailText)
for link in allLinks:
if(link.startswith("http") or link.startswith("www")):
r=requests.get(link)
data=r.text
soup=BeautifulSoup(data,'html.parser')
findMails(soup)
else:
newurl=url+link
r=requests.get(newurl)
data=r.text
soup=BeautifulSoup(data,'html.parser')
findMails(soup)
mails=set(mails)
if(len(mails)==0):
print("NO MAILS FOUND")
回答1:
You are overwriting links when you want to add to it.
allLinks = [];mails=[]
urls = ['https://www.nus.edu.sg/', 'http://gwiconsulting.com/']
links = []
for url in urls:
response = requests.get(url)
soup=BeautifulSoup(response.text,'html.parser')
links += [a.attrs.get('href') for a in soup.select('a[href]') ]
allLinks=set(links)
At end loop your mails and write to csv
import csv
with open("emails.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
w = csv.writer(csv_file, delimiter = ",", quoting=csv.QUOTE_MINIMAL)
w.writerow(['Email'])
for mail in mails:
w.writerow(mail)
来源:https://stackoverflow.com/questions/58058619/scrapes-emails-from-a-list-of-urls-saved-in-csv-beautifulsoup