Scrapes Emails from a list of URLs saved in CSV - BeautifulSoup

Submitted by 醉酒当歌 on 2020-01-06 05:50:09

Question


I am trying to parse through a list of URLs saved in CSV format to scrape email addresses. However, the code below only manages to fetch email addresses from a single website. I need advice on how to modify the code to loop through the list and save the outcome (the list of emails) to a CSV file.

import requests
import re
import csv
from bs4 import BeautifulSoup

allLinks = [];mails=[]
with open(r'url.csv', newline='') as csvfile:
    urls = csv.reader(csvfile, delimiter=' ', quotechar='|')
    links = []
    for url in urls:
        response = requests.get(url)
        soup=BeautifulSoup(response.text,'html.parser')
        links = [a.attrs.get('href') for a in soup.select('a[href]') ]

allLinks=set(links)

def findMails(soup):
    for name in soup.find_all('a'):
        if(name is not None):
            emailText=name.text
            match=bool(re.match('[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$',emailText))
            if('@' in emailText and match==True):
                emailText=emailText.replace(" ",'').replace('\r','')
                emailText=emailText.replace('\n','').replace('\t','')
                if(len(mails)==0)or(emailText not in mails):
                    print(emailText)
                mails.append(emailText)
for link in allLinks:
    if(link.startswith("http") or link.startswith("www")):
        r=requests.get(link)
        data=r.text
        soup=BeautifulSoup(data,'html.parser')
        findMails(soup)

    else:
        newurl=url+link
        r=requests.get(newurl)
        data=r.text
        soup=BeautifulSoup(data,'html.parser')
        findMails(soup)

mails=set(mails)
if(len(mails)==0):
    print("NO MAILS FOUND")

Answer 1:


You are overwriting links on each pass of the loop when you want to add to it. Use += to accumulate:

allLinks = [];mails=[]
urls = ['https://www.nus.edu.sg/', 'http://gwiconsulting.com/']
links = []

for url in urls:
    response = requests.get(url)
    soup=BeautifulSoup(response.text,'html.parser')
    links += [a.attrs.get('href') for a in soup.select('a[href]') ]

allLinks=set(links)
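As a side note, csv.reader yields each row as a list of fields, not a string, which is why passing a row straight to requests.get fails in the original question. If url.csv has one URL per row, flattening to a list of strings looks like this (a sketch, assuming that file layout; the in-memory sample stands in for the real file):

```python
import csv
import io

# Stand-in for url.csv: one URL per row, in the first column.
sample = "https://www.nus.edu.sg/\nhttp://gwiconsulting.com/\n"

urls = []
for row in csv.reader(io.StringIO(sample)):
    if row:                  # skip blank lines
        urls.append(row[0])  # row is a list of fields; take the first one

print(urls)
```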

At the end, loop over your mails and write them to a CSV file:

import csv

with open("emails.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter = ",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Email'])
    for mail in mails:
        w.writerow([mail])  # writerow expects a sequence; a bare string would be split into characters
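One gotcha here: csv.writer.writerow expects a sequence, so passing a bare string writes each character as a separate column. Wrapping each address in a one-element list keeps one email per row. A quick demonstration, writing to an in-memory buffer instead of a file (the address is made up):

```python
import csv
import io

mails = {'alice@example.com'}  # hypothetical data; real addresses come from the scrape

buf = io.StringIO()
w = csv.writer(buf, delimiter=",", quoting=csv.QUOTE_MINIMAL)
w.writerow(['Email'])
for mail in mails:
    w.writerow([mail])  # one-element list -> one column per row

print(buf.getvalue())
```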


Source: https://stackoverflow.com/questions/58058619/scrapes-emails-from-a-list-of-urls-saved-in-csv-beautifulsoup
