import re

from selenium import webdriver
from selenium.webdriver.common.by import By
# Scrape unique email addresses from the anchor links of a single page.
url = "https://www.example.com"

# Start a Chrome-driven browser session and load the target page.
driver = webdriver.Chrome()
try:
    driver.get(url)

    # Collect every anchor element carrying an href attribute.
    # (find_elements_by_xpath was removed in Selenium 4; use find_elements + By.)
    links = driver.find_elements(By.XPATH, "//a[@href]")

    # Extract email-looking hrefs, de-duplicated while preserving order.
    emails = []
    for link in links:
        href = link.get_attribute("href")
        # Keep only hrefs that look like an email address (e.g. mailto: links).
        if re.match(r"[^@]+@[^@]+\.[^@]+", href):
            email = href.replace("mailto:", "")
            if email not in emails:
                emails.append(email)

    # Print the list of emails found.
    for email in emails:
        print(email)
finally:
    # Always release the browser, even if navigation or scraping raised.
    driver.quit()
#################### Script 2: spider pages, logging scraped URLs and emails found:
# --- Spider setup: one shared browser plus module-level bookkeeping sets ---
from selenium import webdriver
import re
# Initialize webdriver (launches a real Chrome session)
driver = webdriver.Chrome()
# Set the starting URL for the crawl
start_url = "http://www.example.com/"
# Initialize set to track scraped URLs
# (each visited page is added here so it is never fetched twice)
scraped_urls = set()
# Initialize set to track emails found
# (a set, so duplicate addresses seen on multiple pages are stored once)
emails_found = set()
# Define function to extract emails from a given string
def extract_emails(text):
    """Return all email-address substrings found in *text*, in order.

    Parameters
    ----------
    text : str
        Arbitrary text to search (e.g. a page's HTML).

    Returns
    -------
    list[str]
        Every match of a simple email pattern; may contain duplicates.
    """
    # TLD class is [A-Za-z], not [A-Z|a-z]: inside a character class '|' is a
    # literal pipe, so the original pattern also accepted '|' characters there.
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    emails = re.findall(pattern, text)
    return emails
# Define function to spider a given URL
def spider(url):
driver.get(url)
scraped_urls.add(url)
# Find all links on the page
links = driver.find_elements_by_xpath("//a[@href]")
for link in links:
href = link.get_attribute("href")
# If the link is not already scraped, spider it
if href not in scraped_urls:
if start_url