Bypassing internal links with a given depth

Question

I can not properly make the depth of immersion (that would take the link, enter it and pull out the links again and so a certain number of times. As far as I understand, after I add a new element to the set, an error takes off.

Just how to do that would not go to the links that have already been visited.

Can't figure out how to solve this. I will be grateful for the help

import re import requests from bs4 import BeautifulSoup def get_html(url): """ This method get page html""" try: r = requests.get(url) except requests.ConnectionError: return if r.status_code < 400: return r.text class EmailWebCrawler: def __init__(self, base_url, deep): self.base_url = base_url self.deep = deep self.links = set() self.all_links = set() self.mails = [] def get_links(self, html): """ This method get page links""" soup = BeautifulSoup(html, 'lxml') domain = self.base_url.split('/') # Получаем все ссылки с страницы for al in soup.find_all('a'): link = al.get('href') # print("Found link: " + link) # если ссылка начинается с 'http' добавляем в set if link.startswith(domain[0] + '//' + domain[2]): if link not in self.links or link not in self.all_links: self.links.add(link) # возможно update self.all_links.add(link) # print("Adding link - " + link) # если ссылка начинается с '/' добавляем в set elif link.startswith('/'): join_domen = domain[0] + '//' + domain[2] + link if join_domen not in self.links or join_domen not in self.all_links: self.links.add(join_domen) # возможно update self.all_links.add(join_domen) # print("Adding link - " + join_domen) def get_email(self, html): """ This method get page email""" soup = BeautifulSoup(html, 'lxml').get_text() all_email = re.findall(r'[\w.-]+@[\w.-]+\.?[\w]+?', soup) for mail in all_email: if mail not in self.mails: self.mails.append(mail) # print(mail) def run(self): page = get_html(self.base_url) if self.deep == 1: self.get_email(page) print(self.mails) # Нужна помощь!!! elif self.deep > 1: self.get_links(page) while self.deep > 0: for link in self.links: p = get_html(link) self.get_links(p) self.deep -= 1 for i in self.all_links: p1 = get_html(i) self.get_email(p1) print(self.mails) if __name__ == '__main__': # url = input('Enter your url: ') # url = int(input('Enter deep should: ')) parse = EmailWebCrawler('https://www.python.org/', 3) parse.run()

It is not clear why you need two sets, when one is enough, in which the visited unique links are collected.

Bypassing internal links with a given depth

0

More articles: