There is a web-scraping parser built on Selenium WebDriver.

There is a code:

def crawl():
    """Drain the frontier repeatedly: parse every queued URL in a thread
    pool, and stop once a full pass discovers no new URLs."""
    while True:
        pending = list_product.copy()
        list_product.clear()
        print(pending)
        with ThreadPoolExecutor(count_thread) as executor:
            # Consume the iterator so all submitted jobs finish.
            for _ in executor.map(parsing, pending):
                pass
        if not list_product:
            break

During parsing, any URLs found on a page should themselves be parsed recursively, and so on; the discovered URLs are appended to the global list_product.

I wanted to see how it behaves with ProcessPoolExecutor, but for some reason, when using it, the links that are collected under ThreadPoolExecutor never show up.

Full code:

  import logging import time import pandas as pd import os from selenium import webdriver from selenium.common.exceptions import * from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor count_thread = os.cpu_count() log = logging.getLogger(__name__) format = '%(asctime)s %(levelname)s:%(message)s' logging.basicConfig(format=format, level=logging.INFO) logging.basicConfig(level=logging.DEBUG) input_categories = "categories.txt" main_url_site = "https://spb.vseinstrumenti.ru" products = [] list_product = [] getting_url = [] def init_driver(): ff = "../install/geckodriver.exe" option = webdriver.FirefoxOptions() option.add_argument("--headless") # chrome_option = webdriver.ChromeOptions() # chrome_option.add_argument("headless") # prefs = {"profile.managed_default_content_settings.images": 2} # chrome_option.add_experimental_option("prefs", prefs) try: driver = webdriver.Firefox(executable_path=ff, options=option) # driver = webdriver.Chrome(executable_path=ff, options=chrome_option) # driver = webdriver.Chrome(executable_path=ff, chrome_options=chrome_option, service_args=service_args) except SessionNotCreatedException: print("Ошибка инициализации браузера. Скорее всего у вас не установлен браузер. 
Пожалуйста обратитесь к разработчику парсера") return driver def check_url(url): if url in getting_url: return False else: getting_url.append(url) return True def add_product(code, price): global products if(price is not None): temp = {"id": code, "price": price} products.append(temp) def get_categories(input_filename): with open(input_filename, "r") as file: for line in file: list_product.append(line) def parsing(url): driver = init_driver() try: driver.get(url) getting_url.append(url) print("Обработанная ссылка:",url) #Раздел поиска категорий urls = driver.find_elements_by_css_selector("a.fs-15.c-black") if len(urls) > 0: for url in urls: if url not in getting_url: href = url.get_attribute("href") list_product.append(href) print(href) else: urls = driver.find_elements_by_css_selector("div.name > a") if len(urls) > 0: for url in urls: if url not in getting_url: href = url.get_attribute("href") list_product.append(href) print(href) else: urls = driver.find_elements_by_css_selector("tr.valign-t > td.valign-t > div > a") if len(urls) > 0: for url in urls: href = url.get_attribute("href") list_product.append(href) print(href) else: #Раздел поиска продуктов items = driver.find_elements_by_css_selector("div.list-box.product") if(len(items) > 0): for item in items: href = item.find_element_by_css_selector("div > div.product-name > a").get_attribute( "href") if(check_url(href)): try: code = item.find_element_by_css_selector("div.list-left > div.header > div.code > span").text except: code = None try: price = item.find_element_by_css_selector( 'div.price > div.ns > div.price-actual > span.amount').text if price.find(" ") != -1: price = price.replace(" ", "") except: price = None add_product(code, price) print("Код:", code, "Price:", price) else: items = driver.find_elements_by_css_selector("div.tile-box.product") if (len(items) > 0): for item in items: href = item.find_element_by_css_selector("div > div.product-name > a").get_attribute("href") if(check_url(href)): try: 
code = item.find_element_by_css_selector( "div > div.header > div.code > span").text except Exception as e: code = None try: price = item.find_element_by_css_selector('div.price > div.ns > div.price-actual > span.amount').text if price.find(" ") != -1: price = price.replace(" ", "") except: price = None add_product(code, price) print("Код:", code, "Price:", price) else: items = driver.find_elements_by_css_selector("div.group_row") if len(items) > 0: for item in items: href = item.find_element_by_css_selector( "div > a").get_attribute("href") if (check_url(href)): try: code = item.find_element_by_css_selector( "div > div.header > div.code > span").text except Exception as e: code = None #Для страниц, где нету code if code is None: list_product.append(href) continue try: price = item.find_element_by_css_selector( 'div.price > div.ns > div.price-actual > span.amount').text if price.find(" ") != -1: price = price.replace(" ", "") except: price = None add_product(code, price) print("Код:", code, "Price:", price) else: code = driver.find_element_by_css_selector("#aboveImageBlock > span.wtis-id > span.wtis-id-value.codeToOrder").text try: price = driver.find_element_by_css_selector("span.price-value").text if price.find(" ") != -1: price = price.replace(" ", "") except: price = None add_product(code, price) print("Код:", code, "Price:", price) try: #Только для первой страницы if url.find("page") == -1: pagination = driver.find_element_by_css_selector("div.paging.dspl_ib.commonPagination") # data_per_page = int(pagination.get_attribute("data-per-page")) data_max_page = int(pagination.get_attribute("data-max-page")) for i in range(2, data_max_page + 1): href = url + "page" + str(i) list_product.append(href) print(href) # "https://spb.vseinstrumenti.ru/instrument/akkumulyatornyj/akkumulyatory/litij_ionnye/page2/" except: print("Pagination не найден") driver.close() driver.quit() except Exception as e: print(e) driver.close() driver.quit() parsing(url) def crawl(): while 
True: urls = list_product.copy() list_product.clear() print(urls) with ProcessPoolExecutor(count_thread) as executor: for _ in executor.map(parsing, urls): pass if len(list_product) == 0: break if __name__ == '__main__': start_time = time.time() get_categories(input_categories) crawl() print("Количество продуктов:", len(products)) pd.DataFrame(products).to_excel(r'products.xlsx', index=False, encoding='utf-8') print(time.time() - start_time) 

As the initial list_product entry, you can use the link below, on which I observed the strange behavior.

 https://spb.vseinstrumenti.ru/stroitelnoe_oborudovanie/malyarnoe/ustanovki/ 

I do not understand what is wrong.

  • Because threads have shared memory, and each process has its own. - Sergey Gornostaev
  • @SergeyGornostaev what can you advise? - danilshik
  • Once again, I can advise you not to use global variables. - Sergey Gornostaev
  • @SergeyGornostaev is it possible to return the results from the process somehow?
  • First, the map returns the results of the function call. Secondly, there are queues, channels and shared memory . - Sergey Gornostaev

0