The task is to parse the category from one site.

  1. We go to the site, select all the links to the brand of car (site for the sale of auto parts).
  2. Save all links to the file /index.php
  3. We pass on each link, copy the path, create a folder from this path
  4. Copy all links to auto models in the file path / index.php
  5. Go to each model, then everything is similar to p.4

Repeat until the end of the directory and eventually save the last page in index.php. There is this:

<?php ini_set("max_execution_time", 3600); $custom_url = 'http://exist.ru/cat/TO'; include_once('simple_html_dom.php'); //Main Function function categories_parse($custom_url) { $header_m = '<? require($_SERVER["DOCUMENT_ROOT"]."/bitrix/header.php"); $APPLICATION->SetPageProperty("title", "Демонстрационная версия продукта «1С-Битрикс: Управление сайтом»"); $APPLICATION->SetPageProperty("NOT_SHOW_NAV_CHAIN", "Y"); $APPLICATION->SetTitle("Главная страница"); ?> <?$APPLICATION->IncludeComponent("bitrix:main.include", ".default", array( "AREA_FILE_SHOW" => "file", "PATH" => "/bitrix/templates/4/include_areas/show.php", "EDIT_TEMPLATE" => "" ),false); session_start(); process_form(); ?>'; $footer_m = '<?require($_SERVER["DOCUMENT_ROOT"]."/bitrix/footer.php");?>'; $a = 0; $i = 0; $parse_url = file_get_html($custom_url); $parse = $parse_url->find('div.catalog-column a'); $catalog_col = $parse_url->find('div.catalog-column'); foreach ($catalog_col as $col) { $catalog_col_m[$a] = $col->innertext; $file_main = fopen('./index.php', 'w'); fwrite($file_main, $header_m); fwrite($file_main, $catalog_col_m[$a]); fwrite($file_main, $footer_m); $a++; } foreach ($parse as $p) { $category_href_original[$i] = $p->href; $categoty_path[$i] = "." . $category_href_original[$i]; mkdir($categoty_path[$i], 0777, true); $category_href[$i] = 'http://exist.ru' . $p->href; model_parse($category_href[$i]); $i++; } } function model_parse($href) { $model_url = file_get_html($href); $i = 0; $model_link = $model_url->find('div.cell a'); $model_c = $model_url->find('div.cell'); $z = 0; foreach ($model_c as $cell) { $file_c_p = $href . 'index.php'; $file_c = fopen($file_c_p, 'w'); $model_col_m[$z] = $cell->innertext; fwrite($file_c, $header_c); fwrite($file_c, $model_col_m[$z]); fwrite($file_c, $footer_c); $z++; } foreach ($model_link as $lnk) { $model_href_original[$i] = $lnk->href; $model_path[$i] = "." . $model_href_original[$i]; $model_href[$i] = 'http://exist.ru' . $lnk->href; mkdir($model_path[$i], 0777, true); price_cat_parse($model_href[$i]); $i++; } } function price_cat_parse($price_cat_href) { $header_parse = '<? require($_SERVER["DOCUMENT_ROOT"]."/bitrix/header.php"); $APPLICATION->SetPageProperty("title", "Демонстрационная версия продукта «1С-Битрикс: Управление сайтом»"); $APPLICATION->SetPageProperty("NOT_SHOW_NAV_CHAIN", "Y"); $APPLICATION->SetTitle("Главная страница"); ?> <?$APPLICATION->IncludeComponent("bitrix:main.include", ".default", array( "AREA_FILE_SHOW" => "file", "PATH" => "/bitrix/templates/4/include_areas/show.php", "EDIT_TEMPLATE" => "" ),false); session_start(); process_form(); ?>'; $footer_parse = '<?require($_SERVER["DOCUMENT_ROOT"]."/bitrix/footer.php");?>'; $price_cat_href_n = $price_cat_href . 'index.php'; $price_cat_url = file_get_html($price_cat_href); $price_cat_lnk = $price_cat_url->find('.content-ul a'); $price_cat_content = $price_cat_url->find('tr', 0); $i = 0; $h = 0; $file_p_c = fopen($price_cat_href_n, 'w'); foreach ($price_cat_content as $cat_cont) { $price_cont[$h] = $cat_cont->innertext; fwrite($file_c, $header_parse); fwrite($file_c, $price_cont[$h]); fwrite($file_c, $footer_parse); $h++; } foreach ($price_cat_lnk as $lnk) { $price_cat_href_original[$i] = $lnk->href; $price_cat_path[$i] = "." . $price_cat_href_original[$i]; $price_cat_href[$i] = 'http://exist.ru' . "$price_cat_href_original[$i]"; // echo "$price_cat_href_n[$i] \n"; mkdir($price_cat_path[$i], 0777, true); price_parse($price_cat_href[$i], $price_cat_path[$i]); $i++; } } function price_parse($price_href, $folder_file) { $header = '<? require($_SERVER["DOCUMENT_ROOT"]."/bitrix/header.php"); $APPLICATION->SetPageProperty("title", "Демонстрационная версия продукта «1С-Битрикс: Управление сайтом»"); $APPLICATION->SetPageProperty("NOT_SHOW_NAV_CHAIN", "Y"); $APPLICATION->SetTitle("Главная страница"); ?> <?$APPLICATION->IncludeComponent("bitrix:main.include", ".default", array( "AREA_FILE_SHOW" => "file", "PATH" => "/bitrix/templates/4/include_areas/show.php", "EDIT_TEMPLATE" => "" ),false); session_start(); process_form(); ?>'; $footer = '<?require($_SERVER["DOCUMENT_ROOT"]."/bitrix/footer.php");?>'; $price_new_url = file_get_html($price_href); $price_cont = $price_new_url->find('table.tbl'); $i = 0; foreach ($price_cont as $cont) { $index = '/index.php'; $path_to_file = $folder_file . $index; $price_contnent_original[$i] = fopen($path_to_file, 'w+'); fwrite($price_contnent_original[$i], $header); fwrite($price_contnent_original[$i], $cont); fwrite($price_contnent_original[$i], $footer); echo "$path_to_file \n"; echo "file wrote \n"; $i++; } } categories_parse($custom_url); ?> 

Help who can ....

  • one
    What is the actual question? What did you try to do yourself? - Sergiks
  • Yes, indeed, it may be a question, and then "Help, who can ...." - give money? :) - zippp
  • The question is that for some reason index.php files are not saved. Access rights are in order. "What did you try to do yourself?" All that is written above is what I did myself. Can anyone have an idea how to implement it better? - Hlogeon
  • still, because of the nesting cycle, the execution time is huge. - Hlogeon

1 answer 1

According to the experience of drawing and parsing these elections from the site of the election committee, I can recommend this:

  1. Split the task into two separate stages: downloading files from the net, and parsing them;
  2. Download faster using parallel requests curl_multi ;