import requests
import urllib.request
from lxml.html import fromstring

URL = 'http://allrecipes.com/recipes/79/desserts/'

# Fetch the dessert category listing and print the heading of the first
# (hard-coded) article card.
# FIX: context-manage the HTTP response — the original never closed it,
# leaking the underlying socket.
with urllib.request.urlopen(URL) as HTML:
    list_html = HTML.read().decode('utf-8')

parser = fromstring(list_html)
# Absolute, position-based XPath copied from the browser inspector; it only
# matches one specific card (see the answer below for a robust query).
for elem in parser.xpath('//*[@id="grid"]/div[1]/article[1]/a[2]/h3'):
    print(elem.text)

UPDATE 3:

Could not save the ingredients — an empty string gets written to the database.

 import urllib.request from lxml.html import parse import sqlite3 import time WEBSITE = 'http://allrecipes.com' def correct_str(s): return s.encode('utf-8').decode('ascii', 'ignore').strip() def save(title,ingredients,photo_url): conn = sqlite3.connect('db.sqlite') c = conn.cursor() c.execute("INSERT INTO content(title,ingredients,img) VALUES(?,?,?)", (title,ingredients,photo_url,)) conn.commit() conn.close() for i in range(1, 11): URL = 'http://allrecipes.com/recipes/79/desserts/?page=' + str(i) HTML = urllib.request.urlopen(URL) page = parse(HTML).getroot() # пропускаем видео for elem in page.xpath('//*[@id="grid"]/article[not(contains(@class, "video-card"))]/a[1]'): href = WEBSITE + elem.get('href') title = correct_str(elem.find('h3').text) if title == 'Allrecipes Magazine': continue recipe_page = parse(urllib.request.urlopen(href)).getroot() photo_url = recipe_page.xpath('//img[@class="rec-photo"]')[0].get('src') # массив с ингредиентами ingredients = recipe_page.xpath('//ul[contains(@id, "lst_ingredients")]/li/label/span/text()') ingredients = filter(lambda s: 'Add all ingredients' not in s, map(correct_str, ingredients)) # массив с последовательностью приготовления directions = recipe_page.xpath('//span[@class="recipe-directions__list--item"]/text()') directions = map(correct_str, directions) times = recipe_page.xpath('//span[@class="ready-in-time"]/text()') servings = recipe_page.xpath('//span[@ng-bind="adjustedServings"]/text()') cals = recipe_page.xpath('//*[@id="nutrition-button"]/span[1]/span[1]/text()') author = recipe_page.xpath('//*[@class="submitter__name"]/text()')[0] #comment_author = recipe_page.xpath('//*[@class="submitter__description"]/text()') print('Страница:', i) print('Название:', title) print('Автор:', author) #print('Коментарь от автора:', comment_author) print('Ссылка:', href) print('Фото:', photo_url) print('Время:', times) print('Порции:', servings) print('Калории', cals) print('Ингредиенты:', '; '.join(ingredients)) print(' 
----------------------') print('Шаги:') print('\n\n'.join(directions)) print(' ======================\n') ing = '; '.join(ingredients) save(title,ing,photo_url) 
  • updated the answer (image, ingredients, etc.) - kmv
  • The question has gradually turned into a “write the code for me” request ... - Im ieee
  • everything works, that solved the problem :) - Kill Noise
  • please tell me why this XPath returns an empty array — //span[@ng-bind="adjustedServings"]/text() — even though the data is present at that address - Kill Noise
  • @Surfer, the value in this span is filled in only as a result of JavaScript executing in the browser. Replicating that behavior in Python is a separate, non-trivial task (you could, for example, embed a WebKit engine, etc.). - kmv

1 answer 1

The fact is that the original structure of the HTML document, obtained in the response from the server, can be changed by scripts when displayed in the browser. Your XPath query should work exactly with the original structure. You can see it, for example, in the Chrome debugger on the Network tab.

 import urllib.request from lxml.html import parse URL = 'http://allrecipes.com/recipes/79/desserts/' HTML = urllib.request.urlopen(URL) page = parse(HTML).getroot(); for elem in page.xpath('//*[@id="grid"]/article/a/h3/text()'): print(elem.encode('utf-8').decode('ascii', 'ignore').strip()) 

You can also use the cssselect function (you must install the cssselect package). Often the CSS selector is simpler to use than XPath.

 for elem in page.cssselect('#grid h3'): print(elem.text.encode('utf-8').decode('ascii', 'ignore').strip()) 

UPDATE Referencing links through pages:

 import urllib.request from lxml.html import parse SITE = 'http://allrecipes.com' for i in range(1, 6): URL = 'http://allrecipes.com/recipes/79/desserts/?page=' + str(i) HTML = urllib.request.urlopen(URL) page = parse(HTML).getroot(); for elem in page.xpath('//*[@id="grid"]/article/a[1]'): href = SITE + elem.get('href') title = elem.find('h3').text.encode('utf-8').decode('ascii', 'ignore').strip() print('Названия:', title, '\nСсылка: ', href, '\n ----------------------') 

UPDATE2. Upload photos, ingredients, steps:

 import urllib.request from lxml.html import parse WEBSITE = 'http://allrecipes.com' def correct_str(s): return s.encode('utf-8').decode('ascii', 'ignore').strip() for i in range(1, 11): URL = 'http://allrecipes.com/recipes/79/desserts/?page=' + str(i) HTML = urllib.request.urlopen(URL) page = parse(HTML).getroot() # пропускаем видео for elem in page.xpath('//*[@id="grid"]/article[not(contains(@class, "video-card"))]/a[1]'): href = WEBSITE + elem.get('href') title = correct_str(elem.find('h3').text) if title == 'Allrecipes Magazine': continue recipe_page = parse(urllib.request.urlopen(href)).getroot() photo_url = recipe_page.xpath('//img[@class="rec-photo"]')[0].get('src') # массив с ингредиентами ingredients = recipe_page.xpath('//ul[contains(@id, "lst_ingredients")]/li/label/span/text()') ingredients = filter(lambda s: 'Add all ingredients' not in s, map(correct_str, ingredients)) # массив с последовательностью приготовления directions = recipe_page.xpath('//span[@class="recipe-directions__list--item"]/text()') directions = map(correct_str, directions) print('Страница:', i) print('Название:', title) print('Ссылка:', href) print('Фото:', photo_url) print('Ингредиенты:', ', '.join(ingredients)) print(' ----------------------') print('Шаги:') print('\n\n'.join(directions)) print(' ======================\n') 
  • Once you made the css-selector, you could also write xpath as: //*[@id="grid"]//h3/text() :) - gil9red
  • @ gil9red yes, of course :) - kmv
  • great :) and what if I also need to get the link itself from the 'a' tag: //*[@id="grid"]/div[1]/article[1]/a[2] - Kill Noise
  • @Surfer elem.get('href') (only text() from the xpath request removed) - kmv
  • one
    and what if I also want to parse data from the following pages, up to page 5, e.g. http://allrecipes.com/recipes/79/desserts/?page=2 - Kill Noise