import requests
import urllib.request
from lxml.html import fromstring

URL = 'http://allrecipes.com/recipes/79/desserts/'

# Download the listing page. The ``with`` block closes the HTTP response
# as soon as the body has been read instead of leaving it open.
with urllib.request.urlopen(URL) as HTML:
    list_html = HTML.read().decode('utf-8')

parser = fromstring(list_html)

# Print the text of every <h3> found under the second <a> of the first
# <article> inside the first <div> of the element with id "grid".
for elem in parser.xpath('//*[@id="grid"]/div[1]/article[1]/a[2]/h3'):
    print(elem.text)

# UPDATE 3:
Could not save the ingredients: an empty string ends up in the database.
import urllib.request
from lxml.html import parse
import sqlite3
import time

WEBSITE = 'http://allrecipes.com'


def correct_str(s):
    """Return *s* with all non-ASCII characters dropped and outer whitespace stripped."""
    return s.encode('utf-8').decode('ascii', 'ignore').strip()


def save(title, ingredients, photo_url):
    """Insert one recipe row into the ``content`` table of ``db.sqlite``.

    Args:
        title: Value stored in the ``title`` column.
        ingredients: Value stored in the ``ingredients`` column.
        photo_url: Value stored in the ``img`` column.

    A new connection is opened for every call, the insert is committed, and
    the connection is closed even when the insert or commit raises.
    """
    conn = sqlite3.connect('db.sqlite')
    try:
        c = conn.cursor()
        c.execute(
            "INSERT INTO content(title,ingredients,img) VALUES(?,?,?)",
            (title, ingredients, photo_url),
        )
        conn.commit()
    finally:
        conn.close()


for i in range(1, 11):
    URL = 'http://allrecipes.com/recipes/79/desserts/?page=' + str(i)
    # Close the HTTP response once lxml has parsed it.
    with urllib.request.urlopen(URL) as HTML:
        page = parse(HTML).getroot()

    # Skip video cards.
    for elem in page.xpath('//*[@id="grid"]/article[not(contains(@class, "video-card"))]/a[1]'):
        href = WEBSITE + elem.get('href')
        title = correct_str(elem.find('h3').text)
        if title == 'Allrecipes Magazine':
            continue

        with urllib.request.urlopen(href) as recipe_html:
            recipe_page = parse(recipe_html).getroot()

        # ``[0]`` raises IndexError when the page has no matching <img>.
        photo_url = recipe_page.xpath('//img[@class="rec-photo"]')[0].get('src')

        # List of ingredients.
        # This must be a real list: filter()/map() return one-shot iterators
        # in Python 3, so joining the iterator for printing used to exhaust
        # it and the second join (the value passed to save()) was ''.
        ingredients = recipe_page.xpath('//ul[contains(@id, "lst_ingredients")]/li/label/span/text()')
        ingredients = [
            s for s in map(correct_str, ingredients)
            if 'Add all ingredients' not in s
        ]
        # Join once and reuse the same string for both output and storage.
        ing = '; '.join(ingredients)

        # List of preparation steps, in order.
        directions = recipe_page.xpath('//span[@class="recipe-directions__list--item"]/text()')
        directions = [correct_str(step) for step in directions]

        times = recipe_page.xpath('//span[@class="ready-in-time"]/text()')
        servings = recipe_page.xpath('//span[@ng-bind="adjustedServings"]/text()')
        cals = recipe_page.xpath('//*[@id="nutrition-button"]/span[1]/span[1]/text()')
        # ``[0]`` raises IndexError when no submitter name is found.
        author = recipe_page.xpath('//*[@class="submitter__name"]/text()')[0]
        #comment_author = recipe_page.xpath('//*[@class="submitter__description"]/text()')

        print('Страница:', i)
        print('Название:', title)
        print('Автор:', author)
        #print('Коментарь от автора:', comment_author)
        print('Ссылка:', href)
        print('Фото:', photo_url)
        print('Время:', times)
        print('Порции:', servings)
        print('Калории', cals)
        print('Ингредиенты:', ing)
        print(' \n----------------------')
        print('Шаги:')
        print('\n\n'.join(directions))
        print(' ======================\n')

        save(title, ing, photo_url)
//span[@ng-bind="adjustedServings"]/text() although the data for this path is present - Kill Noise