There is a file structure:
files/ ├── AAN │ ├── some1.html │ └── some2.html └── AAPL ├── some3.html └── some4.html
I want to pull out the header from each html and insert it into Header Analyzer , but it does not work. What am I doing wrong?
import csv import os from bs4 import BeautifulSoup directory = '/home/achi/Desktop/auca/web_scrap/files' for folder in os.listdir(directory): with open(f'/home/achi/Desktop/auca/web_scrap/summary/{folder}.csv', 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(['Foldername', 'Filenames', 'Header analyzed']) for filename in os.listdir(f'{directory}/{folder}'): if filename.endswith(".HTML"): f = open(filename) lines = f.read() soup = BeautifulSoup(lines, 'html.parser') items = soup.find_all('div', attrs={'class': 'c4'}) for item in items: item_text = str(item) soup = BeautifulSoup(item_text, 'html.parser') head = soup.find_all('span', attrs={'class': 'c6'}) for h in head: data = h.get_text() writer.writerow([folder, filename, data]) csvfile.close()