You can use the Jsoup library, for example:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small demo: parses a hard-coded HTML snippet with Jsoup and prints the
 * text of every {@code <p>} element, one per line.
 */
class HtmlParsingDemo {

    /** Sample markup used by the demo. */
    private static final String SAMPLE_HTML =
            "<html><body><p>текст1</p><p>текст2</p><img src=\"some.jpg\"><br><p>текст3<img src=\"another.jpg\"><br></body></html>";

    /**
     * Parses the sample markup, selects all paragraph elements and writes
     * each paragraph's text to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Document parsed = Jsoup.parse(SAMPLE_HTML);
        Elements paragraphNodes = parsed.select("p");
        // Elements is a List<Element>, so it can be streamed directly.
        paragraphNodes.stream()
                .map(Element::text)
                .forEach(System.out::println);
    }
}
Alternatively, you can use a regular expression:
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small demo: extracts the text found between HTML tags using a regular
 * expression and prints every non-empty piece on its own line.
 */
public class RegexDemo {

    /** Captures whatever sits between a {@code '>'} and the next {@code '<'}. */
    private static final Pattern BETWEEN_TAGS = Pattern.compile(">([^<]*)<");

    /**
     * Runs the extraction on a hard-coded HTML snippet and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<html><body><p>текст1</p><p>текст2</p><img src=\"some.jpg\"><br><p>текст3<img src=\"another.jpg\"><br></body></html>";
        collectTextRuns(html).forEach(System.out::println);
    }

    /**
     * Collects, in order of appearance, every non-empty capture of
     * {@link #BETWEEN_TAGS} in the given markup.
     */
    private static ArrayList<String> collectTextRuns(String markup) {
        ArrayList<String> found = new ArrayList<>();
        Matcher matcher = BETWEEN_TAGS.matcher(markup);
        while (matcher.find()) {
            String captured = matcher.group(1);
            if (captured.isEmpty()) {
                // Adjacent tags ("><") produce an empty capture; skip those.
                continue;
            }
            found.add(captured);
        }
        return found;
    }
}
UPDATE:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
 * Small demo: parses a hard-coded HTML snippet with Jsoup and groups the
 * text of {@code <p>} elements, using {@code <img>} elements as separators.
 * Each group is printed on its own line with its paragraphs joined by a space.
 */
class HtmlParsingDemo {

    /**
     * Walks the selected {@code p} and {@code img} elements in the order
     * Jsoup returns them, collecting paragraph text until an image is seen,
     * then prints one line per collected group.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String html = "<html><body><p>текст1</p><p>текст2</p><img src=\"some.jpg\"><br><p>текст3<img src=\"another.jpg\"><br></body></html>";

        Document doc = Jsoup.parse(html);
        Elements elements = doc.select("p, img");

        List<String> texts = new ArrayList<>();
        List<String> group = new ArrayList<>();
        for (Element element : elements) {
            if (element.tagName().equals("p")) {
                group.add(element.text());
            } else {
                // An <img> closes the current group of paragraphs.
                flushGroup(group, texts);
            }
        }
        // Paragraphs after the last <img> would otherwise be lost.
        flushGroup(group, texts);

        texts.forEach(System.out::println);
    }

    /**
     * Appends the space-joined contents of {@code group} to {@code texts}
     * and empties {@code group}. Does nothing when {@code group} is empty,
     * so consecutive images do not produce blank entries.
     */
    private static void flushGroup(List<String> group, List<String> texts) {
        if (group.isEmpty()) {
            return;
        }
        texts.add(String.join(" ", group));
        group.clear();
    }
}