Here is one possible solution:
Initial data:
 // Sample HTML fragment used as the input for the parsing snippets.
 $content = '<h1>Hello</h1> <p> zzzzz zz z zz <a href="#">VVV</a> zzzzz </p>'; 
Helper function and iterator class:
 /**
  * Returns the serialized HTML of all child nodes of the given node
  * (the node's own tag is not included).
  *
  * @param \DOMNode $element Node whose children are serialized; may also be
  *                          the DOMDocument itself.
  * @return string Concatenated HTML of the children, '' when there are none.
  */
 function getInnerHTML(\DOMNode $element)
 {
     // A DOMDocument has no ownerDocument (the property is null), so the
     // document node has to serialize its own children.
     $document = $element instanceof \DOMDocument ? $element : $element->ownerDocument;

     $innerHTML = '';
     foreach ($element->childNodes as $child) {
         $innerHTML .= $document->saveHTML($child);
     }

     return $innerHTML;
 }
RecursiveDOMIterator (a recursive iterator over DOM nodes; its definition is not included here)
Usage:
 // Parse the markup, then flatten every element except <html> and <body>
 // into a list of NAME / TEXT (inner HTML) records.
 $dom = new DOMDocument;
 $dom->loadHTML($content);

 $output = [];
 $iterator = new RecursiveIteratorIterator(
     new RecursiveDOMIterator($dom),
     RecursiveIteratorIterator::SELF_FIRST
 );

 foreach ($iterator as $node) {
     // Only element nodes are of interest.
     if ($node->nodeType !== XML_ELEMENT_NODE) {
         continue;
     }

     // Skip the wrapper elements of the document.
     if ($node->nodeName === 'html' || $node->nodeName === 'body') {
         continue;
     }

     $output[] = [
         'NAME' => $node->nodeName,
         'TEXT' => trim(getInnerHTML($node), PHP_EOL),
     ];
 }
Result:
 Array ( [0] => Array ( [NAME] => h1 [TEXT] => Hello ) [1] => Array ( [NAME] => p [TEXT] => zzzzz zz z zz <a href="#">VVV</a> zzzzz ) [2] => Array ( [NAME] => a [TEXT] => VVV ) ) 
Update (handling UTF-8 input):
 // Encode every non-ASCII character as a numeric entity before parsing, so the
 // result does not depend on which charset loadHTML() assumes for the input.
 // mb_encode_numericentity() replaces mb_convert_encoding(..., 'HTML-ENTITIES', ...),
 // whose 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
 $dom->loadHTML(mb_encode_numericentity($content, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
Update 2 (collecting attributes):
 /**
  * Collects the attributes of a node into an associative array.
  *
  * @param \DOMNode $element Node to inspect; nodes without attributes
  *                          (including non-element nodes) yield an empty array.
  * @return array Map of attribute name => attribute value.
  */
 function getAttributes(\DOMNode $element)
 {
     $attributes = [];

     // For non-element nodes $element->attributes is null and must not be
     // passed to foreach; hasAttributes() is false in that case.
     if (!$element->hasAttributes()) {
         return $attributes;
     }

     foreach ($element->attributes as $attribute) {
         $attributes[$attribute->nodeName] = $attribute->nodeValue;
     }

     return $attributes;
 }
Then the loop changes slightly:
 // Same traversal as before, with each record additionally carrying the
 // element's attributes under the ATTR key.
 foreach ($iterator as $node) {
     // Only element nodes are of interest.
     if ($node->nodeType !== XML_ELEMENT_NODE) {
         continue;
     }

     // Skip the wrapper elements of the document.
     if ($node->nodeName === 'html' || $node->nodeName === 'body') {
         continue;
     }

     $output[] = [
         'NAME' => $node->nodeName,
         'TEXT' => trim(getInnerHTML($node), PHP_EOL),
         'ATTR' => getAttributes($node),
     ];
 }
Each entry now has an additional ATTR element holding an array of the form attribute name => attribute value.
Explicit recursion is not needed in this case, because the iterator already walks the whole tree.
Update 3 (a recursive variant):
 /**
  * Recursively builds a nested list of the element names below $node.
  *
  * Each child element's name is appended to $output; when that child has
  * element descendants of its own, their list follows as a nested array.
  *
  * @param array    $output List to append to (pass [] to start a new list).
  * @param \DOMNode $node   Node whose children are walked.
  * @param int      $depth  Current recursion depth; passed along on each
  *                         recursive call but not otherwise used.
  * @return array The extended list.
  */
 function walk($output, \DOMNode $node, $depth = 0)
 {
     if (!$node->hasChildNodes()) {
         return $output;
     }

     foreach ($node->childNodes as $child) {
         // Only elements belong in the result. Skipping text nodes alone
         // would let comments and CDATA sections appear as "#comment" or
         // "#cdata-section" entries.
         if ($child->nodeType !== XML_ELEMENT_NODE) {
             continue;
         }

         $output[] = $child->nodeName;

         $item = walk([], $child, $depth + 1);
         if (!empty($item)) {
             $output[] = $item;
         }
     }

     return $output;
 }

 // Start at <body>; fall back to an empty result if the document has none.
 $root = $dom->getElementsByTagName('body')->item(0);
 $output = $root !== null ? walk([], $root, 0) : [];
This way you can also walk the tree recursively.
Result:
 array(3) { [0]=> string(2) "h1" [1]=> string(1) "p" [2]=> array(1) { [0]=> string(1) "a" } }