我一直在使用 PHP 的 DOMDocument 及其相关类,编写一个能够处理此类问题的 HTML 清理器。它仍处于开发的早期阶段,远未达到可以投入实际使用的程度,但我早期的实验似乎表明这个思路是有希望的。
基本上,你把标记加载到 DOMDocument 中,然后遍历这棵树。对于树中的每个节点,对照允许的节点类型清单检查该节点的类型。如果该节点类型不在清单上,就把它从树中移除。
你可以采取类似的做法,找出一段标记中的所有 script 标签并将其删除。如果你能够把所有脚本从收到的标记中剔除,那么 XSS 攻击就无从下手了。
This is the code I'm using, along with a test case that processes the StackOverflow home page. Like I said, it's far from production-quality code and is little more than a proof of concept. Still, I hope you find it useful.
<?php
/**
 * Whitelist-based markup cleaner built on DOMDocument.
 *
 * Markup is loaded into a DOMDocument, then the tree is walked starting at
 * the <html> element. Any node whose name is not on the tag whitelist is
 * removed together with its whole subtree, and any attribute whose name is
 * not on the attribute whitelist is stripped from the nodes that are kept.
 *
 * NOTE(review): the default tag whitelist keeps "style", "link" and "meta";
 * confirm that is acceptable for the markup you intend to clean.
 */
class HtmlClean
{
    /**
     * Lower-case node names that are allowed to remain in the document.
     * Includes the pseudo-names DOM uses for text, comment and CDATA nodes.
     * @var array
     */
    private $whiteList = array(
        '#cdata-section', '#comment', '#text', 'a', 'abbr', 'acronym', 'address', 'b',
        'big', 'blockquote', 'body', 'br', 'caption', 'cite', 'code', 'col', 'colgroup',
        'dd', 'del', 'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'h1', 'h2', 'h3', 'h4',
        'h5', 'h6', 'head', 'hr', 'html', 'i', 'img', 'ins', 'kbd', 'li', 'link', 'meta',
        'ol', 'p', 'pre', 'q', 'samp', 'small', 'span', 'strike', 'strong', 'style', 'sub',
        'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'ul',
        'var'
    );

    /**
     * Lower-case attribute names that are allowed to remain on kept nodes.
     * @var array
     */
    private $attrWhiteList = array(
        'class', 'id', 'title'
    );

    /**
     * The document being cleaned, or NULL until loadHTML() is first called.
     * @var DOMDocument|null
     */
    private $dom = NULL;

    /**
     * Get current tag whitelist
     *
     * The internal list is re-indexed first so that gaps left by
     * removeWhiteListTag() do not leak out to the caller.
     *
     * @return array
     */
    public function getWhiteListTags()
    {
        $this->whiteList = array_values($this->whiteList);
        return ($this->whiteList);
    }

    /**
     * Add tag to the whitelist
     *
     * The name is trimmed and lower-cased before it is stored; a name that is
     * already present is not added again.
     *
     * @param string $tagName
     */
    public function addWhiteListTag($tagName)
    {
        $tagName = strtolower(trim($tagName));
        if (!in_array($tagName, $this->whiteList, true)) {
            $this->whiteList[] = $tagName;
        }
    }

    /**
     * Remove a tag from the whitelist
     *
     * The name is normalised the same way addWhiteListTag() normalises it, so
     * a tag can be removed using the same spelling it was added with.
     *
     * @param string $tagName
     */
    public function removeWhiteListTag($tagName)
    {
        $tagName = strtolower(trim($tagName));
        $index = array_search($tagName, $this->whiteList, true);
        // array_search() returns the key, which may legitimately be 0, so the
        // result has to be compared against false rather than tested for truth.
        if ($index !== false) {
            unset($this->whiteList[$index]);
        }
    }

    /**
     * Load document markup into the class for cleaning
     *
     * @param string $html The markup to clean
     * @return bool TRUE when the markup was loaded, FALSE otherwise
     */
    public function loadHTML($html)
    {
        if (!$this->dom) {
            $this->dom = new DOMDocument();
        }
        $this->dom->preserveWhiteSpace = false;
        $this->dom->formatOutput = true;

        // DOMDocument::loadHTML() rejects an empty string; report that as a
        // plain failure instead of letting it surface as a warning or error.
        if ((string) $html === '') {
            return false;
        }

        return $this->dom->loadHTML($html);
    }

    /**
     * Serialise the current document.
     *
     * @return string The document as produced by DOMDocument::saveXML(), or
     *                an empty string when no document has been loaded
     */
    public function outputHtml()
    {
        $ret = '';
        if ($this->dom) {
            $ret = $this->dom->saveXML();
        }
        return ($ret);
    }

    /**
     * Strip every attribute that is not on the attribute whitelist.
     *
     * @param DOMNode $elem Node whose attributes should be filtered
     */
    private function cleanAttrs(DOMNode $elem)
    {
        $attrs = $elem->attributes;
        if ($attrs === null) {
            // Only element nodes carry an attribute map.
            return;
        }

        // Walk backwards: the map is live, so removing an attribute shifts
        // the indexes of the ones after it.
        $index = $attrs->length;
        while (--$index >= 0) {
            $attr = $attrs->item($index);
            $attrName = strtolower($attr->name);
            if (!in_array($attrName, $this->attrWhiteList, true)) {
                // Remove the attribute node itself so the removal does not
                // depend on how the attribute name is spelled or prefixed.
                $elem->removeAttributeNode($attr);
            }
        }
    }

    /**
     * Recursively remove elements from the DOM that aren't whitelisted
     *
     * A whitelisted node is kept, has its attributes filtered, and has each
     * of its children processed in turn. A non-whitelisted node is removed
     * along with everything below it.
     *
     * @param DOMNode $elem
     * @return array List of elements removed from the DOM
     * @throws Exception If removal of a node failed then an exception is thrown
     */
    private function cleanNodes(DOMNode $elem)
    {
        $removed = array();
        if (in_array(strtolower($elem->nodeName), $this->whiteList, true)) {
            // Remove non-whitelisted attributes
            if ($elem->hasAttributes()) {
                $this->cleanAttrs($elem);
            }
            /*
             * Iterate over the element's children. The reason we go backwards is because
             * going forwards will cause indexes to change when elements get removed
             */
            if ($elem->hasChildNodes()) {
                $children = $elem->childNodes;
                $index = $children->length;
                while (--$index >= 0) {
                    $removed = array_merge($removed, $this->cleanNodes($children->item($index)));
                }
            }
        } else {
            // The element is not on the whitelist, so remove it
            $parent = $elem->parentNode;
            if ($parent !== null && $parent->removeChild($elem)) {
                $removed[] = $elem;
            } else {
                throw new Exception('Failed to remove node from DOM');
            }
        }
        return ($removed);
    }

    /**
     * Perform the cleaning of the document
     *
     * Cleaning starts at the first <html> element of the loaded document.
     *
     * @return array List of nodes removed from the DOM; empty when no
     *               document is loaded or it has no <html> element
     * @throws Exception If removal of a node failed
     */
    public function clean()
    {
        if (!$this->dom) {
            return array();
        }

        $root = $this->dom->getElementsByTagName('html')->item(0);
        if ($root === null) {
            return array();
        }

        $removed = $this->cleanNodes($root);
        return ($removed);
    }
}
// Demo: fetch a real page, show it before and after cleaning, list what was
// removed, and report how long the cleaning pass took.
$test = file_get_contents('http://www.stackoverflow.com/');
if ($test === false) {
    // Without a page there is nothing to demonstrate.
    exit('Failed to fetch the test page');
}

// Windows-style linebreaks really foul up the works. There's probably a better fix for this
$test = str_replace(chr(13), '', $test);

// Real-world markup makes the parser emit many warnings; collect them
// internally so they are not printed into the HTML output below.
libxml_use_internal_errors(true);

$cleaner = new HtmlClean();
$cleaner->loadHTML($test);

echo ('<h1>Before</h1><pre>' . htmlspecialchars($cleaner->outputHtml()) . '</pre>');

// Time only the cleaning pass itself.
$start = microtime(true);
$removed = $cleaner->clean();
$cleanTime = microtime(true) - $start;

echo ('<h1>Removed tag list</h1>');
foreach ($removed as $elem) {
    var_dump($elem->nodeName);
}

echo ('<h1>After</h1><pre>' . htmlspecialchars($cleaner->outputHtml()) . '</pre>');

// benchmark
var_dump($cleanTime);
?>