我正在平整一个网页,并跟踪链接,以便绘制从一页到另一页的链接。 我只拿上链接的网页名称、用于链接网页的URL和URL线的网页名称。
我的《守则》顺利地发现我感兴趣的环节和代号分页,以便找到更多的产品链接。 至少有100页,其中有几百页,因此它有数份穿透镜。 我正在以以下形式构建一个产品:$products[index]
, 包含一个阵列array([ url] =>URLToPage, [职称] => 职称OfPage, [链接_title] => 职称OfLinkedPage)
,我希望能表明这一点。
文字在我添加这张幻灯塔之前就行了,之后,文字将不再执行,而不会有错误、警告、通知或任何内容;这根本就永远不会达到文字的终结。 我列入了<代码>set_time_limit(0),以防止执行时间从到期,因为这一规定需要一些时间才能完成。 该代码是在产品阵列出现之后实施的,如果发现任何链接,产品总是一个阵列,而且我已经输出了检测案例中的美元链路——html——显示,这些页面正在按预期检索。 这是犯罪法典:
// Populate the destination link titles
if ( isset( $products ) && count( $products ) > 0 )
{
foreach( $products as $id => $product )
{
$from_this_page = $product[ url ];
if ( $DEBUG ) echo Parsing . $from_this_page . .<br /> ;
$link_html_string = file_get_contents( $from_this_page, NULL, NULL, NULL, 500 );
$string_parts = explode( <title> , $link_html_string );
$string_parts = explode( </title> , $string_parts[1] );
$products[$id][ link_title ] = $string_parts[0];
if ( $DEBUG ) echo Found title: . $products[$id][ link_title ] . <br /> ;
ob_flush();
flush();
}
}
然而,在阅读整个档案时,我对记忆的使用感到有些关切,因此,我通过限制读数来减少负荷(我认为)。 我或许认为,这本书正在使用所有给人头盔的记忆。 如果列入这一条款,它会数次通过这一漏洞,但在某个时候停止执行,这一点也不完全相同。 我会收到好几处重复的复文。
This is the full code for the script, included to answer questions regarding contents of $products in comments.
<?php
// PHP HTML DOM Parser from http://simplehtmldom.sourceforge.net/
require_once( includes/simple_html_dom.php );
//error_reporting( E_ALL );
set_time_limit( 0 );
// Debugging flag
$DEBUG = false;
function reportProducts( $category, $products )
{
echo <table width="90%" align="center"><tr><th colspan="3"> ;
echo $category . has . count( $products ) . products listed, or in subpages. ;
echo </th></tr> ;
echo <tr><td bgcolor="#777777" width="30%">This page</td>
<td bgcolor="#bbbbbb" width="30%">links with</td>
<td bgcolor="#777777" width="30%">to this page</td></tr> ;
foreach( $products as $product )
{
echo <tr><td bgcolor="#777777"> . $product[ title ] . </td>
<td bgcolor="#bbbbbb"><a href=" . $product[ url ] . "> . $product[ url ] .
</a></td><td bgcolor="#777777"> . $product[ link_title ] . </td></tr> ;
}
echo </table><br /> ;
ob_flush(); // Server may buffer again, preventing incremental display
flush();
}
function parseProductsForPage( $page_to_parse )
{
global $DEBUG;
$failed = false;
$product_id = 0;
$page_dom = new simple_html_dom();
$page_html_string = @file_get_contents( $page_to_parse->href );
$load_state = @$page_dom->load( $page_html_string );
if ( $load_state === NULL )
{
// Find any direct product pages for this page
if ( $DEBUG ) echo $page_to_parse->href . being checked for products... ;
$possible = $page_dom->find( a[onclick] );
foreach( $possible as $link )
{
if ( $link->innertext == "[ Add to cart ]" )
{
$products[$product_id][ url ] = $link->href;
$titles = $page_dom->find( title );
$products[$product_id][ title ] = $titles[0]->innertext;
$product_id++;
}
}
if ( $DEBUG )
{
if ( isset( $products ) )
{
echo count( $products ) . found on page.<br /> ;
} else
{
echo 0 found on page.<br /> ;
}
}
// Find subpages...
if ( $DEBUG ) echo $page_to_parse->href . being checked for links... ;
$subpages = $page_dom->find( a[class=buy] );
if ( $DEBUG ) echo count( $subpages ) . found.<br /> ;
// ... and parse
foreach( $subpages as $subpage )
{
$subpage_dom = new simple_html_dom();
$subpage_html_string = @file_get_contents( $subpage->href );
$load_state = @$subpage_dom->load( $subpage_html_string );
if ( $load_state === NULL )
{
// Find any direct product pages for this page
if ( $DEBUG ) echo $subpage->href . being checked for products... ;
$possible = $subpage_dom->find( a[onclick] );
foreach( $possible as $link )
{
if ( $link->innertext == "[ Add to cart ]" )
{
$products[$product_id][ url ] = $link->href;
$titles = $page_dom->find( title );
$products[$product_id][ title ] = $titles[0]->innertext;
$product_id++;
}
}
if ( $DEBUG )
{
if ( isset( $products ) )
{
echo count( $products ) . found on page.<br /> ;
} else
{
echo 0 found on page.<br /> ;
}
}
$subpage_dom->clear();
} else
{
$failed[] = $subpage->href;
}
$subpage_dom->clear();
unset( $subpage_dom );
}
// Populate the destination link titles
if ( isset( $products ) && count( $products ) > 0 )
{
foreach( $products as $id => $product )
{
// $from_this_page = $product[ url ];
// if ( $DEBUG ) echo Parsing . $from_this_page . .<br /> ;
// $link_html_string = file_get_contents( $from_this_page, NULL, NULL, NULL, 500 );
// $string_parts = explode( <title> , $link_html_string );
// $string_parts = explode( </title> , $string_parts[1] );
// $products[$id][ link_title ] = $string_parts[0];
// if ( $DEBUG ) echo Found title: . $products[$id][ link_title ] . <br /> ;
// ob_flush();
// flush();
}
}
} else
{
$failed[] = $page_to_parse->href;
}
$titles = $page_dom->find( title );
if ( isset( $products ) ) reportProducts( $titles[0]->innertext, $products );
$page_dom->clear();
unset( $page_dom );
return $failed;
}
// Initialize the object
$html = new simple_html_dom();
$html->load_file( index.html );
// Start output buffer
ob_start();
// Find all product categories listed on the website
if ( $DEBUG ) echo <h1>Collecting links from LHN...</h1> ;
$sidelinks = $html->find( a[class=sidelink_main] );
$html->clear();
unset( $html );
echo <h1>Found . count( $sidelinks ) . categories.</h1><br /> ;
ob_flush(); // Server may buffer output, preventing incremental display
flush();
// Find links and products for each category
foreach( $sidelinks as $sidelink )
{
if ( $DEBUG ) echo Sending . $sidelink->href . to parser.<br /> ;
$parse_failed = parseProductsForPage( $sidelink );
if ( $parse_failed )
{
foreach( $parse_failed as $failure )
{
$failures[] = $failure;
}
}
}
echo count( $failures ) . pages failed to parse.<br /> ;
echo <br />FIN!<br /> ; // Easily searched to verfiy end of script was reached, also
// celebratory.
ob_end_flush(); // Clear output buffer
flush();
?>