PHP 实现的一个很好用的 HTML 解析器类，可用于采集数据

复制代码 代码如下:

  <?php

  // Start from an empty libxml error buffer, then switch libxml to internal
  // error collection so malformed real-world HTML does not emit PHP warnings.
  // The previous setting is kept in $oldSetting.
  libxml_clear_errors();

  $oldSetting = libxml_use_internal_errors(true);

  /**
   * -+-----------------------------------
   * |PHP5 Framework - 2011
   * |Web Site: www.iblue.cc
   * |E-mail: [email protected]
   * |Date: 2012-10-12
   * -+-----------------------------------
   *
   * @desc HTML parser: a thin wrapper around DOMDocument/DOMXPath for scraping.
   *
   * One instance is either the document root (after loadHtml(), node path '')
   * or a single node identified by its absolute XPath. find() returns new
   * wrappers and never modifies the instance it is called on, so the same
   * object can be queried repeatedly.
   *
   * Failure convention (kept from the original API): text() and
   * getAttribute() return false when there is no node to read from.
   *
   * @author jingke
   */
  class XF_HtmlDom
  {
      /** @var DOMXPath|null XPath evaluator shared by all wrappers of one document. */
      private $_xpath = null;

      /** @var string Absolute XPath of the wrapped node; '' for the document root or an empty result. */
      private $_nodePath = '';

      /**
       * @param DOMXPath|null $xpath    Evaluator of an already loaded document, or null.
       * @param string        $nodePath Absolute XPath of the node to wrap, '' for the root.
       */
      public function __construct($xpath = null, $nodePath = '')
      {
          $this->_xpath = $xpath;
          $this->_nodePath = $nodePath;
      }

      /**
       * Load an HTML document from a local path or an http(s) URL.
       *
       * Sources that START with "http://" or "https://" are fetched with cURL;
       * everything else goes through file_get_contents(). (A path that merely
       * contains "http" somewhere is treated as a file, not as a URL.)
       *
       * If nothing could be fetched or parsed, the instance wraps an empty
       * document, so later find() calls simply yield no results.
       *
       * @param string $url Local file path or http(s) URL.
       * @return bool True when content was fetched and parsed, false otherwise.
       */
      public function loadHtml($url)
      {
          // Only affects PHP's own http:// stream wrapper, not the cURL request
          // below; kept so the process-wide setting stays as it was before.
          ini_set('user_agent', 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

          $isRemote = stripos($url, 'http://') === 0 || stripos($url, 'https://') === 0;
          $content = $isRemote ? $this->_fetchRemote($url) : file_get_contents($url);

          $html = new DOMDocument();
          $loaded = false;

          // Both fetchers return false on failure; an empty body is useless too
          // and DOMDocument::loadHTML rejects it.
          if (is_string($content) && $content !== '') {
              // Collect parse diagnostics internally for the duration of the
              // parse, then drop them so they do not pile up across many pages.
              $previous = libxml_use_internal_errors(true);
              $loaded = (bool) $html->loadHTML($content);
              libxml_clear_errors();
              libxml_use_internal_errors($previous);
          }

          $this->_xpath = new DOMXPath($html);

          return $loaded;
      }

      /**
       * Run an XPath query relative to this node.
       *
       * On the root the query is prefixed with "//" (search the whole
       * document); on a node it is prefixed with "<node path>/" (direct
       * children, unless the query itself starts with "/").
       *
       * @param string   $query XPath fragment, e.g. "td[@id='baidu']/a".
       * @param int|null $index Null (default) returns every match; a number
       *                        returns only the match at that 0-based position.
       * @return XF_HtmlDom[]|XF_HtmlDom Array of wrappers when $index is null;
       *         otherwise one wrapper, which is an empty wrapper (text() and
       *         getAttribute() return false) when nothing matched.
       */
      public function find($query, $index = null)
      {
          // '' and false were treated like null by the original loose check.
          $wantsAll = ($index === null || $index === '' || $index === false);

          $nodes = null;
          if ($this->_xpath instanceof DOMXPath) {
              // Build the prefix locally: writing it back to $this->_nodePath
              // would corrupt the path for every later call on this object.
              $prefix = ($this->_nodePath == '') ? '//' : $this->_nodePath . '/';
              $result = $this->_xpath->query($prefix . $query);
              if ($result !== false) {
                  $nodes = $result;
              }
          }

          if ($wantsAll) {
              $wrapped = array();
              if ($nodes !== null) {
                  foreach ($nodes as $node) {
                      $wrapped[] = new self($this->_xpath, $node->getNodePath());
                  }
              }
              return $wrapped;
          }

          $node = ($nodes !== null) ? $nodes->item((int) $index) : null;
          if ($node === null) {
              // No match: hand back an empty wrapper instead of dereferencing null.
              return new self();
          }

          return new self($this->_xpath, $node->getNodePath());
      }

      /**
       * Get the text content of the wrapped node.
       *
       * @return string|false Text content, or false when there is no node.
       */
      public function text()
      {
          $node = $this->_node();

          return ($node === null) ? false : $node->textContent;
      }

      /**
       * Get an attribute value of the wrapped node.
       *
       * @param string $name Attribute name.
       * @return string|false Attribute value ('' when the attribute is absent),
       *                      or false when there is no element to read from.
       */
      public function getAttribute($name)
      {
          $node = $this->_node();

          // Only elements carry attributes; other node types have no getAttribute().
          return ($node instanceof DOMElement) ? $node->getAttribute($name) : false;
      }

      /**
       * Property-style access: ->innertext maps to text(), any other name is
       * read as an attribute of the wrapped node.
       *
       * @param string $name Property name.
       * @return string|false
       */
      public function __get($name)
      {
          if ($name === 'innertext') {
              return $this->text();
          }

          return $this->getAttribute($name);
      }

      /**
       * Resolve the wrapped node from its stored path.
       *
       * @return DOMNode|null Null for the root/empty wrapper or a stale path.
       */
      private function _node()
      {
          if ($this->_nodePath == '' || !($this->_xpath instanceof DOMXPath)) {
              return null;
          }

          $nodes = $this->_xpath->query($this->_nodePath);

          return ($nodes === false) ? null : $nodes->item(0);
      }

      /**
       * Download a URL with cURL, following redirects.
       *
       * @param string $url http(s) URL.
       * @return string|false Response body, or false on failure.
       */
      private function _fetchRemote($url)
      {
          $ch = curl_init();
          if ($ch === false) {
              return false;
          }

          curl_setopt($ch, CURLOPT_URL, $url);
          curl_setopt($ch, CURLOPT_HEADER, false);
          curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
          curl_setopt($ch, CURLOPT_REFERER, $url);
          curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0');
          curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

          $body = curl_exec($ch);
          curl_close($ch);

          return $body;
      }
  }

  // Usage demo: fetch a page and print the text of the link inside the cell
  // whose id is "baidu".
  $dom = new XF_HtmlDom();

  $dom->loadHtml('http://www.aizhan.com/siteall/www.opendir.cn/');

  $baiduLink = $dom->find("td[@id='baidu']/a", 0);

  print_r($baiduLink->innertext);