分享一段按关键字抓取 www.icbase.com 站点页面的 PHP 代码(带 ASP.NET 参数)

  

复制代码 代码如下:

  <?php

  /**

  * HOST: www.icbase.com

  */

  //set_time_limit(0);

  // base function

  /**
   * Perform an HTTP GET request with cURL.
   *
   * @param string       $url     Target URL.
   * @param array|string $data    Query parameters; an array is encoded with http_build_query(),
   *                              a string is appended as-is.
   * @param array        $header  Request header lines passed to CURLOPT_HTTPHEADER.
   * @param int          $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT).
   * @param int          $port    Remote port passed to CURLOPT_PORT.
   * @param string       $reffer  Referer header value; not sent when empty.
   * @param string       $proxy   Proxy host; no proxy is configured when empty.
   * @return array 'result' holds the curl_exec() return value; 'error' is present only
   *               when cURL reports a non-zero error number.
   */
  function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
  {
      $ch = curl_init();
      if (!empty($data)) {
          $data = is_array($data) ? http_build_query($data) : $data;
          // Compare with false explicitly: strpos() returns 0 (falsy) when '?' is the first character.
          $url .= (strpos($url, '?') !== false ? '&' : '?') . $data;
      }
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_POST, 0);
      // NOTE(review): the port is always forced (default 80), regardless of the URL's scheme or port — confirm this is intended.
      curl_setopt($ch, CURLOPT_PORT, $port);
      curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
      // Follow redirects and return the final page.
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
      $reffer && curl_setopt($ch, CURLOPT_REFERER, $reffer);
      if ($proxy) {
          curl_setopt($ch, CURLOPT_PROXY, $proxy);
          // NOTE(review): proxy port and credentials are hard-coded in source; move them to configuration.
          curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
          curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
      }
      $result = array();
      $result['result'] = curl_exec($ch);
      if (0 != curl_errno($ch)) {
          $result['error'] = "Error:\n" . curl_error($ch);
      }
      curl_close($ch);
      return $result;
  }

  

复制代码 代码如下:

  /**
   * Perform an HTTP POST request with cURL.
   *
   * @param string       $url     Target URL.
   * @param array|string $data    Request body handed to CURLOPT_POSTFIELDS unchanged.
   * @param array        $header  Request header lines; only applied when non-empty.
   * @param int          $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT).
   * @param int          $port    Accepted for signature compatibility; not applied to the request.
   * @return array 'result' holds the curl_exec() return value; 'error' is present only
   *               when cURL reports a non-zero error number.
   */
  function curl_post($url, $data = array(), $header = array(), $timeout = 5, $port = 80)
  {
      $handle = curl_init();

      curl_setopt($handle, CURLOPT_URL, $url);
      curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, $timeout);
      if (!empty($header)) {
          curl_setopt($handle, CURLOPT_HTTPHEADER, $header);
      }
      curl_setopt($handle, CURLOPT_POST, 1);
      curl_setopt($handle, CURLOPT_POSTFIELDS, $data);

      $response = array('result' => curl_exec($handle));
      if (curl_errno($handle) != 0) {
          $response['error'] = "Error:\n" . curl_error($handle);
      }

      curl_close($handle);

      return $response;
  }

  /**
   * Fetch the HTML source of one search-result list page.
   *
   * Page 1 is requested with GET. The hidden ASP.NET form fields found in that
   * response (__VIEWSTATE, __PREVIOUSPAGE, __EVENTVALIDATION) are remembered per
   * keyword, because later pages are requested as a "pager" postback that must
   * send those values back.
   *
   * @param string $keywords Search keywords.
   * @param int    $page     Page number; anything that is not a positive number is treated as page 1.
   * @return boolean|string HTML source of the page. false when $page is negative, the
   *                        request fails, a hidden field is missing from page 1, or a later
   *                        page is requested before the form state is available.
   */
  function getListHtml($keywords, $page = 1)
  {
      // Form state captured from page 1, keyed by keyword, so repeated or different searches do not interfere.
      static $formState = array();

      $listUrl = 'http://www.icbase.com/ProResult.aspx';
      $fields = array('__VIEWSTATE', '__PREVIOUSPAGE', '__EVENTVALIDATION');
      $key = (string) $keywords;

      if ($page < 0) {
          return false;
      }
      $page = max(1, intval($page));

      if ($page == 1) {
          $response = curl_get($listUrl, array('ProKey' => $keywords));
          if (isset($response['error'])) {
              return false;
          }
          $html = $response['result'];

          // Collect the values the ASP.NET postback needs.
          $state = array();
          foreach ($fields as $field) {
              $pattern = '/<input\s+type="hidden"\s+name="' . $field . '"\s+id="' . $field . '"\s+value="(.[^"]+)"/isU';
              if (!preg_match($pattern, $html, $matches)) {
                  return false;
              }
              $state[$field] = $matches[1];
          }
          $formState[$key] = $state;

          // Keep defining the same-named constants on first use for code that still reads them.
          foreach ($state as $field => $value) {
              if (!defined($field)) {
                  define($field, $value);
              }
          }

          return $html;
      }

      if (isset($formState[$key])) {
          $state = $formState[$key];
      } else {
          // No page-1 response seen for this keyword: fall back to the constants, if they exist.
          $state = array();
          foreach ($fields as $field) {
              if (!defined($field)) {
                  return false;
              }
              $state[$field] = constant($field);
          }
      }

      $data = array(
          '__EVENTTARGET' => 'pager',
          '__EVENTARGUMENT' => $page,
      ) + $state;

      // Encode the keyword the same way the GET request does (http_build_query).
      $response = curl_post($listUrl . '?ProKey=' . urlencode($keywords), $data);
      if (isset($response['error'])) {
          return false;
      }

      return $response['result'];
  }

  /**
   * Collect the link targets of the result entries on a list page.
   *
   * An entry is an <a href='...' target="_blank"> element whose content starts with an <img> tag.
   *
   * @param string $html HTML source of a list page.
   * @return array Captured href values in document order; empty when nothing matches.
   */
  function getListHref($html)
  {
      $found = array();
      $hits = preg_match_all(
          '/<a\s+href=\'(.[^\']+)\'\s+target="_blank"\s*>[\s\n]*<img.+[^>]\/>/isU',
          $html,
          $found
      );

      return $hits ? $found[1] : array();
  }

  /**
   * Read the number of the next page from the pager of a list page.
   *
   * Looks inside the "Pager" div for the postback link whose text is ">".
   *
   * @param string $html HTML source of a list page.
   * @return int Page number carried by that link, or -1 when no such link is found.
   */
  function getListNextPage($html)
  {
      $nextLink = '/<div\s+id="Pager".+[^>]>.+<a\s+href="javascript\:__doPostBack\(\'Pager\',\'(\d+)\'\)">><\/a>/isU';
      $captured = array();

      if (!preg_match($nextLink, $html, $captured)) {
          return -1;
      }

      return intval($captured[1]);
  }

  /**
   * Collect the detail-page links from every list page of a search.
   *
   * @param string $keywords Search keywords.
   * @return boolean|array false when $keywords is empty; otherwise the hrefs gathered from
   *                       all pages (an empty array when the first page yields none).
   */
  function getListHrefAll($keywords)
  {
      if (empty($keywords)) {
          return false;
      }

      $html = getListHtml($keywords);
      $hrefList = getListHref($html);
      if (empty($hrefList)) {
          // No results on the first page.
          return array();
      }

      $currentPage = 1;
      $nextPage = getListNextPage($html);
      // Only follow a "next" link that moves forward. If the site keeps answering with a page
      // whose next link does not advance, stopping here prevents an endless request loop.
      while ($nextPage > $currentPage) {
          $currentPage = $nextPage;
          $html = getListHtml($keywords, $currentPage);
          $hrefList = array_merge($hrefList, getListHref($html));
          $nextPage = getListNextPage($html);
      }

      return $hrefList;
  }

  /**
   * Extract product information from a detail page.
   *
   * @param string $url    Either the path of a detail page on www.icbase.com or already
   *                       fetched HTML source, depending on $is_url.
   * @param int    $is_url 1: $url is a path to fetch; 0: $url is HTML source to parse directly.
   * @return boolean|array false when $url is empty or the page cannot be fetched; an empty
   *                       array when the page contains no product model; otherwise an
   *                       associative array of fields. Fields that are not found stay ''.
   *                       'para' and 'price' become arrays when values are found.
   */
  function getDetail($url, $is_url = 1)
  {
      if (empty($url)) {
          return false;
      }

      $host = 'www.icbase.com';
      $html = $url;
      if ($is_url) {
          $url = '/' . ltrim($url, '/');
          $response = curl_get($host . $url);
          if (isset($response['error'])) {
              // Report the failure to the caller instead of terminating the whole script.
              return false;
          }
          $html = $response['result'];
      }

      $result = array(
          'sup_part' => '',   // supplier part number
          'sup_id' => '',     // supplier ID
          'mfg_part' => '',   // manufacturer part number
          'mfg_name' => '',   // manufacturer name
          'cat_name' => '',   // category name
          'para' => '',       // attributes
          'desc' => '',       // description
          'pdf_url' => '',    // PDF URL
          'sup_stock' => '',  // stock quantity
          'min_purch' => '',  // minimum order quantity
          'price' => '',      // price breaks
          'img_url' => '',    // image URL
          'createtime' => '', // creation time
          'datacode' => '',   // date code
          'package' => '',    // package
          'page_url' => '',   // page URL
      );

      // mfg_part: without a product model there is nothing else worth parsing.
      $pattern = '/<td>产品型号<\/td><td>(.[^<]+)</isU';
      if (!preg_match($pattern, $html, $matches)) {
          return array();
      }
      $result['mfg_part'] = trim($matches[1]);

      // mfg_name
      $pattern = '/<td>厂商<\/td>[\s\n]*<td>(.+)<\/td>/isU';
      if (preg_match($pattern, $html, $matches)) {
          $result['mfg_name'] = trim($matches[1]);
      }

      // para / desc: the first half of the cells are attribute names, the second half their values.
      $pattern = '/<tr\s+style="background-color:#E9E9E9;color:black; font-weight:bold;">(.+)<\/tr><\/table>/isU';
      if (preg_match($pattern, $html, $matches)
          && preg_match_all('/<td>(.+)<\/td>/isU', $matches[1], $cells)
      ) {
          $cells = $cells[1];
          $half = intval(count($cells) / 2);
          // Build the attributes in a real array: writing $result['para'][$name] onto the ''
          // default is a string-offset write on PHP >= 7.1, not an array assignment.
          $para = array();
          for ($i = 0; $i < $half; $i++) {
              $name = trim($cells[$i]);
              $value = trim($cells[$half + $i]);
              if ($name === '描述') {
                  $result['desc'] = $value;
                  continue;
              }
              $para[$name] = $value;
          }
          if (!empty($para)) {
              $result['para'] = $para;
          }
      }

      // pdf_url
      $pattern = '/<td>详细资料<\/td><td><a\s+href="(.[^"]+)"/isU';
      if (preg_match($pattern, $html, $matches)) {
          $result['pdf_url'] = trim($matches[1]);
      }

      // sup_stock
      $pattern = '/<td>库存数量<\/td>[\s\n]*<td>(\d+)<\/td>/isU';
      if (preg_match($pattern, $html, $matches)) {
          $result['sup_stock'] = trim($matches[1]);
      }

      // price: map each quantity break to its unit price (same string-offset concern as 'para').
      $pattern = '/<tr><td.[^>]+>(\d+)\+<\/td><td.[^>]+>.[^\d]*([\d.]+)<\/td><\/tr>/isU';
      if (preg_match_all($pattern, $html, $matches)) {
          $prices = array();
          foreach ($matches[1] as $k => $quantity) {
              $prices[$quantity] = '¥' . $matches[2][$k];
          }
          $result['price'] = $prices;
      }

      // img_url
      $pattern = '/<td>图片<\/td><td><img\s+src="(.[^"]+)"/isU';
      if (preg_match($pattern, $html, $matches)) {
          $result['img_url'] = trim($matches[1]);
      }

      // page_url is only known when the page was fetched by URL.
      if ($is_url) {
          $result['page_url'] = $host . $url;
      }

      return $result;
  }

  /**
   * Entry point: search by keyword and scrape every detail page found.
   *
   * @param string $keywords Search keywords.
   * @return array One getDetail() result per link found, in list order; empty when the
   *               search yields no links or the keywords are empty.
   */
  function getData($keywords)
  {
      $result = array();

      $hrefList = getListHrefAll($keywords);
      // getListHrefAll() returns false for empty keywords; iterating over false raises a warning.
      if (!is_array($hrefList)) {
          return $result;
      }

      foreach ($hrefList as $href) {
          $result[] = getDetail($href);
      }

      return $result;
  }

  // Test script: scrape everything found for the "keywords" query parameter and dump it.
  // A missing or non-string parameter is treated as an empty search instead of raising
  // an undefined-index notice or a trim() type error.
  $keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
  $result = getData($keywords);
  print_r($result);