分享一段按关键字抓取 components.arrow.com 站点页面的 PHP 代码

  

复制代码 代码如下:

  <?php

  /**

  * HOST: components.arrow.com

  */

  //set_time_limit(0);

  // base function

  /**
   * Perform an HTTP GET request with cURL, following redirects.
   *
   * @param string       $url     Target URL; $data (if any) is appended as a query string.
   * @param array|string $data    Query parameters; an array is encoded with http_build_query(),
   *                              a string is appended as-is.
   * @param array        $header  Header lines passed to CURLOPT_HTTPHEADER.
   * @param int          $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT only;
   *                              no total transfer timeout is set).
   * @param int          $port    Remote port passed to CURLOPT_PORT.
   * @param string       $reffer  Referer header value; skipped when empty.
   * @param string       $proxy   Proxy host; when non-empty the fixed proxy port and
   *                              credentials below are applied as well.
   * @return array 'result' holds the curl_exec() return value; 'error' is present only
   *               when curl_errno() is non-zero.
   */
  function curl_get($url, $data = array(), $header = array(), $timeout = 15, $port = 80, $reffer = '', $proxy = '')
  {
      if (!empty($data)) {
          $query = is_array($data) ? http_build_query($data) : $data;
          // strpos() returns 0 for a match at offset 0, which is falsy; compare with
          // false explicitly so an existing '?' is always detected.
          $url .= (strpos($url, '?') !== false ? '&' : '?') . $query;
      }

      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
      curl_setopt($ch, CURLOPT_POST, 0);
      curl_setopt($ch, CURLOPT_PORT, $port);
      curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); // fetch the page a redirect points to

      if ($reffer) {
          curl_setopt($ch, CURLOPT_REFERER, $reffer);
      }

      if ($proxy) {
          curl_setopt($ch, CURLOPT_PROXY, $proxy);
          // NOTE(review): proxy port and credentials are hard-coded in source; they are kept
          // unchanged for compatibility but should be moved to configuration.
          curl_setopt($ch, CURLOPT_PROXYPORT, 1723);
          curl_setopt($ch, CURLOPT_PROXYUSERPWD, "andhm001:andhm123");
      }

      $result = array();
      $result['result'] = curl_exec($ch);
      if (0 != curl_errno($ch)) {
          $result['error'] = "Error:\n" . curl_error($ch);
      }
      curl_close($ch);

      return $result;
  }

  

复制代码 代码如下:

  /**
   * Perform an HTTP POST request with cURL.
   *
   * @param string       $url     Target URL.
   * @param array|string $data    Request body handed to CURLOPT_POSTFIELDS unchanged.
   * @param array        $header  Header lines; CURLOPT_HTTPHEADER is set only when non-empty.
   * @param int          $timeout Connection timeout in seconds (CURLOPT_CONNECTTIMEOUT).
   * @param int          $port    Remote port passed to CURLOPT_PORT.
   * @return array 'result' holds the curl_exec() return value; 'error' is present only
   *               when curl_errno() is non-zero.
   */
  function curl_post($url, $data = array(), $header = array(), $timeout = 15, $port = 80)
  {
      $handle = curl_init();

      // Collect the options in the order they have to be applied.
      $options = array(
          CURLOPT_URL            => $url,
          CURLOPT_RETURNTRANSFER => true,
          CURLOPT_CONNECTTIMEOUT => $timeout,
          CURLOPT_PORT           => $port,
      );
      if (!empty($header)) {
          $options[CURLOPT_HTTPHEADER] = $header;
      }
      $options[CURLOPT_POST] = 1;
      $options[CURLOPT_POSTFIELDS] = $data;

      foreach ($options as $option => $value) {
          curl_setopt($handle, $option, $value);
      }

      $response = array('result' => curl_exec($handle));
      if (curl_errno($handle) != 0) {
          $response['error'] = "Error:\n" . curl_error($handle);
      }
      curl_close($handle);

      return $response;
  }

  /**
   * Fetch the HTML source of one search-result list page.
   *
   * @param string $keywords Search keyword.
   * @param int    $start    Offset of the first record to request.
   * @return string|false The 'result' value returned by curl_post(), or false when
   *                      $start is negative or the request reported an error.
   */
  function getListHtml($keywords, $start = 0)
  {
      if ($start < 0) {
          return false;
      }

      $postData = array(
          'search_token' => $keywords,
          'start' => $start,
          'limit' => 100,
      );

      // The keyword becomes a URL path segment, so it has to be percent-encoded;
      // otherwise spaces, '/', '?' or '#' in the keyword produce a broken URL.
      $url = 'http://components.arrow.com/part/search/' . rawurlencode($keywords);

      $result = curl_post($url, http_build_query($postData));
      if (isset($result['error'])) {
          return false;
      }

      return $result['result'];
  }

  /**
   * Extract the detail-page link (href) of every part row in a list page.
   *
   * @param string $html HTML source of the list page.
   * @return array Captured href values in document order; empty array when nothing matches.
   */
  function getListHref($html)
  {
      $linkPattern = '/<td\s+class="col_mfr_part_num"><a\s+href="(.[^>]+)">/isU';

      // preg_match_all() yields the number of matches; zero means there are no links.
      return preg_match_all($linkPattern, $html, $found) ? $found[1] : array();
  }

  /**
   * Read the "start" offset of the next list page from the buildPagination() script call.
   *
   * @param string $html HTML source of the list page.
   * @return int Third argument of buildPagination() as an integer, or -1 when the call is not found.
   */
  function getListNextPage($html)
  {
      $paginationPattern = '/<script\s+language="javascript">buildPagination\(\'\d+\',\'\d+\',\'(\d+)\',\d+\);<\/script>/isU';

      if (!preg_match($paginationPattern, $html, $found)) {
          return -1;
      }

      return intval($found[1]);
  }

  /**
   * Collect the detail-page links from every list page of a keyword search.
   *
   * @param string $keywords Search keyword.
   * @return array|false False when $keywords is empty; otherwise the links of all pages
   *                     (empty array when the first page has none).
   */
  function getListHrefAll($keywords)
  {
      if (empty($keywords)) {
          return false;
      }

      $html = getListHtml($keywords);
      $hrefList = getListHref($html);
      if (empty($hrefList)) {
          // no results
          return array();
      }

      // Offsets already requested. Stopping on a repeated offset prevents an endless
      // loop when a page keeps reporting an offset that was fetched before.
      $visited = array(0 => true);

      $nextPage = getListNextPage($html);
      while ($nextPage > 0 && !isset($visited[$nextPage])) {
          $visited[$nextPage] = true;

          $html = getListHtml($keywords, $nextPage);
          $hrefList = array_merge($hrefList, getListHref($html));
          $nextPage = getListNextPage($html);
      }

      return $hrefList;
  }

  /**
   * Return the trimmed first capture group of $pattern in $html.
   *
   * @param string $pattern PCRE pattern with at least one capture group.
   * @param string $html    Text to search.
   * @return string|null Trimmed capture, or null when the pattern does not match.
   */
  function getDetailMatch($pattern, $html)
  {
      return preg_match($pattern, $html, $m) ? trim($m[1]) : null;
  }

  /**
   * Fetch a part detail page and extract its fields.
   *
   * @param string $url Path of the detail page; it is appended to http://components.arrow.com.
   * @return array|false False when $url is empty; an empty array when the request fails or
   *                     the "Part No" field cannot be found; otherwise the field map below.
   *                     'para' and 'price' are arrays when values were found and '' otherwise.
   */
  function getDetail($url)
  {
      if (empty($url)) {
          return false;
      }

      $host = 'http://components.arrow.com';
      $url = $host . $url;

      $response = curl_get($url);
      if (isset($response['error'])) {
          return array();
      }
      $html = $response['result'];

      $result = array(
          'sup_part' => '', // supplier part number
          'sup_id' => '', // supplier ID
          'mfg_part' => '', // manufacturer part number
          'mfg_name' => '', // manufacturer name
          'cat_name' => '', // category name
          'para' => '', // attributes (name => value)
          'desc' => '', // description
          'pdf_url' => '', // datasheet PDF URL
          'sup_stock' => '', // stock quantity
          'min_purch' => '', // minimum order quantity
          'price' => '', // prices (minimum quantity => price)
          'img_url' => '', // image URL
          'createtime' => '', // creation time
          'datacode' => '', // date/lot code
          'package' => '', // package type
          'page_url' => '', // page URL
      );

      // mfg_part: a page without a part number is treated as unusable. The previous
      // debug dump to page.txt followed by die() aborted the whole run here.
      $value = getDetailMatch('/<li>[\s\n]*<strong>Part No:\s*<\/strong>(.+)<\/li>/isU', $html);
      if ($value === null) {
          return array();
      }
      $result['mfg_part'] = $value;

      // mfg_name
      $value = getDetailMatch('/<li>[\s\n]*<strong>Manufacturer: <\/strong>(.+)<\/li>/isU', $html);
      if ($value !== null) {
          $result['mfg_name'] = $value;
      }

      // cat_name: the page separates category levels with '|'
      $value = getDetailMatch('/displayCategory\(\'(.[^\']+)\'\);/isU', $html);
      if ($value !== null) {
          $result['cat_name'] = str_replace('|', '>', $value);
      }

      // para / package: rows of the part_specs table. The rows are gathered in a local
      // array because writing array offsets into the '' default is not valid on PHP >= 7.1.
      $tablePattern = '/<table\s+id="part_specs".[^>]*>(.+)<\/table>/isU';
      if (preg_match($tablePattern, $html, $table)) {
          $rowPattern = '/<tr>[\s\n]*<td><strong>(.+)<\/strong><\/td><td>(.+)<\/td>[\s\n]*<\/tr>/isU';
          if (preg_match_all($rowPattern, $table[1], $rows)) {
              $para = array();
              foreach ($rows[1] as $k => $name) {
                  $name = trim($name);
                  if ('Package Type' == $name) {
                      $result['package'] = trim($rows[2][$k]);
                      continue;
                  }
                  $para[$name] = trim($rows[2][$k]);
              }
              if (!empty($para)) {
                  $result['para'] = $para;
              }
          }
      }

      // desc
      $value = getDetailMatch('/<div\s+id="part_title">.+<h4>(.+)<\/h4>[\s\n]*<\/div>/isU', $html);
      if ($value !== null) {
          $result['desc'] = $value;
      }

      // pdf_url
      $value = getDetailMatch('/<li\s+class="datasheet">[\s\n]*<strong>Datasheet:<\/strong><a\s+href="(.[^"]+)"/isU', $html);
      if ($value !== null) {
          $result['pdf_url'] = $host . $value;
      }

      // sup_stock: drop the thousands separators
      $value = getDetailMatch('/<td\s+id="inv_1"\s+class="li_inv">([\d,]+)<\/td>/isU', $html);
      if ($value !== null) {
          $result['sup_stock'] = str_replace(',', '', $value);
      }

      // min_purch
      $value = getDetailMatch('/<span\s+id="multiples">[\s\n]*<strong>Multiple:\s*<\/strong>(.+)<\/span>/isU', $html);
      if ($value !== null) {
          $result['min_purch'] = $value;
      }

      // price: the inline price is stored under quantity 1; tiered prices are then read
      // from the JSON document whose URL is in the title attribute of the price span.
      $prices = array();
      $value = getDetailMatch('/<div\s+id="price_1"\s+class="li_price">(.[^<]+)<\/div>/isU', $html);
      if ($value !== null) {
          $prices[1] = $value;
      }

      $pattern = '/<div\s+id="price_1"\s+class="li_price">[\s\n]*<span.[^>]+title="(.[^"]+)">/isU';
      if (preg_match($pattern, $html, $matches)) {
          // The attribute value is HTML-escaped. The original replaced '&' with '&',
          // a no-op; presumably an '&amp;' that was lost when the code was published.
          $priceurl = str_replace('&amp;', '&', $matches[1]);
          $priceResponse = curl_get($priceurl);
          $json = $priceResponse['result'];
          if (!empty($json)) {
              $decoded = json_decode($json, true);
              // Only walk the price tiers when the expected structure is really there.
              if (isset($decoded['parts'][0]['webprice']['resale'])
                  && is_array($decoded['parts'][0]['webprice']['resale'])
              ) {
                  foreach ($decoded['parts'][0]['webprice']['resale'] as $tier) {
                      if (isset($tier['minqty'], $tier['price'])) {
                          $prices[$tier['minqty']] = $tier['price'];
                      }
                  }
              }
          }
      }
      if (!empty($prices)) {
          $result['price'] = $prices;
      }

      // img_url
      $value = getDetailMatch('/<div\s+id="part_image">[\s\n]*<img\s+src="(.[^"]+)"/isU', $html);
      if ($value !== null) {
          $result['img_url'] = $value;
      }

      // page_url
      $result['page_url'] = $url;

      return $result;
  }

  /**
   * Entry point: search by keyword and fetch the detail data of every hit.
   *
   * @param string $keywords Search keyword.
   * @return array One getDetail() result per link found; empty array when there are no links.
   */
  function getData($keywords)
  {
      $result = array();

      $hrefList = getListHrefAll($keywords);
      // getListHrefAll() returns false for an empty keyword; iterating over false
      // would raise a warning, so treat anything that is not an array as "no links".
      if (!is_array($hrefList)) {
          return $result;
      }

      foreach ($hrefList as $href) {
          $result[] = getDetail($href);
      }

      return $result;
  }

  // Test script: read the search keyword from the query string and dump the scraped data.
  // A missing or non-string "keywords" parameter is treated as an empty keyword instead of
  // raising an undefined-index warning or passing null/an array to trim().
  $keywords = (isset($_GET['keywords']) && is_string($_GET['keywords'])) ? trim($_GET['keywords']) : '';
  $result = getData($keywords);
  print_r($result);