使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

  Web 前端代码

  

代码如下:

  <%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>

  <%-- WebForms page whose code-behind class is _Default (Default.aspx.cs). It renders a three-column table of posts through Repeater1. --%>

  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

  <html xmlns="http://www.w3.org/1999/xhtml">

  <head runat="server">

  <title></title>

  </head>

  <body>

  <form id="form1" runat="server">

  <div>

  <%-- The table rows are produced entirely by the Repeater templates below. --%>

  <table cellpadding="1" cellspacing="1" bgcolor="#f1f1f1" style="text-align: center">

  <asp:Repeater ID="Repeater1" runat="server">

  <HeaderTemplate>

  <%-- Header row: title / author / publish time column captions. --%>

  <tr>

  <td>

  标题

  </td>

  <td>

  发布作者

  </td>

  <td>

  发布时间

  </td>

  </tr>

  </HeaderTemplate>

  <ItemTemplate>

  <%-- One row per bound item; Eval reads the url, title, authorUrl, author and updatetime properties of the bound object. --%>

  <%-- NOTE(review): the data-binding expressions emit values without HTML/attribute encoding; confirm the bound values are safe to write verbatim. --%>

  <tr bgcolor="#ffffff">

  <td align="left">

  <a href='<%#Eval("url") %>' target="_blank">

  <%#Eval("title") %>

  </a>

  </td>

  <td>

  <a href='<%#Eval("authorUrl") %>' target="_blank">

  <%#Eval("author") %>

  </a>

  </td>

  <td>

  <%#Eval("updatetime") %>

  </td>

  </tr>

  </ItemTemplate>

  </asp:Repeater>

  </table>

  </div>

  </form>

  </body>

  </html>

  C# 后台代码(Default.aspx.cs):

  

代码如下:

  using System;

  using System.Collections.Generic;

  using System.Linq;

  using System.Web;

  using System.Web.UI;

  using System.Web.UI.WebControls;

  using S1;

  using System.Net;

  using System.IO;

  using System.Text;

  using HtmlAgilityPack;

  /// <summary>
  /// Code-behind for Default.aspx. On the first (non-postback) request it
  /// downloads a cnblogs.com listing page, extracts the posts with
  /// HtmlAgilityPack XPath queries and binds them to <c>Repeater1</c>.
  /// </summary>
  public partial class _Default : System.Web.UI.Page
  {
      /// <summary>Root address of the site whose post list is scraped.</summary>
      private const string BaseAddress = "http://www.cnblogs.com";

      /// <summary>
      /// Loads and binds the post list. Nothing is done on postbacks.
      /// </summary>
      /// <param name="sender">Event source (unused).</param>
      /// <param name="e">Event data (unused).</param>
      protected void Page_Load(object sender, EventArgs e)
      {
          if (IsPostBack)
          {
              return;
          }

          string address = BaseAddress;

          // Paging: the optional "p" query-string value (e.g. p=p2, p=p3) is
          // appended as a path segment of the listing address.
          string pageSegment = Request.QueryString["p"];
          if (!string.IsNullOrEmpty(pageSegment))
          {
              address += "/" + pageSegment;
          }

          string html = DownloadHtml(address);

          this.Repeater1.DataSource = ParsePosts(html);
          this.Repeater1.DataBind();
      }

      /// <summary>
      /// Downloads <paramref name="address"/> and returns the body decoded as UTF-8.
      /// </summary>
      /// <param name="address">Absolute URL to fetch.</param>
      /// <returns>The response body as text.</returns>
      private static string DownloadHtml(string address)
      {
          // All three objects are IDisposable; the using blocks release the
          // connection and stream even when reading throws.
          using (WebClient client = new WebClient())
          using (Stream stream = client.OpenRead(address))
          using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
          {
              return reader.ReadToEnd();
          }
      }

      /// <summary>
      /// Extracts the posts found under the element with id <c>post_list</c>.
      /// </summary>
      /// <param name="html">Markup of the listing page.</param>
      /// <returns>
      /// One <see cref="Cnblogs"/> per title link found; an empty list when the
      /// markup is empty or does not contain the expected structure.
      /// </returns>
      private static IList<Cnblogs> ParsePosts(string html)
      {
          IList<Cnblogs> posts = new List<Cnblogs>();
          if (string.IsNullOrEmpty(html))
          {
              return posts;
          }

          HtmlDocument doc = new HtmlDocument();
          doc.LoadHtml(html);

          // GetElementbyId returns null when the id is absent.
          HtmlNode postList = doc.GetElementbyId("post_list");
          if (postList == null)
          {
              return posts;
          }

          // The leading "." keeps the query relative to post_list; a bare "//"
          // would search the whole document even when called on this node.
          // XPath positions are 1-based, so div[2] is the second div child of
          // its parent, which is where the title link's h3 is expected.
          HtmlNodeCollection titleLinks = postList.SelectNodes(".//div[2]/h3/a");
          if (titleLinks == null)
          {
              // SelectNodes returns null (not an empty collection) on no match.
              return posts;
          }

          foreach (HtmlNode titleLink in titleLinks)
          {
              Cnblogs post = new Cnblogs();
              post.url = titleLink.GetAttributeValue("href", string.Empty);
              post.title = titleLink.InnerText;

              // Walk a -> h3 -> div[2] and look up the author link inside that
              // same div, so author data always belongs to this post instead
              // of relying on two separately-queried lists staying aligned.
              HtmlNode body = titleLink.ParentNode.ParentNode;
              HtmlNode authorLink = body.SelectSingleNode("./div/a");
              if (authorLink != null)
              {
                  post.author = authorLink.InnerText;
                  post.authorUrl = authorLink.GetAttributeValue("href", string.Empty);

                  // The publish time is read from the node right after the
                  // author link, with the "发布于" prefix stripped.
                  HtmlNode timeNode = authorLink.NextSibling;
                  if (timeNode != null)
                  {
                      post.updatetime = timeNode.InnerText.Replace("发布于", "").Trim();
                  }
              }

              posts.Add(post);
          }

          return posts;
      }

      /// <summary>
      /// One scraped post. Property names are referenced by the Eval
      /// expressions in Default.aspx and must not be renamed.
      /// </summary>
      public class Cnblogs
      {
          /// <summary>Post title text.</summary>
          public string title { get; set; }

          /// <summary>Link to the post.</summary>
          public string url { get; set; }

          /// <summary>Author display name.</summary>
          public string author { get; set; }

          /// <summary>Link taken from the author anchor.</summary>
          public string authorUrl { get; set; }

          /// <summary>Publish time text as shown on the listing page.</summary>
          public string updatetime { get; set; }
      }
  }