使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

　　Web 前端代码

　　<%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>
　　<%-- Renders one table row per data item bound to Repeater1; each item must expose
　　     the url, title, authorUrl, author and updatetime properties read by Eval below. --%>
　　<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
　　<html xmlns="http://www.w3.org/1999/xhtml">
　　<head runat="server">
　　    <title></title>
　　</head>
　　<body>
　　<form id="form1" runat="server">
　　<div>
　　<asp:Repeater ID="Repeater1" runat="server">
　　    <HeaderTemplate>
　　        <%-- The table is opened here and closed in FooterTemplate so the rows
　　             emitted by ItemTemplate land inside it. --%>
　　        <table>
　　        <tr>
　　            <td>
　　            标题
　　            </td>
　　            <td>
　　            发布作者
　　            </td>
　　            <td>
　　            发布时间
　　            </td>
　　        </tr>
　　    </HeaderTemplate>
　　    <ItemTemplate>
　　        <tr>
　　            <td>
　　            <a href='<%#Eval("url") %>' target="_blank">
　　            <%#Eval("title") %>
　　            </a>
　　            </td>
　　            <td>
　　            <a href='<%#Eval("authorUrl") %>' target="_blank">
　　            <%#Eval("author") %>
　　            </a>
　　            </td>
　　            <td>
　　            <%#Eval("updatetime") %>
　　            </td>
　　        </tr>
　　    </ItemTemplate>
　　    <FooterTemplate>
　　        </table>
　　    </FooterTemplate>
　　</asp:Repeater>
　　</div>
　　</form>
　　</body>
　　</html>

　　cs 后台代码：

代码如下：

　　using System;

　　using System.Collections.Generic;

　　using System.Linq;

　　using System.Web;

　　using System.Web.UI;

　　using System.Web.UI.WebControls;

　　using S1;

　　using System.Net;

　　using System.IO;

　　using System.Text;

　　using HtmlAgilityPack;

　　/// <summary>
　　/// Code-behind for Default.aspx: downloads a cnblogs.com listing page, extracts the
　　/// posts with HtmlAgilityPack XPath queries and binds them to Repeater1.
　　/// </summary>
　　public partial class _Default : System.Web.UI.Page
　　{
　　    private const string BaseAddress = "http://www.cnblogs.com";

　　    /// <summary>
　　    /// On the initial (non-postback) request, fetches the listing page selected by the
　　    /// optional "p" query-string value and binds the parsed posts to the repeater.
　　    /// </summary>
　　    /// <param name="sender">Event source (unused).</param>
　　    /// <param name="e">Event data (unused).</param>
　　    protected void Page_Load(object sender, EventArgs e)
　　    {
　　        if (IsPostBack)
　　        {
　　            return;
　　        }

　　        string address = BaseAddress;
　　        string pageSegment = Request.QueryString["p"];
　　        if (!string.IsNullOrEmpty(pageSegment))
　　        {
　　            // Paging: p=p2, p=p3, ... is appended as a path segment.
　　            address += "/" + pageSegment;
　　        }

　　        this.Repeater1.DataSource = ParsePosts(DownloadHtml(address));
　　        this.Repeater1.DataBind();
　　    }

　　    /// <summary>Downloads <paramref name="address"/> and returns the body decoded as UTF-8.</summary>
　　    /// <param name="address">Absolute URL to fetch.</param>
　　    /// <returns>The response body as a string.</returns>
　　    private static string DownloadHtml(string address)
　　    {
　　        // The client, the response stream and the reader all hold resources that
　　        // must be released even when the download or the read throws.
　　        using (WebClient client = new WebClient())
　　        using (Stream stream = client.OpenRead(address))
　　        using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
　　        {
　　            return reader.ReadToEnd();
　　        }
　　    }

　　    /// <summary>
　　    /// Extracts the posts listed under the element with id "post_list".
　　    /// </summary>
　　    /// <param name="html">HTML of a listing page.</param>
　　    /// <returns>
　　    /// One item per matched title link; empty when the container or the title links
　　    /// are not found. Author fields are filled only for items that have a matching
　　    /// author link at the same position.
　　    /// </returns>
　　    private static IList<Cnblogs> ParsePosts(string html)
　　    {
　　        IList<Cnblogs> posts = new List<Cnblogs>();

　　        HtmlDocument doc = new HtmlDocument();
　　        doc.LoadHtml(html);

　　        // Look the container up by its element id.
　　        HtmlNode postList = doc.GetElementbyId("post_list");
　　        if (postList == null)
　　        {
　　            return posts;
　　        }

　　        // div[2] selects a div that is the second div child of its parent (XPath
　　        // positions are 1-based). The leading "." keeps the search below postList;
　　        // a path that starts with "//" is evaluated from the document root.
　　        HtmlNodeCollection titleLinks = postList.SelectNodes(".//div[2]/h3/a");
　　        if (titleLinks == null)
　　        {
　　            // SelectNodes yields null, not an empty collection, when nothing matches.
　　            return posts;
　　        }

　　        foreach (HtmlNode titleLink in titleLinks)
　　        {
　　            Cnblogs post = new Cnblogs();
　　            post.url = titleLink.GetAttributeValue("href", string.Empty);
　　            post.title = titleLink.InnerText;
　　            posts.Add(post);
　　        }

　　        HtmlNodeCollection authorLinks = postList.SelectNodes(".//div[2]/div/a");
　　        if (authorLinks == null)
　　        {
　　            return posts;
　　        }

　　        // Titles and authors are paired by position; stop at the shorter list so a
　　        // mismatch cannot index past the end of either one.
　　        int pairedCount = Math.Min(posts.Count, authorLinks.Count);
　　        for (int i = 0; i < pairedCount; i++)
　　        {
　　            HtmlNode authorLink = authorLinks[i];
　　            posts[i].author = authorLink.InnerText;
　　            posts[i].authorUrl = authorLink.GetAttributeValue("href", string.Empty);

　　            // The publish time is read from the node that follows the author link,
　　            // with the "发布于" label stripped.
　　            HtmlNode timeNode = authorLink.NextSibling;
　　            if (timeNode != null)
　　            {
　　                posts[i].updatetime = timeNode.InnerText.Replace("发布于", "").Trim();
　　            }
　　        }

　　        return posts;
　　    }
　　}

　　/// <summary>
　　/// One scraped blog post. The lowercase property names are bound by name from the
　　/// page markup via Eval, so they must not be renamed.
　　/// </summary>
　　public class Cnblogs
　　{
　　    /// <summary>Post title text.</summary>
　　    public string title { get; set; }

　　    /// <summary>Link to the post.</summary>
　　    public string url { get; set; }

　　    /// <summary>Author display name.</summary>
　　    public string author { get; set; }

　　    /// <summary>Link to the author's page.</summary>
　　    public string authorUrl { get; set; }

　　    /// <summary>Publish time text, kept as the string taken from the page.</summary>
　　    public string updatetime { get; set; }
　　}

教程首页更多教程