asp采集抓取网上房产信息的代码

复制代码 代码如下:

  <%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%>

  <!-- #include file="conn.asp" -->

  <!-- #include file="inc/function.asp" -->

  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

  "http://www.w3.org/TR/html4/loose.dtd">

  <html>

  <head>

  <title>Untitled Document</title>

  <meta http-equiv="Content-Type" content="text/html; charset=gb2312">

  <meta http-equiv="refresh" content="300;URL=steal_house.asp">

  </head>

  <body>

  <%

  on error resume next

  '

  Server.ScriptTimeout = 999999

  '========================================================

  '字符编码函数

  '====================================================

  Function BytesToBstr(body,code)

  dim objstream

  set objstream = Server.CreateObject("adodb.stream")

  objstream.Type = 1

  objstream.Mode =3

  objstream.Open

  objstream.Write body

  objstream.Position = 0

  objstream.Type = 2

  objstream.Charset =code

  BytesToBstr = objstream.ReadText

  objstream.Close

  set objstream = nothing

  End Function

  '取行字符串在另一字符串中的出现位置

  Function Newstring(wstr,strng)

  Newstring=Instr(lcase(wstr),lcase(strng))

  if Newstring<=0 then Newstring=Len(wstr)

  End Function

  '替换字符串函数

  function ReplaceStr(ori,str1,str2)

  ReplaceStr=replace(ori,str1,str2)

  end function

  '====================================================

  function ReadXml(url,code,start,ends)

  set oSend=createobject("Microsoft.XMLHTTP")

  SourceCode = oSend.open ("GET",url,false)

  oSend.send()

  ReadXml=BytesToBstr(oSend.responseBody,code )

  start=Instr(ReadXml,start)

  ReadXml=mid(ReadXml,start)

  ends=Instr(ReadXml,ends)

  ReadXml=left(ReadXml,ends-1)

  end function

  function SubStr(body,start,ends)

  start=Instr(body,start)

  SubStr=mid(body,start+len(start)+1)

  ends=Instr(SubStr,ends)

  SubStr=left(SubStr,ends-1)

  end function

  dim getcont,NewsContent

  dim url,title

  url="http://www.***.com"'新闻网址

  getcont=ReadXml(url,"gb2312","<table class=k2 border=""0""","</table>")

  getcont=RegexHtml(getcont)

  dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra

  dim ContactMan,Contact

  for i=2 to ubound(getcont)

  response.Write(getcont(i)&"__<br>")

  tempLink=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),"""

  onClick")-10)

  tempLink=replace(tempLink,"../","")

  response.Write(i&":"&tempLink&"<br>")

  NewsContent=ReadXml(tempLink,"gb2312","<td valign=""bottom""

  width=""400"">","<hr width=""760""

  noshade size=""1"" color=""#808080"">

  ")

  NewsContent=RemoveHtml(NewsContent)

  NewsContent=replace(NewsContent,VbCrLf,"")

  NewsContent=replace(NewsContent,vbNewLine,"")

  NewsContent=replace(NewsContent," ","")

  NewsContent=replace(NewsContent," ","")

  NewsContent=replace(NewsContent," ","")

  NewsContent=replace(NewsContent,"\n","")

  NewsContent=replace(NewsContent,chr(10),"")

  NewsContent=replace(NewsContent,chr(13),"")

  '===============get Content=======================

  response.Write(NewsContent)

  KeyId=SubStr(NewsContent,"列号:","信息类别:")

  NewsClass=SubStr(NewsContent,"类别:","所在城市:")

  City=SubStr(NewsContent,"城市:","房屋具体位置:")

  Position=SubStr(NewsContent,"位置:","房屋类型:")

  HouseType=SubStr(NewsContent,"类型:","楼层:")

  Level=SubStr(NewsContent,"楼层:","使用面积:")

  Area=SubStr(NewsContent,"面积:","房价:")

  Price=SubStr(NewsContent,"房价:","其他说明:")

  Demostra=SubStr(NewsContent,"说明:","联系人:")

  ContactMan=SubStr(NewsContent,"联系人:","联系方式:")

  Contact=SubStr(NewsContent,"联系方式:","信息来源:")

  response.Write("总序列号:"&KeyId&"<br>")

  response.Write("信息类别:"&NewsClass&"<br>")

  response.Write("所在城市:"&City&"<br>")

  response.Write("房屋具体位置:"&Position&"<br>")

  response.Write("房屋类型:"&HouseType&"<br>")

  response.Write("楼层:"&Level&"<br>")

  response.Write("使用面积:"&Area&"<br>")

  response.Write("房价:"&Price&"<br>")

  response.Write("其他说明:"&Demostra&"<br>")

  response.Write("联系人:"&ContactMan&"<br>")

  response.Write("联系方式:"&Contact&"<br>")

  'title=RemoveHTML(aa(i))

  'response.Write("title:"&title)

  for n=0 to application.Contents.count

  if(application.Contents(n)=KeyId) then

  ifexit=true

  end if

  next

  if not ifexit then

  application(time&i)=KeyId

  '添加到数据库

  '====================================================

  set rs=server.CreateObject("adodb.recordset")

  rs.open "select top 1 * from news order by id desc",conn,3,3

  rs.addnew

  rs("NewsClass")=NewsClass

  rs("City")=City

  rs("Position")=Position

  rs("HouseType")=HouseType

  rs("Level")=Level

  rs("Area")=Area

  rs("Price")=Price

  rs("Demostra")=Demostra

  rs("ContactMan")=ContactMan

  rs("Contact")=Contact

  rs.update

  rs.close

  set rs=nothing

  end if

  '==================================================

  next

  function RemoveTag(body)

  Set regEx = New RegExp

  regEx.Pattern = "<[a].*?<\/[a]>"

  regEx.IgnoreCase = True

  regEx.Global = True

  Set Matches = regEx.Execute(body)

  dim i,arr(15),ifexit

  i=0

  j=0

  For Each Match in Matches

  TempStr = Match.Value

  TempStr=replace(TempStr,"<td>","")

  TempStr=replace(TempStr,"</td>","")

  TempStr=replace(TempStr,"<tr>","")

  TempStr=replace(TempStr,"</tr>","")

  arr(i)=TempStr

  i=i+1

  if(i>=15) then

  exit for

  end if

  Next

  Set regEx=nothing

  Set Matches =nothing

  RemoveTag=arr

  end function

  function RegexHtml(body)

  dim r_arr(47),r_temp

  Set regEx2 = New RegExp

  regEx2.Pattern ="<a.*?<\/a>"

  regEx2.IgnoreCase = True

  regEx2.Global = True

  Set Matches2 = regEx2.Execute(body)

  iii=0

  For Each Match in Matches2

  r_arr(iii)=Match.Value

  iii=iii+1

  Next

  RegexHtml=r_arr

  set regEx2=nothing

  set Matches2=nothing

  end function

  '======================================================

  conn.close

  set conn=nothing

  %>

  </body>

  </html>