当前位置: 移动技术网 > IT编程>开发语言>Asp > 实例讲解ASP实现抓取网上房产信息

实例讲解ASP实现抓取网上房产信息

2017年12月12日  | 移动技术网IT编程  | 我要评论
<%@language="vbscript" codepage="936"%>
<!-- #include file="conn.asp" --> 
<!-- #include file="inc/function.asp" -->
<!doctype html public "-//w3c//dtd html 4.01 transitional//en" "http://www.w3.org/tr/html4/loose.dtd">
<html>
<head>
<title>untitled document</title>
<meta http-equiv="content-type" content="text/html; charset=gb2312">
<meta http-equiv="refresh" content="300;url=steal_house.asp">
</head>

<body>
<%
on error resume next

server.scripttimeout = 999999
'========================================================
'字符编码函数
'====================================================
function bytestobstr(body,code) 
dim objstream 
set objstream = server.createobject("adodb.stream") 
objstream.type = 1 
objstream.mode =3 
objstream.open 
objstream.write body 
objstream.position = 0 
objstream.type = 2 
objstream.charset =code
bytestobstr = objstream.readtext 
objstream.close 
set objstream = nothing 
end function 

'取行字符串在另一字符串中的出现位置
function newstring(wstr,strng) 
newstring=instr(lcase(wstr),lcase(strng)) 
if newstring<=0 then newstring=len(wstr) 
end function 
'替换字符串函数
function replacestr(ori,str1,str2)
replacestr=replace(ori,str1,str2)
end function
'====================================================
function readxml(url,code,start,ends)
set osend=createobject("microsoft.xmlhttp")
sourcecode = osend.open ("get",url,false) 
osend.send()
readxml=bytestobstr(osend.responsebody,code )
start=instr(readxml,start)
readxml=mid(readxml,start)
ends=instr(readxml,ends)
readxml=left(readxml,ends-1)
end function

function substr(body,start,ends)
start=instr(body,start)
substr=mid(body,start+len(start)+1)
ends=instr(substr,ends)
substr=left(substr,ends-1)
end function

dim getcont,newscontent
dim url,title
url="http://www.***.com"'新闻网址knowsky.com
getcont=readxml(url,"gb2312","<table class=k2 border=""0""","</table>")
getcont=regexhtml(getcont)
dim keyid,newsclass,city,position,housetype,level,area,price,demostra

dim contactman,contact
for i=2 to ubound(getcont)
response.write(getcont(i)&"__<br>")

templink=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),""" onclick")-10)
templink=replace(templink,"../","")

response.write(i&":"&templink&"<br>")
newscontent=readxml(templink,"gb2312","<td valign=""bottom"" width=""400"">","<hr width=""760"" noshade size=""1"" color=""#808080""> ")
newscontent=removehtml(newscontent)
newscontent=replace(newscontent,vbcrlf,"")
newscontent=replace(newscontent,vbnewline,"")
newscontent=replace(newscontent," ","")
newscontent=replace(newscontent," ","")
newscontent=replace(newscontent," ","") 
newscontent=replace(newscontent,"\n","") 
newscontent=replace(newscontent,chr(10),"")
newscontent=replace(newscontent,chr(13),"")
'===============get content=======================
response.write(newscontent)
keyid=substr(newscontent,"列号:","信息类别:")
newsclass=substr(newscontent,"类别:","所在城市:")
city=substr(newscontent,"城市:","房屋具体位置:")
position=substr(newscontent,"位置:","房屋类型:")
housetype=substr(newscontent,"类型:","楼层:")
level=substr(newscontent,"楼层:","使用面积:")
area=substr(newscontent,"面积:","房价:")
price=substr(newscontent,"房价:","其他说明:")
demostra=substr(newscontent,"说明:","联系人:")
contactman=substr(newscontent,"联系人:","联系方式:")
contact=substr(newscontent,"联系方式:","信息来源:") 
response.write("总序列号:"&keyid&"<br>")
response.write("信息类别:"&newsclass&"<br>")
response.write("所在城市:"&city&"<br>")
response.write("房屋具体位置:"&position&"<br>")
response.write("房屋类型:"&housetype&"<br>")
response.write("楼层:"&level&"<br>")
response.write("使用面积:"&area&"<br>")
response.write("房价:"&price&"<br>")
response.write("其他说明:"&demostra&"<br>")
response.write("联系人:"&contactman&"<br>")
response.write("联系方式:"&contact&"<br>")
'title=removehtml(aa(i))
'response.write("title:"&title)
for n=0 to application.contents.count
if(application.contents(n)=keyid) then
ifexit=true 
end if 
next 
if not ifexit then
application(time&i)=keyid
'添加到数据库
'====================================================
set rs=server.createobject("adodb.recordset") 
rs.open "select top 1 * from news order by id desc",conn,3,3
rs.addnew
rs("newsclass")=newsclass
rs("city")=city
rs("position")=position
rs("housetype")=housetype
rs("level")=level
rs("area")=area
rs("price")=price
rs("demostra")=demostra
rs("contactman")=contactman
rs("contact")=contact
rs.update
rs.close
set rs=nothing
end if
'==================================================

next
function removetag(body)

set regex = new regexp
regex.pattern = "<[a].*?<\/[a]>"
regex.ignorecase = true
regex.global = true
set matches = regex.execute(body) 
dim i,arr(15),ifexit
i=0
j=0
for each match in matches
tempstr = match.value 
tempstr=replace(tempstr,"<td>","")
tempstr=replace(tempstr,"</td>","")
tempstr=replace(tempstr,"<tr>","")
tempstr=replace(tempstr,"</tr>","") 
arr(i)=tempstr 
i=i+1
if(i>=15) then
exit for
end if
next
set regex=nothing
set matches =nothing
removetag=arr

end function
function regexhtml(body)
dim r_arr(47),r_temp
set regex2 = new regexp
regex2.pattern ="<a.*?<\/a>"
regex2.ignorecase = true
regex2.global = true
set matches2 = regex2.execute(body) 
iii=0 
for each match in matches2

r_arr(iii)=match.value

iii=iii+1 
next
regexhtml=r_arr
set regex2=nothing
set matches2=nothing
end function
'======================================================

conn.close
set conn=nothing
%>
</body>
</html>




  function.asp

<%
'**************************************************
'函数名:gottopic
'作 用:截字符串,汉字一个算两个字符,英文算一个字符
'参 数:str ----原字符串
' strlen ----截取长度
'返回值:截取后的字符串
'**************************************************
function gottopic(str,strlen)
if str="" then
gottopic=""
exit function
end if
dim l,t,c, i
str=replace(replace(replace(replace(str," "," "),""",chr(34)),">",">"),"<","<")
str=replace(str,"?","")
l=len(str)
t=0
for i=1 to l
c=abs(asc(mid(str,i,1)))
if c>255 then
t=t+2
else
t=t+1
end if
if t>=strlen then
gottopic=left(str,i) & "…"
exit for
else
gottopic=str
end if
next
gottopic=replace(replace(replace(replace(gottopic," "," "),chr(34),"""),">",">"),"<","<")
end function
'=========================================================
'函数:removehtml(strhtml)
'功能:去除html标记
'参数:strhtml --要去除html标记的字符串
'=========================================================
function removehtml(strhtml) 
dim objregexp, match, matches 
set objregexp = new regexp 

objregexp.ignorecase = true 
objregexp.global = true 
'取闭合的<> 
objregexp.pattern = "<.+?>" 
'进行匹配 
set matches = objregexp.execute(strhtml) 

' 遍历匹配集合,并替换掉匹配的项目 
for each match in matches 
strhtml=replace(strhtml,match.value,"") 
next 
removehtml=strhtml 
set objregexp = nothing 
set matches=nothing
end function 

%>



  conn.asp

<%
'on error resume next
set conn=server.createobject("adodb.connection") 
con= "driver={microsoft access driver (*.mdb)};dbq=" & server.mappath("stest.mdb") 
conn.open con

sub connclose 
conn.close
set conn=nothing 
end sub
%>

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网