当前位置: 移动技术网 > IT编程>开发语言>Asp > 写了段批量抓取某个列表页的东东

写了段批量抓取某个列表页的东东

2017年12月12日  | 移动技术网IT编程  | 我要评论


有些人当抓取程序是个宝,到目前还tnd有人在卖钱,强烈bs一下这些家伙 真是的!可能偶下边这段东西比较烂哈
下边这个没有写入库功能,已经到这一步了,入库功能是很简单的事了,需要的请自己去完成吧,其它功能各位自行完善吧!把代码拷贝过去直接运行即可看到效果

dim url,list_pagecode,array_articleid,i,articleid
dim content_pagecode,content_tempcode
dim content_categoryid,content_categoryname,borderid,classid,bordername,classname
dim articletitle,articleauthor,articlefrom,articlecontent

url = "http://www.webasp.net/article/class/1.htm"
list_pagecode = gethttppage(url)
list_pagecode = regexptext(list_pagecode,"打印</th></tr>","</table><table border=0 cellpadding=5",0)
list_pagecode = regexptext(list_pagecode,"<td align=left><a href='../","'><img border=0 src='../images/authortype0.gif'",1)    '取得当前列表页的文章链接,以,分隔
array_articleid = split(list_pagecode,",")    '创建数组,存储文章id

for i=0 to ubound(array_articleid)-1
    articleid = array_articleid(i)    '文章id
    content_pagecode = gethttppage("http://www.webasp.net/article/"&articleid)    '取得文章页的内容

    '=========取文章分类及相关id参数 开始=======================
    content_tempcode = regexptext(content_pagecode,"<a href=""/article/"">技术教程</a> >> ",">> 内容</td>",0)
    content_categoryid = regexptext(content_pagecode,"<a href='../class","/'>",1)
    borderid = split(content_categoryid,",")(0)    '大类id
    classid = split(content_categoryid,",")(1)    '子类id
        '==========检查大类是否存在 开始===============
        '如果不存在则入库

        '==========检查大类是否存在 结束===============
    'response.write(borderid & "," & classid & "<br />")
    content_categoryname = regexptext(content_pagecode,"/'>","</a>",1)
    bordername = split(content_categoryname,",")(0)    '大类名称
    classname = split(content_categoryname,",")(1)    '子类名称
        '==========检查子类是否存在 开始===============
        '如果不存在则入库

        '==========检查子类是否存在 结束===============
    '=========取文章分类及相关id参数 结束=======================

    '=========取文章标题及内容 开始=============================
    articletitle = regexptext(content_pagecode,"<tr><td align=center bgcolor=#dee2f5><strong>","</strong></td></tr>",0)
    articleauthor = regexptext(content_pagecode,"<tr><td><span class=blue>作者:</span>","</td></tr>",0)
    articlefrom = regexptext(content_pagecode,"<tr><td><span class=blue>来源:</span>","</td></tr>",0)
    articlecontent = regexptext(content_pagecode,"<tr><td class=content style=""word-wrap: break-word"" id=zoom>","</td></tr>"&vbcrlf&"        </table>"&vbcrlf&"    </td></tr></table>",0)
    '=========取文章标题及内容 结束=============================
    response.write(articletitle& "<br /><br />")
    response.flush()
next


附几个函数:

function gethttppage(url)
    if(isobjinstalled("microsoft.xmlhttp") = false)then
        response.write "<br><br>服务器不支持microsoft.xmlhttp组件"
        err.clear
        response.end
    end if
    on error resume next
    dim http
    set http=server.createobject("msxml2.xmlhttp")
    http.open "get",url,false
    http.send()
    if(http.readystate<>4)then
        exit function
    end if
    gethttppage=bytestobstr(http.responsebody,"gb2312")
    set http=nothing
    if(err.number<>0)then
        response.write "<br><br>获取文件内容出错"
        'response.end
        err.clear
    end if 
end function

       
function bytestobstr(codebody,codeset)
    dim objstream
    set objstream = server.createobject("adodb.stream")
    objstream.type = 1
    objstream.mode =3
    objstream.open
    objstream.write codebody
    objstream.position = 0
    objstream.type = 2
    objstream.charset = codeset
    bytestobstr = objstream.readtext
    objstream.close
    set objstream = nothing
end function

'================================================
'作  用:检查组件是否已经安装
'返回值:true  ----已经安装
'        false ----没有安装
'================================================
function isobjinstalled(objname)
    on error resume next
    isobjinstalled = false
    err = 0
    dim testobj
    set testobj = server.createobject(objname)
    if(0 = err)then isobjinstalled = true
    set testobj = nothing
    err = 0
end function

function regexptext(strng,strstart,strend,n)
    dim regex,match,matches,retstr
    set regex = new regexp
    regex.pattern = strstart&"([\s\s]*?)"&strend
    regex.ignorecase = true
    regex.global = true
    set matches = regex.execute(strng)
    for each match in matches
        if(n=1)then
            retstr = retstr & regex.replace(match.value,"$1") & ","
        else
            retstr = retstr & regex.replace(match.value,"$1")
        end if
    next
    regexptext = retstr
    set regex=nothing
end function

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网