当前位置: 移动技术网 > IT编程>开发语言>.net > 用DOM实现文章采集--通过jquery语法式的方法采集指定对象的文本

用 DOM 实现文章采集——通过 jQuery 风格的选择器语法采集指定对象的文本

2018年12月07日  | 移动技术网IT编程  | 我要评论

[csharp]
/// <summary> 
/// DOM query helper whose selector syntax is modelled on jQuery. 
/// </summary> 
public class domquery 

    /// <summary> 
    /// 获得节点 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    /// <remarks>dom选择器,用法跟jquery差不多</remarks> 
    public ilist<htmlnode> get(htmldocument _htmldocument, string selector) 
    { 
        string[] expressions = selector.split(new char[] { ' ' }, stringsplitoptions.removeemptyentries); 
 
        list<htmlnode> hnlist = new list<htmlnode>(); 
 
        if (expressions[0].startswith("#")) 
        { 
            hnlist.add(_htmldocument.getelementbyid(expressions[0].trimstart('#'))); 
            hnlist.removeall(x => { return x == null; }); 
 
            if (expressions.length == 1) 
            { 
                return hnlist; 
            } 
 
            for (int i = 1; i < expressions.length; i++) 
            { 
                hnlist = get(hnlist, expressions[i]); 
            } 
        } 
        else 
        { 
            hnlist.addrange(_htmldocument.documentnode.childnodes.where(x => { return x.nodetype == htmlnodetype.element; })); 
 
            for (int i = 0; i < expressions.length; i++) 
            { 
                hnlist = get(hnlist, expressions[i]); 
            } 
        } 
 
 
 
 
 
        return hnlist; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回innerhtml 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string singlegetinnerhtml(htmldocument _htmldocument, string selector) 
    { 
        htmlnode hn = singleget(_htmldocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.innerhtml; 
    } 
    /// <summary> 
    /// 查找节点,并直接返回innertext 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public string singlegetinnertext(htmldocument _htmldocument, string selector) 
    { 
        htmlnode hn = singleget(_htmldocument, selector); 
        if (hn == null) 
            return null; 
        else 
            return hn.innertext.trim(); 
    } 
    /// <summary> 
    /// 查找节点 
    /// </summary> 
    /// <param name="_htmldocument"></param> 
    /// <param name="selector"></param> 
    /// <returns></returns> 
    public htmlnode singleget(htmldocument _htmldocument, string selector) 
    { 
        ilist<htmlnode> hnlist = get(_htmldocument, selector); 
 
        if (hnlist.count == 0) 
        { 
            return null; 
        } 
        else 
        { 
            return hnlist[0]; 
        } 
    } 
 
    #region 获得属性 
    /// <summary> 
    /// 获得属性 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="attr"></param> 
    /// <returns></returns> 
    public string[] attr(ilist<htmlnode> _htmlnodes, string attr) 
    { 
        if (_htmlnodes == null) 
        { 
            return new string[0]; 
        } 
        if (_htmlnodes.count() == 0) 
        { 
            return new string[0]; 
        } 
        var v = from x in _htmlnodes where x.attributes[attr] != null select x; 
 
        return (from x in v select x.attributes[attr].value).toarray(); 
    } 
    #endregion 
 
    #region 根据选择器语法查找 
    /// <summary> 
    /// 根据选择器语法查找 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private list<htmlnode> get(list<htmlnode> _htmlnodes, string expression) 
    { 
        string _expre = null; 
        string fun = null; 
        int index = -1; 
        string keyword = null; 
        regex reg = new regex(@"([.|\-|\w]+)", regexoptions.singleline); 
        matchcollection mc = reg.matches(expression); 
        for (int i = 0; i < mc.count; i++) 
        { 
            if (i == 0) 
            { 
                _expre = mc[i].value; 
            } 
            if (i == 1) 
            { 
                fun = mc[i].value; 
            } 
            if (i == 2) 
            { 
                if (int.tryparse(mc[i].value, out index) == false) 
                { 
                    keyword = mc[i].value; 
                } 
            } 
        } 
        list<htmlnode> list = new list<htmlnode>(); 
 
        if (string.isnullorempty(fun) == true) 
        { 
            if (expression.startswith(".")) 
            { 
                return class(_htmlnodes, expression).tolist(); 
            } 
            else 
            { 
                return nodetype(_htmlnodes, expression).tolist(); 
            } 
        } 
        else 
        { 
            foreach (var n in _htmlnodes) 
            { 
                ienumerable<htmlnode> v; 
                if (_expre.startswith(".")) 
                { 
                    v = class(n, _expre); 
                } 
                else 
                { 
                    v = nodetype(n, _expre); 
                } 
 
 
                list.addrange(funaction(v, fun, index, keyword)); 
            } 
            return list; 
        } 
    } 
    #region 函数处理 
    /// <summary> 
    /// 函数处理   
    /// </summary> 
    /// <param name="v"></param> 
    /// <param name="fun"></param> 
    /// <returns></returns> 
    private ienumerable<htmlnode> funaction(ienumerable<htmlnode> v, string fun, int index, string keyword) 
    { 
        switch (fun.tolower()) 
        { 
            case "eq": 
                return v.where((nn, _index) => _index == index); 
            case "lt": 
                return v.where((nn, _index) => _index < index); 
            case "gt": 
                return v.where((nn, _index) => _index > index); 
            case "first": 
                if (v.count() > 0) 
                    return new htmlnode[] { v.first() }; 
                else 
                    return v; 
            case "last": 
                if (v.count() > 0) 
                    return new htmlnode[] { v.last() }; 
                else 
                    return v; 
            case "even": 
                return v.where((nn, _index) => _index % 2 == 0); 
            case "odd": 
                return v.where((nn, _index) => (_index & 1) == 1); 
            case "next": 
                return v.select(nn => nn.nextsibling); 
            case "contains": 
                return v.where(x => { return x.innerhtml.contains(keyword); }); 
            case "empty": 
                return v.where(x => { return x.haschildnodes == false; }); 
            case "header": 
                string[] headers = new string[] { "h1", "h2", "h3", "h4", "h5", "h6" }; 
                return findchildnodes(v.toarray()).where(x => { return headers.contains(x.originalname); }); 
            default: 
                throw new notsupportedexception("函数不支持。"); 
        } 
    } 
    #endregion 
    #endregion 
 
    #region 根据类名找节点 
    private parallelquery<htmlnode> class(htmlnode hn, string expression) 
    { 
        return class(new htmlnode[] { hn }, expression); 
    } 
    /// <summary> 
    /// 根据类名找节点 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private parallelquery<htmlnode> class(ilist<htmlnode> _htmlnodes, string expression) 
    { 
        var v = findchildnodes(_htmlnodes).asparallel().where(x => x.attributes["class"] != null); 
 
        var y = v.where(x => x.attributes["class"].value.split(new char[] { ' ' }, stringsplitoptions.removeemptyentries).contains(expression.trimstart('.'), stringcomparer.currentcultureignorecase)); 
 
        return y; 
    } 
    #endregion 
 
    #region 根据类型找节点 
    /// <summary> 
    /// 根据类型找节点 
    /// </summary> 
    /// <param name="hn"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private parallelquery<htmlnode> nodetype(htmlnode hn, string expression) 
    { 
        return nodetype(new htmlnode[] { hn }, expression); 
    } 
    /// <summary> 
    /// 根据类型找节点 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <param name="expression"></param> 
    /// <returns></returns> 
    private parallelquery<htmlnode> nodetype(ilist<htmlnode> _htmlnodes, string expression) 
    { 
        var v = findchildnodes(_htmlnodes).asparallel().where( 
                 x => x.originalname.equals(expression, stringcomparison.currentcultureignorecase)); 
 
 
        return v; 
    } 
    #endregion 
 
    #region 查找所有下级 
    /// <summary> 
    /// 查找所有下级 
    /// </summary> 
    /// <param name="_htmlnodes"></param> 
    /// <returns></returns> 
    private list<htmlnode> findchildnodes(ilist<htmlnode> _htmlnodes) 
    { 
        if (_htmlnodes == null) 
        { 
            throw new exception(""); 
        } 
        list<htmlnode> list = new list<htmlnode>(); 
        foreach (var v in _htmlnodes) 
        { 
            findchildnodesaction(v, list); 
        } 
 
        return list; 
    } 
    private void findchildnodesaction(htmlnode hn, list<htmlnode> list) 
    { 
        if (list == null) 
        { 
            throw new exception(""); 
        } 
        foreach (var v in hn.childnodes) 
        { 
            if (hn.nodetype == htmlnodetype.element) 
            { 
                list.add(v); 
                findchildnodesaction(v, list); 
            } 
        } 
    } 
 
    #endregion 
 
 

 

摘自 winner2050的专栏

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网