当前位置: 移动技术网 > IT编程>脚本编程>Python > selenium爬取lagou

使用 Selenium 爬取拉勾网(lagou)职位信息

2019年07月08日  | 移动技术网IT编程  | 我要评论

麻师娘加盟,asp简介,王喜老婆

from selenium import webdriver
import time
from lxml import etree
import re


class lagouspider(object):
    """Crawl Python job postings from lagou.com with a Selenium-driven browser.

    The spider loads the job list page, opens every posting linked from it in
    a second browser window, prints the extracted fields as a dict, and then
    clicks through to the next list page until the "next" button is disabled.
    """

    # Separators stripped from the city / work-year / education spans.
    _SEPARATORS = re.compile(r'[/\s]')
    # Noise stripped from the job description: slashes and all whitespace,
    # with the non-breaking space spelled out explicitly.  The previous
    # pattern ``[/\s\\xa]`` removed the literal characters ``\``, ``x`` and
    # ``a`` from the text instead.
    _DESC_NOISE = re.compile(r'[/\s\xa0]')

    def __init__(self, driver=None):
        """Create the spider.

        Args:
            driver: Optional, already-created Selenium WebDriver to reuse.
                When omitted, a new Chrome browser is started.
        """
        # The browser class is ``webdriver.Chrome`` (capital C).
        self.driver = webdriver.Chrome() if driver is None else driver
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=%e5%85%a8%e5%9b%bd#filterbox"

    @staticmethod
    def _first_text(node, path="./text()"):
        """Return the stripped first XPath result of *path* evaluated on *node*.

        Raises:
            IndexError: if the XPath expression matches nothing.
        """
        return node.xpath(path)[0].strip()

    @classmethod
    def _clean_desc(cls, text):
        """Return *text* with slashes and every whitespace character removed."""
        return cls._DESC_NOISE.sub('', text)

    def run(self):
        """Walk the paginated job list, parsing each page, until the last one."""
        self.driver.get(self.url)
        while True:
            self.parse_page_list(self.driver.page_source)
            # ``find_element("xpath", ...)`` is accepted by Selenium 3 and 4;
            # the ``find_element_by_xpath`` shortcut was removed in Selenium 4.3.
            next_btn = self.driver.find_element(
                "xpath", "//div[@class='pager_container']/span[last()]"
            )
            # ``get_attribute`` may return None when the attribute is absent.
            if "pager_next_disabled" in (next_btn.get_attribute("class") or ""):
                break
            next_btn.click()

    def parse_page_list(self, source):
        """Visit every job posting linked from one list page.

        Args:
            source: HTML text of the list page.
        """
        html = etree.HTML(source)
        for detail_url in html.xpath("//div/a[@class='position_link']/@href"):
            self.get_detail_page(detail_url)
            # Pause between detail pages to throttle the request rate.
            time.sleep(1)

    def get_detail_page(self, detail_url):
        """Open *detail_url* in a new window, parse it, then return to the list.

        Args:
            detail_url: Absolute URL of a job posting.
        """
        list_window = self.driver.current_window_handle
        # Pass the URL as a script argument instead of interpolating it into
        # the JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0])", detail_url)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            self.parse_datail_page(self.driver.page_source)
        finally:
            # Always close the detail window and switch back to the job list,
            # even when parsing fails, so the driver is never left on the
            # wrong window.
            self.driver.close()
            self.driver.switch_to.window(list_window)

    def parse_datail_page(self, source):
        """Extract the fields of one job posting and print them as a dict.

        Args:
            source: HTML text of the job detail page.

        Raises:
            IndexError: if an expected element is missing from the page.
        """
        html = etree.HTML(source)
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        desc_parts = html.xpath("//dl[@id='job_detail']/dd[@class='job_bt']//text()")
        position = {
            "name": self._first_text(html, "//div[@class='job-name']/h2/text()"),
            "job_salary": self._first_text(job_request_spans[0]),
            "city": self._SEPARATORS.sub('', self._first_text(job_request_spans[1])),
            "work_year": self._SEPARATORS.sub('', self._first_text(job_request_spans[2])),
            "education": self._SEPARATORS.sub('', self._first_text(job_request_spans[3])),
            "company_name": self._first_text(html, "//h3[@class='fl']//text()"),
            "desc": self._clean_desc("".join(desc_parts)),
        }
        print(position)


if __name__ == "__main__":
    # Start the crawl only when the file is executed as a script, so that
    # importing the module does not launch a browser.
    lagou = lagouspider()
    lagou.run()

 

如对本文有疑问,请在下方留言讨论,广大热心网友会与你互动!点击进行留言回复

相关文章:

验证码:
移动技术网