Scrapy Crawler in Practice: Scraping Crawler Job Postings from 51job

July 20, 2020

For the follow-up data analysis of these scraped crawler positions, see the companion article: 51job crawler job data analysis in practice

Approach:

  1. First, crawl all of the detail-page links and store them in a MySQL database (a one-off database setup sketch follows this list)
  2. Then create a second Scrapy spider that crawls those stored links
  3. Modules used: scrapy, urllib, pymysql
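
Both spiders and both pipelines connect to a local MySQL database named 51job (user root, password 123456, as hard-coded throughout the code below). The pipelines create their own tables, but the database itself has to exist first. A minimal one-off setup sketch, assuming those same credentials:

import pymysql

# Create the `51job` database that the spiders and pipelines expect.
# Credentials match the ones used in this article; adjust them to your environment.
conn = pymysql.connect(host="localhost", user="root", password="123456")
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS 51job CHARACTER SET utf8mb4")
conn.close()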

Crawling the detail-page links

  • This uses Scrapy's generic CrawlSpider template
  • Creation command: scrapy genspider -t crawl [name] [domains]
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from urllib.parse import quote


class Job51LinksSpider(CrawlSpider):
    name = 'job51_links'
    allowed_domains = ['51job.com']
    # URL-encode the search keyword; 51job expects it double-encoded (see the note after this spider)
    position = quote(quote("爬虫"))
    start_urls = [f"https://search.51job.com/list/000000,000000,0000,00,9,99,{position},2,1.html?"]

    rules = (
        # XPath that picks up the pagination links, so every results page gets crawled
        Rule(LinkExtractor(restrict_xpaths="//div[@class='p_in']//li/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Sometimes the class attribute is "t1 " with a trailing space, so check both:
        # if "t1" yields nothing, fall back to "t1 " (note the trailing space)
        t1 = response.xpath("//p[@class='t1']//span//a/@href").extract()
        # Extract the detail-page URLs
        links = t1 if t1 else response.xpath("//p[@class='t1 ']//span//a/@href").extract()
        # Shape the data before storing it in MySQL:
        # the first 0 is the id (the table uses auto_increment, so 0 is fine),
        # the second field is the detail-page URL,
        # the third 0 is the crawl status: 0 = not yet crawled, changed to 1 once the link has been crawled
        links = [(0, url, 0) for url in links]
        # Hand the links over to the pipeline
        yield {"links": links}

Crawling the job details

  • This one uses a plain Scrapy Spider
  • Creation command: scrapy genspider [name] [domains]
# -*- coding: utf-8 -*-
import scrapy, pymysql


class Job51ContentSpider(scrapy.Spider):
    name = 'job51_content'
    allowed_domains = ['51job.com']
    # Create the MySQL connection (pymysql >= 1.0 requires keyword arguments)
    mysql_cli = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
    # Create a cursor
    cursor = mysql_cli.cursor()

    def start_requests(self):
        # Fetch every URL whose status is 0 (not yet crawled); the pipeline sets status to 1 once a link has been crawled
        self.cursor.execute("select id,url from links where status = 0")
        urls = self.cursor.fetchall()
        # i is the row id of the url; keep it so the status can be updated later
        for i, url in urls:
            # Pass the row id along via meta
            yield scrapy.Request(url, callback=self.parse, meta={"id": i})

    def parse(self, response):
        # Parse the job detail page.
        # For demonstration purposes only the title, salary, company, city and requirements (demand) are extracted
        title = response.xpath("//div[@class='cn']//h1/text()").extract_first()
        salary = response.xpath("//div[@class='cn']//strong/text()").extract_first()
        company = response.xpath("//p[@class='cname']//a[@class='catn']/text()").extract_first()
        # default="" avoids an AttributeError when the city field is missing
        city = response.xpath("//p[@class='msg ltype']/text()").extract_first(default="").strip()
        demand = response.xpath("string(//div[@class='bmsg job_msg inbox'])").extract_first()
        data = {
            "title": title,
            "salary": salary,
            "company": company,
            "city": city,
            "demand": demand,
            "url": response.url,
            "id":response.meta["id"]
        }
        # 传输给pipeline处理
        yield data

Item pipelines

These process the data yielded by the spiders.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

# This pipeline stores the scraped detail-page links
class Job51_link_Pipeline(object):
    def open_spider(self, spider):
        # Called when the spider is opened
        self.mysql_cli = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
        self.cursor = self.mysql_cli.cursor()
        # Check whether the links table already exists
        exists = self.cursor.execute("show tables like 'links'")
        if not exists:
            # Create the table if it does not exist yet
            try:
                sql = '''
                    create table links(
                        id int primary key auto_increment,
                        url varchar(150),
                        status int
                    )
                '''
                self.cursor.execute(sql)
                self.mysql_cli.commit()
            except Exception:
                self.mysql_cli.rollback()
                print("create table links failed")

    def process_item(self, item, spider):
        # Store the links in MySQL
        try:
            sql = "insert into links values (%s,%s,%s)"
            # executemany performs a bulk insert; item["links"] looks like [(0, url1, 0), (0, url2, 0), ...]
            self.cursor.executemany(sql, item["links"])
            self.mysql_cli.commit()
        except Exception:
            self.mysql_cli.rollback()
            print("insert links failed")
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mysql_cli.close()

# This pipeline stores the scraped job details; it works much like the pipeline above
class Job51_content_Pipeline(object):
    def open_spider(self, spider):
        self.mysql_cli = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
        self.cursor = self.mysql_cli.cursor()
        exists = self.cursor.execute("show tables like 'information'")
        if not exists:
            try:
                sql = '''
                    create table information(
                        id int primary key auto_increment,
                        title varchar(100),
                        salary varchar(10),
                        company varchar(100),
                        city varchar(50),
                        demand text(512),
                        url varchar(255)
                    )
                '''
                self.cursor.execute(sql)
                self.mysql_cli.commit()
            except Exception:
                self.mysql_cli.rollback()
                print("create table information failed")

    def process_item(self, item, spider):
        # Store the job details
        try:
            info = "%s," * 6 + "%s"
            sql = f"insert into information values ({info})"
            # Insert the rows one at a time
            self.cursor.execute(sql, (
                0,
                item["title"],
                item["salary"],
                item["company"],
                item["city"],
                item["demand"],
                item["url"]
            ))
            # Mark the link as crawled (parameterised to avoid SQL injection)
            self.cursor.execute("update links set status=1 where id = %s", (item["id"],))
            self.mysql_cli.commit()
        except Exception as err:
            self.mysql_cli.rollback()
            print("insert information failed:", err)
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.mysql_cli.close()
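
After both spiders have finished (links first, then job details), a quick query against the database confirms the pipelines did their job. A minimal sanity-check sketch, assuming the same local MySQL credentials as above:

import pymysql

# Count how many links are marked as crawled and how many detail rows were stored
conn = pymysql.connect(host="localhost", user="root", password="123456", database="51job")
with conn.cursor() as cursor:
    cursor.execute("select count(*) from links where status = 1")
    crawled = cursor.fetchone()[0]
    cursor.execute("select count(*) from information")
    stored = cursor.fetchone()[0]
print(f"links crawled: {crawled}, rows in information: {stored}")
conn.close()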

Scrapy settings

 - Crawler-related settings; if anything is unclear, simply replace the contents of your settings.py with the following
BOT_NAME = 'job_51'

SPIDER_MODULES = ['job_51.spiders']
NEWSPIDER_MODULE = 'job_51.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "DEBUG"

ITEM_PIPELINES = {
    # Note: when crawling the detail-page links, enable this line and comment out the next one
    # 'job_51.pipelines.Job51_link_Pipeline': 300,
    # The job-details pipeline is enabled by default
    'job_51.pipelines.Job51_content_Pipeline': 300,
}
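
Rather than commenting pipeline entries in and out of settings.py between the two runs, each spider can also pick its own pipeline. This is not part of the original setup, just a sketch using Scrapy's per-spider custom_settings:

from scrapy.spiders import CrawlSpider

class Job51LinksSpider(CrawlSpider):
    name = 'job51_links'
    # Overrides ITEM_PIPELINES from settings.py for this spider only
    custom_settings = {
        "ITEM_PIPELINES": {"job_51.pipelines.Job51_link_Pipeline": 300},
    }
    # ... allowed_domains, rules and parse_item stay exactly as defined earlier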

Original article: https://blog.csdn.net/weixin_44345359/article/details/107430338
