当前位置：移动技术网 > IT编程>脚本编程>Python > 汽车用户消费投诉数据爬取分析（Python爬虫）

汽车用户消费投诉数据爬取分析（Python爬虫）

2020年07月18日 | 移动技术网IT编程 | 我要评论

多线程爬虫代码

"""
name:汽车用户消费投诉_品牌url爬取，已完成
author:zhangxiaoyu
"""
import _thread
import random
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from sqlalchemy import create_engine


def new_headers():
    """
    Build a request-header dict with a randomised User-Agent.

    Only the trailing ``Safari/<a>.<b>`` version token varies between calls:
    ``a`` is drawn from 1..999 and ``b`` from 1..99.

    :return: dict with a single ``'User-Agent'`` key
    """
    major = random.randint(1, 999)
    minor = random.randint(1, 99)
    user_agent = f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/{major}.{minor}'
    return {'User-Agent': user_agent}


def get_url_for_all_brand():
    """
    Scrape the complaint-list URL of every brand and append them to MySQL.

    Opens the brand index page in Chrome (Selenium) with a randomised
    User-Agent, extracts (url, name) pairs from the page source with a regex,
    prints them, and appends them to the table ``品牌url汇总``.

    NOTE(review): rows are appended on every run, so running this twice
    presumably duplicates the brand list in the table -- confirm whether
    ``if_exists='replace'`` is wanted.

    :return: None
    """
    headers = new_headers()

    # Configure Chrome to send the randomised User-Agent.
    options = webdriver.ChromeOptions()
    options.add_argument('user-agent=' + headers['User-Agent'])
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url='http://tousu.315che.com/tousulist/serial/93/')
        source = driver.page_source
    finally:
        # quit() ends the whole WebDriver session even when the page load
        # raises, so no browser is left running.
        driver.quit()

    # One pass with two groups keeps every name paired with its own href.
    pairs = re.findall(
        '<a href="(http://tousu.315che.com/tousulist/serial/.{1,7})/">(.{1,40})</a>',
        source,
    )
    car_href = [href for href, _ in pairs]
    car_name = [name for _, name in pairs]

    for name in car_name:
        print(name)
    for href in car_href:
        print(href)

    data = pd.DataFrame({
        'car_name': car_name,
        'car_href': car_href
    })

    # Connection URL format:
    # mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    try:
        data.to_sql('品牌url汇总', con=con, if_exists='append')
    finally:
        # Release the pooled connection held by this one-off engine.
        con.dispose()

    print("成功")


def download_url_for_all_brand():
    """
    Load the stored brand list from the database.

    Reads the whole ``品牌url汇总`` table through a SQLAlchemy engine and keeps
    only the two columns the crawler needs.

    :return: DataFrame with the columns ``car_name`` and ``car_href``
    """
    # Build the SQLAlchemy engine for the local MySQL instance.
    engine = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')

    # Fetch every stored row, then narrow it to name + link.
    all_rows = pd.read_sql(sql="SELECT * FROM 品牌url汇总", con=engine)
    return all_rows[['car_name', 'car_href']]


def get_brand_detail_url(brand, brand_url):
    """
    Crawl every complaint listed for one brand and store each of them.

    Steps:
      1. Open ``brand_url`` in Chrome to read the total page count and the
         final (possibly redirected) address, then shut the browser down.
      2. Fetch each list page with ``requests`` and collect the complaint
         detail links found inside the ``tousu-filter-list`` element.
      3. De-duplicate the links and call ``get_feedback`` for each one.

    Failures on a single list page or a single complaint are reported and
    skipped so that the remaining pages and complaints are still processed.

    :param brand: brand name, passed through to ``get_feedback``
    :param brand_url: URL of the brand's complaint list
    :return: None
    """
    headers = new_headers()

    # Configure Chrome to send the randomised User-Agent.
    options = webdriver.ChromeOptions()
    options.add_argument('user-agent=' + headers['User-Agent'])
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url=brand_url)
        source = driver.page_source
        # Page URLs are built from the address the browser ended up on, so
        # capture it before the session is closed.
        base_url = driver.current_url
    finally:
        # Without this, every brand would leave a Chrome instance running.
        driver.quit()

    # Capture digits only so that int() below cannot be fed stray markup.
    page_num = re.findall(r'<span class="pag-tip">共\s*(\d+)\s*页</span>', source)
    print("页数:{}".format(page_num))

    if not page_num:
        return

    all_detail_url_list = []
    for page in range(1, int(page_num[0]) + 1):
        brand_url_page = base_url + "/0/0/0/" + str(page) + ".htm"
        print(brand_url_page)

        try:
            # A timeout keeps one stalled connection from hanging the worker.
            response = requests.get(url=brand_url_page, headers=headers, timeout=30)
            response.encoding = 'utf-8'

            # Extract the detail link of every complaint on this page.
            soup = BeautifulSoup(response.text, 'lxml')
            listing = soup.find_all(class_="tousu-filter-list")
            detail_url_list = re.findall('<a href="(.+)" target="_blank">', str(listing))
            print(detail_url_list)
            all_detail_url_list.extend(detail_url_list)
            time.sleep(1)
        except Exception as exc:
            # Best effort: a broken page must not stop the other pages, but
            # the failure is reported instead of being silently dropped.
            print(brand_url_page + "出错！！！", exc)

    # De-duplicate while keeping the order in which links were discovered.
    all_detail_url_list = list(dict.fromkeys(all_detail_url_list))

    # Visit every complaint detail page.
    print(all_detail_url_list)
    for feedback_url in all_detail_url_list:
        try:
            get_feedback(brand, feedback_url)
            print(feedback_url + "成功")
        except Exception as exc:
            print(feedback_url + "出错！！！", exc)


def _first_or_none(matches):
    """Return the first element of *matches*, or None when it is empty."""
    return matches[0] if matches else None


def get_feedback(brand, feedback_url):
    """
    Download one complaint page and append it as a single row to MySQL.

    The complaint number, model, issue, time and dealer are taken from the raw
    HTML with regexes; the description and status are taken from the parsed
    document. A field that is missing from the page is stored as NULL rather
    than making the whole record fail. Pages without a complaint number are
    skipped.

    :param brand: car brand the complaint belongs to
    :param feedback_url: URL of the complaint detail page
    :return: None
    :raises requests.RequestException: when the page cannot be fetched
    """
    # Reuse the shared helper instead of duplicating the User-Agent logic.
    headers = new_headers()

    # A timeout keeps one stalled connection from hanging the worker.
    response = requests.get(url=feedback_url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    source = response.text

    # Complaint number
    feedback_no = re.findall('<p class="">单号：(.+)</p>', source)
    print(feedback_no)

    # Brand / model
    brand_model = re.findall('<p class="highlight">品牌车型：(.+)</p>', source)

    # Issue raised by the customer
    feedback_question = re.findall('<p class="">诉求问题：(.+)</p>', source)
    print(feedback_question)

    # Complaint timestamp
    feedback_time = re.findall('<p class="">投诉时间：(....-.{0,3}-.{0,3} ..:..:..)</p>', source)

    # Dealer
    shop = re.findall('<p class="">经销商：(.+)</p>', source)

    if not feedback_no:
        # Nothing identifiable on this page: do not write an all-NULL row.
        print(feedback_url + " no complaint number found, skipped")
        time.sleep(0.5)
        return

    # Complaint description; the first and last characters are trimmed.
    soup = BeautifulSoup(source, 'lxml')
    describe = soup.find_all(class_="describe")
    mark = describe[0].get_text()[1:-1] if describe else None

    # Complaint status.
    # NOTE(review): this selector only matches the "unsolved" tag, so other
    # states presumably end up as NULL -- confirm against the page markup.
    status_tags = soup.find_all(class_="article-tag unsolved")
    status = re.findall('<span class=".+">(.+)</span>', str(status_tags))

    # Exactly one row per complaint: scalars (or None) instead of lists of
    # possibly different lengths, which the DataFrame constructor rejects.
    data = pd.DataFrame([{
        'feedback_no': _first_or_none(feedback_no),
        'brand': brand,
        'brand_model': _first_or_none(brand_model),
        'feedback_question': _first_or_none(feedback_question),
        'mark': mark,
        'feedback_time': _first_or_none(feedback_time),
        'shop': _first_or_none(shop),
        'status': _first_or_none(status),
        'feedback_url': feedback_url,
    }])

    # Connection URL format:
    # mysql+pymysql://user:password@host:port/database?charset
    con = create_engine('mysql+pymysql://root:123456789@127.0.0.1:3306/python爬虫?charset=utf8')
    try:
        data.to_sql('汽车用户消费投诉多线程', con=con, if_exists='append')
    finally:
        # A new engine is created per call; dispose it so its pooled
        # connection does not stay open.
        con.dispose()
    time.sleep(0.5)


# 多线程
def print_time(start, end):
    """
    Worker entry point: crawl the brands in rows ``start`` .. ``end - 1``.

    Loads the stored brand list and runs ``get_brand_detail_url`` for every
    row in the half-open range. A range that runs past the end of the table is
    clamped, and a failure on one brand is reported without aborting the rest
    of the slice.

    :param start: index of the first brand row to process (inclusive)
    :param end: index one past the last brand row to process (exclusive)
    :return: None
    """
    url_brand = download_url_for_all_brand()

    # .iloc slicing clamps to the available rows, unlike .loc[i], which raises
    # KeyError for a missing label and would end the thread.
    rows = url_brand.iloc[start:end][['car_name', 'car_href']]
    for brand, brand_url in rows.itertuples(index=False, name=None):
        print(brand, brand_url)
        try:
            get_brand_detail_url(brand, brand_url)
        except Exception as exc:
            # Keep going: one broken brand must not cost the whole slice.
            print(str(brand) + "出错！！！", exc)
    print("一个线程结束")


if __name__ == '__main__':
    import threading

    # Refresh the brand list in the database before the workers read it.
    get_url_for_all_brand()

    # 557 brand rows split into slices of 25 -> 23 workers, the last one
    # covering rows 550..556 (same ranges as the former hand-written list).
    brand_total = 557
    chunk_size = 25

    workers = [
        threading.Thread(
            target=print_time,
            args=(start, min(start + chunk_size, brand_total)),
        )
        for start in range(0, brand_total, chunk_size)
    ]

    for worker in workers:
        worker.start()

    # Block until every worker is done. This replaces the endless busy loop
    # that kept the process alive, so the program now exits when crawling
    # has finished and does not burn a CPU core while waiting.
    for worker in workers:
        worker.join()

简单的数据清洗

# 数据清洗

import pandas as pd
import matplotlib.pyplot as plt

# Load the raw scraped complaints.
raw_complaints = pd.read_excel(r'D:\Desktop\汽车用户消费投诉.xlsx')

# Remove rows that are exact duplicates using the DataFrame built-in
# (equivalent to masking with ``~raw_complaints.duplicated()``).
data = raw_complaints.drop_duplicates()

# Save the cleaned data set that the analysis cells read.
data.to_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

投诉最多的二十大车型

# 投诉最多的二十大车型

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaints.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Count complaints per car model. The frame is indexed by ``brand_model`` so
# that the bar labels and the counts refer to the same thing; mixing a
# ``brand`` index with per-model counts would be aligned by label and mostly
# yield NaN.
result = data.groupby(by='brand_model')[['brand_model']].count()
result.columns = ['数量']

# Keep the twenty models with the most complaints.
result = result.sort_values(by='数量', ascending=False).iloc[:20]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw one bar per model.
plt.bar(result.index, result['数量'], width=0.5)

# 3. Show the figure.
plt.show()

投诉最多的十大品牌

# 投诉最多的十大品牌

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaints.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Complaints per brand; the count is kept both under ``brand`` and ``数量``.
brand_counts = data.groupby(by='brand')[['brand']].count()
result = (
    brand_counts
    .assign(**{'数量': brand_counts['brand']})
    .sort_values(by='数量', ascending=False)
    .iloc[:10]
)

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw one bar per brand.
plt.bar(result.index, result['数量'], width=0.5)

# Candidate y-axis tick positions. NOTE: computed but never applied to the
# axes in this cell.
yticks = range(0, 2000, 100)

# 3. Show the figure.
plt.show()

最不靠谱的10大经销商

# 最不靠谱的10大经销商

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaints.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Complaints per dealer, most complained-about first.
shop_counts = data.groupby(by='shop')[['shop']].count()
ranked = shop_counts.assign(**{'数量': shop_counts['shop']})
ranked = ranked.sort_values(by='数量', ascending=False)

# Rows 1..10 of the ranking; the top row is deliberately left out.
# NOTE(review): presumably the first entry is a blank/placeholder dealer
# value -- confirm against the data.
result = ranked.iloc[1:11]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Draw one bar per dealer.
plt.bar(result.index, result['数量'], width=0.5)

# Candidate y-axis tick positions. NOTE: computed but never applied to the
# axes in this cell.
yticks = range(0, 2000, 100)

# 3. Show the figure.
plt.show()

最近一年本网站接到的投诉数据趋势

# 最近一年本网站接到的投诉数据趋势

import pandas as pd
import matplotlib.pyplot as plt

# Load the cleaned complaints.
data = pd.read_excel(r'D:\Desktop\汽车用户消费投诉数据.xlsx')

# Derive a ``YYYYMM`` key from the complaint timestamp.
complaint_months = pd.DatetimeIndex(data['feedback_time'])
data['投诉年月'] = complaint_months.strftime('%Y%m')

# Complaints per month; the count is kept both under ``投诉年月`` and ``数量``.
monthly = data.groupby(by='投诉年月')[['投诉年月']].count()
monthly = monthly.assign(**{'数量': monthly['投诉年月']})

# The eleven months before the last one in the data.
# NOTE(review): the final month is excluded, presumably because it is
# incomplete -- confirm.
result = monthly.iloc[-12:-1]

# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=300)

# 2. Plot the monthly counts as a red line on a twin y-axis.
ax = plt.twinx()
ax.plot(result.index, result['数量'], color='r')

# 3. Show the figure.
plt.show()

本文地址：https://blog.csdn.net/qq_29537269/article/details/107386998

您可能感兴趣的文章:

如对本文有疑问，点击进行留言回复！！

numpy手写kmeans

numpy手写kmeans，并可视化。由于可视化限制，拿二维数据进行示范。但该代码可适用于任意维的数据，同时可根... [阅读全文]
python爬取淘宝热卖商品（附xpath下载）

前言本文采用的主要工具是xpath，如果没有下载的可以点开此链接下载:https://pan.baidu.com... [阅读全文]
Task01——变量、运算符、数据类型及位运算

这里写自定义目录标题Task01——变量、运算符、数据类型及位运算注释运算符算术运算符比较运算符逻辑运算符位运算... [阅读全文]
LeetCode剪绳子问题（动态规划求解）

给你一根长度为 n 的绳子，请把绳子剪成整数长度的 m 段（m、n都是整数，n>1并且m>1），每段... [阅读全文]
python学习笔记之函数

python学习笔记之函数函数在python中函数是以def为开头的，后面跟上（）：括号里面是参数一般来说函数的... [阅读全文]
python实现逻辑回归

1.自定义代码实现import numpy as npimport matplotlib.pyplot as p... [阅读全文]
获取京东商品信息报错error pdos_captcha

获取京东商品信息报错error pdos_captcha2020.7.20python 请求商品信息出现 {“... [阅读全文]
利用python爬取京东商品评论

京东评论的爬取和淘宝的差不多，可以参考上两篇文章文章：利用python分析Ajax爬取淘宝评论最新Python爬... [阅读全文]
测试开发阿里巴巴笔试题2020

测试开发阿里巴巴笔试题2020幸运数7幸运数7输入范围[r,l]，输出该范围内幸运数的个数幸运数解释：数657，... [阅读全文]
贪心-LeetCode135. 分发糖果

1、题目描述https://leetcode-cn.com/problems/candy/老师想给孩子们分发糖果... [阅读全文]

网友评论


验证码：

汽车用户消费投诉数据爬取分析（Python爬虫）

2020年07月18日 | 移动技术网IT编程 | 我要评论

多线程爬虫代码

简单的数据清洗

投诉最多的二十大车型

投诉最多的十大品牌

最不靠谱的10大经销商

最近一年本网站接到的投诉数据趋势

您可能感兴趣的文章:

相关文章:

网友评论