当前位置: 移动技术网 > IT编程>脚本编程>Python > 利用python爬取京东商品评论

利用python爬取京东商品评论

2020年07月22日  | 移动技术网IT编程  | 我要评论

京东评论的爬取和淘宝的差不多,可以参考前两篇文章:
利用python分析Ajax爬取淘宝评论
最新Python爬取淘宝评论(2020年4月)

import time
import re
import requests
import json
import random
import csv



class JdSpider_content:
    """Fetch and persist JD.com product comments for a single result page.

    Each instance is bound to one product id and one page number. ``name`` is
    the base file name used by ``get_word`` and the ``write_*`` helpers.
    """

    # Column order for write_csv; mirrors the keys yielded by get_content.
    _CSV_FIELDS = ["content_time", "type_color", "type_size",
                   "content_name", "content_data"]

    def __init__(self, productId, page, name):
        """Store the request parameters and the HTTP headers to send.

        Args:
            productId: JD product id whose comments are requested.
            page: Page number passed to the comment endpoint.
            name: Base name (without extension) of the output files.
        """
        self.name = name  # base name of the files to save
        self.page = page  # page number
        self.productId = productId  # product id
        self.url = "https://club.jd.com/comment/productPageComments.action?"
        # The User-Agent and Cookie values are placeholders that the user
        # must replace with their own before running the spider.
        self.headers = {"User-Agent": "自己的User-Agent",
                        "referer": "https://item.jd.com/10999284925.html",
                        "Cookie": '自己的cookie'
                        }

    @staticmethod
    def _parse_jsonp(text):
        """Decode a JSONP payload such as ``callback({...});`` into Python data.

        Plain JSON (no callback wrapper) is accepted as well.

        Raises:
            ValueError: If the payload is not valid JSON.
        """
        text = text.strip()
        start = text.find("(")
        end = text.rfind(")")
        # Only unwrap when the text is not already a bare JSON document.
        if not text.startswith(("{", "[")) and 0 <= start < end:
            text = text[start + 1:end]
        return json.loads(text)

    def get_page(self):
        """Request the comment page and return the decoded JSON.

        Returns:
            The parsed response (a dict for a normal reply), or ``None`` when
            the request fails, the status code is not 200, or the body cannot
            be decoded.
        """
        params = {
            "productId": self.productId,
            "page": self.page,
            "callback": "fetchJSON_comment98",
            "score": "0",  # 0 = normal reviews, 1 = negative, 2 = neutral
            "sortType": "5",
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1"
        }
        try:
            # One request only; the timeout keeps a stalled connection from
            # hanging the whole crawl.
            res = requests.get(self.url, params=params,
                               headers=self.headers, timeout=10)
        except requests.RequestException:
            return None
        if res.status_code != 200:
            return None
        try:
            return self._parse_jsonp(res.text)
        except ValueError:
            # Empty or non-JSON body.
            return None

    def get_content(self, json_data):
        """Yield one flat dict per comment found in ``json_data``.

        Args:
            json_data: The value returned by ``get_page`` (may be ``None``).

        Yields:
            Dicts with the keys ``content_time``, ``type_color``,
            ``type_size``, ``content_name`` and ``content_data``.

        This is a generator: when ``json_data`` is ``None`` the error message
        is printed on first iteration and nothing is yielded.
        """
        if json_data is None:
            print("该页出错啦!")
            return
        # A missing or null "comments" field is treated as an empty page.
        for item in json_data.get("comments") or []:
            yield {
                "content_time": item.get("creationTime"),
                "type_color": item.get("productColor"),
                "type_size": item.get("productSize"),
                "content_name": item.get("nickname"),
                "content_data": item.get("content"),
            }

    def get_word(self, json_data):
        """Append the hot-comment tag names to ``<name>关键词.txt``.

        The names are read from the ``name`` field of each entry of
        ``hotCommentTagStatistics``. Nothing is written (and no file is
        created) when ``json_data`` is ``None`` or contains no tag names.
        """
        if json_data is None:
            return
        tags = json_data.get("hotCommentTagStatistics") or []
        words = [tag.get("name") for tag in tags if isinstance(tag, dict)]
        words = [word for word in words if isinstance(word, str)]
        if not words:
            return
        with open(self.name+"关键词.txt", "a", encoding="utf-8") as file:
            file.writelines(word + "\n" for word in words)

    def write_txt(self, data):
        """Append ``data`` as indented JSON text to ``<name>.txt``."""
        with open(self.name+".txt", "a", encoding="utf-8") as file:
            file.write(json.dumps(data, indent=2, ensure_ascii=False))
            file.write("\n")

    def write_csv(self, data):
        """Append one comment dict as a row of ``<name>.csv``.

        Args:
            data: A dict with the keys produced by ``get_content``.

        Raises:
            ValueError: If ``data`` contains keys outside the CSV columns.
        """
        with open(self.name+".csv", "a", encoding="utf-8-sig", newline='') as file:
            writer = csv.DictWriter(file, fieldnames=self._CSV_FIELDS)
            writer.writerow(data)

    def write_json(self, data):
        """Append ``data`` as indented JSON to ``<name>.json``."""
        with open(self.name+".json", "a", encoding="utf-8") as file:
            file.write(json.dumps(data, indent=2, ensure_ascii=False))

    def main(self):
        """Fetch this instance's page and return a generator of its comments."""
        return self.get_content(self.get_page())


if __name__ == "__main__":
    # Crawl the first PAGE_COUNT comment pages of one product, print every
    # comment, and finally dump all of them to "<name>.txt".
    PAGE_COUNT = 2
    ls = []
    for j in range(PAGE_COUNT):
        print("\n")
        print("现在是第%d页" % (j+1))
        a = JdSpider_content(
            productId="24155385153", page=j+1, name="祺奥")
        # Fetch each page exactly once and reuse the parsed response for
        # both the keyword extraction and the comment extraction.
        json_data = a.get_page()
        if j == 0:
            # The hot-comment keywords are only saved from the first page.
            a.get_word(json_data)
        for i in a.get_content(json_data):
            print(i)
            ls.append(i)
        if j < PAGE_COUNT - 1:
            # Pause between pages so the IP does not get banned (a proxy
            # pool would also work); no need to wait after the last page.
            time.sleep(random.randint(15, 20))
    a.write_txt(ls)

本文地址:https://blog.csdn.net/m0_46412065/article/details/107468840

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网