本次爬取用到的知识点有:
1. selenium
2. pymysql
3. pyquery
1. 打开某宝首页, 输入"男装"后点击"搜索", 则跳转到"男装"的搜索界面.
2. 空白处"右击"再点击"检查"审查网页元素, 点击"network".
1) 找到对应的url, url里的参数正是query string parameters的参数, 且请求方式是get
2) 我们请求该url得到内容就是"response"里的内容, 那么点击它来确认信息.
3) 下拉看到"男装"字样, 那么再往下找, 并没有发现有关"男装"的商品信息.
4) 任意复制一个商品信息, 空白处右击再点击"查看网页源代码", 在源码查找该商品, 即可看到该商品的信息.
5) 对比网页源代码和"response"响应内容, 发现源代码<script>..........</script>中的商品信息被替换, 这便是采用了js加密
6) 如果去请求上面的url, 得到的则是加密过的信息, 这时就可以利用selenium库来模拟浏览器, 进而得到商品信息.
1. 请求网站
# -*- coding: utf-8 -*-
from selenium import webdriver  # browser driver entry point

# Module-level driver object: constructing it starts a Chrome session.
browser = webdriver.Chrome()


def get_one_page():
    """Open the site's home page in the shared Chrome session."""
    browser.get("https://www.xxxxx.com")
2. 输入"男装". 在输入之前, 需要先判断输入框是否已经加载: 若已存在则输入"男装", 否则继续等待, 直到输入框加载完成.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By  # element locator strategies
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits
from selenium.webdriver.support import expected_conditions as ec  # wait conditions

browser = webdriver.Chrome()


def get_one_page():
    """Open the home page and type the search keyword into the search box."""
    browser.get("https://www.xxxxx.com")
    # Wait (up to 10 s) until the search box is present in the DOM, then
    # use the element returned by the wait.
    search_box = WebDriverWait(browser, 10).until(
        ec.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男装")  # type the product keyword
3. 下一步就是点击"搜索"按钮, 按钮具有属性: 可点击, 那么加入判断条件.
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

browser = webdriver.Chrome()


def get_one_page():
    """Open the home page, type the keyword and click the search button."""
    browser.get("https://www.xxxxx.com")
    search_box = WebDriverWait(browser, 10).until(
        ec.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男装")
    # Wait until the search button is clickable before clicking it.
    # NOTE(review): this listing appears to have been lowercased in transit;
    # confirm the id/class case in the selector against the live page.
    search_button = WebDriverWait(browser, 10).until(
        ec.element_to_be_clickable(
            (By.CSS_SELECTOR, "#j_tsearchform > div.search-button > button")))
    search_button.click()
4. 获取总的页数, 同样加入等待判断.
# -*- coding: utf-8 -*-
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

browser = webdriver.Chrome()


def get_one_page():
    """Search for the keyword and return the pager's "total pages" text.

    Returns:
        The text of the pager's ``div.total`` element (not yet parsed into
        a number).
    """
    browser.get("https://www.xxxxx.com")
    search_box = WebDriverWait(browser, 10).until(
        ec.presence_of_element_located((By.CSS_SELECTOR, "#q")))
    search_box.send_keys("男装")
    # NOTE(review): this listing appears to have been lowercased in transit;
    # confirm the id/class case in the selector against the live page.
    search_button = WebDriverWait(browser, 10).until(
        ec.element_to_be_clickable(
            (By.CSS_SELECTOR, "#j_tsearchform > div.search-button > button")))
    search_button.click()
    # Wait until the "total pages" element of the pager has been loaded.
    pages = WebDriverWait(browser, 10).until(
        ec.presence_of_element_located(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
    return pages.text


def main():
    """Run the search, print the raw pager text, then close the browser."""
    try:
        pages = get_one_page()
        print(pages)
    finally:
        # Always end the browser session, even if a wait timed out.
        browser.quit()


if __name__ == '__main__':
    main()
5. 打印出来的不是我们想要的结果, 利用正则表达式获取, 最后再利用try...except捕捉异常
# -*- coding: utf-8 -*-
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

browser = webdriver.Chrome()


def get_one_page(max_retries=None):
    """Search for the keyword and return the pager's "total pages" text.

    A timed-out attempt is retried from the start (the home page is
    requested again).

    Args:
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) keeps retrying until an attempt succeeds.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If ``max_retries`` is set and has been used up.
    """
    retries = 0
    # A loop instead of self-recursion: a persistently failing page can no
    # longer exhaust the interpreter's recursion limit.
    while True:
        try:
            browser.get("https://www.xxxxx.com")
            search_box = WebDriverWait(browser, 10).until(
                ec.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_box.send_keys("男装")
            # NOTE(review): this listing appears to have been lowercased in
            # transit; confirm the selector's id/class case on the live page.
            search_button = WebDriverWait(browser, 10).until(
                ec.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#j_tsearchform > div.search-button > button")))
            search_button.click()
            pages = WebDriverWait(browser, 10).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
            return pages.text
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


def main():
    """Run the search, print the total page count, then close the browser."""
    try:
        pages_text = get_one_page()
        # Pull the first run of digits (the total page count) out of the text.
        match = re.search(r"\d+", pages_text)
        if match is None:
            raise ValueError(
                "no page count found in pager text: {0!r}".format(pages_text))
        pages = int(match.group())
        print(pages)
    finally:
        # Always end the browser session.
        browser.quit()


if __name__ == '__main__':
    main()
关于selenium的更多内容,可参看官方文档
采用获取"到第 页"输入框方式, 切换到下一页, 同样是等待判断
需要注意的是, 最后要加入判断: 高亮是否是当前页
def get_next_page(page, max_retries=None):
    """Jump to results page ``page`` through the pager's "go to page" form.

    Args:
        page: Page number to open.
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) keeps retrying until an attempt succeeds.

    Raises:
        TimeoutException: If ``max_retries`` is set and has been used up.
    """
    retries = 0
    # A loop instead of self-recursion, so repeated timeouts cannot exhaust
    # the interpreter's recursion limit.
    while True:
        try:
            # Wait until the page-number input box has been loaded.
            page_box = WebDriverWait(browser, 10).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # send_keys() appends to the field's current content; clear it
            # first so a retry (or a pre-filled value) cannot yield e.g. "22".
            page_box.clear()
            page_box.send_keys(str(page))
            # Wait until the confirm button is clickable.
            # NOTE(review): this listing appears to have been lowercased in
            # transit; confirm the selector's class case on the live page.
            submit = WebDriverWait(browser, 10).until(
                ec.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.j_submit")))
            submit.click()
            # Confirm the highlighted pager entry now shows the requested page.
            WebDriverWait(browser, 10).until(
                ec.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page)))
            return
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


def main():
    """Run the search, then step through every remaining results page."""
    try:
        pages_text = get_one_page()
        # Pull the first run of digits (the total page count) out of the text.
        match = re.search(r"\d+", pages_text)
        if match is None:
            raise ValueError(
                "no page count found in pager text: {0!r}".format(pages_text))
        pages = int(match.group())
        # get_one_page() already leaves the browser on page 1, so start at 2.
        for page in range(2, pages + 1):
            get_next_page(page)
    finally:
        # Always end the browser session.
        browser.quit()


if __name__ == '__main__':
    main()
首先, 判断信息是否加载成功, 紧接着获取源码并初始化, 进而解析.
需要注意的是, "get_info"必须在"get_one_page"和"get_next_page"内部调用, 即在对应页面跳转完成之后才可执行.
def get_info():
    """Parse the product entries of the results page currently displayed.

    Waits until at least one product node is present, then parses the page
    source with pyquery. Each parsed row is printed.

    Returns:
        A list with one ``[shop, location, title, price, deal, image]`` row
        per product node found.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    # Make sure the product list has been loaded before reading the source.
    WebDriverWait(browser, 20).until(
        ec.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    html = pq(browser.page_source)  # build a pyquery document from the page source
    # Created once, outside the loop, so the rows of all products accumulate.
    data = []
    for item in html(item_selector).items():  # .items() yields one node at a time
        image = item.find(".pic .img").attr("data-src")
        price = item.find(".price").text().strip().replace("\n", "")
        # Drops the last two characters of the text (presumably a unit
        # suffix after the count; confirm against the page).
        deal = item.find(".deal-cnt").text()[:-2]
        title = item.find(".title").text().strip()
        shop = item.find(".shop").text().strip()
        location = item.find(".location").text()
        row = [shop, location, title, price, deal, image]
        print(row)
        data.append(row)
    return data
def save_to_mysql(data):
    """Insert one product row into the ``男装`` table of the ``spiders`` database.

    The table is created on first use. A failed insert is reported and
    rolled back rather than raised.

    Args:
        data: Sequence of six values matching the table's column order
            (shop, location, title, price, deal, image).
    """
    # Open the database connection.
    db = pymysql.connect(host="localhost", user="root", password="password",
                         port=3306, database="spiders", charset="utf8")
    try:
        cursor = db.cursor()
        # Create the table (not the database) if it does not exist yet.
        cursor.execute("create table if not exists {0}(shop varchar(20),location varchar(10),title varchar(255),price varchar(20),deal varchar(20), image varchar(255))".format("男装"))
        sql = "insert into {0} values(%s,%s,%s,%s,%s,%s)".format("男装")
        try:
            # The row values are passed separately so the driver escapes them.
            if cursor.execute(sql, data):
                db.commit()
                print("********已入库**********")
        except Exception as exc:
            # Best effort: report the failure and undo the partial work.
            # Unlike a bare ``except:``, Ctrl-C / SystemExit still propagate.
            print("#########入库失败#########")
            print(repr(exc))
            db.rollback()
    finally:
        # Close the connection even if creating the table raised.
        db.close()
# -*- coding: utf-8 -*-
import re
import pymysql
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from pyquery import PyQuery as pq

browser = webdriver.Chrome()


def get_one_page(name, max_retries=None):
    """Search for ``name``, store the first results page, return the pager text.

    A timed-out attempt is retried from the start (the home page is
    requested again).

    Args:
        name: Search keyword; also used as the database table name.
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) keeps retrying until an attempt succeeds.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If ``max_retries`` is set and has been used up.
    """
    retries = 0
    # A loop instead of self-recursion, so repeated timeouts cannot exhaust
    # the interpreter's recursion limit.
    while True:
        print("-----------------------------------------------获取第一页-------------------------------------------------------")
        try:
            browser.get("https://www.xxxxx.com")
            search_box = WebDriverWait(browser, 10).until(
                ec.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            search_box.send_keys(name)
            # NOTE(review): this listing appears to have been lowercased in
            # transit; confirm the selector's id/class case on the live page.
            search_button = WebDriverWait(browser, 10).until(
                ec.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#j_tsearchform > div.search-button > button")))
            search_button.click()
            pages = WebDriverWait(browser, 10).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")))
            print("----即将解析第一页信息----")
            get_info(name)
            print("----第一页信息解析完成----")
            return pages.text
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


def get_next_page(page, name, max_retries=None):
    """Jump to results page ``page`` and store its products.

    Args:
        page: Page number to open.
        name: Search keyword; also used as the database table name.
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) keeps retrying until an attempt succeeds.

    Raises:
        TimeoutException: If ``max_retries`` is set and has been used up.
    """
    retries = 0
    while True:
        print("---------------------------------------------------正在获取第{0}页----------------------------------------".format(page))
        try:
            page_box = WebDriverWait(browser, 10).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
            # send_keys() appends to the field's current content; clear it
            # first so a retry (or a pre-filled value) cannot yield e.g. "22".
            page_box.clear()
            page_box.send_keys(str(page))
            submit = WebDriverWait(browser, 10).until(
                ec.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.j_submit")))
            submit.click()
            # Confirm the highlighted pager entry now shows the requested page.
            WebDriverWait(browser, 10).until(
                ec.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),
                    str(page)))
            print("-----即将解析第{0}页信息-----".format(page))
            get_info(name)
            print("-----第{0}页信息解析完成-----".format(page))
            return
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


def get_info(name):
    """Parse every product on the current results page and store each one.

    Args:
        name: Table name handed through to ``save_to_mysql``.
    """
    item_selector = "#mainsrp-itemlist .items .item"
    # Make sure the product list has been loaded before reading the source.
    WebDriverWait(browser, 20).until(
        ec.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    html = pq(browser.page_source)
    for item in html(item_selector).items():
        image = item.find(".pic .img").attr("data-src")
        price = item.find(".price").text().strip().replace("\n", "")
        # Drops the last two characters of the text (presumably a unit
        # suffix after the count; confirm against the page).
        deal = item.find(".deal-cnt").text()[:-2]
        title = item.find(".title").text().strip()
        shop = item.find(".shop").text().strip()
        location = item.find(".location").text()
        save_to_mysql([shop, location, title, price, deal, image], name)


def save_to_mysql(data, name):
    """Insert one product row into table ``name`` of the ``spiders`` database.

    The table is created on first use. A failed insert is reported and
    rolled back rather than raised.

    Args:
        data: Sequence of six values matching the table's column order
            (shop, location, title, price, deal, image).
        name: Table name (the search keyword).
    """
    # Quote the identifier so keywords containing spaces or punctuation
    # still form valid SQL; embedded backticks are doubled.
    table = "`{0}`".format(name.replace("`", "``"))
    db = pymysql.connect(host="localhost", user="root", password="password",
                         port=3306, database="spiders", charset="utf8")
    try:
        cursor = db.cursor()
        cursor.execute("create table if not exists {0}(shop varchar(20),location varchar(10),title varchar(255),price varchar(20),deal varchar(20), image varchar(255))".format(table))
        sql = "insert into {0} values(%s,%s,%s,%s,%s,%s)".format(table)
        try:
            # The row values are passed separately so the driver escapes them.
            if cursor.execute(sql, data):
                db.commit()
                print("********已入库**********")
        except Exception as exc:
            # Best effort: report the failure and undo the partial work.
            # Unlike a bare ``except:``, Ctrl-C / SystemExit still propagate.
            print("#########入库失败#########")
            print(repr(exc))
            db.rollback()
    finally:
        # Close the connection even if creating the table raised.
        db.close()


def main(name):
    """Crawl every results page for ``name``, then close the browser."""
    try:
        pages_text = get_one_page(name)
        # Pull the first run of digits (the total page count) out of the text.
        match = re.search(r"\d+", pages_text)
        if match is None:
            raise ValueError(
                "no page count found in pager text: {0!r}".format(pages_text))
        pages = int(match.group())
        # Page 1 was already parsed and stored by get_one_page(); starting
        # at 2 avoids inserting its rows a second time.
        for page in range(2, pages + 1):
            get_next_page(page, name)
    finally:
        # Always end the browser session.
        browser.quit()


if __name__ == '__main__':
    name = "男装"
    main(name)
以上是对学习的总结, 若有不对的地方, 还请指正, 谢谢!
如对本文有疑问, 点击进行留言回复!!
网友评论