import requests
from bs4 import BeautifulSoup
import bs4
def getHtml(url, header):
    """Fetch *url* and return the response body as text.

    Parameters:
        url: page URL to request.
        header: dict of HTTP headers (e.g. a User-Agent) passed to requests.

    Returns:
        The decoded response text, or " " (a single space) on request
        failure, so the caller can still hand the result to the parser.
    """
    try:
        r = requests.get(url, headers=header)
        # Raise HTTPError for any non-2xx status code.
        r.raise_for_status()
        # Debug aid: show the headers actually sent; safe to comment out.
        print(r.request.headers)
        # r.encoding = r.apparent_encoding  # enable if the page shows mojibake
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP errors are
        # expected here; anything else should surface as a real bug
        # (a bare except would also swallow KeyboardInterrupt/SystemExit).
        print("爬取失败!")
        return " "
def parsePage(ulist, html):
    """Parse the search-result page *html*, appending rows to *ulist*.

    Each appended row is [post date, title, link]. Results live in the
    children of <main class="site-main">: the date in
    <div class="p-time">, the title and href in <h1 class="entry-title">.

    Parameters:
        ulist: list mutated in place; one row appended per parsed result.
        html: HTML text of the search-result page.
    """
    soup = BeautifulSoup(html, "html.parser")
    main_tag = soup.find('main', {'class': 'site-main'})
    # Guard: on an unexpected page layout (or the " " error sentinel from
    # getHtml) there is no <main> tag; the original crashed here with
    # AttributeError on NoneType.
    if main_tag is None:
        return
    for i in main_tag.children:
        try:
            # Children include whitespace NavigableStrings; only Tags
            # carry the result markup.
            if isinstance(i, bs4.element.Tag):
                psrc = i('div', {'class': 'p-time'})
                title = i('h1', {'class': 'entry-title'})
                # print(psrc[0].text)
                # print(title[0].string)
                # print(title[0].a.attrs['href'])
                ulist.append([psrc[0].text, title[0].string, title[0].a.attrs['href']])
        except (IndexError, AttributeError, KeyError):
            # Narrowed from a bare `except:` — a result block missing its
            # date/title/link triggers exactly these; skip that entry.
            print("数据丢失!")
def printlist(ulist):
    """Print the scraped rows as a tab-separated table.

    Parameters:
        ulist: list of rows; each row holds [date, title, link].
    """
    row_fmt = "{:10}\t{:10}\t{:8}"
    # Header line, then one line per scraped entry.
    print(row_fmt.format("发布日期", "标题", "链接"))
    for row in ulist:
        print(row_fmt.format(row[0], row[1], row[2]))
def main():
    """Entry point: crawl the blog's search page and print the results."""
    # Spoof a desktop Chrome UA so the site serves the normal page.
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
    }
    search_term = '1'
    target = "https://www.hellohuanxuan.top/?s=" + search_term
    rows = []
    page = getHtml(target, request_headers)
    parsePage(rows, page)
    printlist(rows)
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
try:
# 通过requests的get方法获得源代码
r = requests.get(url, headers=header)
# 判断返回状态码是否为200,不为200直接进入异常
r.raise_for_status()
# 打印头部信息看看,可注释掉
print(r.request.headers)
# r.encoding = r.apparent_encoding # 根据情况是否填写,爬我的网站要注释,否则显示中文为乱码
return r.text
except:
print("爬取失败!")
return " "
# 利用BeautifulSoup解析html
soup = BeautifulSoup(html, "html.parser")
# for循环查找class为'site-main'的main标签的字标签
for i in soup.find('main', {'class': 'site-main'}).children:
# try except捕捉异常
try:
# isinstance函数在这里判断i是否是bs4库里规定的标签类型
if isinstance(i, bs4.element.Tag):
# 获取class为'p-time'的div标签
psrc = i('div', {'class': 'p-time'})
# 获取class为'entry-title'的h1标签
title = i('h1', {'class': 'entry-title'})
# print(psrc[0].text)
# print(title[0].string)
# print(title[0].a.attrs['href'])
# 将值写进列表
ulist.append([psrc[0].text, title[0].string, title[0].a.attrs['href']])
# ulist.append([1, 1, 1])
except:
print("数据丢失!")
# 格式化输出列表
print("{:10}\t{:10}\t{:8}".format("发布日期", "标题", "链接"))
for i in ulist:
print("{:10}\t{:10}\t{:8}".format(i[0], i[1], i[2]))
大家千万别全拿我的网站爬啊,学生服务器经不起太多折腾。(无奈)
最后推荐一个慕课的视频,北京理工大学嵩天老师的python爬虫课程,讲的很清晰也很透彻。
Bilibili链接:python网络爬虫与信息提取
python爬虫学习中,如果大佬们看出有什么可以优化的地方欢迎指正
转自自己的小网站:我的博客
本文地址:https://blog.csdn.net/Aaaes/article/details/107647278
如对本文有疑问, 点击进行留言回复!!
网友评论