当前位置: 移动技术网 > IT编程>网页制作>HTML > 通过微信公众号链接爬取内容

通过微信公众号链接爬取内容

2020年07月30日  | 移动技术网IT编程  | 我要评论

通过微信公众号链接爬取内容，并将爬取到的图片上传到阿里云 OSS（代码使用 oss2 SDK）

import re,os
import pymysql
import requests
import datetime
import random
import time
import oss2

# Browser-like User-Agent header; get_html() sends it with every page request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
# OSS endpoint, credentials and bucket name. The '***' values look like
# redacted placeholders and presumably must be filled in before running.
endpoint = '***'
auth = oss2.Auth('***', '***')
# Shared bucket handle used by put_oss() and echo_oss().
bucket = oss2.Bucket(auth, endpoint, '***')

# Fetch the HTML of a page.
def get_html(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host,
            which would hang the whole scraper.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

# Extract the article title.
def get_Title(url):
    """Return the article title found in the page's ``og:title`` meta tag.

    The page is fetched with ``get_html`` and the first match of the meta tag
    is returned. When the tag is absent the list index fails, so an
    ``IndexError`` is raised.
    """
    page = get_html(url)
    title_pattern = re.compile(
        '<meta property="og:title" content="(.*?)" />', re.DOTALL
    )
    matches = title_pattern.findall(page)
    return matches[0]

# Extract the article body.
def get_Content(url):
    """Return the inner HTML of the ``js_content`` div of the page at *url*.

    The capture is non-greedy, so it stops at the first ``</div>`` that
    follows the opening tag. Newlines and tabs are stripped and every
    ``data-src`` is renamed to ``src``. Raises ``IndexError`` when the div is
    not found.
    """
    page = get_html(url)
    # Alternative pattern that also captured the trailing <script> block:
    # '<div class="rich_media_content " id="js_content".*?">(.*?)</div>.*?(<script.*?</script>)'
    body_pattern = re.compile(
        '<div class="rich_media_content " id="js_content".*?">(.*?)</div>',
        re.DOTALL,
    )
    body = body_pattern.findall(page)[0]
    # Applied in order: drop newlines, drop tabs, rename data-src to src.
    for needle, replacement in (('\n', ''), ('\t', ''), ('data-src', 'src')):
        body = body.replace(needle, replacement)
    return body

# Collect the image addresses.
def get_Content_urls(content):
    """Return ``(src, data_type)`` tuples found in *content*.

    Each tuple pairs the value of a ``src="..."`` attribute with the value of
    the next ``data-type="..."`` attribute that follows it. Matching spans
    newlines. An empty list is returned when nothing matches.
    """
    attr_pattern = re.compile('src="(.*?)".*?data-type="(.*?)"', re.DOTALL)
    return [match.groups() for match in attr_pattern.finditer(content)]

# Download a remote file into the current working directory.
def upload_wj(imgurl, timeout=10):
    """Download the file described by *imgurl* and return its local path.

    Args:
        imgurl: Pair ``(url, extension)``; ``imgurl[0]`` is downloaded and
            ``imgurl[1]`` becomes the file extension.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The relative path of the written file, ``<YYYYMMDD><uuid hex>.<ext>``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            status.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    import uuid

    response = requests.get(imgurl[0], timeout=timeout)
    # Fail loudly rather than store an HTTP error page under an image name.
    response.raise_for_status()

    date_prefix = datetime.datetime.now().strftime("%Y%m%d")
    # put_oss() reuses this name as the object key, so it must be unique: a
    # four-digit random suffix allows only 9000 names per day and a collision
    # would overwrite an earlier upload.
    path = date_prefix + uuid.uuid4().hex + '.' + imgurl[1]

    with open(path, "wb") as fp:
        fp.write(response.content)
    return path

# Upload a remote file to OSS and return its URL.
def put_oss(imgurl):
    """Re-host the file described by *imgurl* on OSS and return a signed URL.

    The file is downloaded with ``upload_wj``, uploaded to the module-level
    ``bucket`` under the same name, and a signed GET URL is produced. The
    signing expiry requested is ``60 * 60 * 24 * 365 * 5`` seconds.

    Args:
        imgurl: Pair ``(url, extension)`` as accepted by ``upload_wj``.

    Returns:
        The signed URL of the uploaded object.
    """
    local_path = upload_wj(imgurl)
    try:
        # The local file name doubles as the object key.
        bucket.put_object_from_file(local_path, local_path)
        return bucket.sign_url('GET', local_path, 60 * 60 * 24 * 365 * 5)
    finally:
        # Remove the temporary download even when upload or signing fails.
        os.remove(local_path)

# Replace a file address inside the article.
def replace_src(yurl, ossurl, content):
    """Return *content* with every occurrence of *yurl* replaced by *ossurl*.

    The replacement is literal. URLs routinely contain regex metacharacters
    (``?``, ``.``, ``+``), so treating *yurl* as a pattern fails to match
    addresses that carry a query string. Treating *ossurl* as a regex
    replacement template would also misread backslashes.

    Args:
        yurl: Original address to look for.
        ossurl: Address to put in its place.
        content: Text to search.

    Returns:
        The updated text; unchanged when *yurl* does not occur.
    """
    return content.replace(yurl, ossurl)

# Check whether the storage bucket exists.
def does_bucket_exist(bucket):
    """Return ``True`` if ``bucket.get_bucket_info()`` succeeds.

    Returns ``False`` when the call raises ``oss2.exceptions.NoSuchBucket``.
    Any other exception propagates to the caller unchanged.
    """
    try:
        bucket.get_bucket_info()
    except oss2.exceptions.NoSuchBucket:
        return False
    else:
        return True
    
# List the contents of the storage bucket.
def echo_oss():
    """Print the key of every object in the module-level ``bucket``.

    ``get_bucket_info()`` is called first and its result is discarded.
    """
    bucket.get_bucket_info()
    object_keys = (entry.key for entry in oss2.ObjectIterator(bucket))
    for object_key in object_keys:
        print(object_key)
def conn_mysql():
    """Open and return a PyMySQL connection to the configured database.

    The connection parameters are passed by keyword because PyMySQL 1.0 and
    later no longer accept them positionally. The keyword form works on older
    releases as well.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    host = '***'
    username = '***'
    password = '***'
    dbname = '***'
    return pymysql.connect(
        host=host,
        user=username,
        password=password,
        database=dbname,
    )

def run(url):
    """Scrape the article at *url* and re-host its images on OSS.

    The title and body are fetched, every image reference in the body is
    uploaded through ``put_oss``, and the original addresses in the body are
    swapped for the returned OSS addresses. Progress and the final body are
    printed.

    Args:
        url: Address of the article. An empty or missing value is rejected.

    Returns:
        ``'url为空'`` when *url* is empty or ``None``; otherwise ``None``.
    """
    # db = conn_mysql()
    if not url:
        return 'url为空'
    # The title is only consumed by the disabled database insert below.
    title = get_Title(url)
    content = get_Content(url)
    image_refs = get_Content_urls(content)
    print(image_refs)
    # Each address is uploaded once: the first replace() already rewrites all
    # of its occurrences, so a repeat would only create an unused object.
    handled = set()
    for image_ref in image_refs:
        yurl = image_ref[0]
        if yurl in handled:
            continue
        handled.add(yurl)
        print(yurl)
        print('图片存入oss')
        ossurl = put_oss(image_ref)
        print('存入成功')
        # Point the article's src at the re-hosted copy.
        content = content.replace(yurl, ossurl)
        print('替换成功')
    print(content)
    print('运行结束')
    # Persist to the database (currently disabled).
    # sql = 'INSERT INTO ***(title,content,create_time) VALUES (%s,%s,%s)'
    # nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    # data = [title, content, str(nowtime)]
    # db.cursor().execute(sql, data)
    # db.commit()
    # db.close()


然后通过Flask调用写成接口形式即可。

本文地址:https://blog.csdn.net/Zhoulei16/article/details/107644658

如对本文有疑问, 点击进行留言回复!!

相关文章:

验证码:
移动技术网