某牛某客专栏文章爬虫

Lan

2023-10-08 / 0 评论 / 258 阅读 / 正在检测是否收录...

10/08

温馨提示：

本文最后更新于2023年10月08日，已超过992天没有更新，若内容或图片失效，请留言反馈。

代码中的网址等信息已脱敏，使用时请自行替换为实际地址。

# @Time    : 2023/10/8 14:43
# @Author  : Lan
# @File    : niukespider.py
# @Software: PyCharm
import time
import requests


def get_category(catalog='10klpm', timeout=10):
    """Fetch the catalog (table of contents) JSON for a column.

    Args:
        catalog: Column identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f'https://www.lanol.cn.com/content/zhuanlan/index/catalog/{catalog}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()


# Minimal HTML page skeleton used as the output template. This is a plain
# (non-f) string, so `{{content}}` is a literal marker; it is swapped for the
# article body with str.replace() when each page is written.
c = """
<!doctype html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>Document</title>
</head>
<body>
{{content}}
</body>
</html>
"""


def get_content(catalog, entity, timeout=10):
    """Fetch the detail JSON for one article of a column.

    Args:
        catalog: Column identifier interpolated into the request URL.
        entity: Article identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # The `_` query parameter carries the current time in milliseconds.
    timestamp_ms = int(time.time() * 1000)
    url = f'https://www.lanol.cn.com/content/zhuanlan/index/detail/{catalog}/{entity}?_={timestamp_ms}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()


if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    import os

    catalog = 'Gj5x2m'
    output_dir = './docs/'

    # open(..., 'w') does not create missing parent directories, so make sure
    # the output directory exists before writing the first page.
    os.makedirs(output_dir, exist_ok=True)

    # Download every article listed in the column's catalog and save each one
    # as a standalone HTML file named after its title.
    for entry in get_category(catalog)['data']['catalog']:
        detail = get_content(catalog, entry['uuid'])['data']
        # '/' is a path separator and cannot appear in a file name.
        file_name = entry['title'].replace('/', '-') + '.html'
        with open(output_dir + file_name, 'w', encoding='utf-8') as f:
            f.write(c.replace('{{content}}', detail['content']))