A simple Python crawler that automatically downloads novels
 
A simple crawler that automatically downloads novels.

import requests                                # HTTP request library
from pyquery import PyQuery                    # PyQuery, for jQuery-style CSS selectors

'''Fetch and save the content of a single chapter'''
def get_one_chapter(chapter_url = None, name = None):
    response = requests.get(url = chapter_url)
    doc = PyQuery(response.text)
    title = doc("h1").text()                            # the "h1" tag varies from site to site; inspect the page source and adjust
    content = doc("#content").text()                    # "#content" is where this site keeps the chapter text; inspect the page source and adjust
    print(title, content)
    with open(file = name + ".txt", mode = "a+", encoding = "utf-8") as f:
        f.write(title + "\n\n" + content)

'''Parse a book's index page for its title and every chapter URL'''
def get_index(book_url):
    index_url = book_url
    text = requests.get(url = index_url).text
    doc = PyQuery(text)
    links = doc('#list a')                              # all chapter link tags on the index page
    name = doc('h1').text()
    for link in list(links.items())[8:]:                # index pages usually show the newest chapters first; skip them according to your site
        chapter_url = 'http://www.XXXXXX' + link.attr.href            # domain left as a placeholder so this isn't an advertisement; fill in your own
        get_one_chapter(chapter_url = chapter_url, name = name)

'''Walk the site-wide book list and collect every book URL'''
def get_all_book_url():
    all_book_url = "http://www.XXXXXX"                  # placeholder for the site's book-list page; fill in your own
    response = requests.get(url = all_book_url).text
    doc = PyQuery(response)
    links = doc('#main a')
    for link in list(links.items()):
        book_url = 'http://www.XXXXXX' + link.attr.href
        get_index(book_url = book_url)

get_all_book_url()
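
The loop above hits the site as fast as it can and stops on the first network error or non-200 response. Below is a minimal sketch of how the chapter fetch could be hardened; the User-Agent string, timeout, and delay are illustrative values, not part of the original script, and the selectors are the same placeholders that must be adapted to the target site.

import time
import requests
from pyquery import PyQuery

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}   # hypothetical UA string, adjust as needed
DELAY_SECONDS = 1.0                                                     # pause between chapter requests

def get_one_chapter_polite(chapter_url, name):
    '''Same job as get_one_chapter, but with a timeout, a status check, and a polite delay'''
    response = requests.get(chapter_url, headers = HEADERS, timeout = 10)
    response.raise_for_status()                         # fail loudly on 4xx/5xx instead of saving an error page
    doc = PyQuery(response.text)
    title = doc("h1").text()
    content = doc("#content").text()                    # adjust the selector for your site, as above
    with open(file = name + ".txt", mode = "a+", encoding = "utf-8") as f:
        f.write(title + "\n\n" + content)
    time.sleep(DELAY_SECONDS)                           # give the server a breather between chapters

Swapping this in for get_one_chapter leaves the rest of the script unchanged, and the chapter loop in get_index then paces itself automatically.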

 

 


   