# Simple crawler that automatically downloads a novel
import requests #导入HTTP请求库
from pyquery import PyQuery #PyQuery,原生css选择器
def get_one_chapter(chapter_url=None, name=None):
    """Download one chapter page and append it to "<name>.txt".

    chapter_url: URL of the chapter page to fetch.
    name: book title, used as the output file's base name.
    """
    response = requests.get(url=chapter_url)
    doc = PyQuery(response.text)
    # "h1" (title) and "#content" (body) selectors vary between sites —
    # inspect the target site's HTML and adjust them accordingly.
    title = doc("h1").text()
    content = doc("#content").text()
    print(title, content)
    # Append mode so successive chapters accumulate in a single file.
    with open(file=name + ".txt", mode="a+", encoding="utf-8") as f:
        f.write(title + "\n\n" + content)
def get_index(book_url):
    """Parse a book's index page for its title and chapter URLs.

    Fetches book_url, reads the book name from the page's h1, collects all
    chapter links under "#list a", and downloads each chapter via
    get_one_chapter().
    """
    text = requests.get(url=book_url).text
    doc = PyQuery(text)
    links = doc('#list a')  # all chapter link tags on the index page
    name = doc('h1').text()
    # Index pages usually show the latest-updated chapters first; skip those
    # duplicates. Adjust the slice offset to the actual site layout.
    for link in list(links.items())[8:]:
        # Configure the site host yourself (placeholder to avoid advertising).
        chapter_url = 'http://www.XXXXXX' + link.attr.href
        get_one_chapter(chapter_url=chapter_url, name=name)
def get_all_book_url():
    """Crawl the site-wide book list page and download every book found.

    Collects all book links under "#main a" and hands each book's URL to
    get_index().
    """
    # Configure the site host yourself (placeholder to avoid advertising).
    all_book_url = "http://www.XXXXXX"
    response = requests.get(url=all_book_url).text
    doc = PyQuery(response)
    links = doc('#main a')  # all book link tags on the listing page
    for link in links.items():
        book_url = 'http://www.XXXXXX' + link.attr.href
        get_index(book_url=book_url)
# Guarded entry point: run the crawl only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    get_all_book_url()