import requests, re
from urllib.parse import urljoin
# Write to file
def fileWrite(title: str, message: str, name: str) -> None:
    """Append one titled section to ``<name>.txt`` and report success.

    The title and the message are each written followed by a newline. The
    file is created if it does not exist yet.

    Args:
        title: Heading line written before the message.
        message: Body text written after the title.
        name: Output path without the ``.txt`` suffix.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # An explicit encoding keeps non-ASCII text writable on platforms whose
    # locale codec cannot represent it (the default codec is locale-dependent).
    with open(name + ".txt", "a", encoding="utf-8") as f:
        f.write(title + "\n" + message + "\n")
    print(title, "下载成功!")
# Fetch page content
def getWebPage(url):
    """Download *url* and return the response body as text.

    The request carries an iPad Safari ``User-Agent`` header. The body is
    decoded with the encoding that ``requests`` detects from the content
    (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``None`` when the request fails (an error
        message is printed in that case).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) "
                      "AppleWebKit/604.1.34 (KHTML, like Gecko) Versio"
                      "n/11.0 Mobile/15A5341f Safari/604.1",
    }
    try:
        # ``headers`` has to be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        # The timeout stops a stalled server from hanging the download.
        res = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        print("页面请求出错!")
        return None
    res.encoding = res.apparent_encoding
    return res.text
# Extract links
def getLink(page, url, paternName, paternLink):
    """Extract a name and a list of absolute links from *page*.

    Args:
        page: Page text to search.
        url: Base address used to resolve relative links.
        paternName: Regular expression whose first match is the name.
        paternLink: Regular expression matching each link; it is applied
            with the MULTILINE, DOTALL and IGNORECASE flags.

    Returns:
        A ``(name, links)`` tuple, where ``links`` is an iterator over the
        absolute URLs in document order, or ``None`` when extraction fails
        (an error message is printed in that case).
    """
    try:
        found = re.findall(paternLink, page, re.M | re.S | re.I)
        # Convert relative paths into absolute URLs.
        links = iter([urljoin(url, link) for link in found])
        name = re.findall(paternName, page)[0]
    except (re.error, TypeError, ValueError, IndexError):
        # TypeError: page is not text (e.g. None after a failed download);
        # IndexError: the name pattern matched nothing.
        print("提取页面信息错误!")
        return None
    return name, links
# Fetch content
def getContent(name, link, paternTitle, paternBody):
    """Download one page, extract its title and body, and append them to a file.

    The page at *link* is fetched with ``getWebPage``; the first matches of
    the two patterns are taken as title and body. ``<br />`` tags in the body
    become newlines, and the result is handed to ``fileWrite``.

    Args:
        name: Output file name (without suffix) passed on to ``fileWrite``.
        link: Address of the page to download.
        paternTitle: Regular expression whose first match is the title.
        paternBody: Regular expression whose first match is the body.

    Returns:
        None. Failures are reported on stdout and do not raise, so a caller
        looping over many links can carry on with the next one.
    """
    page = getWebPage(link)
    if page is None:
        # The download itself failed; there is nothing to parse.
        print("提取小说页面错误!")
        return
    try:
        title = re.findall(paternTitle, page)[0]
        content = re.findall(paternBody, page)[0]
        # NOTE(review): the original replaced a space with a space, which is
        # a no-op as written; presumably non-breaking spaces were intended,
        # so both the HTML entity and the U+00A0 character are normalised.
        content = content.replace("&nbsp;", " ").replace("\xa0", " ")
        content = content.replace("<br />", "\n")
        fileWrite(title, content, name)
    except Exception:
        # Best-effort per page, but no longer a bare ``except``: Ctrl-C and
        # SystemExit must be able to stop a long-running download.
        print("提取小说页面错误!")
# main
def main(url, paternName, paternLink, paternTitle, paternBody):
    """Download every page linked from the index page at *url*.

    Args:
        url: Address of the index page.
        paternName: Regular expression for the name on the index page.
        paternLink: Regular expression for the links on the index page.
        paternTitle: Regular expression for the title on each linked page.
        paternBody: Regular expression for the body on each linked page.

    Returns:
        None. If the index page cannot be fetched or parsed, nothing is
        downloaded and the completion message is not printed.
    """
    page = getWebPage(url)
    extracted = getLink(page, url, paternName, paternLink)
    if extracted is None:
        # getLink has already reported the failure; unpacking None here
        # would raise TypeError.
        return
    name, links = extracted
    for link in links:
        getContent(name, link, paternTitle, paternBody)
    print("小说下载完毕!")
if __name__ == '__main__':
    # Regular expressions handed to main(): one for the name and one for the
    # links on the index page, one for the title and one for the body on
    # each linked page.
    paternName = "<h1>(.*?)</h1>"
    paterLink = "<dd><a href='(.*?)' >.*?</a></dd>"
    paternTitle = "<h1>(.*?)</h1>"
    paternBody = '<div id="content">(.*?)<p>.*?</p></div>'
    main(
        "http://www.xbiquge.la/2/2208/",
        paternName,
        paterLink,
        paternTitle,
        paternBody,
    )