Don't repeat yourself — 温故而知新

    xiaoxiao  2024-12-12  17

    # Novel downloader: read a chapter index page, follow every chapter link, append the text to "<name>.txt".
import re
from urllib.parse import urljoin

# Browser-like User-Agent sent with every request.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) "
                  "AppleWebKit/604.1.34 (KHTML, like Gecko) Versio"
                  "n/11.0 Mobile/15A5341f Safari/604.1",
}

# Seconds to wait for the server before a request is treated as failed.
_TIMEOUT = 10


def fileWrite(title, message, name):
    """Append one chapter to ``<name>.txt`` and report success on stdout.

    Args:
        title: Chapter title, written on its own line.
        message: Chapter body, written after the title.
        name: Output path without the ``.txt`` suffix.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # UTF-8 is forced so the text is written identically on every platform
    # instead of depending on the locale's default codec.
    with open(name + ".txt", "a", encoding="utf-8") as f:
        f.write(title + "\n" + message + "\n")
    print(title, "下载成功!")


def getWebPage(url):
    """Download *url* and return the decoded page text.

    Returns:
        The response body as ``str``, or ``None`` when the request fails
        (an error message is printed in that case).
    """
    # Imported here so the parsing helpers stay importable without the
    # third-party dependency.
    import requests

    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        res = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
        # Let requests guess the charset from the payload.
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        print("页面请求出错!")
        return None


def getLink(page, url, paternName, paternLink):
    """Extract the book name and the chapter links from an index page.

    Args:
        page: HTML text of the index page (``None`` is reported as an error).
        url: URL the page was fetched from; relative links are resolved
            against it.
        paternName: Regex whose first match is the book name.
        paternLink: Regex whose matches are the chapter hrefs.

    Returns:
        ``(name, links)`` where ``links`` is an iterator of absolute URLs,
        or ``None`` when extraction fails (an error message is printed).
    """
    try:
        links = re.findall(paternLink, page, re.M | re.S | re.I)
        # Turn relative hrefs into absolute URLs.
        links = iter([urljoin(url, link) for link in links])
        name = re.findall(paternName, page)[0]
        return name, links
    except (re.error, TypeError, ValueError, IndexError):
        print("提取页面信息错误!")
        return None


def getContent(name, link, paternTitle, paternBody):
    """Download one chapter page and append its text to the book file.

    Args:
        name: Book name; the chapter is appended to ``<name>.txt``.
        link: Absolute URL of the chapter page.
        paternTitle: Regex whose first match is the chapter title.
        paternBody: Regex whose first match is the chapter body HTML.

    Failures are reported on stdout and the chapter is skipped, so one bad
    page does not abort the whole download.
    """
    page = getWebPage(link)
    try:
        title = re.findall(paternTitle, page)[0]
        content = re.findall(paternBody, page)[0]
        # Normalise non-breaking spaces (entity or raw U+00A0) and turn HTML
        # line breaks into newlines.
        # NOTE(review): the published snippet showed replace(" ", " "); it is
        # assumed the "&nbsp;" literal was decoded by the page render - confirm.
        content = (
            content.replace("&nbsp;", " ")
            .replace("\xa0", " ")
            .replace("<br />", "\n")
        )
        fileWrite(title, content, name)
    except (re.error, TypeError, IndexError, AttributeError, OSError):
        print("提取小说页面错误!")


def main(url, paternName, paternLink, paternTitle, paternBody):
    """Download every chapter linked from the index page at *url*.

    Args:
        url: URL of the book's chapter index.
        paternName: Regex for the book name on the index page.
        paternLink: Regex for chapter hrefs on the index page.
        paternTitle: Regex for the chapter title on a chapter page.
        paternBody: Regex for the chapter body on a chapter page.
    """
    page = getWebPage(url)
    result = getLink(page, url, paternName, paternLink)
    if result is None:
        # The index could not be fetched or parsed; the helpers have already
        # printed the reason, so there is nothing to download.
        return
    name, links = result
    for link in links:
        getContent(name, link, paternTitle, paternBody)
    print("小说下载完毕!")


if __name__ == '__main__':
    paternName = "<h1>(.*?)</h1>"
    paternLink = "<dd><a href='(.*?)' >.*?</a></dd>"
    paternTitle = "<h1>(.*?)</h1>"
    paternBody = '<div id="content">(.*?)<p>.*?</p></div>'
    main("http://www.xbiquge.la/2/2208/", paternName, paternLink, paternTitle, paternBody)
    最新回复(0)