下载小说前十章源代码

    xiaoxiao2022-07-05  193

    from bs4 import BeautifulSoup import requests import codecs import re

    def getHtml(url):
        """Fetch a novel's index page and collect its chapter links.

        The page is searched for ``<div class="listmain">`` and the ``<dl>``
        inside it. The children of that ``<dl>`` are walked in order: ``<dt>``
        elements are counted, and a ``<dd>`` is collected only while exactly
        two ``<dt>`` elements have been seen so far.
        NOTE(review): presumably the first ``<dt>`` section is a "latest
        chapters" teaser and the second is the full chapter list -- confirm
        against the live page.

        Args:
            url: Address of the novel's index page.

        Returns:
            A list of dicts, one per collected chapter, with the keys
            ``'name'`` (the link text) and ``'url'`` (the link's ``href``
            prefixed with ``http://www.biqukan.com``). The list is empty when
            the expected ``div``/``dl`` markup is not found.
        """
        # `headers` is the module-level request-header dict. It has to be passed
        # by keyword: the second positional parameter of requests.get() is
        # `params`, so passing it positionally sends it as query-string
        # arguments instead of HTTP headers.
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')

        books_dir = []

        # Layout: parent div -> child dl -> dt (section heading) and
        # dd (one per chapter).
        listing = soup.find('div', class_='listmain')
        chapter_list = listing.find('dl') if listing is not None else None
        if chapter_list is None:
            return books_dir

        dt_num = 0
        for node in chapter_list.children:
            # Text nodes have name None; str() turns that into 'None', which
            # matches neither tag below and is skipped.
            tag = str(node.name).strip()
            if tag == 'dt':
                dt_num += 1
                continue
            if tag != 'dd' or dt_num != 2:
                continue

            link = node.find('a')
            href = link.get('href') if link is not None else None
            if not href:
                # A dd without a usable link cannot be downloaded; skip it
                # rather than abort the whole listing.
                continue

            books_dir.append({
                'name': link.get_text(),
                'url': 'http://www.biqukan.com' + href,
            })
        return books_dir

    def get_per_address(book_ads, limit=10):
        """Download the first ``limit`` chapters of a chapter listing.

        Args:
            book_ads: List of chapter dicts that each carry a ``'url'`` key.
            limit: Number of leading chapters to download. The default of 10
                replaces the former ``len(book_ads) - 1340`` bound, which only
                produced ten chapters for a listing of exactly 1350 entries.
                ``None`` downloads every chapter.
        """
        for chapter in book_ads[:limit]:
            get_charpter_text(chapter.get('url'))


    def get_charpter_text(url):
        """Fetch one chapter page, print its text and append it to a file.

        The text of ``<div id="content">`` is stripped, two filler sequences
        (a carriage return, a space and eight non-breaking spaces; and a pair
        of ideographic spaces) are removed, and the result is printed and
        appended as UTF-8 to ``new<x+1>.txt`` on the Administrator desktop,
        where ``x`` is a module-level integer.

        Args:
            url: Address of the chapter page.

        Returns:
            None.
        """
        # Keyword argument on purpose: the second positional parameter of
        # requests.get() is `params`, not `headers`.
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')

        content = soup.find('div', attrs={'id': 'content'})
        if content is None:
            # Without the content container there is nothing to save.
            print('no content found: {}'.format(url))
            return

        text = (
            str(content.get_text())
            .strip()
            .replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '')
            .replace('\u3000\u3000', '')
        )
        print(text)

        # Every backslash is escaped; the resulting path is unchanged, but the
        # invalid "\A" escape sequence is gone.
        path = 'C:\\Users\\Administrator\\Desktop\\new{}.txt'.format(x + 1)
        with open(path, 'a+', encoding='utf-8') as out:
            # Write the whole chapter; indexing the string with [0] stored
            # only its first character.
            out.write(text)

    # Module-level settings read by getHtml() and get_charpter_text(). They are
    # defined outside the __main__ guard so the functions also work when this
    # module is imported instead of being run as a script.
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    # The output file is named new<x+1>.txt.
    x = 5

    if __name__ == '__main__':
        # Index page of the novel to download.
        url = 'http://www.biqukan.com/1_1094/'
        book_ads = getHtml(url)
        get_per_address(book_ads)

    最新回复(0)