import codecs
import re

import requests
from bs4 import BeautifulSoup
def getHtml(url, headers=None):
    """Fetch a book's chapter-index page on biqukan.com and return its chapter list.

    Args:
        url: URL of the book's index page, e.g. 'http://www.biqukan.com/1_1094/'.
        headers: optional dict of HTTP request headers. When omitted, falls back
            to the module-level ``headers`` global defined in the ``__main__``
            block (preserves the original script's behaviour).

    Returns:
        A list of dicts, each with keys 'name' (chapter title) and 'url'
        (absolute chapter URL). Returns an empty list when the expected
        markup is not found.
    """
    if headers is None:
        # Original code read the global defined in the __main__ block.
        headers = globals().get('headers', {})
    # BUG FIX: the second positional argument of requests.get() is `params`,
    # not `headers` — the headers dict was being sent as query parameters
    # and the User-Agent was never applied. Pass it as a keyword.
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.content, 'lxml')
    books_dir = []
    # Parent div; child dl; grandchildren dt (section titles) and dd (one per chapter).
    listing = soup.find('div', class_='listmain')
    if listing:
        dl_items = listing.find('dl')
        dt_num = 0
        for node in dl_items.children:
            tag = str(node.name).strip()
            if tag == 'dt':
                dt_num += 1
            if tag != 'dd':
                continue
            # Only collect chapters after the second <dt>: the first <dt>
            # section on this site is the "latest chapters" teaser block.
            if dt_num == 2:
                link = node.find_all('a')[0]
                books_dir.append({
                    'name': link.get_text(),
                    'url': 'http://www.biqukan.com' + link.get('href'),
                })
    return books_dir
def get_per_address(book_ads, skip_last=1340):
    """Download the chapter text for each entry in *book_ads*, skipping the
    last ``skip_last`` entries.

    Args:
        book_ads: list of ``{'name', 'url'}`` dicts as returned by ``getHtml``.
        skip_last: number of trailing chapters to skip. Default preserves the
            original hard-coded magic number 1340 — presumably chosen to stop
            at a specific chapter of this one book; confirm before reuse.
    """
    for i in range(len(book_ads) - skip_last):
        per_ads = book_ads[i].get('url')
        get_charpter_text(per_ads)


def get_charpter_text(url):
    """Fetch one chapter page, clean its text and append it to a .txt file.

    Relies on two module-level globals defined in the ``__main__`` block:
    ``headers`` (HTTP request headers) and ``x`` (used to build the output
    file name). Prints the cleaned chapter text as a progress indicator.

    Args:
        url: absolute URL of a single chapter page.
    """
    # BUG FIX: headers must be a keyword argument; passed positionally it
    # becomes requests.get()'s `params` argument (query string), not headers.
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, 'lxml')
    x_cnt = soup.find('div', attrs={'id': 'content'})
    cont = x_cnt.get_text()
    # Strip the site's indentation filler: '\r' + eight non-breaking spaces,
    # and ideographic-space pairs used as paragraph indents.
    c = str(cont).strip().replace('\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0', '').replace('\u3000\u3000', '')
    print(c)
    path = 'C:\\Users\\Administrator\\Desktop\\new{}.txt'.format(x + 1)
    # BUG FIX: the original wrote c[0] — only the FIRST CHARACTER of each
    # chapter. Write the whole cleaned text, and use a context manager so
    # the file handle is closed even if write() raises.
    with open(path, 'a+', encoding='utf-8') as file:
        file.write(c)
if __name__ == '__main__':
    # Script entry point: scrape the chapter index of book 1_1094 on
    # biqukan.com, then download each chapter's text.
    index_url = 'http://www.biqukan.com/1_1094/'
    # NOTE: `headers` and `x` are read as module globals by getHtml() /
    # get_charpter_text(), so their names must not change.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/51.0.2704.103 Safari/537.36"
        ),
    }
    x = 5  # suffix seed for the output file name (new{x+1}.txt)
    chapter_index = getHtml(index_url)
    get_per_address(chapter_index)