Python3网络爬虫

    xiaoxiao2023-11-24  122

    # Source: https://blog.csdn.net/u012662731/article/details/78537432
    """Download every chapter of a novel from biqukan.com into one text file."""
    import inspect
    import os
    import sys

    import requests
    from bs4 import BeautifulSoup

    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the whole download forever.
    REQUEST_TIMEOUT = 30


    def PrintLineFileFunc(string=''):
        """Print the caller's file name, function name and line number.

        Args:
            string: Optional extra text appended to the trace line.
        """
        # Index 0 is this function's own frame; index 1 is the caller's frame.
        callerframerecord = inspect.stack()[1]
        frame = callerframerecord[0]
        info = inspect.getframeinfo(frame)
        # basename handles both '/' and '\\' separators, unlike rfind('/').
        filename = os.path.basename(info.filename)
        print("FILE:" + filename + " FUNCTION:" + info.function
              + " LINE:" + str(info.lineno) + ' ' + string)


    class Downloader(object):
        """Collects chapter links from an index page and saves chapter text."""

        def __init__(self):
            self.server = 'https://www.biqukan.com'            # prefix for relative hrefs
            self.target = 'https://www.biqukan.com/1_1094/'    # chapter index page
            self.names = []   # chapter titles, parallel to self.urls
            self.urls = []    # absolute chapter URLs
            self.nums = 0     # number of chapters collected

        def get_download_url(self):
            """Fetch the index page and fill ``names``, ``urls`` and ``nums``.

            Raises:
                requests.RequestException: On network failure or HTTP error.
                IndexError: If the page has no ``div`` with class ``listmain``.
            """
            PrintLineFileFunc()
            req = requests.get(self.target, timeout=REQUEST_TIMEOUT)
            # Fail loudly on 4xx/5xx instead of trying to parse an error page.
            req.raise_for_status()
            html = req.text
            PrintLineFileFunc()
            # Name the parser explicitly: without it bs4 picks whichever parser
            # is installed and emits a warning.
            div_bf = BeautifulSoup(html, 'html.parser')
            div = div_bf.find_all('div', 'listmain')
            # Search the matched tag directly; re-parsing str(div[0]) is redundant.
            a = div[0].find_all('a')
            PrintLineFileFunc()
            # The first 15 links are skipped -- presumably "latest chapters"
            # shortcuts that precede the full list; TODO confirm against the page.
            for each in a[15:]:
                href = each.get('href')
                if href is None:
                    # An anchor without href cannot be downloaded; skip it.
                    continue
                # get_text() never returns None, whereas .string is None when
                # the anchor contains nested markup.
                name = each.get_text()
                print('each.string: ' + name)
                self.names.append(name)
                self.urls.append(self.server + href)
                print('href: ' + href)
            # Count what was actually collected so names/urls/nums stay in sync.
            self.nums = len(self.urls)
            print('self.nums:')
            print(self.nums)

        def get_contents(self, target):
            """Download one chapter page and return its text.

            Args:
                target: Absolute URL of the chapter page.

            Returns:
                The text of the first ``div`` with class ``showtxt``, with each
                run of eight non-breaking spaces replaced by a blank line.

            Raises:
                requests.RequestException: On network failure or HTTP error.
                IndexError: If the page has no ``div`` with class ``showtxt``.
            """
            PrintLineFileFunc(target)
            req = requests.get(target, timeout=REQUEST_TIMEOUT)
            req.raise_for_status()
            PrintLineFileFunc('requested')
            html = req.text
            bf = BeautifulSoup(html, 'html.parser')
            texts = bf.find_all('div', 'showtxt')
            PrintLineFileFunc('dived')
            texts = texts[0].text.replace('\xa0' * 8, '\n\n')
            PrintLineFileFunc('replace')
            return texts

        def writer(self, name, path, text):
            """Append one chapter (title line, body, blank lines) to ``path``.

            Args:
                name: Chapter title written on its own line.
                path: Output file path; opened in append mode as UTF-8.
                text: Chapter body.
            """
            PrintLineFileFunc()
            print('name: ' + name + ' path: ' + path)
            with open(path, 'a', encoding='utf-8') as f:
                f.write(name + '\n')
                # text is a single str; write() emits it in one call where
                # writelines() would iterate it character by character.
                f.write(text)
                f.write('\n\n')


    if __name__ == "__main__":
        PrintLineFileFunc('main')
        dl = Downloader()
        dl.get_download_url()
        PrintLineFileFunc()
        for i, (name, url) in enumerate(zip(dl.names, dl.urls)):
            PrintLineFileFunc(str(i))
            dl.writer(name, 'a.txt', dl.get_contents(url))
            # Scale to a real percentage and count the chapter just written;
            # the fraction alone would print e.g. 0.500% at the halfway point.
            sys.stdout.write("almost downed:%.3f%%" % (100.0 * (i + 1) / dl.nums) + '\r')
            sys.stdout.flush()
        print('download finished')
    最新回复(0)