Scraping Maoyan Movies, Fangwang, the Guba forum, Baidu Translate, Youdao Translate, Amap Weather, ChinaAMC, Shanbay, and Qiushibaike with Python (Qiushibaike part)

    xiaoxiao · 2025-07-17

'''
3. Qiushibaike: xpath
http://www.qiushibaike.com/8hr/page/1
For every post on the list page, grab the image, the user's nickname,
the joke text, the like count and the comment count.
Optional: pagination; write the results to a JSON file.
'''
import json
import os

import requests
from lxml import etree


class Qiubai:
    def __call__(self, pages):
        self.get_xml(pages)

    def get_xml(self, pages):
        for page in range(1, pages + 1):
            base_url = 'https://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/73.0.3683.103 Safari/537.36',
            }
            # Parse the page into an element tree.
            xml = etree.HTML(requests.get(base_url, headers=headers).text)
            self.get_data(xml, page)
            # The "next" button of the pagination bar; its text switches to
            # '更多' ("more") on the last page, which is our stop signal.
            end_signal = xml.xpath("//ul[@class='pagination']/li/a/span[@class='next']/text()")
            print('————————————————————', base_url)
            print('————————————————————', page, end_signal)
            if (end_signal and end_signal[0] == '\n更多\n') or page == pages:
                print('################## 下载完成 ##################')
                print(f'################## 共爬取前{page}页 ##################')
                break

    def get_data(self, xml, page):
        path = './糗百/' + str(page) + '/'
        if not os.path.exists(path):
            os.makedirs(path)
        print(f'———————— 开始爬取第{page}页 ————————')
        # One dict per page; each post is added under a dynamic key below.
        page_info_dict = {f'第{page}页': {}}
        # Every <li> under the recommend-article list is one post.
        small_divs = xml.xpath("//div[@class='recommend-article']/ul/li")
        for index, small_div in enumerate(small_divs):
            # Relative XPaths, evaluated from the current <li>. Each query
            # falls back to a one-element list so that [0] is always safe.
            # Title of the recommended post.
            div_title = small_div.xpath("./div[@class='recmd-right']/a[@class='recmd-content']/text()") or ['未获取到数据']
            # Cover image URL.
            div_imgs_path = small_div.xpath("./a/img/@src") or ['未获取到数据']
            # Nickname of the poster.
            div_username = small_div.xpath("./div[@class='recmd-right']/div[@class='recmd-detail clearfix']/a[@class='recmd-user']/span[@class='recmd-name']/text()") or ['未获取到数据']
            # In the recmd-num block, span[1] is the like ("好笑") count
            # and span[4] is the comment count.
            div_likes = small_div.xpath("./div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[1]/text()") or ['未获取到数据']
            div_comment = small_div.xpath("./div[@class='recmd-right']/div[@class='recmd-detail clearfix']/div[@class='recmd-num']/span[4]/text()") or ['未获取到数据']
            small_div_infos = {
                '标题': div_title[0],
                '首图地址': div_imgs_path[0],
                '用户名': div_username[0],
                '好笑': div_likes[0],
                '评论数量': div_comment[0],
            }
            # Dynamic key/value pair: one entry per post.
            page_info_dict[f'第{page}页'][f'第{index + 1}条'] = small_div_infos
        # ensure_ascii=False keeps the Chinese text readable in the file.
        json_data = json.dumps(page_info_dict, ensure_ascii=False, indent=4)
        file_name = path + '第' + str(page) + '页.json'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(json_data)
        print(json_data)


if __name__ == '__main__':
    pages = int(input('请输入需要爬取的页数'))
    qiubai = Qiubai()
    qiubai(pages)
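The field-by-field extraction above can be exercised offline against a static HTML string, which is handy for debugging XPaths without hitting the site. Here is a minimal sketch; the HTML fragment is made up for illustration and only mimics the recmd-* class names the live list page used:

# Offline sketch of the per-<li> extraction pattern (hypothetical HTML).
from lxml import etree

html = '''
<div class="recommend-article"><ul>
  <li>
    <a href="/article/1"><img src="//pic.example.com/1.jpg"/></a>
    <div class="recmd-right">
      <a class="recmd-content">A short joke title</a>
      <div class="recmd-detail clearfix">
        <a class="recmd-user"><span class="recmd-name">user_a</span></a>
        <div class="recmd-num">
          <span>1024</span><span>好笑</span><span>·</span><span>36</span><span>评论</span>
        </div>
      </div>
    </div>
  </li>
</ul></div>
'''

tree = etree.HTML(html)
for li in tree.xpath("//div[@class='recommend-article']/ul/li"):
    # Relative XPath from each <li>, with a list fallback so [0] is safe.
    title = li.xpath("./div[@class='recmd-right']/a[@class='recmd-content']/text()") or ['n/a']
    img = li.xpath("./a/img/@src") or ['n/a']
    user = li.xpath(".//span[@class='recmd-name']/text()") or ['n/a']
    likes = li.xpath(".//div[@class='recmd-num']/span[1]/text()") or ['n/a']
    comments = li.xpath(".//div[@class='recmd-num']/span[4]/text()") or ['n/a']
    print(title[0], img[0], user[0], likes[0], comments[0])

On the persistence side, note that json.dumps escapes non-ASCII characters by default, so the Chinese keys and titles would otherwise be written as \uXXXX sequences; passing ensure_ascii=False (together with a file opened with encoding='utf-8') keeps them readable:

import json

data = {'标题': '示例'}
print(json.dumps(data))                      # {"\u6807\u9898": "\u793a\u4f8b"}
print(json.dumps(data, ensure_ascii=False))  # {"标题": "示例"}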