Scraping Maoyan Movies, Fangwang, Guba forums, Baidu Translate, Youdao Translate, Amap Weather, ChinaAMC funds, Shanbay vocabulary, and Qiushibaike with Python (this post: Shanbay vocabulary)

    xiaoxiao · 2025-07-16

    '''
    Scrape the Shanbay "essential Python vocabulary" word list.
    Endpoint: https://www.shanbay.com/wordlist/110521/232414/
    Goal: collect all of the Python vocabulary entries into a dict, then save the data.
    Approach (pagination pattern):
    page 1: https://www.shanbay.com/wordlist/110521/232414/
    page 2: https://www.shanbay.com/wordlist/110521/232414/?page=2
    '''
    import json
    import os
    import re

    import requests
    from lxml import etree


    class Shanbei:
        def __call__(self, *args, **kwargs):
            self.get_xml(*args)

        def get_xml(self, pages):
            '''Fetch each Shanbay word-list page and parse it into an lxml tree.'''
            base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page='
            for page in range(1, pages + 1):
                # Fetch the current page once and reuse the text for both the
                # regex pass and the lxml tree. (The original fetched page 1
                # a second time and used that for the raw HTML regardless of
                # which page was being scraped.)
                base_html = requests.get(base_url + str(page)).text
                base_xml = etree.HTML(base_html)
                # Keep a local copy of each page for debugging; one file per
                # page so later pages do not overwrite earlier ones.
                with open(f'shanbay_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(base_html)
                self.get_python(base_html, base_xml, page)

        def get_python(self, base_html, base_xml, page):
            '''Extract the word entries from one page and save them as JSON.'''
            # Pull the list title out of the <h4> heading with a regex.
            title_rule = re.compile(r'<h4>(.*?)</h4>', re.S)
            main_title = title_rule.findall(base_html)
            # The original pattern r'[a-z][A-Z]+' with re.I matches any run of
            # two or more letters anyway; a plain [A-Za-z]+ says the same thing.
            get_words = re.compile(r'[A-Za-z]+')
            word_title = get_words.findall(main_title[0])[0]
            print(word_title)
            # Count the word rows on this page.
            row_xpath = ("/html/body/div[@class='container main-body ']/div[@class='row']"
                         "/div[@class='span8']/div[@class='row'][2]/div[@class='span8']"
                         "/table[@class='table table-bordered table-striped']/tbody"
                         "/tr[@class='row']")
            trs_length = len(base_xml.xpath(row_xpath))
            print(trs_length)
            # Top-level dict that will hold every entry on this page.
            shell_word_dict = {f'essential_words_{word_title}': {}}
            # Walk the rows one by one: the first cell holds the word,
            # the second cell holds its definition.
            for line in range(trs_length):
                word = base_xml.xpath(row_xpath + "[{}]/td[@class='span2']/strong/text()".format(line + 1))
                word_description = base_xml.xpath(row_xpath + "[{}]/td[@class='span10']/text()".format(line + 1))
                shell_word_dict[f'essential_words_{word_title}'][word[0]] = word_description[0]
            print(shell_word_dict)
            # Save the finished dict. Note: this write sits outside the row
            # loop, so each page is written exactly once.
            path = './shanbay_words/page_' + str(page) + '/'
            print(path)
            if not os.path.exists(path):
                os.makedirs(path)
            file_path = path + 'page_' + str(page) + '.json'
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(json.dumps(shell_word_dict, ensure_ascii=False))


    if __name__ == '__main__':
        # pages = int(input('How many pages should be scraped? '))
        pages = 3
        shanbei = Shanbei()
        shanbei(pages)
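
    The absolute XPath chains above break as soon as any wrapper div on the page changes. As a minimal alternative sketch, the same rows can be located with relative XPath. This assumes the page still renders each entry as a <tr class="row"> whose first cell holds the word in a <strong> and whose second cell holds the definition; the class names and URL are taken from the script above, not verified against the live site:

    import requests
    from lxml import etree

    WORDLIST_URL = 'https://www.shanbay.com/wordlist/110521/232414/'

    def fetch_words(page):
        '''Return {word: definition} for one word-list page (hypothetical helper).'''
        html = etree.HTML(requests.get(WORDLIST_URL, params={'page': page}).text)
        words = {}
        # Relative XPath: find the rows wherever the table sits in the page.
        for row in html.xpath("//table//tr[@class='row']"):
            word = row.xpath("./td[1]/strong/text()")
            desc = row.xpath("./td[2]/text()")
            if word and desc:
                words[word[0].strip()] = desc[0].strip()
        return words

    if __name__ == '__main__':
        print(fetch_words(1))

    Relative expressions like //table//tr survive cosmetic layout changes that would make the absolute paths above silently return empty lists.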