Python爬虫--爬取扇贝单词

    xiaoxiao2025-08-02  21

    """

    思路

    爬取所有的单词 分页爬取 写入一个文件里面 然后写入json文件中 最后写入文档 """

    import os import requests from lxml import etree def python(page): #起始的url for page in range(1,int(page)+1): path='./english/python' if not os.path.exists(path): os.makedirs(path) base_url="https://www.shanbay.com/wordlist/110521/232414/?page={}".format(page) response=requests.get(base_url) html=response.text # print(html) #将字符串格式转换成xml格式 html_xml=etree.HTML(html) #获取所有的单词 python_list=html_xml.xpath('//strong/text()') print(python_list) #获取所有的中文解释 china_list=html_xml.xpath('//td[@class="span10"]/text()') print(china_list) big_dic={ '{}页数'.format(page):{} } for i in range(len(china_list)): dic = {} dic['英文']=python_list[i] dic['翻译']=china_list[i] # print(dic) # print(type(dic))#查看数据类型 big_dic.get('{}页数'.format(page))[python_list[i]]=china_list[i] import json path1=path+str(page)+'.txt' with open(path1,'w',encoding='utf-8') as f: f.write(json.dumps(str(big_dic))) if __name__ == '__main__': page=input('请输入要爬取的页数') python(page)

    第二种方式

    ''' 爬取扇贝网python必背词汇表 接口地址:https://www.shanbay.com/wordlist/110521/232414/ 要求:获取所有的python词汇数据,形成字典,然后存储数据

    思路: 第一页:https://www.shanbay.com/wordlist/110521/232414/ 第二页:https://www.shanbay.com/wordlist/110521/232414/?page=2

    '''

    import requests,re,os,json from lxml import etree class Shanbei: def __call__(self, *args, **kwargs): self.get_xml(*args) def get_xml(self,pages): ''' 获取扇贝英语python页 内容 转换为xml :return:获取扇贝英语python页的xml ''' base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=' for page in range(1,pages+1): base_html = requests.get('https://www.shanbay.com/wordlist/110521/232414/?page=1').text base_xml = etree.HTML(requests.get(base_url+str(page)).text) pages = page # print(base_xml) with open('贝壳应英语.html','w',encoding='utf-8') as f : f.write(base_html) self.get_python(base_html,base_xml,pages) def get_python(self,base_html,base_xml,pages): ''' 处理python页面数据 :param base_xml: :return: ''' #获取标题 # main_title = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][1]/div[@class='span8']/div[@class='well']/div[@class='row']/div[@class='span6']/h4/text()") #胆子获取 worlds_rule = re.compile(r'<h4>(.*?)</h4>',re.S) main_title = worlds_rule.findall(base_html) get_words = re.compile(r'[a-z][A-Z]+',re.I) world_title = (get_words.findall(main_title[0]))[0] # print(base_html) print(world_title) # #获取本页词条数量 trs_length = len(base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row']")) print(trs_length) #准备大字典 shell_word_dict = { '必备单词{}'.format(world_title):{}, } #根据词条数量获取循环次数 for line in range(int(trs_length)): word = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row'][{}]/td[@class='span2']/strong/text()".format(line+1)) word_description = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table 
table-bordered table-striped']/tbody/tr[@class='row'][{}]/td[@class='span10']/text()".format(line+1)) # print(word,word_description) shell_word_dict.get('必备单词{}'.format(world_title))[word[0]] = word_description[0] print(shell_word_dict) #储存文件 path = './扇贝单词/'+'第'+str(pages)+'页' +'/' print(path) if not os.path.exists(path): os.makedirs(path) file_path = path+'第'+str(pages)+'页' with open(file_path,'w',encoding='utf-8')as f: f.write(json.dumps(shell_word_dict)) if __name__ == '__main__': # pages = int(input('请输入需要爬取的页数')) pages = 3 shanbei = Shanbei() shanbei(pages)
    最新回复(0)