Python爬虫--爬取扇贝单词

    xiaoxiao2025-08-02  21

    """

    思路

    爬取所有的单词 分页爬取 写入一个文件里面 然后写入json文件中 最后写入文档 """

    import os import requests from lxml import etree def python(page): #起始的url for page in range(1,int(page)+1): path='./english/python' if not os.path.exists(path): os.makedirs(path) base_url="https://www.shanbay.com/wordlist/110521/232414/?page={}".format(page) response=requests.get(base_url) html=response.text # print(html) #将字符串格式转换成xml格式 html_xml=etree.HTML(html) #获取所有的单词 python_list=html_xml.xpath('//strong/text()') print(python_list) #获取所有的中文解释 china_list=html_xml.xpath('//td[@class="span10"]/text()') print(china_list) big_dic={ '{}页数'.format(page):{} } for i in range(len(china_list)): dic = {} dic['英文']=python_list[i] dic['翻译']=china_list[i] # print(dic) # print(type(dic))#查看数据类型 big_dic.get('{}页数'.format(page))[python_list[i]]=china_list[i] import json path1=path+str(page)+'.txt' with open(path1,'w',encoding='utf-8') as f: f.write(json.dumps(str(big_dic))) if __name__ == '__main__': page=input('请输入要爬取的页数') python(page)

    第二种方式

    ''' 爬取扇贝网python必背词汇表 接口地址:https://www.shanbay.com/wordlist/110521/232414/ 要求:获取所有的python词汇数据,形成字典,然后存储数据

    思路: 第一页:https://www.shanbay.com/wordlist/110521/232414/ 第二页:https://www.shanbay.com/wordlist/110521/232414/?page=2

    '''

    import requests,re,os,json from lxml import etree class Shanbei: def __call__(self, *args, **kwargs): self.get_xml(*args) def get_xml(self,pages): ''' 获取扇贝英语python页 内容 转换为xml :return:获取扇贝英语python页的xml ''' base_url = 'https://www.shanbay.com/wordlist/110521/232414/?page=' for page in range(1,pages+1): base_html = requests.get('https://www.shanbay.com/wordlist/110521/232414/?page=1').text base_xml = etree.HTML(requests.get(base_url+str(page)).text) pages = page # print(base_xml) with open('贝壳应英语.html','w',encoding='utf-8') as f : f.write(base_html) self.get_python(base_html,base_xml,pages) def get_python(self,base_html,base_xml,pages): ''' 处理python页面数据 :param base_xml: :return: ''' #获取标题 # main_title = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][1]/div[@class='span8']/div[@class='well']/div[@class='row']/div[@class='span6']/h4/text()") #胆子获取 worlds_rule = re.compile(r'<h4>(.*?)</h4>',re.S) main_title = worlds_rule.findall(base_html) get_words = re.compile(r'[a-z][A-Z]+',re.I) world_title = (get_words.findall(main_title[0]))[0] # print(base_html) print(world_title) # #获取本页词条数量 trs_length = len(base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row']")) print(trs_length) #准备大字典 shell_word_dict = { '必备单词{}'.format(world_title):{}, } #根据词条数量获取循环次数 for line in range(int(trs_length)): word = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table table-bordered table-striped']/tbody/tr[@class='row'][{}]/td[@class='span2']/strong/text()".format(line+1)) word_description = base_xml.xpath("/html/body/div[@class='container main-body ']/div[@class='row']/div[@class='span8']/div[@class='row'][2]/div[@class='span8']/table[@class='table 
table-bordered table-striped']/tbody/tr[@class='row'][{}]/td[@class='span10']/text()".format(line+1)) # print(word,word_description) shell_word_dict.get('必备单词{}'.format(world_title))[word[0]] = word_description[0] print(shell_word_dict) #储存文件 path = './扇贝单词/'+'第'+str(pages)+'页' +'/' print(path) if not os.path.exists(path): os.makedirs(path) file_path = path+'第'+str(pages)+'页' with open(file_path,'w',encoding='utf-8')as f: f.write(json.dumps(shell_word_dict)) if __name__ == '__main__': # pages = int(input('请输入需要爬取的页数')) pages = 3 shanbei = Shanbei() shanbei(pages)
    最新回复(0)