Python3 爬虫：Selenium + Chrome 爬取今日头条街拍信息

xiaoxiao2022-07-04 178

初次写这个，有借鉴别人的地方，写得不好勿喷。下面直接上代码：爬取的 url、title、img、comment、source 以字典的形式存储在列表里，需要的时候遍历提取就好。

import re
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

start_url = "https://www.toutiao.com/search/?keyword=街拍"

# Scraped records accumulate here, one dict per article link, with the keys
# "url" and "title" plus, when available, "source", "comment" and "view_img".
# Iterate over the list to read the results.
info_list = []

# Extracts the preview-image URL from an article card's inner HTML.
_IMG_SRC_RE = re.compile(r'<img alt="" src="(.*?)">')


def Verification_Code(driver):
    """Close the verification pop-up, if one is shown, and reload the page.

    This is a best-effort step: when the element with class ``icon``
    (presumably the pop-up's close button -- confirm against the live page)
    cannot be found or clicked, the function prints a notice and returns
    without raising.

    Args:
        driver: The Selenium WebDriver whose first window shows the page.
    """
    try:
        time.sleep(2)
        driver.switch_to.window(driver.window_handles[0])
        driver.find_element(By.CLASS_NAME, "icon").click()
        time.sleep(2)
        driver.refresh()
        time.sleep(3)
        driver.switch_to.window(driver.window_handles[0])
    except (WebDriverException, IndexError):
        # Only driver-level failures mean "no captcha"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        print("无验证码")


class TouTiao():
    """Scrape the Toutiao search results for ``start_url`` with Chrome."""

    def __init__(self):
        """Start a maximized Chrome session with a 10 s implicit wait."""
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        self.driver.implicitly_wait(10)
        self.url = start_url

    def _count_cards(self):
        """Return how many article cards are currently in the page."""
        return len(self.driver.find_elements(By.CLASS_NAME, "articleCard"))

    def take_data_last(self):
        """Collect every loaded article into ``info_list`` and print it.

        Links, sources, comment counts and cards are located independently
        and paired by position, so the pairing assumes the page lists them
        in the same order. Records are built in a local list first, which
        keeps the pairing correct even if this method is called repeatedly.
        """
        driver = self.driver

        # Article URL and title.
        entries = [
            {"url": link.get_attribute("href"), "title": link.text}
            for link in driver.find_elements(By.CLASS_NAME, "link")
        ]

        # Article source. zip() stops at the shorter sequence, so a length
        # mismatch between the lists can no longer raise IndexError.
        sources = driver.find_elements(By.CLASS_NAME, "J_source")
        for entry, source in zip(entries, sources):
            entry["source"] = source.text

        # Comment count.
        comments = driver.find_elements(By.CLASS_NAME, "comment")
        for entry, comment in zip(entries, comments):
            entry["comment"] = comment.text

        # Preview image: the last matching <img> in the card, if any.
        cards = driver.find_elements(By.CLASS_NAME, "articleCard")
        for entry, card in zip(entries, cards):
            matches = _IMG_SRC_RE.findall(card.get_attribute("innerHTML") or "")
            if matches:
                entry["view_img"] = matches[-1]

        info_list.extend(entries)
        print(info_list)
        print(len(info_list))

    def load_data(self):
        """Open the search page, scroll until no new cards load, then scrape."""
        self.driver.get(self.url)
        time.sleep(1)
        # Dismiss the captcha pop-up (if any) and refresh the page.
        Verification_Code(self.driver)
        self.driver.execute_script("window.scrollTo(0,1000);")
        time.sleep(1)
        while True:
            before_num = self._count_cards()
            # Scroll to the bottom a few times so lazy-loaded cards appear.
            for _ in range(3):
                self.driver.execute_script(
                    "window.scrollTo(0,document.body.scrollHeight);"
                )
                time.sleep(3)
            # An unchanged card count means everything has been loaded.
            if self._count_cards() == before_num:
                self.take_data_last()
                break

    def close_browser(self):
        """Wait five seconds, then quit the browser session."""
        time.sleep(5)
        self.driver.quit()

    def main(self):
        """Run the scrape and always shut the browser down afterwards."""
        try:
            self.load_data()
        finally:
            # Quit even when scraping fails so no Chrome process is leaked.
            self.close_browser()


if __name__ == '__main__':
    toutiao = TouTiao()
    toutiao.main()

最后的效果，由于是新手，代码都是自己写的，可以参考一下，代码和功能还有许多改进之处，希望大佬多多指正。

最新回复(0)