scrapy 爬取百度知道,一个项目中包含多个 spider,共用一个 pipelines

    xiaoxiao2025-08-30  6

    爬取过程中如果遇到百度针对爬虫的 robots.txt 限制,我们可以在 scrapy 的 settings.py 配置文件中配置

    ROBOTSTXT_OBEY = False

    最终代码

    # -*- coding: utf-8 -*- from scrapy.spider import Spider from scrapy.contrib.spiders import CrawlSpider, Rule #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.linkextractors import LinkExtractor from scrapy.selector import Selector from scrapy.http import Request, HtmlResponse from scrapy import log from items import BDzdItem class BDzdSpider(CrawlSpider): global qa_number; qa_number=0; """爬取百度知道 银行""" log.msg("log",level=log.DEBUG) def _requests_to_follow(self, response): if not isinstance(response, HtmlResponse): return seen = set() for n, rule in enumerate(self._rules): links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen] if links and rule.process_links: links = rule.process_links(links) for link in links: if link.text.find("银行") == -1: continue; seen.add(link) r = Request(url=link.url, callback=self._response_downloaded) r.meta.update(rule=n, link_text=link.text) yield rule.process_request(r) name = "bankSpider" download_delay = 1 allowed_domains = ["zhidao.baidu.com"] start_urls = [ "https://zhidao.baidu.com/question/1796062605517856547.html?fr=iks&word=
    转载请注明原文地址: https://yun.8miu.com/read-140573.html
    最新回复(0)