Use the Scrapy framework to crawl chapter data for the novel Daomubiji (盗墓笔记) and store it in a MongoDB database. The project has four pieces: the MongoDB settings, the item definition, the CrawlSpider, and the storage pipeline.
# settings.py - MongoDB connection settings
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'MySpider'
MONGODB_DOCNAME = 'daomubiji'

# items.py - fields to extract
import scrapy

class NovelItem(scrapy.Item):
    bookName = scrapy.Field()
    bookTitle = scrapy.Field()
    chapterNum = scrapy.Field()
    chapterName = scrapy.Field()
    chapterUrl = scrapy.Field()

# spider - crawl the data
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from novel.items import NovelItem

class DaomubijiSpider(CrawlSpider):
    name = 'daomubiji'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    rules = (
        # Follow every book link inside the article content area
        Rule(LinkExtractor(restrict_xpaths='//article[@class="article-content"]//a'),
             callback='parse_item', follow=True),
    )

    def parse_start_url(self, response):
        # The start page only lists the books; nothing to extract here
        pass

    def parse_item(self, response):
        for body in response.xpath('//body'):
            # Page heading is "<book name>:<subtitle>"; keep the part before the colon
            book_name = body.xpath('.//h1[@class="focusbox-title"]/text()').get().split(':')[0]
            for article in body.xpath('.//div[@class="excerpts"]//article'):
                # Link text is space-separated: "<book title> <chapter number> <chapter name>"
                parts = article.xpath('.//a/text()').get().split(' ')
                # Create a fresh item per chapter; reusing one item would make
                # every yielded reference point at the same mutated object
                item = NovelItem()
                item['bookName'] = book_name
                item['bookTitle'] = parts[0]
                item['chapterNum'] = parts[1]
                item['chapterName'] = parts[2]
                item['chapterUrl'] = article.xpath('.//a/@href').get()
                yield item

# pipeline - store items in MongoDB
import pymongo

class NovelPipeline(object):
    def open_spider(self, spider):
        # scrapy.conf was removed in Scrapy 1.x; read settings from the spider instead
        settings = spider.settings
        self.client = pymongo.MongoClient(
            host=settings.get('MONGODB_HOST'),
            port=settings.getint('MONGODB_PORT'))
        db = self.client[settings.get('MONGODB_DBNAME')]
        self.post = db[settings.get('MONGODB_DOCNAME')]
        print('This spider is starting!')

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo 3+; use insert_one()
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
        print('This spider has finished!')
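Scrapy only calls the pipeline if it is registered in settings.py. A minimal sketch, assuming the project package is named novel and the pipeline class lives in novel/pipelines.py (consistent with the from novel.items import above):

# settings.py - register the pipeline so Scrapy actually runs it
ITEM_PIPELINES = {
    'novel.pipelines.NovelPipeline': 300,  # 300 is an arbitrary priority (0-1000)
}

The crawl is then started from the project root using the spider's name attribute:

scrapy crawl daomubiji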
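To spot-check what landed in MongoDB after the crawl, a quick pymongo query can be run against the same connection settings as above (a sketch; count_documents requires pymongo 3.7+):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['MySpider']['daomubiji']
print(collection.count_documents({}))       # total chapters stored
print(collection.find_one({}, {'_id': 0}))  # one sample document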