Scrapy:基于 Redis 的 Bloom Filter 去重。本文代码的去重对象是 item。
class RedisPipeline(object):
    """Scrapy item pipeline that de-duplicates items by their ``app_name``.

    Every non-empty ``app_name`` is looked up in a ``BloomFilter``; names
    that are already present cause the item to be dropped, new names are
    recorded and the item is passed on.
    """

    def __init__(self, redis_uri, redis_db):
        """Remember the connection settings; no connection is opened yet.

        :param redis_uri: passed to ``BloomFilter`` as ``host``.
        :param redis_db: passed to ``BloomFilter`` as ``key`` (the name of
            the filter's key, not a database index).
        """
        self.redis_uri = redis_uri
        self.redis_db = redis_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``REDIS_URI`` / ``REDIS_DB`` settings."""
        settings = crawler.settings
        return cls(redis_uri=settings.get('REDIS_URI'),
                   redis_db=settings.get('REDIS_DB'))

    def open_spider(self, spider):
        """Create the Bloom filter when the spider starts."""
        self.bf = BloomFilter(host=self.redis_uri, key=self.redis_db)

    def process_item(self, item, spider):
        """Return ``item`` if its ``app_name`` is new, otherwise drop it.

        :raises DropItem: when ``app_name`` is missing or empty, or when the
            filter reports that the name was already seen.
        """
        # ``.get`` keeps an item without the field on the DropItem path
        # instead of escaping as a KeyError.
        name = item.get("app_name")
        if not name:
            raise DropItem("{name}空值".format(name=name))
        if self.bf.isContains(name):
            raise DropItem("{name}已经存在".format(name=name))
        self.bf.insert(name)
        return item

    def close_spider(self, spider):
        """Close the Bloom filter when the spider finishes."""
        self.bf.close()


# In settings.py (the entries this pipeline relies on are listed below):
1、
#redis config
REDIS_URI="localhost"
REDIS_DB="wandoujia"
2、
ITEM_PIPELINES = { 'wandoujiaScrapy.pipelines.RedisPipeline': 200, }
redis储存
# coding=utf-8
import redis
from hashlib import md5
from wandoujiaScrapy.settings import *


class SimpleHash(object):
    """Seeded string hash whose result is masked with ``cap - 1``."""

    def __init__(self, cap, seed):
        """
        :param cap: size of the target bit space; the final mask ``cap - 1``
            only acts as a modulo when ``cap`` is a power of two.
        :param seed: multiplier that distinguishes one hash function from
            another.
        """
        self.cap = cap
        self.seed = seed

    def hash(self, value):
        """Return the bit offset for ``value`` (a string)."""
        ret = 0
        for ch in value:
            ret += self.seed * ret + ord(ch)
        return (self.cap - 1) & ret


class BloomFilter(object):
    """Bloom filter stored in Redis bit strings."""

    def __init__(self, host='localhost', port=6379, db=0, blockNum=1, key='bloomfilter'):
        """
        :param host: the host of Redis
        :param port: the port of Redis
        :param db: which db in Redis
        :param blockNum: one blockNum for about 90,000,000; if you have more
            strings for filtering, increase it.
        :param key: the key's name in Redis
        """
        self.server = redis.Redis(host=host, port=port, db=db)
        # A Redis string holds at most 512 MB; 1 << 31 bits uses 256 MB.
        self.bit_size = 1 << 31
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.key = key
        self.blockNum = blockNum
        self.hashfunc = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

    def _locate(self, str_input):
        """Return ``(redis_key, bit_offsets)`` for ``str_input``."""
        # md5 needs bytes, so encode explicitly.
        digest = md5(str_input.encode("utf8")).hexdigest()
        # The first digest byte selects which block (Redis key) is used.
        name = self.key + str(int(digest[0:2], 16) % self.blockNum)
        return name, [f.hash(digest) for f in self.hashfunc]

    def isContains(self, str_input):
        """Return True if every bit for ``str_input`` is set, else False.

        Empty input is never reported as present. Lookup stops at the first
        unset bit, so a miss does not pay for all hash functions.
        """
        if not str_input:
            return False
        name, offsets = self._locate(str_input)
        return all(self.server.getbit(name, loc) for loc in offsets)

    def insert(self, str_input):
        """Set every bit that corresponds to ``str_input``."""
        name, offsets = self._locate(str_input)
        for loc in offsets:
            self.server.setbit(name, loc, 1)

    def close(self):
        """Remove this filter's data from Redis.

        Only the filter's own block keys are deleted; other keys living in
        the same Redis database are left untouched (a full ``flushdb`` would
        wipe them as well).
        """
        names = [self.key + str(block) for block in range(self.blockNum)]
        if names:
            self.server.delete(*names)

    def __del__(self):
        # __init__ may have failed before ``server`` was assigned; in that
        # case there is nothing to clean up.
        if getattr(self, "server", None) is not None:
            self.close()


if __name__ == '__main__':
    # Demo: look up a URL and insert it when it is missing.
    # NOTE: the filter removes its keys when it is closed or garbage
    # collected, so each fresh run is expected to start from an empty filter
    # and print 'not exists!'.
    bf = BloomFilter()
    if bf.isContains('http://www.baidu.com'):  # is the string already known?
        print('exists!')
    else:
        print('not exists!')
        bf.insert('http://www.baidu.com')