# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from pymongo import MongoClient
class IpPipeline(object):
    """Item pipeline that stores scraped proxy items in MongoDB.

    Writes each item into the ``test.proxy`` collection. Enable it via the
    ITEM_PIPELINES setting (see the header comment above).
    """

    def open_spider(self, spider):
        """Open a MongoDB connection once when the spider starts.

        NOTE(review): MongoClient() with no args assumes a local mongod on
        the default port — confirm against deployment config.
        """
        client = MongoClient()
        self.collection = client["test"]["proxy"]

    def process_item(self, item, spider):
        """Persist one scraped item and pass it along the pipeline chain.

        :param item: the scraped item (dict-like); skipped when None.
        :param spider: the spider that produced the item (unused).
        :returns: the item, unchanged, for any later pipelines.
        """
        if item is not None:
            # Bug fix: original read ``self.collections`` (a typo for the
            # attribute set in open_spider), raising AttributeError per item.
            # insert_one replaces the deprecated Collection.insert (removed
            # in pymongo 4.x).
            self.collection.insert_one(dict(item))
        return item
# -*- coding: utf-8 -*-
import scrapy
from ip.items import IpItem
import re
class XiciSpider(scrapy.Spider):
    """Scrape fast proxies (response time < 0.5s) from xicidaili.com.

    Each qualifying table row becomes an IpItem with ``time``, ``ip``,
    ``port`` and ``type`` fields; the first 10 listing pages are crawled.
    """

    name = 'xici'
    allowed_domains = ['xicidaili.com']
    start_urls = ['https://www.xicidaili.com/nn']

    def parse(self, response):
        """Parse one listing page; yield fast-proxy items and page requests."""
        rows = response.xpath("//table[@id='ip_list']//tr")
        # rows[0] is the table header, so skip it.
        for row in rows[1:]:
            speed_text = row.xpath("./td[7]/div/@title").extract_first()
            if speed_text is None:
                # Bug fix: original crashed with TypeError in re.sub when
                # a row had no speed cell — skip such rows instead.
                continue
            # Strip the trailing "秒" ("seconds") suffix before parsing.
            speed = float(re.sub("秒", "", speed_text))
            if speed < 0.5:
                item = IpItem()
                item["time"] = speed
                item["ip"] = row.xpath("./td[2]/text()").extract_first()
                item["port"] = row.xpath("./td[3]/text()").extract_first()
                item["type"] = row.xpath("./td[6]/text()").extract_first()
                # Bug fix: original only print()ed the item, so it never
                # reached the item pipeline; yield it instead.
                yield item
        # Crawl the first 10 listing pages directly (fixed-range pagination).
        for page in range(10):
            yield scrapy.Request(
                url='https://www.xicidaili.com/nn/' + str(page),
                callback=self.parse,
            )