今天有一位同学找我爬取安居客的房价信息。安居客网站有反爬虫机制,不能简单地用requests直接请求,所以这里用selenium驱动浏览器来获取网页内容。
环境
macOS,Python 3.7,selenium,lxml,BeautifulSoup(bs4)
代码
#加载packages
from selenium import webdriver
from lxml import etree
import urllib.request
# Build a urllib request for the Shanghai price page with browser-like
# headers (user agent, accepted content types, preferred language).
url = "https://shanghai.anjuke.com/fangjia/"
req = urllib.request.Request(
    url,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8",
    },
)
# Create the selenium WebDriver instance, using Chrome as the browser.
driver = webdriver.Chrome()
import queue

# Yearly nationwide price pages to start from, one URL per year in the
# inclusive range FIRST_YEAR..LAST_YEAR (2010-2018).
FIRST_YEAR = 2010
LAST_YEAR = 2018
link1_all = [
    'https://www.anjuke.com/fangjia/quanguo' + str(year) + '/'
    for year in range(FIRST_YEAR, LAST_YEAR + 1)
]

# FIFO work queue of pages still to be fetched, seeded with the yearly pages
# in chronological order.
q = queue.Queue()
for link in link1_all:
    q.put(link)
%%time
from tqdm import tqdm
from bs4 import BeautifulSoup
import csv

# CSV header: year, month, price, change rate, city, source link.
name = ['年份', '月份', '房价', '涨跌幅', '市', '链接']
# Per-column accumulators kept at module level so they can be inspected after
# the crawl has finished. info_list is not filled by this block.
info_list = []
prices = []
zones = []
tendencies = []
links = []
years = []


def _tag_text(parent, tag_name):
    """Return the text of the first <tag_name> under parent, or '' if missing.

    The value is converted to a plain str so that stored results do not keep
    a reference to the parsed page they came from.
    """
    tag = parent.find(tag_name)
    if tag is None or tag.string is None:
        return ''
    return str(tag.string)


def _split_period(label):
    """Split a label shaped like '<year>年<month>月<zone>' into its parts.

    Returns a (year, month, zone) tuple, or None when the label does not
    contain exactly one '年' followed by at least one '月'.
    """
    pieces = label.split('年')
    if len(pieces) != 2:
        return None
    year, month_zone = pieces
    month_parts = month_zone.split('月')
    if len(month_parts) < 2:
        return None
    return year, month_parts[0], month_parts[1]


def _page_city(soup):
    """Return the text between '年' and '房' in the first <h2>, or ''."""
    heading = soup.find('h2')
    if heading is None or heading.string is None:
        return ''
    after_year = heading.string.split('年')
    if len(after_year) < 2:
        return ''
    return str(after_year[1].split('房')[0])


# URLs already fetched; rows link to further pages, so the same URL can be
# queued more than once and must not be crawled (and written) twice.
seen = set()

# The with-block guarantees the file is closed even if the crawl is
# interrupted; utf-8 is stated explicitly because the header is Chinese text.
with open('output.csv', 'w+', encoding='utf-8', newline='') as file:
    # csv.writer quotes values that contain commas; '\n' keeps the same line
    # ending the rows were written with before.
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(name)

    while not q.empty():
        current_url = q.get()
        if current_url in seen:
            continue
        seen.add(current_url)
        print("retrive:" + current_url)

        try:
            driver.get(current_url)
        except Exception as exc:
            # Best effort: report the failed page and move on to the next
            # one, without swallowing KeyboardInterrupt/SystemExit.
            print("skip " + current_url + ": " + repr(exc))
            continue

        # Parse the rendered page directly with the lxml parser. Serialising
        # through etree.tostring first would emit XML-style self-closing tags
        # that an HTML parser then reads as unclosed elements.
        bsObj = BeautifulSoup(driver.page_source, 'lxml')

        city = _page_city(bsObj)
        if not city:
            print("no city heading found on " + current_url)

        # Rows are collected in the order: rising, falling, unchanged.
        info = []
        for css_class in ('clearfix up', 'clearfix down', 'clearfix nochange'):
            info.extend(bsObj.find_all('li', class_=css_class))

        for line in info:
            # Follow the row's link (unless it is a javascript pseudo-link)
            # even when the row itself turns out not to be a data row.
            anchor = line.a
            href = anchor.get('href') if anchor is not None else None
            if href and 'javascript' not in href:
                q.put(href)

            period = _split_period(_tag_text(line, 'b'))
            if period is None:
                continue
            year, month, zone = period

            tendency = _tag_text(line, 'em')
            price = _tag_text(line, 'span')

            years.append(year)
            zones.append(zone)
            prices.append(price)
            tendencies.append(tendency)
            links.append(current_url)

            writer.writerow([year, month, price, tendency, city, current_url])
参考文献
[1].python用BeautifulSoup用抓取a标签内所有数据.https://blog.csdn.net/suibianshen2012/article/details/62040460
[2].从零开始学网络爬虫之BeautifulSoap. https://blog.csdn.net/lxmanutd/article/details/53513103