Scraping Anjuke second-hand housing data with Python 3 and BeautifulSoup


A student recently asked me to scrape listing data from Anjuke. The site has anti-scraping measures, so plain requests calls won't work; instead, Selenium drives a real browser to fetch each page, and BeautifulSoup parses the resulting HTML.
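As a minimal sketch of that idea (assuming a chromedriver binary is installed and on the PATH; the headless flag is optional and not used in the full script below):

    # Fetch a page through a real Chrome instance instead of requests.
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')       # run without a visible window (optional)
    driver = webdriver.Chrome(options=options)
    driver.get('https://shanghai.anjuke.com/fangjia/')
    html = driver.page_source                # fully rendered HTML, ready for BeautifulSoup
    driver.quit()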

Environment

macOS, Python 3.7, BeautifulSoup (plus Selenium with the Chrome driver, and lxml as the parser)
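If the packages are missing, they can be installed with pip install selenium beautifulsoup4 lxml. Selenium additionally needs a chromedriver binary matching the installed Chrome version (assumed here, since the script launches webdriver.Chrome()).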

Code

    # Load packages
    import queue
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from bs4 import BeautifulSoup

    # Start Chrome via Selenium; a real browser gets past checks that block plain requests
    driver = webdriver.Chrome()

    # Seed the queue with the national house-price pages for 2010-2018
    q = queue.Queue()
    for year in range(2010, 2019):
        q.put('https://www.anjuke.com/fangjia/quanguo' + str(year) + '/')

    file = open('output.csv', 'w', encoding='utf-8')
    # Column names translated from the original 年份/月份/房价/涨跌幅/市/链接
    name = ['year', 'month', 'price', 'change', 'city', 'link']
    file.write(','.join(name))
    file.write('\n')

    visited = set()  # avoid re-crawling pages that link back to each other
    while not q.empty():
        current_url = q.get()
        if current_url in visited:
            continue
        visited.add(current_url)
        print('retrieving: ' + current_url)
        try:
            driver.get(current_url)
        except WebDriverException:
            continue
        bsObj = BeautifulSoup(driver.page_source, 'lxml')

        # Price rows are <li> elements tagged up/down/nochange by trend
        info = []
        info.extend(bsObj.find_all('li', class_='clearfix up'))
        info.extend(bsObj.find_all('li', class_='clearfix down'))
        info.extend(bsObj.find_all('li', class_='clearfix nochange'))

        # The first <h2> reads like "2018年上海房价"; the city name sits
        # between the characters '年' (year) and '房' (house)
        city_arr = bsObj.find_all('h2')
        city = city_arr[0].string.split('年')[1].split('房')[0]

        for line in info:
            tendency = line.find('em').string        # rise/fall percentage
            year_zone = line.find('b').string        # e.g. "2018年5月上海"
            year, month_zone = year_zone.split('年') # split on '年' (year)
            line_arr = month_zone.split('月')        # split on '月' (month)
            # Follow in-page links to other city/region pages
            url = line.a.attrs['href']
            if 'javascript' not in url and url not in visited:
                q.put(url)
            if len(line_arr) < 2:
                continue
            month = line_arr[0]
            price = line.find('span').string
            if None in (tendency, price):
                continue  # skip rows with missing fields
            item_list = [year, month, price, tendency, city, current_url]
            file.write(','.join(item_list))
            file.write('\n')

    file.close()
    driver.quit()
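The queue doubles as a simple breadth-first crawler: each page's in-page links are pushed back onto it, so starting from the nine national yearly pages the scraper walks out to the per-city pages, and the visited set keeps it from looping. Once it finishes, the CSV can be sanity-checked; a hypothetical follow-up, assuming pandas is installed:

    # Load and inspect the scraped CSV with pandas.
    import pandas as pd

    df = pd.read_csv('output.csv')
    print(df.head())               # first few rows: year, month, price, change, city, link
    print(df['city'].nunique())    # how many distinct cities were crawled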

