使用python爬取猫眼电影、房王、股吧论坛、百度翻译、有道翻译、高德天气、华夏基金、扇贝单词、糗事百科（房王）

xiaoxiao2025-07-07 24

"""Scrape new-home listings from gz.ihk.cn ("Fangwang") with regular expressions.

Source page: http://gz.ihk.cn/myxf/houselist/?mark=gzxf089

For every listing the image URL, estate name, description, main house type,
address and price are collected into one dict.

Approach noted by the original author:

1. URL pattern of the paginated list:
   page 1: http://gz.ihk.cn/myxf/houselist/p1/?mark=gzxf089
   page 2: http://gz.ihk.cn/myxf/houselist/p2/?mark=gzxf089
   page 3: http://gz.ihk.cn/myxf/houselist/p3/?mark=gzxf089
2. The listing entries sit inside the div whose class is ``es_conlistbox``;
   a full page holds 30 entries.
3. Build one matching rule per field, e.g. for the image:
   <img class="lazy" data-original="http://img1.appweb.ihk.cn/gd/estate/scan_photo/2016-09-30/7858d194577140dda031291e58a115db(WJ-3600-2016-09-30)" onerror="javascript: this.src = 'http://gz.ihk.cn/newsecond/images/default.jpg'">

Only the unpaginated list URL is fetched by this script.
"""
import re

import requests
from lxml import etree  # kept from the original script; not used below

BASE_URL = 'http://gz.ihk.cn/myxf/houselist/?mark=gzxf089'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# All patterns are compiled once here instead of on every loop iteration, and
# written as raw strings so that ``\d`` is not an invalid string escape.
_LISTING_RE = re.compile(
    r'<div class="n_conlistpic">(.*?)<div class="n_conlistrliioc">', re.S)
_PICTURE_RE = re.compile(r'data-original="(.*?)" onerror')
_NAME_RE = re.compile(r'<a><strong>(.*?)</strong></a>')
_BRIEF_RE = re.compile(r'<div class="n_conlistrbrief">(.*?)</div>', re.S)
_SPAN_RE = re.compile(r'<span>(.*?)</span>', re.S)
_HOUSE_TYPE_RE = re.compile(
    r'<div class="n_conlistradd"><span>(.*?)</span></div>')
_POSITION_RE = re.compile(r'</em><span>(.*?)</span></div>')
_PRICE_RE = re.compile(r'<li><strong>(\d+)</strong><b>')


def _first(pattern, text):
    """Return the first captured group of *pattern* in *text*, or '' if absent."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def _parse_listing(house_info):
    """Extract the six fields of one listing's HTML fragment into a dict.

    A field whose pattern does not match is stored as an empty string, so one
    incomplete listing no longer aborts the whole run with an IndexError.
    """
    description = ''
    # Search the matched HTML itself. Running the regex over ``str(list)``
    # would leak repr escapes (a literal backslash-n, escaped quotes) into
    # the extracted text.
    for brief in _BRIEF_RE.findall(house_info):
        span = _SPAN_RE.search(brief)
        if span:
            description = span.group(1)
            break

    return {
        'picture': _first(_PICTURE_RE, house_info),
        'name': _first(_NAME_RE, house_info),
        'description': description,
        'house_type': _first(_HOUSE_TYPE_RE, house_info),
        'position': _first(_POSITION_RE, house_info),
        'price': _first(_PRICE_RE, house_info),
    }


def houseling():
    """Fetch the listing page and print one numbered line per listing.

    Each line shows the running number followed by the image URL, estate
    name, description, main house type, address and price, in that order.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    # 1. Download the full HTML of the list page.
    page_str = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT).text

    # 2. Cut the page into per-listing fragments and parse each one.
    fragments = _LISTING_RE.findall(page_str)
    for count, house_info in enumerate(fragments, start=1):
        house = _parse_listing(house_info)
        print(count, '：', house['picture'], house['name'],
              house['description'], house['house_type'],
              house['position'], house['price'])


if __name__ == '__main__':
    houseling()

最新回复(0)