Python crawler practice: scraping information


Scrape a chapter of a novel from Qidian

# -*- coding: utf-8 -*-
import requests
import re

# Page to download
url = 'https://read.qidian.com/chapter/pHCOMN5YAqETFqQ-idajwA2/NhXKjTTceCNOBDFlr9quQA2'
# Send the HTTP request (simulating a browser visit)
response = requests.get(url)
# Change the encoding if the text comes back garbled
# response.encoding = "utf-8"
# response.encoding = "gbk"
# HTML source of the chapter page
html = response.text
# Pull the chapter body out with a regular expression
main = re.findall(r'<div class="read-content j_readContent">(.*?)</div>', html, re.S)[0]
main = main.replace("<p>", "")
print(main)
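The printed text still contains the closing </p> tags. A minimal follow-up sketch (this cleanup step and the chapter.txt filename are my additions, not part of the original) that strips them and saves the chapter to a local file:

# Hypothetical continuation of the snippet above
main = main.replace("</p>", "\n")   # turn closing tags into line breaks
with open("chapter.txt", "w", encoding="utf-8") as f:
    f.write(main)                   # save the cleaned chapter text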

For more on how to scrape novels, see the post python爬虫爬网络小说 (scraping web novels with Python).

The content that follows is learned from the post 自从学会了Python,我从来不为看什么电影发愁,好看的,付费的,百度不到资源的全部一网打尽! (Since learning Python I never worry about what movie to watch: good ones, paid ones, even ones Baidu has no resources for, all caught in one net!).

About anti-crawling

The simplest anti-crawling check

Request header: User-Agent

It is used to check that the site is being visited by a browser rather than by code.

We can make our code pose as a browser when visiting the site (counter-anti-crawling).
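As a quick illustration (the httpbin.org check below is my own example, not from the original), you can compare the default User-Agent that requests sends with a spoofed browser one:

import requests

# Default: something like "python-requests/2.x", easy for a server to spot
print(requests.get('https://httpbin.org/headers').json()['headers']['User-Agent'])

# Spoofed: looks like an ordinary Chrome browser
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
print(requests.get('https://httpbin.org/headers', headers=header).json()['headers']['User-Agent'])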

     

Scrape the Maoyan movie board

     

import requests
import re

# Page to download
url = 'https://maoyan.com/board/7'
# Request header telling the server this is a browser
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
# Send the HTTP request, simulating a browser
response = requests.get(url, headers=header)
print(response.text)
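One small usage note (my addition, not from the original): it helps to check the HTTP status code before parsing, since a site may refuse requests that do not look like they come from a browser:

import requests

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
response = requests.get('https://maoyan.com/board/7', headers=header)
# 200 means the page was served; anything else suggests the request was rejected
print(response.status_code)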

The request step can also be wrapped in a function, respon:

import requests
import re

def respon(n):
    # Download one page of the board
    url = f'https://maoyan.com/board/6?offset={n}'
    # Request header telling the server this is a browser
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request, simulating a browser
    response = requests.get(url, headers=header)
    print(response.text)

respon(5)
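Because the board URL takes an offset parameter, the function can be reused for several pages. A short sketch using the respon function defined above (the step size of 10 and the page count are my assumptions, not stated in the original):

# Fetch three pages of the board, assuming the offset advances by 10 per page
for n in range(0, 30, 10):
    respon(n)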

Install lxml (for example via pip: pip install lxml)

     

import requests
import re
from lxml import etree

def respon(n):
    # Download one page of the board
    # String formatting for the page offset
    url = f'https://maoyan.com/board/6?offset={n}'
    # url = 'https://maoyan.com/board/6?offset={}'.format(n)
    # Request header telling the server this is a browser
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request, simulating a browser
    response = requests.get(url, headers=header)
    return response.text

def parse(text):
    # Initialize/normalize the HTML
    html = etree.HTML(text)
    # Extract the information we want with XPath
    names = html.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    print(names)

text = respon(5)
parse(text)

import requests
import re
from lxml import etree

def respon(n):
    # Download one page of the board
    # String formatting for the page offset
    url = f'https://maoyan.com/board/6?offset={n}'
    # url = 'https://maoyan.com/board/6?offset={}'.format(n)
    # Request header telling the server this is a browser
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request, simulating a browser
    response = requests.get(url, headers=header)
    return response.text

def parse(text):
    # Initialize/normalize the HTML
    html = etree.HTML(text)
    # Extract the information we want with XPath
    names = html.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    times = html.xpath('//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')
    # zip pairs each title with its release date
    for name, time in zip(names, times):
        print(name, time)

text = respon(10)
parse(text)
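Putting the pieces together, here is a hedged end-to-end sketch (the crawl_board helper name, the step of 10, and the upper offset of 90 are my assumptions, not from the original) that walks the board page by page and prints each title with its release date:

import requests
from lxml import etree

HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

def crawl_board(offset):
    # Fetch one page of the board, pretending to be a browser
    url = f'https://maoyan.com/board/6?offset={offset}'
    response = requests.get(url, headers=HEADER)
    return response.text

def parse(text):
    # Parse the HTML and pair each title with its release date
    html = etree.HTML(text)
    names = html.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    times = html.xpath('//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')
    return list(zip(names, times))

# Assumption: the board paginates in steps of 10, up to offset 90
for offset in range(0, 100, 10):
    for name, time in parse(crawl_board(offset)):
        print(name, time)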

     
