爬起点小说中的一章
# -*- coding: utf-8 -*-
"""Download one chapter page from read.qidian.com and print its text."""
import re

import requests

# Chapter page to download.
url = 'https://read.qidian.com/chapter/pHCOMN5YAqETFqQ-idajwA2/NhXKjTTceCNOBDFlr9quQA2'

# Send the HTTP request. A timeout keeps the script from hanging forever
# when the server does not answer.
response = requests.get(url, timeout=10)

# If the printed text is garbled, override the decoding with one of:
#   response.encoding = "utf-8"
#   response.encoding = "gbk"

# HTML source of the chapter page.
html = response.text

# The chapter body sits in a single <div>; re.S lets "." span line breaks.
match = re.search(
    r'<div class="read-content j_readContent">(.*?)</div>', html, re.S
)
if match is None:
    # Fail with a readable message instead of a bare IndexError when the
    # expected <div> is not in the response.
    raise RuntimeError('chapter content <div> not found in the response')

main = match.group(1)
# Drop the opening paragraph tags so only the text remains.
main = main.replace("<p>", "")
print(main)

# For more on scraping novels, see the article "python爬虫爬网络小说".
接下来的内容学自《自从学会了Python,我从来不为看什么电影发愁,好看的,付费的,百度不到资源的全部一网打尽!》
最简单反爬虫
请求头:User-Agent
用来确保是浏览器来访问的,而不是使用代码访问网站
我们可以使用代码伪装成浏览器来访问网站(反反爬)
import requests
import re

# Page to download.
url = 'https://maoyan.com/board/7'

# Request header that tells the server the request comes from a browser.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Send the HTTP request with the browser-like header. The timeout keeps the
# script from blocking forever if the server never responds.
response = requests.get(url, headers=header, timeout=10)
print(response.text)
可以将发送请求的代码封装成函数 respon
import requests
import re


def respon(n):
    """Download one page of the Maoyan board and print its HTML.

    Args:
        n: Value substituted into the ``offset`` query parameter of the URL.
    """
    # Page to download.
    url = f'https://maoyan.com/board/6?offset={n}'
    # Request header that tells the server the request comes from a browser.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request; the timeout prevents an indefinite hang.
    response = requests.get(url, headers=header, timeout=10)
    print(response.text)


respon(5)

# Install lxml (imported by the next example).
# --- Version 1: extract the movie titles -----------------------------------
import requests
import re
import lxml
from lxml import etree


def respon(n):
    """Download one page of the Maoyan board and return its HTML text.

    Args:
        n: Value substituted into the ``offset`` query parameter of the URL.

    Returns:
        The response body as text.
    """
    # Build the URL with string formatting.
    url = f'https://maoyan.com/board/6?offset={n}'
    # Equivalent alternative: 'https://maoyan.com/board/6?offset={}'.format(n)
    # Request header that tells the server the request comes from a browser.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request; the timeout prevents an indefinite hang.
    response = requests.get(url, headers=header, timeout=10)
    return response.text


def parse(text):
    """Print the list of movie titles found in the page HTML.

    Args:
        text: HTML source of a board page.
    """
    # Parse the HTML into an element tree.
    html = etree.HTML(text)
    # XPath: the title attribute of each movie's name link.
    names = html.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    print(names)


text = respon(5)
parse(text)


# --- Version 2: extract titles together with release times -----------------
import requests
import re
import lxml
from lxml import etree


def respon(n):
    """Download one page of the Maoyan board and return its HTML text.

    Args:
        n: Value substituted into the ``offset`` query parameter of the URL.

    Returns:
        The response body as text.
    """
    # Build the URL with string formatting.
    url = f'https://maoyan.com/board/6?offset={n}'
    # Equivalent alternative: 'https://maoyan.com/board/6?offset={}'.format(n)
    # Request header that tells the server the request comes from a browser.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request; the timeout prevents an indefinite hang.
    response = requests.get(url, headers=header, timeout=10)
    return response.text


def parse(text):
    """Print each movie title next to its release-time text.

    Args:
        text: HTML source of a board page.
    """
    # Parse the HTML into an element tree.
    html = etree.HTML(text)
    # XPath: title attributes and release-time text of each movie entry.
    names = html.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    times = html.xpath('//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')
    # zip pairs the two lists element by element.
    for name, release_time in zip(names, times):
        print(name, release_time)


text = respon(10)
parse(text)