"""
爬取豆瓣电影TOP250
- 完整示例代码
""
import codecs
from urllib.parse import urljoin

import requests
import xlwings as xw
from bs4 import BeautifulSoup
# Entry page of the Top 250 listing; next-page links are resolved against it.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# Module-level accumulators filled by parse_html() and read by showExcel().
# Entry i of each list describes the same movie.
movie_name_list, director_name_list, score_list = [], [], []
def download_page(url, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response (raised by ``raise_for_status``).
    """
    headers = {
        # Browser-like User-Agent; presumably needed so the site does not
        # reject the request as coming from a script -- TODO confirm.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the HTML parser.
    response.raise_for_status()
    return response.content
def parse_html(html):
    """Extract movie data from one listing page.

    For every ``<li>`` inside ``<ol class="grid_view">`` the title, the text
    of the first ``<p>`` in the ``bd`` block, and the rating are appended to
    the module-level ``movie_name_list``, ``director_name_list`` and
    ``score_list`` (always all three together, so the lists stay aligned).

    Args:
        html: Page markup, e.g. the bytes returned by ``download_page``.

    Returns:
        A ``(movie_name_list, next_url)`` tuple. ``next_url`` is the address
        of the following page, or ``None`` when the page has no movie list
        or no next-page link.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is None:
        # Nothing to extract (unexpected markup): stop paging instead of
        # failing with AttributeError on None.
        return movie_name_list, None

    for movie_li in movie_list_soup.find_all('li'):
        detail = movie_li.find('div', attrs={'class': 'hd'})
        director_detail = movie_li.find('div', attrs={'class': 'bd'})
        score_detail = movie_li.find('div', attrs={'class': 'star'})

        movie_name = detail.find('span', attrs={'class': 'title'}).getText()
        # First <p> of the "bd" block. The original filtered on an attribute
        # map whose only key was the empty string, which carries no meaning;
        # a plain tag lookup states the intent directly.
        director_name = director_detail.find('p').getText()
        score = score_detail.find('span', attrs={'class': 'rating_num'}).getText()

        director_name_list.append(director_name)
        movie_name_list.append(movie_name)
        score_list.append(score)

    # The pager may be missing entirely, or present without a link.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        # urljoin copes with query-only, relative and absolute hrefs alike.
        return movie_name_list, urljoin(DOWNLOAD_URL, next_page['href'])
    return movie_name_list, None
def showExcel(filepath=r'D://Desktop/myexcel.xlsx'):
    """Write the collected movie data into an Excel workbook shown on screen.

    Opens ``filepath`` in a visible Excel instance and fills the sheet named
    ``sheet1`` with a header row followed by one row per movie, read from the
    module-level ``movie_name_list``, ``director_name_list`` and
    ``score_list``. The workbook is neither saved nor closed here.

    Args:
        filepath: Workbook to open. Defaults to the path the script
            originally hard-coded.
    """
    app = xw.App(visible=True, add_book=False)
    app.display_alerts = False
    wb = app.books.open(filepath)
    sht = wb.sheets['sheet1']

    header = ["电影名称", "详细信息", "豆瓣评分"]
    # The three lists are appended to in lockstep, so zipping them yields
    # one (name, details, score) row per movie.
    rows = [
        list(row)
        for row in zip(movie_name_list, director_name_list, score_list)
    ]
    # A single 2-D assignment anchored at A1 replaces three separate cell
    # writes per movie.
    sht.range('A1').value = [header] + rows
def main():
    """Walk the listing page by page until no next-page address is returned.

    Each page is fetched with ``download_page`` and handed to ``parse_html``,
    which returns the address of the following page (or a falsy value to
    end the loop).
    """
    next_url = DOWNLOAD_URL
    while next_url:
        page = download_page(next_url)
        # Only the next address is needed here; the first element of the
        # returned pair is not used by this loop.
        _titles, next_url = parse_html(page)
if __name__ == '__main__':
    # Scrape every listing page first, then write the collected data to
    # Excel. The stray module-level counter that used to be set here was
    # never read and has been dropped.
    main()
    showExcel()
# When reposting, please credit the original source: https://yun.8miu.com/read-100350.html