import requests
from urllib.request import urlopen
from multiprocessing import Pool
# 200 网页正常的返回
# 404 网页找不到
# 502 504
def get(url):
response = requests.get(url)
if response.status_code == 200:
return url,response.content.decode('utf-8')
def get_urllib(url):
ret = urlopen(url)
return ret.read().decode('utf-8')
def call_back(args):
url,content = args
print(url,len(content))
if __name__ == '__main__':
url_lst = [
'https://www.cnblogs.com/',
'http://www.baidu.com',
'https://www.sogou.com/',
'http://www.sohu.com/',
]
p = Pool(5)
for url in url_lst:
p.apply_async(get,args=(url,),callback=call_back)
p.close()
p.join()
import re
from urllib.request import urlopen
from multiprocessing import Pool
def get_page(url,pattern):
response=urlopen(url).read().decode('utf-8')
return pattern,response # 正则表达式编译结果 网页内容
def parse_page(info):
pattern,page_content=info
res=re.findall(pattern,page_content)
for item in res:
dic={
'index':item[0].strip(),
'title':item[1].strip(),
'actor':item[2].strip(),
'time':item[3].strip(),
}
print(dic)
if __name__ == '__main__':
regex = r'<dd>.*?<.*?class="board-index.*?>(\d+)</i>.*?title="(.*?)".*?class="movie-item-info".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
pattern1=re.compile(regex,re.S)
url_dic={'http://maoyan.com/board/7':pattern1}
p=Pool()
res_l=[]
for url,pattern in url_dic.items():
res=p.apply_async(get_page,args=(url,pattern),callback=parse_page)
res_l.append(res)
for i in res_l:
i.get()
# 进程池
# cpu个数+1
# ret = map(func,iterable)
# 异步 自带close和join
# 所有结果的[]
# apply
# 同步的:只有当func执行完之后,才会继续向下执行其他代码
# ret = apply(func,args=())
# 返回值就是func的return
# apply_async
# 异步的:当func被注册进入一个进程之后,程序就继续向下执行
# apply_async(func,args=())
# 返回值 : apply_async返回的对象obj
# 为了用户能从中获取func的返回值obj.get()
# get会阻塞直到对应的func执行完毕拿到结果
# 使用apply_async给进程池分配任务,
# 需要先close后join来保持多进程和主进程代码的同步性