22import random
33import requests
44import pymongo
5+ import aiohttp
6+ import asyncio
57from bs4 import BeautifulSoup
68import multiprocessing
79
5961}
6062
# Proxy map handed to requests via ``proxies=``: requests selects the entry
# whose key matches the scheme of the URL being fetched. Both schemes go
# through the same HTTP proxy. The values are URLs and must not contain
# whitespace, otherwise they cannot be parsed as a proxy address.
proxies = {
    'http': 'http://123.206.6.17:3128',
    'https': 'http://123.206.6.17:3128',
}
6567
6668
# Method 1: plain requests, fetching the URLs one after another
6870def method_1 ():
6971 start = time .time ()
7072 for url in urls :
71- html = requests .get (url , headers = headers ).text
73+ html = requests .get (url , headers = headers , proxies = proxies ).text
7274 soup = BeautifulSoup (html , 'lxml' )
7375 title = soup .find_all (class_ = 'title' )
7476 app_title = soup .find_all (class_ = 'app-title' )
@@ -82,13 +84,20 @@ def method_1():
8284 'icon_cover' : icon_cover_i ['data-original' ]
8385 }
8486 col .insert (content )
85- # print('成功插入一组数据' + str(content))
87+ print ('成功插入一组数据' + str (content ))
8688 print ('一共用时:' + str (time .time () - start ))
8789
8890
# if __name__ == '__main__':
#     method_1()


# Method 2: requests + multiprocessing.Pool
9099def method_2 (url ):
91- html = requests .get (url , headers = headers ).text
100+ html = requests .get (url , headers = headers , proxies = proxies ).text
92101 soup = BeautifulSoup (html , 'lxml' )
93102 title = soup .find_all (class_ = 'title' )
94103 app_title = soup .find_all (class_ = 'app-title' )
@@ -103,13 +112,49 @@ def method_2(url):
103112 }
104113 # time.sleep(1)
105114 col .insert (content )
106- # print('成功插入一组数据' + str(content))
115+ print ('成功插入一组数据' + str (content ))
107116
108117
109- if __name__ == '__main__' :
118+ # if __name__ == '__main__':
119+ # start = time.time()
120+ # pool = multiprocessing.Pool(4)
121+ # pool.map(method_2, urls)
122+ # pool.close()
123+ # pool.join()
124+ # print('一共用时:' + str(time.time() - start))
125+
126+
# Method 3: asyncio + aiohttp (asyncio is the asynchronous I/O module added in Python 3.4)
128+
def method_3():
    """Scrape every page in ``urls`` concurrently with asyncio + aiohttp.

    Each page is downloaded, parsed with BeautifulSoup, and one document per
    list entry is written to the MongoDB collection ``col``. Every stored
    document is echoed to stdout, and the total elapsed wall-clock time in
    seconds is printed at the end.

    Relies on the module-level names ``urls``, ``col``, ``time``, ``aiohttp``,
    ``asyncio`` and ``BeautifulSoup``.
    """

    async def get_url(session, url):
        """Return the body of *url* decoded as UTF-8."""
        # NOTE(review): unlike method_1/method_2, no custom headers or proxy
        # are sent here -- confirm that is intentional.
        async with session.get(url) as response:
            return await response.text(encoding="utf-8")

    async def parser(session, url):
        """Download one page and store every entry found on it."""
        html = await get_url(session, url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        # zip() stops at the shortest result list, so an entry that lacks one
        # of the four elements truncates the pairing instead of raising.
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(
                title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            # Collection.insert() was removed in PyMongo 4; insert_one() is the
            # single-document replacement and likewise adds '_id' to the dict.
            col.insert_one(content)
            print('成功插入一组数据' + str(content))

    async def crawl():
        """Fetch and parse all pages through one shared client session."""
        # One session for all requests lets aiohttp reuse its connection pool
        # instead of building and tearing down a pool per URL.
        async with aiohttp.ClientSession() as session:
            await asyncio.gather(*(parser(session, url) for url in urls))

    start = time.time()
    # Use a dedicated loop and close it afterwards, so the function neither
    # depends on an implicit current event loop nor leaks one.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(crawl())
    finally:
        loop.close()
    print(time.time() - start)
157+
158+
# Script entry point: run the asyncio/aiohttp variant when executed directly.
if __name__ == '__main__':
    method_3()
0 commit comments