├── .gitignore
├── requirements-win32.txt
├── requirements-linux.txt
├── sequential.py
├── io-models
│   ├── 1_tcp_client.py
│   ├── 2_blocking_io_single_process_tcp_server.py
│   ├── 3_blocking_io_multi_processes_tcp_server.py
│   ├── 4_blocking_io_multi_threads_tcp_server.py
│   ├── 5_nonblocking_io_tcp_server.py
│   └── 6_io_multiplexing_tcp_server.py
├── logger.py
├── common.py
├── flags.txt
├── README.md
├── threadpool.py
├── asynchronous.py
└── processpool.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea/
__pycache__/
downloads/
logs/

--------------------------------------------------------------------------------
/requirements-win32.txt:
--------------------------------------------------------------------------------
aiohttp
aiofiles
beautifulsoup4
lxml
progressbar2
pymongo
requests

--------------------------------------------------------------------------------
/requirements-linux.txt:
--------------------------------------------------------------------------------
aiohttp
aiofiles
beautifulsoup4
lxml
progressbar2
pymongo
requests
uvloop

--------------------------------------------------------------------------------
/sequential.py:
--------------------------------------------------------------------------------
import time
from common import setup_down_path, get_links, download_one
from logger import logger


def download_many():
    '''Download all images one after another: synchronous, blocking I/O.'''
    down_path = setup_down_path()
    links = get_links()

    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,  # image number, so the log shows which image is being downloaded
            'link': link
        }
        download_one(image)

    return len(links)

if __name__ == '__main__':
    t0 = time.time()
    count = download_many()
    msg = '{} flags downloaded in {:.2f} seconds.'
    logger.info(msg.format(count, time.time() - t0))

--------------------------------------------------------------------------------
/io-models/1_tcp_client.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
from datetime import datetime
import socket


server_ip = input('Please enter the TCP server ip: ')
server_port = int(input('Enter the TCP server port: '))
client_num = int(input('Enter the TCP clients count: '))

# Keep every client TCP socket that connected successfully
client_socks = []

for i in range(client_num):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((server_ip, server_port))
    client_socks.append(sock)
    print('Client {}[ID: {}] has connected to {}'.format(sock, i, (server_ip, server_port)))

while True:
    for s in client_socks:
        data = str(datetime.now()).encode('utf-8')
        s.send(data)
        print('Client {} has sent {} to {}'.format(s, data, (server_ip, server_port)))
    # Sleep for 3 seconds, then let every client connection send data to the TCP server again
    time.sleep(3)

--------------------------------------------------------------------------------
/io-models/6_io_multiplexing_tcp_server.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import selectors
import socket

# Automatically picks the best I/O multiplexing interface available on this OS;
# on Linux this is selectors.EpollSelector
sel = selectors.DefaultSelector()


def accept(sock, mask):
    '''Accept a new client connection on the listening socket.'''
    conn, addr = sock.accept()  # Should be ready
    print('accepted', conn, 'from', addr)
    conn.setblocking(False)
    sel.register(conn, selectors.EVENT_READ, read)  # register the new client socket with the epoll instance, watching read events


def read(conn, mask):
    '''Receive the client's data and echo it back unchanged.'''
    data = conn.recv(1000)  # Should be ready
    if data:
        print('echoing', repr(data), 'to', conn)
        conn.send(data)  # Hope it won't block
    else:
        print('closing', conn)
        sel.unregister(conn)
        conn.close()


sock = socket.socket()
sock.bind(('', 9090))
sock.listen(100)
sock.setblocking(False)
sel.register(sock, selectors.EVENT_READ, accept)

while True:
    events = sel.select()
    for key, mask in events:
        callback = key.data
        callback(key.fileobj, mask)
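# Unlike the blocking single-process server below, this single-threaded server
# multiplexes many clients at once: e.g. run io-models/1_tcp_client.py against
# port 9090 with a large client count and every client is echoed concurrently.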

--------------------------------------------------------------------------------
/io-models/2_blocking_io_single_process_tcp_server.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# TCP Echo Server, single process, blocking I/O
import socket


# Create the listening socket
server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# Sockets do not allow address reuse by default: OSError: [Errno 98] Address already in use
server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

# Bind the IP address and a fixed port
server_address = ('', 9090)
print('TCP Server starting up on port {}'.format(server_address[1]))
server_sock.bind(server_address)

# A socket is active (connecting) by default; listen() turns it into a passive socket that can accept client connections
server_sock.listen(5)

try:
    while True:
        print('Main Process, waiting for client connection...')

        # client_sock serves this client only; client_addr is an (ip, port) tuple for the client
        client_sock, client_addr = server_sock.accept()
        print('Client {} is connected'.format(client_addr))

        try:
            while True:
                # Receive data from the client; blocks until data arrives.
                # In fact we only return to the outer while loop once the current
                # client closes, i.e. this server handles one client at a time.
                # If the client closed the connection, data is an empty bytes object
                data = client_sock.recv(4096)
                if data:
                    print('Received {}({} bytes) from {}'.format(data, len(data), client_addr))
                    # Echo the response: send the client's data back unchanged
                    client_sock.send(data)
                    print('Sent {} to {}'.format(data, client_addr))
                else:
                    print('Client {} is closed'.format(client_addr))
                    break
        finally:
            # Close the socket that served this client
            client_sock.close()
finally:
    # Close the listening socket; no more client connections will be accepted
    server_sock.close()
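# To see the single-client limitation, start this server and run
# io-models/1_tcp_client.py with a client count > 1: only the first client is
# echoed until it disconnects.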

--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
import os
import time
import logging


###
# 1. Create the logger instance; with no name argument this returns the root logger
###

logger = logging.getLogger('spider')
# Set the overall log level; each handler may also be given its own level
logger.setLevel(logging.DEBUG)

###
# 2. Create handlers that send log output to the console and to a file
###

# The console log and the log file share one Formatter
formatter = logging.Formatter(
    '%(asctime)s - %(filename)s[line:%(lineno)d] - <%(threadName)s %(thread)d>' +
    ' - %(levelname)s: %(message)s'
)

# FileHandler for the log file
basedir = os.path.abspath(os.path.dirname(__file__))
log_dest = os.path.join(basedir, 'logs')  # directory holding the log files
if not os.path.isdir(log_dest):
    os.mkdir(log_dest)
filename = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'  # log file named after the current time
file_handler = logging.FileHandler(os.path.join(log_dest, filename), encoding='utf-8')  # create the log file handler
file_handler.setFormatter(formatter)  # set its Formatter
# file_handler.setLevel(logging.INFO)  # set a separate level for the log file

# StreamHandler for console logging
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
# stream_handler.setLevel(logging.DEBUG)  # set a separate level for the console log

###
# 3. Add the handlers to the logger
###

logger.addHandler(file_handler)
logger.addHandler(stream_handler)

###
# Alternatively, use the simple one-shot configuration
###
# logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
# logging.debug('This message should appear on the console')

# logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d', level=logging.DEBUG)
# logging.debug('datetime format has changed to 2018-06-01')

# logging.basicConfig(filename='test.log', level=logging.DEBUG)
# logging.debug('This message should go to test.log')

--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
import os
import time
import requests
from logger import logger


basepath = os.path.abspath(os.path.dirname(__file__))  # root directory of this module file


def setup_down_path():
    '''Set up where downloaded images are saved; all images go into one directory.'''
    down_path = os.path.join(basepath, 'downloads')
    if not os.path.isdir(down_path):
        os.mkdir(down_path)
        logger.info('Create download path {}'.format(down_path))
    return down_path


def get_links():
    '''Get the download links of all images.'''
    with open(os.path.join(basepath, 'flags.txt')) as f:  # the image names are stored in this file, one name per line
        return ['http://192.168.40.121/flags/' + flag.strip() for flag in f.readlines()]


# Why does download_one() take a single dict rather than three positional arguments?
# It keeps the function usable with concurrent.futures.ThreadPoolExecutor.map() later on
def download_one(image):
    '''Download one image
    :param image: dict holding the image's save directory, sequence number and URL
    '''
    logger.info('Downloading No.{} [{}]'.format(image['linkno'], image['link']))
    t0 = time.time()

    resp = requests.get(image['link'])
    filename = os.path.split(image['link'])[1]
    with open(os.path.join(image['path'], filename), 'wb') as f:
        f.write(resp.content)  # resp.content is bytes, while resp.text is str

    t1 = time.time()
    logger.info('Task No.{} [{}] runs {:.2f} seconds.'.format(image['linkno'], image['link'], t1 - t0))


def download_one_1(path, linkno, link):
    '''Download one image
    :param path: directory to save the image in
    :param linkno: the image's sequence number
    :param link: the image's URL
    '''
    logger.info('Downloading No.{} [{}]'.format(linkno, link))
    t0 = time.time()

    resp = requests.get(link)
    filename = os.path.split(link)[1]
    with open(os.path.join(path, filename), 'wb') as f:
        f.write(resp.content)

    t1 = time.time()
    logger.info('Task No.{} [{}] runs {:.2f} seconds.'.format(linkno, link, t1 - t0))

--------------------------------------------------------------------------------
/io-models/5_nonblocking_io_tcp_server.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# TCP Echo Server, single process, nonblocking I/O
import socket


# Holds all successfully connected clients; each element is a (client_sock, client_addr) tuple
clients = []

# Create the listening socket
server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# Sockets do not allow address reuse by default: OSError: [Errno 98] Address already in use
server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

# Bind the IP address and a fixed port
server_address = ('', 9090)
print('TCP Server starting up on port {}'.format(server_address[1]))
server_sock.bind(server_address)

# A socket is active (connecting) by default; listen() turns it into a passive socket that can accept client connections
server_sock.listen(5)

# Put the listening server_sock into nonblocking mode
server_sock.setblocking(False)

print('Main Process, waiting for client connection...')

try:
    while True:
        try:
            # client_sock serves this client only; client_addr is an (ip, port) tuple for the client
            client_sock, client_addr = server_sock.accept()
        except BlockingIOError:
            # With server_sock nonblocking, accept() raises if no client happens to be connecting right now
            pass
        else:
            print('Client {} is connected'.format(client_addr))
            # Put the new client connection socket into nonblocking mode as well
            client_sock.setblocking(False)
            # Add it to the clients list
            clients.append((client_sock, client_addr))

        # Poll every client connection in turn (iterate over a copy, since entries may be removed)
        for client_sock, client_addr in clients[:]:
            try:
                data = client_sock.recv(4096)
                if data:
                    print('Received {}({} bytes) from {}'.format(data, len(data), client_addr))
                    # Echo the response: send the client's data back unchanged
                    client_sock.send(data)
                    print('Sent {} to {}'.format(data, client_addr))
                else:
                    print('Client {} is closed'.format(client_addr))
                    # Close the socket that served this client
                    client_sock.close()
                    # Remove it from the list
                    clients.remove((client_sock, client_addr))
            except BlockingIOError:
                # With client_sock nonblocking, recv() raises if the client happens to have sent nothing
                pass
finally:
    # Close the listening socket; no more client connections will be accepted
    server_sock.close()
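# Note: this accept/recv polling loop never blocks, so the process spins at
# close to 100% CPU even when idle; that is the price nonblocking I/O pays
# without an I/O multiplexer (compare 6_io_multiplexing_tcp_server.py).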

--------------------------------------------------------------------------------
/io-models/3_blocking_io_multi_processes_tcp_server.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# TCP Echo Server, multiple processes, blocking I/O
import os
import socket
from multiprocessing import Process


def client_handler(client_sock, client_addr):
    '''Receive data from one client and echo it back unchanged.'''
    try:
        while True:
            # Receive data from the client; blocks until data arrives
            # If the client closed the connection, data is an empty bytes object
            data = client_sock.recv(4096)
            if data:
                print('Child Process [PID: {}], received {}({} bytes) from {}'.format(os.getpid(), data, len(data), client_addr))
                # Echo the response: send the client's data back unchanged
                client_sock.send(data)
                print('Child Process [PID: {}], sent {} to {}'.format(os.getpid(), data, client_addr))
            else:
                print('Child Process [PID: {}], client {} is closed'.format(os.getpid(), client_addr))
                break
    except ConnectionResetError:
        # Raised when the client force-closes the connection: ConnectionResetError: [Errno 104] Connection reset by peer
        pass
    finally:
        # Close the socket that served this client
        client_sock.close()


# Create the listening socket
server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# Sockets do not allow address reuse by default: OSError: [Errno 98] Address already in use
server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

# Bind the IP address and a fixed port
server_address = ('', 9090)
print('TCP Server starting up on port {}'.format(server_address[1]))
server_sock.bind(server_address)

# A socket is active (connecting) by default; listen() turns it into a passive socket that can accept client connections
server_sock.listen(5)

try:
    while True:
        print('Main Process [PID: {}], waiting for client connection...'.format(os.getpid()))

        # The main process only listens for new client connections
        # client_sock serves this client only; client_addr is an (ip, port) tuple for the client
        client_sock, client_addr = server_sock.accept()
        print('Main Process [PID: {}], client {} is connected'.format(os.getpid(), client_addr))

        # Fork a child process for each new client connection to handle its data
        client = Process(target=client_handler, args=(client_sock, client_addr))
        client.start()
        # The child process has its own copy of client_sock, so the main process can close this one
        client_sock.close()
finally:
    # Close the listening socket; no more client connections will be accepted
    server_sock.close()

--------------------------------------------------------------------------------
/io-models/4_blocking_io_multi_threads_tcp_server.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# TCP Echo Server, multiple threads, blocking I/O
import socket
import threading


def client_handler(client_sock, client_addr):
    '''Receive data from one client and echo it back unchanged.'''
    try:
        while True:
            # Receive data from the client; blocks until data arrives
            # If the client closed the connection, data is an empty bytes object
            data = client_sock.recv(4096)
            if data:
                print('Child Thread [{}], received {}({} bytes) from {}'.format(threading.current_thread().name, data, len(data), client_addr))
                # Echo the response: send the client's data back unchanged
                client_sock.send(data)
                print('Child Thread [{}], sent {} to {}'.format(threading.current_thread().name, data, client_addr))
            else:
                print('Child Thread [{}], client {} is closed'.format(threading.current_thread().name, client_addr))
                break
    except ConnectionResetError:
        # Raised when the client force-closes the connection: ConnectionResetError: [Errno 104] Connection reset by peer
        pass
    finally:
        # Close the socket that served this client
        client_sock.close()


# Create the listening socket
server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

# Sockets do not allow address reuse by default: OSError: [Errno 98] Address already in use
server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

# Bind the IP address and a fixed port
server_address = ('', 9090)
print('TCP Server starting up on port {}'.format(server_address[1]))
server_sock.bind(server_address)

# A socket is active (connecting) by default; listen() turns it into a passive socket that can accept client connections
server_sock.listen(5)

try:
    while True:
        print('Main Thread [{}], waiting for client connection...'.format(threading.current_thread().name))

        # The main thread only listens for new client connections
        # client_sock serves this client only; client_addr is an (ip, port) tuple for the client
        client_sock, client_addr = server_sock.accept()
        print('Main Thread [{}], client {} is connected'.format(threading.current_thread().name, client_addr))

        # Spawn a thread for each new client connection to handle its data
        client = threading.Thread(target=client_handler, args=(client_sock, client_addr))
        client.start()

        # The main thread shares client_sock with the child thread, so it must not close it here
        # client_sock.close()
finally:
    # Close the listening socket; no more client connections will be accepted
    server_sock.close()
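# Note: the GIL is not the bottleneck here: a thread blocked in recv()/send()
# releases it, so threads handle concurrent I/O-bound clients well; what limits
# scaling is the per-thread memory and scheduling cost as client counts grow.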

--------------------------------------------------------------------------------
/flags.txt:
--------------------------------------------------------------------------------
ad.gif
ae.gif
af.gif
ag.gif
al.gif
am.gif
ao.gif
ar.gif
at.gif
au.gif
az.gif
ba.gif
bb.gif
bd.gif
be.gif
bf.gif
bg.gif
bh.gif
bi.gif
bj.gif
bn.gif
bo.gif
br.gif
bs.gif
bt.gif
bw.gif
by.gif
bz.gif
ca.gif
cd.gif
cf.gif
cg.gif
ch.gif
ci.gif
cl.gif
cm.gif
cn.gif
co.gif
cr.gif
cu.gif
cv.gif
cy.gif
cz.gif
de.gif
dj.gif
dk.gif
dm.gif
dz.gif
ec.gif
ee.gif
eg.gif
er.gif
es.gif
et.gif
fi.gif
fj.gif
fm.gif
fr.gif
ga.gif
gb.gif
gd.gif
ge.gif
gh.gif
gm.gif
gn.gif
gq.gif
gr.gif
gt.gif
gw.gif
gy.gif
hn.gif
hr.gif
ht.gif
hu.gif
id.gif
ie.gif
il.gif
in.gif
iq.gif
ir.gif
is.gif
it.gif
jm.gif
jo.gif
jp.gif
ke.gif
kg.gif
kh.gif
ki.gif
km.gif
kn.gif
kp.gif
kr.gif
kw.gif
kz.gif
la.gif
lb.gif
lc.gif
li.gif
lk.gif
lr.gif
ls.gif
lt.gif
lu.gif
lv.gif
ly.gif
ma.gif
mc.gif
md.gif
me.gif
mg.gif
mh.gif
mk.gif
ml.gif
mm.gif
mn.gif
mr.gif
mt.gif
mu.gif
mv.gif
mw.gif
mx.gif
my.gif
mz.gif
na.gif
ne.gif
ng.gif
ni.gif
nl.gif
no.gif
np.gif
nr.gif
nz.gif
om.gif
pa.gif
pe.gif
pg.gif
ph.gif
pk.gif
pl.gif
pt.gif
pw.gif
py.gif
qa.gif
ro.gif
rs.gif
ru.gif
rw.gif
sa.gif
sb.gif
sc.gif
sd.gif
se.gif
sg.gif
si.gif
sk.gif
sl.gif
sm.gif
sn.gif
so.gif
sr.gif
ss.gif
st.gif
sv.gif
sy.gif
sz.gif
td.gif
tg.gif
th.gif
tj.gif
tl.gif
tm.gif
tn.gif
to.gif
tr.gif
tt.gif
tv.gif
tw.gif
tz.gif
ua.gif
ug.gif
us.gif
uy.gif
uz.gif
va.gif
vc.gif
ve.gif
vn.gif
vu.gif
ws.gif
ye.gif
za.gif
zm.gif
zw.gif

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# [python3-concurrency](http://www.madmalls.com/blog/category/python3-spider/)

[![Python](https://img.shields.io/badge/python-v3.4%2B-blue.svg)](https://www.python.org/)
[![aiohttp](https://img.shields.io/badge/aiohttp-v3.3.2-brightgreen.svg)](https://aiohttp.readthedocs.io/en/stable/)
[![BeautifulSoup4](https://img.shields.io/badge/BeautifulSoup4-v4.6.3-orange.svg)](https://pypi.org/project/beautifulsoup4/)
[![requests](https://img.shields.io/badge/requests-v2.19.1-yellow.svg)](http://docs.python-requests.org/en/master/)
[![pymongo](https://img.shields.io/badge/pymongo-v3.7.1-red.svg)](https://pypi.org/project/pymongo/)
[![progressbar2](https://img.shields.io/badge/progressbar2-v3.38.0-lightgrey.svg)](https://pypi.org/project/progressbar2/)


![](http://www.madmalls.com/api/medias/uploaded/python3-concurrency-1016d526.png)


# 1. Crawler Series

- [Python 3 Crawler | Chapter 1: I/O Models, Blocking/Nonblocking and Synchronous/Asynchronous](https://madmalls.com/blog/post/io-models/)
- [Python 3 Crawler | Chapter 2: Concurrent Programming in Python](https://madmalls.com/blog/post/concurrent-programming-for-python/)
- [Python 3 Crawler | Chapter 3: Sequential Blocking Download](https://madmalls.com/blog/post/sequential-download-for-python/)
- [Python 3 Crawler | Chapter 4: Concurrent Downloads with Multiple Processes](https://madmalls.com/blog/post/multi-process-for-python3/)
- [Python 3 Crawler | Chapter 5: Concurrent Downloads with Multiple Threads](https://madmalls.com/blog/post/multi-thread-for-python/)
- [Python 3 Crawler | Chapter 6: Iterables / Iterators / Generators](https://madmalls.com/blog/post/iterable-iterator-and-generator-in-python/)
- [Python 3 Crawler | Chapter 7: Coroutines](https://madmalls.com/blog/post/coroutine-in-python/)
- [Python 3 Crawler | Chapter 8: Concurrency with the asyncio Module](https://madmalls.com/blog/post/asyncio-howto-in-python3/)
- [Python 3 Crawler | Chapter 9: Concurrent Downloads with asyncio + aiohttp](https://madmalls.com/blog/post/aiohttp-howto-in-python3/)
- [Python 3 Crawler | Chapter 10: Crawling a Small Batch of Images](https://madmalls.com/blog/post/python3-concurrency-pics-01/)
- [Python 3 Crawler | Chapter 11: Crawling a Massive Number of Images](https://madmalls.com/blog/post/python3-concurrency-pics-02/)


# 2. Usage

## 2.1 Server

To keep the tests from looking like a DDoS attack on a public site, they require a local HTTP test server; see [Python 3 Crawler Series 03 (Experiment): Sequential Blocking Download](http://www.madmalls.com/blog/post/sequential-download-for-python/) for the setup.
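If that post is unavailable, a minimal stand-in is Python's built-in `http.server` module (this sketch assumes the flag images listed in `flags.txt` sit in a local `flags/` directory, and that the base URL in `common.py` / `asynchronous.py` is pointed at this host):

```bash
# serve the directory that contains flags/ on port 80
sudo python3 -m http.server 80
```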

## 2.2 Client

### (1) Download the code

```bash
[root@CentOS ~]# git clone https://github.com/wangy8961/python3-concurrency.git
[root@CentOS ~]# cd python3-concurrency/
```

### (2) Prepare a virtual environment

If your operating system is `Linux`:

```bash
[root@CentOS python3-concurrency]# python3 -m venv venv3
[root@CentOS python3-concurrency]# source venv3/bin/activate
```

> On `Windows`, activate the virtual environment with: `venv3\Scripts\activate`

### (3) Install the dependencies

If your operating system is `Linux`:

```bash
(venv3) [root@CentOS python3-concurrency]# pip install -r requirements-linux.txt
```

If your operating system is `Windows` (`uvloop` will not be used):

```bash
(venv3) C:\Users\wangy> pip install -r requirements-win32.txt
```

### (4) Test

Sequential download:

```bash
(venv3) [root@CentOS python3-concurrency]# python sequential.py
```

Multi-process download:

```bash
(venv3) [root@CentOS python3-concurrency]# python processpool.py
```

Multi-thread download:

```bash
(venv3) [root@CentOS python3-concurrency]# python threadpool.py
```

Asynchronous download:

```bash
(venv3) [root@CentOS python3-concurrency]# python asynchronous.py
```
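The `io-models` demos can be exercised the same way; a minimal sketch (assuming the server and client run on the same host; every server script listens on port 9090):

```bash
# terminal 1: pick one of the echo servers
(venv3) [root@CentOS python3-concurrency]# python io-models/2_blocking_io_single_process_tcp_server.py

# terminal 2: answer the prompts with 127.0.0.1, 9090 and a client count
(venv3) [root@CentOS python3-concurrency]# python io-models/1_tcp_client.py
```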

--------------------------------------------------------------------------------
/threadpool.py:
--------------------------------------------------------------------------------
import time
from queue import Queue
from threading import Thread
from functools import partial
from concurrent import futures
from common import setup_down_path, get_links, download_one, download_one_1
from logger import logger


class ThreadWorker(Thread):
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            down_path, linkno, link = self.queue.get()
            download_one_1(down_path, linkno, link)
            self.queue.task_done()


def download_many():
    '''Multi-threaded: download all images concurrently (not in parallel), bounded by the thread count.'''
    down_path = setup_down_path()
    links = get_links()

    # Create the task queue
    queue = Queue()

    # Create the worker threads
    for i in range(64):
        worker = ThreadWorker(queue)
        worker.daemon = True  # let the main thread exit even if workers are blocked waiting for more tasks
        worker.start()  # start the thread

    # Put the tasks into the queue
    for linkno, link in enumerate(links, 1):  # links numbered from 1
        logger.info('Queueing No.{} {}'.format(linkno, link))
        queue.put((down_path, linkno, link))

    logger.info('Waiting for all subthread done...')
    # Causes the main thread to wait for the queue to finish processing all the tasks
    queue.join()
    logger.info('All subthread done.')

    return len(links)


def download_many_1():
    '''Multi-threaded: download all images concurrently (not in parallel), bounded by the thread count.
    Uses concurrent.futures.ThreadPoolExecutor().
    Executor.map() uses Futures rather than returning them: it returns an iterator whose
    __next__() method calls each Future's result(), so we get each Future's result, not the Future itself.

    Note that Executor.map() limits download_one() to a single argument, which is why images is a list of dicts.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,
            'link': link
        }
        images.append(image)

    workers = min(64, len(links))  # never create more threads in the pool than there are download tasks
    # The with statement calls executor.__exit__(), which calls executor.shutdown(wait=True);
    # that blocks the main thread until all the worker threads have finished
    with futures.ThreadPoolExecutor(workers) as executor:
        # executor.map() works like the built-in map(), except download_one() is called concurrently from multiple threads
        # Its return value res is an iterator; iterating it later yields each call's return value
        res = executor.map(download_one, images)  # pass one sequence

    return len(list(res))  # if any call raised an exception, it is re-raised here, as with an implicit next() on an iterator


def download_many_2():
    '''Multi-threaded: download all images concurrently (not in parallel), bounded by the thread count.
    Uses concurrent.futures.ThreadPoolExecutor().
    When the called function takes several arguments, Executor.map() can be given several sequences.
    Reference: https://yuanjiang.space/threadpoolexecutor-map-method-with-multiple-parameters
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't have to pass the same down_path argument on every download call
    download_one_1_partial = partial(download_one_1, down_path)

    # Build the sequence of all linknos, numbered from 1 like the other variants
    linknos = [i for i in range(1, len(links) + 1)]

    workers = min(64, len(links))  # never create more threads in the pool than there are download tasks
    with futures.ThreadPoolExecutor(workers) as executor:
        res = executor.map(download_one_1_partial, linknos, links)  # pass several sequences to Executor.map()

    return len(list(res))


def download_many_3():
    '''Multi-threaded: download all images concurrently (not in parallel), bounded by the thread count.
    Uses concurrent.futures.ThreadPoolExecutor().
    Instead of Executor.map(), uses Executor.submit() together with concurrent.futures.as_completed().
    Executor.submit() returns a Future, whereas Executor.map() consumes Futures internally.
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't have to pass the same down_path argument on every download call
    download_one_1_partial = partial(download_one_1, down_path)

    workers = min(64, len(links))  # never create more threads in the pool than there are download tasks
    with futures.ThreadPoolExecutor(workers) as executor:
        to_do = []
        # Create and schedule the Futures
        for linkno, link in enumerate(links, 1):  # links numbered from 1
            future = executor.submit(download_one_1_partial, linkno, link)
            to_do.append(future)
            logger.debug('Scheduled for No.{} {}: {}'.format(linkno, link, future))

        results = []
        # Collect the results: futures.as_completed(to_do) takes a list of Futures and returns
        # an iterator that yields each future only once it has finished running
        for future in futures.as_completed(to_do):  # each yielded future is already done, so future.result() never blocks
            res = future.result()
            results.append(res)
            logger.debug('{} result: {!r}'.format(future, res))

    return len(results)


if __name__ == '__main__':
    t0 = time.time()
    count = download_many()
    msg = '{} flags downloaded in {:.2f} seconds.'
    logger.info(msg.format(count, time.time() - t0))
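# To time the other variants, swap download_many() above for download_many_1(),
# download_many_2() or download_many_3()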

--------------------------------------------------------------------------------
/asynchronous.py:
--------------------------------------------------------------------------------
import asyncio
import logging
import os
import sys
import time
import aiohttp
import aiofiles
import progressbar


# Root directory of this module file
basepath = os.path.abspath(os.path.dirname(__file__))

# Logging setup
logger = logging.getLogger('spider')  # create the logger instance
logger.setLevel(logging.CRITICAL)  # keep the console clean: only the summary and the progress bar
formatter = logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')  # console log and log file share one Formatter
log_path = os.path.join(basepath, 'logs')  # directory holding the log files
if not os.path.isdir(log_path):
    os.mkdir(log_path)
filename = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'  # log file named after the current time
file_handler = logging.FileHandler(os.path.join(log_path, filename), encoding='utf-8')  # create the log file handler
file_handler.setFormatter(formatter)  # set its Formatter
file_handler.setLevel(logging.DEBUG)  # separate level for the log file; comment out to use the logger's overall level
stream_handler = logging.StreamHandler()  # StreamHandler for console logging
stream_handler.setFormatter(formatter)
logger.addHandler(file_handler)  # add the handlers to the logger
logger.addHandler(stream_handler)


def setup_down_path():
    '''Set up where downloaded images are saved; all images go into one directory.'''
    down_path = os.path.join(basepath, 'downloads')
    if not os.path.isdir(down_path):
        os.mkdir(down_path)
        logger.critical('Create download path {}'.format(down_path))
    return down_path


async def get_links():
    '''Get the download links of all images.'''
    async with aiofiles.open(os.path.join(basepath, 'flags.txt')) as f:  # the image names are stored in this file, one name per line
        flags = await f.readlines()
    return ['http://192.168.40.121/flags/' + flag.strip() for flag in flags]


async def download_one(semaphore, session, image):
    logger.debug('Downloading No.{} [{}]'.format(image['linkno'], image['link']))
    t0 = time.time()

    try:
        async with semaphore:
            async with session.get(image['link']) as response:
                if response.status == 200:
                    image_content = await response.read()  # Binary Response Content: access the response body as bytes, for non-text requests
                else:
                    logger.error('received invalid response code: {}, message: {}'.format(response.status, response.reason))
                    raise aiohttp.ClientError()
    except Exception as e:
        logger.error('Exception {} raised on No.{} [{}]'.format(e.__class__, image['linkno'], image['link']))
        return False  # tell the caller of download_one() that requesting this image URL failed

    filename = os.path.split(image['link'])[1]
    async with aiofiles.open(os.path.join(image['path'], filename), 'wb') as f:
        await f.write(image_content)

    t1 = time.time()
    logger.debug('Task No.{} [{}] runs {:.2f} seconds.'.format(image['linkno'], image['link'], t1 - t0))

    return True  # tell the caller of download_one() that this image URL was fetched successfully


async def download_many():
    down_path = setup_down_path()
    links = await get_links()
    # Used to cap the number of concurrent requests
    sem = asyncio.Semaphore(min(1000, len(links)))

    async with aiohttp.ClientSession() as session:  # aiohttp recommends one session for the whole application; don't create a session per request
        successful_images = 0  # number of images fetched successfully
        failed_images = 0  # number of images whose request failed

        if len(sys.argv) > 1 and sys.argv[1] == '-v':  # verbose output
            logger.setLevel(logging.DEBUG)

            tasks = []  # list holding all the tasks
            for linkno, link in enumerate(links, 1):
                image = {
                    'path': down_path,
                    'linkno': linkno,  # image number, so the log shows which image is being downloaded
                    'link': link
                }
                task = asyncio.create_task(download_one(sem, session, image))  # asyncio.create_task() is new in Python 3.7; use asyncio.ensure_future() before that
                tasks.append(task)
            results = await asyncio.gather(*tasks)

            for result in results:
                if result:
                    successful_images += 1
                else:
                    failed_images += 1
        else:  # progress bar output
            to_do = []
            for linkno, link in enumerate(links, 1):
                image = {
                    'path': down_path,
                    'linkno': linkno,  # image number, so the log shows which image is being downloaded
                    'link': link
                }
                to_do.append(download_one(sem, session, image))

            to_do_iter = asyncio.as_completed(to_do)

            with progressbar.ProgressBar(max_value=len(to_do)) as bar:
                for i, future in enumerate(to_do_iter):
                    result = await future
                    if result:
                        successful_images += 1
                    else:
                        failed_images += 1
                    bar.update(i)

    logger.critical('Successful [{}] images, failed [{}] images'.format(successful_images, failed_images))


if __name__ == '__main__':
    t0 = time.time()
    if sys.platform != 'win32':
        import uvloop
        asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    loop = asyncio.get_event_loop()
    loop.run_until_complete(download_many())
    loop.close()
    logger.critical('Total Cost {:.2f} seconds'.format(time.time() - t0))
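# Note: on Python 3.7+, asyncio.run(download_many()) could replace the manual
# get_event_loop() / run_until_complete() / close() sequence above; setting the
# uvloop event loop policy beforehand works the same way.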

--------------------------------------------------------------------------------
/processpool.py:
--------------------------------------------------------------------------------
import time
from functools import partial
from multiprocessing import Pool
from concurrent import futures
from common import setup_down_path, get_links, download_one, download_one_1
from logger import logger


def download_many():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.apply_async()
    '''
    down_path = setup_down_path()
    links = get_links()

    p = Pool(4)  # number of processes in the pool
    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,
            'link': link
        }
        p.apply_async(download_one, args=(image,))

    logger.info('Waiting for all subprocesses done...')
    p.close()  # close the pool: no more tasks may be submitted
    p.join()  # the main process waits for every child process in the pool to finish
    logger.info('All subprocesses done.')

    return len(links)


def download_many_1():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.map(download_one, images).
    Note that Pool.map() limits download_one() to a single argument, which is why images is a list of dicts.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,
            'link': link
        }
        images.append(image)

    with Pool(4) as p:
        p.map(download_one, images)  # map the images sequence over download_one()

    logger.info('Waiting for all subprocesses done...')
    # p.close()  # not needed: Pool.map() blocks until every task is done, and the with statement tears the pool down on exit
    # p.join()
    logger.info('All subprocesses done.')

    return len(links)


def download_many_2():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.starmap(download_one_1, images), which was added in Python 3.3.
    It accepts a sequence of tuples and automatically unpacks each tuple into the function's multiple arguments.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        images.append((down_path, linkno, link))

    with Pool(4) as p:
        p.starmap(download_one_1, images)  # links numbered from 1

    logger.info('Waiting for all subprocesses done...')
    # p.close()
    # p.join()
    logger.info('All subprocesses done.')

    return len(links)


def download_many_3():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses multiprocessing.Pool.starmap(download_one_1, images), which was added in Python 3.3.
    It accepts a sequence of tuples and automatically unpacks each tuple into the function's multiple arguments.
    Since every image is saved to the same directory, functools.partial() pins that argument.
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't have to pass the same down_path argument on every download call
    download_one_1_partial = partial(download_one_1, down_path)

    images = []
    for linkno, link in enumerate(links, 1):
        images.append((linkno, link))  # each tuple no longer carries the save directory

    with Pool(4) as p:
        p.starmap(download_one_1_partial, images)  # links numbered from 1

    logger.info('Waiting for all subprocesses done...')
    # p.close()
    # p.join()
    logger.info('All subprocesses done.')

    return len(links)


def download_many_4():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses concurrent.futures.ProcessPoolExecutor().
    Executor.map() uses Futures rather than returning them: it returns an iterator whose
    __next__() method calls each Future's result(), so we get each Future's result, not the Future itself.

    Note that Executor.map() limits download_one() to a single argument, which is why images is a list of dicts.
    '''
    down_path = setup_down_path()
    links = get_links()

    images = []
    for linkno, link in enumerate(links, 1):
        image = {
            'path': down_path,
            'linkno': linkno,
            'link': link
        }
        images.append(image)

    # The with statement calls executor.__exit__(), which calls executor.shutdown(wait=True);
    # that blocks the main process until all the child processes have finished
    with futures.ProcessPoolExecutor(max_workers=16) as executor:  # without max_workers, the pool defaults to os.cpu_count() processes
        # executor.map() works like the built-in map(), except download_one() is called in parallel across multiple processes
        # Its return value res is an iterator; iterating it later yields each call's return value
        res = executor.map(download_one, images)  # pass one sequence

    return len(list(res))  # if any call raised an exception, it is re-raised here, as with an implicit next() on an iterator


def download_many_5():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses concurrent.futures.ProcessPoolExecutor().
    When the called function takes several arguments, Executor.map() can be given several sequences.
    Reference: https://yuanjiang.space/threadpoolexecutor-map-method-with-multiple-parameters
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't have to pass the same down_path argument on every download call
    download_one_1_partial = partial(download_one_1, down_path)

    # Build the sequence of all linknos, numbered from 1 like the other variants
    linknos = [i for i in range(1, len(links) + 1)]

    with futures.ProcessPoolExecutor(max_workers=16) as executor:
        res = executor.map(download_one_1_partial, linknos, links)  # pass several sequences to Executor.map()

    return len(list(res))


def download_many_6():
    '''Multi-process: download all images in parallel, bounded by the process count.
    Uses concurrent.futures.ProcessPoolExecutor().
    Instead of Executor.map(), uses Executor.submit() together with concurrent.futures.as_completed().
    Executor.submit() returns a Future, whereas Executor.map() consumes Futures internally.
    '''
    down_path = setup_down_path()
    links = get_links()

    # Pin the save path so we don't have to pass the same down_path argument on every download call
    download_one_1_partial = partial(download_one_1, down_path)

    with futures.ProcessPoolExecutor(max_workers=16) as executor:
        to_do = []
        # Create and schedule the Futures
        for linkno, link in enumerate(links, 1):  # links numbered from 1
            future = executor.submit(download_one_1_partial, linkno, link)
            to_do.append(future)
            logger.debug('Scheduled for No.{} {}: {}'.format(linkno, link, future))

        results = []
        # Collect the results: futures.as_completed(to_do) takes a list of Futures and returns
        # an iterator that yields each future only once it has finished running
        for future in futures.as_completed(to_do):  # each yielded future is already done, so future.result() never blocks
            res = future.result()
            results.append(res)
            logger.debug('{} result: {!r}'.format(future, res))

    return len(results)


if __name__ == '__main__':
    t0 = time.time()
    count = download_many_4()
    msg = '{} flags downloaded in {:.2f} seconds.'
    logger.info(msg.format(count, time.time() - t0))

--------------------------------------------------------------------------------