├── threading-tutorial-main
│   ├── threading
│   │   ├── workers
│   │   │   ├── __init__.py
│   │   │   ├── WikiWorker.py
│   │   │   ├── YahooFinanceWorkers.py
│   │   │   └── PostgresWorker.py
│   │   ├── main.py
│   │   ├── locking.py
│   │   ├── pipelines
│   │   │   └── wiki_yahoo_scraper_pipeline.yaml
│   │   └── yaml_reader.py
│   ├── requirements.txt
│   ├── multiprocessing
│   │   └── main.py
│   └── async
│       └── main.py
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/threading-tutorial-main/threading/workers/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Concurrent-and-Parallel-Programming-in-Python
Concurrent and Parallel Programming in Python, by Packt Publishing

--------------------------------------------------------------------------------
/threading-tutorial-main/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.7.4.post0
async-timeout==3.0.1
attrs==21.2.0
beautifulsoup4==4.9.3
certifi==2021.5.30
chardet==4.0.0
greenlet==1.1.0
idna==2.10
importlib-metadata==4.6.0
lxml==4.6.3
multidict==5.1.0
psycopg2-binary==2.9.1
PyYAML==5.4.1
requests==2.25.1
soupsieve==2.2.1
SQLAlchemy==1.4.20
typing-extensions==3.10.0.0
urllib3==1.26.6
yarl==1.6.3
zipp==3.5.0

--------------------------------------------------------------------------------
/threading-tutorial-main/threading/main.py:
--------------------------------------------------------------------------------
import os
import sys
import time

from yaml_reader import YamlPipelineExecutor


def main():
    pipeline_location = os.environ.get('PIPELINE_LOCATION')
    if pipeline_location is None:
        print('Pipeline location not defined')
        sys.exit(1)
    scraper_start_time = time.time()

    yamlPipelineExecutor = YamlPipelineExecutor(pipeline_location=pipeline_location)
    yamlPipelineExecutor.start()
    yamlPipelineExecutor.join()
    print('Extraction took:', round(time.time() - scraper_start_time, 1), 'seconds')


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/threading-tutorial-main/threading/locking.py:
--------------------------------------------------------------------------------
import threading


counter = 0

lock = threading.Lock()

# Equivalent manual form of the `with lock:` block used below:
# lock.acquire()
# ... critical section ...
# lock.release()


def increment():
    global counter
    for i in range(10**6):
        with lock:  # only one thread at a time may run the increment
            counter += 1


threads = []
for i in range(4):
    x = threading.Thread(target=increment)
    threads.append(x)

for t in threads:
    t.start()

for t in threads:
    t.join()

print('Counter value:', counter)


# With the lock, the increments serialize correctly:
# counter = x
# thread 1 acquires the lock
#   thread 1 reads counter = x
#   thread 1 writes counter = counter + 1 -> counter = x + 1
# thread 1 releases the lock
# thread 2 acquires the lock
#   thread 2 reads counter = x + 1
#   thread 2 writes counter = counter + 1 -> counter = x + 1 + 1
# thread 2 releases the lock
#
# counter = x + 2
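The comment walk-through above shows why the locked increments serialize correctly. For contrast, a minimal sketch of the same counter with the lock removed (not part of the course code): the unprotected `counter += 1` is a read-modify-write, so threads can interleave and lose updates, and the final count typically lands well below 4,000,000.

import threading

counter = 0

def unsafe_increment():
    global counter
    for i in range(10**6):
        counter += 1  # unprotected read-modify-write: updates can be lost

threads = [threading.Thread(target=unsafe_increment) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

print('Counter value without lock:', counter)  # usually less than 4,000,000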
--------------------------------------------------------------------------------
/threading-tutorial-main/multiprocessing/main.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool, cpu_count


def check_number_of_values_in_range(comp_list, lower, upper):
    # Count how many integers in [lower, upper) also appear in comp_list
    number_of_hits = 0
    for i in range(lower, upper):
        if i in comp_list:
            number_of_hits += 1
    return number_of_hits


if __name__ == '__main__':  # guard so worker processes can import this module safely
    comparison_list = [1, 2, 3]
    lower_and_upper_bounds = [(0, 25*10**6), (25*10**6, 50*10**6),
                              (50*10**6, 75*10**6), (75*10**6, 10**8)]

    num_cpu_to_use = max(1, cpu_count() - 1)

    print('Number of cpus being used:', num_cpu_to_use)

    prepared_list = []
    for bounds in lower_and_upper_bounds:
        prepared_list.append((comparison_list, *bounds))  # (comp_list, lower, upper)

    print('List to use as input:', prepared_list)
    with Pool(num_cpu_to_use) as mp_pool:
        # starmap unpacks each (comp_list, lower, upper) tuple into the function arguments
        result = mp_pool.starmap(check_number_of_values_in_range, prepared_list)

    print(result)
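Note that `if i in comp_list` scans the whole list on every iteration, so each worker does O(range_size * len(comp_list)) work. A hypothetical variant (not in the course code) that converts the list to a set once makes each membership test O(1) on average:

def check_number_of_values_in_range_fast(comp_list, lower, upper):
    comp_set = set(comp_list)  # build once; set membership is O(1) on average
    return sum(1 for i in range(lower, upper) if i in comp_set)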
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/threading-tutorial-main/async/main.py:
--------------------------------------------------------------------------------
import asyncio
import time

import aiohttp
import requests


async def get_url_response(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


async def main():
    urls = ['https://google.com',
            'https://wikipedia.org/wiki/Concurrency',
            'https://python.org',
            'https://pypi.org/project/requests/',
            'https://docs.python.org/3/library/asyncio-task.html',
            'https://www.apple.com/',
            'https://medium.com']

    # Baseline: fetch the pages sequentially with requests
    start = time.time()
    sync_text_response = []
    for url in urls:
        sync_text_response.append(requests.get(url).text)

    end_time = time.time()
    print('Requests time:', end_time - start)

    # Same pages fetched concurrently with aiohttp tasks
    start = time.time()
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(get_url_response(url)))

    async_text_response = await asyncio.gather(*tasks)

    end_time = time.time()
    print('Async requests time:', end_time - start)


if __name__ == '__main__':
    asyncio.run(main())
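asyncio.gather above launches all seven downloads at once, which is fine for a handful of URLs; for larger lists you would normally cap the number of in-flight requests. A minimal sketch using asyncio.Semaphore (a hypothetical addition, not part of the course code):

import asyncio
import aiohttp


async def fetch_bounded(url, session, semaphore):
    async with semaphore:  # at most max_concurrency requests in flight
        async with session.get(url) as response:
            return await response.text()


async def fetch_all(urls, max_concurrency=5):
    semaphore = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession() as session:  # one shared session for all requests
        tasks = [fetch_bounded(url, session, semaphore) for url in urls]
        return await asyncio.gather(*tasks)

Sharing one ClientSession across requests also reuses connections, which get_url_response above gives up by opening a fresh session per URL.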
--------------------------------------------------------------------------------
/threading-tutorial-main/threading/pipelines/wiki_yahoo_scraper_pipeline.yaml:
--------------------------------------------------------------------------------
queues:
  - name: SymbolQueue
    description: contains symbols to be scraped from Yahoo Finance

  - name: PostgresUploading
    description: contains data that needs to be uploaded to Postgres

workers:
  - name: WikiWorker
    description: scrapes the raw Wikipedia page and pulls out the symbols
    note: Only run one instance here, otherwise we scrape the same symbols multiple times
    location: workers.WikiWorker
    class: WikiWorkerMasterScheduler
    instances: 1  # Please don't change this, otherwise we do duplicate work, see note above
    input_values:
      - 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_queues:
      - SymbolQueue

  - name: YahooFinanceWorker
    description: pulls price data for a specific stock symbol from Yahoo Finance
    location: workers.YahooFinanceWorkers
    class: YahooFinancePriceScheduler
    instances: 2
    input_queue: SymbolQueue
    output_queues:
      - PostgresUploading

  - name: PostgresWorker
    description: takes stock data and saves it in Postgres
    location: workers.PostgresWorker
    class: PostgresMasterScheduler
    instances: 6
    input_queue: PostgresUploading

--------------------------------------------------------------------------------
/threading-tutorial-main/threading/workers/WikiWorker.py:
--------------------------------------------------------------------------------
import threading

import requests
from bs4 import BeautifulSoup


class WikiWorkerMasterScheduler(threading.Thread):
    def __init__(self, output_queue, **kwargs):
        if 'input_queue' in kwargs:
            kwargs.pop('input_queue')

        self._input_values = kwargs.pop('input_values')

        temp_queue = output_queue
        if not isinstance(temp_queue, list):
            temp_queue = [temp_queue]
        self._output_queues = temp_queue
        super().__init__(**kwargs)
        self.start()

    def run(self):
        for entry in self._input_values:
            wikiWorker = WikiWorker(entry)
            symbol_counter = 0
            # Push the first five symbols onto every output queue
            for symbol in wikiWorker.get_sp_500_companies():
                for output_queue in self._output_queues:
                    output_queue.put(symbol)
                symbol_counter += 1
                if symbol_counter >= 5:
                    break


class WikiWorker:
    def __init__(self, url):
        self._url = url

    @staticmethod
    def _extract_company_symbols(page_html):
        soup = BeautifulSoup(page_html, 'lxml')
        table = soup.find(id='constituents')
        table_rows = table.find_all('tr')
        for table_row in table_rows[1:]:  # skip the header row
            symbol = table_row.find('td').text.strip('\n')
            yield symbol

    def get_sp_500_companies(self):
        response = requests.get(self._url)
        if response.status_code != 200:
            print("Couldn't get entries")
            return  # bare return in a generator simply stops the iteration

        yield from self._extract_company_symbols(response.text)
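WikiWorker can also be exercised on its own, outside the scheduler thread. A minimal sketch (assumes network access and that you run it from the threading/ directory; it mirrors the scheduler's five-symbol cap):

from workers.WikiWorker import WikiWorker

worker = WikiWorker('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
for i, symbol in enumerate(worker.get_sp_500_companies()):
    print(symbol)
    if i >= 4:  # stop after five symbols, like WikiWorkerMasterScheduler does
        break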
--------------------------------------------------------------------------------
/threading-tutorial-main/threading/workers/YahooFinanceWorkers.py:
--------------------------------------------------------------------------------
import threading
import datetime
import random
import time
from queue import Empty

import requests
from lxml import html


class YahooFinancePriceScheduler(threading.Thread):
    def __init__(self, input_queue, output_queue, **kwargs):
        super().__init__(**kwargs)
        self._input_queue = input_queue
        temp_queue = output_queue
        if not isinstance(temp_queue, list):
            temp_queue = [temp_queue]
        self._output_queues = temp_queue
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                print('Yahoo scheduler queue is empty, stopping')
                break
            if val == 'DONE':  # sentinel pushed by the pipeline executor
                break

            yahooFinancePriceWorker = YahooFinancePriceWorker(symbol=val)
            price = yahooFinancePriceWorker.get_price()
            for output_queue in self._output_queues:
                output_values = (val, price, datetime.datetime.utcnow())
                output_queue.put(output_values)
            time.sleep(random.random())  # random jitter between requests

class YahooFinancePriceWorker:
    def __init__(self, symbol):
        self._symbol = symbol
        base_url = 'https://finance.yahoo.com/quote/'
        self._url = f'{base_url}{self._symbol}'

    def get_price(self):
        r = requests.get(self._url)
        if r.status_code != 200:
            return
        page_contents = html.fromstring(r.text)
        price_elements = page_contents.xpath('//*[@id="quote-header-info"]/div[3]/div[1]/div/span[1]')
        if not price_elements:  # page layout changed or symbol not found
            return
        raw_price = price_elements[0].text
        price = float(raw_price.replace(',', ''))
        return price

--------------------------------------------------------------------------------
/threading-tutorial-main/threading/workers/PostgresWorker.py:
--------------------------------------------------------------------------------
import os
import threading

from queue import Empty

from sqlalchemy import create_engine
from sqlalchemy.sql import text


class PostgresMasterScheduler(threading.Thread):
    def __init__(self, input_queue, **kwargs):
        if 'output_queue' in kwargs:
            kwargs.pop('output_queue')
        super().__init__(**kwargs)
        self._input_queue = input_queue
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                print('Timeout reached in postgres scheduler, stopping')
                break

            if val == 'DONE':  # sentinel pushed by the pipeline executor
                break

            symbol, price, extracted_time = val
            postgresWorker = PostgresWorker(symbol, price, extracted_time)
            postgresWorker.insert_into_db()


class PostgresWorker:
    def __init__(self, symbol, price, extracted_time):
        self._symbol = symbol
        self._price = price
        self._extracted_time = extracted_time

        self._PG_USER = os.environ.get('PG_USER')
        self._PG_PW = os.environ.get('PG_PW')
        self._PG_HOST = os.environ.get('PG_HOST')
        self._PG_DB = os.environ.get('PG_DB')

        self._engine = create_engine(f'postgresql://{self._PG_USER}:{self._PG_PW}@{self._PG_HOST}/{self._PG_DB}')

    def _create_insert_query(self):
        SQL = """INSERT INTO prices (symbol, price, extracted_time) VALUES
                 (:symbol, :price, :extracted_time)"""
        return SQL

    def insert_into_db(self):
        insert_query = self._create_insert_query()

        with self._engine.connect() as conn:
            conn.execute(text(insert_query), {'symbol': self._symbol,
                                              'price': self._price,
                                              'extracted_time': str(self._extracted_time)})

--------------------------------------------------------------------------------
/threading-tutorial-main/threading/yaml_reader.py:
--------------------------------------------------------------------------------
import importlib
import threading
import time

import yaml
from multiprocessing import Queue


class YamlPipelineExecutor(threading.Thread):
    def __init__(self, pipeline_location):
        super().__init__()
        self._pipeline_location = pipeline_location
        self._queues = {}
        self._workers = {}
        self._queue_consumers = {}
        self._downstream_queues = {}

    def _load_pipeline(self):
        with open(self._pipeline_location, 'r') as inFile:
            self._yaml_data = yaml.safe_load(inFile)

    def _initialize_queues(self):
        for queue in self._yaml_data['queues']:
            queue_name = queue['name']
            self._queues[queue_name] = Queue()

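    # _initialize_workers below turns the YAML into live threads: each worker
    # entry names a module ('location') and a class ('class'), which importlib
    # loads dynamically, so new worker types can be added without changing this
    # executor. 'instances' controls how many threads of that class start, and
    # input/output queues are looked up by name from the queues section.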
    def _initialize_workers(self):
        for worker in self._yaml_data['workers']:
            WorkerClass = getattr(importlib.import_module(worker['location']), worker['class'])
            input_queue = worker.get('input_queue')
            output_queues = worker.get('output_queues')
            worker_name = worker['name']
            num_instances = worker.get('instances', 1)

            self._downstream_queues[worker_name] = output_queues
            if input_queue is not None:
                self._queue_consumers[input_queue] = num_instances
            init_params = {
                'input_queue': self._queues[input_queue] if input_queue is not None else None,
                'output_queue': [self._queues[output_queue] for output_queue in output_queues] \
                    if output_queues is not None else None
            }

            input_values = worker.get('input_values')
            if input_values is not None:
                init_params['input_values'] = input_values

            self._workers[worker_name] = []
            for i in range(num_instances):
                self._workers[worker_name].append(WorkerClass(**init_params))

    def _join_workers(self):
        for worker_name in self._workers:
            for worker_thread in self._workers[worker_name]:
                worker_thread.join()

    def process_pipeline(self):
        self._load_pipeline()
        self._initialize_queues()
        self._initialize_workers()
        # self._join_workers()

    def run(self):
        self.process_pipeline()

        while True:
            total_workers_alive = 0
            worker_stats = []
            to_del = []
            for worker_name in self._workers:
                total_worker_threads_alive = 0
                for worker_thread in self._workers[worker_name]:
                    if worker_thread.is_alive():
                        total_worker_threads_alive += 1
                total_workers_alive += total_worker_threads_alive
                if total_worker_threads_alive == 0:
                    # All threads of this worker finished: send one 'DONE' sentinel
                    # per consumer of each downstream queue so they can stop too
                    if self._downstream_queues[worker_name] is not None:
                        for output_queue in self._downstream_queues[worker_name]:
                            number_of_consumers = self._queue_consumers[output_queue]
                            for i in range(number_of_consumers):
                                self._queues[output_queue].put('DONE')

                    to_del.append(worker_name)

                worker_stats.append([worker_name, total_worker_threads_alive])
            print(worker_stats)
            if total_workers_alive == 0:
                break

            # queue_stats = []
            # for queue in self._queues:
            #     queue_stats.append([queue, self._queues[queue].qsize()])
            #
            # print(queue_stats)

            for worker_name in to_del:
                del self._workers[worker_name]

            time.sleep(1)
--------------------------------------------------------------------------------
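To run the whole pipeline end to end, threading/main.py above reads PIPELINE_LOCATION from the environment. A minimal sketch of driving the executor directly (credentials and paths are placeholders for your environment, and a prices(symbol, price, extracted_time) table is assumed to exist):

import os

# Credentials read by PostgresWorker (placeholders)
os.environ['PG_USER'] = 'postgres'
os.environ['PG_PW'] = 'secret'
os.environ['PG_HOST'] = 'localhost'
os.environ['PG_DB'] = 'prices'

from yaml_reader import YamlPipelineExecutor

executor = YamlPipelineExecutor(pipeline_location='pipelines/wiki_yahoo_scraper_pipeline.yaml')
executor.start()
executor.join()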