├── .gitignore
├── async
│   └── main.py
├── multiprocessing
│   └── main.py
├── requirements.txt
└── threading
    ├── .env-local
    ├── locking.py
    ├── main.py
    ├── pipelines
    │   └── wiki_yahoo_scraper_pipeline.yaml
    ├── workers
    │   ├── PostgresWorker.py
    │   ├── WikiWorker.py
    │   ├── YahooFinanceWorkers.py
    │   └── __init__.py
    └── yaml_reader.py

/.gitignore:
--------------------------------------------------------------------------------
.idea
venv

*.pyc
--------------------------------------------------------------------------------
/async/main.py:
--------------------------------------------------------------------------------
import asyncio
import time
import requests
import aiohttp


async def get_url_response(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


async def main():
    urls = ['https://google.com',
            'https://wikipedia.org/wiki/Concurrency',
            'https://python.org',
            'https://pypi.org/project/requests/',
            'https://docs.python.org/3/library/asyncio-task.html',
            'https://www.apple.com/',
            'https://medium.com']

    # Fetch every URL one after the other with blocking requests calls.
    start = time.time()
    sync_text_response = []
    for url in urls:
        sync_text_response.append(requests.get(url).text)

    end_time = time.time()
    print('Requests time:', end_time - start)

    # Fetch the same URLs concurrently as aiohttp tasks.
    start = time.time()
    tasks = []
    for url in urls:
        tasks.append(asyncio.create_task(get_url_response(url)))

    async_text_response = await asyncio.gather(*tasks)

    end_time = time.time()
    print('Async requests time:', end_time - start)


if __name__ == '__main__':
    asyncio.run(main())
--------------------------------------------------------------------------------
/multiprocessing/main.py:
--------------------------------------------------------------------------------
from multiprocessing import Pool, cpu_count


def check_number_of_values_in_range(comp_list, lower, upper):
    # Count how many integers in [lower, upper) appear in comp_list.
    number_of_hits = 0
    for i in range(lower, upper):
        if i in comp_list:
            number_of_hits += 1
    return number_of_hits


if __name__ == '__main__':
    num_processes = 4
    comparison_list = [1, 2, 3]
    # Split 0..10**8 into four equally sized chunks of 25 million.
    lower_and_upper_bounds = [(0, 25*10**6), (25*10**6, 50*10**6),
                              (50*10**6, 75*10**6), (75*10**6, 10**8)]

    num_cpu_to_use = max(1, cpu_count() - 1)

    print('Number of cpus being used:', num_cpu_to_use)

    prepared_list = []
    for bounds in lower_and_upper_bounds:
        prepared_list.append((comparison_list, *bounds))

    print('List to use as input:', prepared_list)
    # starmap unpacks each (comp_list, lower, upper) tuple into the function arguments.
    # Creating the Pool under the __main__ guard keeps this safe on spawn-based
    # platforms (macOS, Windows).
    with Pool(num_cpu_to_use) as mp_pool:
        result = mp_pool.starmap(check_number_of_values_in_range, prepared_list)

    print(result)
--------------------------------------------------------------------------------
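For comparison, the same fan-out can also be written with the standard-library concurrent.futures API instead of multiprocessing.Pool. This is a minimal standalone sketch, not part of the repo; the counting function is inlined so the snippet runs on its own:

# Sketch: same work split as multiprocessing/main.py, using concurrent.futures.
from concurrent.futures import ProcessPoolExecutor


def count_values_in_range(comp_list, lower, upper):
    return sum(1 for i in range(lower, upper) if i in comp_list)


if __name__ == '__main__':
    comparison_list = [1, 2, 3]
    bounds = [(0, 25*10**6), (25*10**6, 50*10**6),
              (50*10**6, 75*10**6), (75*10**6, 10**8)]
    with ProcessPoolExecutor(max_workers=3) as executor:
        # submit() takes the arguments directly, so no tuple packing is needed.
        futures = [executor.submit(count_values_in_range, comparison_list, lower, upper)
                   for lower, upper in bounds]
        print([f.result() for f in futures])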
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.7.4.post0
async-timeout==3.0.1
attrs==21.2.0
beautifulsoup4==4.9.3
certifi==2021.5.30
chardet==4.0.0
greenlet==1.1.0
idna==2.10
importlib-metadata==4.6.0
lxml==4.6.3
multidict==5.1.0
psycopg2-binary==2.9.1
PyYAML==5.4.1
requests==2.25.1
soupsieve==2.2.1
SQLAlchemy==1.4.20
typing-extensions==3.10.0.0
urllib3==1.26.6
yarl==1.6.3
zipp==3.5.0
--------------------------------------------------------------------------------
/threading/.env-local:
--------------------------------------------------------------------------------
export PIPELINE_LOCATION='pipelines/wiki_yahoo_scraper_pipeline.yaml'
export PG_USER=''
export PG_PW=''
export PG_HOST='localhost'
export PG_DB='postgres'
--------------------------------------------------------------------------------
/threading/locking.py:
--------------------------------------------------------------------------------
import threading


counter = 0

lock = threading.Lock()

# Manual alternative to the "with lock:" context manager used below:
# lock.acquire()
# lock.release()


def increment():
    global counter
    for i in range(10**6):
        with lock:
            counter += 1
            # any other work that must happen under the lock goes here


threads = []
for i in range(4):
    x = threading.Thread(target=increment)
    threads.append(x)

for t in threads:
    t.start()

for t in threads:
    t.join()

print('Counter value:', counter)


# How the lock serialises the increments:
# counter = x
# thread 1 acquires the lock
# thread 1 reads counter = x
# thread 1 writes counter = counter + 1 -> counter = x + 1
# thread 1 releases the lock
# thread 2 acquires the lock
# thread 2 reads counter = x + 1
# thread 2 writes counter = counter + 1 -> counter = x + 1 + 1
# thread 2 releases the lock

# counter = x + 2
--------------------------------------------------------------------------------
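For contrast with locking.py above, here is a minimal sketch (not part of the repo) of the same four-thread increment without the lock. Because counter += 1 is a separate read and write, threads can interleave between the two steps and overwrite each other's updates, so the final value often comes out below 4,000,000:

import threading

counter = 0


def increment_unlocked():
    global counter
    for i in range(10**6):
        # Read-modify-write with no lock: updates from other threads can be lost.
        counter += 1


threads = [threading.Thread(target=increment_unlocked) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Expected 4,000,000; lost updates frequently make it smaller.
print('Counter value without locking:', counter)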
--------------------------------------------------------------------------------
/threading/main.py:
--------------------------------------------------------------------------------
import time
import os

from yaml_reader import YamlPipelineExecutor


def main():
    pipeline_location = os.environ.get('PIPELINE_LOCATION')
    if pipeline_location is None:
        print('Pipeline location not defined')
        exit(1)
    scraper_start_time = time.time()

    yamlPipelineExecutor = YamlPipelineExecutor(pipeline_location=pipeline_location)
    yamlPipelineExecutor.start()
    yamlPipelineExecutor.join()
    print('Extraction took:', round(time.time() - scraper_start_time, 1), 'seconds')


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/threading/pipelines/wiki_yahoo_scraper_pipeline.yaml:
--------------------------------------------------------------------------------
queues:
  - name: SymbolQueue
    description: contains symbols to be scraped from Yahoo Finance

  - name: PostgresUploading
    description: contains data that needs to be uploaded to postgres

workers:
  - name: WikiWorker
    description: scrapes the raw Wikipedia page and pulls out symbols
    note: Only run one instance here, otherwise we scrape the same symbols multiple times
    location: workers.WikiWorker
    class: WikiWorkerMasterScheduler
    instances: 1  # Please don't change this, otherwise we do duplicate work, see note above
    input_values:
      - 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    output_queues:
      - SymbolQueue

  - name: YahooFinanceWorker
    description: pulls price data for a specific stock symbol from Yahoo Finance
    location: workers.YahooFinanceWorkers
    class: YahooFinancePriceScheduler
    instances: 2
    input_queue: SymbolQueue
    output_queues:
      - PostgresUploading

  - name: PostgresWorker
    description: takes stock data and saves it in postgres
    location: workers.PostgresWorker
    class: PostgresMasterScheduler
    instances: 6
    input_queue: PostgresUploading
--------------------------------------------------------------------------------
/threading/workers/PostgresWorker.py:
--------------------------------------------------------------------------------
import os
import threading

from queue import Empty

from sqlalchemy import create_engine
from sqlalchemy.sql import text


class PostgresMasterScheduler(threading.Thread):
    def __init__(self, input_queue, **kwargs):
        # This scheduler is a sink, so the output_queue the executor passes in is dropped.
        if 'output_queue' in kwargs:
            kwargs.pop('output_queue')
        super(PostgresMasterScheduler, self).__init__(**kwargs)
        self._input_queue = input_queue
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                print('Timeout reached in postgres scheduler, stopping')
                break

            if val == 'DONE':
                break

            symbol, price, extracted_time = val
            postgresWorker = PostgresWorker(symbol, price, extracted_time)
            postgresWorker.insert_into_db()


class PostgresWorker():
    def __init__(self, symbol, price, extracted_time):
        self._symbol = symbol
        self._price = price
        self._extracted_time = extracted_time

        self._PG_USER = os.environ.get('PG_USER')
        self._PG_PW = os.environ.get('PG_PW')
        self._PG_HOST = os.environ.get('PG_HOST')
        self._PG_DB = os.environ.get('PG_DB')

        self._engine = create_engine(f'postgresql://{self._PG_USER}:{self._PG_PW}@{self._PG_HOST}/{self._PG_DB}')

    def _create_insert_query(self):
        SQL = """INSERT INTO prices (symbol, price, extracted_time) VALUES
                 (:symbol, :price, :extracted_time)"""
        return SQL

    def insert_into_db(self):
        insert_query = self._create_insert_query()

        with self._engine.connect() as conn:
            conn.execute(text(insert_query), {'symbol': self._symbol,
                                              'price': self._price,
                                              'extracted_time': str(self._extracted_time)})
--------------------------------------------------------------------------------
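The repo does not include a schema for the prices table that the INSERT above targets. The following is a minimal sketch of a compatible table created with the same SQLAlchemy setup; the column types are an assumption, not something defined anywhere in the repo. It would be run once, with the same environment variables as .env-local, before starting the pipeline:

# Sketch only: a guessed prices table matching PostgresWorker's INSERT columns.
import os

from sqlalchemy import create_engine
from sqlalchemy.sql import text

engine = create_engine(
    f"postgresql://{os.environ.get('PG_USER')}:{os.environ.get('PG_PW')}"
    f"@{os.environ.get('PG_HOST')}/{os.environ.get('PG_DB')}")

# engine.begin() opens a transaction and commits it when the block exits.
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS prices (
            symbol         TEXT,
            price          NUMERIC,
            extracted_time TIMESTAMP
        )
    """))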
--------------------------------------------------------------------------------
/threading/workers/WikiWorker.py:
--------------------------------------------------------------------------------
import threading

import requests
from bs4 import BeautifulSoup


class WikiWorkerMasterScheduler(threading.Thread):
    def __init__(self, output_queue, **kwargs):
        # This scheduler is a source, so the input_queue the executor passes in is dropped.
        if 'input_queue' in kwargs:
            kwargs.pop('input_queue')

        self._input_values = kwargs.pop('input_values')

        temp_queue = output_queue
        if not isinstance(temp_queue, list):
            temp_queue = [temp_queue]
        self._output_queues = temp_queue
        super(WikiWorkerMasterScheduler, self).__init__(**kwargs)
        self.start()

    def run(self):
        for entry in self._input_values:
            wikiWorker = WikiWorker(entry)
            symbol_counter = 0
            for symbol in wikiWorker.get_sp_500_companies():
                for output_queue in self._output_queues:
                    output_queue.put(symbol)
                symbol_counter += 1
                # Only push the first five symbols to keep the demo run short.
                if symbol_counter >= 5:
                    break


class WikiWorker():
    def __init__(self, url):
        self._url = url

    @staticmethod
    def _extract_company_symbols(page_html):
        soup = BeautifulSoup(page_html, 'lxml')
        table = soup.find(id='constituents')
        table_rows = table.find_all('tr')
        # Skip the header row, then yield the ticker from the first cell of each row.
        for table_row in table_rows[1:]:
            symbol = table_row.find('td').text.strip('\n')
            yield symbol

    def get_sp_500_companies(self):
        response = requests.get(self._url)
        if response.status_code != 200:
            print("Couldn't get entries")
            return

        yield from self._extract_company_symbols(response.text)
--------------------------------------------------------------------------------
/threading/workers/YahooFinanceWorkers.py:
--------------------------------------------------------------------------------
import threading
import datetime
import random
import time
from queue import Empty

import requests
from lxml import html


class YahooFinancePriceScheduler(threading.Thread):
    def __init__(self, input_queue, output_queue, **kwargs):
        super(YahooFinancePriceScheduler, self).__init__(**kwargs)
        self._input_queue = input_queue
        temp_queue = output_queue
        if not isinstance(temp_queue, list):
            temp_queue = [temp_queue]
        self._output_queues = temp_queue
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                print('Yahoo scheduler queue is empty, stopping')
                break
            if val == 'DONE':
                break

            yahooFinancePriceWorker = YahooFinancePriceWorker(symbol=val)
            price = yahooFinancePriceWorker.get_price()
            for output_queue in self._output_queues:
                output_values = (val, price, datetime.datetime.utcnow())
                output_queue.put(output_values)
            # Small random pause between symbols so Yahoo Finance isn't hammered.
            time.sleep(random.random())


class YahooFinancePriceWorker():
    def __init__(self, symbol):
        self._symbol = symbol
        base_url = 'https://finance.yahoo.com/quote/'
        self._url = f'{base_url}{self._symbol}'

    def get_price(self):
        r = requests.get(self._url)
        if r.status_code != 200:
            return
        page_contents = html.fromstring(r.text)
        raw_price = page_contents.xpath('//*[@id="quote-header-info"]/div[3]/div[1]/div/span[1]')[0].text
        price = float(raw_price.replace(',', ''))
        return price
--------------------------------------------------------------------------------
/threading/workers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codingwithmax/threading-tutorial/72ce51a9cfa68a292281fd6f53b54c6027b6411a/threading/workers/__init__.py
--------------------------------------------------------------------------------
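All three schedulers above follow the same contract: block on an input queue with a timeout, stop when a 'DONE' sentinel arrives, and push results to every output queue. Below is a minimal standalone sketch of that producer/consumer pattern, independent of the repo's classes; the yaml_reader that follows wires the real schedulers together this way from the pipeline file:

import threading
from queue import Queue, Empty


def consumer(input_queue, output_queues):
    while True:
        try:
            val = input_queue.get(timeout=10)
        except Empty:
            break                      # stop if nothing arrives for a while
        if val == 'DONE':
            break                      # upstream signalled there is no more work
        for output_queue in output_queues:
            output_queue.put(val * 2)  # stand-in for the real scraping/uploading work


in_q, out_q = Queue(), Queue()
worker = threading.Thread(target=consumer, args=(in_q, [out_q]))
worker.start()

for item in range(5):
    in_q.put(item)
in_q.put('DONE')  # one sentinel per consumer of the queue
worker.join()

print([out_q.get() for _ in range(out_q.qsize())])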
--------------------------------------------------------------------------------
/threading/yaml_reader.py:
--------------------------------------------------------------------------------
import importlib
import threading
import time

import yaml
from multiprocessing import Queue


class YamlPipelineExecutor(threading.Thread):
    def __init__(self, pipeline_location):
        super(YamlPipelineExecutor, self).__init__()
        self._pipeline_location = pipeline_location
        self._queues = {}
        self._workers = {}
        self._queue_consumers = {}
        self._downstream_queues = {}

    def _load_pipeline(self):
        with open(self._pipeline_location, 'r') as inFile:
            self._yaml_data = yaml.safe_load(inFile)

    def _initialize_queues(self):
        for queue in self._yaml_data['queues']:
            queue_name = queue['name']
            self._queues[queue_name] = Queue()

    def _initialize_workers(self):
        for worker in self._yaml_data['workers']:
            # Resolve the worker class from the module path and class name in the yaml.
            WorkerClass = getattr(importlib.import_module(worker['location']), worker['class'])
            input_queue = worker.get('input_queue')
            output_queues = worker.get('output_queues')
            worker_name = worker['name']
            num_instances = worker.get('instances', 1)

            self._downstream_queues[worker_name] = output_queues
            if input_queue is not None:
                self._queue_consumers[input_queue] = num_instances
            init_params = {
                'input_queue': self._queues[input_queue] if input_queue is not None else None,
                'output_queue': [self._queues[output_queue] for output_queue in output_queues]
                    if output_queues is not None else None
            }

            input_values = worker.get('input_values')
            if input_values is not None:
                init_params['input_values'] = input_values

            self._workers[worker_name] = []
            for i in range(num_instances):
                self._workers[worker_name].append(WorkerClass(**init_params))

    def _join_workers(self):
        for worker_name in self._workers:
            for worker_thread in self._workers[worker_name]:
                worker_thread.join()

    def process_pipeline(self):
        self._load_pipeline()
        self._initialize_queues()
        self._initialize_workers()
        # self._join_workers()

    def run(self):
        self.process_pipeline()

        while True:
            total_workers_alive = 0
            worker_stats = []
            to_del = []
            for worker_name in self._workers:
                total_worker_threads_alive = 0
                for worker_thread in self._workers[worker_name]:
                    if worker_thread.is_alive():
                        total_worker_threads_alive += 1
                total_workers_alive += total_worker_threads_alive
                if total_worker_threads_alive == 0:
                    # All instances of this worker are done, so tell each consumer of its
                    # downstream queues to stop by sending one 'DONE' per consumer.
                    if self._downstream_queues[worker_name] is not None:
                        for output_queue in self._downstream_queues[worker_name]:
                            number_of_consumers = self._queue_consumers[output_queue]
                            for i in range(number_of_consumers):
                                self._queues[output_queue].put('DONE')

                    to_del.append(worker_name)

                worker_stats.append([worker_name, total_worker_threads_alive])
            print(worker_stats)
            if total_workers_alive == 0:
                break

            # queue_stats = []
            # for queue in self._queues:
            #     queue_stats.append([queue, self._queues[queue].qsize()])
            #
            # print(queue_stats)

            for worker_name in to_del:
                del self._workers[worker_name]

            time.sleep(1)
--------------------------------------------------------------------------------
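To add a new stage to the pipeline, a worker only has to honour the constructor and queue contract that YamlPipelineExecutor expects: accept input_queue and output_queue keyword arguments, start itself, and stop on the 'DONE' sentinel. The following is a minimal sketch of such a worker; the class and its behaviour are hypothetical and are shown only to illustrate the contract, not something present in the repo:

# Sketch only: a hypothetical worker class showing the minimum the executor expects.
# It would be referenced from the pipeline yaml through the location / class /
# instances / input_queue keys, like the workers above.
import threading
from queue import Empty


class UpperCaseScheduler(threading.Thread):
    def __init__(self, input_queue, output_queue=None, **kwargs):
        super(UpperCaseScheduler, self).__init__(**kwargs)
        self._input_queue = input_queue
        # The executor passes either a list of queues or None.
        self._output_queues = output_queue if isinstance(output_queue, list) else [output_queue]
        # Workers start themselves; the executor only instantiates them.
        self.start()

    def run(self):
        while True:
            try:
                val = self._input_queue.get(timeout=10)
            except Empty:
                break  # nothing arrived for a while, shut down
            if val == 'DONE':
                break  # executor signalled that upstream workers are finished
            for output_queue in self._output_queues:
                if output_queue is not None:
                    output_queue.put(val.upper())  # stand-in for real work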