├── setup.cfg ├── requirements.txt ├── humans_not_invited_problem ├── __init__.py ├── re_parsers.py ├── utils.py ├── loaders.py ├── data_converters.py └── collectors.py ├── main_inference.py ├── main_collect_data.py ├── README.md ├── .gitignore ├── tags2images.json └── humans_not_invited_problem.ipynb /setup.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.26.0 2 | pandas==1.3.1 3 | pillow==8.3.1 4 | aiohttp==3.7.4.post0 -------------------------------------------------------------------------------- /humans_not_invited_problem/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Humans Not Invited Problem. 3 | """ 4 | 5 | from .collectors import run_collection_process 6 | from .collectors import get_correct_images_by_content 7 | -------------------------------------------------------------------------------- /main_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main Inference. 3 | """ 4 | import asyncio 5 | 6 | from humans_not_invited_problem import get_correct_images_by_content 7 | 8 | 9 | if __name__ == "__main__": 10 | print("Input content of the html page (finish you input with 'END'):") 11 | content = "" 12 | while True: 13 | tmp = input().replace("&", "&") 14 | if tmp == "END": 15 | break 16 | content += tmp 17 | 18 | print("Processing...") 19 | 20 | loop = asyncio.get_event_loop() 21 | loop.run_until_complete(get_correct_images_by_content(content)) 22 | loop.close() -------------------------------------------------------------------------------- /main_collect_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main Collect Data. 3 | """ 4 | import asyncio 5 | import logging 6 | 7 | from humans_not_invited_problem import run_collection_process 8 | 9 | N_ITERATIONS = 100 10 | N_PARALLEL_TASKS = 10 11 | 12 | 13 | if __name__ == "__main__": 14 | logging.basicConfig( 15 | format="%(asctime)s ~ %(name)s ~ %(levelname)s ~ %(message)s", 16 | datefmt="%Y-%m-%d %H:%M:%S", 17 | level=logging.INFO, 18 | ) 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | loop = asyncio.get_event_loop() 23 | loop.run_until_complete(run_collection_process(N_ITERATIONS, N_PARALLEL_TASKS)) 24 | loop.close() 25 | -------------------------------------------------------------------------------- /humans_not_invited_problem/re_parsers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parsers. 3 | """ 4 | import re 5 | from typing import List, Union 6 | 7 | 8 | def find_all_image_urls(content: Union[str, bytes]) -> List[str]: 9 | """Find image urls from some content. 10 | 11 | Parameters 12 | ---------- 13 | content : Union[str, bytes] 14 | The content. 15 | 16 | Returns 17 | ------- 18 | List[bytes] 19 | Image urls. 20 | """ 21 | return re.findall("captcha/image.php\\?image_name=.*?&id=.", str(content)) 22 | 23 | 24 | def find_tag(content: Union[str, bytes]) -> str: 25 | """Find the content tag. 26 | 27 | Parameters 28 | ---------- 29 | content : Union[str, bytes] 30 | The content. 31 | 32 | Returns 33 | ------- 34 | str 35 | The content tag. 36 | """ 37 | return re.findall('value="(.*?)" name="category"', str(content))[0] 38 | -------------------------------------------------------------------------------- /humans_not_invited_problem/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utils. 3 | """ 4 | import json 5 | 6 | 7 | def save_json(data: dict, filename: str = "tags2images.json") -> None: 8 | """Save dict in json format. 9 | 10 | Parameters 11 | ---------- 12 | data : dict 13 | The dict. 14 | filename : str, optional 15 | The filename to save, by default "tags2images.json" 16 | """ 17 | with open(filename, "w") as file: 18 | json.dump(data, file, indent=4) 19 | 20 | 21 | def load_json(filename: str = "tags2images.json") -> dict: 22 | """Load the dict form json file. 23 | 24 | Parameters 25 | ---------- 26 | filename : str, optional 27 | The filename to load, by default "tags2images.json" 28 | 29 | Returns 30 | ------- 31 | dict 32 | The dict. 33 | """ 34 | with open(filename, "r") as file: 35 | return json.load(file) 36 | -------------------------------------------------------------------------------- /humans_not_invited_problem/loaders.py: -------------------------------------------------------------------------------- 1 | """ 2 | Loaders. 3 | """ 4 | import asyncio 5 | from typing import List, Tuple 6 | 7 | import hashlib 8 | from aiohttp import ClientSession 9 | 10 | from .re_parsers import find_all_image_urls, find_tag 11 | 12 | 13 | BASE_URL = "http://www.humansnotinvited.com/" 14 | 15 | 16 | async def load_image(session: ClientSession, image_url: str) -> str: 17 | """Load image and get hash of this image. 18 | 19 | Parameters 20 | ---------- 21 | session : ClientSession 22 | The aiohttp.ClientSession object. 23 | image_url : str 24 | The image url. 25 | 26 | Returns 27 | ------- 28 | str 29 | The image hash. 30 | """ 31 | async with session.get(BASE_URL + image_url, allow_redirects=True) as response: 32 | data = await response.read() 33 | _hash = hashlib.md5(data).hexdigest() 34 | 35 | return _hash 36 | 37 | 38 | async def load_page(session: ClientSession) -> Tuple[str, List[str]]: 39 | """Load page and images. 40 | 41 | Parameters 42 | ---------- 43 | session : ClientSession 44 | The aiohttp.ClientSession object. 45 | 46 | Returns 47 | ------- 48 | Tuple[str, List[str]] 49 | Tag and image hashes of the page. 50 | """ 51 | async with session.get(BASE_URL, allow_redirects=True) as response: 52 | data = await response.read() 53 | 54 | image_urls = find_all_image_urls(data) 55 | tasks = [] 56 | for image_url in image_urls: 57 | tasks.append(asyncio.create_task(load_image(session, image_url))) 58 | 59 | tag = find_tag(data) 60 | return tag, await asyncio.gather(*tasks) 61 | -------------------------------------------------------------------------------- /humans_not_invited_problem/data_converters.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data Converters. 3 | """ 4 | import pandas as pd 5 | from typing import List, Tuple, Dict 6 | 7 | 8 | def create_image_tag_df(result: List[Tuple[str, List[str]]]) -> pd.DataFrame: 9 | """Convert tag - image ids pairs to frequency DataFrame. 10 | 11 | Parameters 12 | ---------- 13 | result : List[Tuple[str, List[str]]] 14 | Tag - image ids pairs. 15 | 16 | Returns 17 | ------- 18 | pd.DataFrame 19 | The frequency DataFrame. 20 | """ 21 | count_image_tag: Dict[str, Dict[str, int]] = {} 22 | for tag, image_ids in result: 23 | tmp_tag = count_image_tag.get(tag, {}) 24 | for image_id in image_ids: 25 | tmp_tag[image_id] = tmp_tag.get(image_id, 0) + 1 26 | count_image_tag[tag] = tmp_tag 27 | df = pd.DataFrame(count_image_tag) 28 | df = df.fillna(0) 29 | return df 30 | 31 | 32 | def create_tags2images_dict(df: pd.DataFrame) -> Dict[str, List[str]]: 33 | """Convert frequency DataFrame to tags2images dict. 34 | 35 | Parameters 36 | ---------- 37 | df : pd.DataFrame 38 | The frequency DataFrame. 39 | 40 | Returns 41 | ------- 42 | Dict[str, List[str]] 43 | tags2images dict. 44 | """ 45 | tags = list(df.columns) 46 | images = list(df.index) 47 | image2tag = list( 48 | df.values.argmax( 49 | axis=1, 50 | ) 51 | ) 52 | 53 | tags2images: Dict[str, List[str]] = {} 54 | for i_tag, tag in enumerate(tags): 55 | tags2images[tag] = [] 56 | for image_id, image_tag in zip(images, image2tag): 57 | if image_tag == i_tag: 58 | tags2images[tag].append(image_id) 59 | 60 | return tags2images 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Humans-Not-Invited-Problem 2 | 3 | There is solution for the [humansnotinvited problem](http://www.humansnotinvited.com/) 4 | 5 | 6 | ### Run Data Collection Process 7 | 8 | The idea is that we run the parsing many times and count how many each picture (by its hash) was found in each of the tags (man, woman, ...). Further, for each picture, by its hash, we select the category in which it was most often encountered. 9 | 10 | ```bash 11 | (venv) $ python main_collect_data.py 12 | 13 | 2021-08-08 16:14:23 ~ humans_not_invited_problem.collectors ~ INFO ~ iteration: 1/100; batch time: 4.66s; total time: 4.66s 14 | ... 15 | 2021-08-08 16:24:31 ~ humans_not_invited_problem.collectors ~ INFO ~ iteration: 100/100; batch time: 4.00s; total time: 613.00s 16 | ``` 17 | 18 | ### 19 | 20 | ```bash 21 | (venv) $ python main_inference.py 22 | Input content of the html page (finish you input with 'END'): 23 |
24 |
25 |

Select all squares with kids

26 | 27 |
28 |
29 |
30 |
31 | END 32 | Processing... 33 | GRID: 34 | | 1 | 2 | 3 | 35 | | 4 | 5 | 6 | 36 | | 7 | 8 | 9 | 37 | SELECT NEXT IMAGES: 38 | 1 39 | 3 40 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | pytestdebug.log 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | doc/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # poetry 96 | #poetry.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | # .env 110 | .env/ 111 | .venv/ 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | pythonenv* 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # pytype static type analyzer 138 | .pytype/ 139 | 140 | # operating system-related files 141 | # file properties cache/storage on macOS 142 | *.DS_Store 143 | # thumbnail cache on Windows 144 | Thumbs.db 145 | 146 | # profiling data 147 | .prof 148 | 149 | 150 | ### VisualStudioCode ### 151 | .vscode/* 152 | # !.vscode/settings.json 153 | # !.vscode/tasks.json 154 | # !.vscode/launch.json 155 | # !.vscode/extensions.json 156 | *.code-workspace 157 | 158 | ### VisualStudioCode Patch ### 159 | # Ignore all local history of files 160 | .history 161 | .ionide -------------------------------------------------------------------------------- /humans_not_invited_problem/collectors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collectors. 3 | """ 4 | import time 5 | import asyncio 6 | import logging 7 | from typing import List, Tuple, Dict 8 | 9 | import aiohttp 10 | 11 | from .re_parsers import find_all_image_urls, find_tag 12 | from .loaders import load_page, load_image 13 | from .utils import save_json, load_json 14 | from .data_converters import create_image_tag_df, create_tags2images_dict 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | async def collect_dataset( 21 | n_iterations: int, n_parallel_tasks: int 22 | ) -> List[Tuple[str, List[str]]]: 23 | """Collect tag and image hashes pairs using info from the site. 24 | 25 | Parameters 26 | ---------- 27 | n_iterations : int 28 | The number of parsing iterations. 29 | n_parallel_tasks : int 30 | The number of parallel jobs in one iteration. 31 | 32 | Returns 33 | ------- 34 | List[Tuple[str, List[str]]] 35 | The list with tags and image ids for some tag. 36 | """ 37 | result = [] 38 | 39 | time_full_start = time.time() 40 | i_iteration = 1 41 | while i_iteration <= n_iterations: 42 | try: 43 | time_current_start = time.time() 44 | 45 | async with aiohttp.ClientSession() as session: 46 | tasks = [load_page(session) for _ in range(n_parallel_tasks)] 47 | 48 | result.extend(await asyncio.gather(*tasks)) 49 | 50 | current_time = time.time() - time_current_start 51 | total_time = time.time() - time_full_start 52 | logger.info( 53 | "iteration: {}/{}; batch time: {:.2f}s; total time: {:.2f}s".format( 54 | i_iteration, n_iterations, current_time, total_time 55 | ) 56 | ) 57 | 58 | i_iteration += 1 59 | 60 | except aiohttp.ServerDisconnectedError as error: 61 | logger.warning(str(error)) 62 | time.sleep(2) 63 | 64 | return result 65 | 66 | 67 | async def run_collection_process(n_iterations: int, n_parallel_tasks: int) -> None: 68 | """Create result json with tag to images info. 69 | 70 | Parameters 71 | ---------- 72 | n_iterations : int 73 | The number of parsing iterations. 74 | n_parallel_tasks : int 75 | The number of parallel jobs in one iteration. 76 | """ 77 | result = await collect_dataset(n_iterations, n_parallel_tasks) 78 | df = create_image_tag_df(result) 79 | tags2images = create_tags2images_dict(df) 80 | save_json(tags2images) 81 | 82 | 83 | async def get_correct_images_by_content(content: str) -> None: 84 | """Get correct answer for some html content. 85 | 86 | Parameters 87 | ---------- 88 | content : str 89 | The content from html page. 90 | """ 91 | tags2images = load_json() 92 | hashes = [] 93 | new_images = find_all_image_urls(content) 94 | new_tag = find_tag(content) 95 | 96 | async with aiohttp.ClientSession() as session: 97 | for img in new_images: 98 | hashes.append(await load_image(session, img)) 99 | 100 | print("GRID:\n| 1 | 2 | 3 |\n| 4 | 5 | 6 |\n| 7 | 8 | 9 |") 101 | print("SELECT NEXT IMAGES:") 102 | for ind, _hash in enumerate(hashes, 1): 103 | if _hash in tags2images[new_tag]: 104 | print(ind) 105 | -------------------------------------------------------------------------------- /tags2images.json: -------------------------------------------------------------------------------- 1 | { 2 | "modems": [ 3 | "cb6172c088687c8c24ef33eb5b06a327", 4 | "036a4b2325c846cf4ae54db3aa71f0eb", 5 | "fc10b862fee3e4a2c499b9e6a22f9328", 6 | "dd404e60945a7c6120ad1c28eec4d081", 7 | "c95ae75e4a1b31c7bb7aa2be1a4efb19", 8 | "20b84e1cd538a6b59e77c07d17c60057", 9 | "618d706628f841d7ac1179b3ac67820d", 10 | "29145d1a4b07d49dce40fe07e253a48b", 11 | "a5334140559236b8f3adc871ddcc7d12", 12 | "3e4de0807f2da13aa588c8d21316f176" 13 | ], 14 | "ants": [ 15 | "90e0c1eeaa55a2f83ba248a1fcab64e2", 16 | "da4b5f3b2ffafe775a02cc8fac4dd4f7", 17 | "51eff24c1b1216bc385cea81d5c75339", 18 | "6e0b55efd338b4c897c6fa89e13e3d19", 19 | "6337b306b6dcb7f6dc4c2fbbd997855d", 20 | "2dab8bc4034c9f1ed8e1a838a6c40f77", 21 | "0ad7b2d8c7e43a41dc0d93a82267a107", 22 | "be4dbade5b7774cfafbd5a991a8d9ea9", 23 | "d56340a9bc0b6a0879d485cd9510319f" 24 | ], 25 | "dogs": [ 26 | "556253a7497869a7cb2c7cdc75f25c39", 27 | "572e9cbb620acab56192475f3a34c3c2", 28 | "2be54362a0516883102298c6a81eaf81", 29 | "2ecb1d36671258d95fe3ed8461c1cfc5", 30 | "141eb75d678b2ba050f87f6ce1953c0a", 31 | "e864f50d80294f79a673ef5aacd6ded9", 32 | "0df427f3046300b4e96460fedc0534ab", 33 | "458dd32956d671e9865644ecf69e039e", 34 | "eccf153759a610e9265047c54505719e" 35 | ], 36 | "computers": [ 37 | "cd5e03571f7a8283384e1746011cac88", 38 | "8054c13e08c14a2104dfd9ae8fd4c8d5", 39 | "c85ee3f8a105ae9b11cd556bb782fdc0", 40 | "e5349d1aec7f9f104a9a8446b23ce5e0", 41 | "05270f154fac925e2b8bcbfd98dc6ab6", 42 | "422e48deb8f23f6d08474b95b5c1f164", 43 | "0f0d6fd8a2accf1cd198ffba8729553e", 44 | "06f824e73b1bc0bf534ae6fdce8a3e64", 45 | "911fc5c1710c17208f4d545730e8738d" 46 | ], 47 | "cars": [ 48 | "2efb26e4a1e428e3a6c6de5b3f7aaf22", 49 | "cb4f4d6ef33e95373c0f6808368f27ab", 50 | "0e3b076a10644c730a4085fc5187935c", 51 | "3df3a42178cbedeaabbfb7751204bbcf", 52 | "5c4d3c7f7bac8601006bb85ec8874b46", 53 | "a94b016daa9687c8bb4f0f088688eb56", 54 | "ca3c9518bbb7148affaa91c515c3aedf", 55 | "c79736ff3aa43c48b42c09eb35e05441", 56 | "8962dc9e1c41a4cda499d47a0f5dbf34" 57 | ], 58 | "star_trek": [ 59 | "ee43cf8c2620cbaec7bcbcdeaa4eef09", 60 | "bf782881b5bbfe40f492690e547b0623", 61 | "d779e243dbba8ddb91401d786d7727b5", 62 | "36391d5becf638d3243ccf58398f8baa", 63 | "4a7a5980112abe57981c95fc9ba4e598", 64 | "e0b91f0e88571ef2a383595a1a662635", 65 | "f5503bc6b91f7b81c03293f8315d50f2", 66 | "43e3242ef8fc84af8b2ec4ad6a55c49e", 67 | "9eb09235c445a82f34c8a1e3b0ad439e" 68 | ], 69 | "emojis": [ 70 | "78e83e14af11ce9970b4920bab067ecc", 71 | "525939f9d5b8f753ba912af337e058fc", 72 | "2148311df7db5d52f21e3bb67403705d", 73 | "ecf036028c7d33b553e8d303a4868def", 74 | "0296678910088decd05ad76124c70bbe", 75 | "279a8e685657b2b1fd8c48cc690f6dcb", 76 | "26954c19ffb9e3940cfa830ec556a8b2", 77 | "260bf9fa369a98220233a378631d97e5", 78 | "3d715cf57c7aa4cb28a311a60ad9af26" 79 | ], 80 | "spinners": [ 81 | "22090e336974b5df755fe3d7ae86dc1d", 82 | "934e2b89726e4422679ef9613f55a0a2", 83 | "806a63fccbe06c1d027a797757134992", 84 | "04643474672c917ee53a565008ff46ac", 85 | "28b8ef17bc88536fe120cec563e5ba39", 86 | "42c07f852b1d97e5aabd8f374cfc1877", 87 | "957fe1b2088e76f3ca1015423803eae1", 88 | "508e33f90313ca499c1d70fbe1fb82a3", 89 | "c3777ef67e327e9d2739447f3eca4094" 90 | ], 91 | "speakers": [ 92 | "7c80b24c5db9877929a96c1f59c77def", 93 | "9285af651f3965517b208f78c69242b0", 94 | "da046062dcd5ed3fb241c50a0dcbfc85", 95 | "e0a75021746642bcb98cb29d9f2efa4e", 96 | "96260e267bed77509d6f1a4074d7f250", 97 | "44a58f196b755c00422ef8e55e4efd51", 98 | "d5198a9c36e2cf146b386b19432dbcd9", 99 | "1505ffd5030079f9c23a2b174a27b88f", 100 | "3a1767457ad1ff9abc6485d7850cce07" 101 | ], 102 | "man": [ 103 | "30952cb8f9115045a9d812835e3ef907", 104 | "ff150f43c5e9f1083582844c1f8564ea", 105 | "98cf20e19ad5cf665168702a71c13267", 106 | "7459d2507963f1aa382556e1726e50ef", 107 | "1f3e6dff3236d37b8bbe9faa01989289", 108 | "55a6cc4460363038e53c42afe1b6fa9c", 109 | "3a7d01e45322e9f72bfda8e22ff6a339", 110 | "bd91403836910f153397b2e9ab21aa6d", 111 | "5c96932a9bb108b5768e025d38ffa673" 112 | ], 113 | "storefronts": [ 114 | "e84320e5fe94c9ad70bc8972dbb6875c", 115 | "760292d7a2b0c01e000aaad7e0c5294c", 116 | "7524a531338e7256112a4cf8ea4f3522", 117 | "ce6be4a94fba01aa11b426e61df0bcd7", 118 | "3df2d9652dfabeb06b8ec24767aedcc5", 119 | "928fa72f6c8fed3f90269986c63b7899", 120 | "e073cb4327ce900d5dd54b42368db1e0", 121 | "990870ac3187f148d8afe39755e35fe2", 122 | "bddee930836c20903fd7f2243f23ee22" 123 | ], 124 | "selfiesticks": [ 125 | "7f205833b323c62aa1821430c7caaf50", 126 | "00f0446c4ff0a9d72008d744aee94f0b", 127 | "9e64221c2edebe5d8a2889c4f2304947", 128 | "7c69f173abc206aa4689814f044a2b04", 129 | "8e78d8d975567502dc2679b4fca5d844", 130 | "8e6641b74c94c7221ca6c17afe834a22", 131 | "421e1efda48c4ee0f30113eeb04f97be", 132 | "7eebf8a947df49c7fa7fa7f00d4128af", 133 | "dc39a9741ac1745cb5d532bed7b1af6a" 134 | ], 135 | "spiders": [ 136 | "87e625307745c75949d5cf0034e58ae5", 137 | "8d045de2442e741a891029c6774ddd58", 138 | "0fef16f23180526aa542fa78d6a35067", 139 | "cd8fd2336d89e1fbaf625cace923eb26", 140 | "eadf503d27d7032610288b9e655055f5", 141 | "b06c80739733d2e989bca594822d8904", 142 | "21fd171f9135d55f849429b45f26457e", 143 | "2c1d47482778bb6985f735711d174343", 144 | "49bf509e54bb7552f0fca706e99d2d83" 145 | ], 146 | "art": [ 147 | "6735e6f6e6fe90cbf07e8f15eb3a3165", 148 | "c585327d438bfb187819ac664a5e0000", 149 | "5bf27a7596e1a2ec9c10e08d7cd2f029", 150 | "9c1ac8e18a848c845df6c29c4350caf4", 151 | "8e69998f180a17ef80ef4c34c85210f4", 152 | "ba8a8122b20a5218118e154175e848d1", 153 | "24f884543da914f2971e2a96cf61dc1b", 154 | "5810648e0188f1d675fc2b2082af9928", 155 | "23c55ed4c4fc33583155c649adc5e6a2" 156 | ], 157 | "servers": [ 158 | "4137fefd172706594672681e4efd23da", 159 | "4d5499cfd92193bccfe1236b7053b915", 160 | "b92fd54dac08218536bfed1fda20806b", 161 | "8530c3a5251aaffb98f0e5edefa04f47", 162 | "a9081b4ec0f49ff7d7ff0c48dc4e7fcb", 163 | "c5149796f9215054de14c40ee82e2518", 164 | "2ba5f90e406fa3f731d5dfb843811e29", 165 | "261483d1d839f4560399441c74abe815", 166 | "47858cb27ac500b6c6e5f73bb4b09852" 167 | ], 168 | "hotdogs": [ 169 | "ae5864ca1409b462a45a60ca6ad97623", 170 | "58b8a1fa63d33451e13446e53764e849", 171 | "57eb2c37e1c81134950ccd34c412d070", 172 | "af1e7ecf1c6e8330bd464c957d60f73d", 173 | "a2bb7b0377e9908039786e71dde5fb3a", 174 | "3bfc2c4079c288adb529bdeeda4e862a", 175 | "cad6239f7b24b0f5623fbd028bd9a094", 176 | "e5c98fe7b4cf115e5e8d6b9f630b8707", 177 | "4907ac844e0907c56cafac16d36cb118" 178 | ], 179 | "memes": [ 180 | "fe65b76d4394cdb2beb7c18163646eb6", 181 | "630e427f575b8a3641146952f540cf46", 182 | "1ee3d053536c1fbf162ae9f0887a29de", 183 | "ca9132a6086e2f0d92ab5728de8d10d0", 184 | "87818ece8fa5182c9094925ab640145c", 185 | "95e5ac72fa04c85a028d18debbaf9ea8", 186 | "803206f9d7e180c5a2f7f3893736a85b", 187 | "34ae40fe64c5fb2825e22c81c83df6dc", 188 | "c45c46dccdf26af6e8be9bde4ea56d05" 189 | ], 190 | "kids": [ 191 | "4ef348915c95cf352b61d4a97807beca", 192 | "d8f8fbc764e22a8e59849e240481b2ba", 193 | "a9f702b43bbd73dc32a236f20fab5c3c", 194 | "c064007297426d85be103d9a919f7c4b", 195 | "d5821f2b00867f945ad721d5b9daa05f", 196 | "88bb20393792446befd90c90452188a8", 197 | "92447a64f7dffc33f1c984f87fe25d9e", 198 | "a93ee3034cccb5ee0715964e696edb8d", 199 | "ac59c6cb365c58befca1196d575378b0" 200 | ], 201 | "trafficlights": [ 202 | "2428ae6d6bf30008e362d186d78826fb", 203 | "7bb9af44981f8a7ad23de0ae87015b3b", 204 | "9c3ae4a57da6034495c9d162ba4f844d", 205 | "53603b1abd025ec7e61d0929ecc050a2", 206 | "63688aa14ada4b39e4ef717f7b47fe7a", 207 | "7a582210d5c5acfbb48829acf23ac0e4", 208 | "d6a3e8e7eceaae4a592f2340d129a7c8", 209 | "239c1a9ee7033ec91773a563e6049033", 210 | "31ce552e1d5d9ae5d280b2b8cb3b1e2a" 211 | ], 212 | "women": [ 213 | "1f4c7473c830ee3b49ea70446875251f", 214 | "cf87620a1e2df77002fc2c1a351f99bd", 215 | "1d6b18e596ee67f12a494a327200e64f", 216 | "b409eed3729efb288aac3d23c08af582", 217 | "a5331052eaa66e55dc2cafacbcf6a9b4", 218 | "7f5175ede3fc2894a3805417b3425c36", 219 | "525a6c3b5bf7b37f123bc2be793c9116", 220 | "bb6929616077b29221ae195f1b745966", 221 | "cf7ef5e506cb4a9eb5a3c90b05d81421" 222 | ], 223 | "flowers": [ 224 | "5017c57f65405adb36d8a4cd41466394", 225 | "9d6b45f05296c711fe57cb34f3e4c272", 226 | "8dd882903c63a76055d2a4a04cb0f781", 227 | "d5c0f01e7385ed22218472fbe5fee3d4", 228 | "d521fd0b26d1b7425dc78e5d957aeaf5", 229 | "4a4bed0e773b3e19157eed5ce336ac51", 230 | "9351111d39ab1a79b372225db54e3909", 231 | "f1f9f2feb4d8b9b7d07f040a04405e65", 232 | "56196e287a786afcbad079fcc0e0e7df" 233 | ], 234 | "dicks": [ 235 | "d22e1c4bab103ba4265ffbe771986488", 236 | "b107f742b89ea7e28af0785baa628698", 237 | "bcad427c01039c87463793329ca07b48", 238 | "f7ecfcec435daac5ac957a09f46c4fd8", 239 | "ad275e06bd0df907a93c36cfa9ba08ec", 240 | "8187fa12ee3f73daf56512a088d08241", 241 | "80f960ed2328243985cd1325d4b1e9a9", 242 | "36af91971824f2f9ed5e48a17135e50c", 243 | "e4af0712ed4e23f6daca7f53d1641820" 244 | ], 245 | "ewoks": [ 246 | "6343aee9b9cd229a94b7e834e6752506", 247 | "06cfe46bd63fa5c956a69deb07b81836", 248 | "d4ec8ba505a0c4142bc42eed9c50d33f", 249 | "25b44b096eb168fbacc6b46f7a85b68d", 250 | "a79152a390fe887af15174d650911e37", 251 | "03e424f7aaa5d1994c2c5da5f909e073", 252 | "868a4d4794e9ade319b9cb11926b4644", 253 | "afe9df704708a6c82b36430985653629", 254 | "e521e08e42991353666babf6edd65b85" 255 | ], 256 | "cats": [ 257 | "bc0dc9bd4b4835ea5218b81d8ffb4556", 258 | "6a9fe561069e02efa842c2e4952d249f", 259 | "9c4fa18ed0b0df89a23187b28a18d865", 260 | "6a802e3226cb6667feb6a34bc46994d5", 261 | "e26aece3fca2dabff508fa14b8a8e18c", 262 | "99fa2f5c635d6b56f12baa7e8c6f8ad8", 263 | "813680d89cf265f944193fde769cd35c", 264 | "f0729146d5115bc1ad6bfac09acf126d", 265 | "942b62ddcc871988645081746055a722" 266 | ] 267 | } -------------------------------------------------------------------------------- /humans_not_invited_problem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import time\n", 11 | "import json\n", 12 | "import asyncio\n", 13 | "import logging\n", 14 | "\n", 15 | "import requests\n", 16 | "import pandas as pd\n", 17 | "from PIL import Image\n", 18 | "import hashlib\n", 19 | "import aiohttp" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "async with aiohttp.ClientSession() as session:\n", 37 | " print(type(session))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "logging.basicConfig(\n", 47 | " format=\"%(asctime)s ~ %(name)s ~ %(levelname)s ~ %(message)s\",\n", 48 | " datefmt=\"%Y-%m-%d %H:%M:%S\",\n", 49 | " level=logging.INFO,\n", 50 | ")\n", 51 | "\n", 52 | "logger = logging.getLogger(__name__)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "BASE_URL = \"http://www.humansnotinvited.com/\"" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "def find_all_image_urls(content: str):\n", 71 | " return re.findall('captcha/image.php\\\\?image_name=.*?&id=.', str(content))\n", 72 | "\n", 73 | "\n", 74 | "def find_tag(content: str):\n", 75 | " return re.findall('value=\"(.*?)\" name=\"category\"', str(content))[0]\n", 76 | "\n", 77 | "\n", 78 | "async def load_image(session, image_url):\n", 79 | " async with session.get(BASE_URL + image_url, allow_redirects=True) as response:\n", 80 | " data = await response.read()\n", 81 | " _hash = hashlib.md5(data).hexdigest()\n", 82 | " \n", 83 | " return _hash\n", 84 | "\n", 85 | "\n", 86 | "async def load_page(session):\n", 87 | " async with session.get(BASE_URL, allow_redirects=True) as response:\n", 88 | " data = await response.read()\n", 89 | " \n", 90 | " image_urls = find_all_image_urls(data)\n", 91 | " tasks = []\n", 92 | " for image_url in image_urls:\n", 93 | " tasks.append(asyncio.create_task(load_image(session, image_url)))\n", 94 | " \n", 95 | " tag = find_tag(data)\n", 96 | " return tag, await asyncio.gather(*tasks)\n", 97 | "\n", 98 | "\n", 99 | "def create_image_tag_df(result):\n", 100 | " count_image_tag = {}\n", 101 | " for tag, image_ids in result:\n", 102 | " tmp_tag = count_image_tag.get(tag, {})\n", 103 | " for image_id in image_ids:\n", 104 | " tmp_tag[image_id] = tmp_tag.get(image_id, 0) + 1\n", 105 | " count_image_tag[tag] = tmp_tag\n", 106 | " df = pd.DataFrame(count_image_tag)\n", 107 | " df.to_csv(\"test.csv\")\n", 108 | " df = df.fillna(0)\n", 109 | " return df\n", 110 | "\n", 111 | "\n", 112 | "def create_tags2images_dict(df):\n", 113 | " tags = list(df.columns)\n", 114 | " images = list(df.index)\n", 115 | " image2tag = list(df.values.argmax(axis=1, ))\n", 116 | " \n", 117 | " tags2images = {}\n", 118 | " for i_tag, tag in enumerate(tags):\n", 119 | " tags2images[tag] = []\n", 120 | " for image_id, image_tag in zip(images, image2tag):\n", 121 | " if image_tag == i_tag:\n", 122 | " tags2images[tag].append(image_id)\n", 123 | " \n", 124 | " return tags2images\n", 125 | "\n", 126 | "\n", 127 | "def save_json(data: dict, filename: str = \"tags2images.json\") -> None:\n", 128 | " with open(filename, \"w\") as file:\n", 129 | " json.dump(data, file, indent=4)\n", 130 | " \n", 131 | "\n", 132 | "def load_json(filename: str = \"tags2images.json\") -> dict:\n", 133 | " with open(filename, \"r\") as file:\n", 134 | " return json.load(file)\n", 135 | " \n", 136 | " \n", 137 | "async def collect_dataset(n_iterations: int, n_parallel_tasks: int) -> list:\n", 138 | " result = []\n", 139 | " \n", 140 | " time_full_start = time.time()\n", 141 | " i_iteration = 1\n", 142 | " while i_iteration <= n_iterations:\n", 143 | " try:\n", 144 | " time_current_start = time.time()\n", 145 | "\n", 146 | " async with aiohttp.ClientSession() as session:\n", 147 | " tasks = [load_page(session) for _ in range(n_parallel_tasks)] \n", 148 | "\n", 149 | " result.extend(await asyncio.gather(*tasks))\n", 150 | "\n", 151 | " current_time = time.time() - time_current_start\n", 152 | " total_time = time.time() - time_full_start\n", 153 | " logger.info(\n", 154 | " \"iteration: {}/{}; batch time: {:.2f}s; total time: {:.2f}s\".format(\n", 155 | " i_iteration, n_iterations, current_time, total_time\n", 156 | " )\n", 157 | " )\n", 158 | "\n", 159 | " i_iteration += 1\n", 160 | " \n", 161 | " except aiohttp.ServerDisconnectedError as error:\n", 162 | " logger.warning(str(error))\n", 163 | " time.sleep(2)\n", 164 | " \n", 165 | " return result" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 5, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "async def run_collection_process(n_iterations: int, n_parallel_tasks: int) -> None:\n", 175 | " result = await collect_dataset(n_iterations, n_parallel_tasks)\n", 176 | " df = create_image_tag_df(result)\n", 177 | " tags2images = create_tags2images_dict(df)\n", 178 | " save_json(tags2images)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stderr", 188 | "output_type": "stream", 189 | "text": [ 190 | "2021-08-08 15:36:48 ~ __main__ ~ INFO ~ iteration: 1/100; batch time: 4.50s; total time: 4.50s\n", 191 | "2021-08-08 15:36:54 ~ __main__ ~ INFO ~ iteration: 2/100; batch time: 6.06s; total time: 10.57s\n", 192 | "2021-08-08 15:37:00 ~ __main__ ~ INFO ~ iteration: 3/100; batch time: 5.95s; total time: 16.52s\n", 193 | "2021-08-08 15:37:07 ~ __main__ ~ INFO ~ iteration: 4/100; batch time: 7.08s; total time: 23.60s\n", 194 | "2021-08-08 15:37:13 ~ __main__ ~ INFO ~ iteration: 5/100; batch time: 6.07s; total time: 29.67s\n", 195 | "2021-08-08 15:37:17 ~ __main__ ~ INFO ~ iteration: 6/100; batch time: 4.04s; total time: 33.72s\n", 196 | "2021-08-08 15:37:23 ~ __main__ ~ INFO ~ iteration: 7/100; batch time: 6.07s; total time: 39.79s\n", 197 | "2021-08-08 15:37:28 ~ __main__ ~ INFO ~ iteration: 8/100; batch time: 4.97s; total time: 44.75s\n", 198 | "2021-08-08 15:37:33 ~ __main__ ~ INFO ~ iteration: 9/100; batch time: 4.95s; total time: 49.71s\n", 199 | "2021-08-08 15:37:38 ~ __main__ ~ INFO ~ iteration: 10/100; batch time: 5.03s; total time: 54.74s\n", 200 | "2021-08-08 15:37:43 ~ __main__ ~ INFO ~ iteration: 11/100; batch time: 5.10s; total time: 59.85s\n", 201 | "2021-08-08 15:37:47 ~ __main__ ~ INFO ~ iteration: 12/100; batch time: 3.99s; total time: 63.84s\n", 202 | "2021-08-08 15:37:51 ~ __main__ ~ INFO ~ iteration: 13/100; batch time: 4.04s; total time: 67.88s\n", 203 | "2021-08-08 15:37:56 ~ __main__ ~ INFO ~ iteration: 14/100; batch time: 5.03s; total time: 72.90s\n", 204 | "2021-08-08 15:38:00 ~ __main__ ~ INFO ~ iteration: 15/100; batch time: 4.04s; total time: 76.94s\n", 205 | "2021-08-08 15:38:05 ~ __main__ ~ INFO ~ iteration: 16/100; batch time: 5.08s; total time: 82.02s\n", 206 | "2021-08-08 15:38:10 ~ __main__ ~ INFO ~ iteration: 17/100; batch time: 4.99s; total time: 87.01s\n", 207 | "2021-08-08 15:38:15 ~ __main__ ~ INFO ~ iteration: 18/100; batch time: 4.98s; total time: 92.00s\n", 208 | "2021-08-08 15:38:19 ~ __main__ ~ INFO ~ iteration: 19/100; batch time: 3.97s; total time: 95.97s\n", 209 | "2021-08-08 15:38:25 ~ __main__ ~ INFO ~ iteration: 20/100; batch time: 5.98s; total time: 101.95s\n", 210 | "2021-08-08 15:38:29 ~ __main__ ~ INFO ~ iteration: 21/100; batch time: 4.04s; total time: 105.99s\n", 211 | "2021-08-08 15:38:34 ~ __main__ ~ INFO ~ iteration: 22/100; batch time: 5.09s; total time: 111.09s\n", 212 | "2021-08-08 15:38:38 ~ __main__ ~ INFO ~ iteration: 23/100; batch time: 3.99s; total time: 115.08s\n", 213 | "2021-08-08 15:38:43 ~ __main__ ~ INFO ~ iteration: 24/100; batch time: 4.99s; total time: 120.07s\n", 214 | "2021-08-08 15:38:48 ~ __main__ ~ INFO ~ iteration: 25/100; batch time: 5.01s; total time: 125.08s\n", 215 | "2021-08-08 15:38:53 ~ __main__ ~ INFO ~ iteration: 26/100; batch time: 5.09s; total time: 130.18s\n", 216 | "2021-08-08 15:38:57 ~ __main__ ~ INFO ~ iteration: 27/100; batch time: 4.05s; total time: 134.23s\n", 217 | "2021-08-08 15:39:02 ~ __main__ ~ INFO ~ iteration: 28/100; batch time: 5.04s; total time: 139.28s\n", 218 | "2021-08-08 15:39:08 ~ __main__ ~ INFO ~ iteration: 29/100; batch time: 6.03s; total time: 145.31s\n", 219 | "2021-08-08 15:39:14 ~ __main__ ~ INFO ~ iteration: 30/100; batch time: 5.02s; total time: 150.33s\n", 220 | "2021-08-08 15:39:20 ~ __main__ ~ INFO ~ iteration: 31/100; batch time: 6.07s; total time: 156.40s\n", 221 | "2021-08-08 15:39:26 ~ __main__ ~ INFO ~ iteration: 32/100; batch time: 6.08s; total time: 162.47s\n", 222 | "2021-08-08 15:39:32 ~ __main__ ~ INFO ~ iteration: 33/100; batch time: 6.06s; total time: 168.54s\n", 223 | "2021-08-08 15:39:37 ~ __main__ ~ INFO ~ iteration: 34/100; batch time: 5.06s; total time: 173.61s\n", 224 | "2021-08-08 15:39:46 ~ __main__ ~ INFO ~ iteration: 35/100; batch time: 9.00s; total time: 182.61s\n", 225 | "2021-08-08 15:39:53 ~ __main__ ~ INFO ~ iteration: 36/100; batch time: 7.00s; total time: 189.61s\n", 226 | "2021-08-08 15:39:57 ~ __main__ ~ INFO ~ iteration: 37/100; batch time: 4.06s; total time: 193.68s\n", 227 | "2021-08-08 15:40:02 ~ __main__ ~ INFO ~ iteration: 38/100; batch time: 5.01s; total time: 198.69s\n", 228 | "2021-08-08 15:40:09 ~ __main__ ~ INFO ~ iteration: 39/100; batch time: 7.03s; total time: 205.72s\n", 229 | "2021-08-08 15:40:15 ~ __main__ ~ INFO ~ iteration: 40/100; batch time: 6.08s; total time: 211.80s\n", 230 | "2021-08-08 15:40:22 ~ __main__ ~ INFO ~ iteration: 41/100; batch time: 7.29s; total time: 219.09s\n", 231 | "2021-08-08 15:40:29 ~ __main__ ~ INFO ~ iteration: 42/100; batch time: 7.04s; total time: 226.13s\n", 232 | "2021-08-08 15:40:34 ~ __main__ ~ INFO ~ iteration: 43/100; batch time: 5.07s; total time: 231.20s\n", 233 | "2021-08-08 15:40:40 ~ __main__ ~ INFO ~ iteration: 44/100; batch time: 6.07s; total time: 237.27s\n", 234 | "2021-08-08 15:40:45 ~ __main__ ~ INFO ~ iteration: 45/100; batch time: 4.04s; total time: 241.32s\n", 235 | "2021-08-08 15:40:50 ~ __main__ ~ INFO ~ iteration: 46/100; batch time: 5.02s; total time: 246.34s\n", 236 | "2021-08-08 15:40:55 ~ __main__ ~ INFO ~ iteration: 47/100; batch time: 5.04s; total time: 251.38s\n", 237 | "2021-08-08 15:41:02 ~ __main__ ~ INFO ~ iteration: 48/100; batch time: 7.00s; total time: 258.38s\n", 238 | "2021-08-08 15:41:08 ~ __main__ ~ INFO ~ iteration: 49/100; batch time: 6.03s; total time: 264.41s\n", 239 | "2021-08-08 15:41:13 ~ __main__ ~ INFO ~ iteration: 50/100; batch time: 5.13s; total time: 269.55s\n", 240 | "2021-08-08 15:41:19 ~ __main__ ~ INFO ~ iteration: 51/100; batch time: 6.01s; total time: 275.56s\n", 241 | "2021-08-08 15:41:24 ~ __main__ ~ INFO ~ iteration: 52/100; batch time: 5.12s; total time: 280.68s\n", 242 | "2021-08-08 15:41:30 ~ __main__ ~ INFO ~ iteration: 53/100; batch time: 6.05s; total time: 286.73s\n", 243 | "2021-08-08 15:41:37 ~ __main__ ~ INFO ~ iteration: 54/100; batch time: 7.14s; total time: 293.87s\n", 244 | "2021-08-08 15:41:43 ~ __main__ ~ INFO ~ iteration: 55/100; batch time: 6.06s; total time: 299.93s\n", 245 | "2021-08-08 15:41:50 ~ __main__ ~ INFO ~ iteration: 56/100; batch time: 6.96s; total time: 306.90s\n", 246 | "2021-08-08 15:41:56 ~ __main__ ~ INFO ~ iteration: 57/100; batch time: 6.06s; total time: 312.96s\n", 247 | "2021-08-08 15:42:01 ~ __main__ ~ INFO ~ iteration: 58/100; batch time: 5.03s; total time: 317.99s\n", 248 | "2021-08-08 15:42:05 ~ __main__ ~ INFO ~ iteration: 59/100; batch time: 4.13s; total time: 322.12s\n", 249 | "2021-08-08 15:42:09 ~ __main__ ~ INFO ~ iteration: 60/100; batch time: 4.08s; total time: 326.21s\n", 250 | "2021-08-08 15:42:14 ~ __main__ ~ INFO ~ iteration: 61/100; batch time: 4.14s; total time: 330.34s\n", 251 | "2021-08-08 15:42:19 ~ __main__ ~ INFO ~ iteration: 62/100; batch time: 5.08s; total time: 335.43s\n", 252 | "2021-08-08 15:42:25 ~ __main__ ~ INFO ~ iteration: 63/100; batch time: 6.03s; total time: 341.46s\n", 253 | "2021-08-08 15:42:30 ~ __main__ ~ INFO ~ iteration: 64/100; batch time: 5.06s; total time: 346.52s\n", 254 | "2021-08-08 15:42:34 ~ __main__ ~ INFO ~ iteration: 65/100; batch time: 4.03s; total time: 350.55s\n", 255 | "2021-08-08 15:42:38 ~ __main__ ~ INFO ~ iteration: 66/100; batch time: 4.06s; total time: 354.61s\n", 256 | "2021-08-08 15:42:42 ~ __main__ ~ INFO ~ iteration: 67/100; batch time: 4.08s; total time: 358.68s\n", 257 | "2021-08-08 15:42:47 ~ __main__ ~ INFO ~ iteration: 68/100; batch time: 5.08s; total time: 363.77s\n", 258 | "2021-08-08 15:42:52 ~ __main__ ~ INFO ~ iteration: 69/100; batch time: 5.09s; total time: 368.86s\n", 259 | "2021-08-08 15:42:57 ~ __main__ ~ INFO ~ iteration: 70/100; batch time: 5.01s; total time: 373.87s\n", 260 | "2021-08-08 15:43:03 ~ __main__ ~ INFO ~ iteration: 71/100; batch time: 6.06s; total time: 379.93s\n", 261 | "2021-08-08 15:43:09 ~ __main__ ~ INFO ~ iteration: 72/100; batch time: 6.04s; total time: 385.97s\n", 262 | "2021-08-08 15:43:14 ~ __main__ ~ INFO ~ iteration: 73/100; batch time: 5.03s; total time: 391.01s\n", 263 | "2021-08-08 15:43:19 ~ __main__ ~ INFO ~ iteration: 74/100; batch time: 4.95s; total time: 395.96s\n", 264 | "2021-08-08 15:43:24 ~ __main__ ~ INFO ~ iteration: 75/100; batch time: 5.08s; total time: 401.04s\n", 265 | "2021-08-08 15:43:29 ~ __main__ ~ INFO ~ iteration: 76/100; batch time: 5.01s; total time: 406.05s\n", 266 | "2021-08-08 15:43:35 ~ __main__ ~ INFO ~ iteration: 77/100; batch time: 5.99s; total time: 412.05s\n", 267 | "2021-08-08 15:43:41 ~ __main__ ~ INFO ~ iteration: 78/100; batch time: 6.05s; total time: 418.10s\n", 268 | "2021-08-08 15:43:50 ~ __main__ ~ INFO ~ iteration: 79/100; batch time: 8.22s; total time: 426.32s\n", 269 | "2021-08-08 15:43:57 ~ __main__ ~ INFO ~ iteration: 80/100; batch time: 7.18s; total time: 433.50s\n", 270 | "2021-08-08 15:44:02 ~ __main__ ~ INFO ~ iteration: 81/100; batch time: 5.07s; total time: 438.58s\n", 271 | "2021-08-08 15:44:09 ~ __main__ ~ INFO ~ iteration: 82/100; batch time: 7.13s; total time: 445.71s\n", 272 | "2021-08-08 15:44:14 ~ __main__ ~ INFO ~ iteration: 83/100; batch time: 5.03s; total time: 450.74s\n", 273 | "2021-08-08 15:44:19 ~ __main__ ~ INFO ~ iteration: 84/100; batch time: 5.08s; total time: 455.83s\n" 274 | ] 275 | }, 276 | { 277 | "name": "stderr", 278 | "output_type": "stream", 279 | "text": [ 280 | "2021-08-08 15:44:24 ~ __main__ ~ INFO ~ iteration: 85/100; batch time: 5.03s; total time: 460.86s\n", 281 | "2021-08-08 15:44:30 ~ __main__ ~ INFO ~ iteration: 86/100; batch time: 6.06s; total time: 466.92s\n", 282 | "2021-08-08 15:44:37 ~ __main__ ~ INFO ~ iteration: 87/100; batch time: 7.06s; total time: 473.98s\n", 283 | "2021-08-08 15:44:43 ~ __main__ ~ INFO ~ iteration: 88/100; batch time: 6.03s; total time: 480.01s\n", 284 | "2021-08-08 15:44:49 ~ __main__ ~ INFO ~ iteration: 89/100; batch time: 6.04s; total time: 486.05s\n", 285 | "2021-08-08 15:44:54 ~ __main__ ~ INFO ~ iteration: 90/100; batch time: 5.11s; total time: 491.16s\n", 286 | "2021-08-08 15:45:00 ~ __main__ ~ INFO ~ iteration: 91/100; batch time: 6.01s; total time: 497.17s\n", 287 | "2021-08-08 15:45:06 ~ __main__ ~ INFO ~ iteration: 92/100; batch time: 5.15s; total time: 502.32s\n", 288 | "2021-08-08 15:45:11 ~ __main__ ~ INFO ~ iteration: 93/100; batch time: 5.03s; total time: 507.35s\n", 289 | "2021-08-08 15:45:17 ~ __main__ ~ INFO ~ iteration: 94/100; batch time: 6.01s; total time: 513.36s\n", 290 | "2021-08-08 15:45:23 ~ __main__ ~ INFO ~ iteration: 95/100; batch time: 6.07s; total time: 519.44s\n", 291 | "2021-08-08 15:45:28 ~ __main__ ~ INFO ~ iteration: 96/100; batch time: 5.04s; total time: 524.48s\n", 292 | "2021-08-08 15:45:32 ~ __main__ ~ INFO ~ iteration: 97/100; batch time: 3.98s; total time: 528.47s\n", 293 | "2021-08-08 15:45:37 ~ __main__ ~ INFO ~ iteration: 98/100; batch time: 5.01s; total time: 533.48s\n", 294 | "2021-08-08 15:45:41 ~ __main__ ~ INFO ~ iteration: 99/100; batch time: 4.03s; total time: 537.52s\n", 295 | "2021-08-08 15:45:45 ~ __main__ ~ INFO ~ iteration: 100/100; batch time: 4.00s; total time: 541.52s\n" 296 | ] 297 | } 298 | ], 299 | "source": [ 300 | "N_ITERATIONS = 100\n", 301 | "N_PARALLEL_TASKS = 10\n", 302 | "\n", 303 | "await run_collection_process(N_ITERATIONS, N_PARALLEL_TASKS)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 26, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "def find_all_image_urls(content: str):\n", 327 | " return re.findall('captcha/image.php\\\\?image_name=.*?&id=.', str(content))\n", 328 | "\n", 329 | "\n", 330 | "async def get_correct_images_by_content(content):\n", 331 | " tags2images = load_json()\n", 332 | " hashes = []\n", 333 | " new_images = find_all_image_urls(content)\n", 334 | " new_tag = find_tag(content)\n", 335 | " \n", 336 | " async with aiohttp.ClientSession() as session:\n", 337 | " for img in new_images:\n", 338 | " hashes.append(await load_image(session, img))\n", 339 | " \n", 340 | " print(\"GRID:\\n| 1 | 2 | 3 |\\n| 4 | 5 | 6 |\\n| 7 | 8 | 9 |\")\n", 341 | " print(\"SELECT NEXT IMAGES:\")\n", 342 | " for ind, _hash in enumerate(hashes, 1):\n", 343 | " if _hash in tags2images[new_tag]:\n", 344 | " print(ind)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 29, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "content = \"\"\"\n", 368 | "
\n", 369 | "\n", 370 | "
\n", 371 | "

Select all squares with spinners

\n", 372 | " \n", 373 | "
\n", 374 | "\n", 375 | "
\n", 376 | "
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\"\"
\n", 377 | "
\n", 378 | "\"\"\".replace(\"&\", \"&\")" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 30, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "GRID:\n", 391 | "| 1 | 2 | 3 |\n", 392 | "| 4 | 5 | 6 |\n", 393 | "| 7 | 8 | 9 |\n", 394 | "SELECT NEXT IMAGES:\n", 395 | "3\n", 396 | "4\n", 397 | "5\n", 398 | "6\n", 399 | "7\n", 400 | "9\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "await get_correct_images_by_content(content)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [] 421 | } 422 | ], 423 | "metadata": { 424 | "kernelspec": { 425 | "display_name": "Python 3 (ipykernel)", 426 | "language": "python", 427 | "name": "python3" 428 | }, 429 | "language_info": { 430 | "codemirror_mode": { 431 | "name": "ipython", 432 | "version": 3 433 | }, 434 | "file_extension": ".py", 435 | "mimetype": "text/x-python", 436 | "name": "python", 437 | "nbconvert_exporter": "python", 438 | "pygments_lexer": "ipython3", 439 | "version": "3.8.9" 440 | } 441 | }, 442 | "nbformat": 4, 443 | "nbformat_minor": 2 444 | } 445 | --------------------------------------------------------------------------------