├── setup.cfg
├── requirements.txt
├── humans_not_invited_problem
├── __init__.py
├── re_parsers.py
├── utils.py
├── loaders.py
├── data_converters.py
└── collectors.py
├── main_inference.py
├── main_collect_data.py
├── README.md
├── .gitignore
├── tags2images.json
└── humans_not_invited_problem.ipynb
/setup.cfg:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.26.0
2 | pandas==1.3.1
3 | pillow==8.3.1
4 | aiohttp==3.7.4.post0
--------------------------------------------------------------------------------
/humans_not_invited_problem/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Humans Not Invited Problem.
3 | """
4 |
5 | from .collectors import run_collection_process
6 | from .collectors import get_correct_images_by_content
7 |
--------------------------------------------------------------------------------
/main_inference.py:
--------------------------------------------------------------------------------
1 | """
2 | Main Inference.
3 | """
4 | import asyncio
5 |
6 | from humans_not_invited_problem import get_correct_images_by_content
7 |
8 |
if __name__ == "__main__":
    print("Input content of the html page (finish you input with 'END'):")

    # Read the pasted page line by line until the sentinel line "END"
    # (or until the input stream ends).
    lines = []
    while True:
        try:
            # NOTE(review): the previous replace("&", "&") was a no-op;
            # presumably the pasted markup carries "&amp;" inside the image
            # urls, which has to become "&" for the url pattern to match.
            line = input().replace("&amp;", "&")
        except EOFError:
            break
        if line == "END":
            break
        lines.append(line)
    # Lines are joined without separators, exactly as before.
    content = "".join(lines)

    print("Processing...")

    # asyncio.run creates the event loop, runs the coroutine and closes the
    # loop afterwards.
    asyncio.run(get_correct_images_by_content(content))
--------------------------------------------------------------------------------
/main_collect_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Main Collect Data.
3 | """
4 | import asyncio
5 | import logging
6 |
7 | from humans_not_invited_problem import run_collection_process
8 |
# Number of parsing iterations passed to the collection process.
N_ITERATIONS = 100
# Number of pages requested in parallel within one iteration.
N_PARALLEL_TASKS = 10


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s ~ %(name)s ~ %(levelname)s ~ %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
    )

    logger = logging.getLogger(__name__)

    # asyncio.run creates the event loop, runs the coroutine and closes the
    # loop afterwards.
    asyncio.run(run_collection_process(N_ITERATIONS, N_PARALLEL_TASKS))
25 |
--------------------------------------------------------------------------------
/humans_not_invited_problem/re_parsers.py:
--------------------------------------------------------------------------------
1 | """
2 | Parsers.
3 | """
4 | import re
5 | from typing import List, Union
6 |
7 |
def find_all_image_urls(content: Union[str, bytes]) -> List[str]:
    """Find image urls from some content.

    Parameters
    ----------
    content : Union[str, bytes]
        The content. Bytes are decoded as UTF-8 (undecodable bytes are
        replaced) so that they are searched exactly like a ``str``.

    Returns
    -------
    List[str]
        Relative image urls, in order of appearance.
    """
    # str(bytes) would search the repr (``b'...'`` with escaped newlines),
    # which lets a match run across line breaks; decode instead.
    if isinstance(content, bytes):
        text = content.decode("utf-8", errors="replace")
    else:
        text = str(content)
    # The trailing "." takes exactly one character after "id=".
    return re.findall(r"captcha/image\.php\?image_name=.*?&id=.", text)
22 |
23 |
def find_tag(content: Union[str, bytes]) -> str:
    """Find the content tag.

    Parameters
    ----------
    content : Union[str, bytes]
        The content. Bytes are decoded as UTF-8 (undecodable bytes are
        replaced) so that they are searched exactly like a ``str``.

    Returns
    -------
    str
        The content tag: the ``value`` attribute that directly precedes
        ``name="category"``.

    Raises
    ------
    IndexError
        If the content has no such attribute pair.
    """
    # str(bytes) would search the repr, where newlines are escaped and the
    # lazy group can swallow an earlier ``value="`` from another line.
    if isinstance(content, bytes):
        text = content.decode("utf-8", errors="replace")
    else:
        text = str(content)
    matches = re.findall(r'value="(.*?)" name="category"', text)
    return matches[0]
38 |
--------------------------------------------------------------------------------
/humans_not_invited_problem/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utils.
3 | """
4 | import json
5 |
6 |
def save_json(data: dict, filename: str = "tags2images.json") -> None:
    """Write a dict to disk as json indented by four spaces.

    Parameters
    ----------
    data : dict
        The dict to serialize.
    filename : str, optional
        Path of the file to write, by default "tags2images.json"
    """
    with open(filename, "w") as out:
        out.write(json.dumps(data, indent=4))
19 |
20 |
def load_json(filename: str = "tags2images.json") -> dict:
    """Read a json file and return its parsed content.

    Parameters
    ----------
    filename : str, optional
        Path of the file to read, by default "tags2images.json"

    Returns
    -------
    dict
        The parsed json content.
    """
    with open(filename, "r") as src:
        raw = src.read()
    return json.loads(raw)
36 |
--------------------------------------------------------------------------------
/humans_not_invited_problem/loaders.py:
--------------------------------------------------------------------------------
1 | """
2 | Loaders.
3 | """
4 | import asyncio
5 | from typing import List, Tuple
6 |
7 | import hashlib
8 | from aiohttp import ClientSession
9 |
10 | from .re_parsers import find_all_image_urls, find_tag
11 |
12 |
# Root url of the site: the page itself is requested from it and the relative
# image urls found on the page are appended to it.
BASE_URL = "http://www.humansnotinvited.com/"
14 |
15 |
async def load_image(session: ClientSession, image_url: str) -> str:
    """Download one image and return the MD5 hex digest of its bytes.

    Parameters
    ----------
    session : ClientSession
        The aiohttp.ClientSession object used for the request.
    image_url : str
        The image url, relative to ``BASE_URL``.

    Returns
    -------
    str
        The image hash.
    """
    full_url = BASE_URL + image_url
    async with session.get(full_url, allow_redirects=True) as response:
        payload = await response.read()
        digest = hashlib.md5(payload).hexdigest()
    return digest
36 |
37 |
async def load_page(session: ClientSession) -> Tuple[str, List[str]]:
    """Load page and images.

    Parameters
    ----------
    session : ClientSession
        The aiohttp.ClientSession object.

    Returns
    -------
    Tuple[str, List[str]]
        Tag and image hashes of the page, hashes in the order the image
        urls appear on the page.
    """
    async with session.get(BASE_URL, allow_redirects=True) as response:
        data = await response.read()

    # Parse the tag before starting any download: find_tag raises when the
    # page has no tag, and failing here leaves no scheduled tasks behind.
    tag = find_tag(data)
    image_urls = find_all_image_urls(data)

    # gather runs the downloads concurrently and keeps the input order.
    hashes = await asyncio.gather(
        *(load_image(session, image_url) for image_url in image_urls)
    )
    return tag, hashes
61 |
--------------------------------------------------------------------------------
/humans_not_invited_problem/data_converters.py:
--------------------------------------------------------------------------------
1 | """
2 | Data Converters.
3 | """
4 | import pandas as pd
5 | from typing import List, Tuple, Dict
6 |
7 |
def create_image_tag_df(result: List[Tuple[str, List[str]]]) -> pd.DataFrame:
    """Build a frequency table from tag - image ids pairs.

    Parameters
    ----------
    result : List[Tuple[str, List[str]]]
        Tag - image ids pairs.

    Returns
    -------
    pd.DataFrame
        The frequency DataFrame: one column per tag, one row per image id,
        each cell counting how often the image was seen with the tag
        (0 where the pair never occurred).
    """
    counts: Dict[str, Dict[str, int]] = {}
    for tag, image_ids in result:
        # One counter dict per tag, created on first sight of the tag.
        per_tag = counts.setdefault(tag, {})
        for image_id in image_ids:
            if image_id in per_tag:
                per_tag[image_id] += 1
            else:
                per_tag[image_id] = 1
    return pd.DataFrame(counts).fillna(0)
30 |
31 |
def create_tags2images_dict(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Convert frequency DataFrame to tags2images dict.

    Every image (row) is assigned to the tag (column) holding its largest
    count; on a tie the first such column wins.

    Parameters
    ----------
    df : pd.DataFrame
        The frequency DataFrame.

    Returns
    -------
    Dict[str, List[str]]
        tags2images dict. Every tag of the frame is a key, its images are
        listed in row order. A frame without columns gives an empty dict.
    """
    tags = list(df.columns)
    tags2images: Dict[str, List[str]] = {tag: [] for tag in tags}
    if not tags:
        # argmax over an empty axis raises ValueError; nothing to assign.
        return tags2images

    best_tag_positions = df.values.argmax(axis=1)
    # Single pass over the rows instead of rescanning all rows per tag.
    for image_id, tag_position in zip(list(df.index), best_tag_positions):
        tags2images[tags[tag_position]].append(image_id)

    return tags2images
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Humans-Not-Invited-Problem
2 |
3 | This is a solution for the [humansnotinvited problem](http://www.humansnotinvited.com/).
4 |
5 |
6 | ### Run Data Collection Process
7 |
8 | The idea is to run the parsing many times and count how many times each picture (identified by its hash) was found under each of the tags (man, woman, ...). Then, for each picture, we select the tag under which it was encountered most often.
9 |
10 | ```bash
11 | (venv) $ python main_collect_data.py
12 |
13 | 2021-08-08 16:14:23 ~ humans_not_invited_problem.collectors ~ INFO ~ iteration: 1/100; batch time: 4.66s; total time: 4.66s
14 | ...
15 | 2021-08-08 16:24:31 ~ humans_not_invited_problem.collectors ~ INFO ~ iteration: 100/100; batch time: 4.00s; total time: 613.00s
16 | ```
17 |
18 | ### Run Inference
19 |
20 | ```bash
21 | (venv) $ python main_inference.py
22 | Input content of the html page (finish you input with 'END'):
23 |
31 | END
32 | Processing...
33 | GRID:
34 | | 1 | 2 | 3 |
35 | | 4 | 5 | 6 |
36 | | 7 | 8 | 9 |
37 | SELECT NEXT IMAGES:
38 | 1
39 | 3
40 | ```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python ###
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | parts/
19 | sdist/
20 | var/
21 | wheels/
22 | pip-wheel-metadata/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | pytestdebug.log
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 | doc/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # poetry
96 | #poetry.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | # .env
110 | .env/
111 | .venv/
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 | pythonenv*
118 |
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 |
123 | # Rope project settings
124 | .ropeproject
125 |
126 | # mkdocs documentation
127 | /site
128 |
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 |
134 | # Pyre type checker
135 | .pyre/
136 |
137 | # pytype static type analyzer
138 | .pytype/
139 |
140 | # operating system-related files
141 | # file properties cache/storage on macOS
142 | *.DS_Store
143 | # thumbnail cache on Windows
144 | Thumbs.db
145 |
146 | # profiling data
147 | .prof
148 |
149 |
150 | ### VisualStudioCode ###
151 | .vscode/*
152 | # !.vscode/settings.json
153 | # !.vscode/tasks.json
154 | # !.vscode/launch.json
155 | # !.vscode/extensions.json
156 | *.code-workspace
157 |
158 | ### VisualStudioCode Patch ###
159 | # Ignore all local history of files
160 | .history
161 | .ionide
--------------------------------------------------------------------------------
/humans_not_invited_problem/collectors.py:
--------------------------------------------------------------------------------
1 | """
2 | Collectors.
3 | """
4 | import time
5 | import asyncio
6 | import logging
7 | from typing import List, Tuple, Dict
8 |
9 | import aiohttp
10 |
11 | from .re_parsers import find_all_image_urls, find_tag
12 | from .loaders import load_page, load_image
13 | from .utils import save_json, load_json
14 | from .data_converters import create_image_tag_df, create_tags2images_dict
15 |
16 |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
18 |
19 |
async def collect_dataset(
    n_iterations: int, n_parallel_tasks: int
) -> List[Tuple[str, List[str]]]:
    """Collect tag and image hashes pairs using info from the site.

    An iteration that fails with ``aiohttp.ServerDisconnectedError`` is
    logged and repeated after a two second pause; it does not count towards
    ``n_iterations``.

    Parameters
    ----------
    n_iterations : int
        The number of parsing iterations.
    n_parallel_tasks : int
        The number of parallel jobs in one iteration.

    Returns
    -------
    List[Tuple[str, List[str]]]
        The list with tags and image ids for some tag.
    """
    result: List[Tuple[str, List[str]]] = []

    # Monotonic clock: only elapsed durations are reported.
    time_full_start = time.monotonic()
    i_iteration = 1
    while i_iteration <= n_iterations:
        time_current_start = time.monotonic()

        try:
            # A fresh session is opened for every iteration.
            async with aiohttp.ClientSession() as session:
                tasks = [load_page(session) for _ in range(n_parallel_tasks)]
                result.extend(await asyncio.gather(*tasks))
        except aiohttp.ServerDisconnectedError as error:
            logger.warning(str(error))
            # Non-blocking pause: time.sleep would stall the event loop.
            await asyncio.sleep(2)
            continue

        current_time = time.monotonic() - time_current_start
        total_time = time.monotonic() - time_full_start
        logger.info(
            "iteration: {}/{}; batch time: {:.2f}s; total time: {:.2f}s".format(
                i_iteration, n_iterations, current_time, total_time
            )
        )

        i_iteration += 1

    return result
65 |
66 |
async def run_collection_process(n_iterations: int, n_parallel_tasks: int) -> None:
    """Collect the dataset and save the tag to images mapping as json.

    Parameters
    ----------
    n_iterations : int
        The number of parsing iterations.
    n_parallel_tasks : int
        The number of parallel jobs in one iteration.
    """
    collected = await collect_dataset(n_iterations, n_parallel_tasks)
    frequency_df = create_image_tag_df(collected)
    save_json(create_tags2images_dict(frequency_df))
81 |
82 |
async def get_correct_images_by_content(content: str) -> None:
    """Get correct answer for some html content.

    Prints the 3x3 grid legend followed by the 1-based positions of the
    images whose hash is stored under the tag found in the content.

    Parameters
    ----------
    content : str
        The content from html page.

    Raises
    ------
    IndexError
        If no tag is found in the content.
    KeyError
        If the tag is missing from the stored tags2images mapping.
    """
    tags2images = load_json()
    new_images = find_all_image_urls(content)
    new_tag = find_tag(content)

    # Looked up once, before any download, so an unknown tag fails without
    # network traffic; a set keeps the membership tests below O(1).
    correct_hashes = set(tags2images[new_tag])

    async with aiohttp.ClientSession() as session:
        # gather downloads the images concurrently and keeps their order.
        hashes = await asyncio.gather(
            *(load_image(session, img) for img in new_images)
        )

    print("GRID:\n| 1 | 2 | 3 |\n| 4 | 5 | 6 |\n| 7 | 8 | 9 |")
    print("SELECT NEXT IMAGES:")
    for ind, _hash in enumerate(hashes, 1):
        if _hash in correct_hashes:
            print(ind)
105 |
--------------------------------------------------------------------------------
/tags2images.json:
--------------------------------------------------------------------------------
1 | {
2 | "modems": [
3 | "cb6172c088687c8c24ef33eb5b06a327",
4 | "036a4b2325c846cf4ae54db3aa71f0eb",
5 | "fc10b862fee3e4a2c499b9e6a22f9328",
6 | "dd404e60945a7c6120ad1c28eec4d081",
7 | "c95ae75e4a1b31c7bb7aa2be1a4efb19",
8 | "20b84e1cd538a6b59e77c07d17c60057",
9 | "618d706628f841d7ac1179b3ac67820d",
10 | "29145d1a4b07d49dce40fe07e253a48b",
11 | "a5334140559236b8f3adc871ddcc7d12",
12 | "3e4de0807f2da13aa588c8d21316f176"
13 | ],
14 | "ants": [
15 | "90e0c1eeaa55a2f83ba248a1fcab64e2",
16 | "da4b5f3b2ffafe775a02cc8fac4dd4f7",
17 | "51eff24c1b1216bc385cea81d5c75339",
18 | "6e0b55efd338b4c897c6fa89e13e3d19",
19 | "6337b306b6dcb7f6dc4c2fbbd997855d",
20 | "2dab8bc4034c9f1ed8e1a838a6c40f77",
21 | "0ad7b2d8c7e43a41dc0d93a82267a107",
22 | "be4dbade5b7774cfafbd5a991a8d9ea9",
23 | "d56340a9bc0b6a0879d485cd9510319f"
24 | ],
25 | "dogs": [
26 | "556253a7497869a7cb2c7cdc75f25c39",
27 | "572e9cbb620acab56192475f3a34c3c2",
28 | "2be54362a0516883102298c6a81eaf81",
29 | "2ecb1d36671258d95fe3ed8461c1cfc5",
30 | "141eb75d678b2ba050f87f6ce1953c0a",
31 | "e864f50d80294f79a673ef5aacd6ded9",
32 | "0df427f3046300b4e96460fedc0534ab",
33 | "458dd32956d671e9865644ecf69e039e",
34 | "eccf153759a610e9265047c54505719e"
35 | ],
36 | "computers": [
37 | "cd5e03571f7a8283384e1746011cac88",
38 | "8054c13e08c14a2104dfd9ae8fd4c8d5",
39 | "c85ee3f8a105ae9b11cd556bb782fdc0",
40 | "e5349d1aec7f9f104a9a8446b23ce5e0",
41 | "05270f154fac925e2b8bcbfd98dc6ab6",
42 | "422e48deb8f23f6d08474b95b5c1f164",
43 | "0f0d6fd8a2accf1cd198ffba8729553e",
44 | "06f824e73b1bc0bf534ae6fdce8a3e64",
45 | "911fc5c1710c17208f4d545730e8738d"
46 | ],
47 | "cars": [
48 | "2efb26e4a1e428e3a6c6de5b3f7aaf22",
49 | "cb4f4d6ef33e95373c0f6808368f27ab",
50 | "0e3b076a10644c730a4085fc5187935c",
51 | "3df3a42178cbedeaabbfb7751204bbcf",
52 | "5c4d3c7f7bac8601006bb85ec8874b46",
53 | "a94b016daa9687c8bb4f0f088688eb56",
54 | "ca3c9518bbb7148affaa91c515c3aedf",
55 | "c79736ff3aa43c48b42c09eb35e05441",
56 | "8962dc9e1c41a4cda499d47a0f5dbf34"
57 | ],
58 | "star_trek": [
59 | "ee43cf8c2620cbaec7bcbcdeaa4eef09",
60 | "bf782881b5bbfe40f492690e547b0623",
61 | "d779e243dbba8ddb91401d786d7727b5",
62 | "36391d5becf638d3243ccf58398f8baa",
63 | "4a7a5980112abe57981c95fc9ba4e598",
64 | "e0b91f0e88571ef2a383595a1a662635",
65 | "f5503bc6b91f7b81c03293f8315d50f2",
66 | "43e3242ef8fc84af8b2ec4ad6a55c49e",
67 | "9eb09235c445a82f34c8a1e3b0ad439e"
68 | ],
69 | "emojis": [
70 | "78e83e14af11ce9970b4920bab067ecc",
71 | "525939f9d5b8f753ba912af337e058fc",
72 | "2148311df7db5d52f21e3bb67403705d",
73 | "ecf036028c7d33b553e8d303a4868def",
74 | "0296678910088decd05ad76124c70bbe",
75 | "279a8e685657b2b1fd8c48cc690f6dcb",
76 | "26954c19ffb9e3940cfa830ec556a8b2",
77 | "260bf9fa369a98220233a378631d97e5",
78 | "3d715cf57c7aa4cb28a311a60ad9af26"
79 | ],
80 | "spinners": [
81 | "22090e336974b5df755fe3d7ae86dc1d",
82 | "934e2b89726e4422679ef9613f55a0a2",
83 | "806a63fccbe06c1d027a797757134992",
84 | "04643474672c917ee53a565008ff46ac",
85 | "28b8ef17bc88536fe120cec563e5ba39",
86 | "42c07f852b1d97e5aabd8f374cfc1877",
87 | "957fe1b2088e76f3ca1015423803eae1",
88 | "508e33f90313ca499c1d70fbe1fb82a3",
89 | "c3777ef67e327e9d2739447f3eca4094"
90 | ],
91 | "speakers": [
92 | "7c80b24c5db9877929a96c1f59c77def",
93 | "9285af651f3965517b208f78c69242b0",
94 | "da046062dcd5ed3fb241c50a0dcbfc85",
95 | "e0a75021746642bcb98cb29d9f2efa4e",
96 | "96260e267bed77509d6f1a4074d7f250",
97 | "44a58f196b755c00422ef8e55e4efd51",
98 | "d5198a9c36e2cf146b386b19432dbcd9",
99 | "1505ffd5030079f9c23a2b174a27b88f",
100 | "3a1767457ad1ff9abc6485d7850cce07"
101 | ],
102 | "man": [
103 | "30952cb8f9115045a9d812835e3ef907",
104 | "ff150f43c5e9f1083582844c1f8564ea",
105 | "98cf20e19ad5cf665168702a71c13267",
106 | "7459d2507963f1aa382556e1726e50ef",
107 | "1f3e6dff3236d37b8bbe9faa01989289",
108 | "55a6cc4460363038e53c42afe1b6fa9c",
109 | "3a7d01e45322e9f72bfda8e22ff6a339",
110 | "bd91403836910f153397b2e9ab21aa6d",
111 | "5c96932a9bb108b5768e025d38ffa673"
112 | ],
113 | "storefronts": [
114 | "e84320e5fe94c9ad70bc8972dbb6875c",
115 | "760292d7a2b0c01e000aaad7e0c5294c",
116 | "7524a531338e7256112a4cf8ea4f3522",
117 | "ce6be4a94fba01aa11b426e61df0bcd7",
118 | "3df2d9652dfabeb06b8ec24767aedcc5",
119 | "928fa72f6c8fed3f90269986c63b7899",
120 | "e073cb4327ce900d5dd54b42368db1e0",
121 | "990870ac3187f148d8afe39755e35fe2",
122 | "bddee930836c20903fd7f2243f23ee22"
123 | ],
124 | "selfiesticks": [
125 | "7f205833b323c62aa1821430c7caaf50",
126 | "00f0446c4ff0a9d72008d744aee94f0b",
127 | "9e64221c2edebe5d8a2889c4f2304947",
128 | "7c69f173abc206aa4689814f044a2b04",
129 | "8e78d8d975567502dc2679b4fca5d844",
130 | "8e6641b74c94c7221ca6c17afe834a22",
131 | "421e1efda48c4ee0f30113eeb04f97be",
132 | "7eebf8a947df49c7fa7fa7f00d4128af",
133 | "dc39a9741ac1745cb5d532bed7b1af6a"
134 | ],
135 | "spiders": [
136 | "87e625307745c75949d5cf0034e58ae5",
137 | "8d045de2442e741a891029c6774ddd58",
138 | "0fef16f23180526aa542fa78d6a35067",
139 | "cd8fd2336d89e1fbaf625cace923eb26",
140 | "eadf503d27d7032610288b9e655055f5",
141 | "b06c80739733d2e989bca594822d8904",
142 | "21fd171f9135d55f849429b45f26457e",
143 | "2c1d47482778bb6985f735711d174343",
144 | "49bf509e54bb7552f0fca706e99d2d83"
145 | ],
146 | "art": [
147 | "6735e6f6e6fe90cbf07e8f15eb3a3165",
148 | "c585327d438bfb187819ac664a5e0000",
149 | "5bf27a7596e1a2ec9c10e08d7cd2f029",
150 | "9c1ac8e18a848c845df6c29c4350caf4",
151 | "8e69998f180a17ef80ef4c34c85210f4",
152 | "ba8a8122b20a5218118e154175e848d1",
153 | "24f884543da914f2971e2a96cf61dc1b",
154 | "5810648e0188f1d675fc2b2082af9928",
155 | "23c55ed4c4fc33583155c649adc5e6a2"
156 | ],
157 | "servers": [
158 | "4137fefd172706594672681e4efd23da",
159 | "4d5499cfd92193bccfe1236b7053b915",
160 | "b92fd54dac08218536bfed1fda20806b",
161 | "8530c3a5251aaffb98f0e5edefa04f47",
162 | "a9081b4ec0f49ff7d7ff0c48dc4e7fcb",
163 | "c5149796f9215054de14c40ee82e2518",
164 | "2ba5f90e406fa3f731d5dfb843811e29",
165 | "261483d1d839f4560399441c74abe815",
166 | "47858cb27ac500b6c6e5f73bb4b09852"
167 | ],
168 | "hotdogs": [
169 | "ae5864ca1409b462a45a60ca6ad97623",
170 | "58b8a1fa63d33451e13446e53764e849",
171 | "57eb2c37e1c81134950ccd34c412d070",
172 | "af1e7ecf1c6e8330bd464c957d60f73d",
173 | "a2bb7b0377e9908039786e71dde5fb3a",
174 | "3bfc2c4079c288adb529bdeeda4e862a",
175 | "cad6239f7b24b0f5623fbd028bd9a094",
176 | "e5c98fe7b4cf115e5e8d6b9f630b8707",
177 | "4907ac844e0907c56cafac16d36cb118"
178 | ],
179 | "memes": [
180 | "fe65b76d4394cdb2beb7c18163646eb6",
181 | "630e427f575b8a3641146952f540cf46",
182 | "1ee3d053536c1fbf162ae9f0887a29de",
183 | "ca9132a6086e2f0d92ab5728de8d10d0",
184 | "87818ece8fa5182c9094925ab640145c",
185 | "95e5ac72fa04c85a028d18debbaf9ea8",
186 | "803206f9d7e180c5a2f7f3893736a85b",
187 | "34ae40fe64c5fb2825e22c81c83df6dc",
188 | "c45c46dccdf26af6e8be9bde4ea56d05"
189 | ],
190 | "kids": [
191 | "4ef348915c95cf352b61d4a97807beca",
192 | "d8f8fbc764e22a8e59849e240481b2ba",
193 | "a9f702b43bbd73dc32a236f20fab5c3c",
194 | "c064007297426d85be103d9a919f7c4b",
195 | "d5821f2b00867f945ad721d5b9daa05f",
196 | "88bb20393792446befd90c90452188a8",
197 | "92447a64f7dffc33f1c984f87fe25d9e",
198 | "a93ee3034cccb5ee0715964e696edb8d",
199 | "ac59c6cb365c58befca1196d575378b0"
200 | ],
201 | "trafficlights": [
202 | "2428ae6d6bf30008e362d186d78826fb",
203 | "7bb9af44981f8a7ad23de0ae87015b3b",
204 | "9c3ae4a57da6034495c9d162ba4f844d",
205 | "53603b1abd025ec7e61d0929ecc050a2",
206 | "63688aa14ada4b39e4ef717f7b47fe7a",
207 | "7a582210d5c5acfbb48829acf23ac0e4",
208 | "d6a3e8e7eceaae4a592f2340d129a7c8",
209 | "239c1a9ee7033ec91773a563e6049033",
210 | "31ce552e1d5d9ae5d280b2b8cb3b1e2a"
211 | ],
212 | "women": [
213 | "1f4c7473c830ee3b49ea70446875251f",
214 | "cf87620a1e2df77002fc2c1a351f99bd",
215 | "1d6b18e596ee67f12a494a327200e64f",
216 | "b409eed3729efb288aac3d23c08af582",
217 | "a5331052eaa66e55dc2cafacbcf6a9b4",
218 | "7f5175ede3fc2894a3805417b3425c36",
219 | "525a6c3b5bf7b37f123bc2be793c9116",
220 | "bb6929616077b29221ae195f1b745966",
221 | "cf7ef5e506cb4a9eb5a3c90b05d81421"
222 | ],
223 | "flowers": [
224 | "5017c57f65405adb36d8a4cd41466394",
225 | "9d6b45f05296c711fe57cb34f3e4c272",
226 | "8dd882903c63a76055d2a4a04cb0f781",
227 | "d5c0f01e7385ed22218472fbe5fee3d4",
228 | "d521fd0b26d1b7425dc78e5d957aeaf5",
229 | "4a4bed0e773b3e19157eed5ce336ac51",
230 | "9351111d39ab1a79b372225db54e3909",
231 | "f1f9f2feb4d8b9b7d07f040a04405e65",
232 | "56196e287a786afcbad079fcc0e0e7df"
233 | ],
234 | "dicks": [
235 | "d22e1c4bab103ba4265ffbe771986488",
236 | "b107f742b89ea7e28af0785baa628698",
237 | "bcad427c01039c87463793329ca07b48",
238 | "f7ecfcec435daac5ac957a09f46c4fd8",
239 | "ad275e06bd0df907a93c36cfa9ba08ec",
240 | "8187fa12ee3f73daf56512a088d08241",
241 | "80f960ed2328243985cd1325d4b1e9a9",
242 | "36af91971824f2f9ed5e48a17135e50c",
243 | "e4af0712ed4e23f6daca7f53d1641820"
244 | ],
245 | "ewoks": [
246 | "6343aee9b9cd229a94b7e834e6752506",
247 | "06cfe46bd63fa5c956a69deb07b81836",
248 | "d4ec8ba505a0c4142bc42eed9c50d33f",
249 | "25b44b096eb168fbacc6b46f7a85b68d",
250 | "a79152a390fe887af15174d650911e37",
251 | "03e424f7aaa5d1994c2c5da5f909e073",
252 | "868a4d4794e9ade319b9cb11926b4644",
253 | "afe9df704708a6c82b36430985653629",
254 | "e521e08e42991353666babf6edd65b85"
255 | ],
256 | "cats": [
257 | "bc0dc9bd4b4835ea5218b81d8ffb4556",
258 | "6a9fe561069e02efa842c2e4952d249f",
259 | "9c4fa18ed0b0df89a23187b28a18d865",
260 | "6a802e3226cb6667feb6a34bc46994d5",
261 | "e26aece3fca2dabff508fa14b8a8e18c",
262 | "99fa2f5c635d6b56f12baa7e8c6f8ad8",
263 | "813680d89cf265f944193fde769cd35c",
264 | "f0729146d5115bc1ad6bfac09acf126d",
265 | "942b62ddcc871988645081746055a722"
266 | ]
267 | }
--------------------------------------------------------------------------------
/humans_not_invited_problem.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import re\n",
10 | "import time\n",
11 | "import json\n",
12 | "import asyncio\n",
13 | "import logging\n",
14 | "\n",
15 | "import requests\n",
16 | "import pandas as pd\n",
17 | "from PIL import Image\n",
18 | "import hashlib\n",
19 | "import aiohttp"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 2,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stdout",
29 | "output_type": "stream",
30 | "text": [
31 | "\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "async with aiohttp.ClientSession() as session:\n",
37 | " print(type(session))"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "logging.basicConfig(\n",
47 | " format=\"%(asctime)s ~ %(name)s ~ %(levelname)s ~ %(message)s\",\n",
48 | " datefmt=\"%Y-%m-%d %H:%M:%S\",\n",
49 | " level=logging.INFO,\n",
50 | ")\n",
51 | "\n",
52 | "logger = logging.getLogger(__name__)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "BASE_URL = \"http://www.humansnotinvited.com/\""
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "def find_all_image_urls(content: str):\n",
71 | " return re.findall('captcha/image.php\\\\?image_name=.*?&id=.', str(content))\n",
72 | "\n",
73 | "\n",
74 | "def find_tag(content: str):\n",
75 | " return re.findall('value=\"(.*?)\" name=\"category\"', str(content))[0]\n",
76 | "\n",
77 | "\n",
78 | "async def load_image(session, image_url):\n",
79 | " async with session.get(BASE_URL + image_url, allow_redirects=True) as response:\n",
80 | " data = await response.read()\n",
81 | " _hash = hashlib.md5(data).hexdigest()\n",
82 | " \n",
83 | " return _hash\n",
84 | "\n",
85 | "\n",
86 | "async def load_page(session):\n",
87 | " async with session.get(BASE_URL, allow_redirects=True) as response:\n",
88 | " data = await response.read()\n",
89 | " \n",
90 | " image_urls = find_all_image_urls(data)\n",
91 | " tasks = []\n",
92 | " for image_url in image_urls:\n",
93 | " tasks.append(asyncio.create_task(load_image(session, image_url)))\n",
94 | " \n",
95 | " tag = find_tag(data)\n",
96 | " return tag, await asyncio.gather(*tasks)\n",
97 | "\n",
98 | "\n",
99 | "def create_image_tag_df(result):\n",
100 | " count_image_tag = {}\n",
101 | " for tag, image_ids in result:\n",
102 | " tmp_tag = count_image_tag.get(tag, {})\n",
103 | " for image_id in image_ids:\n",
104 | " tmp_tag[image_id] = tmp_tag.get(image_id, 0) + 1\n",
105 | " count_image_tag[tag] = tmp_tag\n",
106 | " df = pd.DataFrame(count_image_tag)\n",
107 | " df.to_csv(\"test.csv\")\n",
108 | " df = df.fillna(0)\n",
109 | " return df\n",
110 | "\n",
111 | "\n",
112 | "def create_tags2images_dict(df):\n",
113 | " tags = list(df.columns)\n",
114 | " images = list(df.index)\n",
115 | " image2tag = list(df.values.argmax(axis=1, ))\n",
116 | " \n",
117 | " tags2images = {}\n",
118 | " for i_tag, tag in enumerate(tags):\n",
119 | " tags2images[tag] = []\n",
120 | " for image_id, image_tag in zip(images, image2tag):\n",
121 | " if image_tag == i_tag:\n",
122 | " tags2images[tag].append(image_id)\n",
123 | " \n",
124 | " return tags2images\n",
125 | "\n",
126 | "\n",
127 | "def save_json(data: dict, filename: str = \"tags2images.json\") -> None:\n",
128 | " with open(filename, \"w\") as file:\n",
129 | " json.dump(data, file, indent=4)\n",
130 | " \n",
131 | "\n",
132 | "def load_json(filename: str = \"tags2images.json\") -> dict:\n",
133 | " with open(filename, \"r\") as file:\n",
134 | " return json.load(file)\n",
135 | " \n",
136 | " \n",
137 | "async def collect_dataset(n_iterations: int, n_parallel_tasks: int) -> list:\n",
138 | " result = []\n",
139 | " \n",
140 | " time_full_start = time.time()\n",
141 | " i_iteration = 1\n",
142 | " while i_iteration <= n_iterations:\n",
143 | " try:\n",
144 | " time_current_start = time.time()\n",
145 | "\n",
146 | " async with aiohttp.ClientSession() as session:\n",
147 | " tasks = [load_page(session) for _ in range(n_parallel_tasks)] \n",
148 | "\n",
149 | " result.extend(await asyncio.gather(*tasks))\n",
150 | "\n",
151 | " current_time = time.time() - time_current_start\n",
152 | " total_time = time.time() - time_full_start\n",
153 | " logger.info(\n",
154 | " \"iteration: {}/{}; batch time: {:.2f}s; total time: {:.2f}s\".format(\n",
155 | " i_iteration, n_iterations, current_time, total_time\n",
156 | " )\n",
157 | " )\n",
158 | "\n",
159 | " i_iteration += 1\n",
160 | " \n",
161 | " except aiohttp.ServerDisconnectedError as error:\n",
162 | " logger.warning(str(error))\n",
163 | " time.sleep(2)\n",
164 | " \n",
165 | " return result"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 5,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "async def run_collection_process(n_iterations: int, n_parallel_tasks: int) -> None:\n",
175 | " result = await collect_dataset(n_iterations, n_parallel_tasks)\n",
176 | " df = create_image_tag_df(result)\n",
177 | " tags2images = create_tags2images_dict(df)\n",
178 | " save_json(tags2images)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 6,
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stderr",
188 | "output_type": "stream",
189 | "text": [
190 | "2021-08-08 15:36:48 ~ __main__ ~ INFO ~ iteration: 1/100; batch time: 4.50s; total time: 4.50s\n",
191 | "2021-08-08 15:36:54 ~ __main__ ~ INFO ~ iteration: 2/100; batch time: 6.06s; total time: 10.57s\n",
192 | "2021-08-08 15:37:00 ~ __main__ ~ INFO ~ iteration: 3/100; batch time: 5.95s; total time: 16.52s\n",
193 | "2021-08-08 15:37:07 ~ __main__ ~ INFO ~ iteration: 4/100; batch time: 7.08s; total time: 23.60s\n",
194 | "2021-08-08 15:37:13 ~ __main__ ~ INFO ~ iteration: 5/100; batch time: 6.07s; total time: 29.67s\n",
195 | "2021-08-08 15:37:17 ~ __main__ ~ INFO ~ iteration: 6/100; batch time: 4.04s; total time: 33.72s\n",
196 | "2021-08-08 15:37:23 ~ __main__ ~ INFO ~ iteration: 7/100; batch time: 6.07s; total time: 39.79s\n",
197 | "2021-08-08 15:37:28 ~ __main__ ~ INFO ~ iteration: 8/100; batch time: 4.97s; total time: 44.75s\n",
198 | "2021-08-08 15:37:33 ~ __main__ ~ INFO ~ iteration: 9/100; batch time: 4.95s; total time: 49.71s\n",
199 | "2021-08-08 15:37:38 ~ __main__ ~ INFO ~ iteration: 10/100; batch time: 5.03s; total time: 54.74s\n",
200 | "2021-08-08 15:37:43 ~ __main__ ~ INFO ~ iteration: 11/100; batch time: 5.10s; total time: 59.85s\n",
201 | "2021-08-08 15:37:47 ~ __main__ ~ INFO ~ iteration: 12/100; batch time: 3.99s; total time: 63.84s\n",
202 | "2021-08-08 15:37:51 ~ __main__ ~ INFO ~ iteration: 13/100; batch time: 4.04s; total time: 67.88s\n",
203 | "2021-08-08 15:37:56 ~ __main__ ~ INFO ~ iteration: 14/100; batch time: 5.03s; total time: 72.90s\n",
204 | "2021-08-08 15:38:00 ~ __main__ ~ INFO ~ iteration: 15/100; batch time: 4.04s; total time: 76.94s\n",
205 | "2021-08-08 15:38:05 ~ __main__ ~ INFO ~ iteration: 16/100; batch time: 5.08s; total time: 82.02s\n",
206 | "2021-08-08 15:38:10 ~ __main__ ~ INFO ~ iteration: 17/100; batch time: 4.99s; total time: 87.01s\n",
207 | "2021-08-08 15:38:15 ~ __main__ ~ INFO ~ iteration: 18/100; batch time: 4.98s; total time: 92.00s\n",
208 | "2021-08-08 15:38:19 ~ __main__ ~ INFO ~ iteration: 19/100; batch time: 3.97s; total time: 95.97s\n",
209 | "2021-08-08 15:38:25 ~ __main__ ~ INFO ~ iteration: 20/100; batch time: 5.98s; total time: 101.95s\n",
210 | "2021-08-08 15:38:29 ~ __main__ ~ INFO ~ iteration: 21/100; batch time: 4.04s; total time: 105.99s\n",
211 | "2021-08-08 15:38:34 ~ __main__ ~ INFO ~ iteration: 22/100; batch time: 5.09s; total time: 111.09s\n",
212 | "2021-08-08 15:38:38 ~ __main__ ~ INFO ~ iteration: 23/100; batch time: 3.99s; total time: 115.08s\n",
213 | "2021-08-08 15:38:43 ~ __main__ ~ INFO ~ iteration: 24/100; batch time: 4.99s; total time: 120.07s\n",
214 | "2021-08-08 15:38:48 ~ __main__ ~ INFO ~ iteration: 25/100; batch time: 5.01s; total time: 125.08s\n",
215 | "2021-08-08 15:38:53 ~ __main__ ~ INFO ~ iteration: 26/100; batch time: 5.09s; total time: 130.18s\n",
216 | "2021-08-08 15:38:57 ~ __main__ ~ INFO ~ iteration: 27/100; batch time: 4.05s; total time: 134.23s\n",
217 | "2021-08-08 15:39:02 ~ __main__ ~ INFO ~ iteration: 28/100; batch time: 5.04s; total time: 139.28s\n",
218 | "2021-08-08 15:39:08 ~ __main__ ~ INFO ~ iteration: 29/100; batch time: 6.03s; total time: 145.31s\n",
219 | "2021-08-08 15:39:14 ~ __main__ ~ INFO ~ iteration: 30/100; batch time: 5.02s; total time: 150.33s\n",
220 | "2021-08-08 15:39:20 ~ __main__ ~ INFO ~ iteration: 31/100; batch time: 6.07s; total time: 156.40s\n",
221 | "2021-08-08 15:39:26 ~ __main__ ~ INFO ~ iteration: 32/100; batch time: 6.08s; total time: 162.47s\n",
222 | "2021-08-08 15:39:32 ~ __main__ ~ INFO ~ iteration: 33/100; batch time: 6.06s; total time: 168.54s\n",
223 | "2021-08-08 15:39:37 ~ __main__ ~ INFO ~ iteration: 34/100; batch time: 5.06s; total time: 173.61s\n",
224 | "2021-08-08 15:39:46 ~ __main__ ~ INFO ~ iteration: 35/100; batch time: 9.00s; total time: 182.61s\n",
225 | "2021-08-08 15:39:53 ~ __main__ ~ INFO ~ iteration: 36/100; batch time: 7.00s; total time: 189.61s\n",
226 | "2021-08-08 15:39:57 ~ __main__ ~ INFO ~ iteration: 37/100; batch time: 4.06s; total time: 193.68s\n",
227 | "2021-08-08 15:40:02 ~ __main__ ~ INFO ~ iteration: 38/100; batch time: 5.01s; total time: 198.69s\n",
228 | "2021-08-08 15:40:09 ~ __main__ ~ INFO ~ iteration: 39/100; batch time: 7.03s; total time: 205.72s\n",
229 | "2021-08-08 15:40:15 ~ __main__ ~ INFO ~ iteration: 40/100; batch time: 6.08s; total time: 211.80s\n",
230 | "2021-08-08 15:40:22 ~ __main__ ~ INFO ~ iteration: 41/100; batch time: 7.29s; total time: 219.09s\n",
231 | "2021-08-08 15:40:29 ~ __main__ ~ INFO ~ iteration: 42/100; batch time: 7.04s; total time: 226.13s\n",
232 | "2021-08-08 15:40:34 ~ __main__ ~ INFO ~ iteration: 43/100; batch time: 5.07s; total time: 231.20s\n",
233 | "2021-08-08 15:40:40 ~ __main__ ~ INFO ~ iteration: 44/100; batch time: 6.07s; total time: 237.27s\n",
234 | "2021-08-08 15:40:45 ~ __main__ ~ INFO ~ iteration: 45/100; batch time: 4.04s; total time: 241.32s\n",
235 | "2021-08-08 15:40:50 ~ __main__ ~ INFO ~ iteration: 46/100; batch time: 5.02s; total time: 246.34s\n",
236 | "2021-08-08 15:40:55 ~ __main__ ~ INFO ~ iteration: 47/100; batch time: 5.04s; total time: 251.38s\n",
237 | "2021-08-08 15:41:02 ~ __main__ ~ INFO ~ iteration: 48/100; batch time: 7.00s; total time: 258.38s\n",
238 | "2021-08-08 15:41:08 ~ __main__ ~ INFO ~ iteration: 49/100; batch time: 6.03s; total time: 264.41s\n",
239 | "2021-08-08 15:41:13 ~ __main__ ~ INFO ~ iteration: 50/100; batch time: 5.13s; total time: 269.55s\n",
240 | "2021-08-08 15:41:19 ~ __main__ ~ INFO ~ iteration: 51/100; batch time: 6.01s; total time: 275.56s\n",
241 | "2021-08-08 15:41:24 ~ __main__ ~ INFO ~ iteration: 52/100; batch time: 5.12s; total time: 280.68s\n",
242 | "2021-08-08 15:41:30 ~ __main__ ~ INFO ~ iteration: 53/100; batch time: 6.05s; total time: 286.73s\n",
243 | "2021-08-08 15:41:37 ~ __main__ ~ INFO ~ iteration: 54/100; batch time: 7.14s; total time: 293.87s\n",
244 | "2021-08-08 15:41:43 ~ __main__ ~ INFO ~ iteration: 55/100; batch time: 6.06s; total time: 299.93s\n",
245 | "2021-08-08 15:41:50 ~ __main__ ~ INFO ~ iteration: 56/100; batch time: 6.96s; total time: 306.90s\n",
246 | "2021-08-08 15:41:56 ~ __main__ ~ INFO ~ iteration: 57/100; batch time: 6.06s; total time: 312.96s\n",
247 | "2021-08-08 15:42:01 ~ __main__ ~ INFO ~ iteration: 58/100; batch time: 5.03s; total time: 317.99s\n",
248 | "2021-08-08 15:42:05 ~ __main__ ~ INFO ~ iteration: 59/100; batch time: 4.13s; total time: 322.12s\n",
249 | "2021-08-08 15:42:09 ~ __main__ ~ INFO ~ iteration: 60/100; batch time: 4.08s; total time: 326.21s\n",
250 | "2021-08-08 15:42:14 ~ __main__ ~ INFO ~ iteration: 61/100; batch time: 4.14s; total time: 330.34s\n",
251 | "2021-08-08 15:42:19 ~ __main__ ~ INFO ~ iteration: 62/100; batch time: 5.08s; total time: 335.43s\n",
252 | "2021-08-08 15:42:25 ~ __main__ ~ INFO ~ iteration: 63/100; batch time: 6.03s; total time: 341.46s\n",
253 | "2021-08-08 15:42:30 ~ __main__ ~ INFO ~ iteration: 64/100; batch time: 5.06s; total time: 346.52s\n",
254 | "2021-08-08 15:42:34 ~ __main__ ~ INFO ~ iteration: 65/100; batch time: 4.03s; total time: 350.55s\n",
255 | "2021-08-08 15:42:38 ~ __main__ ~ INFO ~ iteration: 66/100; batch time: 4.06s; total time: 354.61s\n",
256 | "2021-08-08 15:42:42 ~ __main__ ~ INFO ~ iteration: 67/100; batch time: 4.08s; total time: 358.68s\n",
257 | "2021-08-08 15:42:47 ~ __main__ ~ INFO ~ iteration: 68/100; batch time: 5.08s; total time: 363.77s\n",
258 | "2021-08-08 15:42:52 ~ __main__ ~ INFO ~ iteration: 69/100; batch time: 5.09s; total time: 368.86s\n",
259 | "2021-08-08 15:42:57 ~ __main__ ~ INFO ~ iteration: 70/100; batch time: 5.01s; total time: 373.87s\n",
260 | "2021-08-08 15:43:03 ~ __main__ ~ INFO ~ iteration: 71/100; batch time: 6.06s; total time: 379.93s\n",
261 | "2021-08-08 15:43:09 ~ __main__ ~ INFO ~ iteration: 72/100; batch time: 6.04s; total time: 385.97s\n",
262 | "2021-08-08 15:43:14 ~ __main__ ~ INFO ~ iteration: 73/100; batch time: 5.03s; total time: 391.01s\n",
263 | "2021-08-08 15:43:19 ~ __main__ ~ INFO ~ iteration: 74/100; batch time: 4.95s; total time: 395.96s\n",
264 | "2021-08-08 15:43:24 ~ __main__ ~ INFO ~ iteration: 75/100; batch time: 5.08s; total time: 401.04s\n",
265 | "2021-08-08 15:43:29 ~ __main__ ~ INFO ~ iteration: 76/100; batch time: 5.01s; total time: 406.05s\n",
266 | "2021-08-08 15:43:35 ~ __main__ ~ INFO ~ iteration: 77/100; batch time: 5.99s; total time: 412.05s\n",
267 | "2021-08-08 15:43:41 ~ __main__ ~ INFO ~ iteration: 78/100; batch time: 6.05s; total time: 418.10s\n",
268 | "2021-08-08 15:43:50 ~ __main__ ~ INFO ~ iteration: 79/100; batch time: 8.22s; total time: 426.32s\n",
269 | "2021-08-08 15:43:57 ~ __main__ ~ INFO ~ iteration: 80/100; batch time: 7.18s; total time: 433.50s\n",
270 | "2021-08-08 15:44:02 ~ __main__ ~ INFO ~ iteration: 81/100; batch time: 5.07s; total time: 438.58s\n",
271 | "2021-08-08 15:44:09 ~ __main__ ~ INFO ~ iteration: 82/100; batch time: 7.13s; total time: 445.71s\n",
272 | "2021-08-08 15:44:14 ~ __main__ ~ INFO ~ iteration: 83/100; batch time: 5.03s; total time: 450.74s\n",
273 | "2021-08-08 15:44:19 ~ __main__ ~ INFO ~ iteration: 84/100; batch time: 5.08s; total time: 455.83s\n"
274 | ]
275 | },
276 | {
277 | "name": "stderr",
278 | "output_type": "stream",
279 | "text": [
280 | "2021-08-08 15:44:24 ~ __main__ ~ INFO ~ iteration: 85/100; batch time: 5.03s; total time: 460.86s\n",
281 | "2021-08-08 15:44:30 ~ __main__ ~ INFO ~ iteration: 86/100; batch time: 6.06s; total time: 466.92s\n",
282 | "2021-08-08 15:44:37 ~ __main__ ~ INFO ~ iteration: 87/100; batch time: 7.06s; total time: 473.98s\n",
283 | "2021-08-08 15:44:43 ~ __main__ ~ INFO ~ iteration: 88/100; batch time: 6.03s; total time: 480.01s\n",
284 | "2021-08-08 15:44:49 ~ __main__ ~ INFO ~ iteration: 89/100; batch time: 6.04s; total time: 486.05s\n",
285 | "2021-08-08 15:44:54 ~ __main__ ~ INFO ~ iteration: 90/100; batch time: 5.11s; total time: 491.16s\n",
286 | "2021-08-08 15:45:00 ~ __main__ ~ INFO ~ iteration: 91/100; batch time: 6.01s; total time: 497.17s\n",
287 | "2021-08-08 15:45:06 ~ __main__ ~ INFO ~ iteration: 92/100; batch time: 5.15s; total time: 502.32s\n",
288 | "2021-08-08 15:45:11 ~ __main__ ~ INFO ~ iteration: 93/100; batch time: 5.03s; total time: 507.35s\n",
289 | "2021-08-08 15:45:17 ~ __main__ ~ INFO ~ iteration: 94/100; batch time: 6.01s; total time: 513.36s\n",
290 | "2021-08-08 15:45:23 ~ __main__ ~ INFO ~ iteration: 95/100; batch time: 6.07s; total time: 519.44s\n",
291 | "2021-08-08 15:45:28 ~ __main__ ~ INFO ~ iteration: 96/100; batch time: 5.04s; total time: 524.48s\n",
292 | "2021-08-08 15:45:32 ~ __main__ ~ INFO ~ iteration: 97/100; batch time: 3.98s; total time: 528.47s\n",
293 | "2021-08-08 15:45:37 ~ __main__ ~ INFO ~ iteration: 98/100; batch time: 5.01s; total time: 533.48s\n",
294 | "2021-08-08 15:45:41 ~ __main__ ~ INFO ~ iteration: 99/100; batch time: 4.03s; total time: 537.52s\n",
295 | "2021-08-08 15:45:45 ~ __main__ ~ INFO ~ iteration: 100/100; batch time: 4.00s; total time: 541.52s\n"
296 | ]
297 | }
298 | ],
299 | "source": [
300 | "N_ITERATIONS = 100\n",
301 | "N_PARALLEL_TASKS = 10\n",
302 | "\n",
303 | "await run_collection_process(N_ITERATIONS, N_PARALLEL_TASKS)"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": []
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": []
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 26,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "def find_all_image_urls(content: str):\n",
327 | " return re.findall('captcha/image.php\\\\?image_name=.*?&id=.', str(content))\n",
328 | "\n",
329 | "\n",
330 | "async def get_correct_images_by_content(content):\n",
331 | " tags2images = load_json()\n",
332 | " hashes = []\n",
333 | " new_images = find_all_image_urls(content)\n",
334 | " new_tag = find_tag(content)\n",
335 | " \n",
336 | " async with aiohttp.ClientSession() as session:\n",
337 | " for img in new_images:\n",
338 | " hashes.append(await load_image(session, img))\n",
339 | " \n",
340 | " print(\"GRID:\\n| 1 | 2 | 3 |\\n| 4 | 5 | 6 |\\n| 7 | 8 | 9 |\")\n",
341 | " print(\"SELECT NEXT IMAGES:\")\n",
342 | " for ind, _hash in enumerate(hashes, 1):\n",
343 | " if _hash in tags2images[new_tag]:\n",
344 | " print(ind)"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": []
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": []
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 29,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "content = \"\"\"\n",
368 | "\n",
369 | "\n",
370 | " \n",
374 | "\n",
375 | "
\n",
377 | "
\n",
378 | "\"\"\".replace(\"&amp;\", \"&\")"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 30,
384 | "metadata": {},
385 | "outputs": [
386 | {
387 | "name": "stdout",
388 | "output_type": "stream",
389 | "text": [
390 | "GRID:\n",
391 | "| 1 | 2 | 3 |\n",
392 | "| 4 | 5 | 6 |\n",
393 | "| 7 | 8 | 9 |\n",
394 | "SELECT NEXT IMAGES:\n",
395 | "3\n",
396 | "4\n",
397 | "5\n",
398 | "6\n",
399 | "7\n",
400 | "9\n"
401 | ]
402 | }
403 | ],
404 | "source": [
405 | "await get_correct_images_by_content(content)"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": []
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": null,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": []
421 | }
422 | ],
423 | "metadata": {
424 | "kernelspec": {
425 | "display_name": "Python 3 (ipykernel)",
426 | "language": "python",
427 | "name": "python3"
428 | },
429 | "language_info": {
430 | "codemirror_mode": {
431 | "name": "ipython",
432 | "version": 3
433 | },
434 | "file_extension": ".py",
435 | "mimetype": "text/x-python",
436 | "name": "python",
437 | "nbconvert_exporter": "python",
438 | "pygments_lexer": "ipython3",
439 | "version": "3.8.9"
440 | }
441 | },
442 | "nbformat": 4,
443 | "nbformat_minor": 2
444 | }
445 |
--------------------------------------------------------------------------------