├── .gitattributes
├── helpers
    ├── crontab2
    ├── org_files_by_date.py
    └── parquet.ipynb
├── alibaba_workers
    ├── alibaba_tokens
    └── alibaba-upgrade-SAS-image.ipynb
├── audio
    ├── demo_audio.parquet
    └── audio_resample.ipynb
├── postCLIP_staging
    ├── bloom.sh
    ├── postCLIP.md
    ├── crontab
    ├── archive.sh
    ├── dashboard.py
    ├── movefiles.py
    ├── rsyncd.conf
    └── bloom.py
├── preCLIP_staging
    ├── cleanup.sh
    ├── capacity.py
    ├── rsyncd.conf
    ├── cleanup3db.py
    └── cleanup.py
├── gpu-requirements.txt
├── worker-requirements.txt
├── postgres
    ├── fstab
    ├── cold_storage.py
    ├── jobstables.sql
    ├── triggers.sql
    ├── fix_bad_csv.py
    ├── dedup_csv.py
    ├── tables.sql
    ├── stage_db.py
    └── dump_db.ipynb
├── worker-setup.sh
├── cleanup.py
├── LICENSE
├── gpu-setup.sh
├── bloom_server
    ├── bloom_dash.py
    ├── parquet2bloom.py
    └── bloomexport.py
├── notebooks
    └── query-bloom.ipynb
├── .gitignore
├── cloud boot
    ├── boot.sh
    └── cloud-init.yaml
├── README.md
├── docs
    ├── architecture_white.drawio
    ├── 3stage_architecture_white.drawio
    └── architecture.drawio
├── infrastructure.py
├── dbdl.py
└── ccpp.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | duplicates*.txt filter=lfs diff=lfs merge=lfs -text
2 | 
--------------------------------------------------------------------------------
/helpers/crontab2:
--------------------------------------------------------------------------------
1 | 0 */1 * * * flock -n clean.lock /home/archiveteam/cleanup.sh
--------------------------------------------------------------------------------
/alibaba_workers/alibaba_tokens:
--------------------------------------------------------------------------------
1 | [tokens]
2 | id=<>
3 | secret=<>
--------------------------------------------------------------------------------
/audio/demo_audio.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rvencu/crawlingathome-gpu-hcloud/HEAD/audio/demo_audio.parquet
--------------------------------------------------------------------------------
/postCLIP_staging/bloom.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use in cron to update bloom filter
3 | 
4 | python3 bloom.py >> bloom.log
5 | 
--------------------------------------------------------------------------------
/preCLIP_staging/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use in cron to clean up completed gpu jobs
3 | 
4 | python3 cleanup.py >> cleanup.log
--------------------------------------------------------------------------------
/gpu-requirements.txt:
--------------------------------------------------------------------------------
1 | ftfy
2 | regex
3 | trio
4 | ujson
5 | colorama
6 | dashing
7 | psycopg2
8 | requests
9 | pandas
10 | sqlalchemy
11 | sentence_transformers
12 | sentry_sdk
--------------------------------------------------------------------------------
/worker-requirements.txt:
--------------------------------------------------------------------------------
1 | ftfy
2 | pandas
3 | gcld3
4 | trio
5 | ujson
6 | asks
7 | bloom-filter2
8 | pillow
9 | glances
10 | sqlalchemy
11 | psycopg2
12 | django
13 | tqdm
14 | 
--------------------------------------------------------------------------------
/postCLIP_staging/postCLIP.md:
--------------------------------------------------------------------------------
1 | This server will store the results of CLIP inference. As files arrive, it arranges them into date- and time-based subfolders and updates the dataset bloom filters to prevent further duplication.
2 | 
3 | The bloom log also exposes a dashboard with velocity information (24h, 7d and 1 month).
--------------------------------------------------------------------------------
/postCLIP_staging/crontab:
--------------------------------------------------------------------------------
1 | */1 * * * * flock -n archive.lock /home/archiveteam/archive.sh
2 | */1 * * * * flock -n bloom.lock /home/archiveteam/bloom.sh
3 | 0 0 */1 * * flock -n clpmove.lock python3 movefiles.py -e clp -d /home/archiveteam/CAH/clipped
4 | 10 0 */1 * * flock -n hshmove.lock python3 movefiles.py -e hsh -d /home/archiveteam/CAH/hashes
5 | @reboot python3 /home/archiveteam/dashboard.py &
--------------------------------------------------------------------------------
/postgres/fstab:
--------------------------------------------------------------------------------
1 | /dev/disk/by-uuid/9986-9974 /boot/efi vfat defaults 0 1
2 | /dev/md1 /mnt/md1 ext4 defaults 0 0
3 | /swap.img none swap sw 0 0
4 | tmpfs /mnt/ramdisk tmpfs rw,size=10G 0 0
5 | none /mnt/huge hugetlbfs pagesize=1G,size=210G 0 0
6 | //smb/share /mnt/smb cifs vers=3.11,uid=postgres,username=<>,password=<>,iocharset=utf8 0 0
7 | 
--------------------------------------------------------------------------------
/worker-setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use this if you manually install download worker on your box
3 | 
4 | echo "insert your nickname for the leaderboard or press Enter for anonymous..."
5 | read nickname
6 | export CAH_NICKNAME=$nickname
7 | 
8 | git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client
9 | pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir
10 | pip3 install -r worker-requirements.txt --no-cache-dir
11 | pip install random_user_agent
12 | 
13 | 
--------------------------------------------------------------------------------
/postCLIP_staging/archive.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use in cron to move resulting files to the eye egress location (on the staging server)
3 | 
4 | CURRENTDATE=`date +"%Y%m%d"`
5 | CURRENTTIME=`date +"%H"`
6 | mkdir --parents /home/archiveteam/CAH/ds/${CURRENTDATE}/${CURRENTTIME}/
7 | 
8 | find /home/archiveteam/CAH/results/*.tfrecord -mmin +1 -type f -exec rm "{}" \;
9 | find /home/archiveteam/CAH/results/*.hsh -mmin +1 -type f -exec mv "{}" /home/archiveteam/CAH/hashes/ \;
10 | find /home/archiveteam/CAH/results/*.clp -mmin +1 -type f -exec mv "{}" /home/archiveteam/CAH/clipped/ \;
11 | find /home/archiveteam/CAH/results/ -mmin +5 -type f -exec mv "{}" /home/archiveteam/CAH/ds/${CURRENTDATE}/${CURRENTTIME}/ \;
12 | 
--------------------------------------------------------------------------------
/preCLIP_staging/capacity.py:
--------------------------------------------------------------------------------
1 | # use in staging server to return disk capacity level
2 | import shutil
3 | import json
4 | import os
5 | 
6 | from aioserver import Application
7 | app = Application()
8 | 
9 | # Path
10 | path = "/home/archiveteam/CAH/gpujobs"
11 | # Get the disk usage statistics
12 | # about the given path
13 | @app.get('/disk')
14 | async def index(request):
15 |     stat = str(shutil.disk_usage(path))
16 |     stat = stat.split("(")[1].split(")")[0]
17 |     stat = '{"' + stat.replace('=','":').replace(', 
',', "') + '}' 18 | json_object = json.loads(stat) 19 | json_object["utilization"] = round(json_object["used"]/json_object["total"], 2) 20 | json_object["jobscount"] = len([name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))]) 21 | 22 | return 200, {'Content-Type': 'application/json; charset=utf-8'}, json.dumps(json_object) 23 | 24 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /cleanup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | 5 | # initial cleanup - delete all working files in case of crash recovery 6 | reg_compile = re.compile(r"^\d{1,3}-\d{1,3}-\d{1,3}-\d{1,3}$") 7 | for root, dirnames, filenames in os.walk("."): 8 | for filename in filenames: 9 | if filename.startswith("gpujob.zip_"): 10 | os.remove(filename) 11 | for dir in dirnames: 12 | if reg_compile.match(dir): 13 | shutil.rmtree(dir) 14 | re_uuid = re.compile(r'[0-9a-f]{32}', re.I) 15 | for root, dirnames, filenames in os.walk("."): 16 | for dir in dirnames: 17 | if re_uuid.match(dir): 18 | shutil.rmtree(dir) 19 | re_gz = re.compile(r'.*.tar.gz.*', re.I) 20 | for root, dirnames, filenames in os.walk("."): 21 | for file in filenames: 22 | if re_gz.match(file): 23 | os.remove(file) 24 | 25 | for i in range(24): 26 | os.system(f"rm -rf ./{i}") 27 | 28 | os.system(f"rm -rf ./save") 29 | os.system(f"rm -rf ./stats") 30 | os.system(f"rm ./shard.wat") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Richard Vencu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /gpu-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "insert your nickname for the leaderboard or press Enter for anonymous..." 
4 | read nickname 5 | export CAH_NICKNAME=$nickname 6 | 7 | sudo apt-get update 8 | sudo apt-get install -y git build-essential python3-dev python3-pip libjpeg-dev zip libwebp-dev 9 | 10 | git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client 11 | #pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 12 | pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html 13 | #pip3 install git+https://github.com/rvencu/asks 14 | pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir 15 | pip3 install -r gpu-requirements.txt --no-cache-dir 16 | #pip install tensorflow==2.5 --no-cache-dir 17 | pip install clip-anytorch 18 | 19 | git clone "https://github.com/hetznercloud/hcloud-python" hcloud 20 | pip3 install -e ./hcloud 21 | 22 | pip install parallel-ssh 23 | 24 | yes | ssh-keygen -t rsa -b 4096 -f $HOME/.ssh/id_cah -q -P "" 25 | 26 | yes | pip uninstall pillow 27 | CC="cc -mavx2" pip install -U --force-reinstall pillow-simd 28 | -------------------------------------------------------------------------------- /postgres/cold_storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from multiprocessing import Process, Queue 4 | 5 | def worker(queue): 6 | while not queue.empty(): 7 | cmd = queue.get() 8 | print (cmd) 9 | os.system(cmd) 10 | return 11 | 12 | q = Queue() 13 | while True: 14 | # Calculate which files are currently open (i.e. the ones currently being written to) 15 | # and avoid uploading it. This is to ensure that when we process files on the server, they 16 | # are complete. 17 | i = 0 18 | for root, dirs, files in os.walk("/mnt/md1/export/", topdown = False): 19 | for file in files: 20 | fullpath = os.path.join(root,file) 21 | if file.endswith(".gz") and os.path.getmtime(fullpath) < time.time() - 60*60: 22 | dest = str(fullpath).replace("md1","smb") 23 | q.put(f"mv {fullpath} {dest}") 24 | i += 1 25 | if i % 1000 == 0: 26 | break 27 | 28 | procs = [] 29 | for i in range(16): 30 | p = Process(target=worker, args=[q], daemon=False) 31 | procs.append(p) 32 | p.start() 33 | 34 | for proc in procs: 35 | proc.join() 36 | 37 | print("Finish") 38 | time.sleep(10) 39 | -------------------------------------------------------------------------------- /helpers/org_files_by_date.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | 5 | 6 | # Change the directory and jump to the location 7 | # where you want to arrange the files 8 | os.chdir(r"/mnt/md1/export/rsync") 9 | 10 | files = os.listdir('.') 11 | # files in the current directory 12 | i = 0 13 | for file in files: 14 | if os.path.isfile(file) and os.path.getmtime(file) < time.time() - 60*60 and file.endswith("gz"): 15 | # Get all the details of the file creation 16 | # and modification 17 | time_format = time.gmtime(os.path.getmtime(file)) 18 | 19 | # Give the name of the folder 20 | dir_name = str(time_format.tm_year) + "-" + \ 21 | str(time_format.tm_mon) + '-' + \ 22 | str(time_format.tm_mday) 23 | 24 | # Check if the folder exists or not 25 | if not os.path.isdir(dir_name): 26 | 27 | # If not then make the new folder 28 | os.mkdir(dir_name) 29 | dest = dir_name 30 | 31 | # Move all the files to their respective folders 32 | try: 33 | shutil.move(file, dest) 34 | files.remove(file) 35 | i 
+= 1 36 | if i%1000 == 0: 37 | print ("+1000 files") 38 | except: 39 | continue 40 | 41 | print("successfully moved...") -------------------------------------------------------------------------------- /postCLIP_staging/dashboard.py: -------------------------------------------------------------------------------- 1 | # use in staging server to create a small web dashboard with bloom filters stats 2 | 3 | from aioserver import Application 4 | app = Application() 5 | 6 | @app.get('/') 7 | async def index(request): 8 | reply = "" 9 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 10 | reply = file.read() 11 | return 200, {'Content-Type': 'text/html; charset=utf-8'}, "" + reply + "" 12 | 13 | @app.get('/stats') 14 | async def index(request): 15 | uniques = [] 16 | total = [] 17 | clipped = [] 18 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 19 | lines = file.readlines() 20 | for line in lines: 21 | if line.startswith("M unique pairs"): 22 | chunks = line.split("
") 23 | uniques.append(chunks[0].split(" ")[-1]) 24 | total.append(chunks[1].split(" ")[-1]) 25 | clipped.append(chunks[2].split(" ")[-1]) 26 | 27 | reply = '{"total": {"uniques":' + uniques[0] + ',"pairs":' + total[0] + ',"clips":' + clipped[0] + '},' 28 | reply += '"day": {"uniques":' + uniques[1] + ',"pairs":' + total[1] + ',"clips":' + clipped[1] + '},' 29 | reply += '"week": {"uniques":' + uniques[2] + ',"pairs":' + total[2] + ',"clips":' + clipped[2] + "}}" 30 | 31 | return 200, {'Content-Type': 'application.json; charset=utf-8'}, reply 32 | 33 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /bloom_server/bloom_dash.py: -------------------------------------------------------------------------------- 1 | # use in staging server to create a small web dashboard with bloom filters stats 2 | 3 | from aioserver import Application 4 | app = Application() 5 | 6 | @app.get('/') 7 | async def index(request): 8 | reply = "" 9 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 10 | reply = file.read() 11 | return 200, {'Content-Type': 'text/html; charset=utf-8'}, "" + reply + "" 12 | 13 | @app.get('/stats') 14 | async def index(request): 15 | uniques = [] 16 | total = [] 17 | clipped = [] 18 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 19 | lines = file.readlines() 20 | for line in lines: 21 | if line.startswith("M unique pairs"): 22 | chunks = line.split("
") 23 | uniques.append(chunks[0].split(" ")[-1]) 24 | total.append(chunks[1].split(" ")[-1]) 25 | clipped.append(chunks[2].split(" ")[-1]) 26 | 27 | reply = '{"total": {"uniques":' + uniques[0] + ',"pairs":' + total[0] + ',"clips":' + clipped[0] + '},' 28 | reply += '"day": {"uniques":' + uniques[1] + ',"pairs":' + total[1] + ',"clips":' + clipped[1] + '},' 29 | reply += '"week": {"uniques":' + uniques[2] + ',"pairs":' + total[2] + ',"clips":' + clipped[2] + "}}" 30 | 31 | return 200, {'Content-Type': 'application.json; charset=utf-8'}, reply 32 | 33 | app.run(host='0.0.0.0', port=8080) 34 | -------------------------------------------------------------------------------- /postgres/jobstables.sql: -------------------------------------------------------------------------------- 1 | create table jobs_en 2 | ( 3 | jobid varchar(32) not null 4 | constraint jobs_pk 5 | primary key, 6 | status integer default 0 not null, 7 | modified timestamp 8 | ); 9 | 10 | alter table jobs_en 11 | owner to cah; 12 | 13 | create unique index jobs_jobid_uindex 14 | on jobs (jobid); 15 | 16 | create trigger update_job_modtime 17 | before update 18 | on jobs_en 19 | for each row 20 | execute procedure update_modified_column(); 21 | 22 | create table jobs_intl 23 | ( 24 | jobid varchar(32) not null 25 | constraint jobs_intl_pk 26 | primary key, 27 | status integer default 0 not null, 28 | modified timestamp 29 | ); 30 | 31 | alter table jobs_intl 32 | owner to cah; 33 | 34 | create unique index jobs_intl_jobid_uindex 35 | on jobs_intl (jobid); 36 | 37 | create trigger update_job_modtime 38 | before update 39 | on jobs_intl 40 | for each row 41 | execute procedure update_modified_column(); 42 | 43 | create table jobs_nolang 44 | ( 45 | jobid varchar(32) not null 46 | constraint jobs_nolang_pk 47 | primary key, 48 | status integer default 0 not null, 49 | modified timestamp 50 | ); 51 | 52 | alter table jobs_nolang 53 | owner to cah; 54 | 55 | create unique index jobs_nolang_jobid_uindex 56 | on jobs_nolang (jobid); 57 | 58 | create trigger update_job_modtime 59 | before update 60 | on jobs_nolang 61 | for each row 62 | execute procedure update_modified_column(); 63 | 64 | -------------------------------------------------------------------------------- /postgres/triggers.sql: -------------------------------------------------------------------------------- 1 | create user cah 2 | superuser 3 | createdb 4 | createrole; 5 | 6 | 7 | create function update_modified_column() returns trigger 8 | language plpgsql 9 | as 10 | $$ 11 | BEGIN 12 | NEW.modified = now(); 13 | RETURN NEW; 14 | END; 15 | $$; 16 | 17 | alter function update_modified_column() owner to cah; 18 | 19 | create function on_insert_in_original_table() returns trigger 20 | language plpgsql 21 | as 22 | $$ 23 | BEGIN 24 | BEGIN 25 | IF NEW.language = 'en' THEN 26 | INSERT INTO dataset_en (sampleid, url, text, license, domain, wat, hash, modified, language, width, height) 27 | VALUES (NEW.sampleid, NEW.url, NEW.text, NEW.license, NEW.domain, NEW.wat, NEW.hash, NEW.modified, NEW.language, 28 | NEW.width, NEW.height) 29 | ON CONFLICT DO NOTHING; 30 | ELSIF NEW.language = '' THEN 31 | INSERT INTO dataset_nolang (sampleid, url, text, license, domain, wat, hash, modified, language, width, height) 32 | VALUES (NEW.sampleid, NEW.url, NEW.text, NEW.license, NEW.domain, NEW.wat, NEW.hash, NEW.modified, NEW.language, 33 | NEW.width, NEW.height) 34 | ON CONFLICT DO NOTHING; 35 | ELSE 36 | INSERT INTO dataset_intl (sampleid, url, text, license, domain, wat, hash, modified, 
language, width, height) 37 | VALUES (NEW.sampleid, NEW.url, NEW.text, NEW.license, NEW.domain, NEW.wat, NEW.hash, NEW.modified, NEW.language, 38 | NEW.width, NEW.height) 39 | ON CONFLICT DO NOTHING; 40 | END IF; 41 | EXCEPTION 42 | WHEN OTHERS THEN 43 | NULL; 44 | END; 45 | RETURN NULL; 46 | END; 47 | $$; 48 | 49 | alter function on_insert_in_original_table() owner to cah; 50 | 51 | -------------------------------------------------------------------------------- /postgres/fix_bad_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from glob import glob 3 | from tqdm.auto import tqdm 4 | import numpy as np 5 | import os 6 | import re 7 | 8 | def wonky_parser(fn): 9 | txt = open(fn).read() 10 | # This is where I specified 8 tabs 11 | # V 12 | preparse = re.findall('(([^\t]*\t[^\t]*){7}(\n|\Z))', txt) 13 | parsed = [t[0].split('\t') for t in preparse] 14 | return pd.DataFrame(parsed) 15 | 16 | def is_num(x): 17 | try: 18 | x = int(float(x)) 19 | return True 20 | except: 21 | return False 22 | 23 | files = glob("*.bad") 24 | 25 | for file in tqdm(files): 26 | try: 27 | df = pd.read_csv(file, sep="\t", names=["s","url","a","b","c","d","e","f"], dtype={'s': 'int', 'd': 'int', 'url': 'str', 'a': 'str'}) 28 | df.drop_duplicates(subset="url", keep='first').reset_index(drop=True) 29 | df.s = df.s.astype(int) 30 | df.d = df.d.astype(int) 31 | df.to_csv(file+".fix", sep="\t", index=False, header=False) 32 | os.system(f"rm {file}") 33 | os.system(f"mv {file}.fix {file}") 34 | except: 35 | df = wonky_parser(file) 36 | df.columns=["s","url","a","b","c","d","e","f"] 37 | df = df[df.s.apply(lambda x: is_num(x))] 38 | df = df[df.d.apply(lambda x: is_num(x))] 39 | df.drop_duplicates(subset="url", keep='first').reset_index(drop=True) 40 | df.s = df.s.apply(lambda x: int(float(x))) 41 | df.d = df.d.apply(lambda x: int(float(x))) 42 | df["s"] = df["s"].astype(int) 43 | df["d"] = df["d"].astype(int) 44 | df.to_csv(file+".fix", sep="\t", index=False, header=False) 45 | os.system(f"rm {file}") 46 | os.system(f"mv {file}.fix {file}") 47 | 48 | 49 | -------------------------------------------------------------------------------- /postCLIP_staging/movefiles.py: -------------------------------------------------------------------------------- 1 | import os, datetime, errno, argparse, sys 2 | 3 | def create_file_list(CWD): 4 | """ takes string as path, returns tuple(files,date) """ 5 | 6 | files_with_mtime = [] 7 | for filename in [f for f in os.listdir(CWD) if os.path.splitext(f)[1] in ext and datetime.datetime.fromtimestamp(os.stat(os.path.join(CWD,f)).st_mtime) < datetime.datetime.now()-datetime.timedelta(days=1)]: 8 | files_with_mtime.append((filename,datetime.datetime.fromtimestamp(os.stat(os.path.join(CWD,filename)).st_mtime).strftime('%Y-%m-%d'))) 9 | return files_with_mtime 10 | 11 | def create_directories(files, CWD): 12 | """ takes tuple(file,date) from create_file_list() """ 13 | 14 | m = [] 15 | for i in files: 16 | m.append(i[1]) 17 | for i in set(m): 18 | try: 19 | os.makedirs(os.path.join(CWD,i)) 20 | except OSError as exception: 21 | if exception.errno != errno.EEXIST: 22 | raise 23 | 24 | def move_files_to_folders(files, CWD): 25 | """ gets tuple(file,date) from create_file_list() """ 26 | for i in files: 27 | try: 28 | os.rename(os.path.join(CWD,i[0]), os.path.join(CWD,(i[1] + '/' + i[0]))) 29 | except Exception as e: 30 | raise 31 | return len(files) 32 | 33 | 34 | if __name__ == '__main__': 35 | 36 | parser = 
argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s [options]') 37 | parser.add_argument("-e","--extension",action='append',help="File extensions to match",required=True) 38 | parser.add_argument("-d","--directory",action='append',help="Target directory",required=True) 39 | args = parser.parse_args() 40 | 41 | ext = ['.' + e for e in args.extension] 42 | print (f"Moving files with extensions:{ext}") 43 | print(args.directory) 44 | files = create_file_list(args.directory[0]) 45 | create_directories(files,args.directory[0]) 46 | print ("Moved %i files" % move_files_to_folders(files, args.directory[0])) -------------------------------------------------------------------------------- /notebooks/query-bloom.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "source": [ 7 | "import requests\n", 8 | "from pathlib import Path\n", 9 | "\n", 10 | "bloomip = \"116.202.162.146\"" 11 | ], 12 | "outputs": [], 13 | "metadata": {} 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "source": [ 19 | "file = \"/home/rvencu/gpuhcloud/crawlingathome-gpu-hcloud/notebooks/test.hsh\"\n", 20 | "stem = Path(file).stem.strip(\".\")\n", 21 | "post = {\n", 22 | " 'file': (stem, open(file, 'rb')),\n", 23 | " 'key': (None, 'clipped'),\n", 24 | "}\n", 25 | "response = requests.post(f'http://{bloomip}:8000/deduplicate/', files=post)\n", 26 | "if response.status_code == 200:\n", 27 | " print(response.content.decode(\"utf-8\"))" 28 | ], 29 | "outputs": [ 30 | { 31 | "output_type": "stream", 32 | "name": "stdout", 33 | "text": [ 34 | "d10e361a18d69ef07c08857fb1198d22\n", 35 | "136061a9e0426bc0a6274d70ab834e60\n", 36 | "563a4dd46eb8dddf7b85d35f236cb12b\n", 37 | "d10e361a18d69ef07c08857fb1198d23\n", 38 | "136061a9e0426bc0a6274d70ab834e61\n", 39 | "563a4dd46eb8dddf7b85d35f236cb12c\n", 40 | "\n" 41 | ] 42 | } 43 | ], 44 | "metadata": {} 45 | } 46 | ], 47 | "metadata": { 48 | "orig_nbformat": 4, 49 | "language_info": { 50 | "name": "python", 51 | "version": "3.8.8", 52 | "mimetype": "text/x-python", 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "pygments_lexer": "ipython3", 58 | "nbconvert_exporter": "python", 59 | "file_extension": ".py" 60 | }, 61 | "kernelspec": { 62 | "name": "python3", 63 | "display_name": "Python 3.8.8 64-bit ('gpuhcloud': conda)" 64 | }, 65 | "interpreter": { 66 | "hash": "bc322c11e8113b1b1dfcd753c5702c5c5d95a81c495f9a7060b170a2a7888bca" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 2 71 | } -------------------------------------------------------------------------------- /audio/audio_resample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 37, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "----------\n", 13 | "Source: ./_sample_data/e2b474d6c0094c3da22788e7875f7787.mp3\n", 14 | "----------\n", 15 | " - File size: 603742 bytes\n", 16 | " - AudioMetaData(sample_rate=22050, num_frames=1664202, num_channels=2, bits_per_sample=0, encoding=MP3)\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "import torchaudio\n", 22 | "import torchaudio.transforms as T\n", 23 | "import requests\n", 24 | "import uuid\n", 25 | "\n", 26 | "url = 
'https://deploy.laion.ai/0fed69941baaabaeccedc2aaaaaaaaaa/WeSoundEffects/Glitchedtones/Urban%20Traffic/urban%20traffic%2C%20bus%20journey%2C%20interior%2C%20newcastle%2C%20uk%20%282%29.wav'\n", 27 | "file = uuid.uuid4().hex\n", 28 | "with requests.get(url) as response:\n", 29 | " with open(file, 'wb') as f:\n", 30 | " f.write(response.content)\n", 31 | " waveform, sample_rate = torchaudio.load(file)\n", 32 | " resampler = T.Resample(sample_rate, 20050, dtype=waveform.dtype)\n", 33 | " resampled_waveform = resampler(waveform)\n", 34 | " path = f\"{file}.mp3\"\n", 35 | " torchaudio.save(path, resampled_waveform, 20050, format=\"mp3\")" 36 | ] 37 | } 38 | ], 39 | "metadata": { 40 | "interpreter": { 41 | "hash": "ee22a52db22349ad32e35f3b499efddea1c9229e771c5fd65652469b6b2f1979" 42 | }, 43 | "kernelspec": { 44 | "display_name": "Python 3.9.7 ('gpu')", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.9.7" 59 | }, 60 | "orig_nbformat": 4 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 2 64 | } 65 | -------------------------------------------------------------------------------- /bloom_server/parquet2bloom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import requests 5 | from glob import glob 6 | from random import randint 7 | from tqdm.auto import tqdm 8 | import os.path as path 9 | import time 10 | bloomip = "116.202.162.146" 11 | 12 | files = glob("**/*.parquet", recursive=True) 13 | with tqdm(total=len(files), file=sys.stdout) as pbar: 14 | 15 | for file in files: 16 | 17 | df = pd.read_parquet(file) 18 | 19 | n = 100000 #chunk row size 20 | list_df = [df[i:i+n] for i in range(0,df.shape[0],n)] 21 | 22 | for ldf in list_df: 23 | with open('hash.txt', 'w') as f: 24 | f.write(ldf['URL'].str.cat(sep='\n')) 25 | post = { 26 | 'file': ('hash.txt', open('hash.txt', 'rb')), 27 | 'key': (None, "dedup"), 28 | } 29 | os.remove('hash.txt') 30 | 31 | failure = True 32 | for _ in range(10): 33 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 34 | if response.status_code != 200: 35 | time.sleep(randint(5,30)) 36 | else: 37 | failure = False 38 | break 39 | if failure: 40 | print("could not add chunk") 41 | continue 42 | 43 | with open('hash.txt', 'w') as f: 44 | f.write(ldf['URL'].str.cat(sep='\n')) 45 | post = { 46 | 'file': ('hash.txt', open('hash.txt', 'rb')), 47 | 'key': (None, "nolang"), 48 | } 49 | os.remove('hash.txt') 50 | 51 | failure = True 52 | for _ in range(10): 53 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 54 | if response.status_code != 200: 55 | time.sleep(randint(5,30)) 56 | else: 57 | failure = False 58 | break 59 | if failure: 60 | print("could not add chunk") 61 | continue 62 | 63 | pbar.update(1) 64 | 65 | -------------------------------------------------------------------------------- /postCLIP_staging/rsyncd.conf: -------------------------------------------------------------------------------- 1 | # GLOBAL OPTIONS 2 | 3 | #motd file=/etc/motd 4 | #log file=/var/log/rsyncd 5 | # for pid file, do not use /var/run/rsync.pid if 6 | # you are going to run rsync out of the init.d script. 
7 | # The init.d script does its own pid file handling,
8 | # so omit the "pid file" line completely in that case.
9 | # pid file=/var/run/rsyncd.pid
10 | #syslog facility=daemon
11 | #socket options=
12 | 
13 | use chroot = yes
14 | lock file = /var/lock/rsyncd
15 | uid = archiveteam
16 | gid = archiveteam
17 | log file = /mnt/rs.log
18 | # exclude =
19 | # exclude from =
20 | # include =
21 | # include from =
22 | # auth users =
23 | # secrets file = /etc/rsyncd.secrets
24 | strict modes = yes
25 | # hosts allow =
26 | # hosts deny =
27 | ignore errors = no
28 | ignore nonreadable = yes
29 | transfer logging = no
30 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes.
31 | timeout = 600
32 | refuse options = checksum dry-run
33 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz
34 | 
35 | # MODULE OPTIONS
36 | 
37 | [CAH]
38 | comment = CAH dataset archive
39 | path = /home/archiveteam/CAH/results/
40 | # max connections=10
41 | # the default for read only is yes...
42 | read only = no
43 | write only = yes
44 | list = yes
45 | # exclude =
46 | # exclude from =
47 | # include =
48 | # include from =
49 | # auth users =
50 | # secrets file = /etc/rsyncd.secrets
51 | # hosts allow =
52 | # hosts deny =
53 | 
54 | [bloom]
55 | comment = update filters
56 | path = /home/archiveteam/CAH/bloom/
57 | # max connections=10
58 | # lock file = /var/lock/rsyncd_gpu
59 | # the default for read only is yes...
60 | read only = yes
61 | write only = no
62 | list = yes
63 | 
64 | [CAHINTL]
65 | comment = CAH INTL dataset archive
66 | path = /home/archiveteam/CAH/resultsintl/
67 | # max connections=10
68 | # the default for read only is yes...
69 | read only = no
70 | write only = yes
71 | list = yes
72 | # exclude =
73 | # exclude from =
74 | # include =
75 | # include from =
76 | # auth users =
77 | # secrets file = /etc/rsyncd.secrets
78 | # hosts allow =
79 | # hosts deny =
80 | 
81 | [CAHNOLANG]
82 | comment = CAH dataset archive
83 | path = /home/archiveteam/CAH/resultsnolang/
84 | # max connections=10
85 | # the default for read only is yes...
86 | read only = no
87 | write only = yes
88 | list = yes
89 | # exclude =
90 | # exclude from =
91 | # include =
92 | # include from =
93 | # auth users =
94 | # secrets file = /etc/rsyncd.secrets
95 | # hosts allow =
96 | # hosts deny =
97 | 
--------------------------------------------------------------------------------
/bloom_server/bloomexport.py:
--------------------------------------------------------------------------------
1 | '''
2 | preamble:
3 | the script is adapted for large redisbloom filters and does not require double the memory size, since it iteratively dumps or loads chunks of 512MB to disk
4 | 
5 | arguments:
6 | -m/--mode (dump|restore)
7 | -k/--key key to be dumped or source key for backup to restore
8 | -d/--dest key to be restored to
9 | -p/--path where to store/retrieve the backup files
10 | 
11 | usage:
12 | 1. backup of key "main"
13 | 
14 | python3 bloomexport.py -m dump -k main
15 | 
16 | 2. 
restore from backup of key "main" into key "test" (destination key should not exist, it will be created) 17 | 18 | python3 bloomexport.py -m restore -k main -d test 19 | 20 | ''' 21 | 22 | import sys 23 | import glob 24 | import pickle 25 | import argparse 26 | from redisbloom.client import Client 27 | r = Client() 28 | 29 | def make_dump(r, key, path): 30 | iter = 0 31 | while True: 32 | iter, data = r.bfScandump(key, iter) 33 | if iter == 0: 34 | return 35 | else: 36 | print(iter) 37 | with open(f"{path}/{iter}.{key}.bloom","wb") as f: 38 | pickle.dump(data, f) 39 | 40 | def restore_dump(r, source, dest, path): 41 | iters = [] 42 | files = glob.glob(f"{path}/*.bloom") 43 | for file in files: 44 | try: 45 | iter, key, ext = file.split("/")[-1].split(".") 46 | if key == source: 47 | iters.append(iter) 48 | except: 49 | pass 50 | # reorder chunks ascending 51 | iters.sort(key=lambda x: int(x)) 52 | for iter in iters: 53 | with open(f"{path}/{iter}.{source}.bloom","rb") as f: 54 | data = pickle.load(f) 55 | r.bfLoadChunk(dest, iter, data) 56 | print(iter) 57 | return 58 | 59 | if __name__ == "__main__": 60 | # script initialization 61 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -m/--mode -k/--key -p/--path') 62 | parser.add_argument("-m","--mode",action='append',help="Choose mode dump or restore", required=True) 63 | parser.add_argument("-k","--key",action='append',help="Choose bloom key", required=True) 64 | parser.add_argument("-d","--destination",action='append',help="Choose destination bloom key at restore", required=False) 65 | parser.add_argument("-p","--path",action='append',help="Choose folder", required=False) 66 | args = parser.parse_args() 67 | path = "." 68 | if args.path is not None: 69 | path = args.path[0] 70 | key = args.key[0] 71 | dest = key 72 | if args.destination is not None: 73 | dest = args.destination[0] 74 | if args.mode[0] == "dump": 75 | make_dump(r, key, path) 76 | print(f"dump for {key} saved in {path}") 77 | elif args.mode[0] == "restore": 78 | restore_dump(r, key, dest, path) 79 | print(f"dump for {key} restored as {dest} from {path}") 80 | else: 81 | print("bad mode entered") -------------------------------------------------------------------------------- /postgres/dedup_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import requests 5 | from glob import glob 6 | from random import randint 7 | from tqdm.auto import tqdm 8 | import os.path as path 9 | import time 10 | bloomip = "116.202.162.146" 11 | 12 | files = glob("*.txt") 13 | with tqdm(total=len(files), file=sys.stdout) as pbar: 14 | pbar.desc = "1" 15 | for file in files: 16 | age = time.time() - path.getmtime(file) 17 | if not os.path.isfile(f"{file}.deduped") and age > 24*60*60: 18 | try: 19 | df = pd.read_csv(file, sep="\t", names=["s","url","a","b","c","d","e","f"]) 20 | df.drop_duplicates(subset="url", keep='first').reset_index(drop=True) 21 | 22 | with open('hash.txt', 'w') as f: 23 | f.write(df['url'].str.cat(sep='\n')) 24 | post = { 25 | 'file': ('hash.txt', open('hash.txt', 'rb')), 26 | 'key': (None, "dedup"), 27 | } 28 | os.remove('hash.txt') 29 | 30 | failure = True 31 | for _ in range(10): 32 | response = requests.post(f'http://{bloomip}:8000/deduplicate/', files=post) 33 | if response.status_code != 200: 34 | time.sleep(randint(5,30)) 35 | else: 36 | failure = False 37 | break 38 | if failure: 39 | continue 40 | 41 | valid_urls = 
response.content.decode("utf-8").split("\n") 42 | 43 | ratio = round(len(valid_urls) / len(df.index), 2) 44 | 45 | df = df[df.url.isin(valid_urls)] 46 | df.reset_index(inplace=True, drop=True) 47 | 48 | df.to_csv(file+".deduped", sep="\t", index=False, header=False) 49 | 50 | # add parsed urls to parsed bloom server 51 | with open('hash.txt', 'w') as f: 52 | for url in valid_urls: 53 | f.write(url.strip()+"\n") 54 | post = { 55 | 'file': ('hash.txt', open('hash.txt', 'rb')), 56 | 'key': (None, 'dedup'), 57 | } 58 | os.remove('hash.txt') 59 | 60 | failure = True 61 | for _ in range(10): 62 | try: 63 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 64 | if response.status_code != 200: 65 | time.sleep(randint(5,30)) 66 | else: 67 | failure = False 68 | break 69 | except: 70 | time.sleep(15) 71 | if failure: 72 | continue 73 | os.system(f"rm {file}") 74 | os.system(f"mv {file}.deduped {file}") 75 | pbar.desc = str(ratio) 76 | pbar.update(1) 77 | except Exception as e: 78 | print (e) 79 | -------------------------------------------------------------------------------- /preCLIP_staging/rsyncd.conf: -------------------------------------------------------------------------------- 1 | # GLOBAL OPTIONS 2 | 3 | #motd file=/etc/motd 4 | #log file=/var/log/rsyncd 5 | # for pid file, do not use /var/run/rsync.pid if 6 | # you are going to run rsync out of the init.d script. 7 | # The init.d script does its own pid file handling, 8 | # so omit the "pid file" line completely in that case. 9 | # pid file=/var/run/rsyncd.pid 10 | #syslog facility=daemon 11 | #socket options= 12 | 13 | # MODULE OPTIONS 14 | 15 | [gpujobs] 16 | 17 | comment = database 3 staged gpu jobs storage 18 | path = /mnt/md0/gpujobs 19 | use chroot = yes 20 | # max connections=10 21 | lock file = /var/lock/rsyncd 22 | # the default for read only is yes... 23 | read only = no 24 | write only = no 25 | list = no 26 | uid = archiveteam 27 | gid = archiveteam 28 | # exclude = 29 | # exclude from = 30 | # include = 31 | # include from = 32 | # auth users = 33 | # secrets file = /etc/rsyncd.secrets 34 | strict modes = yes 35 | # hosts allow = 36 | # hosts deny = 37 | ignore errors = no 38 | ignore nonreadable = yes 39 | transfer logging = no 40 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes. 41 | timeout = 600 42 | refuse options = checksum dry-run 43 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz 44 | 45 | [gpujobsml] 46 | 47 | comment = database 3 staged gpu jobs storage 48 | path = /mnt/md0/gpujobsml 49 | use chroot = yes 50 | # max connections=10 51 | lock file = /var/lock/rsyncd 52 | # the default for read only is yes... 53 | read only = no 54 | write only = no 55 | list = no 56 | uid = archiveteam 57 | gid = archiveteam 58 | # exclude = 59 | # exclude from = 60 | # include = 61 | # include from = 62 | # auth users = 63 | # secrets file = /etc/rsyncd.secrets 64 | strict modes = yes 65 | # hosts allow = 66 | # hosts deny = 67 | ignore errors = no 68 | ignore nonreadable = yes 69 | transfer logging = no 70 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes. 71 | timeout = 600 72 | refuse options = checksum dry-run 73 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz 74 | 75 | [gpujobsnolang] 76 | 77 | comment = database 3 staged gpu jobs storage 78 | path = /mnt/md0/gpujobsnolang 79 | use chroot = yes 80 | # max connections=10 81 | lock file = /var/lock/rsyncd 82 | # the default for read only is yes... 
83 | read only = no 84 | write only = no 85 | list = no 86 | uid = archiveteam 87 | gid = archiveteam 88 | # exclude = 89 | # exclude from = 90 | # include = 91 | # include from = 92 | # auth users = 93 | # secrets file = /etc/rsyncd.secrets 94 | strict modes = yes 95 | # hosts allow = 96 | # hosts deny = 97 | ignore errors = no 98 | ignore nonreadable = yes 99 | transfer logging = no 100 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes. 101 | timeout = 600 102 | refuse options = checksum dry-run 103 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz -------------------------------------------------------------------------------- /preCLIP_staging/cleanup3db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import argparse 5 | from glob import glob 6 | import os.path as path 7 | from datetime import datetime 8 | from multiprocessing import Process, Queue 9 | from sqlalchemy import create_engine 10 | from configparser import ConfigParser 11 | 12 | def config(filename='database.ini', section='cah_production'): 13 | # create a parser 14 | parser = ConfigParser() 15 | # read config file 16 | parser.read(filename) 17 | # get section, default to postgresql 18 | db = {} 19 | if parser.has_section(section): 20 | params = parser.items(section) 21 | for param in params: 22 | db[param[0]] = param[1] 23 | else: 24 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 25 | return db 26 | 27 | def confirm_delete(engine, uuid, jobset="en"): 28 | jobtable = "jobs" 29 | if jobset=="intl": 30 | jobtable = "jobs_intl" 31 | select_stmt1 = f"select count(*) from {jobtable} where status > 1 and jobid = '{uuid}'" 32 | conn = engine.raw_connection() 33 | cur = conn.cursor() 34 | cur.execute(select_stmt1) 35 | jobcount = int(cur.fetchone()[0]) 36 | conn.commit() 37 | cur.close() 38 | conn.close() 39 | return jobcount 40 | 41 | def worker(engine, q: Queue, jobset = "en"): 42 | jobspath = '/mnt/md0/gpujobs/' 43 | if jobset == "intl": 44 | jobspath = '/mnt/md0/gpujobsml/' 45 | while q.qsize()>0: 46 | try: 47 | uuid = q.get_nowait() 48 | if confirm_delete(engine, uuid, jobset)==1: 49 | file = f"{jobspath}{uuid}.tar.gz" 50 | if os.path.isfile(file) and os.path.getmtime(file) < time.time() - 60*60: # this makes the code more robust 51 | os.remove(file) 52 | print(f"deleted {file}") 53 | except Exception as e: 54 | print (f"worker raised error {e}") 55 | pass 56 | 57 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -s/--set') 58 | parser.add_argument("-s","--set",action='append',help="Choose current set (en, nolang, intl)",required=False) 59 | args = parser.parse_args() 60 | 61 | params = config() 62 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}',pool_size=25, max_overflow=50) 63 | 64 | jobset = "en" 65 | 66 | if args.set is not None: 67 | jobset = args.set[0] 68 | 69 | jobspath = '/mnt/md0/gpujobs/*.tar.gz' 70 | if jobset == "intl": 71 | jobspath = '/mnt/md0/gpujobsml/*.tar.gz' 72 | 73 | now = datetime.now().strftime("%Y/%m/%d_%H:%M") 74 | list_of_files = glob(jobspath) 75 | frm = len(list_of_files) 76 | 77 | start = time.time() 78 | q = Queue() 79 | procs = [] 80 | for i in range(10): 81 | procs.append(Process(target=worker, args=[engine, q, jobset])) 82 | 83 | for file in list_of_files: 84 | if time.time() - path.getmtime(file) < 300: 85 | continue 86 | uuid = 
file.split("/")[4].split(".")[0] 87 | q.put(uuid) 88 | 89 | time.sleep(20) 90 | 91 | for proc in procs: 92 | proc.start() 93 | for proc in procs: 94 | proc.join() 95 | 96 | list_of_files = glob(jobspath) 97 | end = len(list_of_files) 98 | 99 | with open("jobs.txt","wt") as f: 100 | for file in list_of_files: 101 | f.write(file + "\n") 102 | 103 | print(f"[{now}] from {frm} to {end} \"task executed in\" {round(time.time()-start,2)} sec") 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env* 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Crawling@Home 141 | CLOP 142 | clip 143 | crawlingathome_client 144 | save/* 145 | *.wat 146 | hcloud-python 147 | hcloud 148 | cloud-init 149 | gpusemaphore 150 | *.zip 151 | superseeded* 152 | kaggle.py 153 | workers.txt 154 | crawling_at_home* 155 | FIRST_SAMPLE* 156 | image_embedding* 157 | *.jpg 158 | *.png 159 | *.jpeg 160 | *.webp 161 | [1-9]*-[1-9]*-[1-9]*-[1-9]*/ 162 | test.* 163 | duplicates/ 164 | gpujob.zip_* 165 | results/ 166 | stats/ 167 | save/ 168 | parquet/ 169 | blocklists/* 170 | *.tar 171 | *.tar.gz 172 | hash.txt 173 | hetzner.txt 174 | alibaba.txt 175 | oracle.txt 176 | test/ 177 | gpuerr.txt 178 | database.ini 179 | temp.gz 180 | gpuout.txt 181 | [0-9]/ 182 | [1-9][0-9]/ 183 | errors.txt 184 | *_full_wat.csv 185 | *.prod 186 | alibaba_instances.csv 187 | _sample_data/ 188 | chromedriver 189 | -------------------------------------------------------------------------------- /preCLIP_staging/cleanup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import requests 5 | import argparse 6 | from glob import glob 7 | import os.path as path 8 | from datetime import datetime 9 | from multiprocessing import Process, Queue 10 | from sqlalchemy import create_engine 11 | from configparser import ConfigParser 12 | 13 | def config(filename='database.ini', section='cah_production'): 14 | # create a parser 15 | parser = ConfigParser() 16 | # read config file 17 | parser.read(filename) 18 | # get section, default to postgresql 19 | db = {} 20 | if parser.has_section(section): 21 | params = parser.items(section) 22 | for param in params: 23 | db[param[0]] = param[1] 24 | else: 25 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 26 | return db 27 | 28 | def confirm_delete(engine, uuid, jobset="en"): 29 | jobtable = "jobs" 30 | if jobset != "en": 31 | jobtable = f"jobs_{jobset}" 32 | select_stmt1 = f"select count(*) from {jobtable} where status > 1 and jobid = '{uuid}'" 33 | conn = engine.raw_connection() 34 | cur = conn.cursor() 35 | cur.execute(select_stmt1) 36 | jobcount = int(cur.fetchone()[0]) 37 | conn.commit() 38 | cur.close() 39 | conn.close() 40 | return jobcount 41 | 42 | def worker(engine, q: Queue, jobset = "en"): 43 | jobspath = '/mnt/md0/gpujobs/' 44 | if jobset == "intl": 45 | jobspath = '/mnt/md0/gpujobsml/' 46 | if jobset == "nolang": 47 | jobspath = '/mnt/md0/gpujobsnolang/' 48 | while q.qsize()>0: 49 | try: 50 | uuid = q.get_nowait() 51 | if confirm_delete(engine, uuid, jobset)==1: 52 | file = f"{jobspath}{uuid}.tar.gz" 53 | if os.path.isfile(file) and os.path.getmtime(file) < time.time() - 60*60: # this makes the code more robust 54 | os.remove(file) 55 | print(f"deleted {file}") 56 | except Exception as e: 57 | print (f"worker raised error {e}") 
58 | pass 59 | 60 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -s/--set') 61 | parser.add_argument("-s","--set",action='append',help="Choose current set (en, nolang, intl)",required=False) 62 | args = parser.parse_args() 63 | 64 | params = config() 65 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}',pool_size=25, max_overflow=50) 66 | 67 | jobset = "en" 68 | 69 | if args.set is not None: 70 | jobset = args.set[0] 71 | 72 | jobspath = '/mnt/md0/gpujobs/*.tar.gz' 73 | if jobset == "intl": 74 | jobspath = '/mnt/md0/gpujobsml/*.tar.gz' 75 | if jobset == "nolang": 76 | jobspath = '/mnt/md0/gpujobsnolang/*.tar.gz' 77 | 78 | now = datetime.now().strftime("%Y/%m/%d_%H:%M") 79 | list_of_files = glob(jobspath) 80 | frm = len(list_of_files) 81 | 82 | start = time.time() 83 | q = Queue() 84 | procs = [] 85 | for i in range(10): 86 | procs.append(Process(target=worker, args=[engine, q, jobset])) 87 | 88 | for file in list_of_files: 89 | if time.time() - path.getmtime(file) < 300: 90 | continue 91 | uuid = file.split("/")[4].split(".")[0] 92 | q.put(uuid) 93 | 94 | time.sleep(20) 95 | 96 | for proc in procs: 97 | proc.start() 98 | for proc in procs: 99 | proc.join() 100 | 101 | list_of_files = glob(jobspath) 102 | end = len(list_of_files) 103 | 104 | with open("jobs.txt","wt") as f: 105 | for file in list_of_files: 106 | f.write(file + "\n") 107 | 108 | print(f"[{now}] from {frm} to {end} \"task executed in\" {round(time.time()-start,2)} sec") 109 | -------------------------------------------------------------------------------- /postgres/tables.sql: -------------------------------------------------------------------------------- 1 | create table dataset_en 2 | ( 3 | sampleid bigint not null 4 | constraint dataset_en_pk 5 | primary key, 6 | url text not null, 7 | text text not null, 8 | license varchar, 9 | domain varchar, 10 | wat integer, 11 | status smallint default 0, 12 | illegal boolean default false, 13 | hash varchar not null, 14 | modified timestamp, 15 | language varchar not null, 16 | width integer, 17 | height integer 18 | ) 19 | with (autovacuum_analyze_threshold = 10000, autovacuum_vacuum_cost_limit = 50, autovacuum_vacuum_cost_delay = 0.1, autovacuum_vacuum_scale_factor = 0.1); 20 | 21 | alter table dataset_en 22 | owner to cah; 23 | 24 | create index dataset_en_status_index 25 | on dataset_en (status); 26 | 27 | create trigger update_customer_modtime 28 | before update 29 | on dataset_en 30 | for each row 31 | execute procedure update_modified_column(); 32 | 33 | create table dataset_intl 34 | ( 35 | sampleid bigint not null 36 | constraint dataset_pk 37 | primary key, 38 | url text not null, 39 | text text not null, 40 | license varchar, 41 | domain varchar, 42 | wat integer, 43 | status smallint default 0, 44 | illegal boolean default false, 45 | hash varchar not null, 46 | modified timestamp, 47 | language varchar not null, 48 | width integer, 49 | height integer 50 | ) 51 | with (autovacuum_analyze_threshold = 10000000, autovacuum_vacuum_cost_limit = 150, autovacuum_vacuum_cost_delay = 0.1, autovacuum_vacuum_scale_factor = 0); 52 | 53 | alter table dataset_intl 54 | owner to cah; 55 | 56 | create index dataset_status_index 57 | on dataset_intl (status); 58 | 59 | create trigger update_customer_modtime 60 | before update 61 | on dataset_intl 62 | for each row 63 | execute procedure update_modified_column(); 64 | 65 | create table dataset_nolang 66 | ( 67 | sampleid bigint not null 68 | 
constraint dataset_nolang_pk 69 | primary key, 70 | url text not null, 71 | text text not null, 72 | license varchar, 73 | domain varchar, 74 | wat integer, 75 | status smallint default 0, 76 | illegal boolean default false, 77 | hash varchar not null, 78 | modified timestamp, 79 | language varchar not null, 80 | width integer, 81 | height integer 82 | ) 83 | with (autovacuum_analyze_threshold = 10000000, autovacuum_vacuum_cost_limit = 150, autovacuum_vacuum_cost_delay = 0.1, autovacuum_vacuum_scale_factor = 0); 84 | 85 | alter table dataset_nolang 86 | owner to cah; 87 | 88 | create index dataset_nolang_status_index 89 | on dataset_nolang (status); 90 | 91 | create trigger update_customer_modtime 92 | before update 93 | on dataset_nolang 94 | for each row 95 | execute procedure update_modified_column(); 96 | 97 | create table dataset_buffer 98 | ( 99 | sampleid bigint, 100 | url text not null, 101 | text text not null, 102 | license varchar, 103 | domain varchar, 104 | wat integer, 105 | status smallint default 0, 106 | illegal boolean default false, 107 | hash varchar not null, 108 | modified timestamp, 109 | language varchar not null, 110 | width integer, 111 | height integer 112 | ); 113 | 114 | alter table dataset_buffer 115 | owner to cah; 116 | 117 | create trigger skip_errors 118 | before insert 119 | on dataset_buffer 120 | for each row 121 | execute procedure on_insert_in_original_table(); 122 | 123 | -------------------------------------------------------------------------------- /postgres/stage_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import psycopg2 5 | import argparse 6 | import fileinput 7 | from glob import glob 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | from configparser import ConfigParser 11 | from tqdm.auto import tqdm 12 | 13 | 14 | def config(filename='database.ini', mode="test"): 15 | # create a parser 16 | parser = ConfigParser() 17 | # read config file 18 | parser.read(filename) 19 | 20 | section='postgresql' 21 | if mode == "production": 22 | section='cah_production' 23 | 24 | # get section, default to postgresql 25 | db = {} 26 | if parser.has_section(section): 27 | params = parser.items(section) 28 | for param in params: 29 | db[param[0]] = param[1] 30 | else: 31 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 32 | 33 | return db 34 | 35 | def get_count(engine, ds="intl"): 36 | table="dataset_intl" 37 | if ds == "en": 38 | table = "dataset_en" 39 | elif ds == "nolang": 40 | table = "dataset_nolang" 41 | select_stmt1 = f"select count(*) from {table} where status = 0" 42 | conn = engine.raw_connection() 43 | cur = conn.cursor() 44 | cur.execute(select_stmt1) 45 | count = cur.fetchone() 46 | conn.commit() 47 | cur.close() 48 | conn.close() 49 | return str(count[0]) 50 | 51 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -m/--mode -s/--set -p/--path') 52 | parser.add_argument("-m","--mode",action='append',help="Mode to run", required=True) 53 | parser.add_argument("-s","--set",action='append',help="Dataset to run", required=False) 54 | parser.add_argument("-p","--path",action='append',help="Choose source path", required=False) 55 | parser.add_argument("-l","--limit",action='append',help="Specify DB table limit", required=False) 56 | args = parser.parse_args() 57 | 58 | dir = "/mnt/md1/export/staging" 59 | if args.path is not None: 60 | dir = args.path[0] 61 | 62 | mode = "txt" 63 | if 
args.mode is not None: 64 | mode = args.mode[0] 65 | 66 | ds = "intl" 67 | if args.set is not None: 68 | ds = args.set[0] 69 | 70 | limit = 500000000 71 | if args.limit is not None: 72 | limit = int(args.limit[0]) 73 | 74 | i = 0 75 | 76 | params = config(mode="production") 77 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_size=5, max_overflow=10, pool_pre_ping=True) 78 | 79 | files = glob(f'{dir}/*.{mode}') 80 | 81 | conn = engine.raw_connection() 82 | 83 | j = 3 84 | if mode == "txt": 85 | j = 1000 86 | 87 | with tqdm(total=len(files), file=sys.stdout) as pbar: 88 | pbar.desc = get_count(engine, ds) 89 | for file in files: 90 | try: 91 | cur = conn.cursor() 92 | with open(file, "rt") as f: 93 | if mode == "txt": 94 | cur.copy_from(f, 'dataset_buffer', columns=("sampleid","url","text","license","domain","wat","hash","language")) 95 | elif mode == "csv": 96 | cur.copy_expert("COPY dataset_buffer from STDIN DELIMITER '|' CSV HEADER", f) 97 | else: 98 | print("bad mode, choose txt or csv only") 99 | break 100 | conn.commit() 101 | cur.close() 102 | os.system(f"mv {file} {file}.done") 103 | i+=1 104 | if i % j == 0: 105 | count = get_count(engine, ds) 106 | if int(count) > limit: 107 | break 108 | else: 109 | pbar.desc = count 110 | pbar.update(1) 111 | 112 | except Exception as e: 113 | print(f"error {file} because {e}") 114 | for line in fileinput.input(file, inplace = True): 115 | if not re.search(r'\x00', line): 116 | print(line, end="") 117 | try: 118 | df = pd.read_csv(file, sep="\t", on_bad_lines='skip', header=None) 119 | df[2] = df[2].apply(lambda x: x.replace("\n","")) 120 | df[5] = df[5].apply(lambda x: int(x)) 121 | df.to_csv(file, sep="\t", index=False, header=False) 122 | except: 123 | #os.system(f"mv {file} {file}.error") 124 | pass 125 | conn.close() 126 | conn = engine.raw_connection() 127 | conn.close() 128 | 129 | print("if you had files with error of \x00 present in file, files were automatically corrected, please rerun the script") -------------------------------------------------------------------------------- /cloud boot/boot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | sudo su root 3 | 4 | apt update && yes | DEBIAN_FRONTEND=noninteractive apt upgrade 5 | yes | apt install python3-pip git build-essential libssl-dev libffi-dev python3-dev libpq-dev libwebp-dev libjpeg-dev libtiff-dev libopenjp2-7-dev zlib1g-dev knot-dnsutils resolvconf protobuf-compiler libprotobuf-dev 6 | wget https://secure.nic.cz/files/knot-resolver/knot-resolver-release.deb 7 | sudo dpkg -i knot-resolver-release.deb 8 | sudo apt install -y knot-resolver 9 | systemctl enable --now kresd@{1..2}.service 10 | systemctl disable systemd-resolved 11 | 12 | echo 'CAH_NICKNAME="Caricature, Inc"' >> /etc/environment 13 | 14 | adduser --system --group --shell /bin/bash crawl 15 | echo 'crawl ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 16 | 17 | touch /home/crawl/worker-reset.sh 18 | chmod 0744 /home/crawl/worker-reset.sh 19 | echo '#!/bin/bash' >> /home/crawl/worker-reset.sh 20 | echo '# Updates and resets the worker via SSH command' >> /home/crawl/worker-reset.sh 21 | echo 'rm -rf /home/crawl/*.tar.gz' >> /home/crawl/worker-reset.sh 22 | echo 'cd /home/crawl/crawlingathome-gpu-hcloud' >> /home/crawl/worker-reset.sh 23 | echo 'git pull' >> /home/crawl/worker-reset.sh 24 | echo 'systemctl restart crawl' >> /home/crawl/worker-reset.sh 25 | 26 | echo "* soft nproc 65535 " >> 
/etc/security/limits.conf 27 | echo "* hard nproc 65535 " >> /etc/security/limits.conf 28 | echo "* soft nofile 65535" >> /etc/security/limits.conf 29 | echo "* hard nofile 65535" >> /etc/security/limits.conf 30 | echo "root soft nproc 65535 " >> /etc/security/limits.conf 31 | echo "root hard nproc 65535 " >> /etc/security/limits.conf 32 | echo "root soft nofile 65535" >> /etc/security/limits.conf 33 | echo "root hard nofile 65535" >> /etc/security/limits.conf 34 | echo "session required pam_limits.so" >> /etc/pam.d/common-session 35 | echo "fs.file-max = 2097152" >> /etc/sysctl.conf 36 | 37 | echo "[Unit]" >> /etc/systemd/system/crawl.service 38 | echo "After=network.service" >> /etc/systemd/system/crawl.service 39 | echo "Description=Crawling @ Home" >> /etc/systemd/system/crawl.service 40 | echo "[Service]" >> /etc/systemd/system/crawl.service 41 | echo "Type=simple" >> /etc/systemd/system/crawl.service 42 | echo "LimitNOFILE=2097152" >> /etc/systemd/system/crawl.service 43 | echo "WorkingDirectory=/home/crawl" >> /etc/systemd/system/crawl.service 44 | echo "ExecStart=/home/crawl/crawl.sh" >> /etc/systemd/system/crawl.service 45 | echo "EnvironmentFile=/etc/environment" >> /etc/systemd/system/crawl.service 46 | echo "User=crawl" >> /etc/systemd/system/crawl.service 47 | echo "Nice=10" >> /etc/systemd/system/crawl.service 48 | echo "[Install]" >> /etc/systemd/system/crawl.service 49 | echo "WantedBy=multi-user.target" >> /etc/systemd/system/crawl.service 50 | chmod 664 /etc/systemd/system/crawl.service 51 | 52 | systemctl daemon-reload 53 | systemctl enable crawl.service 54 | 55 | touch /home/crawl/crawl.sh 56 | echo '#!/bin/bash' >> /home/crawl/crawl.sh 57 | echo "while true" >> /home/crawl/crawl.sh 58 | echo "do" >> /home/crawl/crawl.sh 59 | echo "python3 -u /home/crawl/crawlingathome-gpu-hcloud/worker.py >> /home/crawl/crawl.log 2>&1" >> /home/crawl/crawl.sh 60 | echo "sleep 1" >> /home/crawl/crawl.sh 61 | echo "done" >> /home/crawl/crawl.sh 62 | chmod 744 /home/crawl/crawl.sh 63 | mkdir /home/crawl/.ssh 64 | echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC0Ff0RcDRafX/VyxYJTeMWJrJGHIKvAvIG+nUmUR73iQFcwF7JP8FucLO0baVIPb029DI469SOZJWh6FTwt5T+IT5jm0UDAs2gwYClS+tRbohr27kXoILhlugFiCor4TD0mMhBTKme4RPLlcbLYaZq4r7Rep0rbWn46f3Gma2fDXgpy3v1JZBa30yHxQVO+s2UjbqPk9RcsWNQ7oap36yGrVb6Bc8ucwAM6pGTdJMQBZoTj0tgI/b9cSgKO1JRyUTt6HhuW+DDfrOuZPJLqOq0f5sNV0gD+89K9zNEtZeO+bpQuZvf+cwhb10XQc4t0Yd8EsyhxSbWbdvn6Utb9yQwmk7ThJkxLLLmDp5LtClOvp6PTFUooDjj3DgFfD8ZBK+sckwu1TPAKa8Y8jU+q4GfF5abAej5rXObVjVcKHsziBSsSG6yViVtoFAvqh0dYfM/Ujz7dj6KtfRs67J5X+8CJvvKokRZcjMs6neJNHoRll5t6K/uhQgKHvBRpFqL9kGS4hTEdJog47w9o8qmLTMYQ340ckEZkRh/c1lWu51wNycLW1iab40D2F/ymMihGxMo9AqHKoqE/cnh9SaZr1EGr7s4BhBnAvyOwHh2+sW5ndOenDOZ1wGbYbwVJznSG8I1tdlJzEjf2GuW1HZtxE/95yW0zlEQkue8mBfNUL+Q6Q== Generated by richa@RICHARD' >> /home/crawl/.ssh/authorized_keys 65 | 66 | sed -i -e '/^\(#\|\)cache\.size/s/^.*$/cache\.size = 10000 \* MB/' /etc/knot-resolver/kresd.conf 67 | echo "trust_anchors.remove('.')" >> /etc/knot-resolver/kresd.conf 68 | echo "policy.add(policy.all(policy.FORWARD({'1.1.1.1'})))" >> /etc/knot-resolver/kresd.conf 69 | 70 | cd /home/crawl 71 | 72 | git clone https://github.com/rvencu/crawlingathome-gpu-hcloud 73 | cd crawlingathome-gpu-hcloud 74 | git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client 75 | pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir 76 | pip3 install -r worker-requirements.txt --no-cache-dir 77 | pip install random_user_agent 78 | 79 | chown crawl:crawl -R /home/crawl/ 80 | 
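# By this point the node has a local knot-resolver cache forwarding to 1.1.1.1
# (DNSSEC trust anchor removed), raised nproc/nofile limits, the dedicated
# "crawl" user, and the crawl.service unit that keeps worker.py running in a
# loop. The cleanup and reboot below apply the new limits and start the
# service automatically at boot.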
81 | sudo apt clean 82 | sudo reboot 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawling@Home GPU controlled Hetzner Cloud swarm of scrapers 2 | 3 | > Help us build a billion-scale image-caption dataset by filtering Common Crawl with OpenAI CLIP. At the time of this writing we are up to 5 billion high quality pairs ready for training various models but we still expect your help to advance to the potential 6 billion quality pairs estimated to exist in the commoncrawl data. This dataset is intended for public use and towards a truly open access to AI for everyone ! 4 | 5 | ## Concept 6 | This image-text scraping task comes with specific characteristics: link lists might be old and images might not be online anymore, even entire domains might be missing. Also there are seldom multiple links pointing to the same domain, so the DNS queries are many and often. Finally after the actual scraping there is a computational intensive task to calculate similarities between images themselves and their captions. 7 | 8 | On a normal CPU machine, scraping and filtering take almost the same time. On a GPU though filtering is much faster, in order of 60x faster than on single CPU. 9 | 10 | Hence this concept for crawling@home where we created a data pipeline on 3 levels: 11 | 1. commoncrawl preprocessing, where we use a swarm of about 500 cpus to download, parse and send results to a database node with candidates for our dataset, meaning image urls with alt text, plus the detected language using gcld3. By the language detection we split the candidates into English, Multilanguage (non English) and Nolang (language not detected with confidence) categories. 12 | 2. image downloading and inspection, prefiltering by image type and resolution, producing further candidates for CLIP or mCLIP inference 13 | 3. CLIP style inference where we calculate similarity of image embeddings with text embeddings and retain only pairs with higher similarity than a manually set threshold 14 | 15 | Common Crawl jobs are coordinated by a tracker with dashboard at http://cah.io.community/ 16 | 17 | ## Cloud workers 18 | We used AWS workers for first level of the above pipeline, Hetzner and Alibaba workers for the second level and home GPU plus AWS GPU nodes for the third level. 19 | 20 | Thus the code migrated to: 21 | 1. Hetzner swarm control: use `infrastructure.py` to control the swarm at Hetzner Cloud via commands like `python3 infrastructure.py up 20 fsn1` where up means bring up swarm, 20 is the desired number of nodes, and fsn1 is the desired datacenter location. 22 | 2. Alibaba swarm control: due to cost restrictions we used Simple Application Servers with Alibaba, and developed a limited scope control script 23 | 3. CPU clients: 24 | a) `ccpp.py` is used to preprocess common crawl wat files. Nodes require minimum one CPU core and 1GB RAM for each CPU. 25 | b) `dbdl.py` is used to download images. Nodes require minimum one CPU core and 1GB RAM for each CPU. 26 | 3. GPU clients only consume max 3.5GB of GPU VRAM so any nVidia GPU card with 4GB VRAM or more is deemed compatible: 27 | a) run `python3 gpu_inference.py` from any Linux based PC with an Nvidia GPU and correct drivers installed 28 | 29 | If you want to install on your own box, then 30 | ## Prerequisites 31 | 1. Ubuntu box with 4GB+ Nvidia GPU 32 | 2. Nvidia driver installed 33 | 3. 
Cuda toolkit 11+ (also corresponding cudnn is recommended for future) 34 | 4. check driver installation with `nvidia-smi` command 35 | 5. your user is able to run `sudo` commands 36 | 6. install `python3-pip` and `git` packages 37 | ## Distributed infrastructure setup and run 38 | 1. Make an account at Hetzner Cloud (https://www.hetzner.com/) and issue an API token 39 | 2. create the `.env` file and paste your HCLOUD API key in it. optionally, if you have more than one account, paste all API keys each on a separate line 40 | 3. bring up infrastructure at any time with `python3 infrastructure.py up N` in order to raise *N* nodes. It will scan all API keys and create maximum available servers on each until *N* limit is met 41 | 4. tear down infrastructure at any time with `python3 infrastructure.py down` in order to shutdown things (and save cash). this will shut down all cloud servers that belong to all API tokens saved in the `.env` file. Be aware, this command will delete all servers in the accounts even if they are NOT related to this project !!! 42 | 43 | If you wish to SSH into any droplet you can use this command: `ssh -oStrictHostKeyChecking=no -oIdentitiesOnly=yes -i~/.ssh/id_cah crawl@<>`. The crawling script is ran as a service, check logs with `tail -f crawl.log`. Access service status or commands with `sudo systemctl stop|restart|start crawl` 44 | 45 | If you are asked for any droplet root password at any time, it means you need to rerun `git pull` and `source conda-setup.sh` to refresh the files and regenerate the ssh keys pair. 46 | 47 | ## How to run GPU node from home computer 48 | 1. run `git clone https://github.com/rvencu/crawlingathome-gpu-hcloud`, to download crawlingathome GPU node script 49 | 2. run `cd crawlingathome-gpu-hcloud`, to enter the newly created directory 50 | 3. run `source conda-setup.sh` to setup the environment if you use anaconda. otherwise use `source pip-setup.sh`. the script will ask for a nickame to be used on leaderboard as well as for the sudo password 51 | 4. run `gpu_inference.py`. The script will run in a loop that can be interrupted at any time with Ctrl-C. 
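For reference, the node-allocation rule described under "Distributed infrastructure setup and run" above can be sketched as below. This is a simplified illustration only (it ignores per-account server limits and API errors, which `infrastructure.py` handles); the `.env` line format `<api_token> <nickname> <nodes> <real_name>` is the one documented at the top of `infrastructure.py`, where `nodes = -1` means "up to the account limit" and `nodes = 0` means "skip this key".

```python
def plan_allocation(env_lines, wanted):
    """Rough sketch: how many nodes each API token would be asked to create."""
    plan, remaining = [], wanted
    for line in env_lines:
        token, nickname, cap = line.split(" ")[:3]
        cap = int(cap)
        if remaining <= 0 or cap == 0:          # 0 = do not use this key
            continue
        count = remaining if cap < 0 else min(remaining, cap)  # -1 = no per-key cap
        plan.append((nickname, count))
        remaining -= count
    return plan

# e.g. plan_allocation(open(".env").read().splitlines(), 20)
```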
52 | 53 | This work is based on code written by: 54 | - https://github.com/TheoCoombes/crawlingathome 55 | - https://github.com/Wikidepia/crawlingathome-worker 56 | 57 | This is a subproject ran by the community around https://github.com/lucidrains/DALLE-pytorch 58 | -------------------------------------------------------------------------------- /cloud boot/cloud-init.yaml: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | users: 3 | - default 4 | - name: crawl 5 | groups: users, adm 6 | sudo: ALL=(ALL) NOPASSWD:ALL 7 | shell: /bin/bash 8 | ssh_authorized_keys: 9 | - ssh-rsa <> 10 | package_update: true 11 | package_upgrade: true 12 | packages: 13 | - python3-pip 14 | - git 15 | - build-essential 16 | - libssl-dev 17 | - libffi-dev 18 | - python3-dev 19 | - libwebp-dev 20 | - libjpeg-dev 21 | - libwebp-dev 22 | - libtiff-dev 23 | - libopenjp2-7-dev 24 | - zlib1g-dev 25 | - libpq-dev 26 | - knot-dnsutils 27 | - resolvconf 28 | - protobuf-compiler 29 | - libprotobuf-dev 30 | bootcmd: 31 | # setup knot resolver 32 | - wget https://secure.nic.cz/files/knot-resolver/knot-resolver-release.deb 33 | - sudo dpkg -i knot-resolver-release.deb 34 | - sudo apt install -y knot-resolver 35 | #- systemctl enable --now kresd@{1..2}.service 36 | #- systemctl disable systemd-resolved 37 | write_files: 38 | - path: /etc/environment 39 | content: | 40 | CAH_NICKNAME="<>" 41 | CLOUD="<>" 42 | append: true 43 | - path: /home/crawl/worker-reset.sh 44 | permissions: '0744' 45 | content: | 46 | #!/bin/bash 47 | # Updates and resets the worker via SSH command 48 | rm -rf /home/crawl/*.tar.gz 49 | cd /home/crawl/crawlingathome-gpu-hcloud 50 | git pull 51 | chown crawl:adm -R /home/crawl/ 52 | systemctl restart crawl 53 | - path: /etc/security/limits.conf 54 | content: | 55 | * soft nproc 65535 56 | * hard nproc 65535 57 | * soft nofile 65535 58 | * hard nofile 65535 59 | root soft nproc 65535 60 | root hard nproc 65535 61 | root soft nofile 65535 62 | root hard nofile 65535 63 | append: true 64 | - path: /home/crawl/crawl.sh 65 | permissions: '0744' 66 | content: | 67 | #!/bin/bash 68 | while true 69 | do 70 | python3 -u /home/crawl/crawlingathome-gpu-hcloud/dbdl.py -s nolang >> /home/crawl/crawl.log 2>&1 71 | sleep 1 72 | done 73 | - path: /home/crawl/database.ini 74 | permissions: '0744' 75 | content: | 76 | [cah_production] 77 | host=<> 78 | database=<> 79 | user=<> 80 | password=<> 81 | - path: /etc/systemd/system/crawl.service 82 | permissions: '0664' 83 | content: | 84 | [Unit] 85 | After=network.service 86 | Description=Crawling @ Home 87 | [Service] 88 | Type=simple 89 | LimitNOFILE=2097152 90 | WorkingDirectory=/home/crawl 91 | ExecStart=/home/crawl/crawl.sh 92 | EnvironmentFile=/etc/environment 93 | User=crawl 94 | Nice=10 95 | [Install] 96 | WantedBy=multi-user.target 97 | - path: /etc/knot-resolver/kresd.conf 98 | content: | 99 | -- SPDX-License-Identifier: CC0-1.0 100 | -- vim:syntax=lua:set ts=4 sw=4: 101 | -- Refer to manual: https://knot-resolver.readthedocs.org/en/stable/ 102 | -- Network interface configuration 103 | net.listen('127.0.0.1', 53, { kind = 'dns' }) 104 | net.listen('127.0.0.1', 853, { kind = 'tls' }) 105 | --net.listen('127.0.0.1', 443, { kind = 'doh2' }) 106 | net.listen('::1', 53, { kind = 'dns', freebind = true }) 107 | net.listen('::1', 853, { kind = 'tls', freebind = true }) 108 | --net.listen('::1', 443, { kind = 'doh2' }) 109 | -- Load useful modules 110 | modules = { 111 | 'hints > iterate', -- Load /etc/hosts and allow 
custom root hints 112 | 'stats', -- Track internal statistics 113 | 'predict', -- Prefetch expiring/frequent records 114 | 'serve_stale < cache' -- Server from cache if forwarder is staled 115 | } 116 | -- Cache size 117 | cache.size = 100 * MB 118 | -- Disable DNSSEC 119 | trust_anchors.remove('.') 120 | -- forward all traffic to specified IP addresses (selected automatically) 121 | -- policy.add(policy.all(policy.FORWARD({'10.254.0.5', '1.1.1.1'}))) 122 | runcmd: 123 | - [ ls, -l, / ] 124 | - [ sh, -xc, "echo $(date) ': hello crawl!'" ] 125 | - [ sh, -c, echo "=========hello crawl'=========" ] 126 | - ls -l /root 127 | # take care of max open files 128 | - echo "session required pam_limits.so" >> /etc/pam.d/common-session 129 | - echo "fs.file-max = 2097152" >> /etc/sysctl.conf 130 | # secure ssh 131 | - sed -i -e '/^\(#\|\)PermitRootLogin/s/^.*$/PermitRootLogin no/' /etc/ssh/sshd_config 132 | - sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config 133 | - sed -i -e '/^\(#\|\)X11Forwarding/s/^.*$/X11Forwarding no/' /etc/ssh/sshd_config 134 | - sed -i -e '/^\(#\|\)MaxAuthTries/s/^.*$/MaxAuthTries 2/' /etc/ssh/sshd_config 135 | - sed -i -e '/^\(#\|\)AllowTcpForwarding/s/^.*$/AllowTcpForwarding no/' /etc/ssh/sshd_config 136 | - sed -i -e '/^\(#\|\)AllowAgentForwarding/s/^.*$/AllowAgentForwarding no/' /etc/ssh/sshd_config 137 | - sed -i -e '/^\(#\|\)AuthorizedKeysFile/s/^.*$/AuthorizedKeysFile .ssh\/authorized_keys/' /etc/ssh/sshd_config 138 | - sed -i '$a AllowUsers crawl' /etc/ssh/sshd_config 139 | # install the script 140 | - cd /home/crawl 141 | - git clone https://github.com/rvencu/crawlingathome-gpu-hcloud 142 | - cd crawlingathome-gpu-hcloud 143 | #- git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client 144 | #- pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir 145 | - pip3 install -r worker-requirements.txt --no-cache-dir 146 | - pip install random_user_agent 147 | # make the script run as a service at startup 148 | - systemctl daemon-reload 149 | - systemctl enable crawl.service 150 | - chown crawl:adm -R /home/crawl/ 151 | - apt clean 152 | - reboot 153 | final_message: "The system is finally up, after $UPTIME seconds" 154 | 155 | -------------------------------------------------------------------------------- /postCLIP_staging/bloom.py: -------------------------------------------------------------------------------- 1 | # use this file inside every minute cron in order to recalculate bloom filters. location: staging server 2 | # folder structure 3 | # /home/archiveteam/CAH/ 4 | # |_bloom archiveteam@IP::bloom contains bloom filters 5 | # |_clipped contains clipped lists 6 | # |_ds contains files ready to be sent to the eye 7 | # |_hashes contains list of hashes of files inserted into the dataset 8 | # |_results archiveteam@IP::CAH incoming folder for the final results from workers 9 | 10 | # Stacked bloom filters. 
Naming convention: 11 | # frozen filters: filter.bin, filter1.bin, filter2.bin 12 | # active filters: filter_active.bin 13 | # 14 | # 15 | import sys 16 | import time 17 | import requests 18 | import pandas as pd 19 | from glob import glob 20 | from pathlib import Path 21 | from datetime import datetime 22 | from bloom_filter2 import BloomFilter 23 | 24 | with open("bloomlog.txt","a") as log: 25 | 26 | # update the bloom server filters too 27 | bloomip = "116.202.162.146" 28 | 29 | serverbloom = BloomFilter(max_elements=10000000, error_rate=0.01, filename=(f"/home/archiveteam/bloom-{bloomip}.bin",-1)) 30 | intlbloom = BloomFilter(max_elements=10000000, error_rate=0.01, filename=(f"/home/archiveteam/intl-{bloomip}.bin",-1)) 31 | serverclip = BloomFilter(max_elements=10000000, error_rate=0.01, filename=(f"/home/archiveteam/clip-{bloomip}.bin",-1)) 32 | 33 | start = time.time() 34 | now = datetime.now().strftime("%Y/%m/%d_%H:%M") 35 | 36 | time.sleep(5) 37 | counter = 0 38 | counterintl = 0 39 | uniques = 0 40 | uniquesintl = 0 41 | main = [(0,0)] 42 | intl = [(0,0)] 43 | for file in glob("/home/archiveteam/CAH/hashes/*.hsh"): 44 | stem = Path(file).stem.strip(".") 45 | if stem not in serverbloom: 46 | with open(file,"rt") as f: 47 | for line in f.readlines(): 48 | counter += 1 49 | post = { 50 | 'file': (stem, open(file, 'rb')), 51 | 'key': (None, 'main'), 52 | } 53 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 54 | if response.status_code == 200: 55 | serverbloom.add(stem) 56 | uniques += int(response.text) 57 | main.append(tuple(map(lambda i, j: i - j, (counter,uniques), main[-1]))) 58 | del(main[0]) 59 | #log.write(str(main) + "\n") 60 | for file in glob("/home/archiveteam/CAH/hashesintl/*.hsh"): 61 | stem = Path(file).stem.strip(".") 62 | if stem not in intlbloom: 63 | with open(file,"rt") as f: 64 | for line in f.readlines(): 65 | counterintl += 1 66 | post = { 67 | 'file': (stem, open(file, 'rb')), 68 | 'key': (None, 'multilanguage'), 69 | } 70 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 71 | if response.status_code == 200: 72 | intlbloom.add(stem) 73 | uniquesintl += int(response.text) 74 | intl.append(tuple(map(lambda i, j: i - j, (counterintl,uniquesintl), intl[-1]))) 75 | del(intl[0]) 76 | 77 | clippedlist=[0] 78 | clipped_counter = 0 79 | for file in glob("/home/archiveteam/CAH/clipped/*.clp"): 80 | stem = Path(file).stem.strip(".") 81 | if stem not in serverclip: 82 | post = { 83 | 'file': (stem, open(file, 'rb')), 84 | 'key': (None, 'clipped'), 85 | } 86 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 87 | if response.status_code == 200: 88 | serverclip.add(stem) 89 | clipped_counter += int(response.text) 90 | clippedlist.append(clipped_counter-clippedlist[-1]) 91 | del clippedlist[0] 92 | #log.write(str(clippedlist) + "\n") 93 | 94 | pd.set_option('precision', 2) 95 | df = pd.read_csv("bloom.log", sep=" ",header=None, names=["Date", "a", "unique pairs (5%)", "b", "total including duplicates","c","clipped filter (5%)","d","failed filter","e"]) 96 | df["Date"]=df.Date.apply(lambda x: datetime.strptime(x, "[%Y/%m/%d_%H:%M]")) 97 | df["unique pairs (5%)"]=df["unique pairs (5%)"]/1000000 98 | df["total including duplicates"]=df["total including duplicates"]/1000000 99 | df["clipped filter (5%)"]=df["clipped filter (5%)"]/1000000 100 | 101 | #log.write("Done df calc \n") 102 | if uniques + uniquesintl + clipped_counter > 0: 103 | print(f"[{now}] added {uniques + uniquesintl} \"from total of\" {counter + 
counterintl} \"( {str(main)} i.e. {round((counter + counterintl - uniques - uniquesintl)*100/(counter + counterintl + sys.float_info.epsilon), 2)}% duplication in {round(time.time()-start,2)} sec) Also added \" {clipped_counter} \" {str(clippedlist)} clipped\" and 0 failed") 104 | 105 | #log.write("Printed stats \n") 106 | 107 | with open('dashboard.txt', 'w') as file: 108 | file.write("
<h1>Crawling at Home project</h1>\n") 109 |         file.write("<h2>Bloom filters status</h2>\n") 110 |         file.write("<h3>All time stats</h3>\n") 111 |         file.write("initialized from first parquet files<br/>\n") 112 |         file.write(str(df.sum(axis=0, numeric_only=True)).replace("\n","<br/>")) 113 |         file.write("<br/>") 114 |         file.write("<h3>Last day stats</h3>\n") 115 |         file.write(str(df[df.Date > datetime.now() - pd.to_timedelta("1day")].sum(axis=0, numeric_only=True)).replace("\n","<br/>")) 116 |         file.write("<h3>Last week stats</h3>\n") 117 |         file.write("Last reset date: 01 December 2021<br/>\n") 118 |         file.write(str(df[df.Date > datetime.now() - pd.to_timedelta("7day")].sum(axis=0, numeric_only=True)).replace("\n","<br/>
")) 119 | #log.write("Printed dashboard \n") -------------------------------------------------------------------------------- /helpers/parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "source": [ 7 | "import pandas as pd\n", 8 | "from glob import glob" 9 | ], 10 | "outputs": [], 11 | "metadata": {} 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 5, 16 | "source": [ 17 | "total = 0\n", 18 | "count = 0\n", 19 | "for file in glob(\"../parquet/3080.rom1504.fr/cah/cah_dataframe_unique/*.parquet\"):\n", 20 | " df = pd.read_parquet(file)\n", 21 | " total += len(df.index)\n", 22 | " df.dropna(subset=['similarity'], inplace=True)\n", 23 | " count += len(df.index)\n", 24 | "print(count)" 25 | ], 26 | "outputs": [ 27 | { 28 | "output_type": "stream", 29 | "name": "stdout", 30 | "text": [ 31 | "63505626\n" 32 | ] 33 | } 34 | ], 35 | "metadata": {} 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 6, 40 | "source": [ 41 | "print(total)" 42 | ], 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "name": "stdout", 47 | "text": [ 48 | "70153985\n" 49 | ] 50 | } 51 | ], 52 | "metadata": {} 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "source": [ 58 | "\n", 59 | "df = pd.DataFrame\n", 60 | "i=0\n", 61 | "for file in glob(\"../parquet/3080.rom1504.fr/cah/cah_dataframe_unique/*.parquet\"):\n", 62 | " dfp = pd.read_parquet(file)\n", 63 | " dfp = dfp[dfp['similarity'].isna()]\n", 64 | " if i == 0:\n", 65 | " df = dfp\n", 66 | " else:\n", 67 | " df = df.append(dfp)\n", 68 | " i += 1\n", 69 | "df.shape" 70 | ], 71 | "outputs": [ 72 | { 73 | "output_type": "execute_result", 74 | "data": { 75 | "text/plain": [ 76 | "(6648359, 8)" 77 | ] 78 | }, 79 | "metadata": {}, 80 | "execution_count": 8 81 | } 82 | ], 83 | "metadata": {} 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 28, 88 | "source": [ 89 | "df.head()" 90 | ], 91 | "outputs": [ 92 | { 93 | "output_type": "execute_result", 94 | "data": { 95 | "text/plain": [ 96 | " SAMPLE_ID URL \\\n", 97 | "2 NaN http://cdn2.newsok.biz/cache/sq105-9f961ae77a2... \n", 98 | "5 NaN http://images.tastespotting.com/thumbnails/707... \n", 99 | "23 NaN http://cdn.archinect.net/images/195x140/fc/fcl... \n", 100 | "44 NaN http://patentimages.storage.googleapis.com/thu... \n", 101 | "62 NaN http://demandware.edgesuite.net/sits_pod18/dw/... \n", 102 | "\n", 103 | " TEXT HEIGHT WIDTH LICENSE \\\n", 104 | "2 Oklahoma City's Russell Westbrook (0) reacts a... NaN NaN ? \n", 105 | "5 {recipe} Cucumber Salad with Cilantro Lime Hon... NaN NaN ? \n", 106 | "23 Onerahi Bach Project - Design Details NaN NaN ? \n", 107 | "44 Patent Drawing NaN NaN ? \n", 108 | "62 Rouge In Love NaN NaN ? \n", 109 | "\n", 110 | " NSFW similarity hash \n", 111 | "2 NaN d0533b647020bb8b9ea78c309c4b6457 \n", 112 | "5 NaN a146a91137119c0bcaa350d94415359f \n", 113 | "23 NaN 67344a9e6e9abcf1f01830f21bfc6b89 \n", 114 | "44 NaN 6488ba8ef23f5756f8b4db231a69db6c \n", 115 | "62 NaN 2cbb4495c590cc34327978633a7f8d18 " 116 | ], 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | "
SAMPLE_IDURLTEXTHEIGHTWIDTHLICENSENSFWsimilarityhash
2NaNhttp://cdn2.newsok.biz/cache/sq105-9f961ae77a2...Oklahoma City's Russell Westbrook (0) reacts a...NaNNaN?NaNd0533b647020bb8b9ea78c309c4b6457
5NaNhttp://images.tastespotting.com/thumbnails/707...{recipe} Cucumber Salad with Cilantro Lime Hon...NaNNaN?NaNa146a91137119c0bcaa350d94415359f
23NaNhttp://cdn.archinect.net/images/195x140/fc/fcl...Onerahi Bach Project - Design DetailsNaNNaN?NaN67344a9e6e9abcf1f01830f21bfc6b89
44NaNhttp://patentimages.storage.googleapis.com/thu...Patent DrawingNaNNaN?NaN6488ba8ef23f5756f8b4db231a69db6c
62NaNhttp://demandware.edgesuite.net/sits_pod18/dw/...Rouge In LoveNaNNaN?NaN2cbb4495c590cc34327978633a7f8d18
\n", 210 | "
" 211 | ] 212 | }, 213 | "metadata": {}, 214 | "execution_count": 28 215 | } 216 | ], 217 | "metadata": {} 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 23, 222 | "source": [ 223 | "df.dropna(subset=['URL','TEXT'], inplace=True)\n", 224 | "df.shape" 225 | ], 226 | "outputs": [ 227 | { 228 | "output_type": "execute_result", 229 | "data": { 230 | "text/plain": [ 231 | "(6633380, 8)" 232 | ] 233 | }, 234 | "metadata": {}, 235 | "execution_count": 23 236 | } 237 | ], 238 | "metadata": {} 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 27, 243 | "source": [ 244 | "import hashlib\n", 245 | "df[\"hash\"] = df.apply(lambda x: hashlib.md5((str(x.URL) + str(x.TEXT)).encode(\"utf-8\")).hexdigest(), axis=1)" 246 | ], 247 | "outputs": [], 248 | "metadata": {} 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 29, 253 | "source": [ 254 | "df.to_csv(\"nansim.csv\",sep=\"|\",index=False)" 255 | ], 256 | "outputs": [], 257 | "metadata": {} 258 | } 259 | ], 260 | "metadata": { 261 | "orig_nbformat": 4, 262 | "language_info": { 263 | "name": "python", 264 | "version": "3.8.8", 265 | "mimetype": "text/x-python", 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "pygments_lexer": "ipython3", 271 | "nbconvert_exporter": "python", 272 | "file_extension": ".py" 273 | }, 274 | "kernelspec": { 275 | "name": "python3", 276 | "display_name": "Python 3.8.8 64-bit ('gpuhcloud': conda)" 277 | }, 278 | "interpreter": { 279 | "hash": "bc322c11e8113b1b1dfcd753c5702c5c5d95a81c495f9a7060b170a2a7888bca" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } -------------------------------------------------------------------------------- /postgres/dump_db.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from multiprocessing import Process, Queue\n", 10 | "from sqlalchemy import create_engine, text\n", 11 | "from sqlalchemy.pool import NullPool\n", 12 | "from configparser import ConfigParser\n", 13 | "from tqdm.auto import tqdm, trange\n", 14 | "import uuid\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 7, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "def config(filename='database.ini', mode=\"test\"):\n", 25 | " # create a parser\n", 26 | " parser = ConfigParser()\n", 27 | " # read config file\n", 28 | " parser.read(filename)\n", 29 | " section='postgresql'\n", 30 | " if mode == \"production\":\n", 31 | " section = \"cah_production\"\n", 32 | " # get section, default to postgresql\n", 33 | " db = {}\n", 34 | " if parser.has_section(section):\n", 35 | " params = parser.items(section)\n", 36 | " for param in params:\n", 37 | " db[param[0]] = param[1]\n", 38 | " else:\n", 39 | " raise Exception('Section {0} not found in the {1} file'.format(section, filename))\n", 40 | " return db" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 8, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def dump_3m(j, workers, engine, jobtype, cycles, queue, path, dataset):\n", 50 | " engine.dispose()\n", 51 | " with engine.connect() as eng:\n", 52 | " conn = engine.raw_connection()\n", 53 | " for i in range(cycles):\n", 54 | " file = uuid.uuid4()\n", 55 | " # clipped out\n", 56 | " if jobtype == \"clipped\":\n", 57 | " if dataset == \"en\":\n", 58 | " 
select_stmt1 = f\"\"\"BEGIN;\n", 59 | " SET work_mem = '1GB';\n", 60 | " -- query --\n", 61 | " COPY (\n", 62 | " DELETE FROM dataset_en WHERE sampleid in (\n", 63 | " select sampleid from dataset_en where status = 2 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 64 | " ) RETURNING *\n", 65 | " ) TO '{path}/clipped/ok-en-{file}.csv' DELIMITER '|' CSV HEADER;\n", 66 | " SET work_mem = default;\n", 67 | " COMMIT;\"\"\"\n", 68 | " else:\n", 69 | " select_stmt1 = f\"\"\"BEGIN;\n", 70 | " SET work_mem = '1GB';\n", 71 | " -- query --\n", 72 | " COPY (\n", 73 | " DELETE FROM dataset_{dataset} WHERE sampleid in (\n", 74 | " select sampleid from dataset_{dataset} where status = 2 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 75 | " ) RETURNING *\n", 76 | " ) TO '{path}/clipped/ok-{dataset}-{file}.csv' DELIMITER '|' CSV HEADER;\n", 77 | " SET work_mem = default;\n", 78 | " COMMIT;\"\"\"\n", 79 | " # rejected out\n", 80 | " elif jobtype == \"rejected\":\n", 81 | " if dataset == \"en\":\n", 82 | " select_stmt1 = f\"\"\"BEGIN;\n", 83 | " SET work_mem = '1GB';\n", 84 | " -- query --\n", 85 | " COPY (\n", 86 | " DELETE FROM dataset_en WHERE sampleid in (\n", 87 | " select sampleid from dataset_en where status > 8 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 88 | " ) RETURNING *\n", 89 | " ) TO '{path}/rejected/bad-en-{file}.csv' DELIMITER '|' CSV HEADER;\n", 90 | " SET work_mem = default;\n", 91 | " COMMIT;\"\"\"\n", 92 | " else:\n", 93 | " select_stmt1 = f\"\"\"BEGIN;\n", 94 | " SET work_mem = '1GB';\n", 95 | " -- query --\n", 96 | " COPY (\n", 97 | " DELETE FROM dataset_{dataset} WHERE sampleid in (\n", 98 | " select sampleid from dataset_{dataset} where status > 8 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 99 | " ) RETURNING *\n", 100 | " ) TO '{path}/rejected/bad-{dataset}-{file}.csv' DELIMITER '|' CSV HEADER;\n", 101 | " SET work_mem = default;\n", 102 | " COMMIT;\"\"\"\n", 103 | "\n", 104 | " else:\n", 105 | " continue\n", 106 | " try:\n", 107 | " cur = conn.cursor()\n", 108 | " cur.execute(select_stmt1)\n", 109 | " conn.commit()\n", 110 | " except Exception as e:\n", 111 | " print(f\"error: {e}\")\n", 112 | " queue.put(1)\n", 113 | " return" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 32, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "application/vnd.jupyter.widget-view+json": { 124 | "model_id": "a5699d0df7814b728b5f170c06711abf", 125 | "version_major": 2, 126 | "version_minor": 0 127 | }, 128 | "text/plain": [ 129 | " 0%| | 0/25 [00:00 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /docs/3stage_architecture_white.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 
34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /infrastructure.py: -------------------------------------------------------------------------------- 1 | # usage: 2 | # starting swarm 3 | # python3 infrastructure.py command cloud nodes datacenter 4 | # where 5 | # 1st arg can be up, down, reset 6 | # 2nd arg can be hetzner, vultr, alibaba, hostwinds 7 | # 3rd arg is optional, number of nodes, implicit 1 8 | # 4th arg is optionsl, datacenter for hetzner (fsn1, ) 9 | # 10 | # the .env file format with single space delimiter 11 | # lx2evY5dL2uScjjp...Hjsobzcxvbm5Ng9gb27gulMC...CsobCmqOKlCmwzn6Qi rvencu -1 rv 12 | # API token nickname nodes real_name 13 | # where nodes = -1 means we can spin up to the very server limit 14 | # nodes = 0 - do not use this key 15 | # nodes > 0 - spin up only to the minimum between this number and server limit 16 | 17 | import os 18 | import sys 19 | import trio 20 | import time 21 | import pipes 22 | #import subprocess 23 | from configparser import ConfigParser 24 | from itertools import cycle 25 | from hcloud import Client 26 | from hcloud.images.domain import Image 27 | from hcloud.hcloud import APIException 28 | from hcloud.server_types.client import ServerType 29 | #from hcloud.servers.client import BoundServer, CreateServerResponse 30 | from pssh.clients import ParallelSSHClient, SSHClient 31 | from gevent import joinall 32 | 33 | def config(filename='database.ini', mode="test"): 34 | # create a parser 35 | parser = ConfigParser() 36 | # read config file 37 | parser.read(filename) 38 | section='postgresql' 39 | if mode == "production": 40 | section = "cah_production" 41 | # get section, default to postgresql 42 | db = {} 43 | if parser.has_section(section): 44 | params = parser.items(section) 45 | for param in params: 46 | db[param[0]] = param[1] 47 | else: 48 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 49 | return db 50 | 51 | async def list_servers(tok=""): 52 | servers = [] 53 | tokens = [] 54 | if tok == "": 55 | with open(".env", "r") as auth: 56 | tokens = auth.readlines() 57 | else: 58 | tokens = [tok] 59 | for token in tokens: 60 | hclient = Client(token=token.rstrip()) # Please paste your API token here between the quotes 61 | servers = servers + hclient.servers.get_all() 62 | return servers 63 | 64 | async def up(nodes, pref_loc, server_type="cx11", nick=""): 65 | workers = [] 66 | tokens = [] 67 | script = "" 68 | nodes = int(nodes) 69 | with open(".env", "r") as auth: 70 | tokens = [x.split(" ") for x in auth.readlines()] 71 | with open("cloud-init", "r") as user_data: 72 | script = user_data.read() 73 | for token in tokens: 74 | if nick != "" and nick != token[1]: 75 | continue 76 | number = nodes 77 | if int(token[2])>0: 78 | number = min(nodes, int(token[2])) 79 | init = script.replace("<>", token[1]) 80 | print(f"[swarm] nodes to spin up: {nodes}") 81 | if (number > 0 and 
int(token[2])!=0): 82 | try: 83 | hclient = Client(token=token[0]) 84 | if pref_loc == None: 85 | print ("[swarm] no specific location provided") 86 | locations = hclient.locations.get_all() 87 | loc = cycle(locations) 88 | zip = [[i, next(loc)] for i in range(number)] 89 | else: 90 | print (f"[swarm] using {pref_loc} location") 91 | location = hclient.locations.get_by_name(pref_loc) 92 | zip = [[i, location] for i in range(number)] 93 | for i, loc in zip: 94 | try: 95 | response = hclient.servers.create( 96 | "cah-worker-"+str(i), 97 | ServerType(name=server_type), 98 | Image(name="ubuntu-20.04"), 99 | hclient.ssh_keys.get_all(), 100 | None, #volumes 101 | None, #firewalls 102 | None, #networks 103 | init, 104 | None, #labels 105 | loc, #location - todo: create servers in all locations 106 | None, #datacenter 107 | ) 108 | srv = response.server 109 | workers.append((srv.public_net.ipv4.ip, token[1])) # tuple IP and nickname 110 | nodes = nodes - 1 111 | except APIException as e: 112 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 113 | break 114 | except Exception as e: 115 | print(e) 116 | break 117 | except APIException as e: 118 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 119 | continue 120 | except Exception as e: 121 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 122 | continue 123 | 124 | print (f"[swarm] Cloud swarm intialized with {len(workers)} nodes. If this is less than expected please check your account limits") 125 | return workers 126 | 127 | async def down(cloud, nick=""): 128 | workers = [] 129 | nicknames = [] 130 | if os.path.exists(f"{cloud}.txt"): 131 | with open(f"{cloud}.txt", "r") as f: 132 | for line in f.readlines(): 133 | workers.append(line.split(" ")[0]) 134 | nicknames.append(line.split(" ")[1]) 135 | with open(".env", "r") as auth: 136 | tokens = [x.split(" ") for x in auth.readlines()] 137 | for token in tokens: 138 | if nick != "" and nick != token[1]: 139 | continue 140 | if int(token[2]) != 0: 141 | try: 142 | servers = await list_servers(token[0]) 143 | hclient = Client(token=token[0]) 144 | for server in servers: 145 | server = hclient.servers.get_by_name(server.name) 146 | ip = server.public_net.ipv4.ip 147 | if ip not in workers: 148 | continue 149 | server.delete() 150 | except APIException as e: 151 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 152 | continue 153 | 154 | async def respawn(workers, ip, server_type="cx11"): 155 | with open(".env", "r") as auth: 156 | tokens = auth.readlines().split(" ") 157 | for token in tokens: 158 | hclient = Client(token=token[0]) 159 | index = workers.index(ip) 160 | server = hclient.servers.get_by_name(f"cah-worker-{index}") 161 | if server is None: 162 | continue 163 | try: 164 | # first attempt to restart the crawl service 165 | aclient = SSHClient(ip, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False) 166 | aclient.execute('systemctl restart crawl', sudo=True ) 167 | aclient.disconnect() 168 | 169 | except: 170 | # if impossible to restart the service then delete the worker and try to re-create it 171 | server.delete() 172 | with open("cloud-init", "r") as user_data: 173 | script = user_data.read().replace("<>", token[1]) 174 | try: 175 | response = hclient.servers.create( 176 | "cah-worker-"+index, 177 | ServerType(name=server_type), 178 | Image(name="ubuntu-20.04"), 179 | hclient.ssh_keys.get_all(), 180 | None, #volumes 181 | None, #firewalls 
182 | None, #networks 183 | script, 184 | None, #labels 185 | None, #location - todo: create servers in all locations 186 | None, #datacenter 187 | ) 188 | srv = response.server 189 | workers[index] = srv.public_net.ipv4.ip 190 | except APIException as e: 191 | # problem. we remove the worker from the dispatcher 192 | print (f"[swarm] API Exception: " + str(e)) 193 | workers.remove(ip) 194 | return workers 195 | return workers 196 | 197 | def exists_remote(host, path, silent=False): 198 | """Test if a file exists at path on a host accessible with SSH.""" 199 | aclient = SSHClient(host, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False ) 200 | #_start = time.time() 201 | output = aclient.run_command("test -f {}".format(pipes.quote(path))) 202 | 203 | status = output.exit_code 204 | 205 | aclient.disconnect() 206 | 207 | if not silent: 208 | print(".", end = "", flush=True) 209 | if status == 0: 210 | return True 211 | if status == 1 or status == 255: 212 | return False 213 | 214 | async def wait_for_infrastructure (workers): # here workers is a list of IPs 215 | print(f"[swarm] Waiting for {len(workers)} nodes to become ready. Polling starts after 4 minutes...") 216 | time.sleep(240) 217 | ready = [] 218 | pclient = ParallelSSHClient(workers, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False ) 219 | while len(ready) < len(workers): 220 | print(".", end = "", flush=True) 221 | ready = [] 222 | #_start = time.time() 223 | output = pclient.run_command('test -f /home/crawl/crawl.log') 224 | pclient.join(output) 225 | for host_output in output: 226 | hostname = host_output.host 227 | exit_code = host_output.exit_code 228 | if exit_code == 0: 229 | ready.append(hostname) 230 | #print(len(ready)) 231 | time.sleep(10) 232 | 233 | def last_status(ip, path): 234 | aclient = SSHClient(ip, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False) 235 | read = aclient.run_command("tail -1 {}".format(pipes.quote(path))) 236 | aclient.disconnect() 237 | return read.stdout 238 | 239 | def reset_workers(cloud): 240 | workers = [] 241 | with open(f"{cloud}.txt", "r") as f: 242 | for line in f.readlines(): 243 | workers.append(line.split(" ")[0]) 244 | if cloud in ["oracle"]: 245 | pclient = ParallelSSHClient(workers, user='ubuntu', pkey="~/gpuhcloud/richard", identity_auth=False ) 246 | output = pclient.run_command('cd /home/crawl & source worker-reset.sh', sudo=True) 247 | pclient.join(output) 248 | else: 249 | pclient = ParallelSSHClient(workers, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False ) 250 | output = pclient.run_command('source worker-reset.sh', sudo=True) 251 | pclient.join(output) 252 | 253 | if __name__ == "__main__": 254 | command = sys.argv[1] 255 | cloud = sys.argv[2] 256 | location = "" 257 | if len(sys.argv) > 3: 258 | nodes = int(sys.argv[3]) 259 | else: 260 | nodes = 1 261 | if len(sys.argv) > 4: 262 | location = sys.argv[4] 263 | 264 | params = config(mode="production") 265 | 266 | if command == "up": 267 | try: 268 | start = time.time() 269 | sshkey="" 270 | escape = ["\\","$",".","*","[","^","/"] 271 | with open (f"{os.getenv('HOME')}/.ssh/richard.pub","rt") as f: 272 | sshkey = f.read().split(" ")[1] 273 | for char in escape: 274 | sshkey = sshkey.replace(char,"\\"+char) 275 | #print(sshkey) 276 | if cloud in ["hetzner"]: 277 | if os.path.exists("cloud-init"): 278 | os.system("rm cloud-init") 279 | os.system("cp 'cloud boot/cloud-init.yaml' cloud-init") 280 | os.system(f"sed -i -e \"s/<>/{sshkey}/\" cloud-init") 281 | os.system(f"sed -i -e \"s/<>/{cloud}/\" 
cloud-init") 282 | os.system(f"sed -i -e \"s/<>/{params['host']}/\" cloud-init") 283 | os.system(f"sed -i -e \"s/<>/{params['database']}/\" cloud-init") 284 | os.system(f"sed -i -e \"s/<>/{params['user']}/\" cloud-init") 285 | os.system(f"sed -i -e \"s/<>/{params['password']}/\" cloud-init") 286 | elif cloud in ["vultr"]: 287 | # do some boot.sh API calls 288 | os.system("rm boot") 289 | os.system("cp 'cloud boot/boot.sh' boot") 290 | os.system(f"sed -i -e \"s/<>/{os.getenv('CAH_NICKNAME')}/\" boot") 291 | os.system(f"sed -i -e \"s/<>/{sshkey}/\" boot") 292 | os.system(f"sed -i -e \"s/<>/{cloud}/\" boot") 293 | print ("Manual setup: please use `boot` file to manually initialize your cloud nodes.") 294 | sys.exit() 295 | else: 296 | print ("not recognized cloud, abandoning") 297 | sys.exit() 298 | # generate cloud workers 299 | workers = trio.run(up, nodes, location) 300 | with open(f"{cloud}.txt", "w") as f: 301 | for ip, nickname in workers: 302 | f.write(ip + " " + nickname + "\n") 303 | trio.run(wait_for_infrastructure, workers) 304 | print( 305 | f"[swarm] {len(workers)} nodes cloud swarm is up in {cloud} cloud and was initialized in {round(time.time() - start)}s") 306 | except KeyboardInterrupt: 307 | print(f"[swarm] Abort! Deleting cloud swarm...") 308 | trio.run(down) 309 | print(f"[swarm] Cloud swarm was shutdown") 310 | sys.exit() 311 | except Exception as e: 312 | print(f"[swarm] Error, could not bring up swarm... please consider shutting down all workers via `python3 infrastructure.py down`") 313 | print(e) 314 | sys.exit() 315 | elif command == "down": 316 | trio.run(down, cloud) 317 | print (f"[swarm] Cloud swarm was shutdown") 318 | elif command == "reset": 319 | reset_workers(cloud) 320 | print(f"[swarm] All workers were reset") 321 | -------------------------------------------------------------------------------- /alibaba_workers/alibaba-upgrade-SAS-image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 22, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import hmac\n", 11 | "import uuid\n", 12 | "import hmac\n", 13 | "import time\n", 14 | "import base64\n", 15 | "import hashlib\n", 16 | "import datetime\n", 17 | "import requests\n", 18 | "import pandas as pd\n", 19 | "from urllib.parse import quote\n", 20 | "from configparser import ConfigParser\n", 21 | "\n", 22 | "baseInstances = [\"9d4c9fd55d884badba2540b561432c1e\", \"952dbd0188ff4603b94c81f69398ed75\"]\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 23, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "class SignatureUrl():\n", 32 | " \"\"\"python 计算openapi的签名\"\"\"\n", 33 | "\n", 34 | " def __init__(self, public_param, private_param, secret):\n", 35 | " self.public_param = public_param\n", 36 | " self.private_param = private_param\n", 37 | " self.secret = secret\n", 38 | "\n", 39 | " def get_timestamp(self):\n", 40 | " time_format = \"%Y-%m-%dT%H:%M:%SZ\"\n", 41 | " return datetime.datetime.utcnow().strftime(time_format)\n", 42 | "\n", 43 | " def get_uuid(self):\n", 44 | " return str(uuid.uuid1())\n", 45 | "\n", 46 | " def url_encode_str(self, all_params):\n", 47 | " sort_all_params = list()\n", 48 | " for key, value in all_params.items():\n", 49 | " params = key + '=' + value\n", 50 | " sort_all_params.append(params)\n", 51 | " # 对参数进行升序排序\n", 52 | " sort_all_params.sort()\n", 53 | "\n", 54 | " for i in 
range(len(sort_all_params)):\n", 55 | " # 对参数以及参数值进行urlencode处理,注意:’=‘此时不能处理,否则后面会再次对%3D进行encode\n", 56 | " sort_all_params[i] = quote(sort_all_params[i], '=')\n", 57 | " # 对encode之后的字符串进行再处理\n", 58 | " tmp = sort_all_params[i]\n", 59 | " if tmp.find('+'):\n", 60 | " tmp.replace('+','%20')\n", 61 | " elif tmp.find('*'):\n", 62 | " tmp.replace('*','%2A')\n", 63 | " elif tmp.find('%7E'):\n", 64 | " tmp.replace('%7E','~')\n", 65 | " \n", 66 | " sort_all_params[i] = tmp\n", 67 | " return sort_all_params\n", 68 | "\n", 69 | " def get_signature(self, param, http_method, AccesskeySecret):\n", 70 | " str_to_sign = ''\n", 71 | " sort_all_params = self.url_encode_str(param)\n", 72 | " #print(sort_all_params)\n", 73 | " for i in range(len(sort_all_params)):\n", 74 | " str_to_sign = str_to_sign + sort_all_params[i] + '&'\n", 75 | "\n", 76 | " # 将最后一位&给截取掉\n", 77 | " str_to_sign = http_method + '&%2F&' + quote(str_to_sign[:-1])\n", 78 | " #print(str_to_sign)\n", 79 | " key = AccesskeySecret+'&'\n", 80 | " signature = hmac.new(key.encode(\n", 81 | " 'utf-8'), str_to_sign.encode('utf-8'), digestmod=hashlib.sha1)\n", 82 | " signature = base64.b64encode(signature.digest()).decode().rstrip(\"\\n\")\n", 83 | " # 解决签名中包含有'+'的特殊情况\n", 84 | " signature = list(signature)\n", 85 | " for i in range(len(signature)):\n", 86 | " #signature[i] = str(signature[i])\n", 87 | " if signature[i] == '+':\n", 88 | " signature[i] = '%2B'\n", 89 | " newSignature = ''.join(signature)\n", 90 | " #print (\"Signature: \" + newSignature)\n", 91 | " self.private_param['Signature'] = newSignature\n", 92 | "\n", 93 | " def url_factory(self, method):\n", 94 | " all_params = dict(self.public_param, **self.private_param)\n", 95 | " self.get_signature(all_params, method, self.secret)\n", 96 | " url = ''\n", 97 | " par=[]\n", 98 | " for key, value in all_params.items():\n", 99 | " params = key + '=' + value\n", 100 | " par.append(params)\n", 101 | " for i in range(len(par)):\n", 102 | " url = url + par[i] + '&'\n", 103 | " url = 'http://swas.eu-central-1.aliyuncs.com?' 
+ url[:-1] + '&Signature=' + self.private_param['Signature']\n", 104 | " #print('url is : ' + url)\n", 105 | " return url" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 24, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "def config(filename='alibaba_tokens.prod', section='tokens'):\n", 115 | " # create a parser\n", 116 | " parser = ConfigParser()\n", 117 | " # read config file\n", 118 | " parser.read(filename)\n", 119 | "\n", 120 | " # get section, default to postgresql\n", 121 | " cfg = {}\n", 122 | " if parser.has_section(section):\n", 123 | " params = parser.items(section)\n", 124 | " for param in params:\n", 125 | " cfg[param[0]] = param[1]\n", 126 | " else:\n", 127 | " raise Exception('Section {0} not found in the {1} file'.format(section, filename))\n", 128 | " return cfg\n", 129 | "\n", 130 | "def upgradeInstance(InstanceId, ImageId, public_param, secret):\n", 131 | " action_param = dict()\n", 132 | " action_param[\"Action\"] = \"ResetSystem\"\n", 133 | "\n", 134 | " private_param = dict()\n", 135 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 136 | " private_param[\"InstanceId\"] = InstanceId\n", 137 | " private_param[\"ImageId\"] = ImageId\n", 138 | "\n", 139 | " sig = SignatureUrl(public_param, private_param, secret)\n", 140 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 141 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 142 | " url = sig.url_factory('GET')\n", 143 | "\n", 144 | " result = requests.request(method=\"get\",url=url)\n", 145 | "\n", 146 | " print(\"Instance reset: \" + result.text)\n", 147 | "\n", 148 | "def renameInstance(InstanceId, Name, public_param, secret):\n", 149 | " action_param = dict()\n", 150 | " action_param[\"Action\"] = \"UpdateInstanceAttribute\"\n", 151 | "\n", 152 | " private_param = dict()\n", 153 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 154 | " private_param[\"InstanceId\"] = InstanceId\n", 155 | " private_param[\"InstanceName\"] = Name\n", 156 | "\n", 157 | " sig = SignatureUrl(public_param, private_param, secret)\n", 158 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 159 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 160 | " url = sig.url_factory('GET')\n", 161 | "\n", 162 | " result = requests.request(method=\"get\",url=url)\n", 163 | "\n", 164 | " print(\"Instance renamed: \" + result.text)\n", 165 | "\n", 166 | "def startInstance(InstanceId, public_param, secret):\n", 167 | " action_param = dict()\n", 168 | " action_param[\"Action\"] = \"StartInstance\"\n", 169 | "\n", 170 | " private_param = dict()\n", 171 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 172 | " private_param[\"InstanceId\"] = InstanceId\n", 173 | "\n", 174 | "\n", 175 | " sig = SignatureUrl(public_param, private_param, secret)\n", 176 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 177 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 178 | " url = sig.url_factory('GET')\n", 179 | "\n", 180 | " result = requests.request(method=\"get\",url=url)\n", 181 | "\n", 182 | " print(\"Instance started: \" + result.text)\n", 183 | "\n", 184 | "def stopInstance(InstanceId, public_param, secret):\n", 185 | " action_param = dict()\n", 186 | " action_param[\"Action\"] = \"StopInstance\"\n", 187 | "\n", 188 | " private_param = dict()\n", 189 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 190 | " private_param[\"InstanceId\"] = InstanceId\n", 191 | "\n", 192 | "\n", 193 | " sig = 
SignatureUrl(public_param, private_param, secret)\n", 194 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 195 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 196 | " url = sig.url_factory('GET')\n", 197 | "\n", 198 | " result = requests.request(method=\"get\",url=url)\n", 199 | "\n", 200 | " print(\"Instance released: \" + result.text)\n", 201 | "\n", 202 | "\n", 203 | "def rebootInstance(InstanceId, public_param, secret):\n", 204 | " action_param = dict()\n", 205 | " action_param[\"Action\"] = \"RebootInstance\"\n", 206 | "\n", 207 | " private_param = dict()\n", 208 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 209 | " private_param[\"InstanceId\"] = InstanceId\n", 210 | "\n", 211 | "\n", 212 | " sig = SignatureUrl(public_param, private_param, secret)\n", 213 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 214 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 215 | " url = sig.url_factory('GET')\n", 216 | "\n", 217 | " result = requests.request(method=\"get\",url=url)\n", 218 | "\n", 219 | " print(\"Instance rebooted: \" + result.text)\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "# Start/stop/reboot/reset/list all instances" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 25, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "page 0 attempt failed\n", 240 | "page 0 appended\n", 241 | "page 1 attempt failed\n", 242 | "page 1 appended\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "params = config()\n", 248 | "mode = \"list\" # start or stop or reboot or reset or list\n", 249 | "\n", 250 | "ratio = 1.0\n", 251 | "\n", 252 | "public_param = dict()\n", 253 | "public_param[\"AccessKeyId\"] = params[\"id\"]\n", 254 | "public_param[\"SignatureMethod\"] = 'HMAC-SHA1'\n", 255 | "public_param[\"SignatureVersion\"] = '1.0'\n", 256 | "public_param[\"Version\"] = \"2020-06-01\"\n", 257 | "public_param[\"Format\"] = 'json'\n", 258 | "\n", 259 | "action_param = dict()\n", 260 | "action_param[\"Action\"] = \"ListInstances\"\n", 261 | "\n", 262 | "instances = []\n", 263 | "ilist = []\n", 264 | "j=0\n", 265 | "k=0\n", 266 | "for i in range(2):\n", 267 | " private_param = dict()\n", 268 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 269 | " private_param[\"RegionId\"] = \"eu-central-1\"\n", 270 | " private_param[\"PageSize\"] = \"100\"\n", 271 | " private_param[\"PageNumber\"] = str(i+1)\n", 272 | "\n", 273 | " sig = SignatureUrl(public_param, private_param, params[\"secret\"])\n", 274 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 275 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 276 | " url = sig.url_factory('GET')\n", 277 | " for _ in range(5):\n", 278 | " result = requests.request(method=\"get\",url=url)\n", 279 | " if result.status_code == 200:\n", 280 | " print(f\"page {i} attempt failed\")\n", 281 | " break\n", 282 | " time.sleep(20)\n", 283 | " \n", 284 | " print(f\"page {i} appended\")\n", 285 | " \n", 286 | " instances = instances + json.loads(result.text)[\"Instances\"]\n", 287 | " ilist.append( pd.json_normalize( json.loads(result.text), record_path = ['Instances'] ))\n", 288 | "\n", 289 | "if mode == \"list\":\n", 290 | " ilist[0].to_csv(\"alibaba_instances_0.csv\", index=None)\n", 291 | " ilist[1].to_csv(\"alibaba_instances_1.csv\", index=None)\n", 292 | "else:\n", 293 | " \n", 294 | " for instance in 
instances:\n", 295 | " print(str(j+k) + \". \" + instance[\"InstanceId\"])\n", 296 | "\n", 297 | " if instance[\"InstanceId\"] not in baseInstances:\n", 298 | " if mode == \"start\":\n", 299 | " if instance[\"Status\"] == \"Stopped\":\n", 300 | " startInstance(instance[\"InstanceId\"], public_param, params[\"secret\"])\n", 301 | " elif mode == \"stop\":\n", 302 | " if instance[\"Status\"] == \"Running\":\n", 303 | " stopInstance(instance[\"InstanceId\"], public_param, params[\"secret\"])\n", 304 | " elif mode == \"reboot\":\n", 305 | " if 1 == 1:\n", 306 | " j += 1\n", 307 | " rebootInstance(instance[\"InstanceId\"], public_param, params[\"secret\"])\n", 308 | " renameInstance(instance[\"InstanceId\"], \"nolang_v1_\" + str(j), public_param, params[\"secret\"])\n", 309 | " print (\"instance was rebooted\")\n", 310 | " else:\n", 311 | " k+=1\n", 312 | " renameInstance(instance[\"InstanceId\"], \"overquota_\" + str(k), public_param, params[\"secret\"])\n", 313 | " print (\"instance inactive\")\n", 314 | " elif mode == \"reset\":\n", 315 | " j += 1\n", 316 | " print(str(j) + \". \" + instance[\"InstanceId\"] + \" has ImageId \" + instance[\"InstanceId\"])\n", 317 | " time.sleep(0.5)\n", 318 | " if instance[\"InstanceId\"] not in baseInstances: # and instance[\"Status\"] == \"Running\":\n", 319 | " if j/len(instances) < ratio:\n", 320 | " upgradeInstance(instance[\"InstanceId\"], \"m-gw8iyh9kb8hp3b8ed6gm\", public_param, params[\"secret\"])\n", 321 | " renameInstance(instance[\"InstanceId\"], \"i2d_\" + str(j), public_param, params[\"secret\"])\n", 322 | " else:\n", 323 | " upgradeInstance(instance[\"InstanceId\"], \"m-gw8iyh9kb8hp3b8ed6gm\", public_param, params[\"secret\"])\n", 324 | " renameInstance(instance[\"InstanceId\"], \"intl_v1_\" + str(j), public_param, params[\"secret\"])\n", 325 | " time.sleep(8)\n", 326 | " else:\n", 327 | " pass\n", 328 | " time.sleep(8)\n", 329 | " " 330 | ] 331 | } 332 | ], 333 | "metadata": { 334 | "interpreter": { 335 | "hash": "ee22a52db22349ad32e35f3b499efddea1c9229e771c5fd65652469b6b2f1979" 336 | }, 337 | "kernelspec": { 338 | "display_name": "Python 3.9.7 64-bit ('gpu': conda)", 339 | "name": "python3" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.9.7" 352 | }, 353 | "orig_nbformat": 4 354 | }, 355 | "nbformat": 4, 356 | "nbformat_minor": 2 357 | } 358 | -------------------------------------------------------------------------------- /docs/architecture.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 
| 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /dbdl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Encoding image analyzing errors: Add the numbers below to 8 to encode all types of errors (so status=9...23 is reserved to describe the errors) 3 | - general exception: 1 4 | - bad format: 2 5 | - image too big: 4 6 | - image too small: 8 7 | - any combination of above 8 | 9 | ''' 10 | 11 | 12 | import gc 13 | from logging import raiseExceptions 14 | import os 15 | import ssl 16 | import sys 17 | import time 18 | import trio 19 | import uuid 20 | import ujson 21 | import shutil 22 | import tarfile 23 | import argparse 24 | import pandas as pd 25 | from glob import glob 26 | from uuid import uuid1 27 | from io import BytesIO 28 | from tqdm.auto import tqdm 29 | from datetime import datetime 30 | from sqlalchemy import create_engine 31 | from configparser import ConfigParser 32 | from PIL import Image, ImageFile, UnidentifiedImageError 33 | from random_user_agent.user_agent import UserAgent 34 | from random_user_agent.params import SoftwareName, OperatingSystem 35 | from multiprocessing import Process, cpu_count 36 | 37 | sys.path.append('./crawlingathome-worker/') 38 | 39 | import asks 40 | asks.init("trio") 41 | 42 | ImageFile.LOAD_TRUNCATED_IMAGES = True # https://stackoverflow.com/a/47958486 43 | ssl_ctx = ssl.create_default_context() 44 | ssl_ctx.check_hostname = False 45 | ssl_ctx.verify_mode = ssl.CERT_NONE 46 | 47 | def config(filename='database.ini', section='cah_production'): 48 | # create a parser 49 | parser = ConfigParser() 50 | # read config file 51 | parser.read(filename) 52 | 53 | # get section, default to postgresql 54 | db = {} 55 | if parser.has_section(section): 56 | params = parser.items(section) 57 | for param in params: 58 | db[param[0]] = param[1] 59 | else: 60 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 61 | 62 | return db 63 | 64 | class Tracer(trio.abc.Instrument): 65 | 66 | def __init__(self, pbar: tqdm): 67 | self.exceptions = 0 68 | self.requests = 0 69 | self.downloads = 0 70 | self.imgproc_duration = 0 71 | self.download_duration = 0 72 | self.error_duration = 0 73 | self.pbar = pbar 74 | 75 | def task_exited(self, task): 76 | if task.custom_sleep_data is not None: 77 | self.pbar.update(1) 78 | if task.custom_sleep_data[0] in [1, 3]: # this is exception 79 | self.exceptions += 1 80 | self.error_duration += task.custom_sleep_data[2] 81 | if task.custom_sleep_data[0] == 0: # this is image downloaded 82 | self.download_duration += task.custom_sleep_data[1] 83 | self.imgproc_duration += task.custom_sleep_data[2] 84 | self.downloads += 1 85 | 86 | def after_run(self): 87 | rate = round(self.exceptions / (self.exceptions + self.downloads + sys.float_info.epsilon), 2) 88 | avg_download = round(self.download_duration / (self.downloads + sys.float_info.epsilon), 2) 89 | avg_process = round(self.imgproc_duration / (self.downloads + sys.float_info.epsilon), 2) 90 | avg_error = round(self.error_duration / (self.exceptions + sys.float_info.epsilon), 2) 91 | print(f"[instrumentation] While scraping there were {self.exceptions} errors within 
{self.downloads + self.exceptions} candidates (error rate = {round(rate * 100,2)} %). {self.downloads} images were downloaded.") 92 | print(f"[instrumentation] Cumulative image processing duration {round(self.imgproc_duration, 2)} s.") 93 | print(f"[instrumentation] Average downloading time {avg_download} s/img, image processing time {avg_process} s/img, exceptions processing time {avg_error} s/link") 94 | 95 | def log(e): 96 | with open("errors.txt","a") as f: 97 | f.write(str(e.__class__.__name__) + " " + str(e) + "\n") 98 | 99 | 100 | def process_img_content(response, alt_text, license, sample_id, language, i): 101 | """ 102 | Function to process downloaded image. Use use PIL from pillow-simd 103 | (faster than open cv that in return is faster than original pillow) 104 | 105 | input: web request response, ALT text, license and sample id 106 | 107 | output: list of image parameters or None if image is rejected 108 | """ 109 | img_output_folder = f"./{i}/save/images/" 110 | error_code = 8 111 | 112 | #temp 2 lines 113 | if language == "" or language is None: 114 | language = "en" 115 | 116 | def _resize(im: Image): 117 | width, height = im.size 118 | ratio = min(width, height) / 224 119 | new_width = int(round(width/ratio,0)) 120 | new_height = int(round(height/ratio,0)) 121 | im = im.resize((new_width, new_height), resample=Image.BICUBIC) 122 | if new_width > 224 or new_height > 224: 123 | left = (new_width - 224)/2 124 | top = (new_height - 224)/2 125 | right = (new_width + 224)/2 126 | bottom = (new_height + 224)/2 127 | # Crop the center of the image 128 | im = im.crop((left, top, right, bottom)) 129 | return im 130 | try: 131 | # reject too small images 132 | if len(response.content) < 5000: 133 | error_code += 8 134 | img_data = BytesIO(response.content) 135 | with Image.open(img_data) as im: 136 | width, height = im.size 137 | # reject if too large (might be a DOS decompression bomb) 138 | if width * height > 89478484: 139 | error_code += 4 140 | else: 141 | im_format = im.format 142 | out_fname = f"{img_output_folder}{str(sample_id)}.{im_format.lower()}" 143 | # reject if format is not in this list 144 | if im_format not in ["JPEG", "JPG", "PNG", "WEBP"]: 145 | error_code += 2 146 | if min(width, height) > 224: 147 | im = _resize(im) 148 | 149 | # convert all images to RGB (necessary for CLIP, also CLIP is doing it again so do we need it here?) 150 | if im.mode != "RGB": 151 | im = im.convert("RGB") 152 | if error_code == 8: 153 | im.save(out_fname) # do not retain images we do not need 154 | except (KeyError, UnidentifiedImageError): 155 | out_fname = "" 156 | width = 0 157 | height = 0 158 | error_code += 1 159 | 160 | if error_code == 8: 161 | error_code = 2 # mark succesful lines with status = 2 162 | 163 | return [str(sample_id), out_fname, response.url, alt_text, width, height, license, language, error_code] 164 | 165 | 166 | async def request_image(parsed_df, i): 167 | """ 168 | This function initiates many parallel async connections to try download the images from provided links 169 | 170 | input: dataset of validated links, the sample id to start with 171 | 172 | output: list of lists with succesfully downloaded images and their parameters. this list is dumped on disk as json file 173 | """ 174 | tmp_data = [] 175 | limit = trio.CapacityLimiter(1000) 176 | 177 | # change the number of parallel connections based on CPU speed, network capabilities, etc. 
178 | # the number of 192 is optimized for 1 vCPU droplet at Hetzner Cloud (code CX11) 179 | session = asks.Session(connections=64, ssl_context=ssl_ctx) 180 | 181 | software_names = [SoftwareName.CHROME.value] 182 | operating_systems = [OperatingSystem.LINUX.value] 183 | 184 | user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=2000) 185 | user_agent = user_agent_rotator.get_random_user_agent() 186 | 187 | # try to make the bot website friendly 188 | session.headers = { 189 | "User-Agent": user_agent, 190 | "Accept-Language": "en-US,en;q=0.5", 191 | "Accept-Encoding": "gzip, deflate", 192 | "Referer": "https://google.com", 193 | "DNT": "1", 194 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 195 | } 196 | 197 | async def _request(row, i): 198 | start=time.time() 199 | sample_id = row[0] 200 | url = row[1] 201 | alt_text = row[2] 202 | license = row[3] 203 | language = row[4] 204 | # the following 2 lines are related to Trio Instrument to capture events from multiple threads 205 | task = trio.lowlevel.current_task() 206 | try: 207 | response = await session.get(url, timeout=10, connection_timeout=20) 208 | dltime = round(time.time()-start, 2) 209 | start=time.time() 210 | proces = process_img_content( 211 | # tune timeout and connection_timeout to grab more or less files. shorter timeouts will exclude bad performing websites 212 | response, alt_text, license, sample_id, language, i 213 | ) 214 | proctime = round(time.time()-start, 2) 215 | task.custom_sleep_data = (0, dltime, proctime) # for success do not count errors 216 | if proces is not None: 217 | tmp_data.append(proces) 218 | except Exception as e: 219 | log(e) 220 | task.custom_sleep_data = (1, 0, round(time.time()-start,2)) # when exception is hit, count it 221 | 222 | async with trio.open_nursery() as n: 223 | for index, row in parsed_df.iterrows(): 224 | async with limit: 225 | n.start_soon(_request, row, i) 226 | 227 | # trio makes sure at this point all async tasks were executed 228 | with open(f"./{i}/.tmp/{uuid1()}.json", "w") as f: 229 | ujson.dump(tmp_data, f) 230 | gc.collect() 231 | 232 | return 233 | 234 | 235 | def dl_wat(parsed_df, i, pbar): # replace valid data and start sampleid with parsed_df 236 | """ 237 | This function initiates download attempt of validated parsed links 238 | It launches multithreaded tasks by using trio module 239 | 240 | input: dataset of validated links, the sample id to start with 241 | 242 | output: dataframe of downloaded images and their parameters 243 | """ 244 | 245 | # Download every image available 246 | processed_samples = [] 247 | #trio.run(request_image, valid_data, first_sample_id, instruments=[TrioProgress(len(valid_data), False)] ) 248 | trio.run( request_image, parsed_df, i, instruments=[Tracer(pbar)] ) 249 | 250 | for tmpf in glob(f"./{i}/.tmp/*.json"): 251 | processed_samples.extend(ujson.load(open(tmpf))) 252 | return pd.DataFrame( 253 | processed_samples, 254 | columns=["SAMPLE_ID", "PATH", "URL", "TEXT", "HEIGHT", "WIDTH", "LICENSE", "LANGUAGE", "STATUS"], 255 | ) 256 | 257 | def upload(source: str, clientType: str, target: str): 258 | with tarfile.open(f"{source}.tar.gz", "w:gz") as tar: 259 | tar.add(source, arcname=os.path.basename(source)) 260 | result = os.system(f"rsync -av {source}.tar.gz {target}") 261 | if os.path.exists(f"{source}.tar.gz"): 262 | os.remove(f"{source}.tar.gz") 263 | if os.path.exists(f"{source}"): 264 | shutil.rmtree(f"{source}", ignore_errors=True) 265 | return 
result 266 | 267 | def newJob(engine, dataset, depth, tablesample): 268 | # selection on domains based on distribution of URLs per domain 269 | select_stmt1 = f"UPDATE dataset_{dataset} SET status = 1 WHERE sampleid IN (SELECT sampleid FROM dataset_{dataset} TABLESAMPLE SYSTEM ({tablesample}) WHERE status = 0 LIMIT {depth} FOR UPDATE SKIP LOCKED) AND status = 0 RETURNING sampleid" 270 | conn = engine.raw_connection() 271 | cur = conn.cursor() 272 | cur.execute(select_stmt1) 273 | result = cur.fetchall() 274 | conn.commit() 275 | cur.close() 276 | 277 | values = ",".join([str(tuple[0]) for tuple in result]) 278 | select_stmt2 = f"SELECT sampleid, url, text, license, language FROM dataset_{dataset} WHERE sampleid in ({values})" 279 | df = pd.read_sql_query(select_stmt2, conn) 280 | conn.close() 281 | return df 282 | 283 | def completeJob2(engine, prefix, parsed_df, dlparse_df, dataset): 284 | # prepare data for EN 285 | values2 = ",".join(parsed_df["sampleid"].astype(str)) 286 | update_stmt1 = "" 287 | for i, row in dlparse_df.iterrows(): 288 | update_stmt1 += f'UPDATE dataset_nolang SET status={row["STATUS"]}, width={row["HEIGHT"]}, height={row["WIDTH"]} where sampleid = {row["SAMPLE_ID"]};' 289 | # this is intentional mix between width and heigth to account for the but in previous laion release 290 | # the csv will go scrambled but in database we want good values 291 | insert_stmt = f"INSERT INTO jobs_{dataset} (jobid) VALUES ('{prefix}')" 292 | 293 | if len(dlparse_df.index > 0): 294 | conn = engine.raw_connection() 295 | cur = conn.cursor() 296 | cur.execute(update_stmt1) 297 | cur.execute(insert_stmt) 298 | conn.commit() 299 | cur.close() 300 | conn.close() 301 | 302 | # in case there are samples unaccounted for, we try to mark them with general error status 303 | update_stmt2 = f"UPDATE dataset_{dataset} SET status = 9 where status = 1 AND sampleid in ({values2})" 304 | 305 | conn = engine.raw_connection() 306 | cur = conn.cursor() 307 | cur.execute(update_stmt2) 308 | conn.commit() 309 | cur.close() 310 | conn.close() 311 | return 312 | 313 | def worker(engine, i, dataset, depth, tablesample, target): 314 | 315 | # initialize working folders 316 | tmp_folder = f"./{i}/.tmp/" 317 | output_folder = f"./{i}/save/" 318 | img_output_folder = output_folder + "images/" 319 | 320 | while True: 321 | try: 322 | start = time.time() 323 | start0 = start 324 | 325 | parsed_df = newJob(engine, dataset, depth, tablesample) 326 | 327 | prefix = uuid.uuid4().hex 328 | result = 0 329 | 330 | # clear working folders for a new job 331 | if os.path.exists(output_folder): 332 | shutil.rmtree(output_folder, ignore_errors=True) 333 | if os.path.exists(tmp_folder): 334 | shutil.rmtree(tmp_folder, ignore_errors=True) 335 | 336 | os.makedirs(output_folder) 337 | os.makedirs(img_output_folder) 338 | os.makedirs(tmp_folder) 339 | 340 | # compute output file names base 341 | out_fname = f"3_staged_workflow_job_{prefix}_full_wat" 342 | print(f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] Job acquired in {round(time.time()-start,2)} sec") 343 | start = time.time() 344 | 345 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] This job has {len(parsed_df)} candidates") 346 | pbar = tqdm(total=len(parsed_df),position=i,desc=f"worker {i}") 347 | 348 | # attempt to download validated links and save to disk for stats and blocking lists 349 | dlparse_df = dl_wat(parsed_df, i, pbar) 350 | dlparse_df_save = dlparse_df[dlparse_df["STATUS"]==2] # remove rejected items from gpu jobs 351 | 
dlparse_df_save.to_csv(output_folder + out_fname + ".csv", index=False, sep="|") 352 | # at this point we finishes the CPU node job, need to make the data available for GPU worker 353 | os.mkdir(prefix) 354 | os.system(f"mv ./{i}/save/* {prefix}/") 355 | result += upload(prefix, "CPU", target) #todo find the IP and endpoint 356 | if result == 0: 357 | completeJob2(engine, prefix, parsed_df, dlparse_df, dataset) 358 | 359 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] pairs retained {len(dlparse_df_save)} in {round(time.time() - start, 2)}") 360 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] scraping efficiency {len(dlparse_df_save)/(time.time() - start)} img/sec") 361 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] crawling efficiency {len(parsed_df)/(time.time() - start)} links/sec") 362 | 363 | 364 | last = round(time.time() - start0) 365 | 366 | print(f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] Job completed in {last} seconds") 367 | 368 | except Exception as e: 369 | print (e) 370 | print (f"{datetime.now().strftime('%H:%M:%S')} Worker {i} crashed") 371 | time.sleep(60) 372 | 373 | if __name__ == "__main__": 374 | 375 | print (f"starting session") 376 | 377 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -s/--set -d/--depth') 378 | parser.add_argument("-s","--set",action='append',help="Which dataset to download (en, intl, nolang)", required=False) 379 | parser.add_argument("-d","--depth",action='append',help="How many samples to download (10000)", required=False) 380 | parser.add_argument("-t","--tablesample",action='append',help="Tablesample ratio (0.05)", required=False) 381 | parser.add_argument("-r","--rsync",action='append',help="Rsync target where to store results", required=False) 382 | parser.add_argument("-c","--cpus",action='append',help="How many cpus to use",required=False) 383 | args = parser.parse_args() 384 | 385 | dataset = "en" 386 | if args.set is not None: 387 | dataset = args.set[0] 388 | 389 | depth = 10000 390 | if args.depth is not None: 391 | depth = int(args.depth[0]) 392 | 393 | tablesample = 0.05 394 | if args.tablesample is not None: 395 | tablesample = float(args.tablesample[0]) 396 | 397 | print(tablesample) 398 | time.sleep(30) 399 | 400 | target = "archiveteam@176.9.4.150::gpujobsnolang" 401 | if args.rsync is not None: 402 | target = args.rsync[0] 403 | 404 | procs = cpu_count() 405 | if args.cpus is not None and int(args.cpus[0]) > 0: 406 | procs = int(args.cpus[0]) 407 | 408 | params = config() 409 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_size=procs, max_overflow=int(procs*1.5), pool_recycle=60, pool_pre_ping=True ) 410 | 411 | for i in range(procs): 412 | Process(target=worker, args=[engine, i, dataset, depth, tablesample, target], daemon=True).start() 413 | 414 | try: 415 | while True: 416 | time.sleep(30) 417 | except KeyboardInterrupt: 418 | sys.exit() 419 | -------------------------------------------------------------------------------- /ccpp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import ftfy 5 | import ujson 6 | import gcld3 7 | import uuid 8 | import shutil 9 | import argparse 10 | import hashlib 11 | import tarfile 12 | import psycopg2 13 | import requests 14 | import numpy as np 15 | import pandas as pd 16 | from tqdm.auto import tqdm 17 | from random import randint 18 | from datetime import 
datetime 19 | from sqlalchemy import create_engine 20 | from configparser import ConfigParser 21 | from urllib.parse import urlparse, urljoin 22 | from django.core.validators import URLValidator 23 | from django.core.exceptions import ValidationError 24 | from multiprocessing import Process, cpu_count 25 | from crawlingathome_client.temp import TempCPUWorker 26 | 27 | 28 | def config(filename='database.ini', mode="test"): 29 | # create a parser 30 | parser = ConfigParser() 31 | # read config file 32 | parser.read(filename) 33 | 34 | section='postgresql' 35 | if mode == "production": 36 | section='cah_production' 37 | 38 | # get section, default to postgresql 39 | db = {} 40 | if parser.has_section(section): 41 | params = parser.items(section) 42 | for param in params: 43 | db[param[0]] = param[1] 44 | else: 45 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 46 | 47 | return db 48 | 49 | def is_valid_url(url_string: str) -> bool: 50 | validate_url = URLValidator() 51 | try: 52 | validate_url(url_string) 53 | except ValidationError as e: 54 | return False 55 | return True 56 | 57 | def log(e): 58 | with open("errors.txt","a") as f: 59 | f.write(str(e.__class__.__name__) + " " + str(e) + "\n") 60 | 61 | def remove_bad_chars(text): 62 | # cleanup text so language can be detected 63 | return "".join(c for c in text if c.isprintable()) 64 | 65 | def timeit(debug, tick, msg): 66 | if not debug: 67 | return 68 | else: 69 | print (f"{msg} time chunk {round(time.time()-tick,2)} sec.") 70 | return time.time() 71 | 72 | 73 | def parse_wat(content, i, debug): 74 | tick = time.time() 75 | """ 76 | This function checks the wat file content and attempts to extract valid candidates of image urls and alt texts 77 | 78 | input: content = wat file content; start = start line number; line_count = how many lines to parse 79 | usually a wat file is split in 2 halfs or 2 shards. shard 0 starts at the first line and line_count is about 1/2 of wat file lines 80 | shard 1 starts at the middle of wat file and ends with the last line of wat 81 | 82 | output: a list of tuples (url, text, license, domain, hash) 83 | """ 84 | 85 | bloomip = "116.202.162.146" 86 | bloom2ip = "94.130.167.172" 87 | 88 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] start parsing") 89 | tick = timeit(debug, tick, "start parsing") 90 | 91 | detector = gcld3.NNetLanguageIdentifier(min_num_bytes=5, max_num_bytes=2000) 92 | 93 | clpd = 0 94 | valid_data = [] 95 | check_flag = set() # track urls and make them unique 96 | content.seek(0) 97 | 98 | for line in tqdm(content, position=i, desc=f"{i} parser"): 99 | if "IMG@" not in line: 100 | continue 101 | line_str = line.strip() 102 | data = ujson.loads(line_str) 103 | # find all links inside the line 104 | linklist = data["Envelope"]["Payload-Metadata"]["HTTP-Response-Metadata"]["HTML-Metadata"]["Links"] 105 | # get base url 106 | base_url = os.path.dirname( 107 | data["Envelope"]["WARC-Header-Metadata"]["WARC-Target-URI"] 108 | ) 109 | license = "?" 
110 | for e in linklist: 111 | if "url" in e and "creativecommons.org/licenses/" in e["url"]: 112 | license = e["url"][0:80].replace("\n","").replace('\\','\\\\') 113 | if not "url" in e: 114 | continue 115 | url = e["url"][0:2000].replace("\n","").replace('\\','\\\\') 116 | try: 117 | if not is_valid_url(url): 118 | continue 119 | except: 120 | continue 121 | # reject links of svg, gif or scripted images content 122 | if any( x in url for x in {".svg", ".gif", "data:image", "javascript:"} ): 123 | continue 124 | try: 125 | domain = urlparse(url).hostname 126 | except: 127 | continue 128 | if domain is None or domain == "": 129 | continue 130 | if len(str(domain)) > 60: 131 | continue 132 | detlang = "" 133 | alt_text = "" 134 | try: 135 | if "alt" in e: 136 | # detect ALT text language 137 | alt_text = ftfy.fix_text(e["alt"].replace("\n", " ")).strip() 138 | alt_text = remove_bad_chars(alt_text) 139 | res = detector.FindLanguage(alt_text) 140 | detlang = res.language 141 | rel = res.is_reliable 142 | if not rel: 143 | detlang = "" 144 | except: 145 | pass 146 | # keep pair or just url if we made it so far 147 | """ 148 | if detlang in ['bn', 'co', 'eo', 'fil', 'fy', 'gd', 'ha', 'haw', 'hmn', 'ig', 'km', 'ku', 'ky', 'lo', 'mi', 'mn', 'mt', 'ny', 'sd', 'si', 'sm', 'sn', 'so', 'st', 'su', 'sw', 'xh', 'yi', 'zu']: 149 | """ 150 | # get rid of Latn suffix when detected 151 | if detlang != "": 152 | detlang = detlang.split("-")[0] 153 | if alt_text == "" or alt_text is None: 154 | continue 155 | if len(alt_text) < 5: 156 | continue 157 | alt_text = alt_text[0:2000].replace("\t"," ").replace("\n"," ").replace('\\','\\\\').replace('|', ' ') # will use tab as field separator for copy source 158 | if not url.startswith("http"): 159 | url = urljoin(base_url, url) 160 | hash = hashlib.md5((url + alt_text).encode("utf-8")).hexdigest() 161 | if url not in check_flag: 162 | valid_data.append((url, alt_text, license, domain, detlang, hash)) 163 | check_flag.add(url) 164 | 165 | 166 | tick = timeit(debug, tick, "loop finished") 167 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] lenght of pairs to filter {len(valid_data)}") 168 | s = time.time() 169 | 170 | if len(valid_data) > 0: 171 | s = time.time() 172 | # remove from valid_data elements rejected by parsed bloom server 173 | with open(f'{i}/hash.txt', 'w') as f: 174 | for item in valid_data: 175 | f.write(item[0].strip()+"\n") 176 | post = { 177 | 'file': ('hash.txt', open(f'{i}/hash.txt', 'rb')), 178 | 'key': (None, 'parsed'), 179 | } 180 | 181 | tick = timeit(debug, tick, "parsed bloom prepared") 182 | failure = True 183 | for _ in range(10): 184 | try: 185 | response = requests.post(f'http://{bloom2ip}:8000/deduplicate/', files=post) 186 | if response.status_code != 200: 187 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] bloom server error, retrying... 
got {response.status_code}") 188 | time.sleep(randint(5,30)) 189 | else: 190 | failure = False 191 | break 192 | except: 193 | time.sleep(30) 194 | if failure: 195 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] crash, cannot contact the parsed bloom server, please fix") 196 | return (None, 0, 0) 197 | 198 | valid_urls = set(response.content.decode("utf-8").split("\n")) 199 | 200 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] parsed bloom server returned {len(valid_urls)} in {round(time.time()-s,3)} sec") 201 | tick = timeit(debug, tick, "parsed bloom done") 202 | 203 | valid_data = [t for t in {tuple(i) for i in valid_data}] 204 | 205 | final_kept_data = [] 206 | prsd = len(valid_data) 207 | 208 | for item in valid_data: 209 | if item[0].strip() in valid_urls: 210 | final_kept_data.append(item) 211 | prsd -= 1 212 | 213 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] lenght of deduplicated pairs to return {len(final_kept_data)}") 214 | 215 | if len(final_kept_data) > 0: 216 | # add parsed urls to parsed bloom server 217 | with open('hash.txt', 'w') as f: 218 | for url in final_kept_data: 219 | f.write(url[0].strip()+"\n") 220 | post = { 221 | 'file': ('hash.txt', open('hash.txt', 'rb')), 222 | 'key': (None, 'parsed'), 223 | } 224 | 225 | tick = timeit(debug, tick, "add to parsed bloom prepared") 226 | failure = True 227 | for _ in range(10): 228 | try: 229 | response = requests.post(f'http://{bloom2ip}:8000/add/', files=post) 230 | if response.status_code != 200: 231 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] bloom server error, retrying... got {response.status_code}") 232 | time.sleep(randint(5,30)) 233 | else: 234 | failure = False 235 | print(f"bloom add response: {response.text}") 236 | break 237 | except: 238 | time.sleep(15) 239 | if failure: 240 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] crash, cannot contact the parsed bloom server, please fix") 241 | return (None, 0, 0) 242 | 243 | tick = timeit(debug, tick, "add to parsed bloom done") 244 | 245 | return (final_kept_data, clpd, prsd) # use a dict in order to remove duplicate tuples from list 246 | 247 | def proc_worker(i: int, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL, engine, host, debug, current_set): 248 | # initialize working folders 249 | tmp_folder = f"./{i}/.tmp/" 250 | 251 | if os.path.exists(tmp_folder): 252 | shutil.rmtree(tmp_folder) 253 | 254 | # connect to C@H server and initialize client 255 | client = TempCPUWorker(url=CRAWLINGATHOME_SERVER_URL, nickname=YOUR_NICKNAME_FOR_THE_LEADERBOARD) 256 | 257 | # initialize stats variables for previous job 258 | last = 0 259 | 260 | # this makes a loop to download new jobs while the script is running 261 | # normally it reads while client.jobCount() > 0 262 | conn = engine.raw_connection() 263 | while True: 264 | try: 265 | # clean the folder 266 | if os.path.exists(f"{i}"): 267 | shutil.rmtree(f"{i}", ignore_errors=True) 268 | os.makedirs(tmp_folder) 269 | 270 | tick = time.time() 271 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] clock is {datetime.now().strftime('%H:%M:%S')}") 272 | 273 | start = time.time() 274 | start0 = start 275 | 276 | # get new job and download the wat file 277 | client.newJob() 278 | tick = timeit(debug, tick, "got new job") 279 | client.downloadWat(tmp_folder) 280 | tick = timeit(debug, tick, "downloaded wat") 281 | 282 | #fix tracker db error 283 | client.shards = client.shards[0:2] 284 | 285 | print 
(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] downloaded wat in {round(time.time()-start,2)}") 286 | start = time.time() 287 | 288 | first_sample_id = np.int64(client.shards[0][1]["start_id"]) 289 | 290 | # parse valid links from wat file 291 | with open(tmp_folder + "shard.wat", "r") as infile: 292 | parsed_data, clpd, prsd = parse_wat(infile, i, debug) 293 | 294 | if parsed_data is None: 295 | continue 296 | tick = timeit(debug, tick, "parsing finalized") 297 | 298 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] parsed wat in {round(time.time()-start,2)}") 299 | start = time.time() 300 | 301 | # convert to dataframe and save to disk (for statistics and generating blocking lists) 302 | if len(parsed_data) > 0: 303 | parsed_df = pd.DataFrame(parsed_data, columns=["url","text","license","domain","language","hash"]) 304 | parsed_df = parsed_df.drop_duplicates(subset=["url"]) 305 | parsed_df.insert(0, 'sampleid', range(first_sample_id, first_sample_id + len(parsed_df))) 306 | parsed_df["wat"] = int(client.shards[-1][0]) 307 | parsed_df = parsed_df[["sampleid","url","text","license","domain","wat","hash","language"]] 308 | 309 | # postgres should only ingest current working data not all 310 | en_df = parsed_df[parsed_df["language"]=="en"] 311 | nolang_df = parsed_df[parsed_df["language"]==""] 312 | multilang_df = parsed_df[(parsed_df["language"]!="en") & (parsed_df["language"]!="")] 313 | 314 | not_nolang = parsed_df[(parsed_df["language"]!="")] 315 | 316 | tick = timeit(debug, tick, "dataframe preparation done") 317 | current = en_df 318 | if current_set == "": 319 | current = nolang_df 320 | print(f"currently working on nolang dataset") 321 | if current_set == "multilang": 322 | current = multilang_df 323 | print(f"currently working on multilang dataset") 324 | 325 | tick = timeit(debug, tick, "before sql copy") 326 | not_nolang.to_csv(f"{i}/export_sql.txt", sep='\t', index=False, header=False) 327 | 328 | cur = conn.cursor() 329 | with open(f"{i}/export_sql.txt", "rt") as f: 330 | cur.copy_from(f, 'dataset_buffer', columns=("sampleid","url","text","license","domain","wat","hash","language")) 331 | conn.commit() 332 | cur.close() 333 | 334 | tick = timeit(debug, tick, "finished sql copy") 335 | 336 | uid = uuid.uuid4().hex 337 | 338 | nolang_df.to_csv(f"{i}/nolang-{uid}.txt", sep='\t', index=False, header=False) 339 | os.system(f"rsync -amv --include='*{uid}.txt' --include='*/' --exclude='*' ./{i}/ postgres@185.154.158.196::aidb") 340 | 341 | ''' 342 | if not current.equals(en_df): 343 | en_df.to_csv(f"{i}/en-{uid}.txt", sep='\t', index=False, header=False) 344 | if not current.equals(nolang_df): 345 | nolang_df.to_csv(f"{i}/nolang-{uid}.txt", sep='\t', index=False, header=False) 346 | if not current.equals(multilang_df): 347 | multilang_df.to_csv(f"{i}/intl-{uid}.txt", sep='\t', index=False, header=False) 348 | 349 | os.system(f"rsync -amv --include='*{uid}.txt' --include='*/' --exclude='*' ./{i}/ postgres@185.154.158.196::aidb") 350 | ''' 351 | 352 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] saved links in {round(time.time()-start,2)}") 353 | 354 | lastlinks = len(parsed_data) 355 | en_pairs = len(en_df.index) 356 | nolang_pairs = len(nolang_df.index) 357 | intl_pairs = len(multilang_df.index) 358 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] this job has {lastlinks} links left after removing {clpd} already clipped and {prsd} already parsed") 359 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] links are split into 
{en_pairs} english, {intl_pairs} multilanguage and {nolang_pairs} without language") 360 | with open("datapoints.txt", "a") as f: 361 | f.write(f"{time.time()}\t{en_pairs}\t{intl_pairs}\t{nolang_pairs}\n") 362 | else: 363 | print(f"This WAT file does not contain any useful candidate") 364 | 365 | prefixes = {} 366 | prefixes[str(client.shards[0][0])] = f"postgres {host}" 367 | prefixes[str(client.shards[1][0])] = f"postgres {host}" 368 | client.completeJob(prefixes) 369 | tick = timeit(debug, tick, "executed complete job") 370 | 371 | last = round(time.time() - start0) 372 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} stats] WAT job completed in {last} seconds") 373 | 374 | except Exception as e: 375 | print (f"[{datetime.now().strftime('%H:%M:%S')} exception {i} parser] {e}") 376 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] worker crashed") 377 | time.sleep(60) 378 | client = TempCPUWorker(url=CRAWLINGATHOME_SERVER_URL, nickname=YOUR_NICKNAME_FOR_THE_LEADERBOARD) 379 | conn.close() 380 | 381 | if __name__ == '__main__': 382 | 383 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -m/--mode -c/--cpus -n/--name -d/--debug') 384 | parser.add_argument("-n","--name",action='append',help="Your leaderboard nickname",required=False) 385 | parser.add_argument("-c","--cpus",action='append',help="How many cpus to use",required=False) 386 | parser.add_argument("-d","--debug",action='append',help="Print debug lines?",required=False) 387 | parser.add_argument("-m","--mode",action='append',help="Mode to run",required=True) 388 | parser.add_argument("-s","--set",action='append',help="Choose current set (en, nolang, multilang)",required=True) 389 | args = parser.parse_args() 390 | 391 | # initialize client variables 392 | YOUR_NICKNAME_FOR_THE_LEADERBOARD = None 393 | if args.name is not None: 394 | YOUR_NICKNAME_FOR_THE_LEADERBOARD = " ".join(args.name) 395 | 396 | if YOUR_NICKNAME_FOR_THE_LEADERBOARD in (None,""): 397 | YOUR_NICKNAME_FOR_THE_LEADERBOARD = "ccpp-dev" 398 | CRAWLINGATHOME_SERVER_URL = "http://cah.io.community/" 399 | 400 | print (f"starting session under `{YOUR_NICKNAME_FOR_THE_LEADERBOARD}` nickname") 401 | 402 | procs = cpu_count() 403 | if args.cpus is not None and int(args.cpus[0]) > 0: 404 | procs = int(args.cpus[0]) 405 | 406 | debug = False 407 | if args.debug is not None and args.debug[0] == "true": 408 | debug = True 409 | 410 | params = config(mode=args.mode[0]) 411 | 412 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_size=procs, max_overflow=int(procs*1.5), pool_pre_ping=True) 413 | workers = [] 414 | for i in range ( procs ): 415 | #use this queue to annount that bloom is currently processing and please do not update filters. if queue is not empty please wait, if queue is empty you may update filters 416 | workers.append(Process(target=proc_worker, args= [i, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL, engine, params["host"], debug, args.set[0]], daemon=True)) 417 | 418 | time.sleep(10) 419 | 420 | for worker in workers: 421 | worker.start() 422 | time.sleep(8) 423 | 424 | while True: 425 | #keep main process alive 426 | time.sleep(60) 427 | --------------------------------------------------------------------------------
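Note on configuration: both dbdl.py and ccpp.py load their PostgreSQL credentials through their config() helpers, which parse a local database.ini with ConfigParser. ccpp.py selects the section from the -m/--mode flag (the postgresql section unless mode is production, which switches to cah_production), while dbdl.py reads cah_production directly. The file itself is not part of the repository, so the sketch below is only an illustration of the layout those helpers expect, with placeholder values:

[postgresql]
host=<>
database=<>
user=<>
password=<>

[cah_production]
host=<>
database=<>
user=<>
password=<>

The keys mirror what the scripts actually read (params["host"], params["database"], params["user"], params["password"]) when building the SQLAlchemy connection string postgresql://user:password@host:5432/database, so the target PostgreSQL server is assumed to listen on port 5432.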