├── .gitattributes
├── helpers
    ├── crontab2
    ├── org_files_by_date.py
    └── parquet.ipynb
├── alibaba_workers
    ├── alibaba_tokens
    └── alibaba-upgrade-SAS-image.ipynb
├── audio
    ├── demo_audio.parquet
    └── audio_resample.ipynb
├── postCLIP_staging
    ├── bloom.sh
    ├── postCLIP.md
    ├── crontab
    ├── archive.sh
    ├── dashboard.py
    ├── movefiles.py
    ├── rsyncd.conf
    └── bloom.py
├── preCLIP_staging
    ├── cleanup.sh
    ├── capacity.py
    ├── rsyncd.conf
    ├── cleanup3db.py
    └── cleanup.py
├── gpu-requirements.txt
├── worker-requirements.txt
├── postgres
    ├── fstab
    ├── cold_storage.py
    ├── jobstables.sql
    ├── triggers.sql
    ├── fix_bad_csv.py
    ├── dedup_csv.py
    ├── tables.sql
    ├── stage_db.py
    └── dump_db.ipynb
├── worker-setup.sh
├── cleanup.py
├── LICENSE
├── gpu-setup.sh
├── bloom_server
    ├── bloom_dash.py
    ├── parquet2bloom.py
    └── bloomexport.py
├── notebooks
    └── query-bloom.ipynb
├── .gitignore
├── cloud boot
    ├── boot.sh
    └── cloud-init.yaml
├── README.md
├── docs
    ├── architecture_white.drawio
    ├── 3stage_architecture_white.drawio
    └── architecture.drawio
├── infrastructure.py
├── dbdl.py
└── ccpp.py

/.gitattributes:
--------------------------------------------------------------------------------
1 | duplicates*.txt filter=lfs diff=lfs merge=lfs -text
2 | 
--------------------------------------------------------------------------------
/helpers/crontab2:
--------------------------------------------------------------------------------
1 | 0 */1 * * * flock -n clean.lock /home/archiveteam/cleanup.sh
--------------------------------------------------------------------------------
/alibaba_workers/alibaba_tokens:
--------------------------------------------------------------------------------
1 | [tokens]
2 | id=<>
3 | secret=<>
--------------------------------------------------------------------------------
/audio/demo_audio.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rvencu/crawlingathome-gpu-hcloud/HEAD/audio/demo_audio.parquet
--------------------------------------------------------------------------------
/postCLIP_staging/bloom.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use in cron to update bloom filter
3 | 
4 | python3 bloom.py >> bloom.log
5 | 
--------------------------------------------------------------------------------
/preCLIP_staging/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use in cron to clean up completed gpu jobs
3 | 
4 | python3 cleanup.py >> cleanup.log
--------------------------------------------------------------------------------
/gpu-requirements.txt:
--------------------------------------------------------------------------------
1 | ftfy
2 | regex
3 | trio
4 | ujson
5 | colorama
6 | dashing
7 | psycopg2
8 | requests
9 | pandas
10 | sqlalchemy
11 | sentence_transformers
12 | sentry_sdk
--------------------------------------------------------------------------------
/worker-requirements.txt:
--------------------------------------------------------------------------------
1 | ftfy
2 | pandas
3 | gcld3
4 | trio
5 | ujson
6 | asks
7 | bloom-filter2
8 | pillow
9 | glances
10 | sqlalchemy
11 | psycopg2
12 | django
13 | tqdm
14 | 
--------------------------------------------------------------------------------
/postCLIP_staging/postCLIP.md:
--------------------------------------------------------------------------------
1 | This server will store the results of CLIP inference. As files arrive, it arranges them into date- and time-based subfolders and updates the dataset bloom filters to prevent further duplication.
2 | 
3 | The bloom log also exposes a dashboard with velocity information (24h, 7d and 1 month).
--------------------------------------------------------------------------------
/postCLIP_staging/crontab:
--------------------------------------------------------------------------------
1 | */1 * * * * flock -n archive.lock /home/archiveteam/archive.sh
2 | */1 * * * * flock -n bloom.lock /home/archiveteam/bloom.sh
3 | 0 0 */1 * * flock -n clpmove.lock python3 movefiles.py -e clp -d /home/archiveteam/CAH/clipped
4 | 10 0 */1 * * flock -n hshmove.lock python3 movefiles.py -e hsh -d /home/archiveteam/CAH/hashes
5 | @reboot python3 /home/archiveteam/dashboard.py &
--------------------------------------------------------------------------------
/postgres/fstab:
--------------------------------------------------------------------------------
1 | /dev/disk/by-uuid/9986-9974 /boot/efi vfat defaults 0 1
2 | /dev/md1 /mnt/md1 ext4 defaults 0 0
3 | /swap.img none swap sw 0 0
4 | tmpfs /mnt/ramdisk tmpfs rw,size=10G 0 0
5 | none /mnt/huge hugetlbfs pagesize=1G,size=210G 0 0
6 | //smb/share /mnt/smb cifs vers=3.11,uid=postgres,username=<>,password=<>,iocharset=utf8 0 0
7 | 
--------------------------------------------------------------------------------
/worker-setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use this if you manually install download worker on your box
3 | 
4 | echo "insert your nickname for the leaderboard or press Enter for anonymous..."
5 | read nickname
6 | export CAH_NICKNAME=$nickname
7 | 
8 | git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client
9 | pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir
10 | pip3 install -r worker-requirements.txt --no-cache-dir
11 | pip install random_user_agent
12 | 
13 | 
--------------------------------------------------------------------------------
/postCLIP_staging/archive.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # use in cron to move resulting files to the eye egress location (on the staging server)
3 | 
4 | CURRENTDATE=`date +"%Y%m%d"`
5 | CURRENTTIME=`date +"%H"`
6 | mkdir --parents /home/archiveteam/CAH/ds/${CURRENTDATE}/${CURRENTTIME}/
7 | 
8 | find /home/archiveteam/CAH/results/*.tfrecord -mmin +1 -type f -exec rm "{}" \;
9 | find /home/archiveteam/CAH/results/*.hsh -mmin +1 -type f -exec mv "{}" /home/archiveteam/CAH/hashes/ \;
10 | find /home/archiveteam/CAH/results/*.clp -mmin +1 -type f -exec mv "{}" /home/archiveteam/CAH/clipped/ \;
11 | find /home/archiveteam/CAH/results/ -mmin +5 -type f -exec mv "{}" /home/archiveteam/CAH/ds/${CURRENTDATE}/${CURRENTTIME}/ \;
12 | 
--------------------------------------------------------------------------------
/preCLIP_staging/capacity.py:
--------------------------------------------------------------------------------
1 | # use in staging server to return disk capacity level
2 | import shutil
3 | import json
4 | import os
5 | 
6 | from aioserver import Application
7 | app = Application()
8 | 
9 | # Path
10 | path = "/home/archiveteam/CAH/gpujobs"
11 | # Get the disk usage statistics
12 | # about the given path
13 | @app.get('/disk')
14 | async def index(request):
15 |     stat = str(shutil.disk_usage(path))
16 |     stat = stat.split("(")[1].split(")")[0]
17 |     stat = '{"' + stat.replace('=','":').replace(', 
',', "') + '}' 18 | json_object = json.loads(stat) 19 | json_object["utilization"] = round(json_object["used"]/json_object["total"], 2) 20 | json_object["jobscount"] = len([name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))]) 21 | 22 | return 200, {'Content-Type': 'application/json; charset=utf-8'}, json.dumps(json_object) 23 | 24 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /cleanup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import shutil 4 | 5 | # initial cleanup - delete all working files in case of crash recovery 6 | reg_compile = re.compile(r"^\d{1,3}-\d{1,3}-\d{1,3}-\d{1,3}$") 7 | for root, dirnames, filenames in os.walk("."): 8 | for filename in filenames: 9 | if filename.startswith("gpujob.zip_"): 10 | os.remove(filename) 11 | for dir in dirnames: 12 | if reg_compile.match(dir): 13 | shutil.rmtree(dir) 14 | re_uuid = re.compile(r'[0-9a-f]{32}', re.I) 15 | for root, dirnames, filenames in os.walk("."): 16 | for dir in dirnames: 17 | if re_uuid.match(dir): 18 | shutil.rmtree(dir) 19 | re_gz = re.compile(r'.*.tar.gz.*', re.I) 20 | for root, dirnames, filenames in os.walk("."): 21 | for file in filenames: 22 | if re_gz.match(file): 23 | os.remove(file) 24 | 25 | for i in range(24): 26 | os.system(f"rm -rf ./{i}") 27 | 28 | os.system(f"rm -rf ./save") 29 | os.system(f"rm -rf ./stats") 30 | os.system(f"rm ./shard.wat") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Richard Vencu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /gpu-setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "insert your nickname for the leaderboard or press Enter for anonymous..." 
4 | read nickname 5 | export CAH_NICKNAME=$nickname 6 | 7 | sudo apt-get update 8 | sudo apt-get install -y git build-essential python3-dev python3-pip libjpeg-dev zip libwebp-dev 9 | 10 | git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client 11 | #pip3 install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html 12 | pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html 13 | #pip3 install git+https://github.com/rvencu/asks 14 | pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir 15 | pip3 install -r gpu-requirements.txt --no-cache-dir 16 | #pip install tensorflow==2.5 --no-cache-dir 17 | pip install clip-anytorch 18 | 19 | git clone "https://github.com/hetznercloud/hcloud-python" hcloud 20 | pip3 install -e ./hcloud 21 | 22 | pip install parallel-ssh 23 | 24 | yes | ssh-keygen -t rsa -b 4096 -f $HOME/.ssh/id_cah -q -P "" 25 | 26 | yes | pip uninstall pillow 27 | CC="cc -mavx2" pip install -U --force-reinstall pillow-simd 28 | -------------------------------------------------------------------------------- /postgres/cold_storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from multiprocessing import Process, Queue 4 | 5 | def worker(queue): 6 | while not queue.empty(): 7 | cmd = queue.get() 8 | print (cmd) 9 | os.system(cmd) 10 | return 11 | 12 | q = Queue() 13 | while True: 14 | # Calculate which files are currently open (i.e. the ones currently being written to) 15 | # and avoid uploading it. This is to ensure that when we process files on the server, they 16 | # are complete. 17 | i = 0 18 | for root, dirs, files in os.walk("/mnt/md1/export/", topdown = False): 19 | for file in files: 20 | fullpath = os.path.join(root,file) 21 | if file.endswith(".gz") and os.path.getmtime(fullpath) < time.time() - 60*60: 22 | dest = str(fullpath).replace("md1","smb") 23 | q.put(f"mv {fullpath} {dest}") 24 | i += 1 25 | if i % 1000 == 0: 26 | break 27 | 28 | procs = [] 29 | for i in range(16): 30 | p = Process(target=worker, args=[q], daemon=False) 31 | procs.append(p) 32 | p.start() 33 | 34 | for proc in procs: 35 | proc.join() 36 | 37 | print("Finish") 38 | time.sleep(10) 39 | -------------------------------------------------------------------------------- /helpers/org_files_by_date.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import shutil 4 | 5 | 6 | # Change the directory and jump to the location 7 | # where you want to arrange the files 8 | os.chdir(r"/mnt/md1/export/rsync") 9 | 10 | files = os.listdir('.') 11 | # files in the current directory 12 | i = 0 13 | for file in files: 14 | if os.path.isfile(file) and os.path.getmtime(file) < time.time() - 60*60 and file.endswith("gz"): 15 | # Get all the details of the file creation 16 | # and modification 17 | time_format = time.gmtime(os.path.getmtime(file)) 18 | 19 | # Give the name of the folder 20 | dir_name = str(time_format.tm_year) + "-" + \ 21 | str(time_format.tm_mon) + '-' + \ 22 | str(time_format.tm_mday) 23 | 24 | # Check if the folder exists or not 25 | if not os.path.isdir(dir_name): 26 | 27 | # If not then make the new folder 28 | os.mkdir(dir_name) 29 | dest = dir_name 30 | 31 | # Move all the files to their respective folders 32 | try: 33 | shutil.move(file, dest) 34 | files.remove(file) 35 | i 
+= 1 36 | if i%1000 == 0: 37 | print ("+1000 files") 38 | except: 39 | continue 40 | 41 | print("successfully moved...") -------------------------------------------------------------------------------- /postCLIP_staging/dashboard.py: -------------------------------------------------------------------------------- 1 | # use in staging server to create a small web dashboard with bloom filters stats 2 | 3 | from aioserver import Application 4 | app = Application() 5 | 6 | @app.get('/') 7 | async def index(request): 8 | reply = "" 9 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 10 | reply = file.read() 11 | return 200, {'Content-Type': 'text/html; charset=utf-8'}, "" + reply + "" 12 | 13 | @app.get('/stats') 14 | async def index(request): 15 | uniques = [] 16 | total = [] 17 | clipped = [] 18 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 19 | lines = file.readlines() 20 | for line in lines: 21 | if line.startswith("M unique pairs"): 22 | chunks = line.split("
") 23 | uniques.append(chunks[0].split(" ")[-1]) 24 | total.append(chunks[1].split(" ")[-1]) 25 | clipped.append(chunks[2].split(" ")[-1]) 26 | 27 | reply = '{"total": {"uniques":' + uniques[0] + ',"pairs":' + total[0] + ',"clips":' + clipped[0] + '},' 28 | reply += '"day": {"uniques":' + uniques[1] + ',"pairs":' + total[1] + ',"clips":' + clipped[1] + '},' 29 | reply += '"week": {"uniques":' + uniques[2] + ',"pairs":' + total[2] + ',"clips":' + clipped[2] + "}}" 30 | 31 | return 200, {'Content-Type': 'application.json; charset=utf-8'}, reply 32 | 33 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /bloom_server/bloom_dash.py: -------------------------------------------------------------------------------- 1 | # use in staging server to create a small web dashboard with bloom filters stats 2 | 3 | from aioserver import Application 4 | app = Application() 5 | 6 | @app.get('/') 7 | async def index(request): 8 | reply = "" 9 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 10 | reply = file.read() 11 | return 200, {'Content-Type': 'text/html; charset=utf-8'}, "" + reply + "" 12 | 13 | @app.get('/stats') 14 | async def index(request): 15 | uniques = [] 16 | total = [] 17 | clipped = [] 18 | with open('/home/archiveteam/dashboard.txt', 'rt') as file: 19 | lines = file.readlines() 20 | for line in lines: 21 | if line.startswith("M unique pairs"): 22 | chunks = line.split("
") 23 | uniques.append(chunks[0].split(" ")[-1]) 24 | total.append(chunks[1].split(" ")[-1]) 25 | clipped.append(chunks[2].split(" ")[-1]) 26 | 27 | reply = '{"total": {"uniques":' + uniques[0] + ',"pairs":' + total[0] + ',"clips":' + clipped[0] + '},' 28 | reply += '"day": {"uniques":' + uniques[1] + ',"pairs":' + total[1] + ',"clips":' + clipped[1] + '},' 29 | reply += '"week": {"uniques":' + uniques[2] + ',"pairs":' + total[2] + ',"clips":' + clipped[2] + "}}" 30 | 31 | return 200, {'Content-Type': 'application.json; charset=utf-8'}, reply 32 | 33 | app.run(host='0.0.0.0', port=8080) 34 | -------------------------------------------------------------------------------- /postgres/jobstables.sql: -------------------------------------------------------------------------------- 1 | create table jobs_en 2 | ( 3 | jobid varchar(32) not null 4 | constraint jobs_pk 5 | primary key, 6 | status integer default 0 not null, 7 | modified timestamp 8 | ); 9 | 10 | alter table jobs_en 11 | owner to cah; 12 | 13 | create unique index jobs_jobid_uindex 14 | on jobs (jobid); 15 | 16 | create trigger update_job_modtime 17 | before update 18 | on jobs_en 19 | for each row 20 | execute procedure update_modified_column(); 21 | 22 | create table jobs_intl 23 | ( 24 | jobid varchar(32) not null 25 | constraint jobs_intl_pk 26 | primary key, 27 | status integer default 0 not null, 28 | modified timestamp 29 | ); 30 | 31 | alter table jobs_intl 32 | owner to cah; 33 | 34 | create unique index jobs_intl_jobid_uindex 35 | on jobs_intl (jobid); 36 | 37 | create trigger update_job_modtime 38 | before update 39 | on jobs_intl 40 | for each row 41 | execute procedure update_modified_column(); 42 | 43 | create table jobs_nolang 44 | ( 45 | jobid varchar(32) not null 46 | constraint jobs_nolang_pk 47 | primary key, 48 | status integer default 0 not null, 49 | modified timestamp 50 | ); 51 | 52 | alter table jobs_nolang 53 | owner to cah; 54 | 55 | create unique index jobs_nolang_jobid_uindex 56 | on jobs_nolang (jobid); 57 | 58 | create trigger update_job_modtime 59 | before update 60 | on jobs_nolang 61 | for each row 62 | execute procedure update_modified_column(); 63 | 64 | -------------------------------------------------------------------------------- /postgres/triggers.sql: -------------------------------------------------------------------------------- 1 | create user cah 2 | superuser 3 | createdb 4 | createrole; 5 | 6 | 7 | create function update_modified_column() returns trigger 8 | language plpgsql 9 | as 10 | $$ 11 | BEGIN 12 | NEW.modified = now(); 13 | RETURN NEW; 14 | END; 15 | $$; 16 | 17 | alter function update_modified_column() owner to cah; 18 | 19 | create function on_insert_in_original_table() returns trigger 20 | language plpgsql 21 | as 22 | $$ 23 | BEGIN 24 | BEGIN 25 | IF NEW.language = 'en' THEN 26 | INSERT INTO dataset_en (sampleid, url, text, license, domain, wat, hash, modified, language, width, height) 27 | VALUES (NEW.sampleid, NEW.url, NEW.text, NEW.license, NEW.domain, NEW.wat, NEW.hash, NEW.modified, NEW.language, 28 | NEW.width, NEW.height) 29 | ON CONFLICT DO NOTHING; 30 | ELSIF NEW.language = '' THEN 31 | INSERT INTO dataset_nolang (sampleid, url, text, license, domain, wat, hash, modified, language, width, height) 32 | VALUES (NEW.sampleid, NEW.url, NEW.text, NEW.license, NEW.domain, NEW.wat, NEW.hash, NEW.modified, NEW.language, 33 | NEW.width, NEW.height) 34 | ON CONFLICT DO NOTHING; 35 | ELSE 36 | INSERT INTO dataset_intl (sampleid, url, text, license, domain, wat, hash, modified, 
language, width, height) 37 | VALUES (NEW.sampleid, NEW.url, NEW.text, NEW.license, NEW.domain, NEW.wat, NEW.hash, NEW.modified, NEW.language, 38 | NEW.width, NEW.height) 39 | ON CONFLICT DO NOTHING; 40 | END IF; 41 | EXCEPTION 42 | WHEN OTHERS THEN 43 | NULL; 44 | END; 45 | RETURN NULL; 46 | END; 47 | $$; 48 | 49 | alter function on_insert_in_original_table() owner to cah; 50 | 51 | -------------------------------------------------------------------------------- /postgres/fix_bad_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from glob import glob 3 | from tqdm.auto import tqdm 4 | import numpy as np 5 | import os 6 | import re 7 | 8 | def wonky_parser(fn): 9 | txt = open(fn).read() 10 | # This is where I specified 8 tabs 11 | # V 12 | preparse = re.findall('(([^\t]*\t[^\t]*){7}(\n|\Z))', txt) 13 | parsed = [t[0].split('\t') for t in preparse] 14 | return pd.DataFrame(parsed) 15 | 16 | def is_num(x): 17 | try: 18 | x = int(float(x)) 19 | return True 20 | except: 21 | return False 22 | 23 | files = glob("*.bad") 24 | 25 | for file in tqdm(files): 26 | try: 27 | df = pd.read_csv(file, sep="\t", names=["s","url","a","b","c","d","e","f"], dtype={'s': 'int', 'd': 'int', 'url': 'str', 'a': 'str'}) 28 | df.drop_duplicates(subset="url", keep='first').reset_index(drop=True) 29 | df.s = df.s.astype(int) 30 | df.d = df.d.astype(int) 31 | df.to_csv(file+".fix", sep="\t", index=False, header=False) 32 | os.system(f"rm {file}") 33 | os.system(f"mv {file}.fix {file}") 34 | except: 35 | df = wonky_parser(file) 36 | df.columns=["s","url","a","b","c","d","e","f"] 37 | df = df[df.s.apply(lambda x: is_num(x))] 38 | df = df[df.d.apply(lambda x: is_num(x))] 39 | df.drop_duplicates(subset="url", keep='first').reset_index(drop=True) 40 | df.s = df.s.apply(lambda x: int(float(x))) 41 | df.d = df.d.apply(lambda x: int(float(x))) 42 | df["s"] = df["s"].astype(int) 43 | df["d"] = df["d"].astype(int) 44 | df.to_csv(file+".fix", sep="\t", index=False, header=False) 45 | os.system(f"rm {file}") 46 | os.system(f"mv {file}.fix {file}") 47 | 48 | 49 | -------------------------------------------------------------------------------- /postCLIP_staging/movefiles.py: -------------------------------------------------------------------------------- 1 | import os, datetime, errno, argparse, sys 2 | 3 | def create_file_list(CWD): 4 | """ takes string as path, returns tuple(files,date) """ 5 | 6 | files_with_mtime = [] 7 | for filename in [f for f in os.listdir(CWD) if os.path.splitext(f)[1] in ext and datetime.datetime.fromtimestamp(os.stat(os.path.join(CWD,f)).st_mtime) < datetime.datetime.now()-datetime.timedelta(days=1)]: 8 | files_with_mtime.append((filename,datetime.datetime.fromtimestamp(os.stat(os.path.join(CWD,filename)).st_mtime).strftime('%Y-%m-%d'))) 9 | return files_with_mtime 10 | 11 | def create_directories(files, CWD): 12 | """ takes tuple(file,date) from create_file_list() """ 13 | 14 | m = [] 15 | for i in files: 16 | m.append(i[1]) 17 | for i in set(m): 18 | try: 19 | os.makedirs(os.path.join(CWD,i)) 20 | except OSError as exception: 21 | if exception.errno != errno.EEXIST: 22 | raise 23 | 24 | def move_files_to_folders(files, CWD): 25 | """ gets tuple(file,date) from create_file_list() """ 26 | for i in files: 27 | try: 28 | os.rename(os.path.join(CWD,i[0]), os.path.join(CWD,(i[1] + '/' + i[0]))) 29 | except Exception as e: 30 | raise 31 | return len(files) 32 | 33 | 34 | if __name__ == '__main__': 35 | 36 | parser = 
argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s [options]') 37 | parser.add_argument("-e","--extension",action='append',help="File extensions to match",required=True) 38 | parser.add_argument("-d","--directory",action='append',help="Target directory",required=True) 39 | args = parser.parse_args() 40 | 41 | ext = ['.' + e for e in args.extension] 42 | print (f"Moving files with extensions:{ext}") 43 | print(args.directory) 44 | files = create_file_list(args.directory[0]) 45 | create_directories(files,args.directory[0]) 46 | print ("Moved %i files" % move_files_to_folders(files, args.directory[0])) -------------------------------------------------------------------------------- /notebooks/query-bloom.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "source": [ 7 | "import requests\n", 8 | "from pathlib import Path\n", 9 | "\n", 10 | "bloomip = \"116.202.162.146\"" 11 | ], 12 | "outputs": [], 13 | "metadata": {} 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 5, 18 | "source": [ 19 | "file = \"/home/rvencu/gpuhcloud/crawlingathome-gpu-hcloud/notebooks/test.hsh\"\n", 20 | "stem = Path(file).stem.strip(\".\")\n", 21 | "post = {\n", 22 | " 'file': (stem, open(file, 'rb')),\n", 23 | " 'key': (None, 'clipped'),\n", 24 | "}\n", 25 | "response = requests.post(f'http://{bloomip}:8000/deduplicate/', files=post)\n", 26 | "if response.status_code == 200:\n", 27 | " print(response.content.decode(\"utf-8\"))" 28 | ], 29 | "outputs": [ 30 | { 31 | "output_type": "stream", 32 | "name": "stdout", 33 | "text": [ 34 | "d10e361a18d69ef07c08857fb1198d22\n", 35 | "136061a9e0426bc0a6274d70ab834e60\n", 36 | "563a4dd46eb8dddf7b85d35f236cb12b\n", 37 | "d10e361a18d69ef07c08857fb1198d23\n", 38 | "136061a9e0426bc0a6274d70ab834e61\n", 39 | "563a4dd46eb8dddf7b85d35f236cb12c\n", 40 | "\n" 41 | ] 42 | } 43 | ], 44 | "metadata": {} 45 | } 46 | ], 47 | "metadata": { 48 | "orig_nbformat": 4, 49 | "language_info": { 50 | "name": "python", 51 | "version": "3.8.8", 52 | "mimetype": "text/x-python", 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "pygments_lexer": "ipython3", 58 | "nbconvert_exporter": "python", 59 | "file_extension": ".py" 60 | }, 61 | "kernelspec": { 62 | "name": "python3", 63 | "display_name": "Python 3.8.8 64-bit ('gpuhcloud': conda)" 64 | }, 65 | "interpreter": { 66 | "hash": "bc322c11e8113b1b1dfcd753c5702c5c5d95a81c495f9a7060b170a2a7888bca" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 2 71 | } -------------------------------------------------------------------------------- /audio/audio_resample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 37, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "----------\n", 13 | "Source: ./_sample_data/e2b474d6c0094c3da22788e7875f7787.mp3\n", 14 | "----------\n", 15 | " - File size: 603742 bytes\n", 16 | " - AudioMetaData(sample_rate=22050, num_frames=1664202, num_channels=2, bits_per_sample=0, encoding=MP3)\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "import torchaudio\n", 22 | "import torchaudio.transforms as T\n", 23 | "import requests\n", 24 | "import uuid\n", 25 | "\n", 26 | "url = 
'https://deploy.laion.ai/0fed69941baaabaeccedc2aaaaaaaaaa/WeSoundEffects/Glitchedtones/Urban%20Traffic/urban%20traffic%2C%20bus%20journey%2C%20interior%2C%20newcastle%2C%20uk%20%282%29.wav'\n", 27 | "file = uuid.uuid4().hex\n", 28 | "with requests.get(url) as response:\n", 29 | " with open(file, 'wb') as f:\n", 30 | " f.write(response.content)\n", 31 | " waveform, sample_rate = torchaudio.load(file)\n", 32 | " resampler = T.Resample(sample_rate, 20050, dtype=waveform.dtype)\n", 33 | " resampled_waveform = resampler(waveform)\n", 34 | " path = f\"{file}.mp3\"\n", 35 | " torchaudio.save(path, resampled_waveform, 20050, format=\"mp3\")" 36 | ] 37 | } 38 | ], 39 | "metadata": { 40 | "interpreter": { 41 | "hash": "ee22a52db22349ad32e35f3b499efddea1c9229e771c5fd65652469b6b2f1979" 42 | }, 43 | "kernelspec": { 44 | "display_name": "Python 3.9.7 ('gpu')", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.9.7" 59 | }, 60 | "orig_nbformat": 4 61 | }, 62 | "nbformat": 4, 63 | "nbformat_minor": 2 64 | } 65 | -------------------------------------------------------------------------------- /bloom_server/parquet2bloom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import requests 5 | from glob import glob 6 | from random import randint 7 | from tqdm.auto import tqdm 8 | import os.path as path 9 | import time 10 | bloomip = "116.202.162.146" 11 | 12 | files = glob("**/*.parquet", recursive=True) 13 | with tqdm(total=len(files), file=sys.stdout) as pbar: 14 | 15 | for file in files: 16 | 17 | df = pd.read_parquet(file) 18 | 19 | n = 100000 #chunk row size 20 | list_df = [df[i:i+n] for i in range(0,df.shape[0],n)] 21 | 22 | for ldf in list_df: 23 | with open('hash.txt', 'w') as f: 24 | f.write(ldf['URL'].str.cat(sep='\n')) 25 | post = { 26 | 'file': ('hash.txt', open('hash.txt', 'rb')), 27 | 'key': (None, "dedup"), 28 | } 29 | os.remove('hash.txt') 30 | 31 | failure = True 32 | for _ in range(10): 33 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 34 | if response.status_code != 200: 35 | time.sleep(randint(5,30)) 36 | else: 37 | failure = False 38 | break 39 | if failure: 40 | print("could not add chunk") 41 | continue 42 | 43 | with open('hash.txt', 'w') as f: 44 | f.write(ldf['URL'].str.cat(sep='\n')) 45 | post = { 46 | 'file': ('hash.txt', open('hash.txt', 'rb')), 47 | 'key': (None, "nolang"), 48 | } 49 | os.remove('hash.txt') 50 | 51 | failure = True 52 | for _ in range(10): 53 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 54 | if response.status_code != 200: 55 | time.sleep(randint(5,30)) 56 | else: 57 | failure = False 58 | break 59 | if failure: 60 | print("could not add chunk") 61 | continue 62 | 63 | pbar.update(1) 64 | 65 | -------------------------------------------------------------------------------- /postCLIP_staging/rsyncd.conf: -------------------------------------------------------------------------------- 1 | # GLOBAL OPTIONS 2 | 3 | #motd file=/etc/motd 4 | #log file=/var/log/rsyncd 5 | # for pid file, do not use /var/run/rsync.pid if 6 | # you are going to run rsync out of the init.d script. 
7 | # The init.d script does its own pid file handling,
8 | # so omit the "pid file" line completely in that case.
9 | # pid file=/var/run/rsyncd.pid
10 | #syslog facility=daemon
11 | #socket options=
12 | 
13 | use chroot = yes
14 | lock file = /var/lock/rsyncd
15 | uid = archiveteam
16 | gid = archiveteam
17 | log file = /mnt/rs.log
18 | # exclude =
19 | # exclude from =
20 | # include =
21 | # include from =
22 | # auth users =
23 | # secrets file = /etc/rsyncd.secrets
24 | strict modes = yes
25 | # hosts allow =
26 | # hosts deny =
27 | ignore errors = no
28 | ignore nonreadable = yes
29 | transfer logging = no
30 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes.
31 | timeout = 600
32 | refuse options = checksum dry-run
33 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz
34 | 
35 | # MODULE OPTIONS
36 | 
37 | [CAH]
38 | comment = CAH dataset archive
39 | path = /home/archiveteam/CAH/results/
40 | # max connections=10
41 | # the default for read only is yes...
42 | read only = no
43 | write only = yes
44 | list = yes
45 | # exclude =
46 | # exclude from =
47 | # include =
48 | # include from =
49 | # auth users =
50 | # secrets file = /etc/rsyncd.secrets
51 | # hosts allow =
52 | # hosts deny =
53 | 
54 | [bloom]
55 | comment = update filters
56 | path = /home/archiveteam/CAH/bloom/
57 | # max connections=10
58 | # lock file = /var/lock/rsyncd_gpu
59 | # the default for read only is yes...
60 | read only = yes
61 | write only = no
62 | list = yes
63 | 
64 | [CAHINTL]
65 | comment = CAH INTL dataset archive
66 | path = /home/archiveteam/CAH/resultsintl/
67 | # max connections=10
68 | # the default for read only is yes...
69 | read only = no
70 | write only = yes
71 | list = yes
72 | # exclude =
73 | # exclude from =
74 | # include =
75 | # include from =
76 | # auth users =
77 | # secrets file = /etc/rsyncd.secrets
78 | # hosts allow =
79 | # hosts deny =
80 | 
81 | [CAHNOLANG]
82 | comment = CAH dataset archive
83 | path = /home/archiveteam/CAH/resultsnolang/
84 | # max connections=10
85 | # the default for read only is yes...
86 | read only = no
87 | write only = yes
88 | list = yes
89 | # exclude =
90 | # exclude from =
91 | # include =
92 | # include from =
93 | # auth users =
94 | # secrets file = /etc/rsyncd.secrets
95 | # hosts allow =
96 | # hosts deny =
97 | 
--------------------------------------------------------------------------------
/bloom_server/bloomexport.py:
--------------------------------------------------------------------------------
1 | '''
2 | preamble:
3 | the script is adapted for large redisbloom filters and does not require double the memory size, since it iteratively dumps or loads chunks of 512MB to disk
4 | 
5 | arguments:
6 | -m/--mode (dump|restore)
7 | -k/--key key to be dumped or source key for backup to restore
8 | -d/--dest key to be restored to
9 | -p/--path where to store/retrieve the backup files
10 | 
11 | usage:
12 | 1. backup of key "main"
13 | 
14 | python3 bloomexport.py -m dump -k main
15 | 
16 | 2. 
restore from backup of key "main" into key "test" (destination key should not exist, it will be created) 17 | 18 | python3 bloomexport.py -m restore -k main -d test 19 | 20 | ''' 21 | 22 | import sys 23 | import glob 24 | import pickle 25 | import argparse 26 | from redisbloom.client import Client 27 | r = Client() 28 | 29 | def make_dump(r, key, path): 30 | iter = 0 31 | while True: 32 | iter, data = r.bfScandump(key, iter) 33 | if iter == 0: 34 | return 35 | else: 36 | print(iter) 37 | with open(f"{path}/{iter}.{key}.bloom","wb") as f: 38 | pickle.dump(data, f) 39 | 40 | def restore_dump(r, source, dest, path): 41 | iters = [] 42 | files = glob.glob(f"{path}/*.bloom") 43 | for file in files: 44 | try: 45 | iter, key, ext = file.split("/")[-1].split(".") 46 | if key == source: 47 | iters.append(iter) 48 | except: 49 | pass 50 | # reorder chunks ascending 51 | iters.sort(key=lambda x: int(x)) 52 | for iter in iters: 53 | with open(f"{path}/{iter}.{source}.bloom","rb") as f: 54 | data = pickle.load(f) 55 | r.bfLoadChunk(dest, iter, data) 56 | print(iter) 57 | return 58 | 59 | if __name__ == "__main__": 60 | # script initialization 61 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -m/--mode -k/--key -p/--path') 62 | parser.add_argument("-m","--mode",action='append',help="Choose mode dump or restore", required=True) 63 | parser.add_argument("-k","--key",action='append',help="Choose bloom key", required=True) 64 | parser.add_argument("-d","--destination",action='append',help="Choose destination bloom key at restore", required=False) 65 | parser.add_argument("-p","--path",action='append',help="Choose folder", required=False) 66 | args = parser.parse_args() 67 | path = "." 68 | if args.path is not None: 69 | path = args.path[0] 70 | key = args.key[0] 71 | dest = key 72 | if args.destination is not None: 73 | dest = args.destination[0] 74 | if args.mode[0] == "dump": 75 | make_dump(r, key, path) 76 | print(f"dump for {key} saved in {path}") 77 | elif args.mode[0] == "restore": 78 | restore_dump(r, key, dest, path) 79 | print(f"dump for {key} restored as {dest} from {path}") 80 | else: 81 | print("bad mode entered") -------------------------------------------------------------------------------- /postgres/dedup_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | import requests 5 | from glob import glob 6 | from random import randint 7 | from tqdm.auto import tqdm 8 | import os.path as path 9 | import time 10 | bloomip = "116.202.162.146" 11 | 12 | files = glob("*.txt") 13 | with tqdm(total=len(files), file=sys.stdout) as pbar: 14 | pbar.desc = "1" 15 | for file in files: 16 | age = time.time() - path.getmtime(file) 17 | if not os.path.isfile(f"{file}.deduped") and age > 24*60*60: 18 | try: 19 | df = pd.read_csv(file, sep="\t", names=["s","url","a","b","c","d","e","f"]) 20 | df.drop_duplicates(subset="url", keep='first').reset_index(drop=True) 21 | 22 | with open('hash.txt', 'w') as f: 23 | f.write(df['url'].str.cat(sep='\n')) 24 | post = { 25 | 'file': ('hash.txt', open('hash.txt', 'rb')), 26 | 'key': (None, "dedup"), 27 | } 28 | os.remove('hash.txt') 29 | 30 | failure = True 31 | for _ in range(10): 32 | response = requests.post(f'http://{bloomip}:8000/deduplicate/', files=post) 33 | if response.status_code != 200: 34 | time.sleep(randint(5,30)) 35 | else: 36 | failure = False 37 | break 38 | if failure: 39 | continue 40 | 41 | valid_urls = 
response.content.decode("utf-8").split("\n") 42 | 43 | ratio = round(len(valid_urls) / len(df.index), 2) 44 | 45 | df = df[df.url.isin(valid_urls)] 46 | df.reset_index(inplace=True, drop=True) 47 | 48 | df.to_csv(file+".deduped", sep="\t", index=False, header=False) 49 | 50 | # add parsed urls to parsed bloom server 51 | with open('hash.txt', 'w') as f: 52 | for url in valid_urls: 53 | f.write(url.strip()+"\n") 54 | post = { 55 | 'file': ('hash.txt', open('hash.txt', 'rb')), 56 | 'key': (None, 'dedup'), 57 | } 58 | os.remove('hash.txt') 59 | 60 | failure = True 61 | for _ in range(10): 62 | try: 63 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 64 | if response.status_code != 200: 65 | time.sleep(randint(5,30)) 66 | else: 67 | failure = False 68 | break 69 | except: 70 | time.sleep(15) 71 | if failure: 72 | continue 73 | os.system(f"rm {file}") 74 | os.system(f"mv {file}.deduped {file}") 75 | pbar.desc = str(ratio) 76 | pbar.update(1) 77 | except Exception as e: 78 | print (e) 79 | -------------------------------------------------------------------------------- /preCLIP_staging/rsyncd.conf: -------------------------------------------------------------------------------- 1 | # GLOBAL OPTIONS 2 | 3 | #motd file=/etc/motd 4 | #log file=/var/log/rsyncd 5 | # for pid file, do not use /var/run/rsync.pid if 6 | # you are going to run rsync out of the init.d script. 7 | # The init.d script does its own pid file handling, 8 | # so omit the "pid file" line completely in that case. 9 | # pid file=/var/run/rsyncd.pid 10 | #syslog facility=daemon 11 | #socket options= 12 | 13 | # MODULE OPTIONS 14 | 15 | [gpujobs] 16 | 17 | comment = database 3 staged gpu jobs storage 18 | path = /mnt/md0/gpujobs 19 | use chroot = yes 20 | # max connections=10 21 | lock file = /var/lock/rsyncd 22 | # the default for read only is yes... 23 | read only = no 24 | write only = no 25 | list = no 26 | uid = archiveteam 27 | gid = archiveteam 28 | # exclude = 29 | # exclude from = 30 | # include = 31 | # include from = 32 | # auth users = 33 | # secrets file = /etc/rsyncd.secrets 34 | strict modes = yes 35 | # hosts allow = 36 | # hosts deny = 37 | ignore errors = no 38 | ignore nonreadable = yes 39 | transfer logging = no 40 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes. 41 | timeout = 600 42 | refuse options = checksum dry-run 43 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz 44 | 45 | [gpujobsml] 46 | 47 | comment = database 3 staged gpu jobs storage 48 | path = /mnt/md0/gpujobsml 49 | use chroot = yes 50 | # max connections=10 51 | lock file = /var/lock/rsyncd 52 | # the default for read only is yes... 53 | read only = no 54 | write only = no 55 | list = no 56 | uid = archiveteam 57 | gid = archiveteam 58 | # exclude = 59 | # exclude from = 60 | # include = 61 | # include from = 62 | # auth users = 63 | # secrets file = /etc/rsyncd.secrets 64 | strict modes = yes 65 | # hosts allow = 66 | # hosts deny = 67 | ignore errors = no 68 | ignore nonreadable = yes 69 | transfer logging = no 70 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes. 71 | timeout = 600 72 | refuse options = checksum dry-run 73 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz 74 | 75 | [gpujobsnolang] 76 | 77 | comment = database 3 staged gpu jobs storage 78 | path = /mnt/md0/gpujobsnolang 79 | use chroot = yes 80 | # max connections=10 81 | lock file = /var/lock/rsyncd 82 | # the default for read only is yes... 
83 | read only = no 84 | write only = no 85 | list = no 86 | uid = archiveteam 87 | gid = archiveteam 88 | # exclude = 89 | # exclude from = 90 | # include = 91 | # include from = 92 | # auth users = 93 | # secrets file = /etc/rsyncd.secrets 94 | strict modes = yes 95 | # hosts allow = 96 | # hosts deny = 97 | ignore errors = no 98 | ignore nonreadable = yes 99 | transfer logging = no 100 | # log format = %t: host %h (%a) %o %f (%l bytes). Total %b bytes. 101 | timeout = 600 102 | refuse options = checksum dry-run 103 | dont compress = *.gz *.tgz *.zip *.z *.rpm *.deb *.iso *.bz2 *.tbz -------------------------------------------------------------------------------- /preCLIP_staging/cleanup3db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import argparse 5 | from glob import glob 6 | import os.path as path 7 | from datetime import datetime 8 | from multiprocessing import Process, Queue 9 | from sqlalchemy import create_engine 10 | from configparser import ConfigParser 11 | 12 | def config(filename='database.ini', section='cah_production'): 13 | # create a parser 14 | parser = ConfigParser() 15 | # read config file 16 | parser.read(filename) 17 | # get section, default to postgresql 18 | db = {} 19 | if parser.has_section(section): 20 | params = parser.items(section) 21 | for param in params: 22 | db[param[0]] = param[1] 23 | else: 24 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 25 | return db 26 | 27 | def confirm_delete(engine, uuid, jobset="en"): 28 | jobtable = "jobs" 29 | if jobset=="intl": 30 | jobtable = "jobs_intl" 31 | select_stmt1 = f"select count(*) from {jobtable} where status > 1 and jobid = '{uuid}'" 32 | conn = engine.raw_connection() 33 | cur = conn.cursor() 34 | cur.execute(select_stmt1) 35 | jobcount = int(cur.fetchone()[0]) 36 | conn.commit() 37 | cur.close() 38 | conn.close() 39 | return jobcount 40 | 41 | def worker(engine, q: Queue, jobset = "en"): 42 | jobspath = '/mnt/md0/gpujobs/' 43 | if jobset == "intl": 44 | jobspath = '/mnt/md0/gpujobsml/' 45 | while q.qsize()>0: 46 | try: 47 | uuid = q.get_nowait() 48 | if confirm_delete(engine, uuid, jobset)==1: 49 | file = f"{jobspath}{uuid}.tar.gz" 50 | if os.path.isfile(file) and os.path.getmtime(file) < time.time() - 60*60: # this makes the code more robust 51 | os.remove(file) 52 | print(f"deleted {file}") 53 | except Exception as e: 54 | print (f"worker raised error {e}") 55 | pass 56 | 57 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -s/--set') 58 | parser.add_argument("-s","--set",action='append',help="Choose current set (en, nolang, intl)",required=False) 59 | args = parser.parse_args() 60 | 61 | params = config() 62 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}',pool_size=25, max_overflow=50) 63 | 64 | jobset = "en" 65 | 66 | if args.set is not None: 67 | jobset = args.set[0] 68 | 69 | jobspath = '/mnt/md0/gpujobs/*.tar.gz' 70 | if jobset == "intl": 71 | jobspath = '/mnt/md0/gpujobsml/*.tar.gz' 72 | 73 | now = datetime.now().strftime("%Y/%m/%d_%H:%M") 74 | list_of_files = glob(jobspath) 75 | frm = len(list_of_files) 76 | 77 | start = time.time() 78 | q = Queue() 79 | procs = [] 80 | for i in range(10): 81 | procs.append(Process(target=worker, args=[engine, q, jobset])) 82 | 83 | for file in list_of_files: 84 | if time.time() - path.getmtime(file) < 300: 85 | continue 86 | uuid = 
file.split("/")[4].split(".")[0] 87 | q.put(uuid) 88 | 89 | time.sleep(20) 90 | 91 | for proc in procs: 92 | proc.start() 93 | for proc in procs: 94 | proc.join() 95 | 96 | list_of_files = glob(jobspath) 97 | end = len(list_of_files) 98 | 99 | with open("jobs.txt","wt") as f: 100 | for file in list_of_files: 101 | f.write(file + "\n") 102 | 103 | print(f"[{now}] from {frm} to {end} \"task executed in\" {round(time.time()-start,2)} sec") 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env* 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Crawling@Home 141 | CLOP 142 | clip 143 | crawlingathome_client 144 | save/* 145 | *.wat 146 | hcloud-python 147 | hcloud 148 | cloud-init 149 | gpusemaphore 150 | *.zip 151 | superseeded* 152 | kaggle.py 153 | workers.txt 154 | crawling_at_home* 155 | FIRST_SAMPLE* 156 | image_embedding* 157 | *.jpg 158 | *.png 159 | *.jpeg 160 | *.webp 161 | [1-9]*-[1-9]*-[1-9]*-[1-9]*/ 162 | test.* 163 | duplicates/ 164 | gpujob.zip_* 165 | results/ 166 | stats/ 167 | save/ 168 | parquet/ 169 | blocklists/* 170 | *.tar 171 | *.tar.gz 172 | hash.txt 173 | hetzner.txt 174 | alibaba.txt 175 | oracle.txt 176 | test/ 177 | gpuerr.txt 178 | database.ini 179 | temp.gz 180 | gpuout.txt 181 | [0-9]/ 182 | [1-9][0-9]/ 183 | errors.txt 184 | *_full_wat.csv 185 | *.prod 186 | alibaba_instances.csv 187 | _sample_data/ 188 | chromedriver 189 | -------------------------------------------------------------------------------- /preCLIP_staging/cleanup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import requests 5 | import argparse 6 | from glob import glob 7 | import os.path as path 8 | from datetime import datetime 9 | from multiprocessing import Process, Queue 10 | from sqlalchemy import create_engine 11 | from configparser import ConfigParser 12 | 13 | def config(filename='database.ini', section='cah_production'): 14 | # create a parser 15 | parser = ConfigParser() 16 | # read config file 17 | parser.read(filename) 18 | # get section, default to postgresql 19 | db = {} 20 | if parser.has_section(section): 21 | params = parser.items(section) 22 | for param in params: 23 | db[param[0]] = param[1] 24 | else: 25 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 26 | return db 27 | 28 | def confirm_delete(engine, uuid, jobset="en"): 29 | jobtable = "jobs" 30 | if jobset != "en": 31 | jobtable = f"jobs_{jobset}" 32 | select_stmt1 = f"select count(*) from {jobtable} where status > 1 and jobid = '{uuid}'" 33 | conn = engine.raw_connection() 34 | cur = conn.cursor() 35 | cur.execute(select_stmt1) 36 | jobcount = int(cur.fetchone()[0]) 37 | conn.commit() 38 | cur.close() 39 | conn.close() 40 | return jobcount 41 | 42 | def worker(engine, q: Queue, jobset = "en"): 43 | jobspath = '/mnt/md0/gpujobs/' 44 | if jobset == "intl": 45 | jobspath = '/mnt/md0/gpujobsml/' 46 | if jobset == "nolang": 47 | jobspath = '/mnt/md0/gpujobsnolang/' 48 | while q.qsize()>0: 49 | try: 50 | uuid = q.get_nowait() 51 | if confirm_delete(engine, uuid, jobset)==1: 52 | file = f"{jobspath}{uuid}.tar.gz" 53 | if os.path.isfile(file) and os.path.getmtime(file) < time.time() - 60*60: # this makes the code more robust 54 | os.remove(file) 55 | print(f"deleted {file}") 56 | except Exception as e: 57 | print (f"worker raised error {e}") 
58 | pass 59 | 60 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -s/--set') 61 | parser.add_argument("-s","--set",action='append',help="Choose current set (en, nolang, intl)",required=False) 62 | args = parser.parse_args() 63 | 64 | params = config() 65 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}',pool_size=25, max_overflow=50) 66 | 67 | jobset = "en" 68 | 69 | if args.set is not None: 70 | jobset = args.set[0] 71 | 72 | jobspath = '/mnt/md0/gpujobs/*.tar.gz' 73 | if jobset == "intl": 74 | jobspath = '/mnt/md0/gpujobsml/*.tar.gz' 75 | if jobset == "nolang": 76 | jobspath = '/mnt/md0/gpujobsnolang/*.tar.gz' 77 | 78 | now = datetime.now().strftime("%Y/%m/%d_%H:%M") 79 | list_of_files = glob(jobspath) 80 | frm = len(list_of_files) 81 | 82 | start = time.time() 83 | q = Queue() 84 | procs = [] 85 | for i in range(10): 86 | procs.append(Process(target=worker, args=[engine, q, jobset])) 87 | 88 | for file in list_of_files: 89 | if time.time() - path.getmtime(file) < 300: 90 | continue 91 | uuid = file.split("/")[4].split(".")[0] 92 | q.put(uuid) 93 | 94 | time.sleep(20) 95 | 96 | for proc in procs: 97 | proc.start() 98 | for proc in procs: 99 | proc.join() 100 | 101 | list_of_files = glob(jobspath) 102 | end = len(list_of_files) 103 | 104 | with open("jobs.txt","wt") as f: 105 | for file in list_of_files: 106 | f.write(file + "\n") 107 | 108 | print(f"[{now}] from {frm} to {end} \"task executed in\" {round(time.time()-start,2)} sec") 109 | -------------------------------------------------------------------------------- /postgres/tables.sql: -------------------------------------------------------------------------------- 1 | create table dataset_en 2 | ( 3 | sampleid bigint not null 4 | constraint dataset_en_pk 5 | primary key, 6 | url text not null, 7 | text text not null, 8 | license varchar, 9 | domain varchar, 10 | wat integer, 11 | status smallint default 0, 12 | illegal boolean default false, 13 | hash varchar not null, 14 | modified timestamp, 15 | language varchar not null, 16 | width integer, 17 | height integer 18 | ) 19 | with (autovacuum_analyze_threshold = 10000, autovacuum_vacuum_cost_limit = 50, autovacuum_vacuum_cost_delay = 0.1, autovacuum_vacuum_scale_factor = 0.1); 20 | 21 | alter table dataset_en 22 | owner to cah; 23 | 24 | create index dataset_en_status_index 25 | on dataset_en (status); 26 | 27 | create trigger update_customer_modtime 28 | before update 29 | on dataset_en 30 | for each row 31 | execute procedure update_modified_column(); 32 | 33 | create table dataset_intl 34 | ( 35 | sampleid bigint not null 36 | constraint dataset_pk 37 | primary key, 38 | url text not null, 39 | text text not null, 40 | license varchar, 41 | domain varchar, 42 | wat integer, 43 | status smallint default 0, 44 | illegal boolean default false, 45 | hash varchar not null, 46 | modified timestamp, 47 | language varchar not null, 48 | width integer, 49 | height integer 50 | ) 51 | with (autovacuum_analyze_threshold = 10000000, autovacuum_vacuum_cost_limit = 150, autovacuum_vacuum_cost_delay = 0.1, autovacuum_vacuum_scale_factor = 0); 52 | 53 | alter table dataset_intl 54 | owner to cah; 55 | 56 | create index dataset_status_index 57 | on dataset_intl (status); 58 | 59 | create trigger update_customer_modtime 60 | before update 61 | on dataset_intl 62 | for each row 63 | execute procedure update_modified_column(); 64 | 65 | create table dataset_nolang 66 | ( 67 | sampleid bigint not null 68 | 
constraint dataset_nolang_pk 69 | primary key, 70 | url text not null, 71 | text text not null, 72 | license varchar, 73 | domain varchar, 74 | wat integer, 75 | status smallint default 0, 76 | illegal boolean default false, 77 | hash varchar not null, 78 | modified timestamp, 79 | language varchar not null, 80 | width integer, 81 | height integer 82 | ) 83 | with (autovacuum_analyze_threshold = 10000000, autovacuum_vacuum_cost_limit = 150, autovacuum_vacuum_cost_delay = 0.1, autovacuum_vacuum_scale_factor = 0); 84 | 85 | alter table dataset_nolang 86 | owner to cah; 87 | 88 | create index dataset_nolang_status_index 89 | on dataset_nolang (status); 90 | 91 | create trigger update_customer_modtime 92 | before update 93 | on dataset_nolang 94 | for each row 95 | execute procedure update_modified_column(); 96 | 97 | create table dataset_buffer 98 | ( 99 | sampleid bigint, 100 | url text not null, 101 | text text not null, 102 | license varchar, 103 | domain varchar, 104 | wat integer, 105 | status smallint default 0, 106 | illegal boolean default false, 107 | hash varchar not null, 108 | modified timestamp, 109 | language varchar not null, 110 | width integer, 111 | height integer 112 | ); 113 | 114 | alter table dataset_buffer 115 | owner to cah; 116 | 117 | create trigger skip_errors 118 | before insert 119 | on dataset_buffer 120 | for each row 121 | execute procedure on_insert_in_original_table(); 122 | 123 | -------------------------------------------------------------------------------- /postgres/stage_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import psycopg2 5 | import argparse 6 | import fileinput 7 | from glob import glob 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | from configparser import ConfigParser 11 | from tqdm.auto import tqdm 12 | 13 | 14 | def config(filename='database.ini', mode="test"): 15 | # create a parser 16 | parser = ConfigParser() 17 | # read config file 18 | parser.read(filename) 19 | 20 | section='postgresql' 21 | if mode == "production": 22 | section='cah_production' 23 | 24 | # get section, default to postgresql 25 | db = {} 26 | if parser.has_section(section): 27 | params = parser.items(section) 28 | for param in params: 29 | db[param[0]] = param[1] 30 | else: 31 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 32 | 33 | return db 34 | 35 | def get_count(engine, ds="intl"): 36 | table="dataset_intl" 37 | if ds == "en": 38 | table = "dataset_en" 39 | elif ds == "nolang": 40 | table = "dataset_nolang" 41 | select_stmt1 = f"select count(*) from {table} where status = 0" 42 | conn = engine.raw_connection() 43 | cur = conn.cursor() 44 | cur.execute(select_stmt1) 45 | count = cur.fetchone() 46 | conn.commit() 47 | cur.close() 48 | conn.close() 49 | return str(count[0]) 50 | 51 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -m/--mode -s/--set -p/--path') 52 | parser.add_argument("-m","--mode",action='append',help="Mode to run", required=True) 53 | parser.add_argument("-s","--set",action='append',help="Dataset to run", required=False) 54 | parser.add_argument("-p","--path",action='append',help="Choose source path", required=False) 55 | parser.add_argument("-l","--limit",action='append',help="Specify DB table limit", required=False) 56 | args = parser.parse_args() 57 | 58 | dir = "/mnt/md1/export/staging" 59 | if args.path is not None: 60 | dir = args.path[0] 61 | 62 | mode = "txt" 63 | if 
args.mode is not None: 64 | mode = args.mode[0] 65 | 66 | ds = "intl" 67 | if args.set is not None: 68 | ds = args.set[0] 69 | 70 | limit = 500000000 71 | if args.limit is not None: 72 | limit = int(args.limit[0]) 73 | 74 | i = 0 75 | 76 | params = config(mode="production") 77 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_size=5, max_overflow=10, pool_pre_ping=True) 78 | 79 | files = glob(f'{dir}/*.{mode}') 80 | 81 | conn = engine.raw_connection() 82 | 83 | j = 3 84 | if mode == "txt": 85 | j = 1000 86 | 87 | with tqdm(total=len(files), file=sys.stdout) as pbar: 88 | pbar.desc = get_count(engine, ds) 89 | for file in files: 90 | try: 91 | cur = conn.cursor() 92 | with open(file, "rt") as f: 93 | if mode == "txt": 94 | cur.copy_from(f, 'dataset_buffer', columns=("sampleid","url","text","license","domain","wat","hash","language")) 95 | elif mode == "csv": 96 | cur.copy_expert("COPY dataset_buffer from STDIN DELIMITER '|' CSV HEADER", f) 97 | else: 98 | print("bad mode, choose txt or csv only") 99 | break 100 | conn.commit() 101 | cur.close() 102 | os.system(f"mv {file} {file}.done") 103 | i+=1 104 | if i % j == 0: 105 | count = get_count(engine, ds) 106 | if int(count) > limit: 107 | break 108 | else: 109 | pbar.desc = count 110 | pbar.update(1) 111 | 112 | except Exception as e: 113 | print(f"error {file} because {e}") 114 | for line in fileinput.input(file, inplace = True): 115 | if not re.search(r'\x00', line): 116 | print(line, end="") 117 | try: 118 | df = pd.read_csv(file, sep="\t", on_bad_lines='skip', header=None) 119 | df[2] = df[2].apply(lambda x: x.replace("\n","")) 120 | df[5] = df[5].apply(lambda x: int(x)) 121 | df.to_csv(file, sep="\t", index=False, header=False) 122 | except: 123 | #os.system(f"mv {file} {file}.error") 124 | pass 125 | conn.close() 126 | conn = engine.raw_connection() 127 | conn.close() 128 | 129 | print("if you had files with error of \x00 present in file, files were automatically corrected, please rerun the script") -------------------------------------------------------------------------------- /cloud boot/boot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | sudo su root 3 | 4 | apt update && yes | DEBIAN_FRONTEND=noninteractive apt upgrade 5 | yes | apt install python3-pip git build-essential libssl-dev libffi-dev python3-dev libpq-dev libwebp-dev libjpeg-dev libtiff-dev libopenjp2-7-dev zlib1g-dev knot-dnsutils resolvconf protobuf-compiler libprotobuf-dev 6 | wget https://secure.nic.cz/files/knot-resolver/knot-resolver-release.deb 7 | sudo dpkg -i knot-resolver-release.deb 8 | sudo apt install -y knot-resolver 9 | systemctl enable --now kresd@{1..2}.service 10 | systemctl disable systemd-resolved 11 | 12 | echo 'CAH_NICKNAME="Caricature, Inc"' >> /etc/environment 13 | 14 | adduser --system --group --shell /bin/bash crawl 15 | echo 'crawl ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers 16 | 17 | touch /home/crawl/worker-reset.sh 18 | chmod 0744 /home/crawl/worker-reset.sh 19 | echo '#!/bin/bash' >> /home/crawl/worker-reset.sh 20 | echo '# Updates and resets the worker via SSH command' >> /home/crawl/worker-reset.sh 21 | echo 'rm -rf /home/crawl/*.tar.gz' >> /home/crawl/worker-reset.sh 22 | echo 'cd /home/crawl/crawlingathome-gpu-hcloud' >> /home/crawl/worker-reset.sh 23 | echo 'git pull' >> /home/crawl/worker-reset.sh 24 | echo 'systemctl restart crawl' >> /home/crawl/worker-reset.sh 25 | 26 | echo "* soft nproc 65535 " >> 
/etc/security/limits.conf 27 | echo "* hard nproc 65535 " >> /etc/security/limits.conf 28 | echo "* soft nofile 65535" >> /etc/security/limits.conf 29 | echo "* hard nofile 65535" >> /etc/security/limits.conf 30 | echo "root soft nproc 65535 " >> /etc/security/limits.conf 31 | echo "root hard nproc 65535 " >> /etc/security/limits.conf 32 | echo "root soft nofile 65535" >> /etc/security/limits.conf 33 | echo "root hard nofile 65535" >> /etc/security/limits.conf 34 | echo "session required pam_limits.so" >> /etc/pam.d/common-session 35 | echo "fs.file-max = 2097152" >> /etc/sysctl.conf 36 | 37 | echo "[Unit]" >> /etc/systemd/system/crawl.service 38 | echo "After=network.service" >> /etc/systemd/system/crawl.service 39 | echo "Description=Crawling @ Home" >> /etc/systemd/system/crawl.service 40 | echo "[Service]" >> /etc/systemd/system/crawl.service 41 | echo "Type=simple" >> /etc/systemd/system/crawl.service 42 | echo "LimitNOFILE=2097152" >> /etc/systemd/system/crawl.service 43 | echo "WorkingDirectory=/home/crawl" >> /etc/systemd/system/crawl.service 44 | echo "ExecStart=/home/crawl/crawl.sh" >> /etc/systemd/system/crawl.service 45 | echo "EnvironmentFile=/etc/environment" >> /etc/systemd/system/crawl.service 46 | echo "User=crawl" >> /etc/systemd/system/crawl.service 47 | echo "Nice=10" >> /etc/systemd/system/crawl.service 48 | echo "[Install]" >> /etc/systemd/system/crawl.service 49 | echo "WantedBy=multi-user.target" >> /etc/systemd/system/crawl.service 50 | chmod 664 /etc/systemd/system/crawl.service 51 | 52 | systemctl daemon-reload 53 | systemctl enable crawl.service 54 | 55 | touch /home/crawl/crawl.sh 56 | echo '#!/bin/bash' >> /home/crawl/crawl.sh 57 | echo "while true" >> /home/crawl/crawl.sh 58 | echo "do" >> /home/crawl/crawl.sh 59 | echo "python3 -u /home/crawl/crawlingathome-gpu-hcloud/worker.py >> /home/crawl/crawl.log 2>&1" >> /home/crawl/crawl.sh 60 | echo "sleep 1" >> /home/crawl/crawl.sh 61 | echo "done" >> /home/crawl/crawl.sh 62 | chmod 744 /home/crawl/crawl.sh 63 | mkdir /home/crawl/.ssh 64 | echo 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC0Ff0RcDRafX/VyxYJTeMWJrJGHIKvAvIG+nUmUR73iQFcwF7JP8FucLO0baVIPb029DI469SOZJWh6FTwt5T+IT5jm0UDAs2gwYClS+tRbohr27kXoILhlugFiCor4TD0mMhBTKme4RPLlcbLYaZq4r7Rep0rbWn46f3Gma2fDXgpy3v1JZBa30yHxQVO+s2UjbqPk9RcsWNQ7oap36yGrVb6Bc8ucwAM6pGTdJMQBZoTj0tgI/b9cSgKO1JRyUTt6HhuW+DDfrOuZPJLqOq0f5sNV0gD+89K9zNEtZeO+bpQuZvf+cwhb10XQc4t0Yd8EsyhxSbWbdvn6Utb9yQwmk7ThJkxLLLmDp5LtClOvp6PTFUooDjj3DgFfD8ZBK+sckwu1TPAKa8Y8jU+q4GfF5abAej5rXObVjVcKHsziBSsSG6yViVtoFAvqh0dYfM/Ujz7dj6KtfRs67J5X+8CJvvKokRZcjMs6neJNHoRll5t6K/uhQgKHvBRpFqL9kGS4hTEdJog47w9o8qmLTMYQ340ckEZkRh/c1lWu51wNycLW1iab40D2F/ymMihGxMo9AqHKoqE/cnh9SaZr1EGr7s4BhBnAvyOwHh2+sW5ndOenDOZ1wGbYbwVJznSG8I1tdlJzEjf2GuW1HZtxE/95yW0zlEQkue8mBfNUL+Q6Q== Generated by richa@RICHARD' >> /home/crawl/.ssh/authorized_keys 65 | 66 | sed -i -e '/^\(#\|\)cache\.size/s/^.*$/cache\.size = 10000 \* MB/' /etc/knot-resolver/kresd.conf 67 | echo "trust_anchors.remove('.')" >> /etc/knot-resolver/kresd.conf 68 | echo "policy.add(policy.all(policy.FORWARD({'1.1.1.1'})))" >> /etc/knot-resolver/kresd.conf 69 | 70 | cd /home/crawl 71 | 72 | git clone https://github.com/rvencu/crawlingathome-gpu-hcloud 73 | cd crawlingathome-gpu-hcloud 74 | git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client 75 | pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir 76 | pip3 install -r worker-requirements.txt --no-cache-dir 77 | pip install random_user_agent 78 | 79 | chown crawl:crawl -R /home/crawl/ 80 | 
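# By this point the node has a local knot-resolver cache forwarding to 1.1.1.1
# (DNSSEC trust anchor removed), raised nproc/nofile limits, the dedicated
# "crawl" user, and the crawl.service unit that keeps worker.py running in a
# loop. The cleanup and reboot below apply the new limits and start the
# service automatically at boot.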
81 | sudo apt clean 82 | sudo reboot 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Crawling@Home GPU controlled Hetzner Cloud swarm of scrapers 2 | 3 | > Help us build a billion-scale image-caption dataset by filtering Common Crawl with OpenAI CLIP. At the time of this writing we are up to 5 billion high quality pairs ready for training various models but we still expect your help to advance to the potential 6 billion quality pairs estimated to exist in the commoncrawl data. This dataset is intended for public use and towards a truly open access to AI for everyone ! 4 | 5 | ## Concept 6 | This image-text scraping task comes with specific characteristics: link lists might be old and images might not be online anymore, even entire domains might be missing. Also there are seldom multiple links pointing to the same domain, so the DNS queries are many and often. Finally after the actual scraping there is a computational intensive task to calculate similarities between images themselves and their captions. 7 | 8 | On a normal CPU machine, scraping and filtering take almost the same time. On a GPU though filtering is much faster, in order of 60x faster than on single CPU. 9 | 10 | Hence this concept for crawling@home where we created a data pipeline on 3 levels: 11 | 1. commoncrawl preprocessing, where we use a swarm of about 500 cpus to download, parse and send results to a database node with candidates for our dataset, meaning image urls with alt text, plus the detected language using gcld3. By the language detection we split the candidates into English, Multilanguage (non English) and Nolang (language not detected with confidence) categories. 12 | 2. image downloading and inspection, prefiltering by image type and resolution, producing further candidates for CLIP or mCLIP inference 13 | 3. CLIP style inference where we calculate similarity of image embeddings with text embeddings and retain only pairs with higher similarity than a manually set threshold 14 | 15 | Common Crawl jobs are coordinated by a tracker with dashboard at http://cah.io.community/ 16 | 17 | ## Cloud workers 18 | We used AWS workers for first level of the above pipeline, Hetzner and Alibaba workers for the second level and home GPU plus AWS GPU nodes for the third level. 19 | 20 | Thus the code migrated to: 21 | 1. Hetzner swarm control: use `infrastructure.py` to control the swarm at Hetzner Cloud via commands like `python3 infrastructure.py up 20 fsn1` where up means bring up swarm, 20 is the desired number of nodes, and fsn1 is the desired datacenter location. 22 | 2. Alibaba swarm control: due to cost restrictions we used Simple Application Servers with Alibaba, and developed a limited scope control script 23 | 3. CPU clients: 24 | a) `ccpp.py` is used to preprocess common crawl wat files. Nodes require minimum one CPU core and 1GB RAM for each CPU. 25 | b) `dbdl.py` is used to download images. Nodes require minimum one CPU core and 1GB RAM for each CPU. 26 | 3. GPU clients only consume max 3.5GB of GPU VRAM so any nVidia GPU card with 4GB VRAM or more is deemed compatible: 27 | a) run `python3 gpu_inference.py` from any Linux based PC with an Nvidia GPU and correct drivers installed 28 | 29 | If you want to install on your own box, then 30 | ## Prerequisites 31 | 1. Ubuntu box with 4GB+ Nvidia GPU 32 | 2. Nvidia driver installed 33 | 3. 
Cuda toolkit 11+ (also corresponding cudnn is recommended for future) 34 | 4. check driver installation with `nvidia-smi` command 35 | 5. your user is able to run `sudo` commands 36 | 6. install `python3-pip` and `git` packages 37 | ## Distributed infrastructure setup and run 38 | 1. Make an account at Hetzner Cloud (https://www.hetzner.com/) and issue an API token 39 | 2. create the `.env` file and paste your HCLOUD API key in it. optionally, if you have more than one account, paste all API keys each on a separate line 40 | 3. bring up infrastructure at any time with `python3 infrastructure.py up N` in order to raise *N* nodes. It will scan all API keys and create maximum available servers on each until *N* limit is met 41 | 4. tear down infrastructure at any time with `python3 infrastructure.py down` in order to shutdown things (and save cash). this will shut down all cloud servers that belong to all API tokens saved in the `.env` file. Be aware, this command will delete all servers in the accounts even if they are NOT related to this project !!! 42 | 43 | If you wish to SSH into any droplet you can use this command: `ssh -oStrictHostKeyChecking=no -oIdentitiesOnly=yes -i~/.ssh/id_cah crawl@<>`. The crawling script is ran as a service, check logs with `tail -f crawl.log`. Access service status or commands with `sudo systemctl stop|restart|start crawl` 44 | 45 | If you are asked for any droplet root password at any time, it means you need to rerun `git pull` and `source conda-setup.sh` to refresh the files and regenerate the ssh keys pair. 46 | 47 | ## How to run GPU node from home computer 48 | 1. run `git clone https://github.com/rvencu/crawlingathome-gpu-hcloud`, to download crawlingathome GPU node script 49 | 2. run `cd crawlingathome-gpu-hcloud`, to enter the newly created directory 50 | 3. run `source conda-setup.sh` to setup the environment if you use anaconda. otherwise use `source pip-setup.sh`. the script will ask for a nickame to be used on leaderboard as well as for the sudo password 51 | 4. run `gpu_inference.py`. The script will run in a loop that can be interrupted at any time with Ctrl-C. 
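For reference, the node-allocation rule described under "Distributed infrastructure setup and run" above can be sketched as below. This is a simplified illustration only (it ignores per-account server limits and API errors, which `infrastructure.py` handles); the `.env` line format `<api_token> <nickname> <nodes> <real_name>` is the one documented at the top of `infrastructure.py`, where `nodes = -1` means "up to the account limit" and `nodes = 0` means "skip this key".

```python
def plan_allocation(env_lines, wanted):
    """Rough sketch: how many nodes each API token would be asked to create."""
    plan, remaining = [], wanted
    for line in env_lines:
        token, nickname, cap = line.split(" ")[:3]
        cap = int(cap)
        if remaining <= 0 or cap == 0:          # 0 = do not use this key
            continue
        count = remaining if cap < 0 else min(remaining, cap)  # -1 = no per-key cap
        plan.append((nickname, count))
        remaining -= count
    return plan

# e.g. plan_allocation(open(".env").read().splitlines(), 20)
```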
52 | 53 | This work is based on code written by: 54 | - https://github.com/TheoCoombes/crawlingathome 55 | - https://github.com/Wikidepia/crawlingathome-worker 56 | 57 | This is a subproject ran by the community around https://github.com/lucidrains/DALLE-pytorch 58 | -------------------------------------------------------------------------------- /cloud boot/cloud-init.yaml: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | users: 3 | - default 4 | - name: crawl 5 | groups: users, adm 6 | sudo: ALL=(ALL) NOPASSWD:ALL 7 | shell: /bin/bash 8 | ssh_authorized_keys: 9 | - ssh-rsa <> 10 | package_update: true 11 | package_upgrade: true 12 | packages: 13 | - python3-pip 14 | - git 15 | - build-essential 16 | - libssl-dev 17 | - libffi-dev 18 | - python3-dev 19 | - libwebp-dev 20 | - libjpeg-dev 21 | - libwebp-dev 22 | - libtiff-dev 23 | - libopenjp2-7-dev 24 | - zlib1g-dev 25 | - libpq-dev 26 | - knot-dnsutils 27 | - resolvconf 28 | - protobuf-compiler 29 | - libprotobuf-dev 30 | bootcmd: 31 | # setup knot resolver 32 | - wget https://secure.nic.cz/files/knot-resolver/knot-resolver-release.deb 33 | - sudo dpkg -i knot-resolver-release.deb 34 | - sudo apt install -y knot-resolver 35 | #- systemctl enable --now kresd@{1..2}.service 36 | #- systemctl disable systemd-resolved 37 | write_files: 38 | - path: /etc/environment 39 | content: | 40 | CAH_NICKNAME="<>" 41 | CLOUD="<>" 42 | append: true 43 | - path: /home/crawl/worker-reset.sh 44 | permissions: '0744' 45 | content: | 46 | #!/bin/bash 47 | # Updates and resets the worker via SSH command 48 | rm -rf /home/crawl/*.tar.gz 49 | cd /home/crawl/crawlingathome-gpu-hcloud 50 | git pull 51 | chown crawl:adm -R /home/crawl/ 52 | systemctl restart crawl 53 | - path: /etc/security/limits.conf 54 | content: | 55 | * soft nproc 65535 56 | * hard nproc 65535 57 | * soft nofile 65535 58 | * hard nofile 65535 59 | root soft nproc 65535 60 | root hard nproc 65535 61 | root soft nofile 65535 62 | root hard nofile 65535 63 | append: true 64 | - path: /home/crawl/crawl.sh 65 | permissions: '0744' 66 | content: | 67 | #!/bin/bash 68 | while true 69 | do 70 | python3 -u /home/crawl/crawlingathome-gpu-hcloud/dbdl.py -s nolang >> /home/crawl/crawl.log 2>&1 71 | sleep 1 72 | done 73 | - path: /home/crawl/database.ini 74 | permissions: '0744' 75 | content: | 76 | [cah_production] 77 | host=<> 78 | database=<> 79 | user=<> 80 | password=<> 81 | - path: /etc/systemd/system/crawl.service 82 | permissions: '0664' 83 | content: | 84 | [Unit] 85 | After=network.service 86 | Description=Crawling @ Home 87 | [Service] 88 | Type=simple 89 | LimitNOFILE=2097152 90 | WorkingDirectory=/home/crawl 91 | ExecStart=/home/crawl/crawl.sh 92 | EnvironmentFile=/etc/environment 93 | User=crawl 94 | Nice=10 95 | [Install] 96 | WantedBy=multi-user.target 97 | - path: /etc/knot-resolver/kresd.conf 98 | content: | 99 | -- SPDX-License-Identifier: CC0-1.0 100 | -- vim:syntax=lua:set ts=4 sw=4: 101 | -- Refer to manual: https://knot-resolver.readthedocs.org/en/stable/ 102 | -- Network interface configuration 103 | net.listen('127.0.0.1', 53, { kind = 'dns' }) 104 | net.listen('127.0.0.1', 853, { kind = 'tls' }) 105 | --net.listen('127.0.0.1', 443, { kind = 'doh2' }) 106 | net.listen('::1', 53, { kind = 'dns', freebind = true }) 107 | net.listen('::1', 853, { kind = 'tls', freebind = true }) 108 | --net.listen('::1', 443, { kind = 'doh2' }) 109 | -- Load useful modules 110 | modules = { 111 | 'hints > iterate', -- Load /etc/hosts and allow 
custom root hints 112 | 'stats', -- Track internal statistics 113 | 'predict', -- Prefetch expiring/frequent records 114 | 'serve_stale < cache' -- Server from cache if forwarder is staled 115 | } 116 | -- Cache size 117 | cache.size = 100 * MB 118 | -- Disable DNSSEC 119 | trust_anchors.remove('.') 120 | -- forward all traffic to specified IP addresses (selected automatically) 121 | -- policy.add(policy.all(policy.FORWARD({'10.254.0.5', '1.1.1.1'}))) 122 | runcmd: 123 | - [ ls, -l, / ] 124 | - [ sh, -xc, "echo $(date) ': hello crawl!'" ] 125 | - [ sh, -c, echo "=========hello crawl'=========" ] 126 | - ls -l /root 127 | # take care of max open files 128 | - echo "session required pam_limits.so" >> /etc/pam.d/common-session 129 | - echo "fs.file-max = 2097152" >> /etc/sysctl.conf 130 | # secure ssh 131 | - sed -i -e '/^\(#\|\)PermitRootLogin/s/^.*$/PermitRootLogin no/' /etc/ssh/sshd_config 132 | - sed -i -e '/^\(#\|\)PasswordAuthentication/s/^.*$/PasswordAuthentication no/' /etc/ssh/sshd_config 133 | - sed -i -e '/^\(#\|\)X11Forwarding/s/^.*$/X11Forwarding no/' /etc/ssh/sshd_config 134 | - sed -i -e '/^\(#\|\)MaxAuthTries/s/^.*$/MaxAuthTries 2/' /etc/ssh/sshd_config 135 | - sed -i -e '/^\(#\|\)AllowTcpForwarding/s/^.*$/AllowTcpForwarding no/' /etc/ssh/sshd_config 136 | - sed -i -e '/^\(#\|\)AllowAgentForwarding/s/^.*$/AllowAgentForwarding no/' /etc/ssh/sshd_config 137 | - sed -i -e '/^\(#\|\)AuthorizedKeysFile/s/^.*$/AuthorizedKeysFile .ssh\/authorized_keys/' /etc/ssh/sshd_config 138 | - sed -i '$a AllowUsers crawl' /etc/ssh/sshd_config 139 | # install the script 140 | - cd /home/crawl 141 | - git clone https://github.com/rvencu/crawlingathome-gpu-hcloud 142 | - cd crawlingathome-gpu-hcloud 143 | #- git clone "https://github.com/TheoCoombes/crawlingathome" crawlingathome_client 144 | #- pip3 install -r crawlingathome_client/requirements.txt --no-cache-dir 145 | - pip3 install -r worker-requirements.txt --no-cache-dir 146 | - pip install random_user_agent 147 | # make the script run as a service at startup 148 | - systemctl daemon-reload 149 | - systemctl enable crawl.service 150 | - chown crawl:adm -R /home/crawl/ 151 | - apt clean 152 | - reboot 153 | final_message: "The system is finally up, after $UPTIME seconds" 154 | 155 | -------------------------------------------------------------------------------- /postCLIP_staging/bloom.py: -------------------------------------------------------------------------------- 1 | # use this file inside every minute cron in order to recalculate bloom filters. location: staging server 2 | # folder structure 3 | # /home/archiveteam/CAH/ 4 | # |_bloom archiveteam@IP::bloom contains bloom filters 5 | # |_clipped contains clipped lists 6 | # |_ds contains files ready to be sent to the eye 7 | # |_hashes contains list of hashes of files inserted into the dataset 8 | # |_results archiveteam@IP::CAH incoming folder for the final results from workers 9 | 10 | # Stacked bloom filters. 
Naming convention: 11 | # frozen filters: filter.bin, filter1.bin, filter2.bin 12 | # active filters: filter_active.bin 13 | # 14 | # 15 | import sys 16 | import time 17 | import requests 18 | import pandas as pd 19 | from glob import glob 20 | from pathlib import Path 21 | from datetime import datetime 22 | from bloom_filter2 import BloomFilter 23 | 24 | with open("bloomlog.txt","a") as log: 25 | 26 | # update the bloom server filters too 27 | bloomip = "116.202.162.146" 28 | 29 | serverbloom = BloomFilter(max_elements=10000000, error_rate=0.01, filename=(f"/home/archiveteam/bloom-{bloomip}.bin",-1)) 30 | intlbloom = BloomFilter(max_elements=10000000, error_rate=0.01, filename=(f"/home/archiveteam/intl-{bloomip}.bin",-1)) 31 | serverclip = BloomFilter(max_elements=10000000, error_rate=0.01, filename=(f"/home/archiveteam/clip-{bloomip}.bin",-1)) 32 | 33 | start = time.time() 34 | now = datetime.now().strftime("%Y/%m/%d_%H:%M") 35 | 36 | time.sleep(5) 37 | counter = 0 38 | counterintl = 0 39 | uniques = 0 40 | uniquesintl = 0 41 | main = [(0,0)] 42 | intl = [(0,0)] 43 | for file in glob("/home/archiveteam/CAH/hashes/*.hsh"): 44 | stem = Path(file).stem.strip(".") 45 | if stem not in serverbloom: 46 | with open(file,"rt") as f: 47 | for line in f.readlines(): 48 | counter += 1 49 | post = { 50 | 'file': (stem, open(file, 'rb')), 51 | 'key': (None, 'main'), 52 | } 53 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 54 | if response.status_code == 200: 55 | serverbloom.add(stem) 56 | uniques += int(response.text) 57 | main.append(tuple(map(lambda i, j: i - j, (counter,uniques), main[-1]))) 58 | del(main[0]) 59 | #log.write(str(main) + "\n") 60 | for file in glob("/home/archiveteam/CAH/hashesintl/*.hsh"): 61 | stem = Path(file).stem.strip(".") 62 | if stem not in intlbloom: 63 | with open(file,"rt") as f: 64 | for line in f.readlines(): 65 | counterintl += 1 66 | post = { 67 | 'file': (stem, open(file, 'rb')), 68 | 'key': (None, 'multilanguage'), 69 | } 70 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 71 | if response.status_code == 200: 72 | intlbloom.add(stem) 73 | uniquesintl += int(response.text) 74 | intl.append(tuple(map(lambda i, j: i - j, (counterintl,uniquesintl), intl[-1]))) 75 | del(intl[0]) 76 | 77 | clippedlist=[0] 78 | clipped_counter = 0 79 | for file in glob("/home/archiveteam/CAH/clipped/*.clp"): 80 | stem = Path(file).stem.strip(".") 81 | if stem not in serverclip: 82 | post = { 83 | 'file': (stem, open(file, 'rb')), 84 | 'key': (None, 'clipped'), 85 | } 86 | response = requests.post(f'http://{bloomip}:8000/add/', files=post) 87 | if response.status_code == 200: 88 | serverclip.add(stem) 89 | clipped_counter += int(response.text) 90 | clippedlist.append(clipped_counter-clippedlist[-1]) 91 | del clippedlist[0] 92 | #log.write(str(clippedlist) + "\n") 93 | 94 | pd.set_option('precision', 2) 95 | df = pd.read_csv("bloom.log", sep=" ",header=None, names=["Date", "a", "unique pairs (5%)", "b", "total including duplicates","c","clipped filter (5%)","d","failed filter","e"]) 96 | df["Date"]=df.Date.apply(lambda x: datetime.strptime(x, "[%Y/%m/%d_%H:%M]")) 97 | df["unique pairs (5%)"]=df["unique pairs (5%)"]/1000000 98 | df["total including duplicates"]=df["total including duplicates"]/1000000 99 | df["clipped filter (5%)"]=df["clipped filter (5%)"]/1000000 100 | 101 | #log.write("Done df calc \n") 102 | if uniques + uniquesintl + clipped_counter > 0: 103 | print(f"[{now}] added {uniques + uniquesintl} \"from total of\" {counter + 
counterintl} \"( {str(main)} i.e. {round((counter + counterintl - uniques - uniquesintl)*100/(counter + counterintl + sys.float_info.epsilon), 2)}% duplication in {round(time.time()-start,2)} sec) Also added \" {clipped_counter} \" {str(clippedlist)} clipped\" and 0 failed") 104 | 105 | #log.write("Printed stats \n") 106 | 107 | with open('dashboard.txt', 'w') as file: 108 | file.write("
<h1>Crawling at Home project</h1>\n") 109 |         file.write("<h2>Bloom filters status</h2>\n") 110 |         file.write("<h3>All time stats</h3>\n") 111 |         file.write("initialized from first parquet files<br/>\n") 112 |         file.write(str(df.sum(axis=0, numeric_only=True)).replace("\n","<br/>")) 113 |         file.write("<br/>") 114 |         file.write("<h3>Last day stats</h3>\n") 115 |         file.write(str(df[df.Date > datetime.now() - pd.to_timedelta("1day")].sum(axis=0, numeric_only=True)).replace("\n","<br/>")) 116 |         file.write("<h3>Last week stats</h3>\n") 117 |         file.write("Last reset date: 01 December 2021<br/>\n") 118 |         file.write(str(df[df.Date > datetime.now() - pd.to_timedelta("7day")].sum(axis=0, numeric_only=True)).replace("\n","<br/>
")) 119 | #log.write("Printed dashboard \n") -------------------------------------------------------------------------------- /helpers/parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "source": [ 7 | "import pandas as pd\n", 8 | "from glob import glob" 9 | ], 10 | "outputs": [], 11 | "metadata": {} 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 5, 16 | "source": [ 17 | "total = 0\n", 18 | "count = 0\n", 19 | "for file in glob(\"../parquet/3080.rom1504.fr/cah/cah_dataframe_unique/*.parquet\"):\n", 20 | " df = pd.read_parquet(file)\n", 21 | " total += len(df.index)\n", 22 | " df.dropna(subset=['similarity'], inplace=True)\n", 23 | " count += len(df.index)\n", 24 | "print(count)" 25 | ], 26 | "outputs": [ 27 | { 28 | "output_type": "stream", 29 | "name": "stdout", 30 | "text": [ 31 | "63505626\n" 32 | ] 33 | } 34 | ], 35 | "metadata": {} 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 6, 40 | "source": [ 41 | "print(total)" 42 | ], 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "name": "stdout", 47 | "text": [ 48 | "70153985\n" 49 | ] 50 | } 51 | ], 52 | "metadata": {} 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "source": [ 58 | "\n", 59 | "df = pd.DataFrame\n", 60 | "i=0\n", 61 | "for file in glob(\"../parquet/3080.rom1504.fr/cah/cah_dataframe_unique/*.parquet\"):\n", 62 | " dfp = pd.read_parquet(file)\n", 63 | " dfp = dfp[dfp['similarity'].isna()]\n", 64 | " if i == 0:\n", 65 | " df = dfp\n", 66 | " else:\n", 67 | " df = df.append(dfp)\n", 68 | " i += 1\n", 69 | "df.shape" 70 | ], 71 | "outputs": [ 72 | { 73 | "output_type": "execute_result", 74 | "data": { 75 | "text/plain": [ 76 | "(6648359, 8)" 77 | ] 78 | }, 79 | "metadata": {}, 80 | "execution_count": 8 81 | } 82 | ], 83 | "metadata": {} 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 28, 88 | "source": [ 89 | "df.head()" 90 | ], 91 | "outputs": [ 92 | { 93 | "output_type": "execute_result", 94 | "data": { 95 | "text/plain": [ 96 | " SAMPLE_ID URL \\\n", 97 | "2 NaN http://cdn2.newsok.biz/cache/sq105-9f961ae77a2... \n", 98 | "5 NaN http://images.tastespotting.com/thumbnails/707... \n", 99 | "23 NaN http://cdn.archinect.net/images/195x140/fc/fcl... \n", 100 | "44 NaN http://patentimages.storage.googleapis.com/thu... \n", 101 | "62 NaN http://demandware.edgesuite.net/sits_pod18/dw/... \n", 102 | "\n", 103 | " TEXT HEIGHT WIDTH LICENSE \\\n", 104 | "2 Oklahoma City's Russell Westbrook (0) reacts a... NaN NaN ? \n", 105 | "5 {recipe} Cucumber Salad with Cilantro Lime Hon... NaN NaN ? \n", 106 | "23 Onerahi Bach Project - Design Details NaN NaN ? \n", 107 | "44 Patent Drawing NaN NaN ? \n", 108 | "62 Rouge In Love NaN NaN ? \n", 109 | "\n", 110 | " NSFW similarity hash \n", 111 | "2 NaN d0533b647020bb8b9ea78c309c4b6457 \n", 112 | "5 NaN a146a91137119c0bcaa350d94415359f \n", 113 | "23 NaN 67344a9e6e9abcf1f01830f21bfc6b89 \n", 114 | "44 NaN 6488ba8ef23f5756f8b4db231a69db6c \n", 115 | "62 NaN 2cbb4495c590cc34327978633a7f8d18 " 116 | ], 117 | "text/html": [ 118 | "
\n", 119 | "\n", 132 | "\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | "
SAMPLE_IDURLTEXTHEIGHTWIDTHLICENSENSFWsimilarityhash
2NaNhttp://cdn2.newsok.biz/cache/sq105-9f961ae77a2...Oklahoma City's Russell Westbrook (0) reacts a...NaNNaN?NaNd0533b647020bb8b9ea78c309c4b6457
5NaNhttp://images.tastespotting.com/thumbnails/707...{recipe} Cucumber Salad with Cilantro Lime Hon...NaNNaN?NaNa146a91137119c0bcaa350d94415359f
23NaNhttp://cdn.archinect.net/images/195x140/fc/fcl...Onerahi Bach Project - Design DetailsNaNNaN?NaN67344a9e6e9abcf1f01830f21bfc6b89
44NaNhttp://patentimages.storage.googleapis.com/thu...Patent DrawingNaNNaN?NaN6488ba8ef23f5756f8b4db231a69db6c
62NaNhttp://demandware.edgesuite.net/sits_pod18/dw/...Rouge In LoveNaNNaN?NaN2cbb4495c590cc34327978633a7f8d18
\n", 210 | "
" 211 | ] 212 | }, 213 | "metadata": {}, 214 | "execution_count": 28 215 | } 216 | ], 217 | "metadata": {} 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 23, 222 | "source": [ 223 | "df.dropna(subset=['URL','TEXT'], inplace=True)\n", 224 | "df.shape" 225 | ], 226 | "outputs": [ 227 | { 228 | "output_type": "execute_result", 229 | "data": { 230 | "text/plain": [ 231 | "(6633380, 8)" 232 | ] 233 | }, 234 | "metadata": {}, 235 | "execution_count": 23 236 | } 237 | ], 238 | "metadata": {} 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 27, 243 | "source": [ 244 | "import hashlib\n", 245 | "df[\"hash\"] = df.apply(lambda x: hashlib.md5((str(x.URL) + str(x.TEXT)).encode(\"utf-8\")).hexdigest(), axis=1)" 246 | ], 247 | "outputs": [], 248 | "metadata": {} 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 29, 253 | "source": [ 254 | "df.to_csv(\"nansim.csv\",sep=\"|\",index=False)" 255 | ], 256 | "outputs": [], 257 | "metadata": {} 258 | } 259 | ], 260 | "metadata": { 261 | "orig_nbformat": 4, 262 | "language_info": { 263 | "name": "python", 264 | "version": "3.8.8", 265 | "mimetype": "text/x-python", 266 | "codemirror_mode": { 267 | "name": "ipython", 268 | "version": 3 269 | }, 270 | "pygments_lexer": "ipython3", 271 | "nbconvert_exporter": "python", 272 | "file_extension": ".py" 273 | }, 274 | "kernelspec": { 275 | "name": "python3", 276 | "display_name": "Python 3.8.8 64-bit ('gpuhcloud': conda)" 277 | }, 278 | "interpreter": { 279 | "hash": "bc322c11e8113b1b1dfcd753c5702c5c5d95a81c495f9a7060b170a2a7888bca" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } -------------------------------------------------------------------------------- /postgres/dump_db.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from multiprocessing import Process, Queue\n", 10 | "from sqlalchemy import create_engine, text\n", 11 | "from sqlalchemy.pool import NullPool\n", 12 | "from configparser import ConfigParser\n", 13 | "from tqdm.auto import tqdm, trange\n", 14 | "import uuid\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 7, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "def config(filename='database.ini', mode=\"test\"):\n", 25 | " # create a parser\n", 26 | " parser = ConfigParser()\n", 27 | " # read config file\n", 28 | " parser.read(filename)\n", 29 | " section='postgresql'\n", 30 | " if mode == \"production\":\n", 31 | " section = \"cah_production\"\n", 32 | " # get section, default to postgresql\n", 33 | " db = {}\n", 34 | " if parser.has_section(section):\n", 35 | " params = parser.items(section)\n", 36 | " for param in params:\n", 37 | " db[param[0]] = param[1]\n", 38 | " else:\n", 39 | " raise Exception('Section {0} not found in the {1} file'.format(section, filename))\n", 40 | " return db" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 8, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "def dump_3m(j, workers, engine, jobtype, cycles, queue, path, dataset):\n", 50 | " engine.dispose()\n", 51 | " with engine.connect() as eng:\n", 52 | " conn = engine.raw_connection()\n", 53 | " for i in range(cycles):\n", 54 | " file = uuid.uuid4()\n", 55 | " # clipped out\n", 56 | " if jobtype == \"clipped\":\n", 57 | " if dataset == \"en\":\n", 58 | " 
select_stmt1 = f\"\"\"BEGIN;\n", 59 | " SET work_mem = '1GB';\n", 60 | " -- query --\n", 61 | " COPY (\n", 62 | " DELETE FROM dataset_en WHERE sampleid in (\n", 63 | " select sampleid from dataset_en where status = 2 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 64 | " ) RETURNING *\n", 65 | " ) TO '{path}/clipped/ok-en-{file}.csv' DELIMITER '|' CSV HEADER;\n", 66 | " SET work_mem = default;\n", 67 | " COMMIT;\"\"\"\n", 68 | " else:\n", 69 | " select_stmt1 = f\"\"\"BEGIN;\n", 70 | " SET work_mem = '1GB';\n", 71 | " -- query --\n", 72 | " COPY (\n", 73 | " DELETE FROM dataset_{dataset} WHERE sampleid in (\n", 74 | " select sampleid from dataset_{dataset} where status = 2 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 75 | " ) RETURNING *\n", 76 | " ) TO '{path}/clipped/ok-{dataset}-{file}.csv' DELIMITER '|' CSV HEADER;\n", 77 | " SET work_mem = default;\n", 78 | " COMMIT;\"\"\"\n", 79 | " # rejected out\n", 80 | " elif jobtype == \"rejected\":\n", 81 | " if dataset == \"en\":\n", 82 | " select_stmt1 = f\"\"\"BEGIN;\n", 83 | " SET work_mem = '1GB';\n", 84 | " -- query --\n", 85 | " COPY (\n", 86 | " DELETE FROM dataset_en WHERE sampleid in (\n", 87 | " select sampleid from dataset_en where status > 8 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 88 | " ) RETURNING *\n", 89 | " ) TO '{path}/rejected/bad-en-{file}.csv' DELIMITER '|' CSV HEADER;\n", 90 | " SET work_mem = default;\n", 91 | " COMMIT;\"\"\"\n", 92 | " else:\n", 93 | " select_stmt1 = f\"\"\"BEGIN;\n", 94 | " SET work_mem = '1GB';\n", 95 | " -- query --\n", 96 | " COPY (\n", 97 | " DELETE FROM dataset_{dataset} WHERE sampleid in (\n", 98 | " select sampleid from dataset_{dataset} where status > 8 order by sampleid limit 5000000 FOR UPDATE SKIP LOCKED\n", 99 | " ) RETURNING *\n", 100 | " ) TO '{path}/rejected/bad-{dataset}-{file}.csv' DELIMITER '|' CSV HEADER;\n", 101 | " SET work_mem = default;\n", 102 | " COMMIT;\"\"\"\n", 103 | "\n", 104 | " else:\n", 105 | " continue\n", 106 | " try:\n", 107 | " cur = conn.cursor()\n", 108 | " cur.execute(select_stmt1)\n", 109 | " conn.commit()\n", 110 | " except Exception as e:\n", 111 | " print(f\"error: {e}\")\n", 112 | " queue.put(1)\n", 113 | " return" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 32, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "application/vnd.jupyter.widget-view+json": { 124 | "model_id": "a5699d0df7814b728b5f170c06711abf", 125 | "version_major": 2, 126 | "version_minor": 0 127 | }, 128 | "text/plain": [ 129 | " 0%| | 0/25 [00:00 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /docs/3stage_architecture_white.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 
34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /infrastructure.py: -------------------------------------------------------------------------------- 1 | # usage: 2 | # starting swarm 3 | # python3 infrastructure.py command cloud nodes datacenter 4 | # where 5 | # 1st arg can be up, down, reset 6 | # 2nd arg can be hetzner, vultr, alibaba, hostwinds 7 | # 3rd arg is optional, number of nodes, implicit 1 8 | # 4th arg is optionsl, datacenter for hetzner (fsn1, ) 9 | # 10 | # the .env file format with single space delimiter 11 | # lx2evY5dL2uScjjp...Hjsobzcxvbm5Ng9gb27gulMC...CsobCmqOKlCmwzn6Qi rvencu -1 rv 12 | # API token nickname nodes real_name 13 | # where nodes = -1 means we can spin up to the very server limit 14 | # nodes = 0 - do not use this key 15 | # nodes > 0 - spin up only to the minimum between this number and server limit 16 | 17 | import os 18 | import sys 19 | import trio 20 | import time 21 | import pipes 22 | #import subprocess 23 | from configparser import ConfigParser 24 | from itertools import cycle 25 | from hcloud import Client 26 | from hcloud.images.domain import Image 27 | from hcloud.hcloud import APIException 28 | from hcloud.server_types.client import ServerType 29 | #from hcloud.servers.client import BoundServer, CreateServerResponse 30 | from pssh.clients import ParallelSSHClient, SSHClient 31 | from gevent import joinall 32 | 33 | def config(filename='database.ini', mode="test"): 34 | # create a parser 35 | parser = ConfigParser() 36 | # read config file 37 | parser.read(filename) 38 | section='postgresql' 39 | if mode == "production": 40 | section = "cah_production" 41 | # get section, default to postgresql 42 | db = {} 43 | if parser.has_section(section): 44 | params = parser.items(section) 45 | for param in params: 46 | db[param[0]] = param[1] 47 | else: 48 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 49 | return db 50 | 51 | async def list_servers(tok=""): 52 | servers = [] 53 | tokens = [] 54 | if tok == "": 55 | with open(".env", "r") as auth: 56 | tokens = auth.readlines() 57 | else: 58 | tokens = [tok] 59 | for token in tokens: 60 | hclient = Client(token=token.rstrip()) # Please paste your API token here between the quotes 61 | servers = servers + hclient.servers.get_all() 62 | return servers 63 | 64 | async def up(nodes, pref_loc, server_type="cx11", nick=""): 65 | workers = [] 66 | tokens = [] 67 | script = "" 68 | nodes = int(nodes) 69 | with open(".env", "r") as auth: 70 | tokens = [x.split(" ") for x in auth.readlines()] 71 | with open("cloud-init", "r") as user_data: 72 | script = user_data.read() 73 | for token in tokens: 74 | if nick != "" and nick != token[1]: 75 | continue 76 | number = nodes 77 | if int(token[2])>0: 78 | number = min(nodes, int(token[2])) 79 | init = script.replace("<>", token[1]) 80 | print(f"[swarm] nodes to spin up: {nodes}") 81 | if (number > 0 and 
int(token[2])!=0): 82 | try: 83 | hclient = Client(token=token[0]) 84 | if pref_loc == None: 85 | print ("[swarm] no specific location provided") 86 | locations = hclient.locations.get_all() 87 | loc = cycle(locations) 88 | zip = [[i, next(loc)] for i in range(number)] 89 | else: 90 | print (f"[swarm] using {pref_loc} location") 91 | location = hclient.locations.get_by_name(pref_loc) 92 | zip = [[i, location] for i in range(number)] 93 | for i, loc in zip: 94 | try: 95 | response = hclient.servers.create( 96 | "cah-worker-"+str(i), 97 | ServerType(name=server_type), 98 | Image(name="ubuntu-20.04"), 99 | hclient.ssh_keys.get_all(), 100 | None, #volumes 101 | None, #firewalls 102 | None, #networks 103 | init, 104 | None, #labels 105 | loc, #location - todo: create servers in all locations 106 | None, #datacenter 107 | ) 108 | srv = response.server 109 | workers.append((srv.public_net.ipv4.ip, token[1])) # tuple IP and nickname 110 | nodes = nodes - 1 111 | except APIException as e: 112 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 113 | break 114 | except Exception as e: 115 | print(e) 116 | break 117 | except APIException as e: 118 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 119 | continue 120 | except Exception as e: 121 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 122 | continue 123 | 124 | print (f"[swarm] Cloud swarm intialized with {len(workers)} nodes. If this is less than expected please check your account limits") 125 | return workers 126 | 127 | async def down(cloud, nick=""): 128 | workers = [] 129 | nicknames = [] 130 | if os.path.exists(f"{cloud}.txt"): 131 | with open(f"{cloud}.txt", "r") as f: 132 | for line in f.readlines(): 133 | workers.append(line.split(" ")[0]) 134 | nicknames.append(line.split(" ")[1]) 135 | with open(".env", "r") as auth: 136 | tokens = [x.split(" ") for x in auth.readlines()] 137 | for token in tokens: 138 | if nick != "" and nick != token[1]: 139 | continue 140 | if int(token[2]) != 0: 141 | try: 142 | servers = await list_servers(token[0]) 143 | hclient = Client(token=token[0]) 144 | for server in servers: 145 | server = hclient.servers.get_by_name(server.name) 146 | ip = server.public_net.ipv4.ip 147 | if ip not in workers: 148 | continue 149 | server.delete() 150 | except APIException as e: 151 | print (f"[swarm] API Exception: " + str(e) + " ("+ token[0] + " " + token[1] + ")") 152 | continue 153 | 154 | async def respawn(workers, ip, server_type="cx11"): 155 | with open(".env", "r") as auth: 156 | tokens = auth.readlines().split(" ") 157 | for token in tokens: 158 | hclient = Client(token=token[0]) 159 | index = workers.index(ip) 160 | server = hclient.servers.get_by_name(f"cah-worker-{index}") 161 | if server is None: 162 | continue 163 | try: 164 | # first attempt to restart the crawl service 165 | aclient = SSHClient(ip, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False) 166 | aclient.execute('systemctl restart crawl', sudo=True ) 167 | aclient.disconnect() 168 | 169 | except: 170 | # if impossible to restart the service then delete the worker and try to re-create it 171 | server.delete() 172 | with open("cloud-init", "r") as user_data: 173 | script = user_data.read().replace("<>", token[1]) 174 | try: 175 | response = hclient.servers.create( 176 | "cah-worker-"+index, 177 | ServerType(name=server_type), 178 | Image(name="ubuntu-20.04"), 179 | hclient.ssh_keys.get_all(), 180 | None, #volumes 181 | None, #firewalls 
182 | None, #networks 183 | script, 184 | None, #labels 185 | None, #location - todo: create servers in all locations 186 | None, #datacenter 187 | ) 188 | srv = response.server 189 | workers[index] = srv.public_net.ipv4.ip 190 | except APIException as e: 191 | # problem. we remove the worker from the dispatcher 192 | print (f"[swarm] API Exception: " + str(e)) 193 | workers.remove(ip) 194 | return workers 195 | return workers 196 | 197 | def exists_remote(host, path, silent=False): 198 | """Test if a file exists at path on a host accessible with SSH.""" 199 | aclient = SSHClient(host, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False ) 200 | #_start = time.time() 201 | output = aclient.run_command("test -f {}".format(pipes.quote(path))) 202 | 203 | status = output.exit_code 204 | 205 | aclient.disconnect() 206 | 207 | if not silent: 208 | print(".", end = "", flush=True) 209 | if status == 0: 210 | return True 211 | if status == 1 or status == 255: 212 | return False 213 | 214 | async def wait_for_infrastructure (workers): # here workers is a list of IPs 215 | print(f"[swarm] Waiting for {len(workers)} nodes to become ready. Polling starts after 4 minutes...") 216 | time.sleep(240) 217 | ready = [] 218 | pclient = ParallelSSHClient(workers, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False ) 219 | while len(ready) < len(workers): 220 | print(".", end = "", flush=True) 221 | ready = [] 222 | #_start = time.time() 223 | output = pclient.run_command('test -f /home/crawl/crawl.log') 224 | pclient.join(output) 225 | for host_output in output: 226 | hostname = host_output.host 227 | exit_code = host_output.exit_code 228 | if exit_code == 0: 229 | ready.append(hostname) 230 | #print(len(ready)) 231 | time.sleep(10) 232 | 233 | def last_status(ip, path): 234 | aclient = SSHClient(ip, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False) 235 | read = aclient.run_command("tail -1 {}".format(pipes.quote(path))) 236 | aclient.disconnect() 237 | return read.stdout 238 | 239 | def reset_workers(cloud): 240 | workers = [] 241 | with open(f"{cloud}.txt", "r") as f: 242 | for line in f.readlines(): 243 | workers.append(line.split(" ")[0]) 244 | if cloud in ["oracle"]: 245 | pclient = ParallelSSHClient(workers, user='ubuntu', pkey="~/gpuhcloud/richard", identity_auth=False ) 246 | output = pclient.run_command('cd /home/crawl & source worker-reset.sh', sudo=True) 247 | pclient.join(output) 248 | else: 249 | pclient = ParallelSSHClient(workers, user='crawl', pkey="~/.ssh/id_cah", identity_auth=False ) 250 | output = pclient.run_command('source worker-reset.sh', sudo=True) 251 | pclient.join(output) 252 | 253 | if __name__ == "__main__": 254 | command = sys.argv[1] 255 | cloud = sys.argv[2] 256 | location = "" 257 | if len(sys.argv) > 3: 258 | nodes = int(sys.argv[3]) 259 | else: 260 | nodes = 1 261 | if len(sys.argv) > 4: 262 | location = sys.argv[4] 263 | 264 | params = config(mode="production") 265 | 266 | if command == "up": 267 | try: 268 | start = time.time() 269 | sshkey="" 270 | escape = ["\\","$",".","*","[","^","/"] 271 | with open (f"{os.getenv('HOME')}/.ssh/richard.pub","rt") as f: 272 | sshkey = f.read().split(" ")[1] 273 | for char in escape: 274 | sshkey = sshkey.replace(char,"\\"+char) 275 | #print(sshkey) 276 | if cloud in ["hetzner"]: 277 | if os.path.exists("cloud-init"): 278 | os.system("rm cloud-init") 279 | os.system("cp 'cloud boot/cloud-init.yaml' cloud-init") 280 | os.system(f"sed -i -e \"s/<>/{sshkey}/\" cloud-init") 281 | os.system(f"sed -i -e \"s/<>/{cloud}/\" 
cloud-init") 282 | os.system(f"sed -i -e \"s/<>/{params['host']}/\" cloud-init") 283 | os.system(f"sed -i -e \"s/<>/{params['database']}/\" cloud-init") 284 | os.system(f"sed -i -e \"s/<>/{params['user']}/\" cloud-init") 285 | os.system(f"sed -i -e \"s/<>/{params['password']}/\" cloud-init") 286 | elif cloud in ["vultr"]: 287 | # do some boot.sh API calls 288 | os.system("rm boot") 289 | os.system("cp 'cloud boot/boot.sh' boot") 290 | os.system(f"sed -i -e \"s/<>/{os.getenv('CAH_NICKNAME')}/\" boot") 291 | os.system(f"sed -i -e \"s/<>/{sshkey}/\" boot") 292 | os.system(f"sed -i -e \"s/<>/{cloud}/\" boot") 293 | print ("Manual setup: please use `boot` file to manually initialize your cloud nodes.") 294 | sys.exit() 295 | else: 296 | print ("not recognized cloud, abandoning") 297 | sys.exit() 298 | # generate cloud workers 299 | workers = trio.run(up, nodes, location) 300 | with open(f"{cloud}.txt", "w") as f: 301 | for ip, nickname in workers: 302 | f.write(ip + " " + nickname + "\n") 303 | trio.run(wait_for_infrastructure, workers) 304 | print( 305 | f"[swarm] {len(workers)} nodes cloud swarm is up in {cloud} cloud and was initialized in {round(time.time() - start)}s") 306 | except KeyboardInterrupt: 307 | print(f"[swarm] Abort! Deleting cloud swarm...") 308 | trio.run(down) 309 | print(f"[swarm] Cloud swarm was shutdown") 310 | sys.exit() 311 | except Exception as e: 312 | print(f"[swarm] Error, could not bring up swarm... please consider shutting down all workers via `python3 infrastructure.py down`") 313 | print(e) 314 | sys.exit() 315 | elif command == "down": 316 | trio.run(down, cloud) 317 | print (f"[swarm] Cloud swarm was shutdown") 318 | elif command == "reset": 319 | reset_workers(cloud) 320 | print(f"[swarm] All workers were reset") 321 | -------------------------------------------------------------------------------- /alibaba_workers/alibaba-upgrade-SAS-image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 22, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import hmac\n", 11 | "import uuid\n", 12 | "import hmac\n", 13 | "import time\n", 14 | "import base64\n", 15 | "import hashlib\n", 16 | "import datetime\n", 17 | "import requests\n", 18 | "import pandas as pd\n", 19 | "from urllib.parse import quote\n", 20 | "from configparser import ConfigParser\n", 21 | "\n", 22 | "baseInstances = [\"9d4c9fd55d884badba2540b561432c1e\", \"952dbd0188ff4603b94c81f69398ed75\"]\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 23, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "class SignatureUrl():\n", 32 | " \"\"\"python 计算openapi的签名\"\"\"\n", 33 | "\n", 34 | " def __init__(self, public_param, private_param, secret):\n", 35 | " self.public_param = public_param\n", 36 | " self.private_param = private_param\n", 37 | " self.secret = secret\n", 38 | "\n", 39 | " def get_timestamp(self):\n", 40 | " time_format = \"%Y-%m-%dT%H:%M:%SZ\"\n", 41 | " return datetime.datetime.utcnow().strftime(time_format)\n", 42 | "\n", 43 | " def get_uuid(self):\n", 44 | " return str(uuid.uuid1())\n", 45 | "\n", 46 | " def url_encode_str(self, all_params):\n", 47 | " sort_all_params = list()\n", 48 | " for key, value in all_params.items():\n", 49 | " params = key + '=' + value\n", 50 | " sort_all_params.append(params)\n", 51 | " # 对参数进行升序排序\n", 52 | " sort_all_params.sort()\n", 53 | "\n", 54 | " for i in 
range(len(sort_all_params)):\n", 55 | " # 对参数以及参数值进行urlencode处理,注意:’=‘此时不能处理,否则后面会再次对%3D进行encode\n", 56 | " sort_all_params[i] = quote(sort_all_params[i], '=')\n", 57 | " # 对encode之后的字符串进行再处理\n", 58 | " tmp = sort_all_params[i]\n", 59 | " if tmp.find('+'):\n", 60 | " tmp.replace('+','%20')\n", 61 | " elif tmp.find('*'):\n", 62 | " tmp.replace('*','%2A')\n", 63 | " elif tmp.find('%7E'):\n", 64 | " tmp.replace('%7E','~')\n", 65 | " \n", 66 | " sort_all_params[i] = tmp\n", 67 | " return sort_all_params\n", 68 | "\n", 69 | " def get_signature(self, param, http_method, AccesskeySecret):\n", 70 | " str_to_sign = ''\n", 71 | " sort_all_params = self.url_encode_str(param)\n", 72 | " #print(sort_all_params)\n", 73 | " for i in range(len(sort_all_params)):\n", 74 | " str_to_sign = str_to_sign + sort_all_params[i] + '&'\n", 75 | "\n", 76 | " # 将最后一位&给截取掉\n", 77 | " str_to_sign = http_method + '&%2F&' + quote(str_to_sign[:-1])\n", 78 | " #print(str_to_sign)\n", 79 | " key = AccesskeySecret+'&'\n", 80 | " signature = hmac.new(key.encode(\n", 81 | " 'utf-8'), str_to_sign.encode('utf-8'), digestmod=hashlib.sha1)\n", 82 | " signature = base64.b64encode(signature.digest()).decode().rstrip(\"\\n\")\n", 83 | " # 解决签名中包含有'+'的特殊情况\n", 84 | " signature = list(signature)\n", 85 | " for i in range(len(signature)):\n", 86 | " #signature[i] = str(signature[i])\n", 87 | " if signature[i] == '+':\n", 88 | " signature[i] = '%2B'\n", 89 | " newSignature = ''.join(signature)\n", 90 | " #print (\"Signature: \" + newSignature)\n", 91 | " self.private_param['Signature'] = newSignature\n", 92 | "\n", 93 | " def url_factory(self, method):\n", 94 | " all_params = dict(self.public_param, **self.private_param)\n", 95 | " self.get_signature(all_params, method, self.secret)\n", 96 | " url = ''\n", 97 | " par=[]\n", 98 | " for key, value in all_params.items():\n", 99 | " params = key + '=' + value\n", 100 | " par.append(params)\n", 101 | " for i in range(len(par)):\n", 102 | " url = url + par[i] + '&'\n", 103 | " url = 'http://swas.eu-central-1.aliyuncs.com?' 
+ url[:-1] + '&Signature=' + self.private_param['Signature']\n", 104 | " #print('url is : ' + url)\n", 105 | " return url" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 24, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "def config(filename='alibaba_tokens.prod', section='tokens'):\n", 115 | " # create a parser\n", 116 | " parser = ConfigParser()\n", 117 | " # read config file\n", 118 | " parser.read(filename)\n", 119 | "\n", 120 | " # get section, default to postgresql\n", 121 | " cfg = {}\n", 122 | " if parser.has_section(section):\n", 123 | " params = parser.items(section)\n", 124 | " for param in params:\n", 125 | " cfg[param[0]] = param[1]\n", 126 | " else:\n", 127 | " raise Exception('Section {0} not found in the {1} file'.format(section, filename))\n", 128 | " return cfg\n", 129 | "\n", 130 | "def upgradeInstance(InstanceId, ImageId, public_param, secret):\n", 131 | " action_param = dict()\n", 132 | " action_param[\"Action\"] = \"ResetSystem\"\n", 133 | "\n", 134 | " private_param = dict()\n", 135 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 136 | " private_param[\"InstanceId\"] = InstanceId\n", 137 | " private_param[\"ImageId\"] = ImageId\n", 138 | "\n", 139 | " sig = SignatureUrl(public_param, private_param, secret)\n", 140 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 141 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 142 | " url = sig.url_factory('GET')\n", 143 | "\n", 144 | " result = requests.request(method=\"get\",url=url)\n", 145 | "\n", 146 | " print(\"Instance reset: \" + result.text)\n", 147 | "\n", 148 | "def renameInstance(InstanceId, Name, public_param, secret):\n", 149 | " action_param = dict()\n", 150 | " action_param[\"Action\"] = \"UpdateInstanceAttribute\"\n", 151 | "\n", 152 | " private_param = dict()\n", 153 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 154 | " private_param[\"InstanceId\"] = InstanceId\n", 155 | " private_param[\"InstanceName\"] = Name\n", 156 | "\n", 157 | " sig = SignatureUrl(public_param, private_param, secret)\n", 158 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 159 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 160 | " url = sig.url_factory('GET')\n", 161 | "\n", 162 | " result = requests.request(method=\"get\",url=url)\n", 163 | "\n", 164 | " print(\"Instance renamed: \" + result.text)\n", 165 | "\n", 166 | "def startInstance(InstanceId, public_param, secret):\n", 167 | " action_param = dict()\n", 168 | " action_param[\"Action\"] = \"StartInstance\"\n", 169 | "\n", 170 | " private_param = dict()\n", 171 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 172 | " private_param[\"InstanceId\"] = InstanceId\n", 173 | "\n", 174 | "\n", 175 | " sig = SignatureUrl(public_param, private_param, secret)\n", 176 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 177 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 178 | " url = sig.url_factory('GET')\n", 179 | "\n", 180 | " result = requests.request(method=\"get\",url=url)\n", 181 | "\n", 182 | " print(\"Instance started: \" + result.text)\n", 183 | "\n", 184 | "def stopInstance(InstanceId, public_param, secret):\n", 185 | " action_param = dict()\n", 186 | " action_param[\"Action\"] = \"StopInstance\"\n", 187 | "\n", 188 | " private_param = dict()\n", 189 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 190 | " private_param[\"InstanceId\"] = InstanceId\n", 191 | "\n", 192 | "\n", 193 | " sig = 
SignatureUrl(public_param, private_param, secret)\n", 194 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 195 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 196 | " url = sig.url_factory('GET')\n", 197 | "\n", 198 | " result = requests.request(method=\"get\",url=url)\n", 199 | "\n", 200 | " print(\"Instance released: \" + result.text)\n", 201 | "\n", 202 | "\n", 203 | "def rebootInstance(InstanceId, public_param, secret):\n", 204 | " action_param = dict()\n", 205 | " action_param[\"Action\"] = \"RebootInstance\"\n", 206 | "\n", 207 | " private_param = dict()\n", 208 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 209 | " private_param[\"InstanceId\"] = InstanceId\n", 210 | "\n", 211 | "\n", 212 | " sig = SignatureUrl(public_param, private_param, secret)\n", 213 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 214 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 215 | " url = sig.url_factory('GET')\n", 216 | "\n", 217 | " result = requests.request(method=\"get\",url=url)\n", 218 | "\n", 219 | " print(\"Instance rebooted: \" + result.text)\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "# Start/stop/reboot/reset/list all instances" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 25, 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "page 0 attempt failed\n", 240 | "page 0 appended\n", 241 | "page 1 attempt failed\n", 242 | "page 1 appended\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "params = config()\n", 248 | "mode = \"list\" # start or stop or reboot or reset or list\n", 249 | "\n", 250 | "ratio = 1.0\n", 251 | "\n", 252 | "public_param = dict()\n", 253 | "public_param[\"AccessKeyId\"] = params[\"id\"]\n", 254 | "public_param[\"SignatureMethod\"] = 'HMAC-SHA1'\n", 255 | "public_param[\"SignatureVersion\"] = '1.0'\n", 256 | "public_param[\"Version\"] = \"2020-06-01\"\n", 257 | "public_param[\"Format\"] = 'json'\n", 258 | "\n", 259 | "action_param = dict()\n", 260 | "action_param[\"Action\"] = \"ListInstances\"\n", 261 | "\n", 262 | "instances = []\n", 263 | "ilist = []\n", 264 | "j=0\n", 265 | "k=0\n", 266 | "for i in range(2):\n", 267 | " private_param = dict()\n", 268 | " private_param[\"Action\"] = action_param[\"Action\"]\n", 269 | " private_param[\"RegionId\"] = \"eu-central-1\"\n", 270 | " private_param[\"PageSize\"] = \"100\"\n", 271 | " private_param[\"PageNumber\"] = str(i+1)\n", 272 | "\n", 273 | " sig = SignatureUrl(public_param, private_param, params[\"secret\"])\n", 274 | " sig.public_param[\"Timestamp\"] = sig.get_timestamp()\n", 275 | " sig.public_param[\"SignatureNonce\"] = sig.get_uuid()\n", 276 | " url = sig.url_factory('GET')\n", 277 | " for _ in range(5):\n", 278 | " result = requests.request(method=\"get\",url=url)\n", 279 | " if result.status_code == 200:\n", 280 | " print(f\"page {i} attempt failed\")\n", 281 | " break\n", 282 | " time.sleep(20)\n", 283 | " \n", 284 | " print(f\"page {i} appended\")\n", 285 | " \n", 286 | " instances = instances + json.loads(result.text)[\"Instances\"]\n", 287 | " ilist.append( pd.json_normalize( json.loads(result.text), record_path = ['Instances'] ))\n", 288 | "\n", 289 | "if mode == \"list\":\n", 290 | " ilist[0].to_csv(\"alibaba_instances_0.csv\", index=None)\n", 291 | " ilist[1].to_csv(\"alibaba_instances_1.csv\", index=None)\n", 292 | "else:\n", 293 | " \n", 294 | " for instance in 
instances:\n", 295 | " print(str(j+k) + \". \" + instance[\"InstanceId\"])\n", 296 | "\n", 297 | " if instance[\"InstanceId\"] not in baseInstances:\n", 298 | " if mode == \"start\":\n", 299 | " if instance[\"Status\"] == \"Stopped\":\n", 300 | " startInstance(instance[\"InstanceId\"], public_param, params[\"secret\"])\n", 301 | " elif mode == \"stop\":\n", 302 | " if instance[\"Status\"] == \"Running\":\n", 303 | " stopInstance(instance[\"InstanceId\"], public_param, params[\"secret\"])\n", 304 | " elif mode == \"reboot\":\n", 305 | " if 1 == 1:\n", 306 | " j += 1\n", 307 | " rebootInstance(instance[\"InstanceId\"], public_param, params[\"secret\"])\n", 308 | " renameInstance(instance[\"InstanceId\"], \"nolang_v1_\" + str(j), public_param, params[\"secret\"])\n", 309 | " print (\"instance was rebooted\")\n", 310 | " else:\n", 311 | " k+=1\n", 312 | " renameInstance(instance[\"InstanceId\"], \"overquota_\" + str(k), public_param, params[\"secret\"])\n", 313 | " print (\"instance inactive\")\n", 314 | " elif mode == \"reset\":\n", 315 | " j += 1\n", 316 | " print(str(j) + \". \" + instance[\"InstanceId\"] + \" has ImageId \" + instance[\"InstanceId\"])\n", 317 | " time.sleep(0.5)\n", 318 | " if instance[\"InstanceId\"] not in baseInstances: # and instance[\"Status\"] == \"Running\":\n", 319 | " if j/len(instances) < ratio:\n", 320 | " upgradeInstance(instance[\"InstanceId\"], \"m-gw8iyh9kb8hp3b8ed6gm\", public_param, params[\"secret\"])\n", 321 | " renameInstance(instance[\"InstanceId\"], \"i2d_\" + str(j), public_param, params[\"secret\"])\n", 322 | " else:\n", 323 | " upgradeInstance(instance[\"InstanceId\"], \"m-gw8iyh9kb8hp3b8ed6gm\", public_param, params[\"secret\"])\n", 324 | " renameInstance(instance[\"InstanceId\"], \"intl_v1_\" + str(j), public_param, params[\"secret\"])\n", 325 | " time.sleep(8)\n", 326 | " else:\n", 327 | " pass\n", 328 | " time.sleep(8)\n", 329 | " " 330 | ] 331 | } 332 | ], 333 | "metadata": { 334 | "interpreter": { 335 | "hash": "ee22a52db22349ad32e35f3b499efddea1c9229e771c5fd65652469b6b2f1979" 336 | }, 337 | "kernelspec": { 338 | "display_name": "Python 3.9.7 64-bit ('gpu': conda)", 339 | "name": "python3" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.9.7" 352 | }, 353 | "orig_nbformat": 4 354 | }, 355 | "nbformat": 4, 356 | "nbformat_minor": 2 357 | } 358 | -------------------------------------------------------------------------------- /docs/architecture.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 
| 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /dbdl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Encoding image analyzing errors: Add the numbers below to 8 to encode all types of errors (so status=9...23 is reserved to describe the errors) 3 | - general exception: 1 4 | - bad format: 2 5 | - image too big: 4 6 | - image too small: 8 7 | - any combination of above 8 | 9 | ''' 10 | 11 | 12 | import gc 13 | from logging import raiseExceptions 14 | import os 15 | import ssl 16 | import sys 17 | import time 18 | import trio 19 | import uuid 20 | import ujson 21 | import shutil 22 | import tarfile 23 | import argparse 24 | import pandas as pd 25 | from glob import glob 26 | from uuid import uuid1 27 | from io import BytesIO 28 | from tqdm.auto import tqdm 29 | from datetime import datetime 30 | from sqlalchemy import create_engine 31 | from configparser import ConfigParser 32 | from PIL import Image, ImageFile, UnidentifiedImageError 33 | from random_user_agent.user_agent import UserAgent 34 | from random_user_agent.params import SoftwareName, OperatingSystem 35 | from multiprocessing import Process, cpu_count 36 | 37 | sys.path.append('./crawlingathome-worker/') 38 | 39 | import asks 40 | asks.init("trio") 41 | 42 | ImageFile.LOAD_TRUNCATED_IMAGES = True # https://stackoverflow.com/a/47958486 43 | ssl_ctx = ssl.create_default_context() 44 | ssl_ctx.check_hostname = False 45 | ssl_ctx.verify_mode = ssl.CERT_NONE 46 | 47 | def config(filename='database.ini', section='cah_production'): 48 | # create a parser 49 | parser = ConfigParser() 50 | # read config file 51 | parser.read(filename) 52 | 53 | # get section, default to postgresql 54 | db = {} 55 | if parser.has_section(section): 56 | params = parser.items(section) 57 | for param in params: 58 | db[param[0]] = param[1] 59 | else: 60 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 61 | 62 | return db 63 | 64 | class Tracer(trio.abc.Instrument): 65 | 66 | def __init__(self, pbar: tqdm): 67 | self.exceptions = 0 68 | self.requests = 0 69 | self.downloads = 0 70 | self.imgproc_duration = 0 71 | self.download_duration = 0 72 | self.error_duration = 0 73 | self.pbar = pbar 74 | 75 | def task_exited(self, task): 76 | if task.custom_sleep_data is not None: 77 | self.pbar.update(1) 78 | if task.custom_sleep_data[0] in [1, 3]: # this is exception 79 | self.exceptions += 1 80 | self.error_duration += task.custom_sleep_data[2] 81 | if task.custom_sleep_data[0] == 0: # this is image downloaded 82 | self.download_duration += task.custom_sleep_data[1] 83 | self.imgproc_duration += task.custom_sleep_data[2] 84 | self.downloads += 1 85 | 86 | def after_run(self): 87 | rate = round(self.exceptions / (self.exceptions + self.downloads + sys.float_info.epsilon), 2) 88 | avg_download = round(self.download_duration / (self.downloads + sys.float_info.epsilon), 2) 89 | avg_process = round(self.imgproc_duration / (self.downloads + sys.float_info.epsilon), 2) 90 | avg_error = round(self.error_duration / (self.exceptions + sys.float_info.epsilon), 2) 91 | print(f"[instrumentation] While scraping there were {self.exceptions} errors within 
{self.downloads + self.exceptions} candidates (error rate = {round(rate * 100,2)} %). {self.downloads} images were downloaded.") 92 | print(f"[instrumentation] Cumulative image processing duration {round(self.imgproc_duration, 2)} s.") 93 | print(f"[instrumentation] Average downloading time {avg_download} s/img, image processing time {avg_process} s/img, exceptions processing time {avg_error} s/link") 94 | 95 | def log(e): 96 | with open("errors.txt","a") as f: 97 | f.write(str(e.__class__.__name__) + " " + str(e) + "\n") 98 | 99 | 100 | def process_img_content(response, alt_text, license, sample_id, language, i): 101 | """ 102 | Function to process downloaded image. Use use PIL from pillow-simd 103 | (faster than open cv that in return is faster than original pillow) 104 | 105 | input: web request response, ALT text, license and sample id 106 | 107 | output: list of image parameters or None if image is rejected 108 | """ 109 | img_output_folder = f"./{i}/save/images/" 110 | error_code = 8 111 | 112 | #temp 2 lines 113 | if language == "" or language is None: 114 | language = "en" 115 | 116 | def _resize(im: Image): 117 | width, height = im.size 118 | ratio = min(width, height) / 224 119 | new_width = int(round(width/ratio,0)) 120 | new_height = int(round(height/ratio,0)) 121 | im = im.resize((new_width, new_height), resample=Image.BICUBIC) 122 | if new_width > 224 or new_height > 224: 123 | left = (new_width - 224)/2 124 | top = (new_height - 224)/2 125 | right = (new_width + 224)/2 126 | bottom = (new_height + 224)/2 127 | # Crop the center of the image 128 | im = im.crop((left, top, right, bottom)) 129 | return im 130 | try: 131 | # reject too small images 132 | if len(response.content) < 5000: 133 | error_code += 8 134 | img_data = BytesIO(response.content) 135 | with Image.open(img_data) as im: 136 | width, height = im.size 137 | # reject if too large (might be a DOS decompression bomb) 138 | if width * height > 89478484: 139 | error_code += 4 140 | else: 141 | im_format = im.format 142 | out_fname = f"{img_output_folder}{str(sample_id)}.{im_format.lower()}" 143 | # reject if format is not in this list 144 | if im_format not in ["JPEG", "JPG", "PNG", "WEBP"]: 145 | error_code += 2 146 | if min(width, height) > 224: 147 | im = _resize(im) 148 | 149 | # convert all images to RGB (necessary for CLIP, also CLIP is doing it again so do we need it here?) 150 | if im.mode != "RGB": 151 | im = im.convert("RGB") 152 | if error_code == 8: 153 | im.save(out_fname) # do not retain images we do not need 154 | except (KeyError, UnidentifiedImageError): 155 | out_fname = "" 156 | width = 0 157 | height = 0 158 | error_code += 1 159 | 160 | if error_code == 8: 161 | error_code = 2 # mark succesful lines with status = 2 162 | 163 | return [str(sample_id), out_fname, response.url, alt_text, width, height, license, language, error_code] 164 | 165 | 166 | async def request_image(parsed_df, i): 167 | """ 168 | This function initiates many parallel async connections to try download the images from provided links 169 | 170 | input: dataset of validated links, the sample id to start with 171 | 172 | output: list of lists with succesfully downloaded images and their parameters. this list is dumped on disk as json file 173 | """ 174 | tmp_data = [] 175 | limit = trio.CapacityLimiter(1000) 176 | 177 | # change the number of parallel connections based on CPU speed, network capabilities, etc. 
178 | # the number of 192 is optimized for 1 vCPU droplet at Hetzner Cloud (code CX11) 179 | session = asks.Session(connections=64, ssl_context=ssl_ctx) 180 | 181 | software_names = [SoftwareName.CHROME.value] 182 | operating_systems = [OperatingSystem.LINUX.value] 183 | 184 | user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=2000) 185 | user_agent = user_agent_rotator.get_random_user_agent() 186 | 187 | # try to make the bot website friendly 188 | session.headers = { 189 | "User-Agent": user_agent, 190 | "Accept-Language": "en-US,en;q=0.5", 191 | "Accept-Encoding": "gzip, deflate", 192 | "Referer": "https://google.com", 193 | "DNT": "1", 194 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 195 | } 196 | 197 | async def _request(row, i): 198 | start=time.time() 199 | sample_id = row[0] 200 | url = row[1] 201 | alt_text = row[2] 202 | license = row[3] 203 | language = row[4] 204 | # the following 2 lines are related to Trio Instrument to capture events from multiple threads 205 | task = trio.lowlevel.current_task() 206 | try: 207 | response = await session.get(url, timeout=10, connection_timeout=20) 208 | dltime = round(time.time()-start, 2) 209 | start=time.time() 210 | proces = process_img_content( 211 | # tune timeout and connection_timeout to grab more or less files. shorter timeouts will exclude bad performing websites 212 | response, alt_text, license, sample_id, language, i 213 | ) 214 | proctime = round(time.time()-start, 2) 215 | task.custom_sleep_data = (0, dltime, proctime) # for success do not count errors 216 | if proces is not None: 217 | tmp_data.append(proces) 218 | except Exception as e: 219 | log(e) 220 | task.custom_sleep_data = (1, 0, round(time.time()-start,2)) # when exception is hit, count it 221 | 222 | async with trio.open_nursery() as n: 223 | for index, row in parsed_df.iterrows(): 224 | async with limit: 225 | n.start_soon(_request, row, i) 226 | 227 | # trio makes sure at this point all async tasks were executed 228 | with open(f"./{i}/.tmp/{uuid1()}.json", "w") as f: 229 | ujson.dump(tmp_data, f) 230 | gc.collect() 231 | 232 | return 233 | 234 | 235 | def dl_wat(parsed_df, i, pbar): # replace valid data and start sampleid with parsed_df 236 | """ 237 | This function initiates download attempt of validated parsed links 238 | It launches multithreaded tasks by using trio module 239 | 240 | input: dataset of validated links, the sample id to start with 241 | 242 | output: dataframe of downloaded images and their parameters 243 | """ 244 | 245 | # Download every image available 246 | processed_samples = [] 247 | #trio.run(request_image, valid_data, first_sample_id, instruments=[TrioProgress(len(valid_data), False)] ) 248 | trio.run( request_image, parsed_df, i, instruments=[Tracer(pbar)] ) 249 | 250 | for tmpf in glob(f"./{i}/.tmp/*.json"): 251 | processed_samples.extend(ujson.load(open(tmpf))) 252 | return pd.DataFrame( 253 | processed_samples, 254 | columns=["SAMPLE_ID", "PATH", "URL", "TEXT", "HEIGHT", "WIDTH", "LICENSE", "LANGUAGE", "STATUS"], 255 | ) 256 | 257 | def upload(source: str, clientType: str, target: str): 258 | with tarfile.open(f"{source}.tar.gz", "w:gz") as tar: 259 | tar.add(source, arcname=os.path.basename(source)) 260 | result = os.system(f"rsync -av {source}.tar.gz {target}") 261 | if os.path.exists(f"{source}.tar.gz"): 262 | os.remove(f"{source}.tar.gz") 263 | if os.path.exists(f"{source}"): 264 | shutil.rmtree(f"{source}", ignore_errors=True) 265 | return 
result 266 | 267 | def newJob(engine, dataset, depth, tablesample): 268 | # selection on domains based on distribution of URLs per domain 269 | select_stmt1 = f"UPDATE dataset_{dataset} SET status = 1 WHERE sampleid IN (SELECT sampleid FROM dataset_{dataset} TABLESAMPLE SYSTEM ({tablesample}) WHERE status = 0 LIMIT {depth} FOR UPDATE SKIP LOCKED) AND status = 0 RETURNING sampleid" 270 | conn = engine.raw_connection() 271 | cur = conn.cursor() 272 | cur.execute(select_stmt1) 273 | result = cur.fetchall() 274 | conn.commit() 275 | cur.close() 276 | 277 | values = ",".join([str(tuple[0]) for tuple in result]) 278 | select_stmt2 = f"SELECT sampleid, url, text, license, language FROM dataset_{dataset} WHERE sampleid in ({values})" 279 | df = pd.read_sql_query(select_stmt2, conn) 280 | conn.close() 281 | return df 282 | 283 | def completeJob2(engine, prefix, parsed_df, dlparse_df, dataset): 284 | # prepare data for EN 285 | values2 = ",".join(parsed_df["sampleid"].astype(str)) 286 | update_stmt1 = "" 287 | for i, row in dlparse_df.iterrows(): 288 | update_stmt1 += f'UPDATE dataset_nolang SET status={row["STATUS"]}, width={row["HEIGHT"]}, height={row["WIDTH"]} where sampleid = {row["SAMPLE_ID"]};' 289 | # this is intentional mix between width and heigth to account for the but in previous laion release 290 | # the csv will go scrambled but in database we want good values 291 | insert_stmt = f"INSERT INTO jobs_{dataset} (jobid) VALUES ('{prefix}')" 292 | 293 | if len(dlparse_df.index > 0): 294 | conn = engine.raw_connection() 295 | cur = conn.cursor() 296 | cur.execute(update_stmt1) 297 | cur.execute(insert_stmt) 298 | conn.commit() 299 | cur.close() 300 | conn.close() 301 | 302 | # in case there are samples unaccounted for, we try to mark them with general error status 303 | update_stmt2 = f"UPDATE dataset_{dataset} SET status = 9 where status = 1 AND sampleid in ({values2})" 304 | 305 | conn = engine.raw_connection() 306 | cur = conn.cursor() 307 | cur.execute(update_stmt2) 308 | conn.commit() 309 | cur.close() 310 | conn.close() 311 | return 312 | 313 | def worker(engine, i, dataset, depth, tablesample, target): 314 | 315 | # initialize working folders 316 | tmp_folder = f"./{i}/.tmp/" 317 | output_folder = f"./{i}/save/" 318 | img_output_folder = output_folder + "images/" 319 | 320 | while True: 321 | try: 322 | start = time.time() 323 | start0 = start 324 | 325 | parsed_df = newJob(engine, dataset, depth, tablesample) 326 | 327 | prefix = uuid.uuid4().hex 328 | result = 0 329 | 330 | # clear working folders for a new job 331 | if os.path.exists(output_folder): 332 | shutil.rmtree(output_folder, ignore_errors=True) 333 | if os.path.exists(tmp_folder): 334 | shutil.rmtree(tmp_folder, ignore_errors=True) 335 | 336 | os.makedirs(output_folder) 337 | os.makedirs(img_output_folder) 338 | os.makedirs(tmp_folder) 339 | 340 | # compute output file names base 341 | out_fname = f"3_staged_workflow_job_{prefix}_full_wat" 342 | print(f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] Job acquired in {round(time.time()-start,2)} sec") 343 | start = time.time() 344 | 345 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] This job has {len(parsed_df)} candidates") 346 | pbar = tqdm(total=len(parsed_df),position=i,desc=f"worker {i}") 347 | 348 | # attempt to download validated links and save to disk for stats and blocking lists 349 | dlparse_df = dl_wat(parsed_df, i, pbar) 350 | dlparse_df_save = dlparse_df[dlparse_df["STATUS"]==2] # remove rejected items from gpu jobs 351 | 
dlparse_df_save.to_csv(output_folder + out_fname + ".csv", index=False, sep="|") 352 | # at this point we finishes the CPU node job, need to make the data available for GPU worker 353 | os.mkdir(prefix) 354 | os.system(f"mv ./{i}/save/* {prefix}/") 355 | result += upload(prefix, "CPU", target) #todo find the IP and endpoint 356 | if result == 0: 357 | completeJob2(engine, prefix, parsed_df, dlparse_df, dataset) 358 | 359 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] pairs retained {len(dlparse_df_save)} in {round(time.time() - start, 2)}") 360 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] scraping efficiency {len(dlparse_df_save)/(time.time() - start)} img/sec") 361 | print (f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] crawling efficiency {len(parsed_df)/(time.time() - start)} links/sec") 362 | 363 | 364 | last = round(time.time() - start0) 365 | 366 | print(f"[{datetime.now().strftime('%H:%M:%S')} stats {i}] Job completed in {last} seconds") 367 | 368 | except Exception as e: 369 | print (e) 370 | print (f"{datetime.now().strftime('%H:%M:%S')} Worker {i} crashed") 371 | time.sleep(60) 372 | 373 | if __name__ == "__main__": 374 | 375 | print (f"starting session") 376 | 377 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -s/--set -d/--depth') 378 | parser.add_argument("-s","--set",action='append',help="Which dataset to download (en, intl, nolang)", required=False) 379 | parser.add_argument("-d","--depth",action='append',help="How many samples to download (10000)", required=False) 380 | parser.add_argument("-t","--tablesample",action='append',help="Tablesample ratio (0.05)", required=False) 381 | parser.add_argument("-r","--rsync",action='append',help="Rsync target where to store results", required=False) 382 | parser.add_argument("-c","--cpus",action='append',help="How many cpus to use",required=False) 383 | args = parser.parse_args() 384 | 385 | dataset = "en" 386 | if args.set is not None: 387 | dataset = args.set[0] 388 | 389 | depth = 10000 390 | if args.depth is not None: 391 | depth = int(args.depth[0]) 392 | 393 | tablesample = 0.05 394 | if args.tablesample is not None: 395 | tablesample = float(args.tablesample[0]) 396 | 397 | print(tablesample) 398 | time.sleep(30) 399 | 400 | target = "archiveteam@176.9.4.150::gpujobsnolang" 401 | if args.rsync is not None: 402 | target = args.rsync[0] 403 | 404 | procs = cpu_count() 405 | if args.cpus is not None and int(args.cpus[0]) > 0: 406 | procs = int(args.cpus[0]) 407 | 408 | params = config() 409 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_size=procs, max_overflow=int(procs*1.5), pool_recycle=60, pool_pre_ping=True ) 410 | 411 | for i in range(procs): 412 | Process(target=worker, args=[engine, i, dataset, depth, tablesample, target], daemon=True).start() 413 | 414 | try: 415 | while True: 416 | time.sleep(30) 417 | except KeyboardInterrupt: 418 | sys.exit() 419 | -------------------------------------------------------------------------------- /ccpp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import ftfy 5 | import ujson 6 | import gcld3 7 | import uuid 8 | import shutil 9 | import argparse 10 | import hashlib 11 | import tarfile 12 | import psycopg2 13 | import requests 14 | import numpy as np 15 | import pandas as pd 16 | from tqdm.auto import tqdm 17 | from random import randint 18 | from datetime import 
datetime 19 | from sqlalchemy import create_engine 20 | from configparser import ConfigParser 21 | from urllib.parse import urlparse, urljoin 22 | from django.core.validators import URLValidator 23 | from django.core.exceptions import ValidationError 24 | from multiprocessing import Process, cpu_count 25 | from crawlingathome_client.temp import TempCPUWorker 26 | 27 | 28 | def config(filename='database.ini', mode="test"): 29 | # create a parser 30 | parser = ConfigParser() 31 | # read config file 32 | parser.read(filename) 33 | 34 | section='postgresql' 35 | if mode == "production": 36 | section='cah_production' 37 | 38 | # get section, default to postgresql 39 | db = {} 40 | if parser.has_section(section): 41 | params = parser.items(section) 42 | for param in params: 43 | db[param[0]] = param[1] 44 | else: 45 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 46 | 47 | return db 48 | 49 | def is_valid_url(url_string: str) -> bool: 50 | validate_url = URLValidator() 51 | try: 52 | validate_url(url_string) 53 | except ValidationError as e: 54 | return False 55 | return True 56 | 57 | def log(e): 58 | with open("errors.txt","a") as f: 59 | f.write(str(e.__class__.__name__) + " " + str(e) + "\n") 60 | 61 | def remove_bad_chars(text): 62 | # cleanup text so language can be detected 63 | return "".join(c for c in text if c.isprintable()) 64 | 65 | def timeit(debug, tick, msg): 66 | if not debug: 67 | return 68 | else: 69 | print (f"{msg} time chunk {round(time.time()-tick,2)} sec.") 70 | return time.time() 71 | 72 | 73 | def parse_wat(content, i, debug): 74 | tick = time.time() 75 | """ 76 | This function checks the wat file content and attempts to extract valid candidates of image urls and alt texts 77 | 78 | input: content = wat file content; start = start line number; line_count = how many lines to parse 79 | usually a wat file is split in 2 halfs or 2 shards. shard 0 starts at the first line and line_count is about 1/2 of wat file lines 80 | shard 1 starts at the middle of wat file and ends with the last line of wat 81 | 82 | output: a list of tuples (url, text, license, domain, hash) 83 | """ 84 | 85 | bloomip = "116.202.162.146" 86 | bloom2ip = "94.130.167.172" 87 | 88 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] start parsing") 89 | tick = timeit(debug, tick, "start parsing") 90 | 91 | detector = gcld3.NNetLanguageIdentifier(min_num_bytes=5, max_num_bytes=2000) 92 | 93 | clpd = 0 94 | valid_data = [] 95 | check_flag = set() # track urls and make them unique 96 | content.seek(0) 97 | 98 | for line in tqdm(content, position=i, desc=f"{i} parser"): 99 | if "IMG@" not in line: 100 | continue 101 | line_str = line.strip() 102 | data = ujson.loads(line_str) 103 | # find all links inside the line 104 | linklist = data["Envelope"]["Payload-Metadata"]["HTTP-Response-Metadata"]["HTML-Metadata"]["Links"] 105 | # get base url 106 | base_url = os.path.dirname( 107 | data["Envelope"]["WARC-Header-Metadata"]["WARC-Target-URI"] 108 | ) 109 | license = "?" 
110 | for e in linklist: 111 | if "url" in e and "creativecommons.org/licenses/" in e["url"]: 112 | license = e["url"][0:80].replace("\n","").replace('\\','\\\\') 113 | if not "url" in e: 114 | continue 115 | url = e["url"][0:2000].replace("\n","").replace('\\','\\\\') 116 | try: 117 | if not is_valid_url(url): 118 | continue 119 | except: 120 | continue 121 | # reject links of svg, gif or scripted images content 122 | if any( x in url for x in {".svg", ".gif", "data:image", "javascript:"} ): 123 | continue 124 | try: 125 | domain = urlparse(url).hostname 126 | except: 127 | continue 128 | if domain is None or domain == "": 129 | continue 130 | if len(str(domain)) > 60: 131 | continue 132 | detlang = "" 133 | alt_text = "" 134 | try: 135 | if "alt" in e: 136 | # detect ALT text language 137 | alt_text = ftfy.fix_text(e["alt"].replace("\n", " ")).strip() 138 | alt_text = remove_bad_chars(alt_text) 139 | res = detector.FindLanguage(alt_text) 140 | detlang = res.language 141 | rel = res.is_reliable 142 | if not rel: 143 | detlang = "" 144 | except: 145 | pass 146 | # keep pair or just url if we made it so far 147 | """ 148 | if detlang in ['bn', 'co', 'eo', 'fil', 'fy', 'gd', 'ha', 'haw', 'hmn', 'ig', 'km', 'ku', 'ky', 'lo', 'mi', 'mn', 'mt', 'ny', 'sd', 'si', 'sm', 'sn', 'so', 'st', 'su', 'sw', 'xh', 'yi', 'zu']: 149 | """ 150 | # get rid of Latn suffix when detected 151 | if detlang != "": 152 | detlang = detlang.split("-")[0] 153 | if alt_text == "" or alt_text is None: 154 | continue 155 | if len(alt_text) < 5: 156 | continue 157 | alt_text = alt_text[0:2000].replace("\t"," ").replace("\n"," ").replace('\\','\\\\').replace('|', ' ') # will use tab as field separator for copy source 158 | if not url.startswith("http"): 159 | url = urljoin(base_url, url) 160 | hash = hashlib.md5((url + alt_text).encode("utf-8")).hexdigest() 161 | if url not in check_flag: 162 | valid_data.append((url, alt_text, license, domain, detlang, hash)) 163 | check_flag.add(url) 164 | 165 | 166 | tick = timeit(debug, tick, "loop finished") 167 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] lenght of pairs to filter {len(valid_data)}") 168 | s = time.time() 169 | 170 | if len(valid_data) > 0: 171 | s = time.time() 172 | # remove from valid_data elements rejected by parsed bloom server 173 | with open(f'{i}/hash.txt', 'w') as f: 174 | for item in valid_data: 175 | f.write(item[0].strip()+"\n") 176 | post = { 177 | 'file': ('hash.txt', open(f'{i}/hash.txt', 'rb')), 178 | 'key': (None, 'parsed'), 179 | } 180 | 181 | tick = timeit(debug, tick, "parsed bloom prepared") 182 | failure = True 183 | for _ in range(10): 184 | try: 185 | response = requests.post(f'http://{bloom2ip}:8000/deduplicate/', files=post) 186 | if response.status_code != 200: 187 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] bloom server error, retrying... 
got {response.status_code}") 188 | time.sleep(randint(5,30)) 189 | else: 190 | failure = False 191 | break 192 | except: 193 | time.sleep(30) 194 | if failure: 195 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] crash, cannot contact the parsed bloom server, please fix") 196 | return (None, 0, 0) 197 | 198 | valid_urls = set(response.content.decode("utf-8").split("\n")) 199 | 200 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] parsed bloom server returned {len(valid_urls)} in {round(time.time()-s,3)} sec") 201 | tick = timeit(debug, tick, "parsed bloom done") 202 | 203 | valid_data = [t for t in {tuple(i) for i in valid_data}] 204 | 205 | final_kept_data = [] 206 | prsd = len(valid_data) 207 | 208 | for item in valid_data: 209 | if item[0].strip() in valid_urls: 210 | final_kept_data.append(item) 211 | prsd -= 1 212 | 213 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] lenght of deduplicated pairs to return {len(final_kept_data)}") 214 | 215 | if len(final_kept_data) > 0: 216 | # add parsed urls to parsed bloom server 217 | with open('hash.txt', 'w') as f: 218 | for url in final_kept_data: 219 | f.write(url[0].strip()+"\n") 220 | post = { 221 | 'file': ('hash.txt', open('hash.txt', 'rb')), 222 | 'key': (None, 'parsed'), 223 | } 224 | 225 | tick = timeit(debug, tick, "add to parsed bloom prepared") 226 | failure = True 227 | for _ in range(10): 228 | try: 229 | response = requests.post(f'http://{bloom2ip}:8000/add/', files=post) 230 | if response.status_code != 200: 231 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] bloom server error, retrying... got {response.status_code}") 232 | time.sleep(randint(5,30)) 233 | else: 234 | failure = False 235 | print(f"bloom add response: {response.text}") 236 | break 237 | except: 238 | time.sleep(15) 239 | if failure: 240 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] crash, cannot contact the parsed bloom server, please fix") 241 | return (None, 0, 0) 242 | 243 | tick = timeit(debug, tick, "add to parsed bloom done") 244 | 245 | return (final_kept_data, clpd, prsd) # use a dict in order to remove duplicate tuples from list 246 | 247 | def proc_worker(i: int, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL, engine, host, debug, current_set): 248 | # initialize working folders 249 | tmp_folder = f"./{i}/.tmp/" 250 | 251 | if os.path.exists(tmp_folder): 252 | shutil.rmtree(tmp_folder) 253 | 254 | # connect to C@H server and initialize client 255 | client = TempCPUWorker(url=CRAWLINGATHOME_SERVER_URL, nickname=YOUR_NICKNAME_FOR_THE_LEADERBOARD) 256 | 257 | # initialize stats variables for previous job 258 | last = 0 259 | 260 | # this makes a loop to download new jobs while the script is running 261 | # normally it reads while client.jobCount() > 0 262 | conn = engine.raw_connection() 263 | while True: 264 | try: 265 | # clean the folder 266 | if os.path.exists(f"{i}"): 267 | shutil.rmtree(f"{i}", ignore_errors=True) 268 | os.makedirs(tmp_folder) 269 | 270 | tick = time.time() 271 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] clock is {datetime.now().strftime('%H:%M:%S')}") 272 | 273 | start = time.time() 274 | start0 = start 275 | 276 | # get new job and download the wat file 277 | client.newJob() 278 | tick = timeit(debug, tick, "got new job") 279 | client.downloadWat(tmp_folder) 280 | tick = timeit(debug, tick, "downloaded wat") 281 | 282 | #fix tracker db error 283 | client.shards = client.shards[0:2] 284 | 285 | print 
(f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] downloaded wat in {round(time.time()-start,2)}") 286 | start = time.time() 287 | 288 | first_sample_id = np.int64(client.shards[0][1]["start_id"]) 289 | 290 | # parse valid links from wat file 291 | with open(tmp_folder + "shard.wat", "r") as infile: 292 | parsed_data, clpd, prsd = parse_wat(infile, i, debug) 293 | 294 | if parsed_data is None: 295 | continue 296 | tick = timeit(debug, tick, "parsing finalized") 297 | 298 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] parsed wat in {round(time.time()-start,2)}") 299 | start = time.time() 300 | 301 | # convert to dataframe and save to disk (for statistics and generating blocking lists) 302 | if len(parsed_data) > 0: 303 | parsed_df = pd.DataFrame(parsed_data, columns=["url","text","license","domain","language","hash"]) 304 | parsed_df = parsed_df.drop_duplicates(subset=["url"]) 305 | parsed_df.insert(0, 'sampleid', range(first_sample_id, first_sample_id + len(parsed_df))) 306 | parsed_df["wat"] = int(client.shards[-1][0]) 307 | parsed_df = parsed_df[["sampleid","url","text","license","domain","wat","hash","language"]] 308 | 309 | # postgres should only ingest current working data not all 310 | en_df = parsed_df[parsed_df["language"]=="en"] 311 | nolang_df = parsed_df[parsed_df["language"]==""] 312 | multilang_df = parsed_df[(parsed_df["language"]!="en") & (parsed_df["language"]!="")] 313 | 314 | not_nolang = parsed_df[(parsed_df["language"]!="")] 315 | 316 | tick = timeit(debug, tick, "dataframe preparation done") 317 | current = en_df 318 | if current_set == "": 319 | current = nolang_df 320 | print(f"currently working on nolang dataset") 321 | if current_set == "multilang": 322 | current = multilang_df 323 | print(f"currently working on multilang dataset") 324 | 325 | tick = timeit(debug, tick, "before sql copy") 326 | not_nolang.to_csv(f"{i}/export_sql.txt", sep='\t', index=False, header=False) 327 | 328 | cur = conn.cursor() 329 | with open(f"{i}/export_sql.txt", "rt") as f: 330 | cur.copy_from(f, 'dataset_buffer', columns=("sampleid","url","text","license","domain","wat","hash","language")) 331 | conn.commit() 332 | cur.close() 333 | 334 | tick = timeit(debug, tick, "finished sql copy") 335 | 336 | uid = uuid.uuid4().hex 337 | 338 | nolang_df.to_csv(f"{i}/nolang-{uid}.txt", sep='\t', index=False, header=False) 339 | os.system(f"rsync -amv --include='*{uid}.txt' --include='*/' --exclude='*' ./{i}/ postgres@185.154.158.196::aidb") 340 | 341 | ''' 342 | if not current.equals(en_df): 343 | en_df.to_csv(f"{i}/en-{uid}.txt", sep='\t', index=False, header=False) 344 | if not current.equals(nolang_df): 345 | nolang_df.to_csv(f"{i}/nolang-{uid}.txt", sep='\t', index=False, header=False) 346 | if not current.equals(multilang_df): 347 | multilang_df.to_csv(f"{i}/intl-{uid}.txt", sep='\t', index=False, header=False) 348 | 349 | os.system(f"rsync -amv --include='*{uid}.txt' --include='*/' --exclude='*' ./{i}/ postgres@185.154.158.196::aidb") 350 | ''' 351 | 352 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] saved links in {round(time.time()-start,2)}") 353 | 354 | lastlinks = len(parsed_data) 355 | en_pairs = len(en_df.index) 356 | nolang_pairs = len(nolang_df.index) 357 | intl_pairs = len(multilang_df.index) 358 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] this job has {lastlinks} links left after removing {clpd} already clipped and {prsd} already parsed") 359 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] links are split into 
{en_pairs} english, {intl_pairs} multilanguage and {nolang_pairs} without language") 360 | with open("datapoints.txt", "a") as f: 361 | f.write(f"{time.time()}\t{en_pairs}\t{intl_pairs}\t{nolang_pairs}\n") 362 | else: 363 | print(f"This WAT file does not contain any useful candidate") 364 | 365 | prefixes = {} 366 | prefixes[str(client.shards[0][0])] = f"postgres {host}" 367 | prefixes[str(client.shards[1][0])] = f"postgres {host}" 368 | client.completeJob(prefixes) 369 | tick = timeit(debug, tick, "executed complete job") 370 | 371 | last = round(time.time() - start0) 372 | print(f"[{datetime.now().strftime('%H:%M:%S')} {i} stats] WAT job completed in {last} seconds") 373 | 374 | except Exception as e: 375 | print (f"[{datetime.now().strftime('%H:%M:%S')} exception {i} parser] {e}") 376 | print (f"[{datetime.now().strftime('%H:%M:%S')} {i} parser] worker crashed") 377 | time.sleep(60) 378 | client = TempCPUWorker(url=CRAWLINGATHOME_SERVER_URL, nickname=YOUR_NICKNAME_FOR_THE_LEADERBOARD) 379 | conn.close() 380 | 381 | if __name__ == '__main__': 382 | 383 | parser = argparse.ArgumentParser(prog=sys.argv[0], usage='%(prog)s -m/--mode -c/--cpus -n/--name -d/--debug') 384 | parser.add_argument("-n","--name",action='append',help="Your leaderboard nickname",required=False) 385 | parser.add_argument("-c","--cpus",action='append',help="How many cpus to use",required=False) 386 | parser.add_argument("-d","--debug",action='append',help="Print debug lines?",required=False) 387 | parser.add_argument("-m","--mode",action='append',help="Mode to run",required=True) 388 | parser.add_argument("-s","--set",action='append',help="Choose current set (en, nolang, multilang)",required=True) 389 | args = parser.parse_args() 390 | 391 | # initialize client variables 392 | YOUR_NICKNAME_FOR_THE_LEADERBOARD = None 393 | if args.name is not None: 394 | YOUR_NICKNAME_FOR_THE_LEADERBOARD = " ".join(args.name) 395 | 396 | if YOUR_NICKNAME_FOR_THE_LEADERBOARD in (None,""): 397 | YOUR_NICKNAME_FOR_THE_LEADERBOARD = "ccpp-dev" 398 | CRAWLINGATHOME_SERVER_URL = "http://cah.io.community/" 399 | 400 | print (f"starting session under `{YOUR_NICKNAME_FOR_THE_LEADERBOARD}` nickname") 401 | 402 | procs = cpu_count() 403 | if args.cpus is not None and int(args.cpus[0]) > 0: 404 | procs = int(args.cpus[0]) 405 | 406 | debug = False 407 | if args.debug is not None and args.debug[0] == "true": 408 | debug = True 409 | 410 | params = config(mode=args.mode[0]) 411 | 412 | engine = create_engine(f'postgresql://{params["user"]}:{params["password"]}@{params["host"]}:5432/{params["database"]}', pool_size=procs, max_overflow=int(procs*1.5), pool_pre_ping=True) 413 | workers = [] 414 | for i in range ( procs ): 415 | #use this queue to annount that bloom is currently processing and please do not update filters. if queue is not empty please wait, if queue is empty you may update filters 416 | workers.append(Process(target=proc_worker, args= [i, YOUR_NICKNAME_FOR_THE_LEADERBOARD, CRAWLINGATHOME_SERVER_URL, engine, params["host"], debug, args.set[0]], daemon=True)) 417 | 418 | time.sleep(10) 419 | 420 | for worker in workers: 421 | worker.start() 422 | time.sleep(8) 423 | 424 | while True: 425 | #keep main process alive 426 | time.sleep(60) 427 | --------------------------------------------------------------------------------
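Note on configuration: both dbdl.py and ccpp.py load their PostgreSQL credentials through their config() helpers, which parse a local database.ini with ConfigParser. ccpp.py selects the section from the -m/--mode flag (the postgresql section unless mode is production, which switches to cah_production), while dbdl.py reads cah_production directly. The file itself is not part of the repository, so the sketch below is only an illustration of the layout those helpers expect, with placeholder values:

[postgresql]
host=<>
database=<>
user=<>
password=<>

[cah_production]
host=<>
database=<>
user=<>
password=<>

The keys mirror what the scripts actually read (params["host"], params["database"], params["user"], params["password"]) when building the SQLAlchemy connection string postgresql://user:password@host:5432/database, so the target PostgreSQL server is assumed to listen on port 5432.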