├── ,gitattributes ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── api.py ├── app.py ├── captcha.py ├── captchas └── .gitkeep ├── common.py ├── config.py ├── database.py ├── do_recrawl.py ├── docker-compose.yml ├── export.py ├── high_level_diagram.dia ├── high_level_diagram.png ├── init_script.sql ├── main.py ├── mass_import.py ├── od_util.py ├── reddit_bot.py ├── requirements.txt ├── search ├── __init__.py ├── filter.py └── search.py ├── static ├── Hack-Regular.ttf ├── css │ ├── bootstrap.min.css │ ├── fa-brands.css │ ├── fa-brands.min.css │ ├── fa-regular.css │ ├── fa-regular.min.css │ ├── fa-solid.css │ ├── fa-solid.min.css │ ├── fontawesome-all.css │ ├── fontawesome-all.min.css │ ├── fontawesome.css │ ├── fontawesome.min.css │ ├── ion.rangeSlider.css │ ├── ion.rangeSlider.skinFlat.css │ ├── main.css │ └── style.css ├── downloads │ └── README.md ├── img │ ├── bg.png │ ├── forkme_right_white_ffffff.png │ └── sprite-skin-flat.png ├── js │ ├── Chart.min.js │ ├── bootstrap.min.js │ ├── ion.rangeSlider.min.js │ ├── jquery.min.js │ ├── popper.min.js │ ├── report.js │ └── script.js └── webfonts │ ├── fa-brands-400.eot │ ├── fa-brands-400.svg │ ├── fa-brands-400.ttf │ ├── fa-brands-400.woff │ ├── fa-brands-400.woff2 │ ├── fa-regular-400.eot │ ├── fa-regular-400.svg │ ├── fa-regular-400.ttf │ ├── fa-regular-400.woff │ ├── fa-regular-400.woff2 │ ├── fa-solid-900.eot │ ├── fa-solid-900.svg │ ├── fa-solid-900.ttf │ ├── fa-solid-900.woff │ └── fa-solid-900.woff2 ├── tasks.py ├── template_filters.py ├── templates ├── admin.html ├── contribute.html ├── dashboard.html ├── downloads.html ├── home.html ├── layout.html ├── search.html ├── stats.html ├── submit.html ├── website.html └── websites.html ├── tt_config.yml ├── uwsgi.ini └── views.py /,gitattributes: -------------------------------------------------------------------------------- 1 | static/css/* linguist-vendored 2 | static/css/main.css linguist-vendored=false 3 | static/js/* linguist-vendored -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | /static/downloads/ 3 | !/static/downloads/README.md 4 | __pycache__/ 5 | captchas/ 6 | _stats.json 7 | oddb.log 8 | praw.ini 9 | env/ 10 | worker.json 11 | search_blacklist.txt 12 | *.iml 13 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "fold_to_ascii"] 2 | path = fold_to_ascii 3 | url = https://github.com/spanishdict/fold_to_ascii 4 | [submodule "task_tracker_drone"] 5 | path = task_tracker_drone 6 | url = https://github.com/simon987/task_tracker_drone 7 | [submodule "ws_bucket_client"] 8 | path = ws_bucket_client 9 | url = https://github.com/simon987/ws_bucket_client 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | WORKDIR /app 4 | 5 | ADD requirements.txt /app/requirements.txt 6 | RUN pip install -r requirements.txt 7 | 8 | ENTRYPOINT ["python", "app.py"] 9 | 10 | COPY . 
/app 11 | 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Simon Fortier 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OD-Database 2 | 3 | OD-Database is a web-crawling project that aims to index a very large number of file links and their basic metadata from open directories (misconfigured Apache/Nginx/FTP servers, or more often, mirrors of various public services). 4 | 5 | Each crawler instance fetches tasks from the central server and pushes the results once completed. A single instance can crawl hundreds of websites at the same time (both FTP and HTTP(S)), and the central server is capable of ingesting thousands of new documents per second. 6 | 7 | The data is indexed into Elasticsearch and made available via the web frontend (currently hosted at https://od-db.the-eye.eu/). There are currently ~1.93 billion files indexed (a total of about 300 GB of raw data). The raw data is made available as a CSV file [here](https://od-db.the-eye.eu/dl). 8 | 9 | ![2018-09-20-194116_1127x639_scrot](https://user-images.githubusercontent.com/7120851/45852325-281cca00-bd0d-11e8-9fed-49a54518e972.png) 10 | 11 | 12 | ### Contributing 13 | Suggestions/concerns/PRs are welcome. 14 | 15 | ## Installation (Docker) 16 | ```bash 17 | git clone --recursive https://github.com/simon987/od-database 18 | cd od-database 19 | mkdir oddb_pg_data/ tt_pg_data/ es_data/ wsb_data/ 20 | docker-compose up 21 | ``` 22 | 23 | ## Architecture 24 | 25 | ![diag](high_level_diagram.png) 26 | 27 | ## Running the crawl server 28 | The Python crawler that was part of this project has been discontinued; 29 | [the Go implementation](https://github.com/terorie/od-database-crawler) is currently in use. 
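
## Example: querying the search API

A minimal sketch of how an API client could call the `/api/search` endpoint defined in `api.py`. It assumes an instance reachable at `http://localhost:5020` (the port published for the `oddb` service in `docker-compose.yml`) and a valid API token; both values below are placeholders.

```python
# Minimal sketch: POSTing a query to /api/search (see api.py).
# Base URL and token are placeholders -- substitute your own deployment values.
import requests

ODDB_URL = "http://localhost:5020"  # oddb service port from docker-compose.yml
API_TOKEN = "00000000-0000-0000-0000-000000000000"  # placeholder API token

payload = {
    "token": API_TOKEN,
    "query": "ubuntu iso",
    "page": 0,
    "per_page": 50,
    "sort_order": "score",      # one of the keys in ElasticSearchEngine.SORT_ORDERS
    "extensions": ["iso"],
    "size_min": 0,
    "size_max": None,
    "match_all": True,
    "fields": ["name", "path"],
    "date_min": 0,
    "date_max": None,
}

r = requests.post(ODDB_URL + "/api/search", json=payload)
page = r.json()  # Elasticsearch-style response, joined with website URLs

for hit in page["hits"]["hits"]:
    src = hit["_source"]
    print(src["website_url"], src["path"], src["name"])
```
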
30 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | import json 2 | from uuid import uuid4 3 | 4 | from flask import request, abort, send_file, session 5 | 6 | import captcha 7 | import common as oddb 8 | from common import taskManager 9 | from database import Website 10 | from search.search import InvalidQueryException 11 | 12 | 13 | def setup_api(app): 14 | taskManager.start_indexer_threads() 15 | 16 | @app.route("/api/website/by_url", methods=["GET"]) 17 | def api_website_by_url(): 18 | token = request.args.get("token") 19 | name = oddb.db.check_api_token(token) 20 | 21 | if name: 22 | url = request.args.get("url") 23 | website = oddb.db.get_website_by_url(url) 24 | oddb.logger.info("API get website by url '" + url + "' by " + name) 25 | if website: 26 | return str(website.id) 27 | return abort(404) 28 | else: 29 | return abort(403) 30 | 31 | @app.route("/api/website/blacklisted", methods=["GET"]) 32 | def api_website_is_blacklisted(): 33 | token = request.args.get("token") 34 | url = request.args.get("url") 35 | name = oddb.db.check_api_token(token) 36 | 37 | if name: 38 | oddb.logger.info("API get website is blacklisted '" + url + "' by " + name) 39 | return str(oddb.db.is_blacklisted(url)) 40 | else: 41 | return abort(403) 42 | 43 | @app.route("/api/website/add", methods=["GET"]) 44 | def api_add_website(): 45 | token = request.args.get("token") 46 | url = request.args.get("url") 47 | 48 | name = oddb.db.check_api_token(token) 49 | if name: 50 | 51 | website_id = oddb.db.insert_website(Website(url, str(request.remote_addr + "_" + 52 | request.headers.get("X-Forwarded-For", "")), 53 | "API_CLIENT_" + name)) 54 | oddb.logger.info("API add website '" + url + "' by " + name + "(" + str(website_id) + ")") 55 | return str(website_id) 56 | else: 57 | return abort(403) 58 | 59 | @app.route("/api/website/random") 60 | def api_random_website(): 61 | token = request.json["token"] 62 | name = oddb.db.check_api_token(token) 63 | 64 | if name: 65 | oddb.logger.info("API get random website by " + name) 66 | return str(oddb.db.get_random_website_id()) 67 | else: 68 | return abort(403) 69 | 70 | @app.route("/api/search", methods=["POST"]) 71 | def api_search(): 72 | token = request.json["token"] 73 | name = oddb.db.check_api_token(token) 74 | 75 | if name: 76 | 77 | try: 78 | hits = oddb.searchEngine.search( 79 | request.json["query"], 80 | request.json["page"], request.json["per_page"], 81 | request.json["sort_order"], 82 | request.json["extensions"], 83 | request.json["size_min"], request.json["size_max"], 84 | request.json["match_all"], 85 | request.json["fields"], 86 | request.json["date_min"], request.json["date_max"] 87 | ) 88 | 89 | hits = oddb.db.join_website_on_search_result(hits) 90 | oddb.logger.info("API search '" + request.json["query"] + "' by " + name) 91 | return json.dumps(hits) 92 | 93 | except InvalidQueryException as e: 94 | oddb.logger.info("API search failed: " + str(e)) 95 | return str(e) 96 | else: 97 | return abort(403) 98 | 99 | @app.route("/cap", methods=["GET"]) 100 | def cap(): 101 | word = captcha.make_captcha() 102 | cap_id = uuid4().__str__() 103 | session["cap"] = cap_id 104 | 105 | oddb.redis.set(cap_id, word) 106 | 107 | return 
send_file(captcha.get_path(word), cache_timeout=0) 108 | 109 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | import api 4 | import common 5 | import config 6 | import template_filters 7 | import views 8 | import os 9 | 10 | app = Flask(__name__) 11 | app.secret_key = config.FLASK_SECRET 12 | template_filters.setup_template_filters(app) 13 | 14 | views.setup_views(app) 15 | api.setup_api(app) 16 | 17 | 18 | if os.environ.get("ODDB_USER", False) and os.environ.get("ODDB_PASSWORD", False): 19 | user = os.environ["ODDB_USER"] 20 | password = os.environ["ODDB_PASSWORD"] 21 | try: 22 | common.db.generate_login(user, password) 23 | print("Generated user %s" % user) 24 | except: 25 | pass 26 | 27 | if __name__ == '__main__': 28 | app.run("0.0.0.0", port=80, threaded=True) 29 | -------------------------------------------------------------------------------- /captcha.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | from PIL import Image, ImageDraw, ImageFont 5 | from flask import request, session 6 | 7 | import common as oddb 8 | import config 9 | 10 | 11 | def get_code(): 12 | 13 | if "cap_remaining" in session and session["cap_remaining"] > 0: 14 | return """ 15 | You will not be asked to complete a captcha for the next {} pages 16 | """.format(session["cap_remaining"]) 17 | 18 | return """ 19 |
20 | cap 21 | 22 |
23 | """ 24 | 25 | 26 | def get_path(word): 27 | return "captchas/{}.png".format(word) 28 | 29 | 30 | def verify(): 31 | if "cap_remaining" in session and session["cap_remaining"] > 0: 32 | session["cap_remaining"] -= 1 33 | return True 34 | 35 | attempt = request.form.get("cap") if "cap" in request.form else ( 36 | request.args.get("cap") if "cap" in request.args else "" 37 | ) 38 | 39 | if "cap" in session: 40 | expected = oddb.redis.get(session["cap"]) 41 | expected = expected.decode("utf8") if expected is not None else "" 42 | oddb.redis.delete(session["cap"]) 43 | 44 | if expected == attempt: 45 | session["cap_remaining"] = config.CAPTCHA_EVERY 46 | return True 47 | 48 | return False 49 | 50 | 51 | cfg = { 52 | "image": { 53 | "size": (200, 72), 54 | "supersampling": 2 55 | }, 56 | "noise": { 57 | "min": 100, 58 | "max": 250 59 | }, 60 | "colors": { 61 | "green": [(1, 51, 1), (34, 204, 34)], 62 | "yellow": [(67, 67, 1), (221, 221, 0)], 63 | "cyan": [(17, 51, 85), (85, 187, 254)], 64 | "magenta": [(51, 1, 51), (254, 0, 254)], 65 | "red": [(67, 1, 1), (254, 68, 68)], 66 | "orange": [(68, 51, 1), (255, 153, 0)] 67 | }, 68 | "lines": { 69 | "back_thin": {"n": 3, "w": 5}, 70 | "back_thick": {"n": 3, "w": 6}, 71 | "back_positions": [ 72 | { 73 | "ax": (0, 10), 74 | "ay": (0, 36), 75 | "bx": (150, 200), 76 | "by": (18, 50) 77 | }, 78 | { 79 | "ax": (0, 10), 80 | "ay": (18, 50), 81 | "bx": (150, 200), 82 | "by": (0, 17) 83 | } 84 | ], 85 | "front_horizontal_thin": {"n": 2, "w": 3}, 86 | "front_horizontal_thick": {"n": 2, "w": 4}, 87 | "front_horizontal_positions": [ 88 | { 89 | "ax": (0, 20), 90 | "ay": (0, 34), 91 | "bx": (150, 200), 92 | "by": (18, 50) 93 | }, 94 | { 95 | "ax": (0, 20), 96 | "ay": (18, 72), 97 | "bx": (140, 200), 98 | "by": (0, 36) 99 | }, 100 | ], 101 | "front_vertical": {"n": 2, "w": 4}, 102 | "front_vertical_positions": { 103 | "outside": 5, 104 | "font_width": 13, 105 | "ay": (0, 16), 106 | "by": (54, 72) 107 | } 108 | }, 109 | "text": { 110 | "font": { 111 | "path": "static/Hack-Regular.ttf", 112 | "size": 60, 113 | "outline": [1, 2] 114 | }, 115 | "letters": { 116 | "3": { 117 | "count": 3, 118 | "x_min": 35, 119 | "x_max": 50, 120 | "y_min": -5, 121 | "y_max": 8 122 | }, 123 | "4": { 124 | "count": 4, 125 | "x_min": 20, 126 | "x_max": 35, 127 | "y_min": -5, 128 | "y_max": 8 129 | }, 130 | "5": { 131 | "count": 5, 132 | "x_min": 5, 133 | "x_max": 20, 134 | "y_min": -5, 135 | "y_max": 8 136 | } 137 | } 138 | } 139 | } 140 | 141 | size = cfg["image"]["size"] 142 | c = cfg["image"]["supersampling"] 143 | 144 | # Additional config 145 | letter_count = "4" 146 | 147 | 148 | def horizontal_lines(draw, c, line_par, line_pos, fill): 149 | for _ in range(line_par["n"]): 150 | pos = random.randrange(0, len(line_pos)) 151 | ax = random.randint(*line_pos[pos]["ax"]) 152 | ay = random.randint(*line_pos[pos]["ay"]) 153 | bx = random.randint(*line_pos[pos]["bx"]) 154 | by = random.randint(*line_pos[pos]["by"]) 155 | draw.line([(ax*c, ay*c), (bx*c, by*c)], width=line_par["w"]*c, fill=fill) 156 | 157 | 158 | def make_captcha(): 159 | 160 | color_name, color = random.choice(list(cfg["colors"].items())) 161 | text = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(cfg["text"]["letters"][letter_count]["count"])) 162 | 163 | path = get_path(text) 164 | 165 | w = size[0]*c 166 | h = size[1]*c 167 | 168 | img = Image.new('RGB', (w, h)) 169 | pixels = img.load() 170 | 171 | # noise 172 | for x in range(w): 173 | for y in range(h): 174 | rcol = 
random.randint(cfg["noise"]["min"], cfg["noise"]["max"]) 175 | pixels[x, y] = (rcol, rcol, rcol) 176 | 177 | # background lines 178 | draw = ImageDraw.Draw(img) 179 | 180 | horizontal_lines(draw, c, cfg["lines"]["back_thin"], cfg["lines"]["back_positions"], color[0]) 181 | horizontal_lines(draw, c, cfg["lines"]["back_thick"], cfg["lines"]["back_positions"], color[0]) 182 | 183 | # text 184 | ctx = cfg["text"]["font"] 185 | font = ImageFont.truetype(ctx["path"], ctx["size"]*c) 186 | outline = random.choice(ctx["outline"]) 187 | 188 | ctx = cfg["text"]["letters"][letter_count] 189 | x = random.randint(ctx["x_min"], ctx["x_max"]) 190 | y = random.randint(ctx["y_min"], ctx["y_max"]) 191 | draw.text((x*c-outline*c, y*c-outline*c), text, color[0], font=font) 192 | draw.text((x*c-outline*c, y*c), text, color[0], font=font) 193 | draw.text((x*c-outline*c, y*c+outline*c), text, color[0], font=font) 194 | draw.text((x*c, y*c-outline*c), text, color[0], font=font) 195 | draw.text((x*c, y*c+outline*c), text, color[0], font=font) 196 | draw.text((x*c+outline*c, y*c-outline*c), text, color[0], font=font) 197 | draw.text((x*c+outline*c, y*c), text, color[0], font=font) 198 | draw.text((x*c+outline*c, y*c+outline*c), text, color[0], font=font) 199 | draw.text((x*c, y*c), text, color[1], font=font) 200 | 201 | # foreground lines 202 | horizontal_lines(draw, c, cfg["lines"]["front_horizontal_thin"], cfg["lines"]["front_horizontal_positions"], color[1]) 203 | horizontal_lines(draw, c, cfg["lines"]["front_horizontal_thick"], cfg["lines"]["front_horizontal_positions"], color[1]) 204 | 205 | # vertical lines 206 | line_par = cfg["lines"]["front_vertical"] 207 | line_pos = cfg["lines"]["front_vertical_positions"] 208 | 209 | for _ in range(line_par["n"]): 210 | ax = random.randint(x-line_pos["outside"], x+line_pos["outside"] + cfg["text"]["letters"][letter_count]["count"]*line_pos["font_width"]) 211 | bx = ax + random.randint(-line_pos["font_width"], line_pos["font_width"]) 212 | ay = random.randint(*line_pos["ay"]) 213 | by = random.randint(*line_pos["by"]) 214 | draw.line([(ax*c, ay*c), (bx*c, by*c)], width=line_par["w"]*c, fill=color[1]) 215 | 216 | img.thumbnail(cfg["image"]["size"], Image.ANTIALIAS) 217 | img.save(path, "png") 218 | 219 | return text 220 | 221 | 222 | if __name__ == "__main__": 223 | make_captcha() 224 | -------------------------------------------------------------------------------- /captchas/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/captchas/.gitkeep -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from logging import FileHandler, StreamHandler 4 | 5 | import redis as r 6 | from flask import session, abort 7 | 8 | import config 9 | from database import Database 10 | from search.search import ElasticSearchEngine 11 | from tasks import TaskManager 12 | 13 | # Disable flask logging 14 | flaskLogger = logging.getLogger('werkzeug') 15 | flaskLogger.setLevel(logging.ERROR) 16 | 17 | logger = logging.getLogger("default") 18 | logger.setLevel(logging.DEBUG) 19 | 20 | formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s') 21 | file_handler = FileHandler("oddb.log") 22 | file_handler.setFormatter(formatter) 23 | for h in logger.handlers: 24 | logger.removeHandler(h) 25 | 
logger.addHandler(file_handler) 26 | logger.addHandler(StreamHandler(sys.stdout)) 27 | 28 | taskManager = TaskManager() 29 | searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX) 30 | searchEngine.start_stats_scheduler() 31 | db = Database(config.DB_CONN_STR) 32 | 33 | redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT) 34 | 35 | 36 | def require_role(role: str): 37 | if db.get_user_role(session.get("username", None)) != role: 38 | abort(403) 39 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from os import environ 2 | 3 | CAPTCHA_LOGIN = bool(environ.get("CAPTCHA_LOGIN", False)) 4 | CAPTCHA_SUBMIT = bool(environ.get("CAPTCHA_SUBMIT", False)) 5 | CAPTCHA_SEARCH = bool(environ.get("CAPTCHA_SEARCH", False)) 6 | CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10)) 7 | 8 | FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret") 9 | RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000) 10 | 11 | SUBMIT_FTP = bool(environ.get("SUBMIT_FTP", False)) 12 | SUBMIT_HTTP = bool(environ.get("SUBMIT_HTTP", True)) 13 | 14 | TT_API = environ.get("TT_API", "http://localhost:3010") 15 | TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3)) 16 | TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9)) 17 | 18 | WSB_API = environ.get("WSB_API", "http://localhost:3020") 19 | WSB_SECRET = environ.get("WSB_SECRET", "default_secret") 20 | 21 | ES_URL = environ.get("ES_URL", "http://localhost:9200") 22 | ES_INDEX = environ.get("ES_INDEX", "od-database") 23 | 24 | REDIS_HOST = environ.get("REDIS_HOST", "localhost") 25 | REDIS_PORT = environ.get("REDIS_PORT", 6379) 26 | 27 | DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database") 28 | RECRAWL_POOL_SIZE = environ.get("RECRAWL_POOL_SIZE", 10000) 29 | INDEXER_THREADS = int(environ.get("INDEXER_THREAD", 3)) 30 | -------------------------------------------------------------------------------- /database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import uuid 4 | from urllib.parse import urlparse, urljoin 5 | 6 | import bcrypt 7 | import psycopg2 8 | 9 | 10 | class BlacklistedWebsite: 11 | def __init__(self, blacklist_id, url): 12 | self.id = blacklist_id 13 | self.netloc = url 14 | 15 | 16 | class Website: 17 | 18 | def __init__(self, url, logged_ip, logged_useragent, last_modified=None, website_id=None): 19 | self.url = url 20 | self.logged_ip = logged_ip 21 | self.logged_useragent = logged_useragent 22 | self.last_modified = last_modified 23 | self.id = website_id 24 | 25 | 26 | class ApiClient: 27 | 28 | def __init__(self, token, name): 29 | self.token = token 30 | self.name = name 31 | 32 | 33 | class Database: 34 | 35 | def __init__(self, db_conn_str): 36 | self.db_conn_str = db_conn_str 37 | self.website_cache = dict() 38 | self.website_cache_time = 0 39 | 40 | with psycopg2.connect(self.db_conn_str) as conn: 41 | cursor = conn.cursor() 42 | cursor.execute("SELECT EXISTS (SELECT 1 FROM pg_tables " 43 | "WHERE tablename = 'searchlogentry')") 44 | 45 | if not cursor.fetchone()[0]: 46 | self.init_database() 47 | 48 | def init_database(self): 49 | 50 | print("Initializing database") 51 | 52 | with open("init_script.sql", "r") as f: 53 | init_script = f.read() 54 | 55 | with psycopg2.connect(self.db_conn_str) as conn: 56 | cur = conn.cursor() 57 | cur.execute(init_script) 58 | 59 | 
def update_website_date_if_exists(self, website_id): 60 | 61 | with psycopg2.connect(self.db_conn_str) as conn: 62 | cursor = conn.cursor() 63 | cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=%s", (website_id,)) 64 | conn.commit() 65 | 66 | def insert_website(self, website: Website): 67 | 68 | with psycopg2.connect(self.db_conn_str) as conn: 69 | cursor = conn.cursor() 70 | cursor.execute("INSERT INTO Website (url, logged_ip, logged_useragent) VALUES (%s,%s,%s) RETURNING id", 71 | (website.url, str(website.logged_ip), str(website.logged_useragent))) 72 | 73 | website_id = cursor.fetchone()[0] 74 | conn.commit() 75 | 76 | return website_id 77 | 78 | def get_website_by_url(self, url): 79 | 80 | with psycopg2.connect(self.db_conn_str) as conn: 81 | cursor = conn.cursor() 82 | 83 | cursor.execute("SELECT id, url, logged_ip, logged_useragent, last_modified FROM Website WHERE url=%s", 84 | (url,)) 85 | db_web = cursor.fetchone() 86 | if db_web: 87 | website = Website(db_web[1], db_web[2], db_web[3], db_web[4], str(db_web[0])) 88 | return website 89 | else: 90 | return None 91 | 92 | def get_website_by_id(self, website_id): 93 | 94 | with psycopg2.connect(self.db_conn_str) as conn: 95 | cursor = conn.cursor() 96 | 97 | cursor.execute("SELECT * FROM Website WHERE id=%s", (website_id,)) 98 | db_web = cursor.fetchone() 99 | 100 | if db_web: 101 | website = Website(db_web[1], db_web[2], db_web[3], str(db_web[4])) 102 | website.id = db_web[0] 103 | return website 104 | else: 105 | return None 106 | 107 | def get_websites(self, per_page, page: int, url): 108 | """Get all websites""" 109 | with psycopg2.connect(self.db_conn_str) as conn: 110 | cursor = conn.cursor() 111 | 112 | cursor.execute("SELECT Website.id, Website.url, Website.last_modified FROM Website " 113 | "WHERE Website.url LIKE %s " 114 | "ORDER BY last_modified DESC LIMIT %s OFFSET %s", (url + "%", per_page, page * per_page)) 115 | 116 | return cursor.fetchall() 117 | 118 | def get_random_website_id(self): 119 | 120 | with psycopg2.connect(self.db_conn_str) as conn: 121 | cursor = conn.cursor() 122 | cursor.execute("SELECT id FROM Website ORDER BY random() LIMIT 1") 123 | 124 | row = cursor.fetchone() 125 | if row: 126 | return row[0] 127 | return None 128 | 129 | def website_exists(self, url): 130 | """Check if an url or the parent directory of an url already exists""" 131 | with psycopg2.connect(self.db_conn_str) as conn: 132 | cursor = conn.cursor() 133 | 134 | cursor.execute("SELECT id FROM Website WHERE url = substr(%s, 0, length(url) + 1)", (url,)) 135 | website_id = cursor.fetchone() 136 | return website_id[0] if website_id else None 137 | 138 | def delete_website(self, website_id): 139 | 140 | with psycopg2.connect(self.db_conn_str) as conn: 141 | cursor = conn.cursor() 142 | 143 | cursor.execute("DELETE FROM Website WHERE id=%s", (website_id,)) 144 | conn.commit() 145 | 146 | def check_login(self, username, password) -> bool: 147 | with psycopg2.connect(self.db_conn_str) as conn: 148 | cursor = conn.cursor() 149 | 150 | cursor.execute("SELECT password FROM Admin WHERE username=%s", (username,)) 151 | 152 | db_user = cursor.fetchone() 153 | 154 | if db_user: 155 | return bcrypt.checkpw(password.encode(), db_user[0].tobytes()) 156 | return False 157 | 158 | def get_user_role(self, username: str): 159 | with psycopg2.connect(self.db_conn_str) as conn: 160 | cursor = conn.cursor() 161 | 162 | cursor.execute("SELECT role FROM Admin WHERE username=%s", (username,)) 163 | 164 | db_user = cursor.fetchone() 165 
| 166 | if db_user: 167 | return db_user[0] 168 | return False 169 | 170 | def generate_login(self, username, password) -> None: 171 | 172 | with psycopg2.connect(self.db_conn_str) as conn: 173 | cursor = conn.cursor() 174 | 175 | hashed_pw = bcrypt.hashpw(password.encode(), bcrypt.gensalt(12)) 176 | 177 | cursor.execute("INSERT INTO Admin (username, password, role) VALUES (%s,%s, 'admin')", 178 | (username, hashed_pw)) 179 | conn.commit() 180 | 181 | def check_api_token(self, token) -> str: 182 | 183 | with psycopg2.connect(self.db_conn_str) as conn: 184 | cursor = conn.cursor() 185 | 186 | cursor.execute("SELECT name FROM ApiClient WHERE token=%s", (token,)) 187 | result = cursor.fetchone() 188 | return result[0] if result else None 189 | 190 | def generate_api_token(self, name: str) -> str: 191 | 192 | with psycopg2.connect(self.db_conn_str) as conn: 193 | cursor = conn.cursor() 194 | 195 | token = str(uuid.uuid4()) 196 | cursor.execute("INSERT INTO ApiClient (token, name) VALUES (%s, %s)", (token, name)) 197 | conn.commit() 198 | 199 | return token 200 | 201 | def get_tokens(self) -> list: 202 | 203 | with psycopg2.connect(self.db_conn_str) as conn: 204 | cursor = conn.cursor() 205 | 206 | cursor.execute("SELECT token, name FROM ApiClient") 207 | 208 | return [ApiClient(x[0], x[1]) for x in cursor.fetchall()] 209 | 210 | def delete_token(self, token: str) -> None: 211 | 212 | with psycopg2.connect(self.db_conn_str) as conn: 213 | cursor = conn.cursor() 214 | 215 | cursor.execute("DELETE FROM ApiClient WHERE token=%s", (token,)) 216 | conn.commit() 217 | 218 | def get_all_websites(self) -> dict: 219 | if self.website_cache_time + 120 < time.time(): 220 | with psycopg2.connect(self.db_conn_str) as conn: 221 | cursor = conn.cursor() 222 | 223 | cursor.execute("SELECT id, url FROM Website") 224 | 225 | result = dict() 226 | 227 | for db_website in cursor.fetchall(): 228 | result[db_website[0]] = db_website[1] 229 | 230 | self.website_cache = result 231 | self.website_cache_time = time.time() 232 | 233 | return self.website_cache 234 | 235 | def join_website_on_search_result(self, page: dict) -> dict: 236 | 237 | websites = self.get_all_websites() 238 | 239 | for hit in page["hits"]["hits"]: 240 | if hit["_source"]["website_id"] in websites: 241 | hit["_source"]["website_url"] = urljoin(websites[hit["_source"]["website_id"]], "/") 242 | else: 243 | hit["_source"]["website_url"] = "[DELETED]" 244 | 245 | return page 246 | 247 | def join_website_url(self, docs): 248 | 249 | websites = self.get_all_websites() 250 | 251 | for doc in docs: 252 | if doc["_source"]["website_id"] in websites: 253 | doc["_source"]["website_url"] = urljoin(websites[doc["_source"]["website_id"]], "/") 254 | else: 255 | doc["_source"]["website_url"] = "[DELETED]" 256 | 257 | yield doc 258 | 259 | def join_website_on_stats(self, stats): 260 | 261 | websites = self.get_all_websites() 262 | 263 | for website in stats["website_scatter"]: 264 | website[0] = websites.get(website[0], "[DELETED]") 265 | 266 | def add_blacklist_website(self, url): 267 | 268 | with psycopg2.connect(self.db_conn_str) as conn: 269 | cursor = conn.cursor() 270 | parsed_url = urlparse(url) 271 | url = parsed_url.scheme + "://" + parsed_url.netloc 272 | cursor.execute("INSERT INTO BlacklistedWebsite (url) VALUES (%s)", (url,)) 273 | conn.commit() 274 | 275 | def remove_blacklist_website(self, blacklist_id): 276 | 277 | with psycopg2.connect(self.db_conn_str) as conn: 278 | cursor = conn.cursor() 279 | 280 | cursor.execute("DELETE FROM 
BlacklistedWebsite WHERE id=%s", (blacklist_id,)) 281 | conn.commit() 282 | 283 | def is_blacklisted(self, url): 284 | 285 | with psycopg2.connect(self.db_conn_str) as conn: 286 | cursor = conn.cursor() 287 | parsed_url = urlparse(url) 288 | url = parsed_url.scheme + "://" + parsed_url.netloc 289 | print(url) 290 | cursor.execute("SELECT id FROM BlacklistedWebsite WHERE url LIKE %s LIMIT 1", (url,)) 291 | 292 | return cursor.fetchone() is not None 293 | 294 | def get_blacklist(self): 295 | 296 | with psycopg2.connect(self.db_conn_str) as conn: 297 | cursor = conn.cursor() 298 | 299 | cursor.execute("SELECT * FROM BlacklistedWebsite") 300 | return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()] 301 | 302 | def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took): 303 | 304 | with psycopg2.connect(self.db_conn_str) as conn: 305 | cursor = conn.cursor() 306 | 307 | cursor.execute( 308 | "INSERT INTO SearchLogEntry " 309 | "(remote_addr, forwarded_for, query, extensions, page, blocked, results, took) " 310 | "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)", 311 | (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took)) 312 | 313 | conn.commit() 314 | 315 | def get_oldest_updated_websites(self, size: int, prefix: str): 316 | 317 | with psycopg2.connect(self.db_conn_str) as conn: 318 | cursor = conn.cursor() 319 | 320 | cursor.execute("SELECT id, url, last_modified FROM website " 321 | "WHERE url LIKE %s " 322 | "ORDER BY last_modified ASC LIMIT %s", 323 | (prefix + "%", size, )) 324 | return [Website(url=r[1], 325 | website_id=r[0], 326 | last_modified=r[2], 327 | logged_ip=None, 328 | logged_useragent=None 329 | ) 330 | for r in cursor.fetchall()] 331 | -------------------------------------------------------------------------------- /do_recrawl.py: -------------------------------------------------------------------------------- 1 | from tasks import TaskManager 2 | 3 | tm = TaskManager() 4 | tm.do_recrawl() 5 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.1" 2 | services: 3 | oddb: 4 | build: . 
5 | ports: 6 | - 5020:80 7 | environment: 8 | - "CAPTCHA_LOGIN=True" 9 | - "CAPTCHA_SUBMIT=True" 10 | - "CAPTCHA_SEARCH=True" 11 | - "CAPTCHA_EVERY=10" 12 | - "FLASK_SECRET=changeme" 13 | - "SUBMIT_FTP=False" 14 | - "SUBMIT_HTTP=True" 15 | - "TT_API=http://tt:3010" 16 | - "TT_CRAWL_PROJECT=1" 17 | - "TT_INDEX_PROJECT=2" 18 | - "WSB_API=http://wsb:3020" 19 | - "WSB_SECRET=changeme" 20 | - "REDIS_HOST=oddb_redis" 21 | - "ES_URL=es:9200" 22 | - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable" 23 | - "RECRAWL_POOL_SIZE=10000" 24 | - "INDEXER_THREADS=2" 25 | - "ODDB_USER=admin" 26 | - "ODDB_PASSWORD=changeme" 27 | depends_on: 28 | wsb: 29 | condition: service_started 30 | tt: 31 | condition: service_started 32 | oddb_db: 33 | condition: service_healthy 34 | es: 35 | condition: service_healthy 36 | restart: always 37 | oddb_db: 38 | image: postgres 39 | volumes: 40 | - ./oddb_pg_data:/var/lib/postgresql/data 41 | environment: 42 | - "POSTGRES_USER=od_database" 43 | - "POSTGRES_PASSWORD=changeme" 44 | healthcheck: 45 | test: ["CMD-SHELL", "pg_isready -U od_database"] 46 | interval: 5s 47 | timeout: 5s 48 | retries: 5 49 | oddb_redis: 50 | image: redis 51 | wsb: 52 | image: simon987/wsb_bucket 53 | volumes: 54 | - ./wsb_data:/data 55 | environment: 56 | - "WS_BUCKET_SECRET=changeme" 57 | ports: 58 | - 3020:3020 59 | tt: 60 | image: simon987/task_tracker 61 | volumes: 62 | - ./tt_config.yml:/root/config.yml 63 | ports: 64 | - 3010:80 65 | depends_on: 66 | tt_db: 67 | condition: service_healthy 68 | tt_web: 69 | image: simon987/task_tracker_web 70 | ports: 71 | - 3011:80 72 | depends_on: 73 | tt: 74 | condition: service_started 75 | tt_db: 76 | image: postgres 77 | volumes: 78 | - ./tt_pg_data:/var/lib/postgresql/data 79 | environment: 80 | - "POSTGRES_USER=task_tracker" 81 | - "POSTGRES_PASSWORD=changeme" 82 | healthcheck: 83 | test: ["CMD-SHELL", "pg_isready -U task_tracker"] 84 | interval: 3s 85 | timeout: 2s 86 | retries: 10 87 | es: 88 | image: docker.elastic.co/elasticsearch/elasticsearch:7.5.2 89 | environment: 90 | - discovery.type=single-node 91 | - "ES_JAVA_OPTS=-Xms1G -Xmx4G" 92 | volumes: 93 | - ./es_data:/usr/share/elasticsearch/data 94 | healthcheck: 95 | test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"] 96 | interval: 5s 97 | timeout: 5s 98 | retries: 5 99 | # (Optional) 100 | kibana: 101 | image: docker.elastic.co/kibana/kibana:7.5.2 102 | environment: 103 | - ELASTICSEARCH_HOSTS=http://es:9200 104 | ports: 105 | - 5021:5601 106 | depends_on: 107 | es: 108 | condition: service_healthy 109 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import lz4.frame 5 | 6 | import config 7 | from database import Database 8 | from search.search import ElasticSearchEngine 9 | 10 | 11 | def quote(string): 12 | if "\"" in string: 13 | return "\"" + string.replace("\"", "\"\"") + "\"" 14 | elif "," in string: 15 | return "\"" + string + "\"" 16 | else: 17 | return string 18 | 19 | 20 | outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime()) 21 | dldir = "static/downloads/" 22 | 23 | print("Deleting existing dumps") 24 | for file in os.listdir(dldir): 25 | if file.endswith("_dump.csv.lz4"): 26 | os.remove(os.path.join(dldir, file)) 27 | 28 | print("Export started, connecting to databases...") 29 | 30 | db = Database(config.DB_CONN_STR) 31 | es = 
ElasticSearchEngine(config.ES_URL, config.ES_INDEX) 32 | 33 | docs_with_url = db.join_website_url(es.stream_all_docs()) 34 | 35 | print("Connected, writing to csv") 36 | 37 | with lz4.frame.open(outfile + ".part", mode='wb', 38 | compression_level=9, 39 | block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp: 40 | fp.write((",".join( 41 | ["website_id", "website_url", "path", "name", "ext", "size", "mtime"] 42 | ) + "\n").encode()) 43 | 44 | for doc in docs_with_url: 45 | try: 46 | fp.write( 47 | (",".join( 48 | [ 49 | str(doc["_source"]["website_id"]), 50 | quote(doc["_source"]["website_url"]), 51 | quote(doc["_source"]["path"]), 52 | quote(doc["_source"]["name"]), 53 | quote(doc["_source"]["ext"]), 54 | str(doc["_source"]["size"]), 55 | str(doc["_source"]["mtime"]) 56 | ] 57 | ) + "\n").encode()) 58 | except Exception as e: 59 | print(e) 60 | print(doc) 61 | 62 | 63 | os.rename(outfile + ".part", os.path.join(dldir, outfile)) 64 | -------------------------------------------------------------------------------- /high_level_diagram.dia: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/high_level_diagram.dia -------------------------------------------------------------------------------- /high_level_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/high_level_diagram.png -------------------------------------------------------------------------------- /init_script.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS Website, Admin, BlacklistedWebsite, ApiClient, SearchLogEntry; 2 | 3 | CREATE TABLE Website ( 4 | 5 | id SERIAL PRIMARY KEY NOT NULL, 6 | url TEXT, 7 | logged_ip TEXT, 8 | logged_useragent TEXT, 9 | last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP 10 | ); 11 | 12 | CREATE TABLE Admin ( 13 | username TEXT PRIMARY KEY NOT NULL, 14 | password BYTEA, 15 | role TEXT 16 | ); 17 | 18 | CREATE TABLE BlacklistedWebsite ( 19 | id SERIAL PRIMARY KEY NOT NULL, 20 | url TEXT 21 | ); 22 | 23 | CREATE TABLE ApiClient ( 24 | name TEXT PRIMARY KEY NOT NULL, 25 | token TEXT NOT NULL 26 | ); 27 | 28 | CREATE TABLE SearchLogEntry ( 29 | id SERIAL PRIMARY KEY, 30 | search_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 31 | remote_addr TEXT, 32 | forwarded_for TEXT, 33 | query TEXT, 34 | extensions TEXT, 35 | page INT, 36 | blocked BOOLEAN DEFAULT FALSE, 37 | results INT DEFAULT 0, 38 | took INT DEFAULT 0 39 | ); 40 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from app import app 2 | 3 | if __name__ == '__main__': 4 | app.run("0.0.0.0", port=12345) 5 | -------------------------------------------------------------------------------- /mass_import.py: -------------------------------------------------------------------------------- 1 | import fileinput 2 | import os 3 | from multiprocessing.pool import Pool 4 | 5 | import od_util 6 | from common import db, taskManager 7 | from database import Website 8 | from tasks import Task 9 | 10 | urls = (line for line in fileinput.input()) 11 | 12 | 13 | def try_enqueue(url): 14 | url = os.path.join(url, "") 15 | url = od_util.get_top_directory(url) 16 | 17 | if not od_util.is_valid_url(url): 18 | return "Error: Invalid url. 
Make sure to include the appropriate scheme." 19 | 20 | website = db.get_website_by_url(url) 21 | if website: 22 | return "Website already exists" 23 | 24 | website = db.website_exists(url) 25 | if website: 26 | return "A parent directory of this url has already been posted" 27 | 28 | if db.is_blacklisted(url): 29 | return "Error: " \ 30 | "Sorry, this website has been blacklisted. If you think " \ 31 | "this is an error, please contact me." 32 | 33 | if not od_util.is_od(url): 34 | return "Error:" \ 35 | "The anti-spam algorithm determined that the submitted url is not " \ 36 | "an open directory or the server is not responding. If you think " \ 37 | "this is an error, please contact me." 38 | 39 | website_id = db.insert_website(Website(url, "localhost", "mass_import.py")) 40 | 41 | task = Task(website_id, url, priority=2) 42 | taskManager.queue_task(task) 43 | 44 | return "The website has been added to the queue" 45 | 46 | 47 | def check_url(url): 48 | url = os.path.join(url.strip(), "") 49 | try: 50 | print(try_enqueue(url)) 51 | except: 52 | pass 53 | return None 54 | 55 | 56 | pool = Pool(processes=50) 57 | pool.map(func=check_url, iterable=urls) 58 | pool.close() 59 | -------------------------------------------------------------------------------- /od_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from ftplib import FTP 4 | from urllib.parse import urljoin, urlparse 5 | 6 | import requests 7 | import validators 8 | from bs4 import BeautifulSoup 9 | 10 | # TODO: find a better way to do this 11 | try: 12 | from . import config 13 | except (ImportError, SystemError): 14 | import config 15 | 16 | import urllib3 17 | urllib3.disable_warnings() 18 | 19 | 20 | def truncate_path(path, max_len): 21 | pattern = re.compile(r"/?.*?/") 22 | 23 | for i in range(1, path.count("/")): 24 | new_path = pattern.sub(".../", path, i) 25 | if len(new_path) < max_len: 26 | return new_path 27 | return ".../" + path.rsplit("/", maxsplit=1)[1] if "/" in path else path 28 | 29 | 30 | category_map = { 31 | 32 | # Application category 33 | 'bcpio': 'application', 'bin': 'application', 'cdf': 'application', 34 | 'csh': 'application', 'dll': 'application', 'doc': 'application', 35 | 'dot': 'application', 'dvi': 'application', 'eml': 'application', 36 | 'exe': 'application', 'hdf': 'application', 37 | 'man': 'application', 'me': 'application', 'mht': 'application', 38 | 'mhtml': 'application', 'mif': 'application', 'ms': 'application', 39 | 'nc': 'application', 'nws': 'application', 'o': 'application', 40 | 'obj': 'application', 'oda': 'application', 'p12': 'application', 41 | 'p7c': 'application', 'pfx': 'application', 'tr': 'application', 42 | 'ppa': 'application', 'pps': 'application', 'ppt': 'application', 43 | 'ps': 'application', 'pwz': 'application', 'pyc': 'application', 44 | 'pyo': 'application', 'ram': 'application', 'rdf': 'application', 45 | 'roff': 'application', 'sh': 'application', 'so': 'application', 46 | 'src': 'application', 'sv4cpio': 'application', 'sv4crc': 'application', 47 | 't': 'application', 'tcl': 'application', 'tex': 'application', 48 | 'texi': 'application', 'texinfo': 'application', 'ustar': 'application', 49 | 'wiz': 'application', 'wsdl': 'application', 'xlb': 'application', 50 | 'xls': 'application', 'xpdl': 'application', 'xsl': 'application', 51 | 'torrent': 'application', 'rpm': 'application', 'deb': 'application', 52 | 'atr': 'application', 'class': 'application', 'ttf': 'application', 53 | 'img': 
'application', 'msi': 'application', 'run': 'application', 54 | 'drpm': 'application', 'udeb': 'application', 'patch': 'application', 55 | 'nes': 'application', 'ebuild': 'application', 'scr': 'application', 56 | # Text category 57 | 'java': 'text', 'cpp': 'text', 'rb': 'text', 58 | 'bat': 'text', 'latex': 'text', 'xml': 'text', 59 | 'etx': 'text', 'htm': 'text', 'c': 'text', 60 | 'css': 'text', 'csv': 'text', 'html': 'text', 61 | 'js': 'text', 'json': 'text', 'ksh': 'text', 62 | 'pl': 'text', 'pot': 'application', 'py': 'text', 63 | 'h': 'text', 'tsv': 'text', 'rtx': 'text', 64 | 'sgm': 'text', 'sgml': 'text', 'txt': 'text', 65 | 'vcf': 'text', 'pdf': 'text', 'epub': 'text', 66 | 'srt': 'text', 'inc': 'text', 'php': 'text', 67 | 'cbz': 'text', 'docx': 'text', 'mobi': 'text', 68 | 'chm': 'text', 'xlsx': "text", 'djvu': 'text', 69 | 'rtf': 'text', 'log': 'text', 'md': 'text', 70 | 'dsc': 'text', 'info': 'text', 71 | # Video category 72 | '3g2': 'video', '3gp': 'video', 'asf': 'video', 73 | 'asx': 'video', 'avi': 'video', 'flv': 'video', 74 | 'swf': 'video', 'vob:': 'video', 'qt': 'video', 75 | 'webm': 'video', 'mov': 'video', 'm1v': 'video', 76 | 'm3u': 'video', 'm3u8': 'video', 'movie': 'video', 77 | 'mp4': 'video', 'mpa': 'video', 'mpe': 'video', 78 | 'mpeg': 'video', 'mpg': 'video', 'mkv': 'video', 79 | 'wmv': 'video', 'm4s': 'video', 'ogv': 'video', 80 | 'm4b': 'video', 'm4v': 'video', 'ts': 'video', 81 | 82 | # Audio category 83 | 'wav': 'audio', 'snd': 'audio', 'mp2': 'audio', 84 | 'aif': 'audio', 'iff': 'audio', 'm4a': 'audio', 85 | 'mid': 'audio', 'midi': 'audio', 'mp3': 'audio', 86 | 'wma': 'audio', 'ra': 'audio', 'aifc': 'audio', 87 | 'aiff': 'audio', 'au': 'audio', 'flac': 'audio', 88 | 'ogg': 'audio', 'oga': 'audio', 'mka': 'video', 89 | 'ac3': 'audio', 90 | # Image category 91 | 'bmp': 'image', 'gif': 'image', 'jpg': 'image', 92 | 'xwd': 'image', 'tif': 'image', 'tiff': 'image', 93 | 'png': 'image', 'pnm': 'image', 'ras': 'image', 94 | 'ico': 'image', 'ief': 'image', 'pgm': 'image', 95 | 'jpe': 'image', 'pbm': 'image', 'jpeg': 'image', 96 | 'ppm': 'image', 'xpm': 'image', 'xbm': 'image', 97 | 'rgb': 'image', 'svg': 'image', 'psd': 'image', 98 | 'yuv': 'image', 'ai': 'image', 'eps': 'image', 99 | 'bw': 'image', 'hdr': 'image', 100 | # Archive category 101 | 'ar': 'archive', 'cpio': 'archive', 'shar': 'archive', 102 | 'iso': 'archive', 'lbr': 'archive', 'mar': 'archive', 103 | 'sbx': 'archive', 'bz2': 'archive', 'f': 'archive', 104 | 'gz': 'archive', 'lz': 'archive', 'lzma': 'archive', 105 | 'lzo': 'archive', 'rz': 'archive', 'sfark': 'archive', 106 | 'sz': 'archive', 'z': 'archive', '7z': 'archive', 107 | 's7z': 'archive', 'ace': 'archive', 'afa': 'archive', 108 | 'alz': 'archive', 'apk': 'archive', 'arc': 'archive', 109 | 'arj': 'archive', 'b1': 'archive', 'b6z': 'archive', 110 | 'a': 'archive', 'bh': 'archive', 'cab': 'archive', 111 | 'car': 'archive', 'cfs': 'archive', 'cpt': 'archive', 112 | 'dar': 'archive', 'dd': 'archive', 'dgc': 'archive', 113 | 'dmg': 'archive', 'ear': 'archive', 'gca': 'archive', 114 | 'ha': 'archive', 'hki': 'archive', 'ice': 'archive', 115 | 'jar': 'archive', 'kgb': 'archive', 'lzh': 'archive', 116 | 'lha': 'archive', 'lzx': 'archive', 'pak': 'archive', 117 | 'partimg': 'archive', 'paq6': 'archive', 'paq7': 'archive', 118 | 'paq8': 'archive', 'pea': 'archive', 'pim': 'archive', 119 | 'pit': 'archive', 'qda': 'archive', 'rar': 'archive', 120 | 'rk': 'archive', 'sda': 'archive', 'sea': 'archive', 121 | 'sen': 'archive', 'sfx': 'archive', 'shk': 
'archive', 122 | 'sit': 'archive', 'sitx': 'archive', 'sqx': 'archive', 123 | 'tbz2': 'archive', 'tlz': 'archive', 'xz': 'archive', 124 | 'txz': 'archive', 'uc': 'archive', 'uc0': 'archive', 125 | 'uc2': 'archive', 'ucn': 'archive', 'ur2': 'archive', 126 | 'ue2': 'archive', 'uca': 'archive', 'uha': 'archive', 127 | 'war': 'archive', 'wim': 'archive', 'xar': 'archive', 128 | 'xp3': 'archive', 'yz1': 'archive', 'zip': 'archive', 129 | 'zipx': 'archive', 'zoo': 'archive', 'zpaq': 'archive', 130 | 'zz': 'archive', 'xpi': 'archive', 'tgz': 'archive', 131 | 'tbz': 'archive', 'tar': 'archive', 'bz': 'archive', 132 | 'diz': 'archive', 133 | } 134 | 135 | colors = { 136 | "application": "bg-application", 137 | "text": "bg-text", 138 | "video": "bg-video", 139 | "image": "bg-image", 140 | "audio": "bg-audio", 141 | "archive": "bg-archive" 142 | } 143 | 144 | 145 | def get_color(category): 146 | return colors.get(category, None) 147 | 148 | 149 | def get_category(extension): 150 | return category_map.get(extension, None) 151 | 152 | 153 | def is_valid_url(url): 154 | if not url.endswith("/"): 155 | return False 156 | 157 | if not url.startswith(("http://", "https://", "ftp://")): 158 | return False 159 | 160 | return validators.url(url) 161 | 162 | 163 | def has_extension(link): 164 | return len(os.path.splitext(link)[1]) > 0 165 | 166 | 167 | def is_external_link(base_url, url: str): 168 | url = urljoin(base_url, url).strip() 169 | 170 | if base_url in url: 171 | return False 172 | return True 173 | 174 | 175 | def is_od(url): 176 | if not url.endswith("/"): 177 | print("Url does not end with trailing /") 178 | return False 179 | 180 | try: 181 | if url.startswith("ftp://") and config.SUBMIT_FTP: 182 | ftp = FTP(urlparse(url).netloc) 183 | ftp.login() 184 | ftp.close() 185 | return True 186 | elif config.SUBMIT_HTTP: 187 | r = requests.get(url, timeout=30, allow_redirects=False, verify=False) 188 | if r.status_code != 200: 189 | # print("No redirects allowed!") 190 | return False 191 | soup = BeautifulSoup(r.text, "lxml") 192 | 193 | external_links = sum(1 if is_external_link(url, a.get("href")) else 0 for a in soup.find_all("a")) 194 | link_tags = len(list(soup.find_all("link"))) 195 | script_tags = len(list(soup.find_all("script"))) 196 | 197 | if external_links > 11: 198 | # print("Too many external links!") 199 | return False 200 | 201 | if link_tags > 5: 202 | # print("Too many link tags!") 203 | return False 204 | 205 | if script_tags > 7: 206 | # print("Too many script tags!") 207 | return False 208 | 209 | return True 210 | 211 | except Exception as e: 212 | # print(e) 213 | return False 214 | 215 | 216 | def has_parent_dir(url): 217 | 218 | parsed_url = urlparse(url) 219 | 220 | if parsed_url.path == "/": 221 | return False 222 | 223 | parent_url = urljoin(url, "../") 224 | try: 225 | r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False) 226 | if r.status_code != 200: 227 | return False 228 | soup = BeautifulSoup(r.text, "lxml") 229 | 230 | for anchor in soup.find_all("a"): 231 | if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url: 232 | # The parent page exists, and has a link to the child directory 233 | return is_od(parent_url) 234 | 235 | except: 236 | return False 237 | 238 | # Parent page exists, but does not have a link to the child directory 239 | return False 240 | 241 | 242 | def get_top_directory(url): 243 | if url.startswith("ftp://"): 244 | return url 245 | 246 | while has_parent_dir(url): 247 | 
url = urljoin(url, "../") 248 | return url 249 | -------------------------------------------------------------------------------- /reddit_bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import humanfriendly 5 | import praw 6 | 7 | 8 | class RedditBot: 9 | 10 | bottom_line = "^(Beep boop. I am a bot that calculates the file sizes & count of " \ 11 | "open directories posted in /r/opendirectories/)" 12 | 13 | def __init__(self, log_file: str, reddit: praw.Reddit): 14 | 15 | self.log_file = log_file 16 | 17 | self.crawled = [] 18 | self.load_from_file() 19 | self.reddit = reddit 20 | 21 | def log_crawl(self, post_id): 22 | 23 | self.load_from_file() 24 | self.crawled.append(post_id) 25 | 26 | with open(self.log_file, "w") as f: 27 | for post_id in self.crawled: 28 | f.write(post_id + "\n") 29 | 30 | def has_crawled(self, post_id): 31 | self.load_from_file() 32 | return post_id in self.crawled 33 | 34 | def load_from_file(self): 35 | if not os.path.isfile(self.log_file): 36 | self.crawled = [] 37 | else: 38 | with open(self.log_file, "r") as f: 39 | self.crawled = list(filter(None, f.read().split("\n"))) 40 | 41 | def reply(self, reddit_obj, comment: str): 42 | 43 | while True: 44 | try: 45 | if not self.has_crawled(reddit_obj.id): 46 | reply = reddit_obj.reply(comment) 47 | self.log_crawl(reddit_obj.id) 48 | print("Reply to " + reddit_obj.id) 49 | return reply 50 | break 51 | except Exception as e: 52 | print("Waiting 5 minutes: " + str(e)) 53 | time.sleep(300) 54 | continue 55 | 56 | def edit(self, reddit_comment, new_message): 57 | 58 | while True: 59 | try: 60 | reddit_comment.edit(new_message) 61 | print("Edit comment " + reddit_comment.id) 62 | break 63 | except Exception as e: 64 | print("Waiting 5 minutes: " + str(e)) 65 | time.sleep(300) 66 | continue 67 | 68 | @staticmethod 69 | def get_comment(stats: dict, website_id, message: str = ""): 70 | comment = message + " \n" if message else "" 71 | 72 | comment += RedditBot.format_stats(stats) 73 | 74 | comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)" 75 | comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)" 76 | comment += " | [Source](https://github.com/simon987) \n" 77 | comment += "*** \n" 78 | comment += RedditBot.bottom_line 79 | 80 | return comment 81 | 82 | @staticmethod 83 | def format_stats(stats): 84 | 85 | result = " \n" 86 | result += "File types | Count | Total Size\n" 87 | result += ":-- | :-- | :-- \n" 88 | counter = 0 89 | for mime in stats["ext_stats"]: 90 | result += mime[2] 91 | result += " | " + str(mime[1]) 92 | result += " | " + humanfriendly.format_size(mime[0]) + " \n" 93 | 94 | counter += 1 95 | if counter >= 3: 96 | break 97 | 98 | result += "**Total** | **" + str(stats["total_count"]) + "** | **" 99 | result += humanfriendly.format_size(stats["total_size"]) + "** \n\n" 100 | return result 101 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask_testing 3 | requests 4 | bs4 5 | validators 6 | Flask-Caching 7 | praw 8 | humanfriendly 9 | apscheduler 10 | bcrypt 11 | elasticsearch 12 | python-dateutil 13 | flask_httpauth 14 | ujson 15 | urllib3 16 | pyOpenSSL 17 | lxml 18 | pillow 19 | Wand 20 | numpy 21 | uwsgi 22 | redis 23 | psycopg2-binary 24 | lz4 
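
Below, a small usage sketch for the helpers defined in `od_util.py` above, similar to the validation flow that `mass_import.py` runs before queueing a website. The example URL is hypothetical, and `is_od()` / `get_top_directory()` perform real HTTP requests, so this is illustrative only.

```python
# Usage sketch for od_util.py helpers; the URL below is a hypothetical example.
import od_util

url = "http://example.com/pub/"  # hypothetical open directory (must end with "/")

if od_util.is_valid_url(url):
    # Walk up to the topmost directory that still looks like an open directory
    top = od_util.get_top_directory(url)
    # Heuristic check based on external link / <link> / <script> tag counts
    if od_util.is_od(top):
        print("Looks like an open directory:", top)

# Extension category helpers:
print(od_util.get_category("mkv"))   # -> "video"
print(od_util.get_color("video"))    # -> "bg-video"
```
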
-------------------------------------------------------------------------------- /search/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging import FileHandler 3 | 4 | logger = logging.getLogger("default") 5 | logger.setLevel(logging.DEBUG) 6 | 7 | formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s') 8 | file_handler = FileHandler("oddb.log") 9 | file_handler.setFormatter(formatter) 10 | logger.addHandler(file_handler) 11 | # logger.addHandler(StreamHandler(sys.stdout)) 12 | -------------------------------------------------------------------------------- /search/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | try: 4 | from fold_to_ascii.fold_to_ascii import mapping 5 | except: 6 | from ..fold_to_ascii.fold_to_ascii import mapping 7 | 8 | 9 | class SearchFilter: 10 | 11 | def __init__(self): 12 | 13 | self.blacklisted_terms = set() 14 | self.table = str.maketrans(dict(mapping.translate_table)) 15 | 16 | if os.path.exists("search_blacklist.txt"): 17 | with open("search_blacklist.txt") as f: 18 | self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip()) 19 | 20 | def should_block(self, query) -> bool: 21 | 22 | query = query.translate(self.table) 23 | query = query.lower() 24 | 25 | for raw_token in query.split(): 26 | 27 | token = raw_token.strip("\"'/\\").strip() 28 | if token in self.blacklisted_terms: 29 | return True 30 | 31 | return False 32 | -------------------------------------------------------------------------------- /search/search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from urllib.parse import urljoin 4 | 5 | import elasticsearch 6 | import ujson 7 | from apscheduler.schedulers.background import BackgroundScheduler 8 | from elasticsearch import helpers 9 | 10 | from search import logger 11 | from search.filter import SearchFilter 12 | 13 | 14 | class InvalidQueryException(Exception): 15 | pass 16 | 17 | 18 | class IndexingError(Exception): 19 | pass 20 | 21 | 22 | class ElasticSearchEngine: 23 | SORT_ORDERS = { 24 | "score": ["_score"], 25 | "size_asc": [{"size": {"order": "asc"}}], 26 | "size_dsc": [{"size": {"order": "desc"}}], 27 | "date_asc": [{"mtime": {"order": "asc"}}], 28 | "date_desc": [{"mtime": {"order": "desc"}}], 29 | "none": [] 30 | } 31 | 32 | def __init__(self, url, index_name): 33 | super().__init__() 34 | self.index_name = index_name 35 | logger.info("Connecting to ES @ %s" % url) 36 | self.es = elasticsearch.Elasticsearch(hosts=[url]) 37 | self.filter = SearchFilter() 38 | 39 | if not self.es.indices.exists(self.index_name): 40 | self.init() 41 | 42 | def start_stats_scheduler(self): 43 | scheduler = BackgroundScheduler() 44 | scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120) 45 | scheduler.start() 46 | 47 | def init(self): 48 | logger.info("Elasticsearch first time setup") 49 | if self.es.indices.exists(self.index_name): 50 | self.es.indices.delete(index=self.index_name) 51 | self.es.indices.create(index=self.index_name, body={ 52 | "settings": { 53 | "index": { 54 | "number_of_shards": 50, 55 | "number_of_replicas": 0, 56 | "refresh_interval": "30s", 57 | "codec": "best_compression" 58 | }, 59 | "analysis": { 60 | "analyzer": { 61 | "my_nGram": { 62 | "tokenizer": "my_nGram_tokenizer", 63 | "filter": ["lowercase", "asciifolding"] 64 | } 65 | }, 66 
| "tokenizer": { 67 | "my_nGram_tokenizer": { 68 | "type": "nGram", "min_gram": 3, "max_gram": 3 69 | } 70 | } 71 | } 72 | } 73 | }) 74 | 75 | # Index Mappings 76 | self.es.indices.put_mapping(body={ 77 | "properties": { 78 | "path": {"analyzer": "standard", "type": "text"}, 79 | "name": {"analyzer": "standard", "type": "text", 80 | "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}}, 81 | "mtime": {"type": "date", "format": "epoch_second"}, 82 | "size": {"type": "long"}, 83 | "website_id": {"type": "integer"}, 84 | "ext": {"type": "keyword"}, 85 | }, 86 | "_routing": {"required": True} 87 | }, doc_type="file", index=self.index_name, include_type_name=True) 88 | 89 | self.es.indices.open(index=self.index_name) 90 | 91 | def delete_docs(self, website_id): 92 | 93 | while True: 94 | try: 95 | logger.debug("Deleting docs of " + str(website_id)) 96 | 97 | to_delete = helpers.scan(query={ 98 | "query": { 99 | "term": { 100 | "website_id": website_id 101 | } 102 | } 103 | }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120, routing=website_id) 104 | 105 | buf = [] 106 | counter = 0 107 | for doc in to_delete: 108 | buf.append(doc) 109 | counter += 1 110 | 111 | if counter >= 10000: 112 | self._delete(buf, website_id) 113 | buf.clear() 114 | counter = 0 115 | if counter > 0: 116 | self._delete(buf, website_id) 117 | break 118 | 119 | except Exception as e: 120 | logger.error("During delete: " + str(e)) 121 | time.sleep(10) 122 | 123 | logger.debug("Done deleting for " + str(website_id)) 124 | 125 | def _delete(self, docs, website_id): 126 | bulk_string = self.create_bulk_delete_string(docs) 127 | result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30, 128 | routing=website_id) 129 | 130 | if result["errors"]: 131 | logger.error("Error in ES bulk delete: \n" + result["errors"]) 132 | raise IndexingError 133 | 134 | def import_json(self, in_lines, website_id: int): 135 | 136 | import_every = 10000 137 | cooldown_time = 0 138 | 139 | docs = [] 140 | 141 | for line in in_lines: 142 | try: 143 | doc = ujson.loads(line) 144 | name, ext = os.path.splitext(doc["name"]) 145 | doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else "" 146 | doc["name"] = name 147 | doc["website_id"] = website_id 148 | docs.append(doc) 149 | except Exception as e: 150 | logger.error("Error in import_json: " + str(e) + " for line : + \n" + line) 151 | 152 | if len(docs) >= import_every: 153 | self._index(docs) 154 | docs.clear() 155 | time.sleep(cooldown_time) 156 | 157 | if docs: 158 | self._index(docs) 159 | 160 | def _index(self, docs): 161 | while True: 162 | try: 163 | logger.debug("Indexing " + str(len(docs)) + " docs") 164 | bulk_string = ElasticSearchEngine.create_bulk_index_string(docs) 165 | self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30, 166 | routing=docs[0]["website_id"]) 167 | break 168 | except Exception as e: 169 | logger.error("Error in _index: " + str(e) + ", retrying") 170 | time.sleep(10) 171 | 172 | @staticmethod 173 | def create_bulk_index_string(docs: list): 174 | 175 | action_string = '{"index":{}}\n' 176 | return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs) 177 | 178 | @staticmethod 179 | def create_bulk_delete_string(docs: list): 180 | 181 | return "\n".join("".join(["{\"delete\":{\"_id\":\"", doc["_id"], "\"}}"]) for doc in docs) 182 | 183 | def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min, 
184 | date_max) -> {}: 185 | 186 | if self.filter.should_block(query): 187 | logger.info("Search was blocked") 188 | raise InvalidQueryException("One or more terms in your query is blocked by the search filter. " 189 | "This incident has been reported.") 190 | 191 | filters = [] 192 | if extensions: 193 | filters.append({"terms": {"ext": extensions}}) 194 | 195 | if size_min > 0 or size_max: 196 | size_filer = dict() 197 | new_filter = {"range": {"size": size_filer}} 198 | 199 | if size_min > 0: 200 | size_filer["gte"] = size_min 201 | if size_max: 202 | size_filer["lte"] = size_max 203 | 204 | filters.append(new_filter) 205 | 206 | if date_min > 0 or date_max: 207 | date_filer = dict() 208 | new_filter = {"range": {"mtime": date_filer}} 209 | 210 | if date_min > 0: 211 | date_filer["gte"] = date_min 212 | if date_max: 213 | date_filer["lte"] = date_max 214 | 215 | filters.append(new_filter) 216 | 217 | sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, []) 218 | 219 | page = self.es.search(body={ 220 | "query": { 221 | "bool": { 222 | "must": { 223 | "multi_match": { 224 | "query": query, 225 | "fields": fields, 226 | "operator": "or" if match_all else "and" 227 | } 228 | }, 229 | "filter": filters 230 | } 231 | }, 232 | "sort": sort_by, 233 | "highlight": { 234 | "fields": { 235 | "name": {"pre_tags": [""], "post_tags": [""]}, 236 | "name.nGram": {"pre_tags": [""], "post_tags": [""]}, 237 | "path": {"pre_tags": [""], "post_tags": [""]} 238 | } 239 | }, 240 | "size": per_page, "from": min(page * per_page, 10000 - per_page)}, 241 | index=self.index_name, request_timeout=20) 242 | 243 | return page 244 | 245 | def get_stats(self, website_id: int, subdir: str = None): 246 | 247 | result = self.es.search(body={ 248 | "query": { 249 | "constant_score": { 250 | "filter": { 251 | "term": {"website_id": website_id} 252 | } 253 | } 254 | }, 255 | "aggs": { 256 | "ext_group": { 257 | "terms": { 258 | "field": "ext", 259 | "size": 12 260 | }, 261 | "aggs": { 262 | "size": { 263 | "sum": { 264 | "field": "size" 265 | } 266 | } 267 | } 268 | }, 269 | "total_size": { 270 | "sum_bucket": { 271 | "buckets_path": "ext_group>size" 272 | } 273 | } 274 | }, 275 | "size": 0 276 | }, index=self.index_name, request_timeout=30, routing=website_id) 277 | 278 | stats = dict() 279 | stats["total_size"] = result["aggregations"]["total_size"]["value"] 280 | stats["total_count"] = result["hits"]["total"] 281 | stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"]) 282 | for b in result["aggregations"]["ext_group"]["buckets"]] 283 | 284 | return stats 285 | 286 | def get_link_list(self, website_id, base_url): 287 | 288 | hits = helpers.scan(client=self.es, 289 | query={ 290 | "_source": { 291 | "includes": ["path", "name", "ext"] 292 | }, 293 | "query": { 294 | "constant_score": { 295 | "filter": { 296 | "term": {"website_id": website_id} 297 | } 298 | } 299 | }, 300 | }, 301 | index=self.index_name, request_timeout=20, routing=website_id) 302 | for hit in hits: 303 | src = hit["_source"] 304 | yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \ 305 | ("." 
if src["ext"] != "" else "") + src["ext"] 306 | 307 | @staticmethod 308 | def get_global_stats(): 309 | 310 | if os.path.exists("_stats.json"): 311 | with open("_stats.json", "r") as f: 312 | return ujson.load(f) 313 | else: 314 | return None 315 | 316 | def _generate_global_stats(self): 317 | 318 | size_per_ext = self.es.search(body={ 319 | "query": { 320 | "bool": { 321 | "filter": [ 322 | {"range": { 323 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB 324 | }} 325 | ] 326 | } 327 | }, 328 | "aggs": { 329 | "ext_group": { 330 | "terms": { 331 | "field": "ext", 332 | "size": 40 333 | }, 334 | "aggs": { 335 | "size": { 336 | "sum": { 337 | "field": "size" 338 | } 339 | } 340 | } 341 | } 342 | }, 343 | "size": 0 344 | 345 | }, index=self.index_name, request_timeout=240) 346 | 347 | total_stats = self.es.search(body={ 348 | "query": { 349 | "bool": { 350 | "filter": [ 351 | {"range": { 352 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB 353 | }} 354 | ] 355 | } 356 | }, 357 | "aggs": { 358 | "file_stats": { 359 | "extended_stats": { 360 | "field": "size", 361 | "sigma": 1 362 | } 363 | } 364 | }, 365 | "size": 0 366 | 367 | }, index=self.index_name, request_timeout=241) 368 | 369 | size_and_date_histogram = self.es.search(body={ 370 | "query": { 371 | "bool": { 372 | "filter": [ 373 | {"range": { 374 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB 375 | }}, 376 | {"range": { 377 | "mtime": { 378 | "gt": 0 # 1970-01-01 379 | } 380 | }} 381 | ] 382 | } 383 | }, 384 | "aggs": { 385 | "sizes": { 386 | "histogram": { 387 | "field": "size", 388 | "interval": 100000000, # 100Mb 389 | "min_doc_count": 500 390 | } 391 | }, 392 | "dates": { 393 | "date_histogram": { 394 | "field": "mtime", 395 | "interval": "1y", 396 | "min_doc_count": 500, 397 | "format": "yyyy" 398 | } 399 | } 400 | }, 401 | "size": 0 402 | }, index=self.index_name, request_timeout=242) 403 | 404 | website_scatter = self.es.search(body={ 405 | "query": { 406 | "bool": { 407 | "filter": [ 408 | {"range": { 409 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB 410 | }} 411 | ] 412 | } 413 | }, 414 | "aggs": { 415 | "websites": { 416 | "terms": { 417 | "field": "website_id", 418 | "size": 600 # TODO: Figure out what size is appropriate 419 | }, 420 | "aggs": { 421 | "size": { 422 | "sum": { 423 | "field": "size" 424 | } 425 | } 426 | } 427 | } 428 | }, 429 | "size": 0 430 | }, index=self.index_name, request_timeout=243) 431 | 432 | es_stats = self.es.indices.stats(self.index_name, request_timeout=244) 433 | 434 | stats = dict() 435 | stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"] 436 | stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"] 437 | stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"] 438 | stats["es_search_time_avg"] = stats["es_search_time"] / ( 439 | stats["es_search_count"] if stats["es_search_count"] != 0 else 1) 440 | 441 | stats["total_count"] = total_stats["aggregations"]["file_stats"]["count"] 442 | stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"] 443 | stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"] 444 | stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"] 445 | stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"] 446 | stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"] 447 | 
stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"]) 448 | for b in size_per_ext["aggregations"]["ext_group"]["buckets"]] 449 | stats["sizes_histogram"] = [(b["key"], b["doc_count"]) 450 | for b in size_and_date_histogram["aggregations"]["sizes"]["buckets"]] 451 | stats["dates_histogram"] = [(b["key_as_string"], b["doc_count"]) 452 | for b in size_and_date_histogram["aggregations"]["dates"]["buckets"]] 453 | stats["website_scatter"] = [[b["key"], b["doc_count"], b["size"]["value"]] 454 | for b in website_scatter["aggregations"]["websites"]["buckets"]] 455 | stats["base_url"] = "entire database" 456 | 457 | with open("_stats.json", "w") as f: 458 | ujson.dump(stats, f) 459 | 460 | def stream_all_docs(self): 461 | return helpers.scan(query={ 462 | "query": { 463 | "match_all": {} 464 | } 465 | }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30) 466 | 467 | def refresh(self): 468 | self.es.indices.refresh(self.index_name) 469 | -------------------------------------------------------------------------------- /static/Hack-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/Hack-Regular.ttf -------------------------------------------------------------------------------- /static/css/fa-brands.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com 3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) 4 | */ 5 | @font-face { 6 | font-family: 'Font Awesome 5 Brands'; 7 | font-style: normal; 8 | font-weight: normal; 9 | src: url("../webfonts/fa-brands-400.eot"); 10 | src: url("../webfonts/fa-brands-400.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-brands-400.woff2") format("woff2"), url("../webfonts/fa-brands-400.woff") format("woff"), url("../webfonts/fa-brands-400.ttf") format("truetype"), url("../webfonts/fa-brands-400.svg#fontawesome") format("svg"); } 11 | 12 | .fab { 13 | font-family: 'Font Awesome 5 Brands'; } 14 | -------------------------------------------------------------------------------- /static/css/fa-brands.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com 3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) 4 | */ 5 | @font-face{font-family:Font Awesome\ 5 Brands;font-style:normal;font-weight:400;src:url(../webfonts/fa-brands-400.eot);src:url(../webfonts/fa-brands-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-brands-400.woff2) format("woff2"),url(../webfonts/fa-brands-400.woff) format("woff"),url(../webfonts/fa-brands-400.ttf) format("truetype"),url(../webfonts/fa-brands-400.svg#fontawesome) format("svg")}.fab{font-family:Font Awesome\ 5 Brands} -------------------------------------------------------------------------------- /static/css/fa-regular.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com 3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) 4 | */ 5 | @font-face { 6 | font-family: 'Font Awesome 5 Free'; 7 | font-style: normal; 8 | font-weight: 400; 9 | src: url("../webfonts/fa-regular-400.eot"); 10 | src: url("../webfonts/fa-regular-400.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-regular-400.woff2") format("woff2"), url("../webfonts/fa-regular-400.woff") format("woff"), url("../webfonts/fa-regular-400.ttf") format("truetype"), url("../webfonts/fa-regular-400.svg#fontawesome") format("svg"); } 11 | 12 | .far { 13 | font-family: 'Font Awesome 5 Free'; 14 | font-weight: 400; } 15 | -------------------------------------------------------------------------------- /static/css/fa-regular.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com 3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) 4 | */ 5 | @font-face{font-family:Font Awesome\ 5 Free;font-style:normal;font-weight:400;src:url(../webfonts/fa-regular-400.eot);src:url(../webfonts/fa-regular-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-regular-400.woff2) format("woff2"),url(../webfonts/fa-regular-400.woff) format("woff"),url(../webfonts/fa-regular-400.ttf) format("truetype"),url(../webfonts/fa-regular-400.svg#fontawesome) format("svg")}.far{font-family:Font Awesome\ 5 Free;font-weight:400} -------------------------------------------------------------------------------- /static/css/fa-solid.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com 3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) 4 | */ 5 | @font-face { 6 | font-family: 'Font Awesome 5 Free'; 7 | font-style: normal; 8 | font-weight: 900; 9 | src: url("../webfonts/fa-solid-900.eot"); 10 | src: url("../webfonts/fa-solid-900.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-solid-900.woff2") format("woff2"), url("../webfonts/fa-solid-900.woff") format("woff"), url("../webfonts/fa-solid-900.ttf") format("truetype"), url("../webfonts/fa-solid-900.svg#fontawesome") format("svg"); } 11 | 12 | .fa, 13 | .fas { 14 | font-family: 'Font Awesome 5 Free'; 15 | font-weight: 900; } 16 | -------------------------------------------------------------------------------- /static/css/fa-solid.min.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com 3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) 4 | */ 5 | @font-face{font-family:Font Awesome\ 5 Free;font-style:normal;font-weight:900;src:url(../webfonts/fa-solid-900.eot);src:url(../webfonts/fa-solid-900.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-solid-900.woff2) format("woff2"),url(../webfonts/fa-solid-900.woff) format("woff"),url(../webfonts/fa-solid-900.ttf) format("truetype"),url(../webfonts/fa-solid-900.svg#fontawesome) format("svg")}.fa,.fas{font-family:Font Awesome\ 5 Free;font-weight:900} -------------------------------------------------------------------------------- /static/css/ion.rangeSlider.css: -------------------------------------------------------------------------------- 1 | /* Ion.RangeSlider 2 | // css version 2.0.3 3 | // © 2013-2014 Denis Ineshin | IonDen.com 4 | // ===================================================================================================================*/ 5 | 6 | /* ===================================================================================================================== 7 | // RangeSlider */ 8 | 9 | .irs { 10 | position: relative; display: block; 11 | -webkit-touch-callout: none; 12 | -webkit-user-select: none; 13 | -khtml-user-select: none; 14 | -moz-user-select: none; 15 | -ms-user-select: none; 16 | user-select: none; 17 | } 18 | .irs-line { 19 | position: relative; display: block; 20 | overflow: hidden; 21 | outline: none !important; 22 | } 23 | .irs-line-left, .irs-line-mid, .irs-line-right { 24 | position: absolute; display: block; 25 | top: 0; 26 | } 27 | .irs-line-left { 28 | left: 0; width: 11%; 29 | } 30 | .irs-line-mid { 31 | left: 9%; width: 82%; 32 | } 33 | .irs-line-right { 34 | right: 0; width: 11%; 35 | } 36 | 37 | .irs-bar { 38 | position: absolute; display: block; 39 | left: 0; width: 0; 40 | } 41 | .irs-bar-edge { 42 | position: absolute; display: block; 43 | top: 0; left: 0; 44 | } 45 | 46 | .irs-shadow { 47 | position: absolute; display: none; 48 | left: 0; width: 0; 49 | } 50 | 51 | .irs-slider { 52 | position: absolute; display: block; 53 | cursor: default; 54 | z-index: 1; 55 | } 56 | .irs-slider.single { 57 | 58 | } 59 | .irs-slider.from { 60 | 61 | } 62 | .irs-slider.to { 63 | 64 | } 65 | .irs-slider.type_last { 66 | z-index: 2; 67 | } 68 | 69 | .irs-min { 70 | position: absolute; display: block; 71 | left: 0; 72 | cursor: default; 73 | } 74 | .irs-max { 75 | position: absolute; display: block; 76 | right: 0; 77 | cursor: default; 78 | } 79 | 80 | .irs-from, .irs-to, .irs-single { 81 | position: absolute; display: block; 82 | top: 0; left: 0; 83 | cursor: default; 84 | white-space: nowrap; 85 | } 86 | 87 | .irs-grid { 88 | position: absolute; display: none; 89 | bottom: 0; left: 0; 90 | width: 100%; height: 20px; 91 | } 92 | .irs-with-grid .irs-grid { 93 | display: block; 94 | } 95 | .irs-grid-pol { 96 | position: absolute; 97 | top: 0; left: 0; 98 | width: 1px; height: 8px; 99 | background: #000; 100 | } 101 | .irs-grid-pol.small { 102 | height: 4px; 103 | } 104 | .irs-grid-text { 105 | position: absolute; 106 | bottom: 0; left: 0; 107 | white-space: nowrap; 108 | text-align: center; 109 | font-size: 9px; line-height: 9px; 110 | padding: 0 3px; 111 | color: #000; 112 | } 113 | 114 | .irs-disable-mask { 115 | position: absolute; display: block; 116 | top: 0; left: -1%; 117 | width: 102%; height: 100%; 118 | cursor: default; 119 | background: rgba(0,0,0,0.0); 
120 | z-index: 2; 121 | } 122 | .irs-disabled { 123 | opacity: 0.4; 124 | } 125 | .lt-ie9 .irs-disabled { 126 | filter: alpha(opacity=40); 127 | } 128 | 129 | 130 | .irs-hidden-input { 131 | position: absolute !important; 132 | display: block !important; 133 | top: 0 !important; 134 | left: 0 !important; 135 | width: 0 !important; 136 | height: 0 !important; 137 | font-size: 0 !important; 138 | line-height: 0 !important; 139 | padding: 0 !important; 140 | margin: 0 !important; 141 | outline: none !important; 142 | z-index: -9999 !important; 143 | background: none !important; 144 | border-style: solid !important; 145 | border-color: transparent !important; 146 | } 147 | -------------------------------------------------------------------------------- /static/css/ion.rangeSlider.skinFlat.css: -------------------------------------------------------------------------------- 1 | /* Ion.RangeSlider, Flat UI Skin 2 | // css version 2.0.3 3 | // © Denis Ineshin, 2014 https://github.com/IonDen 4 | // ===================================================================================================================*/ 5 | 6 | /* ===================================================================================================================== 7 | // Skin details */ 8 | 9 | .irs-line-mid, 10 | .irs-line-left, 11 | .irs-line-right, 12 | .irs-bar, 13 | .irs-bar-edge, 14 | .irs-slider { 15 | background: url(../img/sprite-skin-flat.png) repeat-x; 16 | } 17 | 18 | .irs { 19 | height: 40px; 20 | } 21 | .irs-with-grid { 22 | height: 60px; 23 | } 24 | .irs-line { 25 | height: 12px; top: 25px; 26 | } 27 | .irs-line-left { 28 | height: 12px; 29 | background-position: 0 -30px; 30 | } 31 | .irs-line-mid { 32 | height: 12px; 33 | background-position: 0 0; 34 | } 35 | .irs-line-right { 36 | height: 12px; 37 | background-position: 100% -30px; 38 | } 39 | 40 | .irs-bar { 41 | height: 12px; top: 25px; 42 | background-position: 0 -60px; 43 | } 44 | .irs-bar-edge { 45 | top: 25px; 46 | height: 12px; width: 9px; 47 | background-position: 0 -90px; 48 | } 49 | 50 | .irs-shadow { 51 | height: 3px; top: 34px; 52 | background: #000; 53 | opacity: 0.25; 54 | } 55 | .lt-ie9 .irs-shadow { 56 | filter: alpha(opacity=25); 57 | } 58 | 59 | .irs-slider { 60 | width: 16px; height: 18px; 61 | top: 22px; 62 | background-position: 0 -120px; 63 | } 64 | .irs-slider.state_hover, .irs-slider:hover { 65 | background-position: 0 -150px; 66 | } 67 | 68 | .irs-min, .irs-max { 69 | color: #999; 70 | font-size: 10px; line-height: 1.333; 71 | text-shadow: none; 72 | top: 0; padding: 1px 3px; 73 | background: #e1e4e9; 74 | -moz-border-radius: 4px; 75 | border-radius: 4px; 76 | } 77 | 78 | .irs-from, .irs-to, .irs-single { 79 | color: #fff; 80 | font-size: 10px; line-height: 1.333; 81 | text-shadow: none; 82 | padding: 1px 5px; 83 | background: #dc7846; 84 | -moz-border-radius: 4px; 85 | border-radius: 4px; 86 | } 87 | .irs-from:after, .irs-to:after, .irs-single:after { 88 | position: absolute; display: block; content: ""; 89 | bottom: -6px; left: 50%; 90 | width: 0; height: 0; 91 | margin-left: -3px; 92 | overflow: hidden; 93 | border: 3px solid transparent; 94 | border-top-color: #dc7846; 95 | } 96 | 97 | 98 | .irs-grid-pol { 99 | background: #e1e4e9; 100 | } 101 | .irs-grid-text { 102 | color: #999; 103 | } 104 | -------------------------------------------------------------------------------- /static/css/main.css: -------------------------------------------------------------------------------- 1 | a { 2 | border-bottom: none !important; 3 | } 4 
| .card { 5 | margin-top: 1em; 6 | } 7 | .jumbotron { 8 | margin-top: 1em; 9 | } 10 | .list-group { 11 | margin-top: 1em; 12 | } 13 | .list-group-item { 14 | padding-bottom: 0.3rem; 15 | } 16 | .badge { 17 | padding-bottom: 0; 18 | } 19 | .table td { 20 | padding: 2px 0; 21 | } 22 | .td-numeric { 23 | text-align: end; 24 | padding-right: 1em; 25 | } 26 | 27 | .bg-application { 28 | background: #8FB847; 29 | color: #FFFFFF; 30 | } 31 | 32 | .bg-archive { 33 | background: #1fa32a; 34 | color: #FFFFFF; 35 | } 36 | 37 | .bg-audio { 38 | background: #009CD8; 39 | color: #FFFFFF; 40 | } 41 | 42 | .bg-video { 43 | background: #DC7D6C; 44 | color: #FFFFFF; 45 | } 46 | 47 | .bg-text { 48 | background: #E19A36; 49 | color: #FFFFFF; 50 | } 51 | 52 | .bg-image { 53 | background: #998AB5; 54 | color: #FFFFFF; 55 | } 56 | .vim-caret { 57 | -webkit-animation: vimCaret 1s linear infinite; 58 | -o-animation: vimCaret 1s linear infinite; 59 | animation: vimCaret 1s linear infinite; } 60 | 61 | .prev-img { 62 | width: 100%; 63 | max-width: 250px; 64 | height: 100%; 65 | } 66 | 67 | .prev-icon { 68 | cursor: pointer; 69 | } 70 | @-webkit-keyframes vimCaret { 71 | 0% { 72 | background-color: transparent; } 73 | 49% { 74 | background-color: transparent; } 75 | 50% { 76 | background-color: rgba(255, 255, 255, 0.6); } 77 | 100% { 78 | background-color: rgba(255, 255, 255, 0.6); } } 79 | 80 | @-o-keyframes vimCaret { 81 | 0% { 82 | background-color: transparent; } 83 | 49% { 84 | background-color: transparent; } 85 | 50% { 86 | background-color: rgba(255, 255, 255, 0.6); } 87 | 100% { 88 | background-color: rgba(255, 255, 255, 0.6); } } 89 | 90 | @keyframes vimCaret { 91 | 0% { 92 | background-color: transparent; } 93 | 49% { 94 | background-color: transparent; } 95 | 50% { 96 | background-color: rgba(255, 255, 255, 0.6); } 97 | 100% { 98 | background-color: rgba(255, 255, 255, 0.6); } } 99 | 100 | mark { 101 | background-color: rgba(255, 255, 0, 0.4); 102 | border-radius: 0; 103 | padding: 1px 0; 104 | } 105 | body { 106 | color: #BBBBBB; 107 | font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif; 108 | background-image: url(/static/img/bg.png); 109 | } 110 | 111 | .card { 112 | background-color: #36393e; 113 | border: 3px double #262626; 114 | } 115 | 116 | .navbar { 117 | background: #36393e; 118 | font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif; 119 | } 120 | 121 | .navbar-brand { 122 | border: none; 123 | } 124 | 125 | .nav-link { 126 | color: #616161; 127 | border-bottom: 2px solid #6c6c6c; 128 | } 129 | .navbar-toggler-icon { 130 | background-image: url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255,255,255, 0.6)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E"); 131 | } 132 | 133 | .active { 134 | border-color: #b3b3b3; 135 | color: #E6E6E6; 136 | } 137 | 138 | .nav-link:hover { 139 | color: #c7c7c7; 140 | } 141 | 142 | .jumbotron { 143 | background: #36393e; 144 | } 145 | 146 | a { 147 | color: #fff; 148 | border-bottom: 1px dotted #e0e0e0; 149 | } 150 | 151 | a:hover { 152 | color:#ddd; 153 | text-decoration: none; 154 | } 155 | 156 | .table a { 157 | border: none; 158 | } 159 | 160 | .table th, .table td { 161 | border-top: 1px solid #666a6e; 162 | } 163 | 164 | .table thead th { 165 | border-bottom: 2px solid #999da1; 166 | } 167 | .form-control { 168 | background-color: #2f3136; 169 | color: inherit; 170 | border: 1px solid #282b30; 171 | } 172 | 
173 | .form-control:focus { 174 | background-color: #2f3136; 175 | border-color: #80bdff; 176 | color: inherit; 177 | } 178 | 179 | .input-group-text { 180 | border: 1px solid #282b30; 181 | background-color: #686d75; 182 | color: #e9ecef; 183 | } 184 | 185 | .nav-tabs .nav-link { 186 | border-color: transparent; 187 | } 188 | 189 | .nav-tabs .nav-link.active { 190 | border-color: #8e9296 #8e9296; 191 | background-color: transparent; 192 | color: #E6E6E6; 193 | } 194 | 195 | .nav-tabs .nav-link:hover { 196 | border-color: #e9ecef #e9ecef transparent #e9ecef; 197 | } 198 | 199 | .card-header-tabs { 200 | border-bottom: 1px solid #a1a5a9; 201 | } 202 | 203 | * { 204 | outline: none; 205 | } 206 | 207 | #sizeSlider { 208 | width: 100%; 209 | } 210 | 211 | .irs-single, .irs-from, .irs-to { 212 | font-size: 13px; 213 | } 214 | 215 | .irs-slider { 216 | cursor: col-resize; 217 | } 218 | 219 | .custom-select { 220 | overflow: auto; 221 | } 222 | 223 | .irs { 224 | margin-bottom: 1em; 225 | } 226 | 227 | .github-banner { 228 | position: absolute; 229 | top: 0; 230 | right: 0; 231 | border: 0; 232 | } 233 | 234 | @media (max-width: 990px) { 235 | .github-banner { 236 | display: none; 237 | } 238 | } -------------------------------------------------------------------------------- /static/downloads/README.md: -------------------------------------------------------------------------------- 1 | CSV exports of the database will be available here. -------------------------------------------------------------------------------- /static/img/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/bg.png -------------------------------------------------------------------------------- /static/img/forkme_right_white_ffffff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/forkme_right_white_ffffff.png -------------------------------------------------------------------------------- /static/img/sprite-skin-flat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/sprite-skin-flat.png -------------------------------------------------------------------------------- /static/js/popper.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) Federico Zivolo 2018 3 | Distributed under the MIT License (license terms are at http://opensource.org/licenses/MIT). 
4 | */(function(e,t){'object'==typeof exports&&'undefined'!=typeof module?module.exports=t():'function'==typeof define&&define.amd?define(t):e.Popper=t()})(this,function(){'use strict';function e(e){return e&&'[object Function]'==={}.toString.call(e)}function t(e,t){if(1!==e.nodeType)return[];var o=getComputedStyle(e,null);return t?o[t]:o}function o(e){return'HTML'===e.nodeName?e:e.parentNode||e.host}function n(e){if(!e)return document.body;switch(e.nodeName){case'HTML':case'BODY':return e.ownerDocument.body;case'#document':return e.body;}var i=t(e),r=i.overflow,p=i.overflowX,s=i.overflowY;return /(auto|scroll|overlay)/.test(r+s+p)?e:n(o(e))}function r(e){if(!e)return document.documentElement;for(var o=ie(10)?document.body:null,n=e.offsetParent;n===o&&e.nextElementSibling;)n=(e=e.nextElementSibling).offsetParent;var i=n&&n.nodeName;return i&&'BODY'!==i&&'HTML'!==i?-1!==['TD','TABLE'].indexOf(n.nodeName)&&'static'===t(n,'position')?r(n):n:e?e.ownerDocument.documentElement:document.documentElement}function p(e){var t=e.nodeName;return'BODY'!==t&&('HTML'===t||r(e.firstElementChild)===e)}function s(e){return null===e.parentNode?e:s(e.parentNode)}function d(e,t){if(!e||!e.nodeType||!t||!t.nodeType)return document.documentElement;var o=e.compareDocumentPosition(t)&Node.DOCUMENT_POSITION_FOLLOWING,n=o?e:t,i=o?t:e,a=document.createRange();a.setStart(n,0),a.setEnd(i,0);var l=a.commonAncestorContainer;if(e!==l&&t!==l||n.contains(i))return p(l)?l:r(l);var f=s(e);return f.host?d(f.host,t):d(e,s(t).host)}function a(e){var t=1=o.clientWidth&&n>=o.clientHeight}),l=0n[e]&&!t.escapeWithReference&&(i=X(p[o],n[e]-('right'===e?p.width:p.height))),se({},o,i)}};return i.forEach(function(e){var t=-1===['left','top'].indexOf(e)?'secondary':'primary';p=de({},p,s[t](e))}),e.offsets.popper=p,e},priority:['left','right','top','bottom'],padding:5,boundariesElement:'scrollParent'},keepTogether:{order:400,enabled:!0,fn:function(e){var t=e.offsets,o=t.popper,n=t.reference,i=e.placement.split('-')[0],r=J,p=-1!==['top','bottom'].indexOf(i),s=p?'right':'bottom',d=p?'left':'top',a=p?'width':'height';return o[s]r(n[s])&&(e.offsets.popper[d]=r(n[s])),e}},arrow:{order:500,enabled:!0,fn:function(e,o){var n;if(!q(e.instance.modifiers,'arrow','keepTogether'))return e;var i=o.element;if('string'==typeof i){if(i=e.instance.popper.querySelector(i),!i)return e;}else if(!e.instance.popper.contains(i))return console.warn('WARNING: `arrow.element` must be child of its popper element!'),e;var r=e.placement.split('-')[0],p=e.offsets,s=p.popper,d=p.reference,a=-1!==['left','right'].indexOf(r),l=a?'height':'width',f=a?'Top':'Left',m=f.toLowerCase(),h=a?'left':'top',g=a?'bottom':'right',u=L(i)[l];d[g]-us[g]&&(e.offsets.popper[m]+=d[m]+u-s[g]),e.offsets.popper=c(e.offsets.popper);var b=d[m]+d[l]/2-u/2,y=t(e.instance.popper),w=parseFloat(y['margin'+f],10),E=parseFloat(y['border'+f+'Width'],10),v=b-e.offsets.popper[m]-w-E;return v=Q(X(s[l]-u,v),0),e.arrowElement=i,e.offsets.arrow=(n={},se(n,m,Math.round(v)),se(n,h,''),n),e},element:'[x-arrow]'},flip:{order:600,enabled:!0,fn:function(e,t){if(P(e.instance.modifiers,'inner'))return e;if(e.flipped&&e.placement===e.originalPlacement)return e;var o=E(e.instance.popper,e.instance.reference,t.padding,t.boundariesElement,e.positionFixed),n=e.placement.split('-')[0],i=S(n),r=e.placement.split('-')[1]||'',p=[];switch(t.behavior){case fe.FLIP:p=[n,i];break;case fe.CLOCKWISE:p=V(n);break;case fe.COUNTERCLOCKWISE:p=V(n,!0);break;default:p=t.behavior;}return 
p.forEach(function(s,d){if(n!==s||p.length===d+1)return e;n=e.placement.split('-')[0],i=S(n);var a=e.offsets.popper,l=e.offsets.reference,f=J,m='left'===n&&f(a.right)>f(l.left)||'right'===n&&f(a.left)f(l.top)||'bottom'===n&&f(a.top)f(o.right),g=f(a.top)f(o.bottom),b='left'===n&&h||'right'===n&&c||'top'===n&&g||'bottom'===n&&u,y=-1!==['top','bottom'].indexOf(n),w=!!t.flipVariations&&(y&&'start'===r&&h||y&&'end'===r&&c||!y&&'start'===r&&g||!y&&'end'===r&&u);(m||b||w)&&(e.flipped=!0,(m||b)&&(n=p[d+1]),w&&(r=K(r)),e.placement=n+(r?'-'+r:''),e.offsets.popper=de({},e.offsets.popper,T(e.instance.popper,e.offsets.reference,e.placement)),e=N(e.instance.modifiers,e,'flip'))}),e},behavior:'flip',padding:5,boundariesElement:'viewport'},inner:{order:700,enabled:!1,fn:function(e){var t=e.placement,o=t.split('-')[0],n=e.offsets,i=n.popper,r=n.reference,p=-1!==['left','right'].indexOf(o),s=-1===['top','left'].indexOf(o);return i[p?'left':'top']=r[o]-(s?i[p?'width':'height']:0),e.placement=S(t),e.offsets.popper=c(i),e}},hide:{order:800,enabled:!0,fn:function(e){if(!q(e.instance.modifiers,'hide','preventOverflow'))return e;var t=e.offsets.reference,o=D(e.instance.modifiers,function(e){return'preventOverflow'===e.name}).boundaries;if(t.bottomo.right||t.top>o.bottom||t.right= thresh && u < units.length - 1); 423 | 424 | return bytes.toFixed(1) + ' ' + units[u]; 425 | } -------------------------------------------------------------------------------- /static/webfonts/fa-brands-400.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.eot -------------------------------------------------------------------------------- /static/webfonts/fa-brands-400.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.ttf -------------------------------------------------------------------------------- /static/webfonts/fa-brands-400.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.woff -------------------------------------------------------------------------------- /static/webfonts/fa-brands-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.woff2 -------------------------------------------------------------------------------- /static/webfonts/fa-regular-400.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.eot -------------------------------------------------------------------------------- /static/webfonts/fa-regular-400.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.ttf -------------------------------------------------------------------------------- /static/webfonts/fa-regular-400.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.woff -------------------------------------------------------------------------------- /static/webfonts/fa-regular-400.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.woff2 -------------------------------------------------------------------------------- /static/webfonts/fa-solid-900.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.eot -------------------------------------------------------------------------------- /static/webfonts/fa-solid-900.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.ttf -------------------------------------------------------------------------------- /static/webfonts/fa-solid-900.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.woff -------------------------------------------------------------------------------- /static/webfonts/fa-solid-900.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.woff2 -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import time 5 | import traceback 6 | from multiprocessing.pool import ThreadPool 7 | from tempfile import NamedTemporaryFile 8 | from threading import Thread 9 | from uuid import uuid4 10 | 11 | import requests 12 | import urllib3 13 | 14 | import config 15 | import database 16 | from database import Website 17 | from search.search import ElasticSearchEngine 18 | from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker 19 | from ws_bucket_client.api import WsBucketApi 20 | 21 | urllib3.disable_warnings() 22 | 23 | logger = logging.getLogger("default") 24 | 25 | 26 | class Task: 27 | 28 | def __init__(self, website_id: int, url: str, priority: int = 1, 29 | callback_type: str = None, callback_args: str = None, 30 | upload_token: str = None): 31 | self.website_id = website_id 32 | self.url = url 33 | self.priority = priority 34 | self.callback_type = callback_type 35 | self.callback_args = json.loads(callback_args) if callback_args else {} 36 | self.upload_token = upload_token 37 | 38 | def to_json(self): 39 | return { 40 | "website_id": self.website_id, 41 | "url": self.url, 42 | "callback_type": self.callback_type, 43 | "callback_args": json.dumps(self.callback_args), 44 | "upload_token": self.upload_token 45 | } 46 | 47 | def __str__(self): 48 | return json.dumps(self.to_json()) 49 | 50 | def __repr__(self): 51 | return self.__str__() 52 | 53 | 54 | class IndexingTask: 55 | 56 | def __init__(self, website_id: int, file_path: str, callback_type: str, callback_args): 57 | self.website_id = website_id 58 | 
self.file_path = file_path
59 |         self.callback_type = callback_type
60 |         self.callback_args = callback_args
61 | 
62 | 
63 | class TaskManager:
64 | 
65 |     def __init__(self):
66 |         self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
67 |         self.db = database.Database(config.DB_CONN_STR)
68 |         self.tracker = TaskTrackerApi(config.TT_API)
69 | 
70 |         self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
71 |         self._indexer_threads = list()
72 | 
73 |         self.worker = Worker.from_file(self.tracker)
74 |         if not self.worker:
75 |             self.worker = self.tracker.make_worker("$oddb_master")
76 |             if not self.worker:
77 |                 print("Could not create worker: %s" % traceback.format_exc())
78 |                 return
79 |             self.worker.dump_to_file()
80 |             self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
81 |             self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
82 | 
83 |     def start_indexer_threads(self):
84 |         logger.info("Starting %s indexer threads" % (config.INDEXER_THREADS, ))
85 |         for _ in range(config.INDEXER_THREADS):
86 |             t = Thread(target=self._do_indexing)
87 |             t.setDaemon(True)
88 |             self._indexer_threads.append(t)
89 |             t.start()
90 | 
91 |     def _do_indexing(self):
92 | 
93 |         while True:
94 |             task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)
95 | 
96 |             if task:
97 |                 try:
98 |                     recipe = task.json_recipe()
99 |                     logger.debug("Got indexing task: " + str(recipe))
100 | 
101 |                     filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"])
102 | 
103 |                     self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
104 |                 except Exception as e:
105 |                     logger.error("Error during indexing: " + str(e))
106 |                     self.worker.release_task(task_id=task.id, result=1, verification=0)
107 |                 else:
108 |                     # Only report success to task_tracker when indexing completed without errors
109 |                     self.worker.release_task(task_id=task.id, result=0, verification=0)
110 | 
111 |             else:
112 |                 time.sleep(5)
113 | 
114 |     def _complete_task(self, file_list, task):
115 | 
116 |         self.search.delete_docs(task.website_id)
117 | 
118 |         if file_list:
119 |             def iter_lines():
120 |                 with open(file_list, "r") as f:
121 |                     line = f.readline()
122 |                     while line:
123 |                         yield line
124 |                         line = f.readline()
125 | 
126 |             self.search.import_json(iter_lines(), task.website_id)
127 |             os.remove(file_list)
128 | 
129 |         self.db.update_website_date_if_exists(task.website_id)
130 | 
131 |     def do_recrawl(self):
132 |         logger.debug("Creating re-crawl tasks")
133 |         self._generate_crawling_tasks()
134 | 
135 |     def _generate_crawling_tasks(self):
136 | 
137 |         # TODO: Insert more in-depth re-crawl logic here
138 |         websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
139 | 
140 |         def recrawl(website: Website):
141 |             crawl_task = Task(website.id, website.url,
142 |                               priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
143 |             self.queue_task(crawl_task)
144 | 
145 |         pool = ThreadPool(processes=30)
146 |         pool.map(func=recrawl, iterable=websites_to_crawl)
147 |         pool.close()
148 | 
149 |     def queue_task(self, task: Task):
150 |         max_assign_time = 24 * 4 * 3600
151 |         upload_token = str(uuid4())
152 | 
153 |         task.upload_token = upload_token
154 |         tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
155 |                                                    recipe=task.__str__(),
156 |                                                    priority=task.priority,
157 |                                                    max_assign_time=max_assign_time,
158 |                                                    hash64=task.website_id,
159 |                                                    verification_count=1,
160 |                                                    max_retries=3
161 |                                                    )
162 |         logger.debug(tracker_response.text)
163 |         logging.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
164 |         if not tracker_response.json()["ok"]:
165 | return 166 | 167 | bucket_response = self.bucket.allocate(upload_token.__str__(), 168 | 21474837499, # 20Gib 169 | format_file_name(task.website_id, upload_token), 170 | to_dispose_date=int(time.time() + max_assign_time), 171 | upload_hook="") 172 | logging.info("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text)) 173 | 174 | 175 | def format_file_name(website_id, token): 176 | return "%d_%s.NDJSON" % (website_id, token,) 177 | 178 | 179 | def download_file(url): 180 | r = requests.get(url, stream=True,) 181 | 182 | if r.status_code != 200: 183 | raise ValueError("HTTP error %d: %s" % (r.status_code, url)) 184 | 185 | tmp = NamedTemporaryFile(delete=False) 186 | for chunk in r.iter_content(chunk_size=4096): 187 | if chunk: 188 | tmp.write(chunk) 189 | tmp.close() 190 | 191 | return tmp.name 192 | -------------------------------------------------------------------------------- /template_filters.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | 4 | import od_util 5 | 6 | 7 | def setup_template_filters(app): 8 | 9 | app.jinja_env.globals.update(truncate_path=od_util.truncate_path) 10 | app.jinja_env.globals.update(get_color=od_util.get_color) 11 | app.jinja_env.globals.update(get_mime=od_util.get_category) 12 | 13 | @app.template_filter("date_format") 14 | def date_format(value, format='%Y-%m-%d'): 15 | return time.strftime(format, time.gmtime(value)) 16 | 17 | @app.template_filter("datetime_format") 18 | def datetime_format(value, format='%Y-%m-%d %H:%M:%S'): 19 | return time.strftime(format, time.gmtime(value)) 20 | 21 | @app.template_filter("duration_format") 22 | def duration_format(value): 23 | delay = datetime.timedelta(seconds=value) 24 | if delay.days > 0: 25 | out = str(delay).replace(" days, ", ":") 26 | else: 27 | out = str(delay) 28 | out_ar = out.split(':') 29 | out_ar = ["%02d" % (int(float(x))) for x in out_ar] 30 | out = ":".join(out_ar) 31 | return out 32 | 33 | @app.template_filter("from_timestamp") 34 | def from_timestamp(value): 35 | return datetime.datetime.fromtimestamp(value) 36 | -------------------------------------------------------------------------------- /templates/admin.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "Admin login - OD-Database" %} 3 | 4 | {% block body %} 5 |
6 |
7 |
Admin login
8 |
9 |
10 | 11 |
12 | 13 |
14 |
15 | 16 |
17 | 18 | {% if show_captcha %} 19 | {{ captcha.get_code()|safe }} 20 | {% endif %} 21 | 22 | 23 | 24 |
25 |
26 |
27 |
28 | {% endblock body %} 29 | -------------------------------------------------------------------------------- /templates/contribute.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set current_page = "contribute" %} 3 | 4 | {% block body %} 5 |
6 |
7 |
How to contribute
8 |
9 |

Fork on GitHub or create an issue

10 |

Or submit a website

11 |

You can also contact me on Reddit

12 |
13 |
14 |
15 | {% endblock body %} 16 | -------------------------------------------------------------------------------- /templates/dashboard.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "Dashboard - OD-Database" %} 3 | 4 | {% block body %} 5 |
6 |
7 |
Dashboard
8 |
9 | 10 |

API Keys

11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | {% for token in api_tokens %} 22 | 23 | 24 | 25 | 31 | 32 | {% endfor %} 33 | 34 |
NameTokenAction
{{ token.name }}{{ token.token }} 26 |
27 | 28 | 29 |
30 |
35 |
36 |
37 |
38 | 39 |
40 |
41 | 42 |
43 |
44 |
45 | 46 |
47 |
48 |

Blacklist

49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | {% for item in blacklist %} 58 | 59 | 60 | 61 | 62 | {% endfor %} 63 | 64 |
NetlocAction
{{ item.netloc }}Delete
65 |
66 |
67 |
68 | 69 |
70 |
71 | 72 |
73 |
74 |
75 | 76 |
77 |
78 | Logout 79 |
80 |
81 |
82 | {% endblock body %} 83 | -------------------------------------------------------------------------------- /templates/downloads.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "Downloads - OD-Database" %} 3 | {% set current_page = "dl" %} 4 | 5 | {% block body %} 6 |
7 |
8 |
Downloads
9 |
10 | 11 |

Please let me know if you used the database in a project!

12 |

The entire database is exported to CSV regularly

13 | 14 | {% if not export_file_stats %} 15 |
16 |

No files available.

17 | {% else %} 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | {% for name, path, stat in export_file_stats %} 30 | 31 | 32 | 33 | 34 | 35 | {% endfor %} 36 | 37 | 38 |
DescriptionSizeDate
{{ name }}{{ stat.st_size |filesizeformat }}{{ stat.st_mtime|datetime_format }} UTC
39 | {% endif %} 40 |
41 |
42 |
43 | {% endblock body %} 44 | -------------------------------------------------------------------------------- /templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set current_page = "home" %} 3 | {% set title = "OD-Database - Home" %} 4 | 5 | {% block body %} 6 | 7 |
8 |
9 |

OD-Database

10 | 11 | {% if stats and stats["total_size"] %} 12 |

{{ stats["total_count"] }} files totalling 13 | ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites

14 | {% else %} 15 |

We are currently experiencing a high volume of traffic. The search function 16 | may be unresponsive.

17 | {% endif %} 18 |

19 |
20 |
21 |
22 | 23 |
24 |
Search
25 |
26 |
27 | 28 |
29 | 30 | 31 |
32 | {% if show_captcha %} 33 | {{ captcha.get_code()|safe }} 34 | {% endif %} 35 |
36 |
37 |
38 | 39 |
40 |
About
41 |
42 |

Web frontend and backend by simon987, 43 | HTTP crawler by terorie, 44 | hosting provided by The eye 45 |

46 |
47 |
48 | 49 |
50 | 51 | 52 | Fork me on GitHub 53 | 54 | {% endblock body %} 55 | -------------------------------------------------------------------------------- /templates/layout.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 40 | 41 | 42 | {% block alert_messages %} 43 | 44 | 45 | {% with messages = get_flashed_messages(with_categories=true) %} 46 | {% if messages %} 47 |
48 | {% for category, message in messages %} 49 |
50 | × 51 | {{ message | safe }} 52 |
53 | {% endfor %} 54 |
55 | {% endif %} 56 | {% endwith %} 57 | {% endblock %} 58 | 59 | {% block body %} 60 | {% endblock body %} 61 | 62 | 63 | -------------------------------------------------------------------------------- /templates/search.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set current_page = "search" %} 3 | 4 | {% set title = "OD-Database - Search" %} 5 | 6 | {% block body %} 7 |
8 | 9 |
10 |
Search
11 |
12 |
13 | 14 |
15 | 16 | {# Query #} 17 |
18 |
19 |
20 | 21 | 23 |
24 |
25 | 26 | 27 |
28 |
29 | {# Size #} 30 |
File size
31 | 32 | 33 | 34 | {# Date #} 35 |
File date
36 | 37 | 38 | 39 | 40 |
41 | {# File extension #} 42 |
43 |
File extension
44 |
45 |
46 |
.
47 |
48 | 50 |
51 |
52 | {# Fields #} 53 |
54 |
Search in
55 |
56 | 58 | 59 |
60 |
61 | 63 | 64 |
65 |
66 | 68 | 69 |
70 |
71 |
72 | 73 |
Display options
74 |
75 | 76 |
77 | {# Sort order #} 78 | 98 |
99 | {# Results per page #} 100 |
101 | 107 |
108 | 109 | 110 | {# Search button #} 111 |
112 | 113 | 115 |
116 |
117 | {% if show_captcha %} 118 | {{ captcha.get_code()|safe }} 119 | {% endif %} 120 | 121 |
122 |
123 |
124 | 125 | {% if count > 0 %} 126 |
127 |
128 | 129 | {{ count }} result(s) in {{ results["took"] }}ms 130 | 131 |
132 | 133 | 134 | 135 | {% for hit in results["hits"]["hits"] %} 136 | {% set src = hit["_source"] %} 137 | {% if "name" in hit["highlight"] %} 138 | {% set hl_name = hit["highlight"]["name"][0] %} 139 | {% elif "name.nGram" in hit["highlight"] %} 140 | {% set hl_name = hit["highlight"]["name.nGram"][0] %} 141 | {% else %} 142 | {% set hl_name = src["name"] %} 143 | {% endif %} 144 | 145 | {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %} 146 | 147 | 148 | 170 | {# File size & date #} 171 | 175 | 176 | {% endfor %} 177 | 178 |
149 | {% set category = get_mime(src["ext"]) %} 150 | {% set url = src["website_url"] + "/" + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] %} 151 | {# Preview #} 152 | {% if category == "image" %} 153 | 155 | {% endif %} 156 | {# File name & link #} 157 | {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }} 158 | {# File type badge #} 159 | {% if category %} 160 | 161 | {{ src["ext"] }} 162 | 163 | {% endif %} 164 | {# File path #} 165 |
166 | {{ src["website_url"] }}/{{ hl_path|safe }} 168 |
169 |
172 |
{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}
173 | {{ src["mtime"] | date_format }} 174 |
179 |
180 | {% if count > (p + 1) * per_page %} 181 | 182 | {% endif %} 183 | {% if p > 0 %} 184 | 185 | {% endif %} 186 | 187 |
188 |
189 | {% else %} 190 |
191 |
192 |

No results.

193 |

For better results:

194 |
    195 |
  • Try checking the 'Match any word' box for a broader search.
  • 196 |
  • Make sure you don't include the file extension in your query (Use the appropriate field to 197 | filter file types) 198 |
  • 199 |
  • If you're searching for files in a particular website, use the website 200 | search page
  • 201 |
202 |
203 | 204 |
205 | {% endif %} 206 | 207 | 208 | 209 | 284 |
285 | 286 | 287 | {% endblock body %} 288 | -------------------------------------------------------------------------------- /templates/stats.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "Stats - OD-Database" %} 3 | {% set current_page = "stats" %} 4 | 5 | {% block body %} 6 |
7 | 8 |
9 |
Statistics
10 |
11 | 12 |
13 |

Calculating...

14 | 15 |
16 |
17 | 18 |
19 |
20 | 21 |
22 |
23 | 24 |
25 | 26 | 27 |

Database stats

28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |
Database index size
Query count
Total query time
Average time per query
Total file count
Size total
Size average
Size standard deviation
Size standard deviation bounds (σ = 1)
Size variance
72 |
73 |
74 | 75 |
76 | 77 | 78 | 98 | {% endblock body %} 99 | -------------------------------------------------------------------------------- /templates/submit.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "OD-Database - Submit website" %} 3 | {% set current_page = "submit" %} 4 | 5 | {% block body %} 6 |
7 |
8 |
9 | 19 |
20 |
21 | 22 |
23 |
24 | {# Single website #} 25 |
26 |
27 | 28 |
29 | {% if show_captcha %} 30 |
31 | {{ captcha.get_code()|safe }} 32 |
33 | {% endif %} 34 | 35 |
36 | 37 |
38 |
39 | {# Bulk #} 40 |
41 |
42 | 44 |
45 | {% if show_captcha %} 46 |
47 | {{ captcha.get_code()|safe }} 48 |
49 | {% endif %} 50 | 51 | 53 |
54 |
55 |
56 | 57 | 58 |
59 | 60 |

By submitting this form you agree that your IP address and User Agent will be 61 | saved (for debugging purposes only). 62 |

63 |
64 |
65 |
66 | {% endblock body %} 67 | -------------------------------------------------------------------------------- /templates/website.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "OD-Database - Website details" %} 3 | {% set current_page = "website" %} 4 | 5 | {% block body %} 6 |
7 |
8 |
Information for {{ website.url | truncate(80) }}
9 |
10 | 11 |
12 |

Calculating...

13 | 14 | 15 | 16 |
17 | 18 |
19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 |
Base url
File count
Total size
Last updated
40 |
41 | 42 |
43 | Link list 44 | Summary (JSON) 45 | {% if "username" in session %} 46 | 47 | Clear 48 | 49 | Delete 50 | 51 | rescan 52 | {% endif %} 53 |
54 |
55 |
56 | 73 | {% endblock body %} 74 | -------------------------------------------------------------------------------- /templates/websites.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | {% set title = "OD-Database - Websites" %} 3 | {% set current_page = "website" %} 4 | 5 | 6 | {% block body %} 7 |
8 |
9 |
Go to website
10 |
11 | 12 | Go to random website 13 |
14 |

Website search

15 |
16 |
17 |
18 | 19 |
20 |
21 | 22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
Websites
31 |
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | {% for website in websites %} 41 | 42 | 43 | 44 | 45 | {% endfor %} 46 |
UrlLast crawled
{{ website[1] | truncate(70) }}{{ website[2] }}
47 | {% if websites|length == per_page %} 48 | Next 49 | {% endif %} 50 | {% if p > 0 %} 51 | Previous 52 | {% endif %} 53 |
54 |
55 |
56 | {% endblock body %} 57 | -------------------------------------------------------------------------------- /tt_config.yml: -------------------------------------------------------------------------------- 1 | server: 2 | address: "0.0.0.0:3010" 3 | 4 | database: 5 | conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable" 6 | log_levels: ["error", "info", "warn"] 7 | 8 | git: 9 | webhook_hash: "sha256" 10 | webhook_sig_header: "X-Gogs-Signature" 11 | 12 | log: 13 | level: "trace" 14 | 15 | session: 16 | cookie_name: "tt" 17 | expiration: "48h" 18 | 19 | monitoring: 20 | snapshot_interval: "120s" 21 | history_length: "1800h" 22 | 23 | maintenance: 24 | reset_timed_out_tasks_interval: "10m" 25 | -------------------------------------------------------------------------------- /uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | module = main 3 | callable = app 4 | 5 | enable-threads = true 6 | processes = 4 7 | threads = 16 8 | 9 | disable-logging = True -------------------------------------------------------------------------------- /views.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from multiprocessing.pool import Pool 4 | from urllib.parse import urlparse 5 | 6 | from flask import render_template, redirect, request, flash, abort, Response, session 7 | from flask_caching import Cache 8 | 9 | import captcha 10 | import config 11 | import od_util 12 | from common import db, taskManager, searchEngine, logger, require_role 13 | from database import Website 14 | from search.search import InvalidQueryException 15 | from tasks import Task 16 | 17 | 18 | def setup_views(app): 19 | cache = Cache(app, config={ 20 | "CACHE_TYPE": "redis", 21 | "CACHE_REDIS_HOST": config.REDIS_HOST, 22 | "CACHE_REDIS_PORT": config.REDIS_PORT, 23 | }) 24 | 25 | @app.route("/dl") 26 | @cache.cached(120) 27 | def downloads(): 28 | # Get content of downloads directory 29 | dl_dir = "static/downloads/" 30 | dir_content = os.listdir(dl_dir) 31 | 32 | # Make paths relative to working directory 33 | # Only allow csv files 34 | files = [ 35 | (name, os.path.join(dl_dir, name)) 36 | for name in dir_content 37 | if name.find(".csv") != -1 38 | ] 39 | 40 | # Stat files 41 | # Remove any dirs placed accidentally 42 | files = [ 43 | (f, full, os.stat(full)) 44 | for f, full in files 45 | if os.path.isfile(full) 46 | ] 47 | 48 | if len(files) == 0: 49 | logger.warning("No export file to display in /dl") 50 | 51 | return render_template("downloads.html", export_file_stats=files) 52 | 53 | @app.route("/stats") 54 | @cache.cached(120) 55 | def stats_page(): 56 | return render_template("stats.html") 57 | 58 | @app.route("/stats/json_chart") 59 | @cache.cached(240) 60 | def stats_json(): 61 | stats = searchEngine.get_global_stats() 62 | if stats: 63 | db.join_website_on_stats(stats) 64 | return Response(json.dumps(stats), mimetype="application/json") 65 | return abort(500) 66 | 67 | @app.route("/website//") 68 | def website_info(website_id): 69 | website = db.get_website_by_id(website_id) 70 | 71 | if website: 72 | return render_template("website.html", website=website) 73 | else: 74 | abort(404) 75 | 76 | @app.route("/website//json_chart") 77 | @cache.memoize(60) 78 | def website_json_chart(website_id): 79 | website = db.get_website_by_id(website_id) 80 | 81 | if website: 82 | stats = searchEngine.get_stats(website_id) 83 | stats["base_url"] = website.url 84 | stats["report_time"] = 
/views.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from multiprocessing.pool import Pool
4 | from urllib.parse import urlparse
5 |
6 | from flask import render_template, redirect, request, flash, abort, Response, session
7 | from flask_caching import Cache
8 |
9 | import captcha
10 | import config
11 | import od_util
12 | from common import db, taskManager, searchEngine, logger, require_role
13 | from database import Website
14 | from search.search import InvalidQueryException
15 | from tasks import Task
16 |
17 |
18 | def setup_views(app):
19 |     cache = Cache(app, config={
20 |         "CACHE_TYPE": "redis",
21 |         "CACHE_REDIS_HOST": config.REDIS_HOST,
22 |         "CACHE_REDIS_PORT": config.REDIS_PORT,
23 |     })
24 |
25 |     @app.route("/dl")
26 |     @cache.cached(120)
27 |     def downloads():
28 |         # Get content of downloads directory
29 |         dl_dir = "static/downloads/"
30 |         dir_content = os.listdir(dl_dir)
31 |
32 |         # Make paths relative to working directory
33 |         # Only allow csv files
34 |         files = [
35 |             (name, os.path.join(dl_dir, name))
36 |             for name in dir_content
37 |             if name.find(".csv") != -1
38 |         ]
39 |
40 |         # Stat files
41 |         # Remove any dirs placed accidentally
42 |         files = [
43 |             (f, full, os.stat(full))
44 |             for f, full in files
45 |             if os.path.isfile(full)
46 |         ]
47 |
48 |         if len(files) == 0:
49 |             logger.warning("No export file to display in /dl")
50 |
51 |         return render_template("downloads.html", export_file_stats=files)
52 |
53 |     @app.route("/stats")
54 |     @cache.cached(120)
55 |     def stats_page():
56 |         return render_template("stats.html")
57 |
58 |     @app.route("/stats/json_chart")
59 |     @cache.cached(240)
60 |     def stats_json():
61 |         stats = searchEngine.get_global_stats()
62 |         if stats:
63 |             db.join_website_on_stats(stats)
64 |             return Response(json.dumps(stats), mimetype="application/json")
65 |         return abort(500)
66 |
67 |     @app.route("/website/<int:website_id>/")
68 |     def website_info(website_id):
69 |         website = db.get_website_by_id(website_id)
70 |
71 |         if website:
72 |             return render_template("website.html", website=website)
73 |         else:
74 |             abort(404)
75 |
76 |     @app.route("/website/<int:website_id>/json_chart")
77 |     @cache.memoize(60)
78 |     def website_json_chart(website_id):
79 |         website = db.get_website_by_id(website_id)
80 |
81 |         if website:
82 |             stats = searchEngine.get_stats(website_id)
83 |             stats["base_url"] = website.url
84 |             stats["report_time"] = website.last_modified
85 |             return Response(json.dumps(stats), mimetype="application/json")
86 |         else:
87 |             abort(404)
88 |
89 |     @app.route("/website/<int:website_id>/links")
90 |     def website_links(website_id):
91 |         website = db.get_website_by_id(website_id)
92 |
93 |         if website:
94 |             links = searchEngine.get_link_list(website_id, website.url)
95 |             return Response("\n".join(links), mimetype="text/plain")
96 |         else:
97 |             abort(404)
98 |
99 |     @app.route("/website/")
100 |     def websites():
101 |         page = int(request.args.get("p")) if "p" in request.args else 0
102 |         url = request.args.get("url") if "url" in request.args else ""
103 |         if url:
104 |             parsed_url = urlparse(url)
105 |             if parsed_url.scheme:
106 |                 search_term = (parsed_url.scheme + "://" + parsed_url.netloc)
107 |             else:
108 |                 flash("Sorry, I was not able to parse this url format. "
109 |                       "Make sure you include the appropriate scheme (http/https/ftp)", "warning")
110 |                 search_term = ""
111 |         else:
112 |             search_term = url
113 |
114 |         return render_template("websites.html",
115 |                                websites=db.get_websites(50, page, search_term),
116 |                                p=page, url=search_term, per_page=50)
117 |
118 |     @app.route("/website/random")
119 |     def random_website():
120 |         rand_id = db.get_random_website_id()
121 |         if rand_id:
122 |             return redirect("/website/" + str(rand_id))
123 |         return redirect("/website/")
124 |
125 |     @app.route("/website/<int:website_id>/clear")
126 |     def admin_clear_website(website_id):
127 |         require_role("admin")
128 |
129 |         searchEngine.delete_docs(website_id)
130 |         flash("Cleared all documents associated with this website", "success")
131 |         return redirect("/website/" + str(website_id))
132 |
133 |     @app.route("/website/<int:website_id>/delete")
134 |     def admin_delete_website(website_id):
135 |         require_role("admin")
136 |
137 |         searchEngine.delete_docs(website_id)
138 |         db.delete_website(website_id)
139 |         flash("Deleted website " + str(website_id), "success")
140 |         return redirect("/website/")
141 |
142 |     @app.route("/website/<int:website_id>/rescan")
143 |     def admin_rescan_website(website_id):
144 |         require_role("admin")
145 |         website = db.get_website_by_id(website_id)
146 |
147 |         if website:
148 |             priority = request.args.get("priority") if "priority" in request.args else 1
149 |             task = Task(website_id, website.url, priority)
150 |             taskManager.queue_task(task)
151 |
152 |             flash("Enqueued rescan task", "success")
153 |         else:
154 |             flash("Website does not exist", "danger")
155 |         return redirect("/website/" + str(website_id))
156 |
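The per-website endpoints above return plain JSON and text, so they can be consumed outside the web frontend. Below is a minimal sketch using only the standard library; the host and website id are placeholders, not values taken from this project.

```python
import json
from urllib.request import urlopen

# Placeholder values; substitute a real deployment URL and an existing website id.
BASE_URL = "http://localhost:5000"
WEBSITE_ID = 123

# Corresponds to website_json_chart(): stats computed by the search engine,
# plus the "base_url" and "report_time" keys added by the view.
with urlopen(f"{BASE_URL}/website/{WEBSITE_ID}/json_chart") as resp:
    stats = json.loads(resp.read().decode())
print(stats["base_url"], stats["report_time"])

# Corresponds to website_links(): one file link per line, served as text/plain.
with urlopen(f"{BASE_URL}/website/{WEBSITE_ID}/links") as resp:
    links = resp.read().decode().splitlines()
print(len(links), "links")
```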
157 |     @app.route("/search")
158 |     def search():
159 |         results = 0
160 |         q = request.args.get("q") if "q" in request.args else ""
161 |         sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
162 |
163 |         page = request.args.get("p") if "p" in request.args else "0"
164 |         page = int(page) if page.isdigit() else 0
165 |
166 |         per_page = request.args.get("per_page") if "per_page" in request.args else "50"
167 |         per_page = int(per_page) if per_page.isdigit() else "50"
168 |         per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
169 |
170 |         extensions = request.args.get("ext") if "ext" in request.args else None
171 |         extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else []
172 |
173 |         size_min = request.args.get("size_min") if "size_min" in request.args else "size_min"
174 |         size_min = int(size_min) if size_min.isdigit() else 0
175 |         size_max = request.args.get("size_max") if "size_max" in request.args else "size_max"
176 |         size_max = int(size_max) if size_max.isdigit() else 0
177 |
178 |         date_min = request.args.get("date_min") if "date_min" in request.args else "date_min"
179 |         date_min = int(date_min) if date_min.isdigit() else 0
180 |         date_max = request.args.get("date_max") if "date_max" in request.args else "date_max"
181 |         date_max = int(date_max) if date_max.isdigit() else 0
182 |
183 |         match_all = "all" in request.args
184 |
185 |         field_name = "field_name" in request.args
186 |         field_trigram = "field_trigram" in request.args
187 |         field_path = "field_path" in request.args
188 |
189 |         if not field_name and not field_trigram and not field_path:
190 |             # If no fields are selected, search in all
191 |             field_name = field_path = field_trigram = True
192 |
193 |         fields = []
194 |         if field_path:
195 |             fields.append("path")
196 |         if field_name:
197 |             fields.append("name^5")
198 |         if field_trigram:
199 |             fields.append("name.nGram^2")
200 |
201 |         if len(q) >= 3:
202 |
203 |             blocked = False
204 |             hits = None
205 |             if not config.CAPTCHA_SEARCH or captcha.verify():
206 |
207 |                 try:
208 |                     hits = searchEngine.search(q, page, per_page, sort_order,
209 |                                                extensions, size_min, size_max, match_all, fields, date_min, date_max)
210 |                     hits = db.join_website_on_search_result(hits)
211 |                 except InvalidQueryException as e:
212 |                     flash("Invalid query: " + str(e), "warning")
213 |                     blocked = True
214 |                 except:
215 |                     flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
216 |                           "Please try again later", "danger")
217 |
218 |                 results = (hits["hits"]["total"]["value"] if not isinstance(hits["hits"]["total"], int)
219 |                            else hits["hits"]["total"]) if hits else -1
220 |                 took = hits["took"] if hits else -1
221 |                 forwarded_for = request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None
222 |
223 |                 logger.info("SEARCH '{}' [res={}, t={}, p={}x{}, ext={}] by {}{}"
224 |                             .format(q, results, took, page, per_page, str(extensions),
225 |                                     request.remote_addr, "_" + forwarded_for if forwarded_for else ""))
226 |
227 |                 db.log_search(request.remote_addr, forwarded_for, q, extensions, page, blocked, results, took)
228 |                 if blocked:
229 |                     return redirect("/search")
230 |             else:
231 |                 flash("Error: Invalid captcha, please try again", "danger")
232 |
233 |         else:
234 |             hits = None
235 |
236 |         return render_template("search.html",
237 |                                count=results,
238 |                                results=hits,
239 |                                q=q,
240 |                                p=page, per_page=per_page,
241 |                                sort_order=sort_order,
242 |                                results_set=config.RESULTS_PER_PAGE,
243 |                                extensions=",".join(extensions),
244 |                                size_min=size_min, size_max=size_max,
245 |                                match_all=match_all,
246 |                                field_trigram=field_trigram, field_path=field_path, field_name=field_name,
247 |                                date_min=date_min, date_max=date_max,
248 |                                show_captcha=config.CAPTCHA_SEARCH, captcha=captcha)
249 |
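The search() view above is driven entirely by query-string parameters (q, p, per_page, sort_order, ext, size_min/size_max, date_min/date_max and the field_* flags). A sketch of how a search URL could be assembled programmatically is shown below; the host is a placeholder, and when CAPTCHA_SEARCH is enabled the captcha fields would also have to be supplied.

```python
from urllib.parse import urlencode

# Placeholder host; the parameter names mirror what search() reads from request.args.
BASE_URL = "http://localhost:5000"

params = {
    "q": "ubuntu iso",   # must be at least 3 characters, otherwise no search is performed
    "p": 0,              # page number
    "per_page": 50,      # must be one of config.RESULTS_PER_PAGE, otherwise falls back to 50
    "sort_order": "score",
    "ext": "iso,img",    # comma-separated extensions; dots are stripped and case is lowered
    "size_min": 0,
    "size_max": 0,
    "field_name": 1,     # presence flag: restrict matching to the file name field ("name^5")
}

print(BASE_URL + "/search?" + urlencode(params))
```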
250 |     @app.route("/contribute")
251 |     @cache.cached(600)
252 |     def contribute():
253 |         return render_template("contribute.html")
254 |
255 |     @app.route("/")
256 |     def home():
257 |         try:
258 |             stats = searchEngine.get_global_stats()
259 |             stats["website_count"] = len(db.get_all_websites())
260 |         except:
261 |             stats = {}
262 |         return render_template("home.html", stats=stats,
263 |                                show_captcha=config.CAPTCHA_SEARCH, captcha=captcha)
264 |
265 |     @app.route("/submit")
266 |     def submit():
267 |         return render_template("submit.html", captcha=captcha, show_captcha=config.CAPTCHA_SUBMIT)
268 |
269 |     def try_enqueue(url):
270 |         url = os.path.join(url, "")
271 |         url = od_util.get_top_directory(url)
272 |
273 |         if not od_util.is_valid_url(url):
274 |             return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning"
275 |
276 |         website = db.get_website_by_url(url)
277 |         if website:
278 |             return "Website already exists", "danger"
279 |
280 |         website = db.website_exists(url)
281 |         if website:
282 |             return "A parent directory of this url has already been posted", "danger"
283 |
284 |         if db.is_blacklisted(url):
285 |             return "Error: " \
286 |                    "Sorry, this website has been blacklisted. If you think " \
287 |                    "this is an error, please contact me.", "danger"
288 |
289 |         if not od_util.is_od(url):
290 |             return "Error: " \
291 |                    "The anti-spam algorithm determined that the submitted url is not " \
292 |                    "an open directory or the server is not responding. If you think " \
293 |                    "this is an error, please contact me.", "danger"
294 |
295 |         website_id = db.insert_website(Website(url, str(request.remote_addr + "_" +
296 |                                                         request.headers.get("X-Forwarded-For", "")),
297 |                                                request.user_agent))
298 |
299 |         task = Task(website_id, url, priority=1)
300 |         taskManager.queue_task(task)
301 |
302 |         return "The website has been added to the queue", "success"
303 |
304 |     @app.route("/enqueue", methods=["POST"])
305 |     def enqueue():
306 |         if not config.CAPTCHA_SUBMIT or captcha.verify():
307 |
308 |             url = os.path.join(request.form.get("url"), "")
309 |             message, msg_type = try_enqueue(url)
310 |             flash(message, msg_type)
311 |
312 |             return redirect("/submit")
313 |
314 |         else:
315 |             flash("Error: Invalid captcha, please try again", "danger")
316 |             return redirect("/submit")
317 |
318 |     def check_url(url):
319 |         url = os.path.join(url, "")
320 |         try_enqueue(url)
321 |         return None
322 |
323 |     @app.route("/enqueue_bulk", methods=["POST"])
324 |     def enqueue_bulk():
325 |         if not config.CAPTCHA_SUBMIT or captcha.verify():
326 |
327 |             urls = request.form.get("urls")
328 |             if urls:
329 |                 urls = urls.split()
330 |
331 |                 if 0 < len(urls) <= 1000:  # TODO: Load from config & adjust placeholder/messages?
332 |
333 |                     pool = Pool(processes=6)
334 |                     pool.map(func=check_url, iterable=urls)
335 |                     pool.close()
336 |
337 |                     flash("Submitted websites to the queue", "success")
338 |
339 |                     return redirect("/submit")
340 |
341 |                 else:
342 |                     flash("Too few or too many urls, please submit 1-1000 urls", "danger")
343 |                     return redirect("/submit")
344 |             else:
345 |                 flash("Too few or too many urls, please submit 1-1000 urls", "danger")
346 |                 return redirect("/submit")
347 |         else:
348 |             flash("Error: Invalid captcha, please try again", "danger")
349 |             return redirect("/submit")
350 |
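Both submission endpoints above are plain form POSTs. A minimal sketch of a bulk submission follows, assuming CAPTCHA_SUBMIT is disabled on the target instance (otherwise the captcha fields would have to be included); the host and URLs are placeholders.

```python
from urllib.parse import urlencode
from urllib.request import urlopen

# Placeholder values; enqueue_bulk() splits the "urls" form field on whitespace
# and accepts between 1 and 1000 entries per request.
BASE_URL = "http://localhost:5000"
urls = [
    "http://example.com/pub/",
    "ftp://example.org/mirror/",
]

data = urlencode({"urls": "\n".join(urls)}).encode()
with urlopen(BASE_URL + "/enqueue_bulk", data=data) as resp:
    # The view redirects back to /submit and reports the outcome via flash messages.
    print(resp.status, resp.url)
```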
351 |     @app.route("/admin")
352 |     def admin_login_form():
353 |         if "username" in session:
354 |             return redirect("/dashboard")
355 |         return render_template("admin.html", captcha=captcha, show_captcha=config.CAPTCHA_LOGIN)
356 |
357 |     @app.route("/login", methods=["POST"])
358 |     def admin_login():
359 |         if not config.CAPTCHA_LOGIN or captcha.verify():
360 |
361 |             username = request.form.get("username")
362 |             password = request.form.get("password")
363 |
364 |             if db.check_login(username, password):
365 |                 session["username"] = username
366 |                 flash("Logged in", "success")
367 |                 return redirect("/dashboard")
368 |
369 |             flash("Invalid username/password combo", "danger")
370 |             return redirect("/admin")
371 |
372 |         else:
373 |             flash("Invalid captcha", "danger")
374 |             return redirect("/admin")
375 |
376 |     @app.route("/logout")
377 |     def admin_logout():
378 |         session.clear()
379 |         flash("Logged out", "info")
380 |         return redirect("/")
381 |
382 |     @app.route("/dashboard")
383 |     def admin_dashboard():
384 |         require_role("admin")
385 |         tokens = db.get_tokens()
386 |         blacklist = db.get_blacklist()
387 |
388 |         return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist)
389 |
390 |     @app.route("/blacklist/add", methods=["POST"])
391 |     def admin_blacklist_add():
392 |         require_role("admin")
393 |         url = request.form.get("url")
394 |         db.add_blacklist_website(url)
395 |         flash("Added item to blacklist", "success")
396 |         return redirect("/dashboard")
397 |
398 |     @app.route("/blacklist/<int:blacklist_id>/delete")
399 |     def admin_blacklist_remove(blacklist_id):
400 |         require_role("admin")
401 |         db.remove_blacklist_website(blacklist_id)
402 |         flash("Removed blacklist item", "success")
403 |         return redirect("/dashboard")
404 |
405 |     @app.route("/generate_token", methods=["POST"])
406 |     def admin_generate_token():
407 |         require_role("admin")
408 |         description = request.form.get("description")
409 |
410 |         db.generate_api_token(description)
411 |         flash("Generated API token", "success")
412 |
413 |         return redirect("/dashboard")
414 |
415 |     @app.route("/del_token", methods=["POST"])
416 |     def admin_del_token():
417 |         require_role("admin")
418 |         token = request.form.get("token")
419 |
420 |         db.delete_token(token)
421 |         flash("Deleted API token", "success")
422 |         return redirect("/dashboard")
423 |
--------------------------------------------------------------------------------
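require_role() is imported from common.py, which is not part of this excerpt. The admin views above call it for its side effect only (no return value is checked), so it presumably aborts unauthorized requests itself. Purely as an illustration of that contract, and not the project's actual implementation, it could be approximated as follows; the behaviour described in the comments is an assumption.

```python
# Hypothetical approximation of common.require_role(); the real code lives in
# common.py and may differ substantially.
from flask import abort, session

def require_role(role: str):
    # Assumption: admin_login() stores the username in the session and every
    # logged-in user is treated as an "admin"; anything else gets 403 Forbidden.
    if role == "admin" and "username" not in session:
        abort(403)
```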