├── ,gitattributes
├── .gitignore
├── .gitmodules
├── Dockerfile
├── LICENSE
├── README.md
├── __init__.py
├── api.py
├── app.py
├── captcha.py
├── captchas
└── .gitkeep
├── common.py
├── config.py
├── database.py
├── do_recrawl.py
├── docker-compose.yml
├── export.py
├── high_level_diagram.dia
├── high_level_diagram.png
├── init_script.sql
├── main.py
├── mass_import.py
├── od_util.py
├── reddit_bot.py
├── requirements.txt
├── search
├── __init__.py
├── filter.py
└── search.py
├── static
├── Hack-Regular.ttf
├── css
│ ├── bootstrap.min.css
│ ├── fa-brands.css
│ ├── fa-brands.min.css
│ ├── fa-regular.css
│ ├── fa-regular.min.css
│ ├── fa-solid.css
│ ├── fa-solid.min.css
│ ├── fontawesome-all.css
│ ├── fontawesome-all.min.css
│ ├── fontawesome.css
│ ├── fontawesome.min.css
│ ├── ion.rangeSlider.css
│ ├── ion.rangeSlider.skinFlat.css
│ ├── main.css
│ └── style.css
├── downloads
│ └── README.md
├── img
│ ├── bg.png
│ ├── forkme_right_white_ffffff.png
│ └── sprite-skin-flat.png
├── js
│ ├── Chart.min.js
│ ├── bootstrap.min.js
│ ├── ion.rangeSlider.min.js
│ ├── jquery.min.js
│ ├── popper.min.js
│ ├── report.js
│ └── script.js
└── webfonts
│ ├── fa-brands-400.eot
│ ├── fa-brands-400.svg
│ ├── fa-brands-400.ttf
│ ├── fa-brands-400.woff
│ ├── fa-brands-400.woff2
│ ├── fa-regular-400.eot
│ ├── fa-regular-400.svg
│ ├── fa-regular-400.ttf
│ ├── fa-regular-400.woff
│ ├── fa-regular-400.woff2
│ ├── fa-solid-900.eot
│ ├── fa-solid-900.svg
│ ├── fa-solid-900.ttf
│ ├── fa-solid-900.woff
│ └── fa-solid-900.woff2
├── tasks.py
├── template_filters.py
├── templates
├── admin.html
├── contribute.html
├── dashboard.html
├── downloads.html
├── home.html
├── layout.html
├── search.html
├── stats.html
├── submit.html
├── website.html
└── websites.html
├── tt_config.yml
├── uwsgi.ini
└── views.py
/,gitattributes:
--------------------------------------------------------------------------------
1 | static/css/* linguist-vendored
2 | static/css/main.css linguist-vendored=false
3 | static/js/* linguist-vendored
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | /static/downloads/
3 | !/static/downloads/README.md
4 | __pycache__/
5 | captchas/
6 | _stats.json
7 | oddb.log
8 | praw.ini
9 | env/
10 | worker.json
11 | search_blacklist.txt
12 | *.iml
13 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "fold_to_ascii"]
2 | path = fold_to_ascii
3 | url = https://github.com/spanishdict/fold_to_ascii
4 | [submodule "task_tracker_drone"]
5 | path = task_tracker_drone
6 | url = https://github.com/simon987/task_tracker_drone
7 | [submodule "ws_bucket_client"]
8 | path = ws_bucket_client
9 | url = https://github.com/simon987/ws_bucket_client
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | WORKDIR /app
4 |
5 | ADD requirements.txt /app/requirements.txt
6 | RUN pip install -r requirements.txt
7 |
8 | ENTRYPOINT ["python", "app.py"]
9 |
10 | COPY . /app
11 |
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Simon Fortier
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OD-Database
2 |
3 | OD-Database is a web-crawling project that aims to index a very large number of file links and their basic metadata from open directories (misconfigured Apache/Nginx/FTP servers, or more often, mirrors of various public services).
4 |
5 | Each crawler instance fetches tasks from the central server and pushes the result once completed. A single instance can crawl hundreds of websites at the same time (Both FTP and HTTP(S)) and the central server is capable of ingesting thousands of new documents per second.
6 |
7 | The data is indexed into elasticsearch and made available via the web frontend (Currently hosted at https://od-db.the-eye.eu/). There is currently ~1.93 billion files indexed (total of about 300Gb of raw data). The raw data is made available as a CSV file [here](https://od-db.the-eye.eu/dl).
8 |
9 | 
10 |
11 |
12 | ### Contributing
13 | Suggestions/concerns/PRs are welcome
14 |
15 | ## Installation (Docker)
16 | ```bash
17 | git clone --recursive https://github.com/simon987/od-database
18 | cd od-database
19 | mkdir oddb_pg_data/ tt_pg_data/ es_data/ wsb_data/
20 | docker-compose up
21 | ```
22 |
23 | ## Architecture
24 |
25 | 
26 |
27 | ## Running the crawl server
28 | The python crawler that was a part of this project is discontinued,
29 | [the go implementation](https://github.com/terorie/od-database-crawler) is currently in use.
30 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
1 | import json
2 | from uuid import uuid4
3 |
4 | from flask import request, abort, send_file, session
5 |
6 | import captcha
7 | import common as oddb
8 | from common import taskManager
9 | from database import Website
10 | from search.search import InvalidQueryException
11 |
12 |
13 | def setup_api(app):
14 | taskManager.start_indexer_threads()
15 |
16 | @app.route("/api/website/by_url", methods=["GET"])
17 | def api_website_by_url():
18 | token = request.args.get("token")
19 | name = oddb.db.check_api_token(token)
20 |
21 | if name:
22 | url = request.args.get("url")
23 | website = oddb.db.get_website_by_url(url)
24 | oddb.logger.info("API get website by url '" + url + "' by " + name)
25 | if website:
26 | return str(website.id)
27 | return abort(404)
28 | else:
29 | return abort(403)
30 |
31 | @app.route("/api/website/blacklisted", methods=["GET"])
32 | def api_website_is_blacklisted():
33 | token = request.args.get("token")
34 | url = request.args.get("url")
35 | name = oddb.db.check_api_token(token)
36 |
37 | if name:
38 | oddb.logger.info("API get website is blacklisted '" + url + "' by " + name)
39 | return str(oddb.db.is_blacklisted(url))
40 | else:
41 | return abort(403)
42 |
43 | @app.route("/api/website/add", methods=["GET"])
44 | def api_add_website():
45 | token = request.args.get("token")
46 | url = request.args.get("url")
47 |
48 | name = oddb.db.check_api_token(token)
49 | if name:
50 |
51 | website_id = oddb.db.insert_website(Website(url, str(request.remote_addr + "_" +
52 | request.headers.get("X-Forwarded-For", "")),
53 | "API_CLIENT_" + name))
54 | oddb.logger.info("API add website '" + url + "' by " + name + "(" + str(website_id) + ")")
55 | return str(website_id)
56 | else:
57 | return abort(403)
58 |
59 | @app.route("/api/website/random")
60 | def api_random_website():
61 | token = request.json["token"]
62 | name = oddb.db.check_api_token(token)
63 |
64 | if name:
65 | oddb.logger.info("API get random website by " + name)
66 | return str(oddb.db.get_random_website_id())
67 | else:
68 | return abort(403)
69 |
70 | @app.route("/api/search", methods=["POST"])
71 | def api_search():
72 | token = request.json["token"]
73 | name = oddb.db.check_api_token(token)
74 |
75 | if name:
76 |
77 | try:
78 | hits = oddb.searchEngine.search(
79 | request.json["query"],
80 | request.json["page"], request.json["per_page"],
81 | request.json["sort_order"],
82 | request.json["extensions"],
83 | request.json["size_min"], request.json["size_max"],
84 | request.json["match_all"],
85 | request.json["fields"],
86 | request.json["date_min"], request.json["date_max"]
87 | )
88 |
89 | hits = oddb.db.join_website_on_search_result(hits)
90 | oddb.logger.info("API search '" + request.json["query"] + "' by " + name)
91 | return json.dumps(hits)
92 |
93 | except InvalidQueryException as e:
94 | oddb.logger.info("API search failed: " + str(e))
95 | return str(e)
96 | else:
97 | return abort(403)
98 |
99 | @app.route("/cap", methods=["GET"])
100 | def cap():
101 | word = captcha.make_captcha()
102 | cap_id = uuid4().__str__()
103 | session["cap"] = cap_id
104 |
105 | oddb.redis.set(cap_id, word)
106 |
107 | return send_file(captcha.get_path(word), cache_timeout=0)
108 |
109 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 | import api
4 | import common
5 | import config
6 | import template_filters
7 | import views
8 | import os
9 |
10 | app = Flask(__name__)
11 | app.secret_key = config.FLASK_SECRET
12 | template_filters.setup_template_filters(app)
13 |
14 | views.setup_views(app)
15 | api.setup_api(app)
16 |
17 |
18 | if os.environ.get("ODDB_USER", False) and os.environ.get("ODDB_PASSWORD", False):
19 | user = os.environ["ODDB_USER"]
20 | password = os.environ["ODDB_PASSWORD"]
21 | try:
22 | common.db.generate_login(user, password)
23 | print("Generated user %s" % user)
24 | except:
25 | pass
26 |
27 | if __name__ == '__main__':
28 | app.run("0.0.0.0", port=80, threaded=True)
29 |
--------------------------------------------------------------------------------
/captcha.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 |
4 | from PIL import Image, ImageDraw, ImageFont
5 | from flask import request, session
6 |
7 | import common as oddb
8 | import config
9 |
10 |
11 | def get_code():
12 |
13 | if "cap_remaining" in session and session["cap_remaining"] > 0:
14 | return """
15 | You will not be asked to complete a captcha for the next {} pages
16 | """.format(session["cap_remaining"])
17 |
18 | return """
19 |
23 | """
24 |
25 |
26 | def get_path(word):
27 | return "captchas/{}.png".format(word)
28 |
29 |
30 | def verify():
31 | if "cap_remaining" in session and session["cap_remaining"] > 0:
32 | session["cap_remaining"] -= 1
33 | return True
34 |
35 | attempt = request.form.get("cap") if "cap" in request.form else (
36 | request.args.get("cap") if "cap" in request.args else ""
37 | )
38 |
39 | if "cap" in session:
40 | expected = oddb.redis.get(session["cap"])
41 | expected = expected.decode("utf8") if expected is not None else ""
42 | oddb.redis.delete(session["cap"])
43 |
44 | if expected == attempt:
45 | session["cap_remaining"] = config.CAPTCHA_EVERY
46 | return True
47 |
48 | return False
49 |
50 |
51 | cfg = {
52 | "image": {
53 | "size": (200, 72),
54 | "supersampling": 2
55 | },
56 | "noise": {
57 | "min": 100,
58 | "max": 250
59 | },
60 | "colors": {
61 | "green": [(1, 51, 1), (34, 204, 34)],
62 | "yellow": [(67, 67, 1), (221, 221, 0)],
63 | "cyan": [(17, 51, 85), (85, 187, 254)],
64 | "magenta": [(51, 1, 51), (254, 0, 254)],
65 | "red": [(67, 1, 1), (254, 68, 68)],
66 | "orange": [(68, 51, 1), (255, 153, 0)]
67 | },
68 | "lines": {
69 | "back_thin": {"n": 3, "w": 5},
70 | "back_thick": {"n": 3, "w": 6},
71 | "back_positions": [
72 | {
73 | "ax": (0, 10),
74 | "ay": (0, 36),
75 | "bx": (150, 200),
76 | "by": (18, 50)
77 | },
78 | {
79 | "ax": (0, 10),
80 | "ay": (18, 50),
81 | "bx": (150, 200),
82 | "by": (0, 17)
83 | }
84 | ],
85 | "front_horizontal_thin": {"n": 2, "w": 3},
86 | "front_horizontal_thick": {"n": 2, "w": 4},
87 | "front_horizontal_positions": [
88 | {
89 | "ax": (0, 20),
90 | "ay": (0, 34),
91 | "bx": (150, 200),
92 | "by": (18, 50)
93 | },
94 | {
95 | "ax": (0, 20),
96 | "ay": (18, 72),
97 | "bx": (140, 200),
98 | "by": (0, 36)
99 | },
100 | ],
101 | "front_vertical": {"n": 2, "w": 4},
102 | "front_vertical_positions": {
103 | "outside": 5,
104 | "font_width": 13,
105 | "ay": (0, 16),
106 | "by": (54, 72)
107 | }
108 | },
109 | "text": {
110 | "font": {
111 | "path": "static/Hack-Regular.ttf",
112 | "size": 60,
113 | "outline": [1, 2]
114 | },
115 | "letters": {
116 | "3": {
117 | "count": 3,
118 | "x_min": 35,
119 | "x_max": 50,
120 | "y_min": -5,
121 | "y_max": 8
122 | },
123 | "4": {
124 | "count": 4,
125 | "x_min": 20,
126 | "x_max": 35,
127 | "y_min": -5,
128 | "y_max": 8
129 | },
130 | "5": {
131 | "count": 5,
132 | "x_min": 5,
133 | "x_max": 20,
134 | "y_min": -5,
135 | "y_max": 8
136 | }
137 | }
138 | }
139 | }
140 |
141 | size = cfg["image"]["size"]
142 | c = cfg["image"]["supersampling"]
143 |
144 | # Additional config
145 | letter_count = "4"
146 |
147 |
148 | def horizontal_lines(draw, c, line_par, line_pos, fill):
149 | for _ in range(line_par["n"]):
150 | pos = random.randrange(0, len(line_pos))
151 | ax = random.randint(*line_pos[pos]["ax"])
152 | ay = random.randint(*line_pos[pos]["ay"])
153 | bx = random.randint(*line_pos[pos]["bx"])
154 | by = random.randint(*line_pos[pos]["by"])
155 | draw.line([(ax*c, ay*c), (bx*c, by*c)], width=line_par["w"]*c, fill=fill)
156 |
157 |
158 | def make_captcha():
159 |
160 | color_name, color = random.choice(list(cfg["colors"].items()))
161 | text = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(cfg["text"]["letters"][letter_count]["count"]))
162 |
163 | path = get_path(text)
164 |
165 | w = size[0]*c
166 | h = size[1]*c
167 |
168 | img = Image.new('RGB', (w, h))
169 | pixels = img.load()
170 |
171 | # noise
172 | for x in range(w):
173 | for y in range(h):
174 | rcol = random.randint(cfg["noise"]["min"], cfg["noise"]["max"])
175 | pixels[x, y] = (rcol, rcol, rcol)
176 |
177 | # background lines
178 | draw = ImageDraw.Draw(img)
179 |
180 | horizontal_lines(draw, c, cfg["lines"]["back_thin"], cfg["lines"]["back_positions"], color[0])
181 | horizontal_lines(draw, c, cfg["lines"]["back_thick"], cfg["lines"]["back_positions"], color[0])
182 |
183 | # text
184 | ctx = cfg["text"]["font"]
185 | font = ImageFont.truetype(ctx["path"], ctx["size"]*c)
186 | outline = random.choice(ctx["outline"])
187 |
188 | ctx = cfg["text"]["letters"][letter_count]
189 | x = random.randint(ctx["x_min"], ctx["x_max"])
190 | y = random.randint(ctx["y_min"], ctx["y_max"])
191 | draw.text((x*c-outline*c, y*c-outline*c), text, color[0], font=font)
192 | draw.text((x*c-outline*c, y*c), text, color[0], font=font)
193 | draw.text((x*c-outline*c, y*c+outline*c), text, color[0], font=font)
194 | draw.text((x*c, y*c-outline*c), text, color[0], font=font)
195 | draw.text((x*c, y*c+outline*c), text, color[0], font=font)
196 | draw.text((x*c+outline*c, y*c-outline*c), text, color[0], font=font)
197 | draw.text((x*c+outline*c, y*c), text, color[0], font=font)
198 | draw.text((x*c+outline*c, y*c+outline*c), text, color[0], font=font)
199 | draw.text((x*c, y*c), text, color[1], font=font)
200 |
201 | # foreground lines
202 | horizontal_lines(draw, c, cfg["lines"]["front_horizontal_thin"], cfg["lines"]["front_horizontal_positions"], color[1])
203 | horizontal_lines(draw, c, cfg["lines"]["front_horizontal_thick"], cfg["lines"]["front_horizontal_positions"], color[1])
204 |
205 | # vertical lines
206 | line_par = cfg["lines"]["front_vertical"]
207 | line_pos = cfg["lines"]["front_vertical_positions"]
208 |
209 | for _ in range(line_par["n"]):
210 | ax = random.randint(x-line_pos["outside"], x+line_pos["outside"] + cfg["text"]["letters"][letter_count]["count"]*line_pos["font_width"])
211 | bx = ax + random.randint(-line_pos["font_width"], line_pos["font_width"])
212 | ay = random.randint(*line_pos["ay"])
213 | by = random.randint(*line_pos["by"])
214 | draw.line([(ax*c, ay*c), (bx*c, by*c)], width=line_par["w"]*c, fill=color[1])
215 |
216 | img.thumbnail(cfg["image"]["size"], Image.ANTIALIAS)
217 | img.save(path, "png")
218 |
219 | return text
220 |
221 |
222 | if __name__ == "__main__":
223 | make_captcha()
224 |
--------------------------------------------------------------------------------
/captchas/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/captchas/.gitkeep
--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from logging import FileHandler, StreamHandler
4 |
5 | import redis as r
6 | from flask import session, abort
7 |
8 | import config
9 | from database import Database
10 | from search.search import ElasticSearchEngine
11 | from tasks import TaskManager
12 |
13 | # Disable flask logging
14 | flaskLogger = logging.getLogger('werkzeug')
15 | flaskLogger.setLevel(logging.ERROR)
16 |
17 | logger = logging.getLogger("default")
18 | logger.setLevel(logging.DEBUG)
19 |
20 | formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
21 | file_handler = FileHandler("oddb.log")
22 | file_handler.setFormatter(formatter)
23 | for h in logger.handlers:
24 | logger.removeHandler(h)
25 | logger.addHandler(file_handler)
26 | logger.addHandler(StreamHandler(sys.stdout))
27 |
28 | taskManager = TaskManager()
29 | searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
30 | searchEngine.start_stats_scheduler()
31 | db = Database(config.DB_CONN_STR)
32 |
33 | redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)
34 |
35 |
36 | def require_role(role: str):
37 | if db.get_user_role(session.get("username", None)) != role:
38 | abort(403)
39 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | from os import environ
2 |
3 | CAPTCHA_LOGIN = bool(environ.get("CAPTCHA_LOGIN", False))
4 | CAPTCHA_SUBMIT = bool(environ.get("CAPTCHA_SUBMIT", False))
5 | CAPTCHA_SEARCH = bool(environ.get("CAPTCHA_SEARCH", False))
6 | CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10))
7 |
8 | FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret")
9 | RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000)
10 |
11 | SUBMIT_FTP = bool(environ.get("SUBMIT_FTP", False))
12 | SUBMIT_HTTP = bool(environ.get("SUBMIT_HTTP", True))
13 |
14 | TT_API = environ.get("TT_API", "http://localhost:3010")
15 | TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3))
16 | TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9))
17 |
18 | WSB_API = environ.get("WSB_API", "http://localhost:3020")
19 | WSB_SECRET = environ.get("WSB_SECRET", "default_secret")
20 |
21 | ES_URL = environ.get("ES_URL", "http://localhost:9200")
22 | ES_INDEX = environ.get("ES_INDEX", "od-database")
23 |
24 | REDIS_HOST = environ.get("REDIS_HOST", "localhost")
25 | REDIS_PORT = environ.get("REDIS_PORT", 6379)
26 |
27 | DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database")
28 | RECRAWL_POOL_SIZE = environ.get("RECRAWL_POOL_SIZE", 10000)
29 | INDEXER_THREADS = int(environ.get("INDEXER_THREAD", 3))
30 |
--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import uuid
4 | from urllib.parse import urlparse, urljoin
5 |
6 | import bcrypt
7 | import psycopg2
8 |
9 |
10 | class BlacklistedWebsite:
11 | def __init__(self, blacklist_id, url):
12 | self.id = blacklist_id
13 | self.netloc = url
14 |
15 |
16 | class Website:
17 |
18 | def __init__(self, url, logged_ip, logged_useragent, last_modified=None, website_id=None):
19 | self.url = url
20 | self.logged_ip = logged_ip
21 | self.logged_useragent = logged_useragent
22 | self.last_modified = last_modified
23 | self.id = website_id
24 |
25 |
26 | class ApiClient:
27 |
28 | def __init__(self, token, name):
29 | self.token = token
30 | self.name = name
31 |
32 |
33 | class Database:
34 |
35 | def __init__(self, db_conn_str):
36 | self.db_conn_str = db_conn_str
37 | self.website_cache = dict()
38 | self.website_cache_time = 0
39 |
40 | with psycopg2.connect(self.db_conn_str) as conn:
41 | cursor = conn.cursor()
42 | cursor.execute("SELECT EXISTS (SELECT 1 FROM pg_tables "
43 | "WHERE tablename = 'searchlogentry')")
44 |
45 | if not cursor.fetchone()[0]:
46 | self.init_database()
47 |
48 | def init_database(self):
49 |
50 | print("Initializing database")
51 |
52 | with open("init_script.sql", "r") as f:
53 | init_script = f.read()
54 |
55 | with psycopg2.connect(self.db_conn_str) as conn:
56 | cur = conn.cursor()
57 | cur.execute(init_script)
58 |
59 | def update_website_date_if_exists(self, website_id):
60 |
61 | with psycopg2.connect(self.db_conn_str) as conn:
62 | cursor = conn.cursor()
63 | cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=%s", (website_id,))
64 | conn.commit()
65 |
66 | def insert_website(self, website: Website):
67 |
68 | with psycopg2.connect(self.db_conn_str) as conn:
69 | cursor = conn.cursor()
70 | cursor.execute("INSERT INTO Website (url, logged_ip, logged_useragent) VALUES (%s,%s,%s) RETURNING id",
71 | (website.url, str(website.logged_ip), str(website.logged_useragent)))
72 |
73 | website_id = cursor.fetchone()[0]
74 | conn.commit()
75 |
76 | return website_id
77 |
78 | def get_website_by_url(self, url):
79 |
80 | with psycopg2.connect(self.db_conn_str) as conn:
81 | cursor = conn.cursor()
82 |
83 | cursor.execute("SELECT id, url, logged_ip, logged_useragent, last_modified FROM Website WHERE url=%s",
84 | (url,))
85 | db_web = cursor.fetchone()
86 | if db_web:
87 | website = Website(db_web[1], db_web[2], db_web[3], db_web[4], str(db_web[0]))
88 | return website
89 | else:
90 | return None
91 |
92 | def get_website_by_id(self, website_id):
93 |
94 | with psycopg2.connect(self.db_conn_str) as conn:
95 | cursor = conn.cursor()
96 |
97 | cursor.execute("SELECT * FROM Website WHERE id=%s", (website_id,))
98 | db_web = cursor.fetchone()
99 |
100 | if db_web:
101 | website = Website(db_web[1], db_web[2], db_web[3], str(db_web[4]))
102 | website.id = db_web[0]
103 | return website
104 | else:
105 | return None
106 |
107 | def get_websites(self, per_page, page: int, url):
108 | """Get all websites"""
109 | with psycopg2.connect(self.db_conn_str) as conn:
110 | cursor = conn.cursor()
111 |
112 | cursor.execute("SELECT Website.id, Website.url, Website.last_modified FROM Website "
113 | "WHERE Website.url LIKE %s "
114 | "ORDER BY last_modified DESC LIMIT %s OFFSET %s", (url + "%", per_page, page * per_page))
115 |
116 | return cursor.fetchall()
117 |
118 | def get_random_website_id(self):
119 |
120 | with psycopg2.connect(self.db_conn_str) as conn:
121 | cursor = conn.cursor()
122 | cursor.execute("SELECT id FROM Website ORDER BY random() LIMIT 1")
123 |
124 | row = cursor.fetchone()
125 | if row:
126 | return row[0]
127 | return None
128 |
129 | def website_exists(self, url):
130 | """Check if an url or the parent directory of an url already exists"""
131 | with psycopg2.connect(self.db_conn_str) as conn:
132 | cursor = conn.cursor()
133 |
134 | cursor.execute("SELECT id FROM Website WHERE url = substr(%s, 0, length(url) + 1)", (url,))
135 | website_id = cursor.fetchone()
136 | return website_id[0] if website_id else None
137 |
138 | def delete_website(self, website_id):
139 |
140 | with psycopg2.connect(self.db_conn_str) as conn:
141 | cursor = conn.cursor()
142 |
143 | cursor.execute("DELETE FROM Website WHERE id=%s", (website_id,))
144 | conn.commit()
145 |
146 | def check_login(self, username, password) -> bool:
147 | with psycopg2.connect(self.db_conn_str) as conn:
148 | cursor = conn.cursor()
149 |
150 | cursor.execute("SELECT password FROM Admin WHERE username=%s", (username,))
151 |
152 | db_user = cursor.fetchone()
153 |
154 | if db_user:
155 | return bcrypt.checkpw(password.encode(), db_user[0].tobytes())
156 | return False
157 |
158 | def get_user_role(self, username: str):
159 | with psycopg2.connect(self.db_conn_str) as conn:
160 | cursor = conn.cursor()
161 |
162 | cursor.execute("SELECT role FROM Admin WHERE username=%s", (username,))
163 |
164 | db_user = cursor.fetchone()
165 |
166 | if db_user:
167 | return db_user[0]
168 | return False
169 |
170 | def generate_login(self, username, password) -> None:
171 |
172 | with psycopg2.connect(self.db_conn_str) as conn:
173 | cursor = conn.cursor()
174 |
175 | hashed_pw = bcrypt.hashpw(password.encode(), bcrypt.gensalt(12))
176 |
177 | cursor.execute("INSERT INTO Admin (username, password, role) VALUES (%s,%s, 'admin')",
178 | (username, hashed_pw))
179 | conn.commit()
180 |
181 | def check_api_token(self, token) -> str:
182 |
183 | with psycopg2.connect(self.db_conn_str) as conn:
184 | cursor = conn.cursor()
185 |
186 | cursor.execute("SELECT name FROM ApiClient WHERE token=%s", (token,))
187 | result = cursor.fetchone()
188 | return result[0] if result else None
189 |
190 | def generate_api_token(self, name: str) -> str:
191 |
192 | with psycopg2.connect(self.db_conn_str) as conn:
193 | cursor = conn.cursor()
194 |
195 | token = str(uuid.uuid4())
196 | cursor.execute("INSERT INTO ApiClient (token, name) VALUES (%s, %s)", (token, name))
197 | conn.commit()
198 |
199 | return token
200 |
201 | def get_tokens(self) -> list:
202 |
203 | with psycopg2.connect(self.db_conn_str) as conn:
204 | cursor = conn.cursor()
205 |
206 | cursor.execute("SELECT token, name FROM ApiClient")
207 |
208 | return [ApiClient(x[0], x[1]) for x in cursor.fetchall()]
209 |
210 | def delete_token(self, token: str) -> None:
211 |
212 | with psycopg2.connect(self.db_conn_str) as conn:
213 | cursor = conn.cursor()
214 |
215 | cursor.execute("DELETE FROM ApiClient WHERE token=%s", (token,))
216 | conn.commit()
217 |
218 | def get_all_websites(self) -> dict:
219 | if self.website_cache_time + 120 < time.time():
220 | with psycopg2.connect(self.db_conn_str) as conn:
221 | cursor = conn.cursor()
222 |
223 | cursor.execute("SELECT id, url FROM Website")
224 |
225 | result = dict()
226 |
227 | for db_website in cursor.fetchall():
228 | result[db_website[0]] = db_website[1]
229 |
230 | self.website_cache = result
231 | self.website_cache_time = time.time()
232 |
233 | return self.website_cache
234 |
235 | def join_website_on_search_result(self, page: dict) -> dict:
236 |
237 | websites = self.get_all_websites()
238 |
239 | for hit in page["hits"]["hits"]:
240 | if hit["_source"]["website_id"] in websites:
241 | hit["_source"]["website_url"] = urljoin(websites[hit["_source"]["website_id"]], "/")
242 | else:
243 | hit["_source"]["website_url"] = "[DELETED]"
244 |
245 | return page
246 |
247 | def join_website_url(self, docs):
248 |
249 | websites = self.get_all_websites()
250 |
251 | for doc in docs:
252 | if doc["_source"]["website_id"] in websites:
253 | doc["_source"]["website_url"] = urljoin(websites[doc["_source"]["website_id"]], "/")
254 | else:
255 | doc["_source"]["website_url"] = "[DELETED]"
256 |
257 | yield doc
258 |
259 | def join_website_on_stats(self, stats):
260 |
261 | websites = self.get_all_websites()
262 |
263 | for website in stats["website_scatter"]:
264 | website[0] = websites.get(website[0], "[DELETED]")
265 |
266 | def add_blacklist_website(self, url):
267 |
268 | with psycopg2.connect(self.db_conn_str) as conn:
269 | cursor = conn.cursor()
270 | parsed_url = urlparse(url)
271 | url = parsed_url.scheme + "://" + parsed_url.netloc
272 | cursor.execute("INSERT INTO BlacklistedWebsite (url) VALUES (%s)", (url,))
273 | conn.commit()
274 |
275 | def remove_blacklist_website(self, blacklist_id):
276 |
277 | with psycopg2.connect(self.db_conn_str) as conn:
278 | cursor = conn.cursor()
279 |
280 | cursor.execute("DELETE FROM BlacklistedWebsite WHERE id=%s", (blacklist_id,))
281 | conn.commit()
282 |
283 | def is_blacklisted(self, url):
284 |
285 | with psycopg2.connect(self.db_conn_str) as conn:
286 | cursor = conn.cursor()
287 | parsed_url = urlparse(url)
288 | url = parsed_url.scheme + "://" + parsed_url.netloc
289 | print(url)
290 | cursor.execute("SELECT id FROM BlacklistedWebsite WHERE url LIKE %s LIMIT 1", (url,))
291 |
292 | return cursor.fetchone() is not None
293 |
294 | def get_blacklist(self):
295 |
296 | with psycopg2.connect(self.db_conn_str) as conn:
297 | cursor = conn.cursor()
298 |
299 | cursor.execute("SELECT * FROM BlacklistedWebsite")
300 | return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]
301 |
302 | def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
303 |
304 | with psycopg2.connect(self.db_conn_str) as conn:
305 | cursor = conn.cursor()
306 |
307 | cursor.execute(
308 | "INSERT INTO SearchLogEntry "
309 | "(remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
310 | "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
311 | (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
312 |
313 | conn.commit()
314 |
315 | def get_oldest_updated_websites(self, size: int, prefix: str):
316 |
317 | with psycopg2.connect(self.db_conn_str) as conn:
318 | cursor = conn.cursor()
319 |
320 | cursor.execute("SELECT id, url, last_modified FROM website "
321 | "WHERE url LIKE %s "
322 | "ORDER BY last_modified ASC LIMIT %s",
323 | (prefix + "%", size, ))
324 | return [Website(url=r[1],
325 | website_id=r[0],
326 | last_modified=r[2],
327 | logged_ip=None,
328 | logged_useragent=None
329 | )
330 | for r in cursor.fetchall()]
331 |
--------------------------------------------------------------------------------
/do_recrawl.py:
--------------------------------------------------------------------------------
1 | from tasks import TaskManager
2 |
3 | tm = TaskManager()
4 | tm.do_recrawl()
5 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "2.1"
2 | services:
3 | oddb:
4 | build: .
5 | ports:
6 | - 5020:80
7 | environment:
8 | - "CAPTCHA_LOGIN=True"
9 | - "CAPTCHA_SUBMIT=True"
10 | - "CAPTCHA_SEARCH=True"
11 | - "CAPTCHA_EVERY=10"
12 | - "FLASK_SECRET=changeme"
13 | - "SUBMIT_FTP=False"
14 | - "SUBMIT_HTTP=True"
15 | - "TT_API=http://tt:3010"
16 | - "TT_CRAWL_PROJECT=1"
17 | - "TT_INDEX_PROJECT=2"
18 | - "WSB_API=http://wsb:3020"
19 | - "WSB_SECRET=changeme"
20 | - "REDIS_HOST=oddb_redis"
21 | - "ES_URL=es:9200"
22 | - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable"
23 | - "RECRAWL_POOL_SIZE=10000"
24 | - "INDEXER_THREADS=2"
25 | - "ODDB_USER=admin"
26 | - "ODDB_PASSWORD=changeme"
27 | depends_on:
28 | wsb:
29 | condition: service_started
30 | tt:
31 | condition: service_started
32 | oddb_db:
33 | condition: service_healthy
34 | es:
35 | condition: service_healthy
36 | restart: always
37 | oddb_db:
38 | image: postgres
39 | volumes:
40 | - ./oddb_pg_data:/var/lib/postgresql/data
41 | environment:
42 | - "POSTGRES_USER=od_database"
43 | - "POSTGRES_PASSWORD=changeme"
44 | healthcheck:
45 | test: ["CMD-SHELL", "pg_isready -U od_database"]
46 | interval: 5s
47 | timeout: 5s
48 | retries: 5
49 | oddb_redis:
50 | image: redis
51 | wsb:
52 | image: simon987/wsb_bucket
53 | volumes:
54 | - ./wsb_data:/data
55 | environment:
56 | - "WS_BUCKET_SECRET=changeme"
57 | ports:
58 | - 3020:3020
59 | tt:
60 | image: simon987/task_tracker
61 | volumes:
62 | - ./tt_config.yml:/root/config.yml
63 | ports:
64 | - 3010:80
65 | depends_on:
66 | tt_db:
67 | condition: service_healthy
68 | tt_web:
69 | image: simon987/task_tracker_web
70 | ports:
71 | - 3011:80
72 | depends_on:
73 | tt:
74 | condition: service_started
75 | tt_db:
76 | image: postgres
77 | volumes:
78 | - ./tt_pg_data:/var/lib/postgresql/data
79 | environment:
80 | - "POSTGRES_USER=task_tracker"
81 | - "POSTGRES_PASSWORD=changeme"
82 | healthcheck:
83 | test: ["CMD-SHELL", "pg_isready -U task_tracker"]
84 | interval: 3s
85 | timeout: 2s
86 | retries: 10
87 | es:
88 | image: docker.elastic.co/elasticsearch/elasticsearch:7.5.2
89 | environment:
90 | - discovery.type=single-node
91 | - "ES_JAVA_OPTS=-Xms1G -Xmx4G"
92 | volumes:
93 | - ./es_data:/usr/share/elasticsearch/data
94 | healthcheck:
95 | test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"]
96 | interval: 5s
97 | timeout: 5s
98 | retries: 5
99 | # (Optional)
100 | kibana:
101 | image: docker.elastic.co/kibana/kibana:7.5.2
102 | environment:
103 | - ELASTICSEARCH_HOSTS=http://es:9200
104 | ports:
105 | - 5021:5601
106 | depends_on:
107 | es:
108 | condition: service_healthy
109 |
--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import lz4.frame
5 |
6 | import config
7 | from database import Database
8 | from search.search import ElasticSearchEngine
9 |
10 |
11 | def quote(string):
12 | if "\"" in string:
13 | return "\"" + string.replace("\"", "\"\"") + "\""
14 | elif "," in string:
15 | return "\"" + string + "\""
16 | else:
17 | return string
18 |
19 |
20 | outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
21 | dldir = "static/downloads/"
22 |
23 | print("Deleting existing dumps")
24 | for file in os.listdir(dldir):
25 | if file.endswith("_dump.csv.lz4"):
26 | os.remove(os.path.join(dldir, file))
27 |
28 | print("Export started, connecting to databases...")
29 |
30 | db = Database(config.DB_CONN_STR)
31 | es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
32 |
33 | docs_with_url = db.join_website_url(es.stream_all_docs())
34 |
35 | print("Connected, writing to csv")
36 |
37 | with lz4.frame.open(outfile + ".part", mode='wb',
38 | compression_level=9,
39 | block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
40 | fp.write((",".join(
41 | ["website_id", "website_url", "path", "name", "ext", "size", "mtime"]
42 | ) + "\n").encode())
43 |
44 | for doc in docs_with_url:
45 | try:
46 | fp.write(
47 | (",".join(
48 | [
49 | str(doc["_source"]["website_id"]),
50 | quote(doc["_source"]["website_url"]),
51 | quote(doc["_source"]["path"]),
52 | quote(doc["_source"]["name"]),
53 | quote(doc["_source"]["ext"]),
54 | str(doc["_source"]["size"]),
55 | str(doc["_source"]["mtime"])
56 | ]
57 | ) + "\n").encode())
58 | except Exception as e:
59 | print(e)
60 | print(doc)
61 |
62 |
63 | os.rename(outfile + ".part", os.path.join(dldir, outfile))
64 |
--------------------------------------------------------------------------------
/high_level_diagram.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/high_level_diagram.dia
--------------------------------------------------------------------------------
/high_level_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/high_level_diagram.png
--------------------------------------------------------------------------------
/init_script.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS Website, Admin, BlacklistedWebsite, ApiClient, SearchLogEntry;
2 |
3 | CREATE TABLE Website (
4 |
5 | id SERIAL PRIMARY KEY NOT NULL,
6 | url TEXT,
7 | logged_ip TEXT,
8 | logged_useragent TEXT,
9 | last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP
10 | );
11 |
12 | CREATE TABLE Admin (
13 | username TEXT PRIMARY KEY NOT NULL,
14 | password BYTEA,
15 | role TEXT
16 | );
17 |
18 | CREATE TABLE BlacklistedWebsite (
19 | id SERIAL PRIMARY KEY NOT NULL,
20 | url TEXT
21 | );
22 |
23 | CREATE TABLE ApiClient (
24 | name TEXT PRIMARY KEY NOT NULL,
25 | token TEXT NOT NULL
26 | );
27 |
28 | CREATE TABLE SearchLogEntry (
29 | id SERIAL PRIMARY KEY,
30 | search_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
31 | remote_addr TEXT,
32 | forwarded_for TEXT,
33 | query TEXT,
34 | extensions TEXT,
35 | page INT,
36 | blocked BOOLEAN DEFAULT FALSE,
37 | results INT DEFAULT 0,
38 | took INT DEFAULT 0
39 | );
40 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from app import app
2 |
3 | if __name__ == '__main__':
4 | app.run("0.0.0.0", port=12345)
5 |
--------------------------------------------------------------------------------
/mass_import.py:
--------------------------------------------------------------------------------
1 | import fileinput
2 | import os
3 | from multiprocessing.pool import Pool
4 |
5 | import od_util
6 | from common import db, taskManager
7 | from database import Website
8 | from tasks import Task
9 |
10 | urls = (line for line in fileinput.input())
11 |
12 |
13 | def try_enqueue(url):
14 | url = os.path.join(url, "")
15 | url = od_util.get_top_directory(url)
16 |
17 | if not od_util.is_valid_url(url):
18 | return "Error: Invalid url. Make sure to include the appropriate scheme."
19 |
20 | website = db.get_website_by_url(url)
21 | if website:
22 | return "Website already exists"
23 |
24 | website = db.website_exists(url)
25 | if website:
26 | return "A parent directory of this url has already been posted"
27 |
28 | if db.is_blacklisted(url):
29 | return "Error: " \
30 | "Sorry, this website has been blacklisted. If you think " \
31 | "this is an error, please contact me."
32 |
33 | if not od_util.is_od(url):
34 | return "Error:" \
35 | "The anti-spam algorithm determined that the submitted url is not " \
36 | "an open directory or the server is not responding. If you think " \
37 | "this is an error, please contact me."
38 |
39 | website_id = db.insert_website(Website(url, "localhost", "mass_import.py"))
40 |
41 | task = Task(website_id, url, priority=2)
42 | taskManager.queue_task(task)
43 |
44 | return "The website has been added to the queue"
45 |
46 |
47 | def check_url(url):
48 | url = os.path.join(url.strip(), "")
49 | try:
50 | print(try_enqueue(url))
51 | except:
52 | pass
53 | return None
54 |
55 |
56 | pool = Pool(processes=50)
57 | pool.map(func=check_url, iterable=urls)
58 | pool.close()
59 |
--------------------------------------------------------------------------------
/od_util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from ftplib import FTP
4 | from urllib.parse import urljoin, urlparse
5 |
6 | import requests
7 | import validators
8 | from bs4 import BeautifulSoup
9 |
10 | # TODO: find a better way to do this
11 | try:
12 | from . import config
13 | except (ImportError, SystemError):
14 | import config
15 |
16 | import urllib3
17 | urllib3.disable_warnings()
18 |
19 |
20 | def truncate_path(path, max_len):
21 | pattern = re.compile(r"/?.*?/")
22 |
23 | for i in range(1, path.count("/")):
24 | new_path = pattern.sub(".../", path, i)
25 | if len(new_path) < max_len:
26 | return new_path
27 | return ".../" + path.rsplit("/", maxsplit=1)[1] if "/" in path else path
28 |
29 |
30 | category_map = {
31 |
32 | # Application category
33 | 'bcpio': 'application', 'bin': 'application', 'cdf': 'application',
34 | 'csh': 'application', 'dll': 'application', 'doc': 'application',
35 | 'dot': 'application', 'dvi': 'application', 'eml': 'application',
36 | 'exe': 'application', 'hdf': 'application',
37 | 'man': 'application', 'me': 'application', 'mht': 'application',
38 | 'mhtml': 'application', 'mif': 'application', 'ms': 'application',
39 | 'nc': 'application', 'nws': 'application', 'o': 'application',
40 | 'obj': 'application', 'oda': 'application', 'p12': 'application',
41 | 'p7c': 'application', 'pfx': 'application', 'tr': 'application',
42 | 'ppa': 'application', 'pps': 'application', 'ppt': 'application',
43 | 'ps': 'application', 'pwz': 'application', 'pyc': 'application',
44 | 'pyo': 'application', 'ram': 'application', 'rdf': 'application',
45 | 'roff': 'application', 'sh': 'application', 'so': 'application',
46 | 'src': 'application', 'sv4cpio': 'application', 'sv4crc': 'application',
47 | 't': 'application', 'tcl': 'application', 'tex': 'application',
48 | 'texi': 'application', 'texinfo': 'application', 'ustar': 'application',
49 | 'wiz': 'application', 'wsdl': 'application', 'xlb': 'application',
50 | 'xls': 'application', 'xpdl': 'application', 'xsl': 'application',
51 | 'torrent': 'application', 'rpm': 'application', 'deb': 'application',
52 | 'atr': 'application', 'class': 'application', 'ttf': 'application',
53 | 'img': 'application', 'msi': 'application', 'run': 'application',
54 | 'drpm': 'application', 'udeb': 'application', 'patch': 'application',
55 | 'nes': 'application', 'ebuild': 'application', 'scr': 'application',
56 | # Text category
57 | 'java': 'text', 'cpp': 'text', 'rb': 'text',
58 | 'bat': 'text', 'latex': 'text', 'xml': 'text',
59 | 'etx': 'text', 'htm': 'text', 'c': 'text',
60 | 'css': 'text', 'csv': 'text', 'html': 'text',
61 | 'js': 'text', 'json': 'text', 'ksh': 'text',
62 | 'pl': 'text', 'pot': 'application', 'py': 'text',
63 | 'h': 'text', 'tsv': 'text', 'rtx': 'text',
64 | 'sgm': 'text', 'sgml': 'text', 'txt': 'text',
65 | 'vcf': 'text', 'pdf': 'text', 'epub': 'text',
66 | 'srt': 'text', 'inc': 'text', 'php': 'text',
67 | 'cbz': 'text', 'docx': 'text', 'mobi': 'text',
68 | 'chm': 'text', 'xlsx': "text", 'djvu': 'text',
69 | 'rtf': 'text', 'log': 'text', 'md': 'text',
70 | 'dsc': 'text', 'info': 'text',
71 | # Video category
72 | '3g2': 'video', '3gp': 'video', 'asf': 'video',
73 | 'asx': 'video', 'avi': 'video', 'flv': 'video',
74 | 'swf': 'video', 'vob:': 'video', 'qt': 'video',
75 | 'webm': 'video', 'mov': 'video', 'm1v': 'video',
76 | 'm3u': 'video', 'm3u8': 'video', 'movie': 'video',
77 | 'mp4': 'video', 'mpa': 'video', 'mpe': 'video',
78 | 'mpeg': 'video', 'mpg': 'video', 'mkv': 'video',
79 | 'wmv': 'video', 'm4s': 'video', 'ogv': 'video',
80 | 'm4b': 'video', 'm4v': 'video', 'ts': 'video',
81 |
82 | # Audio category
83 | 'wav': 'audio', 'snd': 'audio', 'mp2': 'audio',
84 | 'aif': 'audio', 'iff': 'audio', 'm4a': 'audio',
85 | 'mid': 'audio', 'midi': 'audio', 'mp3': 'audio',
86 | 'wma': 'audio', 'ra': 'audio', 'aifc': 'audio',
87 | 'aiff': 'audio', 'au': 'audio', 'flac': 'audio',
88 | 'ogg': 'audio', 'oga': 'audio', 'mka': 'video',
89 | 'ac3': 'audio',
90 | # Image category
91 | 'bmp': 'image', 'gif': 'image', 'jpg': 'image',
92 | 'xwd': 'image', 'tif': 'image', 'tiff': 'image',
93 | 'png': 'image', 'pnm': 'image', 'ras': 'image',
94 | 'ico': 'image', 'ief': 'image', 'pgm': 'image',
95 | 'jpe': 'image', 'pbm': 'image', 'jpeg': 'image',
96 | 'ppm': 'image', 'xpm': 'image', 'xbm': 'image',
97 | 'rgb': 'image', 'svg': 'image', 'psd': 'image',
98 | 'yuv': 'image', 'ai': 'image', 'eps': 'image',
99 | 'bw': 'image', 'hdr': 'image',
100 | # Archive category
101 | 'ar': 'archive', 'cpio': 'archive', 'shar': 'archive',
102 | 'iso': 'archive', 'lbr': 'archive', 'mar': 'archive',
103 | 'sbx': 'archive', 'bz2': 'archive', 'f': 'archive',
104 | 'gz': 'archive', 'lz': 'archive', 'lzma': 'archive',
105 | 'lzo': 'archive', 'rz': 'archive', 'sfark': 'archive',
106 | 'sz': 'archive', 'z': 'archive', '7z': 'archive',
107 | 's7z': 'archive', 'ace': 'archive', 'afa': 'archive',
108 | 'alz': 'archive', 'apk': 'archive', 'arc': 'archive',
109 | 'arj': 'archive', 'b1': 'archive', 'b6z': 'archive',
110 | 'a': 'archive', 'bh': 'archive', 'cab': 'archive',
111 | 'car': 'archive', 'cfs': 'archive', 'cpt': 'archive',
112 | 'dar': 'archive', 'dd': 'archive', 'dgc': 'archive',
113 | 'dmg': 'archive', 'ear': 'archive', 'gca': 'archive',
114 | 'ha': 'archive', 'hki': 'archive', 'ice': 'archive',
115 | 'jar': 'archive', 'kgb': 'archive', 'lzh': 'archive',
116 | 'lha': 'archive', 'lzx': 'archive', 'pak': 'archive',
117 | 'partimg': 'archive', 'paq6': 'archive', 'paq7': 'archive',
118 | 'paq8': 'archive', 'pea': 'archive', 'pim': 'archive',
119 | 'pit': 'archive', 'qda': 'archive', 'rar': 'archive',
120 | 'rk': 'archive', 'sda': 'archive', 'sea': 'archive',
121 | 'sen': 'archive', 'sfx': 'archive', 'shk': 'archive',
122 | 'sit': 'archive', 'sitx': 'archive', 'sqx': 'archive',
123 | 'tbz2': 'archive', 'tlz': 'archive', 'xz': 'archive',
124 | 'txz': 'archive', 'uc': 'archive', 'uc0': 'archive',
125 | 'uc2': 'archive', 'ucn': 'archive', 'ur2': 'archive',
126 | 'ue2': 'archive', 'uca': 'archive', 'uha': 'archive',
127 | 'war': 'archive', 'wim': 'archive', 'xar': 'archive',
128 | 'xp3': 'archive', 'yz1': 'archive', 'zip': 'archive',
129 | 'zipx': 'archive', 'zoo': 'archive', 'zpaq': 'archive',
130 | 'zz': 'archive', 'xpi': 'archive', 'tgz': 'archive',
131 | 'tbz': 'archive', 'tar': 'archive', 'bz': 'archive',
132 | 'diz': 'archive',
133 | }
134 |
135 | colors = {
136 | "application": "bg-application",
137 | "text": "bg-text",
138 | "video": "bg-video",
139 | "image": "bg-image",
140 | "audio": "bg-audio",
141 | "archive": "bg-archive"
142 | }
143 |
144 |
145 | def get_color(category):
146 | return colors.get(category, None)
147 |
148 |
149 | def get_category(extension):
150 | return category_map.get(extension, None)
151 |
152 |
153 | def is_valid_url(url):
154 | if not url.endswith("/"):
155 | return False
156 |
157 | if not url.startswith(("http://", "https://", "ftp://")):
158 | return False
159 |
160 | return validators.url(url)
161 |
162 |
163 | def has_extension(link):
164 | return len(os.path.splitext(link)[1]) > 0
165 |
166 |
167 | def is_external_link(base_url, url: str):
168 | url = urljoin(base_url, url).strip()
169 |
170 | if base_url in url:
171 | return False
172 | return True
173 |
174 |
175 | def is_od(url):
176 | if not url.endswith("/"):
177 | print("Url does not end with trailing /")
178 | return False
179 |
180 | try:
181 | if url.startswith("ftp://") and config.SUBMIT_FTP:
182 | ftp = FTP(urlparse(url).netloc)
183 | ftp.login()
184 | ftp.close()
185 | return True
186 | elif config.SUBMIT_HTTP:
187 | r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
188 | if r.status_code != 200:
189 | # print("No redirects allowed!")
190 | return False
191 | soup = BeautifulSoup(r.text, "lxml")
192 |
193 | external_links = sum(1 if is_external_link(url, a.get("href")) else 0 for a in soup.find_all("a"))
194 | link_tags = len(list(soup.find_all("link")))
195 | script_tags = len(list(soup.find_all("script")))
196 |
197 | if external_links > 11:
198 | # print("Too many external links!")
199 | return False
200 |
201 | if link_tags > 5:
202 | # print("Too many link tags!")
203 | return False
204 |
205 | if script_tags > 7:
206 | # print("Too many script tags!")
207 | return False
208 |
209 | return True
210 |
211 | except Exception as e:
212 | # print(e)
213 | return False
214 |
215 |
216 | def has_parent_dir(url):
217 |
218 | parsed_url = urlparse(url)
219 |
220 | if parsed_url.path == "/":
221 | return False
222 |
223 | parent_url = urljoin(url, "../")
224 | try:
225 | r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
226 | if r.status_code != 200:
227 | return False
228 | soup = BeautifulSoup(r.text, "lxml")
229 |
230 | for anchor in soup.find_all("a"):
231 | if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
232 | # The parent page exists, and has a link to the child directory
233 | return is_od(parent_url)
234 |
235 | except:
236 | return False
237 |
238 | # Parent page exists, but does not have a link to the child directory
239 | return False
240 |
241 |
242 | def get_top_directory(url):
243 | if url.startswith("ftp://"):
244 | return url
245 |
246 | while has_parent_dir(url):
247 | url = urljoin(url, "../")
248 | return url
249 |
--------------------------------------------------------------------------------
/reddit_bot.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | import humanfriendly
5 | import praw
6 |
7 |
8 | class RedditBot:
9 |
10 | bottom_line = "^(Beep boop. I am a bot that calculates the file sizes & count of " \
11 | "open directories posted in /r/opendirectories/)"
12 |
13 | def __init__(self, log_file: str, reddit: praw.Reddit):
14 |
15 | self.log_file = log_file
16 |
17 | self.crawled = []
18 | self.load_from_file()
19 | self.reddit = reddit
20 |
21 | def log_crawl(self, post_id):
22 |
23 | self.load_from_file()
24 | self.crawled.append(post_id)
25 |
26 | with open(self.log_file, "w") as f:
27 | for post_id in self.crawled:
28 | f.write(post_id + "\n")
29 |
30 | def has_crawled(self, post_id):
31 | self.load_from_file()
32 | return post_id in self.crawled
33 |
34 | def load_from_file(self):
35 | if not os.path.isfile(self.log_file):
36 | self.crawled = []
37 | else:
38 | with open(self.log_file, "r") as f:
39 | self.crawled = list(filter(None, f.read().split("\n")))
40 |
41 | def reply(self, reddit_obj, comment: str):
42 |
43 | while True:
44 | try:
45 | if not self.has_crawled(reddit_obj.id):
46 | reply = reddit_obj.reply(comment)
47 | self.log_crawl(reddit_obj.id)
48 | print("Reply to " + reddit_obj.id)
49 | return reply
50 | break
51 | except Exception as e:
52 | print("Waiting 5 minutes: " + str(e))
53 | time.sleep(300)
54 | continue
55 |
56 | def edit(self, reddit_comment, new_message):
57 |
58 | while True:
59 | try:
60 | reddit_comment.edit(new_message)
61 | print("Edit comment " + reddit_comment.id)
62 | break
63 | except Exception as e:
64 | print("Waiting 5 minutes: " + str(e))
65 | time.sleep(300)
66 | continue
67 |
68 | @staticmethod
69 | def get_comment(stats: dict, website_id, message: str = ""):
70 | comment = message + " \n" if message else ""
71 |
72 | comment += RedditBot.format_stats(stats)
73 |
74 | comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)"
75 | comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)"
76 | comment += " | [Source](https://github.com/simon987) \n"
77 | comment += "*** \n"
78 | comment += RedditBot.bottom_line
79 |
80 | return comment
81 |
82 | @staticmethod
83 | def format_stats(stats):
84 |
85 | result = " \n"
86 | result += "File types | Count | Total Size\n"
87 | result += ":-- | :-- | :-- \n"
88 | counter = 0
89 | for mime in stats["ext_stats"]:
90 | result += mime[2]
91 | result += " | " + str(mime[1])
92 | result += " | " + humanfriendly.format_size(mime[0]) + " \n"
93 |
94 | counter += 1
95 | if counter >= 3:
96 | break
97 |
98 | result += "**Total** | **" + str(stats["total_count"]) + "** | **"
99 | result += humanfriendly.format_size(stats["total_size"]) + "** \n\n"
100 | return result
101 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | flask_testing
3 | requests
4 | bs4
5 | validators
6 | Flask-Caching
7 | praw
8 | humanfriendly
9 | apscheduler
10 | bcrypt
11 | elasticsearch
12 | python-dateutil
13 | flask_httpauth
14 | ujson
15 | urllib3
16 | pyOpenSSL
17 | lxml
18 | pillow
19 | Wand
20 | numpy
21 | uwsgi
22 | redis
23 | psycopg2-binary
24 | lz4
--------------------------------------------------------------------------------
/search/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from logging import FileHandler
3 |
4 | logger = logging.getLogger("default")
5 | logger.setLevel(logging.DEBUG)
6 |
7 | formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
8 | file_handler = FileHandler("oddb.log")
9 | file_handler.setFormatter(formatter)
10 | logger.addHandler(file_handler)
11 | # logger.addHandler(StreamHandler(sys.stdout))
12 |
--------------------------------------------------------------------------------
/search/filter.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | try:
4 | from fold_to_ascii.fold_to_ascii import mapping
5 | except:
6 | from ..fold_to_ascii.fold_to_ascii import mapping
7 |
8 |
9 | class SearchFilter:
10 |
11 | def __init__(self):
12 |
13 | self.blacklisted_terms = set()
14 | self.table = str.maketrans(dict(mapping.translate_table))
15 |
16 | if os.path.exists("search_blacklist.txt"):
17 | with open("search_blacklist.txt") as f:
18 | self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip())
19 |
20 | def should_block(self, query) -> bool:
21 |
22 | query = query.translate(self.table)
23 | query = query.lower()
24 |
25 | for raw_token in query.split():
26 |
27 | token = raw_token.strip("\"'/\\").strip()
28 | if token in self.blacklisted_terms:
29 | return True
30 |
31 | return False
32 |
--------------------------------------------------------------------------------
/search/search.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | from urllib.parse import urljoin
4 |
5 | import elasticsearch
6 | import ujson
7 | from apscheduler.schedulers.background import BackgroundScheduler
8 | from elasticsearch import helpers
9 |
10 | from search import logger
11 | from search.filter import SearchFilter
12 |
13 |
14 | class InvalidQueryException(Exception):
15 | pass
16 |
17 |
18 | class IndexingError(Exception):
19 | pass
20 |
21 |
22 | class ElasticSearchEngine:
23 | SORT_ORDERS = {
24 | "score": ["_score"],
25 | "size_asc": [{"size": {"order": "asc"}}],
26 | "size_dsc": [{"size": {"order": "desc"}}],
27 | "date_asc": [{"mtime": {"order": "asc"}}],
28 | "date_desc": [{"mtime": {"order": "desc"}}],
29 | "none": []
30 | }
31 |
32 | def __init__(self, url, index_name):
33 | super().__init__()
34 | self.index_name = index_name
35 | logger.info("Connecting to ES @ %s" % url)
36 | self.es = elasticsearch.Elasticsearch(hosts=[url])
37 | self.filter = SearchFilter()
38 |
39 | if not self.es.indices.exists(self.index_name):
40 | self.init()
41 |
42 | def start_stats_scheduler(self):
43 | scheduler = BackgroundScheduler()
44 | scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120)
45 | scheduler.start()
46 |
47 | def init(self):
48 | logger.info("Elasticsearch first time setup")
49 | if self.es.indices.exists(self.index_name):
50 | self.es.indices.delete(index=self.index_name)
51 | self.es.indices.create(index=self.index_name, body={
52 | "settings": {
53 | "index": {
54 | "number_of_shards": 50,
55 | "number_of_replicas": 0,
56 | "refresh_interval": "30s",
57 | "codec": "best_compression"
58 | },
59 | "analysis": {
60 | "analyzer": {
61 | "my_nGram": {
62 | "tokenizer": "my_nGram_tokenizer",
63 | "filter": ["lowercase", "asciifolding"]
64 | }
65 | },
66 | "tokenizer": {
67 | "my_nGram_tokenizer": {
68 | "type": "nGram", "min_gram": 3, "max_gram": 3
69 | }
70 | }
71 | }
72 | }
73 | })
74 |
75 | # Index Mappings
76 | self.es.indices.put_mapping(body={
77 | "properties": {
78 | "path": {"analyzer": "standard", "type": "text"},
79 | "name": {"analyzer": "standard", "type": "text",
80 | "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
81 | "mtime": {"type": "date", "format": "epoch_second"},
82 | "size": {"type": "long"},
83 | "website_id": {"type": "integer"},
84 | "ext": {"type": "keyword"},
85 | },
86 | "_routing": {"required": True}
87 | }, doc_type="file", index=self.index_name, include_type_name=True)
88 |
89 | self.es.indices.open(index=self.index_name)
90 |
91 | def delete_docs(self, website_id):
92 |
93 | while True:
94 | try:
95 | logger.debug("Deleting docs of " + str(website_id))
96 |
97 | to_delete = helpers.scan(query={
98 | "query": {
99 | "term": {
100 | "website_id": website_id
101 | }
102 | }
103 | }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120, routing=website_id)
104 |
105 | buf = []
106 | counter = 0
107 | for doc in to_delete:
108 | buf.append(doc)
109 | counter += 1
110 |
111 | if counter >= 10000:
112 | self._delete(buf, website_id)
113 | buf.clear()
114 | counter = 0
115 | if counter > 0:
116 | self._delete(buf, website_id)
117 | break
118 |
119 | except Exception as e:
120 | logger.error("During delete: " + str(e))
121 | time.sleep(10)
122 |
123 | logger.debug("Done deleting for " + str(website_id))
124 |
125 | def _delete(self, docs, website_id):
126 | bulk_string = self.create_bulk_delete_string(docs)
127 | result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
128 | routing=website_id)
129 |
130 | if result["errors"]:
131 | logger.error("Error in ES bulk delete: \n" + result["errors"])
132 | raise IndexingError
133 |
134 | def import_json(self, in_lines, website_id: int):
135 |
136 | import_every = 10000
137 | cooldown_time = 0
138 |
139 | docs = []
140 |
141 | for line in in_lines:
142 | try:
143 | doc = ujson.loads(line)
144 | name, ext = os.path.splitext(doc["name"])
145 | doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else ""
146 | doc["name"] = name
147 | doc["website_id"] = website_id
148 | docs.append(doc)
149 | except Exception as e:
150 | logger.error("Error in import_json: " + str(e) + " for line : + \n" + line)
151 |
152 | if len(docs) >= import_every:
153 | self._index(docs)
154 | docs.clear()
155 | time.sleep(cooldown_time)
156 |
157 | if docs:
158 | self._index(docs)
159 |
160 | def _index(self, docs):
161 | while True:
162 | try:
163 | logger.debug("Indexing " + str(len(docs)) + " docs")
164 | bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
165 | self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
166 | routing=docs[0]["website_id"])
167 | break
168 | except Exception as e:
169 | logger.error("Error in _index: " + str(e) + ", retrying")
170 | time.sleep(10)
171 |
172 | @staticmethod
173 | def create_bulk_index_string(docs: list):
174 |
175 | action_string = '{"index":{}}\n'
176 | return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
177 |
178 | @staticmethod
179 | def create_bulk_delete_string(docs: list):
180 |
181 | return "\n".join("".join(["{\"delete\":{\"_id\":\"", doc["_id"], "\"}}"]) for doc in docs)
182 |
183 | def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
184 | date_max) -> {}:
185 |
186 | if self.filter.should_block(query):
187 | logger.info("Search was blocked")
188 | raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
189 | "This incident has been reported.")
190 |
191 | filters = []
192 | if extensions:
193 | filters.append({"terms": {"ext": extensions}})
194 |
195 | if size_min > 0 or size_max:
196 | size_filer = dict()
197 | new_filter = {"range": {"size": size_filer}}
198 |
199 | if size_min > 0:
200 | size_filer["gte"] = size_min
201 | if size_max:
202 | size_filer["lte"] = size_max
203 |
204 | filters.append(new_filter)
205 |
206 | if date_min > 0 or date_max:
207 | date_filer = dict()
208 | new_filter = {"range": {"mtime": date_filer}}
209 |
210 | if date_min > 0:
211 | date_filer["gte"] = date_min
212 | if date_max:
213 | date_filer["lte"] = date_max
214 |
215 | filters.append(new_filter)
216 |
217 | sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, [])
218 |
219 | page = self.es.search(body={
220 | "query": {
221 | "bool": {
222 | "must": {
223 | "multi_match": {
224 | "query": query,
225 | "fields": fields,
226 | "operator": "or" if match_all else "and"
227 | }
228 | },
229 | "filter": filters
230 | }
231 | },
232 | "sort": sort_by,
233 | "highlight": {
234 | "fields": {
235 | "name": {"pre_tags": [""], "post_tags": [""]},
236 | "name.nGram": {"pre_tags": [""], "post_tags": [""]},
237 | "path": {"pre_tags": [""], "post_tags": [""]}
238 | }
239 | },
240 | "size": per_page, "from": min(page * per_page, 10000 - per_page)},
241 | index=self.index_name, request_timeout=20)
242 |
243 | return page
244 |
245 | def get_stats(self, website_id: int, subdir: str = None):
246 |
247 | result = self.es.search(body={
248 | "query": {
249 | "constant_score": {
250 | "filter": {
251 | "term": {"website_id": website_id}
252 | }
253 | }
254 | },
255 | "aggs": {
256 | "ext_group": {
257 | "terms": {
258 | "field": "ext",
259 | "size": 12
260 | },
261 | "aggs": {
262 | "size": {
263 | "sum": {
264 | "field": "size"
265 | }
266 | }
267 | }
268 | },
269 | "total_size": {
270 | "sum_bucket": {
271 | "buckets_path": "ext_group>size"
272 | }
273 | }
274 | },
275 | "size": 0
276 | }, index=self.index_name, request_timeout=30, routing=website_id)
277 |
278 | stats = dict()
279 | stats["total_size"] = result["aggregations"]["total_size"]["value"]
280 | stats["total_count"] = result["hits"]["total"]
281 | stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
282 | for b in result["aggregations"]["ext_group"]["buckets"]]
283 |
284 | return stats
285 |
286 | def get_link_list(self, website_id, base_url):
287 |
288 | hits = helpers.scan(client=self.es,
289 | query={
290 | "_source": {
291 | "includes": ["path", "name", "ext"]
292 | },
293 | "query": {
294 | "constant_score": {
295 | "filter": {
296 | "term": {"website_id": website_id}
297 | }
298 | }
299 | },
300 | },
301 | index=self.index_name, request_timeout=20, routing=website_id)
302 | for hit in hits:
303 | src = hit["_source"]
304 | yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
305 | ("." if src["ext"] != "" else "") + src["ext"]
306 |
307 | @staticmethod
308 | def get_global_stats():
309 |
310 | if os.path.exists("_stats.json"):
311 | with open("_stats.json", "r") as f:
312 | return ujson.load(f)
313 | else:
314 | return None
315 |
316 | def _generate_global_stats(self):
317 |
318 | size_per_ext = self.es.search(body={
319 | "query": {
320 | "bool": {
321 | "filter": [
322 | {"range": {
323 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
324 | }}
325 | ]
326 | }
327 | },
328 | "aggs": {
329 | "ext_group": {
330 | "terms": {
331 | "field": "ext",
332 | "size": 40
333 | },
334 | "aggs": {
335 | "size": {
336 | "sum": {
337 | "field": "size"
338 | }
339 | }
340 | }
341 | }
342 | },
343 | "size": 0
344 |
345 | }, index=self.index_name, request_timeout=240)
346 |
347 | total_stats = self.es.search(body={
348 | "query": {
349 | "bool": {
350 | "filter": [
351 | {"range": {
352 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
353 | }}
354 | ]
355 | }
356 | },
357 | "aggs": {
358 | "file_stats": {
359 | "extended_stats": {
360 | "field": "size",
361 | "sigma": 1
362 | }
363 | }
364 | },
365 | "size": 0
366 |
367 | }, index=self.index_name, request_timeout=241)
368 |
369 | size_and_date_histogram = self.es.search(body={
370 | "query": {
371 | "bool": {
372 | "filter": [
373 | {"range": {
374 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
375 | }},
376 | {"range": {
377 | "mtime": {
378 | "gt": 0 # 1970-01-01
379 | }
380 | }}
381 | ]
382 | }
383 | },
384 | "aggs": {
385 | "sizes": {
386 | "histogram": {
387 | "field": "size",
388 | "interval": 100000000, # 100Mb
389 | "min_doc_count": 500
390 | }
391 | },
392 | "dates": {
393 | "date_histogram": {
394 | "field": "mtime",
395 | "interval": "1y",
396 | "min_doc_count": 500,
397 | "format": "yyyy"
398 | }
399 | }
400 | },
401 | "size": 0
402 | }, index=self.index_name, request_timeout=242)
403 |
404 | website_scatter = self.es.search(body={
405 | "query": {
406 | "bool": {
407 | "filter": [
408 | {"range": {
409 | "size": {"gte": 0, "lte": (1000000000000 - 1)} # 0-1TB
410 | }}
411 | ]
412 | }
413 | },
414 | "aggs": {
415 | "websites": {
416 | "terms": {
417 | "field": "website_id",
418 | "size": 600 # TODO: Figure out what size is appropriate
419 | },
420 | "aggs": {
421 | "size": {
422 | "sum": {
423 | "field": "size"
424 | }
425 | }
426 | }
427 | }
428 | },
429 | "size": 0
430 | }, index=self.index_name, request_timeout=243)
431 |
432 | es_stats = self.es.indices.stats(self.index_name, request_timeout=244)
433 |
434 | stats = dict()
435 | stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
436 | stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
437 | stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
438 | stats["es_search_time_avg"] = stats["es_search_time"] / (
439 | stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
440 |
441 | stats["total_count"] = total_stats["aggregations"]["file_stats"]["count"]
442 | stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
443 | stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
444 | stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"]
445 | stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"]
446 | stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"]
447 | stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
448 | for b in size_per_ext["aggregations"]["ext_group"]["buckets"]]
449 | stats["sizes_histogram"] = [(b["key"], b["doc_count"])
450 | for b in size_and_date_histogram["aggregations"]["sizes"]["buckets"]]
451 | stats["dates_histogram"] = [(b["key_as_string"], b["doc_count"])
452 | for b in size_and_date_histogram["aggregations"]["dates"]["buckets"]]
453 | stats["website_scatter"] = [[b["key"], b["doc_count"], b["size"]["value"]]
454 | for b in website_scatter["aggregations"]["websites"]["buckets"]]
455 | stats["base_url"] = "entire database"
456 |
457 | with open("_stats.json", "w") as f:
458 | ujson.dump(stats, f)
459 |
460 | def stream_all_docs(self):
461 | return helpers.scan(query={
462 | "query": {
463 | "match_all": {}
464 | }
465 | }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30)
466 |
467 | def refresh(self):
468 | self.es.indices.refresh(self.index_name)
469 |
--------------------------------------------------------------------------------
/static/Hack-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/Hack-Regular.ttf
--------------------------------------------------------------------------------
/static/css/fa-brands.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 | */
5 | @font-face {
6 | font-family: 'Font Awesome 5 Brands';
7 | font-style: normal;
8 | font-weight: normal;
9 | src: url("../webfonts/fa-brands-400.eot");
10 | src: url("../webfonts/fa-brands-400.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-brands-400.woff2") format("woff2"), url("../webfonts/fa-brands-400.woff") format("woff"), url("../webfonts/fa-brands-400.ttf") format("truetype"), url("../webfonts/fa-brands-400.svg#fontawesome") format("svg"); }
11 |
12 | .fab {
13 | font-family: 'Font Awesome 5 Brands'; }
14 |
--------------------------------------------------------------------------------
/static/css/fa-brands.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 | */
5 | @font-face{font-family:Font Awesome\ 5 Brands;font-style:normal;font-weight:400;src:url(../webfonts/fa-brands-400.eot);src:url(../webfonts/fa-brands-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-brands-400.woff2) format("woff2"),url(../webfonts/fa-brands-400.woff) format("woff"),url(../webfonts/fa-brands-400.ttf) format("truetype"),url(../webfonts/fa-brands-400.svg#fontawesome) format("svg")}.fab{font-family:Font Awesome\ 5 Brands}
--------------------------------------------------------------------------------
/static/css/fa-regular.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 | */
5 | @font-face {
6 | font-family: 'Font Awesome 5 Free';
7 | font-style: normal;
8 | font-weight: 400;
9 | src: url("../webfonts/fa-regular-400.eot");
10 | src: url("../webfonts/fa-regular-400.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-regular-400.woff2") format("woff2"), url("../webfonts/fa-regular-400.woff") format("woff"), url("../webfonts/fa-regular-400.ttf") format("truetype"), url("../webfonts/fa-regular-400.svg#fontawesome") format("svg"); }
11 |
12 | .far {
13 | font-family: 'Font Awesome 5 Free';
14 | font-weight: 400; }
15 |
--------------------------------------------------------------------------------
/static/css/fa-regular.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 | */
5 | @font-face{font-family:Font Awesome\ 5 Free;font-style:normal;font-weight:400;src:url(../webfonts/fa-regular-400.eot);src:url(../webfonts/fa-regular-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-regular-400.woff2) format("woff2"),url(../webfonts/fa-regular-400.woff) format("woff"),url(../webfonts/fa-regular-400.ttf) format("truetype"),url(../webfonts/fa-regular-400.svg#fontawesome) format("svg")}.far{font-family:Font Awesome\ 5 Free;font-weight:400}
--------------------------------------------------------------------------------
/static/css/fa-solid.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 | */
5 | @font-face {
6 | font-family: 'Font Awesome 5 Free';
7 | font-style: normal;
8 | font-weight: 900;
9 | src: url("../webfonts/fa-solid-900.eot");
10 | src: url("../webfonts/fa-solid-900.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-solid-900.woff2") format("woff2"), url("../webfonts/fa-solid-900.woff") format("woff"), url("../webfonts/fa-solid-900.ttf") format("truetype"), url("../webfonts/fa-solid-900.svg#fontawesome") format("svg"); }
11 |
12 | .fa,
13 | .fas {
14 | font-family: 'Font Awesome 5 Free';
15 | font-weight: 900; }
16 |
--------------------------------------------------------------------------------
/static/css/fa-solid.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 | * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 | */
5 | @font-face{font-family:Font Awesome\ 5 Free;font-style:normal;font-weight:900;src:url(../webfonts/fa-solid-900.eot);src:url(../webfonts/fa-solid-900.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-solid-900.woff2) format("woff2"),url(../webfonts/fa-solid-900.woff) format("woff"),url(../webfonts/fa-solid-900.ttf) format("truetype"),url(../webfonts/fa-solid-900.svg#fontawesome) format("svg")}.fa,.fas{font-family:Font Awesome\ 5 Free;font-weight:900}
--------------------------------------------------------------------------------
/static/css/ion.rangeSlider.css:
--------------------------------------------------------------------------------
1 | /* Ion.RangeSlider
2 | // css version 2.0.3
3 | // © 2013-2014 Denis Ineshin | IonDen.com
4 | // ===================================================================================================================*/
5 |
6 | /* =====================================================================================================================
7 | // RangeSlider */
8 |
9 | .irs {
10 | position: relative; display: block;
11 | -webkit-touch-callout: none;
12 | -webkit-user-select: none;
13 | -khtml-user-select: none;
14 | -moz-user-select: none;
15 | -ms-user-select: none;
16 | user-select: none;
17 | }
18 | .irs-line {
19 | position: relative; display: block;
20 | overflow: hidden;
21 | outline: none !important;
22 | }
23 | .irs-line-left, .irs-line-mid, .irs-line-right {
24 | position: absolute; display: block;
25 | top: 0;
26 | }
27 | .irs-line-left {
28 | left: 0; width: 11%;
29 | }
30 | .irs-line-mid {
31 | left: 9%; width: 82%;
32 | }
33 | .irs-line-right {
34 | right: 0; width: 11%;
35 | }
36 |
37 | .irs-bar {
38 | position: absolute; display: block;
39 | left: 0; width: 0;
40 | }
41 | .irs-bar-edge {
42 | position: absolute; display: block;
43 | top: 0; left: 0;
44 | }
45 |
46 | .irs-shadow {
47 | position: absolute; display: none;
48 | left: 0; width: 0;
49 | }
50 |
51 | .irs-slider {
52 | position: absolute; display: block;
53 | cursor: default;
54 | z-index: 1;
55 | }
56 | .irs-slider.single {
57 |
58 | }
59 | .irs-slider.from {
60 |
61 | }
62 | .irs-slider.to {
63 |
64 | }
65 | .irs-slider.type_last {
66 | z-index: 2;
67 | }
68 |
69 | .irs-min {
70 | position: absolute; display: block;
71 | left: 0;
72 | cursor: default;
73 | }
74 | .irs-max {
75 | position: absolute; display: block;
76 | right: 0;
77 | cursor: default;
78 | }
79 |
80 | .irs-from, .irs-to, .irs-single {
81 | position: absolute; display: block;
82 | top: 0; left: 0;
83 | cursor: default;
84 | white-space: nowrap;
85 | }
86 |
87 | .irs-grid {
88 | position: absolute; display: none;
89 | bottom: 0; left: 0;
90 | width: 100%; height: 20px;
91 | }
92 | .irs-with-grid .irs-grid {
93 | display: block;
94 | }
95 | .irs-grid-pol {
96 | position: absolute;
97 | top: 0; left: 0;
98 | width: 1px; height: 8px;
99 | background: #000;
100 | }
101 | .irs-grid-pol.small {
102 | height: 4px;
103 | }
104 | .irs-grid-text {
105 | position: absolute;
106 | bottom: 0; left: 0;
107 | white-space: nowrap;
108 | text-align: center;
109 | font-size: 9px; line-height: 9px;
110 | padding: 0 3px;
111 | color: #000;
112 | }
113 |
114 | .irs-disable-mask {
115 | position: absolute; display: block;
116 | top: 0; left: -1%;
117 | width: 102%; height: 100%;
118 | cursor: default;
119 | background: rgba(0,0,0,0.0);
120 | z-index: 2;
121 | }
122 | .irs-disabled {
123 | opacity: 0.4;
124 | }
125 | .lt-ie9 .irs-disabled {
126 | filter: alpha(opacity=40);
127 | }
128 |
129 |
130 | .irs-hidden-input {
131 | position: absolute !important;
132 | display: block !important;
133 | top: 0 !important;
134 | left: 0 !important;
135 | width: 0 !important;
136 | height: 0 !important;
137 | font-size: 0 !important;
138 | line-height: 0 !important;
139 | padding: 0 !important;
140 | margin: 0 !important;
141 | outline: none !important;
142 | z-index: -9999 !important;
143 | background: none !important;
144 | border-style: solid !important;
145 | border-color: transparent !important;
146 | }
147 |
--------------------------------------------------------------------------------
/static/css/ion.rangeSlider.skinFlat.css:
--------------------------------------------------------------------------------
1 | /* Ion.RangeSlider, Flat UI Skin
2 | // css version 2.0.3
3 | // © Denis Ineshin, 2014 https://github.com/IonDen
4 | // ===================================================================================================================*/
5 |
6 | /* =====================================================================================================================
7 | // Skin details */
8 |
9 | .irs-line-mid,
10 | .irs-line-left,
11 | .irs-line-right,
12 | .irs-bar,
13 | .irs-bar-edge,
14 | .irs-slider {
15 | background: url(../img/sprite-skin-flat.png) repeat-x;
16 | }
17 |
18 | .irs {
19 | height: 40px;
20 | }
21 | .irs-with-grid {
22 | height: 60px;
23 | }
24 | .irs-line {
25 | height: 12px; top: 25px;
26 | }
27 | .irs-line-left {
28 | height: 12px;
29 | background-position: 0 -30px;
30 | }
31 | .irs-line-mid {
32 | height: 12px;
33 | background-position: 0 0;
34 | }
35 | .irs-line-right {
36 | height: 12px;
37 | background-position: 100% -30px;
38 | }
39 |
40 | .irs-bar {
41 | height: 12px; top: 25px;
42 | background-position: 0 -60px;
43 | }
44 | .irs-bar-edge {
45 | top: 25px;
46 | height: 12px; width: 9px;
47 | background-position: 0 -90px;
48 | }
49 |
50 | .irs-shadow {
51 | height: 3px; top: 34px;
52 | background: #000;
53 | opacity: 0.25;
54 | }
55 | .lt-ie9 .irs-shadow {
56 | filter: alpha(opacity=25);
57 | }
58 |
59 | .irs-slider {
60 | width: 16px; height: 18px;
61 | top: 22px;
62 | background-position: 0 -120px;
63 | }
64 | .irs-slider.state_hover, .irs-slider:hover {
65 | background-position: 0 -150px;
66 | }
67 |
68 | .irs-min, .irs-max {
69 | color: #999;
70 | font-size: 10px; line-height: 1.333;
71 | text-shadow: none;
72 | top: 0; padding: 1px 3px;
73 | background: #e1e4e9;
74 | -moz-border-radius: 4px;
75 | border-radius: 4px;
76 | }
77 |
78 | .irs-from, .irs-to, .irs-single {
79 | color: #fff;
80 | font-size: 10px; line-height: 1.333;
81 | text-shadow: none;
82 | padding: 1px 5px;
83 | background: #dc7846;
84 | -moz-border-radius: 4px;
85 | border-radius: 4px;
86 | }
87 | .irs-from:after, .irs-to:after, .irs-single:after {
88 | position: absolute; display: block; content: "";
89 | bottom: -6px; left: 50%;
90 | width: 0; height: 0;
91 | margin-left: -3px;
92 | overflow: hidden;
93 | border: 3px solid transparent;
94 | border-top-color: #dc7846;
95 | }
96 |
97 |
98 | .irs-grid-pol {
99 | background: #e1e4e9;
100 | }
101 | .irs-grid-text {
102 | color: #999;
103 | }
104 |
--------------------------------------------------------------------------------
/static/css/main.css:
--------------------------------------------------------------------------------
1 | a {
2 | border-bottom: none !important;
3 | }
4 | .card {
5 | margin-top: 1em;
6 | }
7 | .jumbotron {
8 | margin-top: 1em;
9 | }
10 | .list-group {
11 | margin-top: 1em;
12 | }
13 | .list-group-item {
14 | padding-bottom: 0.3rem;
15 | }
16 | .badge {
17 | padding-bottom: 0;
18 | }
19 | .table td {
20 | padding: 2px 0;
21 | }
22 | .td-numeric {
23 | text-align: end;
24 | padding-right: 1em;
25 | }
26 |
27 | .bg-application {
28 | background: #8FB847;
29 | color: #FFFFFF;
30 | }
31 |
32 | .bg-archive {
33 | background: #1fa32a;
34 | color: #FFFFFF;
35 | }
36 |
37 | .bg-audio {
38 | background: #009CD8;
39 | color: #FFFFFF;
40 | }
41 |
42 | .bg-video {
43 | background: #DC7D6C;
44 | color: #FFFFFF;
45 | }
46 |
47 | .bg-text {
48 | background: #E19A36;
49 | color: #FFFFFF;
50 | }
51 |
52 | .bg-image {
53 | background: #998AB5;
54 | color: #FFFFFF;
55 | }
56 | .vim-caret {
57 | -webkit-animation: vimCaret 1s linear infinite;
58 | -o-animation: vimCaret 1s linear infinite;
59 | animation: vimCaret 1s linear infinite; }
60 |
61 | .prev-img {
62 | width: 100%;
63 | max-width: 250px;
64 | height: 100%;
65 | }
66 |
67 | .prev-icon {
68 | cursor: pointer;
69 | }
70 | @-webkit-keyframes vimCaret {
71 | 0% {
72 | background-color: transparent; }
73 | 49% {
74 | background-color: transparent; }
75 | 50% {
76 | background-color: rgba(255, 255, 255, 0.6); }
77 | 100% {
78 | background-color: rgba(255, 255, 255, 0.6); } }
79 |
80 | @-o-keyframes vimCaret {
81 | 0% {
82 | background-color: transparent; }
83 | 49% {
84 | background-color: transparent; }
85 | 50% {
86 | background-color: rgba(255, 255, 255, 0.6); }
87 | 100% {
88 | background-color: rgba(255, 255, 255, 0.6); } }
89 |
90 | @keyframes vimCaret {
91 | 0% {
92 | background-color: transparent; }
93 | 49% {
94 | background-color: transparent; }
95 | 50% {
96 | background-color: rgba(255, 255, 255, 0.6); }
97 | 100% {
98 | background-color: rgba(255, 255, 255, 0.6); } }
99 |
100 | mark {
101 | background-color: rgba(255, 255, 0, 0.4);
102 | border-radius: 0;
103 | padding: 1px 0;
104 | }
105 | body {
106 | color: #BBBBBB;
107 | font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif;
108 | background-image: url(/static/img/bg.png);
109 | }
110 |
111 | .card {
112 | background-color: #36393e;
113 | border: 3px double #262626;
114 | }
115 |
116 | .navbar {
117 | background: #36393e;
118 | font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif;
119 | }
120 |
121 | .navbar-brand {
122 | border: none;
123 | }
124 |
125 | .nav-link {
126 | color: #616161;
127 | border-bottom: 2px solid #6c6c6c;
128 | }
129 | .navbar-toggler-icon {
130 | background-image: url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255,255,255, 0.6)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E");
131 | }
132 |
133 | .active {
134 | border-color: #b3b3b3;
135 | color: #E6E6E6;
136 | }
137 |
138 | .nav-link:hover {
139 | color: #c7c7c7;
140 | }
141 |
142 | .jumbotron {
143 | background: #36393e;
144 | }
145 |
146 | a {
147 | color: #fff;
148 | border-bottom: 1px dotted #e0e0e0;
149 | }
150 |
151 | a:hover {
152 | color:#ddd;
153 | text-decoration: none;
154 | }
155 |
156 | .table a {
157 | border: none;
158 | }
159 |
160 | .table th, .table td {
161 | border-top: 1px solid #666a6e;
162 | }
163 |
164 | .table thead th {
165 | border-bottom: 2px solid #999da1;
166 | }
167 | .form-control {
168 | background-color: #2f3136;
169 | color: inherit;
170 | border: 1px solid #282b30;
171 | }
172 |
173 | .form-control:focus {
174 | background-color: #2f3136;
175 | border-color: #80bdff;
176 | color: inherit;
177 | }
178 |
179 | .input-group-text {
180 | border: 1px solid #282b30;
181 | background-color: #686d75;
182 | color: #e9ecef;
183 | }
184 |
185 | .nav-tabs .nav-link {
186 | border-color: transparent;
187 | }
188 |
189 | .nav-tabs .nav-link.active {
190 | border-color: #8e9296 #8e9296;
191 | background-color: transparent;
192 | color: #E6E6E6;
193 | }
194 |
195 | .nav-tabs .nav-link:hover {
196 | border-color: #e9ecef #e9ecef transparent #e9ecef;
197 | }
198 |
199 | .card-header-tabs {
200 | border-bottom: 1px solid #a1a5a9;
201 | }
202 |
203 | * {
204 | outline: none;
205 | }
206 |
207 | #sizeSlider {
208 | width: 100%;
209 | }
210 |
211 | .irs-single, .irs-from, .irs-to {
212 | font-size: 13px;
213 | }
214 |
215 | .irs-slider {
216 | cursor: col-resize;
217 | }
218 |
219 | .custom-select {
220 | overflow: auto;
221 | }
222 |
223 | .irs {
224 | margin-bottom: 1em;
225 | }
226 |
227 | .github-banner {
228 | position: absolute;
229 | top: 0;
230 | right: 0;
231 | border: 0;
232 | }
233 |
234 | @media (max-width: 990px) {
235 | .github-banner {
236 | display: none;
237 | }
238 | }
--------------------------------------------------------------------------------
/static/downloads/README.md:
--------------------------------------------------------------------------------
1 | CSV exports of the database will be available here.
--------------------------------------------------------------------------------
/static/img/bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/bg.png
--------------------------------------------------------------------------------
/static/img/forkme_right_white_ffffff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/forkme_right_white_ffffff.png
--------------------------------------------------------------------------------
/static/img/sprite-skin-flat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/sprite-skin-flat.png
--------------------------------------------------------------------------------
/static/js/popper.min.js:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (C) Federico Zivolo 2018
3 | Distributed under the MIT License (license terms are at http://opensource.org/licenses/MIT).
4 | */(function(e,t){'object'==typeof exports&&'undefined'!=typeof module?module.exports=t():'function'==typeof define&&define.amd?define(t):e.Popper=t()})(this,function(){'use strict';function e(e){return e&&'[object Function]'==={}.toString.call(e)}function t(e,t){if(1!==e.nodeType)return[];var o=getComputedStyle(e,null);return t?o[t]:o}function o(e){return'HTML'===e.nodeName?e:e.parentNode||e.host}function n(e){if(!e)return document.body;switch(e.nodeName){case'HTML':case'BODY':return e.ownerDocument.body;case'#document':return e.body;}var i=t(e),r=i.overflow,p=i.overflowX,s=i.overflowY;return /(auto|scroll|overlay)/.test(r+s+p)?e:n(o(e))}function r(e){if(!e)return document.documentElement;for(var o=ie(10)?document.body:null,n=e.offsetParent;n===o&&e.nextElementSibling;)n=(e=e.nextElementSibling).offsetParent;var i=n&&n.nodeName;return i&&'BODY'!==i&&'HTML'!==i?-1!==['TD','TABLE'].indexOf(n.nodeName)&&'static'===t(n,'position')?r(n):n:e?e.ownerDocument.documentElement:document.documentElement}function p(e){var t=e.nodeName;return'BODY'!==t&&('HTML'===t||r(e.firstElementChild)===e)}function s(e){return null===e.parentNode?e:s(e.parentNode)}function d(e,t){if(!e||!e.nodeType||!t||!t.nodeType)return document.documentElement;var o=e.compareDocumentPosition(t)&Node.DOCUMENT_POSITION_FOLLOWING,n=o?e:t,i=o?t:e,a=document.createRange();a.setStart(n,0),a.setEnd(i,0);var l=a.commonAncestorContainer;if(e!==l&&t!==l||n.contains(i))return p(l)?l:r(l);var f=s(e);return f.host?d(f.host,t):d(e,s(t).host)}function a(e){var t=1=o.clientWidth&&n>=o.clientHeight}),l=0n[e]&&!t.escapeWithReference&&(i=X(p[o],n[e]-('right'===e?p.width:p.height))),se({},o,i)}};return i.forEach(function(e){var t=-1===['left','top'].indexOf(e)?'secondary':'primary';p=de({},p,s[t](e))}),e.offsets.popper=p,e},priority:['left','right','top','bottom'],padding:5,boundariesElement:'scrollParent'},keepTogether:{order:400,enabled:!0,fn:function(e){var t=e.offsets,o=t.popper,n=t.reference,i=e.placement.split('-')[0],r=J,p=-1!==['top','bottom'].indexOf(i),s=p?'right':'bottom',d=p?'left':'top',a=p?'width':'height';return o[s]r(n[s])&&(e.offsets.popper[d]=r(n[s])),e}},arrow:{order:500,enabled:!0,fn:function(e,o){var n;if(!q(e.instance.modifiers,'arrow','keepTogether'))return e;var i=o.element;if('string'==typeof i){if(i=e.instance.popper.querySelector(i),!i)return e;}else if(!e.instance.popper.contains(i))return console.warn('WARNING: `arrow.element` must be child of its popper element!'),e;var r=e.placement.split('-')[0],p=e.offsets,s=p.popper,d=p.reference,a=-1!==['left','right'].indexOf(r),l=a?'height':'width',f=a?'Top':'Left',m=f.toLowerCase(),h=a?'left':'top',g=a?'bottom':'right',u=L(i)[l];d[g]-us[g]&&(e.offsets.popper[m]+=d[m]+u-s[g]),e.offsets.popper=c(e.offsets.popper);var b=d[m]+d[l]/2-u/2,y=t(e.instance.popper),w=parseFloat(y['margin'+f],10),E=parseFloat(y['border'+f+'Width'],10),v=b-e.offsets.popper[m]-w-E;return v=Q(X(s[l]-u,v),0),e.arrowElement=i,e.offsets.arrow=(n={},se(n,m,Math.round(v)),se(n,h,''),n),e},element:'[x-arrow]'},flip:{order:600,enabled:!0,fn:function(e,t){if(P(e.instance.modifiers,'inner'))return e;if(e.flipped&&e.placement===e.originalPlacement)return e;var o=E(e.instance.popper,e.instance.reference,t.padding,t.boundariesElement,e.positionFixed),n=e.placement.split('-')[0],i=S(n),r=e.placement.split('-')[1]||'',p=[];switch(t.behavior){case fe.FLIP:p=[n,i];break;case fe.CLOCKWISE:p=V(n);break;case fe.COUNTERCLOCKWISE:p=V(n,!0);break;default:p=t.behavior;}return p.forEach(function(s,d){if(n!==s||p.length===d+1)return e;n=e.placement.split('-')[0],i=S(n);var a=e.offsets.popper,l=e.offsets.reference,f=J,m='left'===n&&f(a.right)>f(l.left)||'right'===n&&f(a.left)f(l.top)||'bottom'===n&&f(a.top)f(o.right),g=f(a.top)f(o.bottom),b='left'===n&&h||'right'===n&&c||'top'===n&&g||'bottom'===n&&u,y=-1!==['top','bottom'].indexOf(n),w=!!t.flipVariations&&(y&&'start'===r&&h||y&&'end'===r&&c||!y&&'start'===r&&g||!y&&'end'===r&&u);(m||b||w)&&(e.flipped=!0,(m||b)&&(n=p[d+1]),w&&(r=K(r)),e.placement=n+(r?'-'+r:''),e.offsets.popper=de({},e.offsets.popper,T(e.instance.popper,e.offsets.reference,e.placement)),e=N(e.instance.modifiers,e,'flip'))}),e},behavior:'flip',padding:5,boundariesElement:'viewport'},inner:{order:700,enabled:!1,fn:function(e){var t=e.placement,o=t.split('-')[0],n=e.offsets,i=n.popper,r=n.reference,p=-1!==['left','right'].indexOf(o),s=-1===['top','left'].indexOf(o);return i[p?'left':'top']=r[o]-(s?i[p?'width':'height']:0),e.placement=S(t),e.offsets.popper=c(i),e}},hide:{order:800,enabled:!0,fn:function(e){if(!q(e.instance.modifiers,'hide','preventOverflow'))return e;var t=e.offsets.reference,o=D(e.instance.modifiers,function(e){return'preventOverflow'===e.name}).boundaries;if(t.bottomo.right||t.top>o.bottom||t.right= thresh && u < units.length - 1);
423 |
424 | return bytes.toFixed(1) + ' ' + units[u];
425 | }
--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.eot
--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.ttf
--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.woff
--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.woff2
--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.eot
--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.ttf
--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.woff
--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.woff2
--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.eot
--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.ttf
--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.woff
--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.woff2
--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import time
5 | import traceback
6 | from multiprocessing.pool import ThreadPool
7 | from tempfile import NamedTemporaryFile
8 | from threading import Thread
9 | from uuid import uuid4
10 |
11 | import requests
12 | import urllib3
13 |
14 | import config
15 | import database
16 | from database import Website
17 | from search.search import ElasticSearchEngine
18 | from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
19 | from ws_bucket_client.api import WsBucketApi
20 |
21 | urllib3.disable_warnings()
22 |
23 | logger = logging.getLogger("default")
24 |
25 |
26 | class Task:
27 |
28 | def __init__(self, website_id: int, url: str, priority: int = 1,
29 | callback_type: str = None, callback_args: str = None,
30 | upload_token: str = None):
31 | self.website_id = website_id
32 | self.url = url
33 | self.priority = priority
34 | self.callback_type = callback_type
35 | self.callback_args = json.loads(callback_args) if callback_args else {}
36 | self.upload_token = upload_token
37 |
38 | def to_json(self):
39 | return {
40 | "website_id": self.website_id,
41 | "url": self.url,
42 | "callback_type": self.callback_type,
43 | "callback_args": json.dumps(self.callback_args),
44 | "upload_token": self.upload_token
45 | }
46 |
47 | def __str__(self):
48 | return json.dumps(self.to_json())
49 |
50 | def __repr__(self):
51 | return self.__str__()
52 |
53 |
54 | class IndexingTask:
55 |
56 | def __init__(self, website_id: int, file_path: str, callback_type: str, callback_args):
57 | self.website_id = website_id
58 | self.file_path = file_path
59 | self.callback_type = callback_type
60 | self.callback_args = callback_args
61 |
62 |
63 | class TaskManager:
64 |
65 | def __init__(self):
66 | self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
67 | self.db = database.Database(config.DB_CONN_STR)
68 | self.tracker = TaskTrackerApi(config.TT_API)
69 |
70 | self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
71 | self._indexer_threads = list()
72 |
73 | self.worker = Worker.from_file(self.tracker)
74 | if not self.worker:
75 | self.worker = self.tracker.make_worker("$oddb_master")
76 | if not self.worker:
77 | print("Could not create worker: %s" % traceback.format_exc())
78 | return
79 | self.worker.dump_to_file()
80 | self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
81 | self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
82 |
83 | def start_indexer_threads(self):
84 | logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
85 | for _ in range(config.INDEXER_THREADS):
86 | t = Thread(target=self._do_indexing)
87 | t.setDaemon(True)
88 | self._indexer_threads.append(t)
89 | t.start()
90 |
91 | def _do_indexing(self):
92 |
93 | while True:
94 | task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)
95 |
96 | if task:
97 | try:
98 | recipe = task.json_recipe()
99 | logger.debug("Got indexing task: " + str(recipe))
100 |
101 | filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"])
102 |
103 | self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
104 | except Exception as e:
105 | self.worker.release_task(task_id=task.id, result=1, verification=0)
106 | finally:
107 | try:
108 | self.worker.release_task(task_id=task.id, result=0, verification=0)
109 | except:
110 | pass
111 | else:
112 | time.sleep(5)
113 |
114 | def _complete_task(self, file_list, task):
115 |
116 | self.search.delete_docs(task.website_id)
117 |
118 | if file_list:
119 | def iter_lines():
120 | with open(file_list, "r") as f:
121 | line = f.readline()
122 | while line:
123 | yield line
124 | line = f.readline()
125 |
126 | self.search.import_json(iter_lines(), task.website_id)
127 | os.remove(file_list)
128 |
129 | self.db.update_website_date_if_exists(task.website_id)
130 |
131 | def do_recrawl(self):
132 | logger.debug("Creating re-crawl tasks")
133 | self._generate_crawling_tasks()
134 |
135 | def _generate_crawling_tasks(self):
136 |
137 | # TODO: Insert more in-depth re-crawl logic here
138 | websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
139 |
140 | def recrawl(website: Website):
141 | crawl_task = Task(website.id, website.url,
142 | priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
143 | self.queue_task(crawl_task)
144 |
145 | pool = ThreadPool(processes=30)
146 | pool.map(func=recrawl, iterable=websites_to_crawl)
147 | pool.close()
148 |
149 | def queue_task(self, task: Task):
150 | max_assign_time = 24 * 4 * 3600
151 | upload_token = uuid4().__str__()
152 |
153 | task.upload_token = upload_token
154 | tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
155 | recipe=task.__str__(),
156 | priority=task.priority,
157 | max_assign_time=max_assign_time,
158 | hash64=task.website_id,
159 | verification_count=1,
160 | max_retries=3
161 | )
162 | print(tracker_response.text)
163 | logging.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
164 | if not tracker_response.json()["ok"]:
165 | return
166 |
167 | bucket_response = self.bucket.allocate(upload_token.__str__(),
168 | 21474837499, # 20Gib
169 | format_file_name(task.website_id, upload_token),
170 | to_dispose_date=int(time.time() + max_assign_time),
171 | upload_hook="")
172 | logging.info("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))
173 |
174 |
175 | def format_file_name(website_id, token):
176 | return "%d_%s.NDJSON" % (website_id, token,)
177 |
178 |
179 | def download_file(url):
180 | r = requests.get(url, stream=True,)
181 |
182 | if r.status_code != 200:
183 | raise ValueError("HTTP error %d: %s" % (r.status_code, url))
184 |
185 | tmp = NamedTemporaryFile(delete=False)
186 | for chunk in r.iter_content(chunk_size=4096):
187 | if chunk:
188 | tmp.write(chunk)
189 | tmp.close()
190 |
191 | return tmp.name
192 |
--------------------------------------------------------------------------------
/template_filters.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 |
4 | import od_util
5 |
6 |
7 | def setup_template_filters(app):
8 |
9 | app.jinja_env.globals.update(truncate_path=od_util.truncate_path)
10 | app.jinja_env.globals.update(get_color=od_util.get_color)
11 | app.jinja_env.globals.update(get_mime=od_util.get_category)
12 |
13 | @app.template_filter("date_format")
14 | def date_format(value, format='%Y-%m-%d'):
15 | return time.strftime(format, time.gmtime(value))
16 |
17 | @app.template_filter("datetime_format")
18 | def datetime_format(value, format='%Y-%m-%d %H:%M:%S'):
19 | return time.strftime(format, time.gmtime(value))
20 |
21 | @app.template_filter("duration_format")
22 | def duration_format(value):
23 | delay = datetime.timedelta(seconds=value)
24 | if delay.days > 0:
25 | out = str(delay).replace(" days, ", ":")
26 | else:
27 | out = str(delay)
28 | out_ar = out.split(':')
29 | out_ar = ["%02d" % (int(float(x))) for x in out_ar]
30 | out = ":".join(out_ar)
31 | return out
32 |
33 | @app.template_filter("from_timestamp")
34 | def from_timestamp(value):
35 | return datetime.datetime.fromtimestamp(value)
36 |
--------------------------------------------------------------------------------
/templates/admin.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "Admin login - OD-Database" %}
3 |
4 | {% block body %}
5 |
28 | {% endblock body %}
29 |
--------------------------------------------------------------------------------
/templates/contribute.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set current_page = "contribute" %}
3 |
4 | {% block body %}
5 |
15 | {% endblock body %}
16 |
--------------------------------------------------------------------------------
/templates/dashboard.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "Dashboard - OD-Database" %}
3 |
4 | {% block body %}
5 |
6 |
7 |
8 |
9 |
10 |
API Keys
11 |
12 |
13 |
14 | Name |
15 | Token |
16 | Action |
17 |
18 |
19 |
20 |
21 | {% for token in api_tokens %}
22 |
23 | {{ token.name }} |
24 | {{ token.token }} |
25 |
26 |
30 | |
31 |
32 | {% endfor %}
33 |
34 |
35 |
45 |
46 |
47 |
48 |
Blacklist
49 |
50 |
51 |
52 | Netloc |
53 | Action |
54 |
55 |
56 |
57 | {% for item in blacklist %}
58 |
59 | {{ item.netloc }} |
60 | Delete |
61 |
62 | {% endfor %}
63 |
64 |
65 |
75 |
76 |
77 |
78 |
Logout
79 |
80 |
81 |
82 | {% endblock body %}
83 |
--------------------------------------------------------------------------------
/templates/downloads.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "Downloads - OD-Database" %}
3 | {% set current_page = "dl" %}
4 |
5 | {% block body %}
6 |
7 |
8 |
9 |
10 |
11 |
Please let me know if you used the database in a project!
12 |
The entire database is exported to CSV regularly
13 |
14 | {% if not export_file_stats %}
15 |
16 |
No files available.
17 | {% else %}
18 |
19 |
20 |
21 |
22 | Description |
23 | Size |
24 | Date |
25 |
26 |
27 |
28 |
29 | {% for name, path, stat in export_file_stats %}
30 |
31 | {{ name }} |
32 | {{ stat.st_size |filesizeformat }} |
33 | {{ stat.st_mtime|datetime_format }} UTC |
34 |
35 | {% endfor %}
36 |
37 |
38 |
39 | {% endif %}
40 |
41 |
42 |
43 | {% endblock body %}
44 |
--------------------------------------------------------------------------------
/templates/home.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set current_page = "home" %}
3 | {% set title = "OD-Database - Home" %}
4 |
5 | {% block body %}
6 |
7 |
8 |
9 |
OD-Database
10 |
11 | {% if stats and stats["total_size"] %}
12 |
{{ stats["total_count"] }} files totalling
13 | ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites
14 | {% else %}
15 |
We are currently experiencing a high volume of traffic. The search function
16 | may be unresponsive.
17 | {% endif %}
18 |
19 |
20 |
21 |
22 |
23 |
38 |
39 |
40 |
41 |
42 |
Web frontend and backend by simon987,
43 | HTTP crawler by terorie,
44 | hosting provided by The eye
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 | {% endblock body %}
55 |
--------------------------------------------------------------------------------
/templates/layout.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
40 |
41 |
42 | {% block alert_messages %}
43 |
44 |
45 | {% with messages = get_flashed_messages(with_categories=true) %}
46 | {% if messages %}
47 |
48 | {% for category, message in messages %}
49 |
50 |
×
51 | {{ message | safe }}
52 |
53 | {% endfor %}
54 |
55 | {% endif %}
56 | {% endwith %}
57 | {% endblock %}
58 |
59 | {% block body %}
60 | {% endblock body %}
61 |
62 |
63 |
--------------------------------------------------------------------------------
/templates/search.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set current_page = "search" %}
3 |
4 | {% set title = "OD-Database - Search" %}
5 |
6 | {% block body %}
7 |
8 |
9 |
124 |
125 | {% if count > 0 %}
126 |
127 |
128 |
129 |
{{ count }} result(s) in {{ results["took"] }}ms
130 |
131 |
132 |
133 |
134 |
135 | {% for hit in results["hits"]["hits"] %}
136 | {% set src = hit["_source"] %}
137 | {% if "name" in hit["highlight"] %}
138 | {% set hl_name = hit["highlight"]["name"][0] %}
139 | {% elif "name.nGram" in hit["highlight"] %}
140 | {% set hl_name = hit["highlight"]["name.nGram"][0] %}
141 | {% else %}
142 | {% set hl_name = src["name"] %}
143 | {% endif %}
144 |
145 | {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
146 |
147 |
148 |
149 | {% set category = get_mime(src["ext"]) %}
150 | {% set url = src["website_url"] + "/" + src["path"] + "/" + src["name"] + ("." if src["ext"] != "" else "") + src["ext"] %}
151 | {# Preview #}
152 | {% if category == "image" %}
153 |
155 | {% endif %}
156 | {# File name & link #}
157 | {{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }}
158 | {# File type badge #}
159 | {% if category %}
160 |
161 | {{ src["ext"] }}
162 |
163 | {% endif %}
164 | {# File path #}
165 |
166 | {{ src["website_url"] }}/{{ hl_path|safe }}
168 |
169 | |
170 | {# File size & date #}
171 |
172 | {{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}
173 | {{ src["mtime"] | date_format }}
174 | |
175 |
176 | {% endfor %}
177 |
178 |
179 |
180 | {% if count > (p + 1) * per_page %}
181 |
182 | {% endif %}
183 | {% if p > 0 %}
184 |
185 | {% endif %}
186 |
187 |
188 |
189 | {% else %}
190 |
191 |
192 |
No results.
193 |
For better results:
194 |
195 | - Try checking the 'Match any word' box for a broader search.
196 | - Make sure you don't include the file extension in your query (Use the appropriate field to
197 | filter file types)
198 |
199 | - If you're searching for files in a particular website, use the website
200 | search page
201 |
202 |
203 |
204 |
205 | {% endif %}
206 |
207 |
208 |
209 |
284 |
285 |
286 |
287 | {% endblock body %}
288 |
--------------------------------------------------------------------------------
/templates/stats.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "Stats - OD-Database" %}
3 | {% set current_page = "stats" %}
4 |
5 | {% block body %}
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
Calculating...
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
Database stats
28 |
29 |
30 |
31 | Database index size |
32 | |
33 |
34 |
35 | Query count |
36 | |
37 |
38 |
39 | Total query time |
40 | |
41 |
42 |
43 | Average time per query |
44 | |
45 |
46 |
47 | Total file count |
48 | |
49 |
50 |
51 | Size total |
52 | |
53 |
54 |
55 | Size average |
56 | |
57 |
58 |
59 | Size standard deviation |
60 | |
61 |
62 |
63 | Size standard deviation bounds (σ = 1) |
64 | |
65 |
66 |
67 | Size variance |
68 | |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
98 | {% endblock body %}
99 |
--------------------------------------------------------------------------------
/templates/submit.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "OD-Database - Submit website" %}
3 | {% set current_page = "submit" %}
4 |
5 | {% block body %}
6 |
7 |
8 |
20 |
21 |
22 |
23 |
24 | {# Single website #}
25 |
36 |
37 |
38 |
39 | {# Bulk #}
40 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
By submitting this form you agree that your IP address and User Agent will be
61 | saved (for debugging purposes only).
62 |
63 |
64 |
65 |
66 | {% endblock body %}
67 |
--------------------------------------------------------------------------------
/templates/website.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "OD-Database - Website details" %}
3 | {% set current_page = "website" %}
4 |
5 | {% block body %}
6 |
7 |
8 |
9 |
10 |
11 |
12 |
Calculating...
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 | Base url |
22 | |
23 |
24 |
25 |
26 | File count |
27 | |
28 |
29 |
30 |
31 | Total size |
32 | |
33 |
34 |
35 |
36 | Last updated |
37 | |
38 |
39 |
40 |
41 |
42 |
43 |
Link list
44 |
Summary (JSON)
45 | {% if "username" in session %}
46 |
47 | Clear
48 |
49 | Delete
50 |
51 | rescan
52 | {% endif %}
53 |
54 |
55 |
56 |
73 | {% endblock body %}
74 |
--------------------------------------------------------------------------------
/templates/websites.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% set title = "OD-Database - Websites" %}
3 | {% set current_page = "website" %}
4 |
5 |
6 | {% block body %}
7 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 | Url |
36 | Last crawled |
37 |
38 |
39 |
40 | {% for website in websites %}
41 |
42 | {{ website[1] | truncate(70) }} |
43 | {{ website[2] }} |
44 |
45 | {% endfor %}
46 |
47 | {% if websites|length == per_page %}
48 |
Next
49 | {% endif %}
50 | {% if p > 0 %}
51 |
Previous
52 | {% endif %}
53 |
54 |
55 |
56 | {% endblock body %}
57 |
--------------------------------------------------------------------------------
/tt_config.yml:
--------------------------------------------------------------------------------
1 | server:
2 | address: "0.0.0.0:3010"
3 |
4 | database:
5 | conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable"
6 | log_levels: ["error", "info", "warn"]
7 |
8 | git:
9 | webhook_hash: "sha256"
10 | webhook_sig_header: "X-Gogs-Signature"
11 |
12 | log:
13 | level: "trace"
14 |
15 | session:
16 | cookie_name: "tt"
17 | expiration: "48h"
18 |
19 | monitoring:
20 | snapshot_interval: "120s"
21 | history_length: "1800h"
22 |
23 | maintenance:
24 | reset_timed_out_tasks_interval: "10m"
25 |
--------------------------------------------------------------------------------
/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | module = main
3 | callable = app
4 |
5 | enable-threads = true
6 | processes = 4
7 | threads = 16
8 |
9 | disable-logging = True
--------------------------------------------------------------------------------
/views.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from multiprocessing.pool import Pool
4 | from urllib.parse import urlparse
5 |
6 | from flask import render_template, redirect, request, flash, abort, Response, session
7 | from flask_caching import Cache
8 |
9 | import captcha
10 | import config
11 | import od_util
12 | from common import db, taskManager, searchEngine, logger, require_role
13 | from database import Website
14 | from search.search import InvalidQueryException
15 | from tasks import Task
16 |
17 |
18 | def setup_views(app):
19 | cache = Cache(app, config={
20 | "CACHE_TYPE": "redis",
21 | "CACHE_REDIS_HOST": config.REDIS_HOST,
22 | "CACHE_REDIS_PORT": config.REDIS_PORT,
23 | })
24 |
25 | @app.route("/dl")
26 | @cache.cached(120)
27 | def downloads():
28 | # Get content of downloads directory
29 | dl_dir = "static/downloads/"
30 | dir_content = os.listdir(dl_dir)
31 |
32 | # Make paths relative to working directory
33 | # Only allow csv files
34 | files = [
35 | (name, os.path.join(dl_dir, name))
36 | for name in dir_content
37 | if name.find(".csv") != -1
38 | ]
39 |
40 | # Stat files
41 | # Remove any dirs placed accidentally
42 | files = [
43 | (f, full, os.stat(full))
44 | for f, full in files
45 | if os.path.isfile(full)
46 | ]
47 |
48 | if len(files) == 0:
49 | logger.warning("No export file to display in /dl")
50 |
51 | return render_template("downloads.html", export_file_stats=files)
52 |
53 | @app.route("/stats")
54 | @cache.cached(120)
55 | def stats_page():
56 | return render_template("stats.html")
57 |
58 | @app.route("/stats/json_chart")
59 | @cache.cached(240)
60 | def stats_json():
61 | stats = searchEngine.get_global_stats()
62 | if stats:
63 | db.join_website_on_stats(stats)
64 | return Response(json.dumps(stats), mimetype="application/json")
65 | return abort(500)
66 |
67 | @app.route("/website//")
68 | def website_info(website_id):
69 | website = db.get_website_by_id(website_id)
70 |
71 | if website:
72 | return render_template("website.html", website=website)
73 | else:
74 | abort(404)
75 |
76 | @app.route("/website//json_chart")
77 | @cache.memoize(60)
78 | def website_json_chart(website_id):
79 | website = db.get_website_by_id(website_id)
80 |
81 | if website:
82 | stats = searchEngine.get_stats(website_id)
83 | stats["base_url"] = website.url
84 | stats["report_time"] = website.last_modified
85 | return Response(json.dumps(stats), mimetype="application/json")
86 | else:
87 | abort(404)
88 |
89 | @app.route("/website//links")
90 | def website_links(website_id):
91 | website = db.get_website_by_id(website_id)
92 |
93 | if website:
94 | links = searchEngine.get_link_list(website_id, website.url)
95 | return Response("\n".join(links), mimetype="text/plain")
96 | else:
97 | abort(404)
98 |
99 | @app.route("/website/")
100 | def websites():
101 | page = int(request.args.get("p")) if "p" in request.args else 0
102 | url = request.args.get("url") if "url" in request.args else ""
103 | if url:
104 | parsed_url = urlparse(url)
105 | if parsed_url.scheme:
106 | search_term = (parsed_url.scheme + "://" + parsed_url.netloc)
107 | else:
108 | flash("Sorry, I was not able to parse this url format. "
109 | "Make sure you include the appropriate scheme (http/https/ftp)", "warning")
110 | search_term = ""
111 | else:
112 | search_term = url
113 |
114 | return render_template("websites.html",
115 | websites=db.get_websites(50, page, search_term),
116 | p=page, url=search_term, per_page=50)
117 |
118 | @app.route("/website/random")
119 | def random_website():
120 | rand_id = db.get_random_website_id()
121 | if rand_id:
122 | return redirect("/website/" + str())
123 | return redirect("/website/")
124 |
125 | @app.route("/website//clear")
126 | def admin_clear_website(website_id):
127 | require_role("admin")
128 |
129 | searchEngine.delete_docs(website_id)
130 | flash("Cleared all documents associated with this website", "success")
131 | return redirect("/website/" + str(website_id))
132 |
133 | @app.route("/website//delete")
134 | def admin_delete_website(website_id):
135 | require_role("admin")
136 |
137 | searchEngine.delete_docs(website_id)
138 | db.delete_website(website_id)
139 | flash("Deleted website " + str(website_id), "success")
140 | return redirect("/website/")
141 |
142 | @app.route("/website//rescan")
143 | def admin_rescan_website(website_id):
144 | require_role("admin")
145 | website = db.get_website_by_id(website_id)
146 |
147 | if website:
148 | priority = request.args.get("priority") if "priority" in request.args else 1
149 | task = Task(website_id, website.url, priority)
150 | taskManager.queue_task(task)
151 |
152 | flash("Enqueued rescan task", "success")
153 | else:
154 | flash("Website does not exist", "danger")
155 | return redirect("/website/" + str(website_id))
156 |
157 | @app.route("/search")
158 | def search():
159 | results = 0
160 | q = request.args.get("q") if "q" in request.args else ""
161 | sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
162 |
163 | page = request.args.get("p") if "p" in request.args else "0"
164 | page = int(page) if page.isdigit() else 0
165 |
166 | per_page = request.args.get("per_page") if "per_page" in request.args else "50"
167 | per_page = int(per_page) if per_page.isdigit() else "50"
168 | per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
169 |
170 | extensions = request.args.get("ext") if "ext" in request.args else None
171 | extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else []
172 |
173 | size_min = request.args.get("size_min") if "size_min" in request.args else "size_min"
174 | size_min = int(size_min) if size_min.isdigit() else 0
175 | size_max = request.args.get("size_max") if "size_max" in request.args else "size_max"
176 | size_max = int(size_max) if size_max.isdigit() else 0
177 |
178 | date_min = request.args.get("date_min") if "date_min" in request.args else "date_min"
179 | date_min = int(date_min) if date_min.isdigit() else 0
180 | date_max = request.args.get("date_max") if "date_max" in request.args else "date_max"
181 | date_max = int(date_max) if date_max.isdigit() else 0
182 |
183 | match_all = "all" in request.args
184 |
185 | field_name = "field_name" in request.args
186 | field_trigram = "field_trigram" in request.args
187 | field_path = "field_path" in request.args
188 |
189 | if not field_name and not field_trigram and not field_path:
190 | # If no fields are selected, search in all
191 | field_name = field_path = field_trigram = True
192 |
193 | fields = []
194 | if field_path:
195 | fields.append("path")
196 | if field_name:
197 | fields.append("name^5")
198 | if field_trigram:
199 | fields.append("name.nGram^2")
200 |
201 | if len(q) >= 3:
202 |
203 | blocked = False
204 | hits = None
205 | if not config.CAPTCHA_SEARCH or captcha.verify():
206 |
207 | try:
208 | hits = searchEngine.search(q, page, per_page, sort_order,
209 | extensions, size_min, size_max, match_all, fields, date_min, date_max)
210 | hits = db.join_website_on_search_result(hits)
211 | except InvalidQueryException as e:
212 | flash("Invalid query: " + str(e), "warning")
213 | blocked = True
214 | except:
215 | flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
216 | "Please try again later", "danger")
217 |
218 | results = hits["hits"]["total"]["value"] if not isinstance(hits["hits"]["total"], int) else \
219 | hits["hits"]["total"] if hits else -1
220 | took = hits["took"] if hits else -1
221 | forwarded_for = request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None
222 |
223 | logger.info("SEARCH '{}' [res={}, t={}, p={}x{}, ext={}] by {}{}"
224 | .format(q, results, took, page, per_page, str(extensions),
225 | request.remote_addr, "_" + forwarded_for if forwarded_for else ""))
226 |
227 | db.log_search(request.remote_addr, forwarded_for, q, extensions, page, blocked, results, took)
228 | if blocked:
229 | return redirect("/search")
230 | else:
231 | flash("Error: Invalid captcha please try again", "danger")
232 |
233 | else:
234 | hits = None
235 |
236 | return render_template("search.html",
237 | count=results,
238 | results=hits,
239 | q=q,
240 | p=page, per_page=per_page,
241 | sort_order=sort_order,
242 | results_set=config.RESULTS_PER_PAGE,
243 | extensions=",".join(extensions),
244 | size_min=size_min, size_max=size_max,
245 | match_all=match_all,
246 | field_trigram=field_trigram, field_path=field_path, field_name=field_name,
247 | date_min=date_min, date_max=date_max,
248 | show_captcha=config.CAPTCHA_SEARCH, captcha=captcha)
249 |
250 | @app.route("/contribute")
251 | @cache.cached(600)
252 | def contribute():
253 | return render_template("contribute.html")
254 |
255 | @app.route("/")
256 | def home():
257 | try:
258 | stats = searchEngine.get_global_stats()
259 | stats["website_count"] = len(db.get_all_websites())
260 | except:
261 | stats = {}
262 | return render_template("home.html", stats=stats,
263 | show_captcha=config.CAPTCHA_SEARCH, captcha=captcha)
264 |
265 | @app.route("/submit")
266 | def submit():
267 | return render_template("submit.html", captcha=captcha, show_captcha=config.CAPTCHA_SUBMIT)
268 |
269 | def try_enqueue(url):
270 | url = os.path.join(url, "")
271 | url = od_util.get_top_directory(url)
272 |
273 | if not od_util.is_valid_url(url):
274 | return "Error: Invalid url. Make sure to include the appropriate scheme.", "warning"
275 |
276 | website = db.get_website_by_url(url)
277 | if website:
278 | return "Website already exists", "danger"
279 |
280 | website = db.website_exists(url)
281 | if website:
282 | return "A parent directory of this url has already been posted", "danger"
283 |
284 | if db.is_blacklisted(url):
285 | return "Error: " \
286 | "Sorry, this website has been blacklisted. If you think " \
287 | "this is an error, please contact me.", "danger"
288 |
289 | if not od_util.is_od(url):
290 | return "Error:" \
291 | "The anti-spam algorithm determined that the submitted url is not " \
292 | "an open directory or the server is not responding. If you think " \
293 | "this is an error, please contact me.", "danger"
294 |
295 | website_id = db.insert_website(Website(url, str(request.remote_addr + "_" +
296 | request.headers.get("X-Forwarded-For", "")),
297 | request.user_agent))
298 |
299 | task = Task(website_id, url, priority=1)
300 | taskManager.queue_task(task)
301 |
302 | return "The website has been added to the queue", "success"
303 |
304 | @app.route("/enqueue", methods=["POST"])
305 | def enqueue():
306 | if not config.CAPTCHA_SUBMIT or captcha.verify():
307 |
308 | url = os.path.join(request.form.get("url"), "")
309 | message, msg_type = try_enqueue(url)
310 | flash(message, msg_type)
311 |
312 | return redirect("/submit")
313 |
314 | else:
315 | flash("Error: Invalid captcha please try again", "danger")
316 | return redirect("/submit")
317 |
318 | def check_url(url):
319 | url = os.path.join(url, "")
320 | try_enqueue(url)
321 | return None
322 |
323 | @app.route("/enqueue_bulk", methods=["POST"])
324 | def enqueue_bulk():
325 | if not config.CAPTCHA_SUBMIT or captcha.verify():
326 |
327 | urls = request.form.get("urls")
328 | if urls:
329 | urls = urls.split()
330 |
331 | if 0 < len(urls) <= 1000: # TODO: Load from config & adjust placeholder/messages?
332 |
333 | pool = Pool(processes=6)
334 | pool.map(func=check_url, iterable=urls)
335 | pool.close()
336 |
337 | flash("Submitted websites to the queue", "success")
338 |
339 | return redirect("/submit")
340 |
341 | else:
342 | flash("Too few or too many urls, please submit 1-10 urls", "danger")
343 | return redirect("/submit")
344 | else:
345 | flash("Too few or too many urls, please submit 1-10 urls", "danger")
346 | return redirect("/submit")
347 | else:
348 | flash("Error: Invalid captcha please try again", "danger")
349 | return redirect("/submit")
350 |
351 | @app.route("/admin")
352 | def admin_login_form():
353 | if "username" in session:
354 | return redirect("/dashboard")
355 | return render_template("admin.html", captcha=captcha, show_captcha=config.CAPTCHA_LOGIN)
356 |
357 | @app.route("/login", methods=["POST"])
358 | def admin_login():
359 | if not config.CAPTCHA_LOGIN or captcha.verify():
360 |
361 | username = request.form.get("username")
362 | password = request.form.get("password")
363 |
364 | if db.check_login(username, password):
365 | session["username"] = username
366 | flash("Logged in", "success")
367 | return redirect("/dashboard")
368 |
369 | flash("Invalid username/password combo", "danger")
370 | return redirect("/admin")
371 |
372 | else:
373 | flash("Invalid captcha", "danger")
374 | return redirect("/admin")
375 |
376 | @app.route("/logout")
377 | def admin_logout():
378 | session.clear()
379 | flash("Logged out", "info")
380 | return redirect("/")
381 |
382 | @app.route("/dashboard")
383 | def admin_dashboard():
384 | require_role("admin")
385 | tokens = db.get_tokens()
386 | blacklist = db.get_blacklist()
387 |
388 | return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist)
389 |
390 | @app.route("/blacklist/add", methods=["POST"])
391 | def admin_blacklist_add():
392 | require_role("admin")
393 | url = request.form.get("url")
394 | db.add_blacklist_website(url)
395 | flash("Added item to blacklist", "success")
396 | return redirect("/dashboard")
397 |
398 | @app.route("/blacklist//delete")
399 | def admin_blacklist_remove(blacklist_id):
400 | require_role("admin")
401 | db.remove_blacklist_website(blacklist_id)
402 | flash("Removed blacklist item", "success")
403 | return redirect("/dashboard")
404 |
405 | @app.route("/generate_token", methods=["POST"])
406 | def admin_generate_token():
407 | require_role("admin")
408 | description = request.form.get("description")
409 |
410 | db.generate_api_token(description)
411 | flash("Generated API token", "success")
412 |
413 | return redirect("/dashboard")
414 |
415 | @app.route("/del_token", methods=["POST"])
416 | def admin_del_token():
417 | require_role("admin")
418 | token = request.form.get("token")
419 |
420 | db.delete_token(token)
421 | flash("Deleted API token", "success")
422 | return redirect("/dashboard")
423 |
--------------------------------------------------------------------------------