├── ,gitattributes
├── .gitignore
├── .gitmodules
├── Dockerfile
├── LICENSE
├── README.md
├── __init__.py
├── api.py
├── app.py
├── captcha.py
├── captchas
    └── .gitkeep
├── common.py
├── config.py
├── database.py
├── do_recrawl.py
├── docker-compose.yml
├── export.py
├── high_level_diagram.dia
├── high_level_diagram.png
├── init_script.sql
├── main.py
├── mass_import.py
├── od_util.py
├── reddit_bot.py
├── requirements.txt
├── search
    ├── __init__.py
    ├── filter.py
    └── search.py
├── static
    ├── Hack-Regular.ttf
    ├── css
    │   ├── bootstrap.min.css
    │   ├── fa-brands.css
    │   ├── fa-brands.min.css
    │   ├── fa-regular.css
    │   ├── fa-regular.min.css
    │   ├── fa-solid.css
    │   ├── fa-solid.min.css
    │   ├── fontawesome-all.css
    │   ├── fontawesome-all.min.css
    │   ├── fontawesome.css
    │   ├── fontawesome.min.css
    │   ├── ion.rangeSlider.css
    │   ├── ion.rangeSlider.skinFlat.css
    │   ├── main.css
    │   └── style.css
    ├── downloads
    │   └── README.md
    ├── img
    │   ├── bg.png
    │   ├── forkme_right_white_ffffff.png
    │   └── sprite-skin-flat.png
    ├── js
    │   ├── Chart.min.js
    │   ├── bootstrap.min.js
    │   ├── ion.rangeSlider.min.js
    │   ├── jquery.min.js
    │   ├── popper.min.js
    │   ├── report.js
    │   └── script.js
    └── webfonts
    │   ├── fa-brands-400.eot
    │   ├── fa-brands-400.svg
    │   ├── fa-brands-400.ttf
    │   ├── fa-brands-400.woff
    │   ├── fa-brands-400.woff2
    │   ├── fa-regular-400.eot
    │   ├── fa-regular-400.svg
    │   ├── fa-regular-400.ttf
    │   ├── fa-regular-400.woff
    │   ├── fa-regular-400.woff2
    │   ├── fa-solid-900.eot
    │   ├── fa-solid-900.svg
    │   ├── fa-solid-900.ttf
    │   ├── fa-solid-900.woff
    │   └── fa-solid-900.woff2
├── tasks.py
├── template_filters.py
├── templates
    ├── admin.html
    ├── contribute.html
    ├── dashboard.html
    ├── downloads.html
    ├── home.html
    ├── layout.html
    ├── search.html
    ├── stats.html
    ├── submit.html
    ├── website.html
    └── websites.html
├── tt_config.yml
├── uwsgi.ini
└── views.py


/,gitattributes:
--------------------------------------------------------------------------------
1 | static/css/* linguist-vendored
2 | static/css/main.css linguist-vendored=false
3 | static/js/* linguist-vendored


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/
 2 | /static/downloads/
 3 | !/static/downloads/README.md
 4 | __pycache__/
 5 | captchas/
 6 | _stats.json
 7 | oddb.log
 8 | praw.ini
 9 | env/
10 | worker.json
11 | search_blacklist.txt
12 | *.iml
13 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "fold_to_ascii"]
 2 | 	path = fold_to_ascii
 3 | 	url = https://github.com/spanishdict/fold_to_ascii
 4 | [submodule "task_tracker_drone"]
 5 | 	path = task_tracker_drone
 6 | 	url = https://github.com/simon987/task_tracker_drone
 7 | [submodule "ws_bucket_client"]
 8 | 	path = ws_bucket_client
 9 | 	url = https://github.com/simon987/ws_bucket_client
10 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.7
 2 | 
 3 | WORKDIR /app
 4 | 
 5 | ADD requirements.txt /app/requirements.txt
 6 | RUN pip install -r requirements.txt
 7 | 
 8 | ENTRYPOINT ["python", "app.py"]
 9 | 
10 | COPY . /app
11 | 
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Simon Fortier
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # OD-Database
 2 | 
 3 | OD-Database is a web-crawling project that aims to index a very large number of file links and their basic metadata from open directories (misconfigured Apache/Nginx/FTP servers, or more often, mirrors of various public services).
 4 | 
 5 | Each crawler instance fetches tasks from the central server and pushes the result once completed. A single instance can crawl hundreds of websites at the same time (Both FTP and HTTP(S)) and the central server is capable of ingesting thousands of new documents per second. 
 6 | 
 7 | The data is indexed into elasticsearch and made available via the web frontend (Currently hosted at https://od-db.the-eye.eu/). There is currently ~1.93 billion files indexed (total of about 300Gb of raw data). The raw data is made available as a CSV file [here](https://od-db.the-eye.eu/dl).
 8 | 
 9 | ![2018-09-20-194116_1127x639_scrot](https://user-images.githubusercontent.com/7120851/45852325-281cca00-bd0d-11e8-9fed-49a54518e972.png)
10 | 
11 | 
12 | ### Contributing   
13 | Suggestions/concerns/PRs are welcome
14 | 
15 | ## Installation (Docker)
16 | ```bash
17 | git clone --recursive https://github.com/simon987/od-database
18 | cd od-database
19 | mkdir oddb_pg_data/ tt_pg_data/ es_data/ wsb_data/
20 | docker-compose up
21 | ```
22 | 
23 | ## Architecture
24 | 
25 | ![diag](high_level_diagram.png)
26 | 
27 | ## Running the crawl server
28 | The python crawler that was a part of this project is discontinued,
29 | [the go implementation](https://github.com/terorie/od-database-crawler) is currently in use.
30 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 


--------------------------------------------------------------------------------
/api.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from uuid import uuid4
  3 | 
  4 | from flask import request, abort, send_file, session
  5 | 
  6 | import captcha
  7 | import common as oddb
  8 | from common import taskManager
  9 | from database import Website
 10 | from search.search import InvalidQueryException
 11 | 
 12 | 
 13 | def setup_api(app):
 14 |     taskManager.start_indexer_threads()
 15 | 
 16 |     @app.route("/api/website/by_url", methods=["GET"])
 17 |     def api_website_by_url():
 18 |         token = request.args.get("token")
 19 |         name = oddb.db.check_api_token(token)
 20 | 
 21 |         if name:
 22 |             url = request.args.get("url")
 23 |             website = oddb.db.get_website_by_url(url)
 24 |             oddb.logger.info("API get website by url '" + url + "' by " + name)
 25 |             if website:
 26 |                 return str(website.id)
 27 |             return abort(404)
 28 |         else:
 29 |             return abort(403)
 30 | 
 31 |     @app.route("/api/website/blacklisted", methods=["GET"])
 32 |     def api_website_is_blacklisted():
 33 |         token = request.args.get("token")
 34 |         url = request.args.get("url")
 35 |         name = oddb.db.check_api_token(token)
 36 | 
 37 |         if name:
 38 |             oddb.logger.info("API get website is blacklisted '" + url + "' by " + name)
 39 |             return str(oddb.db.is_blacklisted(url))
 40 |         else:
 41 |             return abort(403)
 42 | 
 43 |     @app.route("/api/website/add", methods=["GET"])
 44 |     def api_add_website():
 45 |         token = request.args.get("token")
 46 |         url = request.args.get("url")
 47 | 
 48 |         name = oddb.db.check_api_token(token)
 49 |         if name:
 50 | 
 51 |             website_id = oddb.db.insert_website(Website(url, str(request.remote_addr + "_" +
 52 |                                                                  request.headers.get("X-Forwarded-For", "")),
 53 |                                                         "API_CLIENT_" + name))
 54 |             oddb.logger.info("API add website '" + url + "' by " + name + "(" + str(website_id) + ")")
 55 |             return str(website_id)
 56 |         else:
 57 |             return abort(403)
 58 | 
 59 |     @app.route("/api/website/random")
 60 |     def api_random_website():
 61 |         token = request.json["token"]
 62 |         name = oddb.db.check_api_token(token)
 63 | 
 64 |         if name:
 65 |             oddb.logger.info("API get random website by " + name)
 66 |             return str(oddb.db.get_random_website_id())
 67 |         else:
 68 |             return abort(403)
 69 | 
 70 |     @app.route("/api/search", methods=["POST"])
 71 |     def api_search():
 72 |         token = request.json["token"]
 73 |         name = oddb.db.check_api_token(token)
 74 | 
 75 |         if name:
 76 | 
 77 |             try:
 78 |                 hits = oddb.searchEngine.search(
 79 |                     request.json["query"],
 80 |                     request.json["page"], request.json["per_page"],
 81 |                     request.json["sort_order"],
 82 |                     request.json["extensions"],
 83 |                     request.json["size_min"], request.json["size_max"],
 84 |                     request.json["match_all"],
 85 |                     request.json["fields"],
 86 |                     request.json["date_min"], request.json["date_max"]
 87 |                 )
 88 | 
 89 |                 hits = oddb.db.join_website_on_search_result(hits)
 90 |                 oddb.logger.info("API search '" + request.json["query"] + "' by " + name)
 91 |                 return json.dumps(hits)
 92 | 
 93 |             except InvalidQueryException as e:
 94 |                 oddb.logger.info("API search failed: " + str(e))
 95 |                 return str(e)
 96 |         else:
 97 |             return abort(403)
 98 | 
 99 |     @app.route("/cap", methods=["GET"])
100 |     def cap():
101 |         word = captcha.make_captcha()
102 |         cap_id = uuid4().__str__()
103 |         session["cap"] = cap_id
104 | 
105 |         oddb.redis.set(cap_id, word)
106 | 
107 |         return send_file(captcha.get_path(word), cache_timeout=0)
108 | 
109 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask
 2 | 
 3 | import api
 4 | import common
 5 | import config
 6 | import template_filters
 7 | import views
 8 | import os
 9 | 
10 | app = Flask(__name__)
11 | app.secret_key = config.FLASK_SECRET
12 | template_filters.setup_template_filters(app)
13 | 
14 | views.setup_views(app)
15 | api.setup_api(app)
16 | 
17 | 
18 | if os.environ.get("ODDB_USER", False) and os.environ.get("ODDB_PASSWORD", False):
19 |     user = os.environ["ODDB_USER"]
20 |     password = os.environ["ODDB_PASSWORD"]
21 |     try:
22 |         common.db.generate_login(user, password)
23 |         print("Generated user %s" % user)
24 |     except:
25 |         pass
26 | 
27 | if __name__ == '__main__':
28 |     app.run("0.0.0.0", port=80, threaded=True)
29 | 


--------------------------------------------------------------------------------
/captcha.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import string
  3 | 
  4 | from PIL import Image, ImageDraw, ImageFont
  5 | from flask import request, session
  6 | 
  7 | import common as oddb
  8 | import config
  9 | 
 10 | 
 11 | def get_code():
 12 | 
 13 |     if "cap_remaining" in session and session["cap_remaining"] > 0:
 14 |         return """
 15 |         <span class='text-muted' style='margin: 10px'>You will not be asked to complete a captcha for the next {} pages</span>
 16 |         """.format(session["cap_remaining"])
 17 | 
 18 |     return """
 19 |     <div class='form-group' style='text-align: center'>
 20 |     <img src='./cap' alt='cap'  class='img-fluid' style='margin: 10px;'>
 21 |     <input class="form-control" name="cap" id="cap" placeholder="Verification captcha">
 22 |     </div>
 23 |     """
 24 | 
 25 | 
 26 | def get_path(word):
 27 |     return "captchas/{}.png".format(word)
 28 | 
 29 | 
 30 | def verify():
 31 |     if "cap_remaining" in session and session["cap_remaining"] > 0:
 32 |         session["cap_remaining"] -= 1
 33 |         return True
 34 | 
 35 |     attempt = request.form.get("cap") if "cap" in request.form else (
 36 |         request.args.get("cap") if "cap" in request.args else ""
 37 |     )
 38 | 
 39 |     if "cap" in session:
 40 |         expected = oddb.redis.get(session["cap"])
 41 |         expected = expected.decode("utf8") if expected is not None else ""
 42 |         oddb.redis.delete(session["cap"])
 43 | 
 44 |         if expected == attempt:
 45 |             session["cap_remaining"] = config.CAPTCHA_EVERY
 46 |             return True
 47 | 
 48 |     return False
 49 | 
 50 | 
 51 | cfg = {
 52 |     "image": {
 53 |         "size": (200, 72),
 54 |         "supersampling": 2
 55 |     },
 56 |     "noise": {
 57 |         "min": 100,
 58 |         "max": 250
 59 |     },
 60 |     "colors": {
 61 |         "green": [(1, 51, 1), (34, 204, 34)],
 62 |         "yellow": [(67, 67, 1), (221, 221, 0)],
 63 |         "cyan": [(17, 51, 85), (85, 187, 254)],
 64 |         "magenta": [(51, 1, 51), (254, 0, 254)],
 65 |         "red": [(67, 1, 1), (254, 68, 68)],
 66 |         "orange": [(68, 51, 1), (255, 153, 0)]
 67 |     },
 68 |     "lines": {
 69 |         "back_thin": {"n": 3, "w": 5},
 70 |         "back_thick": {"n": 3, "w": 6},
 71 |         "back_positions": [
 72 |             {
 73 |                 "ax": (0, 10),
 74 |                 "ay": (0, 36),
 75 |                 "bx": (150, 200),
 76 |                 "by": (18, 50)
 77 |             },
 78 |             {
 79 |                 "ax": (0, 10),
 80 |                 "ay": (18, 50),
 81 |                 "bx": (150, 200),
 82 |                 "by": (0, 17)
 83 |             }
 84 |         ],
 85 |         "front_horizontal_thin": {"n": 2, "w": 3},
 86 |         "front_horizontal_thick": {"n": 2, "w": 4},
 87 |         "front_horizontal_positions": [
 88 |             {
 89 |                 "ax": (0, 20),
 90 |                 "ay": (0, 34),
 91 |                 "bx": (150, 200),
 92 |                 "by": (18, 50)
 93 |             },
 94 |             {
 95 |                 "ax": (0, 20),
 96 |                 "ay": (18, 72),
 97 |                 "bx": (140, 200),
 98 |                 "by": (0, 36)
 99 |             },
100 |         ],
101 |         "front_vertical": {"n": 2, "w": 4},
102 |         "front_vertical_positions": {
103 |             "outside": 5,
104 |             "font_width": 13,
105 |             "ay": (0, 16),
106 |             "by": (54, 72)
107 |         }
108 |     },
109 |     "text": {
110 |         "font": {
111 |             "path": "static/Hack-Regular.ttf",
112 |             "size": 60,
113 |             "outline": [1, 2]
114 |         },
115 |         "letters": {
116 |             "3": {
117 |                 "count": 3,
118 |                 "x_min": 35,
119 |                 "x_max": 50,
120 |                 "y_min": -5,
121 |                 "y_max": 8
122 |             },
123 |             "4": {
124 |                 "count": 4,
125 |                 "x_min": 20,
126 |                 "x_max": 35,
127 |                 "y_min": -5,
128 |                 "y_max": 8
129 |             },
130 |             "5": {
131 |                 "count": 5,
132 |                 "x_min": 5,
133 |                 "x_max": 20,
134 |                 "y_min": -5,
135 |                 "y_max": 8
136 |             }
137 |         }
138 |     }
139 | }
140 | 
141 | size = cfg["image"]["size"]
142 | c = cfg["image"]["supersampling"]
143 | 
144 | # Additional config
145 | letter_count = "4"
146 | 
147 | 
148 | def horizontal_lines(draw, c, line_par, line_pos, fill):
149 |     for _ in range(line_par["n"]):
150 |         pos = random.randrange(0, len(line_pos))
151 |         ax = random.randint(*line_pos[pos]["ax"])
152 |         ay = random.randint(*line_pos[pos]["ay"])
153 |         bx = random.randint(*line_pos[pos]["bx"])
154 |         by = random.randint(*line_pos[pos]["by"])
155 |         draw.line([(ax*c, ay*c), (bx*c, by*c)], width=line_par["w"]*c, fill=fill)
156 | 
157 | 
158 | def make_captcha():
159 | 
160 |     color_name, color = random.choice(list(cfg["colors"].items()))
161 |     text = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(cfg["text"]["letters"][letter_count]["count"]))
162 | 
163 |     path = get_path(text)
164 | 
165 |     w = size[0]*c
166 |     h = size[1]*c
167 | 
168 |     img = Image.new('RGB', (w, h))
169 |     pixels = img.load()
170 | 
171 |     # noise
172 |     for x in range(w):
173 |         for y in range(h):
174 |             rcol = random.randint(cfg["noise"]["min"], cfg["noise"]["max"])
175 |             pixels[x, y] = (rcol, rcol, rcol)
176 | 
177 |     # background lines
178 |     draw = ImageDraw.Draw(img)
179 | 
180 |     horizontal_lines(draw, c, cfg["lines"]["back_thin"], cfg["lines"]["back_positions"], color[0])
181 |     horizontal_lines(draw, c, cfg["lines"]["back_thick"], cfg["lines"]["back_positions"], color[0])
182 | 
183 |     # text
184 |     ctx = cfg["text"]["font"]
185 |     font = ImageFont.truetype(ctx["path"], ctx["size"]*c)
186 |     outline = random.choice(ctx["outline"])
187 | 
188 |     ctx = cfg["text"]["letters"][letter_count]
189 |     x = random.randint(ctx["x_min"], ctx["x_max"])
190 |     y = random.randint(ctx["y_min"], ctx["y_max"])
191 |     draw.text((x*c-outline*c, y*c-outline*c), text, color[0], font=font)
192 |     draw.text((x*c-outline*c, y*c), text, color[0], font=font)
193 |     draw.text((x*c-outline*c, y*c+outline*c), text, color[0], font=font)
194 |     draw.text((x*c, y*c-outline*c), text, color[0], font=font)
195 |     draw.text((x*c, y*c+outline*c), text, color[0], font=font)
196 |     draw.text((x*c+outline*c, y*c-outline*c), text, color[0], font=font)
197 |     draw.text((x*c+outline*c, y*c), text, color[0], font=font)
198 |     draw.text((x*c+outline*c, y*c+outline*c), text, color[0], font=font)
199 |     draw.text((x*c, y*c), text, color[1], font=font)
200 | 
201 |     # foreground lines
202 |     horizontal_lines(draw, c, cfg["lines"]["front_horizontal_thin"], cfg["lines"]["front_horizontal_positions"], color[1])
203 |     horizontal_lines(draw, c, cfg["lines"]["front_horizontal_thick"], cfg["lines"]["front_horizontal_positions"], color[1])
204 | 
205 |     # vertical lines
206 |     line_par = cfg["lines"]["front_vertical"]
207 |     line_pos = cfg["lines"]["front_vertical_positions"]
208 | 
209 |     for _ in range(line_par["n"]):
210 |         ax = random.randint(x-line_pos["outside"], x+line_pos["outside"] + cfg["text"]["letters"][letter_count]["count"]*line_pos["font_width"])
211 |         bx = ax + random.randint(-line_pos["font_width"], line_pos["font_width"])
212 |         ay = random.randint(*line_pos["ay"])
213 |         by = random.randint(*line_pos["by"])
214 |         draw.line([(ax*c, ay*c), (bx*c, by*c)], width=line_par["w"]*c, fill=color[1])
215 | 
216 |     img.thumbnail(cfg["image"]["size"], Image.ANTIALIAS)
217 |     img.save(path, "png")
218 | 
219 |     return text
220 | 
221 | 
222 | if __name__ == "__main__":
223 |     make_captcha()
224 | 


--------------------------------------------------------------------------------
/captchas/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/captchas/.gitkeep


--------------------------------------------------------------------------------
/common.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import sys
 3 | from logging import FileHandler, StreamHandler
 4 | 
 5 | import redis as r
 6 | from flask import session, abort
 7 | 
 8 | import config
 9 | from database import Database
10 | from search.search import ElasticSearchEngine
11 | from tasks import TaskManager
12 | 
13 | # Disable flask logging
14 | flaskLogger = logging.getLogger('werkzeug')
15 | flaskLogger.setLevel(logging.ERROR)
16 | 
17 | logger = logging.getLogger("default")
18 | logger.setLevel(logging.DEBUG)
19 | 
20 | formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
21 | file_handler = FileHandler("oddb.log")
22 | file_handler.setFormatter(formatter)
23 | for h in logger.handlers:
24 |     logger.removeHandler(h)
25 | logger.addHandler(file_handler)
26 | logger.addHandler(StreamHandler(sys.stdout))
27 | 
28 | taskManager = TaskManager()
29 | searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
30 | searchEngine.start_stats_scheduler()
31 | db = Database(config.DB_CONN_STR)
32 | 
33 | redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)
34 | 
35 | 
36 | def require_role(role: str):
37 |     if db.get_user_role(session.get("username", None)) != role:
38 |         abort(403)
39 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | from os import environ
 2 | 
 3 | CAPTCHA_LOGIN = bool(environ.get("CAPTCHA_LOGIN", False))
 4 | CAPTCHA_SUBMIT = bool(environ.get("CAPTCHA_SUBMIT", False))
 5 | CAPTCHA_SEARCH = bool(environ.get("CAPTCHA_SEARCH", False))
 6 | CAPTCHA_EVERY = int(environ.get("CAPTCHA_EVERY", 10))
 7 | 
 8 | FLASK_SECRET = environ.get("FLASK_SECRET", "A very secret secret")
 9 | RESULTS_PER_PAGE = (12, 25, 50, 100, 250, 500, 1000)
10 | 
11 | SUBMIT_FTP = bool(environ.get("SUBMIT_FTP", False))
12 | SUBMIT_HTTP = bool(environ.get("SUBMIT_HTTP", True))
13 | 
14 | TT_API = environ.get("TT_API", "http://localhost:3010")
15 | TT_CRAWL_PROJECT = int(environ.get("TT_CRAWL_PROJECT", 3))
16 | TT_INDEX_PROJECT = int(environ.get("TT_INDEX_PROJECT", 9))
17 | 
18 | WSB_API = environ.get("WSB_API", "http://localhost:3020")
19 | WSB_SECRET = environ.get("WSB_SECRET", "default_secret")
20 | 
21 | ES_URL = environ.get("ES_URL", "http://localhost:9200")
22 | ES_INDEX = environ.get("ES_INDEX", "od-database")
23 | 
24 | REDIS_HOST = environ.get("REDIS_HOST", "localhost")
25 | REDIS_PORT = environ.get("REDIS_PORT", 6379)
26 | 
27 | DB_CONN_STR = environ.get("DB_CONN_STR", "dbname=od_database user=od_database password=od_database")
28 | RECRAWL_POOL_SIZE = environ.get("RECRAWL_POOL_SIZE", 10000)
29 | INDEXER_THREADS = int(environ.get("INDEXER_THREAD", 3))
30 | 


--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import uuid
  4 | from urllib.parse import urlparse, urljoin
  5 | 
  6 | import bcrypt
  7 | import psycopg2
  8 | 
  9 | 
 10 | class BlacklistedWebsite:
 11 |     def __init__(self, blacklist_id, url):
 12 |         self.id = blacklist_id
 13 |         self.netloc = url
 14 | 
 15 | 
 16 | class Website:
 17 | 
 18 |     def __init__(self, url, logged_ip, logged_useragent, last_modified=None, website_id=None):
 19 |         self.url = url
 20 |         self.logged_ip = logged_ip
 21 |         self.logged_useragent = logged_useragent
 22 |         self.last_modified = last_modified
 23 |         self.id = website_id
 24 | 
 25 | 
 26 | class ApiClient:
 27 | 
 28 |     def __init__(self, token, name):
 29 |         self.token = token
 30 |         self.name = name
 31 | 
 32 | 
 33 | class Database:
 34 | 
 35 |     def __init__(self, db_conn_str):
 36 |         self.db_conn_str = db_conn_str
 37 |         self.website_cache = dict()
 38 |         self.website_cache_time = 0
 39 | 
 40 |         with psycopg2.connect(self.db_conn_str) as conn:
 41 |             cursor = conn.cursor()
 42 |             cursor.execute("SELECT EXISTS (SELECT 1 FROM pg_tables "
 43 |                            "WHERE tablename = 'searchlogentry')")
 44 | 
 45 |             if not cursor.fetchone()[0]:
 46 |                 self.init_database()
 47 | 
 48 |     def init_database(self):
 49 | 
 50 |         print("Initializing database")
 51 | 
 52 |         with open("init_script.sql", "r") as f:
 53 |             init_script = f.read()
 54 | 
 55 |         with psycopg2.connect(self.db_conn_str) as conn:
 56 |             cur = conn.cursor()
 57 |             cur.execute(init_script)
 58 | 
 59 |     def update_website_date_if_exists(self, website_id):
 60 | 
 61 |         with psycopg2.connect(self.db_conn_str) as conn:
 62 |             cursor = conn.cursor()
 63 |             cursor.execute("UPDATE Website SET last_modified=CURRENT_TIMESTAMP WHERE id=%s", (website_id,))
 64 |             conn.commit()
 65 | 
 66 |     def insert_website(self, website: Website):
 67 | 
 68 |         with psycopg2.connect(self.db_conn_str) as conn:
 69 |             cursor = conn.cursor()
 70 |             cursor.execute("INSERT INTO Website (url, logged_ip, logged_useragent) VALUES (%s,%s,%s) RETURNING id",
 71 |                            (website.url, str(website.logged_ip), str(website.logged_useragent)))
 72 | 
 73 |             website_id = cursor.fetchone()[0]
 74 |             conn.commit()
 75 | 
 76 |         return website_id
 77 | 
 78 |     def get_website_by_url(self, url):
 79 | 
 80 |         with psycopg2.connect(self.db_conn_str) as conn:
 81 |             cursor = conn.cursor()
 82 | 
 83 |             cursor.execute("SELECT id, url, logged_ip, logged_useragent, last_modified FROM Website WHERE url=%s",
 84 |                            (url,))
 85 |             db_web = cursor.fetchone()
 86 |         if db_web:
 87 |             website = Website(db_web[1], db_web[2], db_web[3], db_web[4], str(db_web[0]))
 88 |             return website
 89 |         else:
 90 |             return None
 91 | 
 92 |     def get_website_by_id(self, website_id):
 93 | 
 94 |         with psycopg2.connect(self.db_conn_str) as conn:
 95 |             cursor = conn.cursor()
 96 | 
 97 |             cursor.execute("SELECT * FROM Website WHERE id=%s", (website_id,))
 98 |             db_web = cursor.fetchone()
 99 | 
100 |             if db_web:
101 |                 website = Website(db_web[1], db_web[2], db_web[3], str(db_web[4]))
102 |                 website.id = db_web[0]
103 |                 return website
104 |             else:
105 |                 return None
106 | 
107 |     def get_websites(self, per_page, page: int, url):
108 |         """Get all websites"""
109 |         with psycopg2.connect(self.db_conn_str) as conn:
110 |             cursor = conn.cursor()
111 | 
112 |             cursor.execute("SELECT Website.id, Website.url, Website.last_modified FROM Website "
113 |                            "WHERE Website.url LIKE %s "
114 |                            "ORDER BY last_modified DESC LIMIT %s OFFSET %s", (url + "%", per_page, page * per_page))
115 | 
116 |             return cursor.fetchall()
117 | 
118 |     def get_random_website_id(self):
119 | 
120 |         with psycopg2.connect(self.db_conn_str) as conn:
121 |             cursor = conn.cursor()
122 |             cursor.execute("SELECT id FROM Website ORDER BY random() LIMIT 1")
123 | 
124 |             row = cursor.fetchone()
125 |             if row:
126 |                 return row[0]
127 |             return None
128 | 
129 |     def website_exists(self, url):
130 |         """Check if an url or the parent directory of an url already exists"""
131 |         with psycopg2.connect(self.db_conn_str) as conn:
132 |             cursor = conn.cursor()
133 | 
134 |             cursor.execute("SELECT id FROM Website WHERE url = substr(%s, 0, length(url) + 1)", (url,))
135 |             website_id = cursor.fetchone()
136 |             return website_id[0] if website_id else None
137 | 
138 |     def delete_website(self, website_id):
139 | 
140 |         with psycopg2.connect(self.db_conn_str) as conn:
141 |             cursor = conn.cursor()
142 | 
143 |             cursor.execute("DELETE FROM Website WHERE id=%s", (website_id,))
144 |             conn.commit()
145 | 
146 |     def check_login(self, username, password) -> bool:
147 |         with psycopg2.connect(self.db_conn_str) as conn:
148 |             cursor = conn.cursor()
149 | 
150 |             cursor.execute("SELECT password FROM Admin WHERE username=%s", (username,))
151 | 
152 |             db_user = cursor.fetchone()
153 | 
154 |             if db_user:
155 |                 return bcrypt.checkpw(password.encode(), db_user[0].tobytes())
156 |             return False
157 | 
158 |     def get_user_role(self, username: str):
159 |         with psycopg2.connect(self.db_conn_str) as conn:
160 |             cursor = conn.cursor()
161 | 
162 |             cursor.execute("SELECT role FROM Admin WHERE username=%s", (username,))
163 | 
164 |             db_user = cursor.fetchone()
165 | 
166 |             if db_user:
167 |                 return db_user[0]
168 |             return False
169 | 
170 |     def generate_login(self, username, password) -> None:
171 | 
172 |         with psycopg2.connect(self.db_conn_str) as conn:
173 |             cursor = conn.cursor()
174 | 
175 |             hashed_pw = bcrypt.hashpw(password.encode(), bcrypt.gensalt(12))
176 | 
177 |             cursor.execute("INSERT INTO Admin (username, password, role) VALUES (%s,%s, 'admin')",
178 |                            (username, hashed_pw))
179 |             conn.commit()
180 | 
181 |     def check_api_token(self, token) -> str:
182 | 
183 |         with psycopg2.connect(self.db_conn_str) as conn:
184 |             cursor = conn.cursor()
185 | 
186 |             cursor.execute("SELECT name FROM ApiClient WHERE token=%s", (token,))
187 |             result = cursor.fetchone()
188 |             return result[0] if result else None
189 | 
190 |     def generate_api_token(self, name: str) -> str:
191 | 
192 |         with psycopg2.connect(self.db_conn_str) as conn:
193 |             cursor = conn.cursor()
194 | 
195 |             token = str(uuid.uuid4())
196 |             cursor.execute("INSERT INTO ApiClient (token, name) VALUES (%s, %s)", (token, name))
197 |             conn.commit()
198 | 
199 |             return token
200 | 
201 |     def get_tokens(self) -> list:
202 | 
203 |         with psycopg2.connect(self.db_conn_str) as conn:
204 |             cursor = conn.cursor()
205 | 
206 |             cursor.execute("SELECT token, name FROM ApiClient")
207 | 
208 |             return [ApiClient(x[0], x[1]) for x in cursor.fetchall()]
209 | 
210 |     def delete_token(self, token: str) -> None:
211 | 
212 |         with psycopg2.connect(self.db_conn_str) as conn:
213 |             cursor = conn.cursor()
214 | 
215 |             cursor.execute("DELETE FROM ApiClient WHERE token=%s", (token,))
216 |             conn.commit()
217 | 
218 |     def get_all_websites(self) -> dict:
219 |         if self.website_cache_time + 120 < time.time():
220 |             with psycopg2.connect(self.db_conn_str) as conn:
221 |                 cursor = conn.cursor()
222 | 
223 |                 cursor.execute("SELECT id, url FROM Website")
224 | 
225 |                 result = dict()
226 | 
227 |                 for db_website in cursor.fetchall():
228 |                     result[db_website[0]] = db_website[1]
229 | 
230 |                 self.website_cache = result
231 |                 self.website_cache_time = time.time()
232 | 
233 |         return self.website_cache
234 | 
235 |     def join_website_on_search_result(self, page: dict) -> dict:
236 | 
237 |         websites = self.get_all_websites()
238 | 
239 |         for hit in page["hits"]["hits"]:
240 |             if hit["_source"]["website_id"] in websites:
241 |                 hit["_source"]["website_url"] = urljoin(websites[hit["_source"]["website_id"]], "/")
242 |             else:
243 |                 hit["_source"]["website_url"] = "[DELETED]"
244 | 
245 |         return page
246 | 
247 |     def join_website_url(self, docs):
248 | 
249 |         websites = self.get_all_websites()
250 | 
251 |         for doc in docs:
252 |             if doc["_source"]["website_id"] in websites:
253 |                 doc["_source"]["website_url"] = urljoin(websites[doc["_source"]["website_id"]], "/")
254 |             else:
255 |                 doc["_source"]["website_url"] = "[DELETED]"
256 | 
257 |             yield doc
258 | 
259 |     def join_website_on_stats(self, stats):
260 | 
261 |         websites = self.get_all_websites()
262 | 
263 |         for website in stats["website_scatter"]:
264 |             website[0] = websites.get(website[0], "[DELETED]")
265 | 
266 |     def add_blacklist_website(self, url):
267 | 
268 |         with psycopg2.connect(self.db_conn_str) as conn:
269 |             cursor = conn.cursor()
270 |             parsed_url = urlparse(url)
271 |             url = parsed_url.scheme + "://" + parsed_url.netloc
272 |             cursor.execute("INSERT INTO BlacklistedWebsite (url) VALUES (%s)", (url,))
273 |             conn.commit()
274 | 
275 |     def remove_blacklist_website(self, blacklist_id):
276 | 
277 |         with psycopg2.connect(self.db_conn_str) as conn:
278 |             cursor = conn.cursor()
279 | 
280 |             cursor.execute("DELETE FROM BlacklistedWebsite WHERE id=%s", (blacklist_id,))
281 |             conn.commit()
282 | 
283 |     def is_blacklisted(self, url):
284 | 
285 |         with psycopg2.connect(self.db_conn_str) as conn:
286 |             cursor = conn.cursor()
287 |             parsed_url = urlparse(url)
288 |             url = parsed_url.scheme + "://" + parsed_url.netloc
289 |             print(url)
290 |             cursor.execute("SELECT id FROM BlacklistedWebsite WHERE url LIKE %s LIMIT 1", (url,))
291 | 
292 |             return cursor.fetchone() is not None
293 | 
294 |     def get_blacklist(self):
295 | 
296 |         with psycopg2.connect(self.db_conn_str) as conn:
297 |             cursor = conn.cursor()
298 | 
299 |             cursor.execute("SELECT * FROM BlacklistedWebsite")
300 |             return [BlacklistedWebsite(r[0], r[1]) for r in cursor.fetchall()]
301 | 
302 |     def log_search(self, remote_addr, forwarded_for, q, exts, page, blocked, results, took):
303 | 
304 |         with psycopg2.connect(self.db_conn_str) as conn:
305 |             cursor = conn.cursor()
306 | 
307 |             cursor.execute(
308 |                 "INSERT INTO SearchLogEntry "
309 |                 "(remote_addr, forwarded_for, query, extensions, page, blocked, results, took) "
310 |                 "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
311 |                 (remote_addr, forwarded_for, q, ",".join(exts), page, blocked, results, took))
312 | 
313 |             conn.commit()
314 | 
315 |     def get_oldest_updated_websites(self, size: int, prefix: str):
316 | 
317 |         with psycopg2.connect(self.db_conn_str) as conn:
318 |             cursor = conn.cursor()
319 | 
320 |             cursor.execute("SELECT id, url, last_modified FROM website "
321 |                            "WHERE url LIKE %s "
322 |                            "ORDER BY last_modified ASC LIMIT %s",
323 |                            (prefix + "%", size, ))
324 |             return [Website(url=r[1],
325 |                             website_id=r[0],
326 |                             last_modified=r[2],
327 |                             logged_ip=None,
328 |                             logged_useragent=None
329 |                             )
330 |                     for r in cursor.fetchall()]
331 | 


--------------------------------------------------------------------------------
/do_recrawl.py:
--------------------------------------------------------------------------------
1 | from tasks import TaskManager
2 | 
3 | tm = TaskManager()
4 | tm.do_recrawl()
5 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
  1 | version: "2.1"
  2 | services:
  3 |   oddb:
  4 |     build: .
  5 |     ports:
  6 |       - 5020:80
  7 |     environment:
  8 |       - "CAPTCHA_LOGIN=True"
  9 |       - "CAPTCHA_SUBMIT=True"
 10 |       - "CAPTCHA_SEARCH=True"
 11 |       - "CAPTCHA_EVERY=10"
 12 |       - "FLASK_SECRET=changeme"
 13 |       - "SUBMIT_FTP=False"
 14 |       - "SUBMIT_HTTP=True"
 15 |       - "TT_API=http://tt:3010"
 16 |       - "TT_CRAWL_PROJECT=1"
 17 |       - "TT_INDEX_PROJECT=2"
 18 |       - "WSB_API=http://wsb:3020"
 19 |       - "WSB_SECRET=changeme"
 20 |       - "REDIS_HOST=oddb_redis"
 21 |       - "ES_URL=es:9200"
 22 |       - "DB_CONN_STR=postgres://od_database:changeme@oddb_db/od_database?sslmode=disable"
 23 |       - "RECRAWL_POOL_SIZE=10000"
 24 |       - "INDEXER_THREADS=2"
 25 |       - "ODDB_USER=admin"
 26 |       - "ODDB_PASSWORD=changeme"
 27 |     depends_on:
 28 |       wsb:
 29 |         condition: service_started
 30 |       tt:
 31 |         condition: service_started
 32 |       oddb_db:
 33 |         condition: service_healthy
 34 |       es:
 35 |         condition: service_healthy
 36 |     restart: always
 37 |   oddb_db:
 38 |     image: postgres
 39 |     volumes:
 40 |       - ./oddb_pg_data:/var/lib/postgresql/data
 41 |     environment:
 42 |       - "POSTGRES_USER=od_database"
 43 |       - "POSTGRES_PASSWORD=changeme"
 44 |     healthcheck:
 45 |       test: ["CMD-SHELL", "pg_isready -U od_database"]
 46 |       interval: 5s
 47 |       timeout: 5s
 48 |       retries: 5
 49 |   oddb_redis:
 50 |     image: redis
 51 |   wsb:
 52 |     image: simon987/wsb_bucket
 53 |     volumes:
 54 |       - ./wsb_data:/data
 55 |     environment:
 56 |       - "WS_BUCKET_SECRET=changeme"
 57 |     ports:
 58 |       - 3020:3020
 59 |   tt:
 60 |     image: simon987/task_tracker
 61 |     volumes:
 62 |       - ./tt_config.yml:/root/config.yml
 63 |     ports:
 64 |       - 3010:80
 65 |     depends_on:
 66 |       tt_db:
 67 |         condition: service_healthy
 68 |   tt_web:
 69 |     image: simon987/task_tracker_web
 70 |     ports:
 71 |       - 3011:80
 72 |     depends_on:
 73 |       tt:
 74 |         condition: service_started
 75 |   tt_db:
 76 |     image: postgres
 77 |     volumes:
 78 |       - ./tt_pg_data:/var/lib/postgresql/data
 79 |     environment:
 80 |       - "POSTGRES_USER=task_tracker"
 81 |       - "POSTGRES_PASSWORD=changeme"
 82 |     healthcheck:
 83 |       test: ["CMD-SHELL", "pg_isready -U task_tracker"]
 84 |       interval: 3s
 85 |       timeout: 2s
 86 |       retries: 10
 87 |   es:
 88 |     image: docker.elastic.co/elasticsearch/elasticsearch:7.5.2
 89 |     environment:
 90 |       - discovery.type=single-node
 91 |       - "ES_JAVA_OPTS=-Xms1G -Xmx4G"
 92 |     volumes:
 93 |       - ./es_data:/usr/share/elasticsearch/data
 94 |     healthcheck:
 95 |       test: ["CMD-SHELL", "curl --silent --fail localhost:9200/_cluster/health || exit 1"]
 96 |       interval: 5s
 97 |       timeout: 5s
 98 |       retries: 5
 99 |   # (Optional)
100 |   kibana:
101 |     image: docker.elastic.co/kibana/kibana:7.5.2
102 |     environment:
103 |       - ELASTICSEARCH_HOSTS=http://es:9200
104 |     ports:
105 |       - 5021:5601
106 |     depends_on:
107 |       es:
108 |         condition: service_healthy
109 | 


--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | 
 4 | import lz4.frame
 5 | 
 6 | import config
 7 | from database import Database
 8 | from search.search import ElasticSearchEngine
 9 | 
10 | 
11 | def quote(string):
12 |     if "\"" in string:
13 |         return "\"" + string.replace("\"", "\"\"") + "\""
14 |     elif "," in string:
15 |         return "\"" + string + "\""
16 |     else:
17 |         return string
18 | 
19 | 
20 | outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
21 | dldir = "static/downloads/"
22 | 
23 | print("Deleting existing dumps")
24 | for file in os.listdir(dldir):
25 |     if file.endswith("_dump.csv.lz4"):
26 |         os.remove(os.path.join(dldir, file))
27 | 
28 | print("Export started, connecting to databases...")
29 | 
30 | db = Database(config.DB_CONN_STR)
31 | es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
32 | 
33 | docs_with_url = db.join_website_url(es.stream_all_docs())
34 | 
35 | print("Connected, writing to csv")
36 | 
37 | with lz4.frame.open(outfile + ".part", mode='wb',
38 |                     compression_level=9,
39 |                     block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
40 |     fp.write((",".join(
41 |         ["website_id", "website_url", "path", "name", "ext", "size", "mtime"]
42 |     ) + "\n").encode())
43 | 
44 |     for doc in docs_with_url:
45 |         try:
46 |             fp.write(
47 |                 (",".join(
48 |                     [
49 |                         str(doc["_source"]["website_id"]),
50 |                         quote(doc["_source"]["website_url"]),
51 |                         quote(doc["_source"]["path"]),
52 |                         quote(doc["_source"]["name"]),
53 |                         quote(doc["_source"]["ext"]),
54 |                         str(doc["_source"]["size"]),
55 |                         str(doc["_source"]["mtime"])
56 |                     ]
57 |                 ) + "\n").encode())
58 |         except Exception as e:
59 |             print(e)
60 |             print(doc)
61 | 
62 | 
63 | os.rename(outfile + ".part", os.path.join(dldir, outfile))
64 | 


--------------------------------------------------------------------------------
/high_level_diagram.dia:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/high_level_diagram.dia


--------------------------------------------------------------------------------
/high_level_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/high_level_diagram.png


--------------------------------------------------------------------------------
/init_script.sql:
--------------------------------------------------------------------------------
 1 | DROP TABLE IF EXISTS Website, Admin, BlacklistedWebsite, ApiClient, SearchLogEntry;
 2 | 
 3 | CREATE TABLE Website (
 4 | 
 5 |   id SERIAL PRIMARY KEY NOT NULL,
 6 |   url TEXT,
 7 |   logged_ip TEXT,
 8 |   logged_useragent TEXT,
 9 |   last_modified TIMESTAMP DEFAULT CURRENT_TIMESTAMP
10 | );
11 | 
12 | CREATE TABLE Admin (
13 |   username TEXT PRIMARY KEY NOT NULL,
14 |   password BYTEA,
15 |   role TEXT
16 | );
17 | 
18 | CREATE TABLE BlacklistedWebsite (
19 |   id SERIAL PRIMARY KEY NOT NULL,
20 |   url TEXT
21 | );
22 | 
23 | CREATE TABLE ApiClient (
24 |   name TEXT PRIMARY KEY NOT NULL,
25 |   token TEXT NOT NULL
26 | );
27 | 
28 | CREATE TABLE SearchLogEntry (
29 |   id SERIAL PRIMARY KEY,
30 |   search_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
31 |   remote_addr TEXT,
32 |   forwarded_for TEXT,
33 |   query TEXT,
34 |   extensions TEXT,
35 |   page INT,
36 |   blocked BOOLEAN DEFAULT FALSE,
37 |   results INT DEFAULT 0,
38 |   took INT DEFAULT 0
39 | );
40 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from app import app
2 | 
3 | if __name__ == '__main__':
4 |     app.run("0.0.0.0", port=12345)
5 | 


--------------------------------------------------------------------------------
/mass_import.py:
--------------------------------------------------------------------------------
 1 | import fileinput
 2 | import os
 3 | from multiprocessing.pool import Pool
 4 | 
 5 | import od_util
 6 | from common import db, taskManager
 7 | from database import Website
 8 | from tasks import Task
 9 | 
10 | urls = (line for line in fileinput.input())
11 | 
12 | 
13 | def try_enqueue(url):
14 |     url = os.path.join(url, "")
15 |     url = od_util.get_top_directory(url)
16 | 
17 |     if not od_util.is_valid_url(url):
18 |         return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme."
19 | 
20 |     website = db.get_website_by_url(url)
21 |     if website:
22 |         return "Website already exists"
23 | 
24 |     website = db.website_exists(url)
25 |     if website:
26 |         return "A parent directory of this url has already been posted"
27 | 
28 |     if db.is_blacklisted(url):
29 |         return "<strong>Error:</strong> " \
30 |                "Sorry, this website has been blacklisted. If you think " \
31 |                "this is an error, please <a href='/contribute'>contact me</a>."
32 | 
33 |     if not od_util.is_od(url):
34 |         return "<strong>Error:</strong>" \
35 |                "The anti-spam algorithm determined that the submitted url is not " \
36 |                "an open directory or the server is not responding. If you think " \
37 |                "this is an error, please <a href='/contribute'>contact me</a>."
38 | 
39 |     website_id = db.insert_website(Website(url, "localhost", "mass_import.py"))
40 | 
41 |     task = Task(website_id, url, priority=2)
42 |     taskManager.queue_task(task)
43 | 
44 |     return "The website has been added to the queue"
45 | 
46 | 
47 | def check_url(url):
48 |     url = os.path.join(url.strip(), "")
49 |     try:
50 |         print(try_enqueue(url))
51 |     except:
52 |         pass
53 |     return None
54 | 
55 | 
56 | pool = Pool(processes=50)
57 | pool.map(func=check_url, iterable=urls)
58 | pool.close()
59 | 


--------------------------------------------------------------------------------
/od_util.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from ftplib import FTP
  4 | from urllib.parse import urljoin, urlparse
  5 | 
  6 | import requests
  7 | import validators
  8 | from bs4 import BeautifulSoup
  9 | 
 10 | # TODO: find a better way to do this
 11 | try:
 12 |     from . import config
 13 | except (ImportError, SystemError):
 14 |     import config
 15 | 
 16 | import urllib3
 17 | urllib3.disable_warnings()
 18 | 
 19 | 
 20 | def truncate_path(path, max_len):
 21 |     pattern = re.compile(r"/?.*?/")
 22 | 
 23 |     for i in range(1, path.count("/")):
 24 |         new_path = pattern.sub(".../", path, i)
 25 |         if len(new_path) < max_len:
 26 |             return new_path
 27 |     return ".../" + path.rsplit("/", maxsplit=1)[1] if "/" in path else path
 28 | 
 29 | 
 30 | category_map = {
 31 | 
 32 |     # Application category
 33 |     'bcpio': 'application', 'bin': 'application', 'cdf': 'application',
 34 |     'csh': 'application', 'dll': 'application', 'doc': 'application',
 35 |     'dot': 'application', 'dvi': 'application', 'eml': 'application',
 36 |     'exe': 'application', 'hdf': 'application',
 37 |     'man': 'application', 'me': 'application', 'mht': 'application',
 38 |     'mhtml': 'application', 'mif': 'application', 'ms': 'application',
 39 |     'nc': 'application', 'nws': 'application', 'o': 'application',
 40 |     'obj': 'application', 'oda': 'application', 'p12': 'application',
 41 |     'p7c': 'application', 'pfx': 'application', 'tr': 'application',
 42 |     'ppa': 'application', 'pps': 'application', 'ppt': 'application',
 43 |     'ps': 'application', 'pwz': 'application', 'pyc': 'application',
 44 |     'pyo': 'application', 'ram': 'application', 'rdf': 'application',
 45 |     'roff': 'application', 'sh': 'application', 'so': 'application',
 46 |     'src': 'application', 'sv4cpio': 'application', 'sv4crc': 'application',
 47 |     't': 'application', 'tcl': 'application', 'tex': 'application',
 48 |     'texi': 'application', 'texinfo': 'application', 'ustar': 'application',
 49 |     'wiz': 'application', 'wsdl': 'application', 'xlb': 'application',
 50 |     'xls': 'application', 'xpdl': 'application', 'xsl': 'application',
 51 |     'torrent': 'application', 'rpm': 'application', 'deb': 'application',
 52 |     'atr': 'application', 'class': 'application', 'ttf': 'application',
 53 |     'img': 'application', 'msi': 'application', 'run': 'application',
 54 |     'drpm': 'application', 'udeb': 'application', 'patch': 'application',
 55 |     'nes': 'application', 'ebuild': 'application', 'scr': 'application',
 56 |     # Text category
 57 |     'java': 'text', 'cpp': 'text', 'rb': 'text',
 58 |     'bat': 'text', 'latex': 'text', 'xml': 'text',
 59 |     'etx': 'text', 'htm': 'text', 'c': 'text',
 60 |     'css': 'text', 'csv': 'text', 'html': 'text',
 61 |     'js': 'text', 'json': 'text', 'ksh': 'text',
 62 |     'pl': 'text', 'pot': 'application', 'py': 'text',
 63 |     'h': 'text', 'tsv': 'text', 'rtx': 'text',
 64 |     'sgm': 'text', 'sgml': 'text', 'txt': 'text',
 65 |     'vcf': 'text', 'pdf': 'text', 'epub': 'text',
 66 |     'srt': 'text', 'inc': 'text', 'php': 'text',
 67 |     'cbz': 'text', 'docx': 'text', 'mobi': 'text',
 68 |     'chm': 'text', 'xlsx': "text", 'djvu': 'text',
 69 |     'rtf': 'text', 'log': 'text', 'md': 'text',
 70 |     'dsc': 'text', 'info': 'text',
 71 |     # Video category
 72 |     '3g2': 'video', '3gp': 'video', 'asf': 'video',
 73 |     'asx': 'video', 'avi': 'video', 'flv': 'video',
 74 |     'swf': 'video', 'vob:': 'video', 'qt': 'video',
 75 |     'webm': 'video', 'mov': 'video', 'm1v': 'video',
 76 |     'm3u': 'video', 'm3u8': 'video', 'movie': 'video',
 77 |     'mp4': 'video', 'mpa': 'video', 'mpe': 'video',
 78 |     'mpeg': 'video', 'mpg': 'video', 'mkv': 'video',
 79 |     'wmv': 'video', 'm4s': 'video', 'ogv': 'video',
 80 |     'm4b': 'video', 'm4v': 'video', 'ts': 'video',
 81 | 
 82 |     # Audio category
 83 |     'wav': 'audio', 'snd': 'audio', 'mp2': 'audio',
 84 |     'aif': 'audio', 'iff': 'audio', 'm4a': 'audio',
 85 |     'mid': 'audio', 'midi': 'audio', 'mp3': 'audio',
 86 |     'wma': 'audio', 'ra': 'audio', 'aifc': 'audio',
 87 |     'aiff': 'audio', 'au': 'audio', 'flac': 'audio',
 88 |     'ogg': 'audio', 'oga': 'audio', 'mka': 'video',
 89 |     'ac3': 'audio',
 90 |     # Image category
 91 |     'bmp': 'image', 'gif': 'image', 'jpg': 'image',
 92 |     'xwd': 'image', 'tif': 'image', 'tiff': 'image',
 93 |     'png': 'image', 'pnm': 'image', 'ras': 'image',
 94 |     'ico': 'image', 'ief': 'image', 'pgm': 'image',
 95 |     'jpe': 'image', 'pbm': 'image', 'jpeg': 'image',
 96 |     'ppm': 'image', 'xpm': 'image', 'xbm': 'image',
 97 |     'rgb': 'image', 'svg': 'image', 'psd': 'image',
 98 |     'yuv': 'image', 'ai': 'image', 'eps': 'image',
 99 |     'bw': 'image', 'hdr': 'image',
100 |     # Archive category
101 |     'ar': 'archive', 'cpio': 'archive', 'shar': 'archive',
102 |     'iso': 'archive', 'lbr': 'archive', 'mar': 'archive',
103 |     'sbx': 'archive', 'bz2': 'archive', 'f': 'archive',
104 |     'gz': 'archive', 'lz': 'archive', 'lzma': 'archive',
105 |     'lzo': 'archive', 'rz': 'archive', 'sfark': 'archive',
106 |     'sz': 'archive', 'z': 'archive', '7z': 'archive',
107 |     's7z': 'archive', 'ace': 'archive', 'afa': 'archive',
108 |     'alz': 'archive', 'apk': 'archive', 'arc': 'archive',
109 |     'arj': 'archive', 'b1': 'archive', 'b6z': 'archive',
110 |     'a': 'archive', 'bh': 'archive', 'cab': 'archive',
111 |     'car': 'archive', 'cfs': 'archive', 'cpt': 'archive',
112 |     'dar': 'archive', 'dd': 'archive', 'dgc': 'archive',
113 |     'dmg': 'archive', 'ear': 'archive', 'gca': 'archive',
114 |     'ha': 'archive', 'hki': 'archive', 'ice': 'archive',
115 |     'jar': 'archive', 'kgb': 'archive', 'lzh': 'archive',
116 |     'lha': 'archive', 'lzx': 'archive', 'pak': 'archive',
117 |     'partimg': 'archive', 'paq6': 'archive', 'paq7': 'archive',
118 |     'paq8': 'archive', 'pea': 'archive', 'pim': 'archive',
119 |     'pit': 'archive', 'qda': 'archive', 'rar': 'archive',
120 |     'rk': 'archive', 'sda': 'archive', 'sea': 'archive',
121 |     'sen': 'archive', 'sfx': 'archive', 'shk': 'archive',
122 |     'sit': 'archive', 'sitx': 'archive', 'sqx': 'archive',
123 |     'tbz2': 'archive', 'tlz': 'archive', 'xz': 'archive',
124 |     'txz': 'archive', 'uc': 'archive', 'uc0': 'archive',
125 |     'uc2': 'archive', 'ucn': 'archive', 'ur2': 'archive',
126 |     'ue2': 'archive', 'uca': 'archive', 'uha': 'archive',
127 |     'war': 'archive', 'wim': 'archive', 'xar': 'archive',
128 |     'xp3': 'archive', 'yz1': 'archive', 'zip': 'archive',
129 |     'zipx': 'archive', 'zoo': 'archive', 'zpaq': 'archive',
130 |     'zz': 'archive', 'xpi': 'archive', 'tgz': 'archive',
131 |     'tbz': 'archive', 'tar': 'archive', 'bz': 'archive',
132 |     'diz': 'archive',
133 | }
134 | 
135 | colors = {
136 |     "application": "bg-application",
137 |     "text": "bg-text",
138 |     "video": "bg-video",
139 |     "image": "bg-image",
140 |     "audio": "bg-audio",
141 |     "archive": "bg-archive"
142 | }
143 | 
144 | 
145 | def get_color(category):
146 |     return colors.get(category, None)
147 | 
148 | 
149 | def get_category(extension):
150 |     return category_map.get(extension, None)
151 | 
152 | 
153 | def is_valid_url(url):
154 |     if not url.endswith("/"):
155 |         return False
156 | 
157 |     if not url.startswith(("http://", "https://", "ftp://")):
158 |         return False
159 | 
160 |     return validators.url(url)
161 | 
162 | 
163 | def has_extension(link):
164 |     return len(os.path.splitext(link)[1]) > 0
165 | 
166 | 
167 | def is_external_link(base_url, url: str):
168 |     url = urljoin(base_url, url).strip()
169 | 
170 |     if base_url in url:
171 |         return False
172 |     return True
173 | 
174 | 
175 | def is_od(url):
176 |     if not url.endswith("/"):
177 |         print("Url does not end with trailing /")
178 |         return False
179 | 
180 |     try:
181 |         if url.startswith("ftp://") and config.SUBMIT_FTP:
182 |             ftp = FTP(urlparse(url).netloc)
183 |             ftp.login()
184 |             ftp.close()
185 |             return True
186 |         elif config.SUBMIT_HTTP:
187 |             r = requests.get(url, timeout=30, allow_redirects=False, verify=False)
188 |             if r.status_code != 200:
189 |                 # print("No redirects allowed!")
190 |                 return False
191 |             soup = BeautifulSoup(r.text, "lxml")
192 | 
193 |             external_links = sum(1 if is_external_link(url, a.get("href")) else 0 for a in soup.find_all("a"))
194 |             link_tags = len(list(soup.find_all("link")))
195 |             script_tags = len(list(soup.find_all("script")))
196 | 
197 |             if external_links > 11:
198 |                 # print("Too many external links!")
199 |                 return False
200 | 
201 |             if link_tags > 5:
202 |                 # print("Too many link tags!")
203 |                 return False
204 | 
205 |             if script_tags > 7:
206 |                 # print("Too many script tags!")
207 |                 return False
208 | 
209 |             return True
210 | 
211 |     except Exception as e:
212 |         # print(e)
213 |         return False
214 | 
215 | 
216 | def has_parent_dir(url):
217 | 
218 |     parsed_url = urlparse(url)
219 | 
220 |     if parsed_url.path == "/":
221 |         return False
222 | 
223 |     parent_url = urljoin(url, "../")
224 |     try:
225 |         r = requests.get(parent_url, timeout=30, allow_redirects=False, verify=False)
226 |         if r.status_code != 200:
227 |             return False
228 |         soup = BeautifulSoup(r.text, "lxml")
229 | 
230 |         for anchor in soup.find_all("a"):
231 |             if anchor.get("href") and anchor.get("href").endswith("/") and urljoin(parent_url, anchor.get("href")) == url:
232 |                 # The parent page exists, and has a link to the child directory
233 |                 return is_od(parent_url)
234 | 
235 |     except:
236 |         return False
237 | 
238 |     # Parent page exists, but does not have a link to the child directory
239 |     return False
240 | 
241 | 
242 | def get_top_directory(url):
243 |     if url.startswith("ftp://"):
244 |         return url
245 | 
246 |     while has_parent_dir(url):
247 |         url = urljoin(url, "../")
248 |     return url
249 | 


--------------------------------------------------------------------------------
/reddit_bot.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | 
  4 | import humanfriendly
  5 | import praw
  6 | 
  7 | 
  8 | class RedditBot:
  9 | 
 10 |     bottom_line = "^(Beep boop. I am a bot that calculates the file sizes & count of " \
 11 |                   "open directories posted in /r/opendirectories/)"
 12 | 
 13 |     def __init__(self, log_file: str, reddit: praw.Reddit):
 14 | 
 15 |         self.log_file = log_file
 16 | 
 17 |         self.crawled = []
 18 |         self.load_from_file()
 19 |         self.reddit = reddit
 20 | 
 21 |     def log_crawl(self, post_id):
 22 | 
 23 |         self.load_from_file()
 24 |         self.crawled.append(post_id)
 25 | 
 26 |         with open(self.log_file, "w") as f:
 27 |             for post_id in self.crawled:
 28 |                 f.write(post_id + "\n")
 29 | 
 30 |     def has_crawled(self, post_id):
 31 |         self.load_from_file()
 32 |         return post_id in self.crawled
 33 | 
 34 |     def load_from_file(self):
 35 |         if not os.path.isfile(self.log_file):
 36 |             self.crawled = []
 37 |         else:
 38 |             with open(self.log_file, "r") as f:
 39 |                 self.crawled = list(filter(None, f.read().split("\n")))
 40 | 
 41 |     def reply(self, reddit_obj, comment: str):
 42 | 
 43 |         while True:
 44 |             try:
 45 |                 if not self.has_crawled(reddit_obj.id):
 46 |                     reply = reddit_obj.reply(comment)
 47 |                     self.log_crawl(reddit_obj.id)
 48 |                     print("Reply to " + reddit_obj.id)
 49 |                     return reply
 50 |                 break
 51 |             except Exception as e:
 52 |                 print("Waiting 5 minutes: " + str(e))
 53 |                 time.sleep(300)
 54 |                 continue
 55 | 
 56 |     def edit(self, reddit_comment, new_message):
 57 | 
 58 |         while True:
 59 |             try:
 60 |                 reddit_comment.edit(new_message)
 61 |                 print("Edit comment " + reddit_comment.id)
 62 |                 break
 63 |             except Exception as e:
 64 |                 print("Waiting 5 minutes: " + str(e))
 65 |                 time.sleep(300)
 66 |                 continue
 67 | 
 68 |     @staticmethod
 69 |     def get_comment(stats: dict, website_id, message: str = ""):
 70 |         comment = message + "    \n" if message else ""
 71 | 
 72 |         comment += RedditBot.format_stats(stats)
 73 | 
 74 |         comment += "[Full Report](https://od-db.the-eye.eu/website/" + str(website_id) + "/)"
 75 |         comment += " | [Link list](https://od-db.the-eye.eu/website/" + str(website_id) + "/links)"
 76 |         comment += " | [Source](https://github.com/simon987)    \n"
 77 |         comment += "***    \n"
 78 |         comment += RedditBot.bottom_line
 79 | 
 80 |         return comment
 81 | 
 82 |     @staticmethod
 83 |     def format_stats(stats):
 84 | 
 85 |         result = "    \n"
 86 |         result += "File types | Count | Total Size\n"
 87 |         result += ":-- | :-- | :--    \n"
 88 |         counter = 0
 89 |         for mime in stats["ext_stats"]:
 90 |             result += mime[2]
 91 |             result += " | " + str(mime[1])
 92 |             result += " | " + humanfriendly.format_size(mime[0]) + "    \n"
 93 | 
 94 |             counter += 1
 95 |             if counter >= 3:
 96 |                 break
 97 | 
 98 |         result += "**Total** | **" + str(stats["total_count"]) + "** | **"
 99 |         result += humanfriendly.format_size(stats["total_size"]) + "**    \n\n"
100 |         return result
101 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | flask
 2 | flask_testing
 3 | requests
 4 | bs4
 5 | validators
 6 | Flask-Caching
 7 | praw
 8 | humanfriendly
 9 | apscheduler
10 | bcrypt
11 | elasticsearch
12 | python-dateutil
13 | flask_httpauth
14 | ujson
15 | urllib3
16 | pyOpenSSL
17 | lxml
18 | pillow
19 | Wand
20 | numpy
21 | uwsgi
22 | redis
23 | psycopg2-binary
24 | lz4


--------------------------------------------------------------------------------
/search/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from logging import FileHandler
 3 | 
 4 | logger = logging.getLogger("default")
 5 | logger.setLevel(logging.DEBUG)
 6 | 
 7 | formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
 8 | file_handler = FileHandler("oddb.log")
 9 | file_handler.setFormatter(formatter)
10 | logger.addHandler(file_handler)
11 | # logger.addHandler(StreamHandler(sys.stdout))
12 | 


--------------------------------------------------------------------------------
/search/filter.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | try:
 4 |     from fold_to_ascii.fold_to_ascii import mapping
 5 | except:
 6 |     from ..fold_to_ascii.fold_to_ascii import mapping
 7 | 
 8 | 
 9 | class SearchFilter:
10 | 
11 |     def __init__(self):
12 | 
13 |         self.blacklisted_terms = set()
14 |         self.table = str.maketrans(dict(mapping.translate_table))
15 | 
16 |         if os.path.exists("search_blacklist.txt"):
17 |             with open("search_blacklist.txt") as f:
18 |                 self.blacklisted_terms.update(line.strip() for line in f.readlines() if line[0] != "#" and line.strip())
19 | 
20 |     def should_block(self, query) -> bool:
21 | 
22 |         query = query.translate(self.table)
23 |         query = query.lower()
24 | 
25 |         for raw_token in query.split():
26 | 
27 |             token = raw_token.strip("\"'/\\").strip()
28 |             if token in self.blacklisted_terms:
29 |                 return True
30 | 
31 |         return False
32 | 


--------------------------------------------------------------------------------
/search/search.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | from urllib.parse import urljoin
  4 | 
  5 | import elasticsearch
  6 | import ujson
  7 | from apscheduler.schedulers.background import BackgroundScheduler
  8 | from elasticsearch import helpers
  9 | 
 10 | from search import logger
 11 | from search.filter import SearchFilter
 12 | 
 13 | 
 14 | class InvalidQueryException(Exception):
 15 |     pass
 16 | 
 17 | 
 18 | class IndexingError(Exception):
 19 |     pass
 20 | 
 21 | 
 22 | class ElasticSearchEngine:
 23 |     SORT_ORDERS = {
 24 |         "score": ["_score"],
 25 |         "size_asc": [{"size": {"order": "asc"}}],
 26 |         "size_dsc": [{"size": {"order": "desc"}}],
 27 |         "date_asc": [{"mtime": {"order": "asc"}}],
 28 |         "date_desc": [{"mtime": {"order": "desc"}}],
 29 |         "none": []
 30 |     }
 31 | 
 32 |     def __init__(self, url, index_name):
 33 |         super().__init__()
 34 |         self.index_name = index_name
 35 |         logger.info("Connecting to ES @ %s" % url)
 36 |         self.es = elasticsearch.Elasticsearch(hosts=[url])
 37 |         self.filter = SearchFilter()
 38 | 
 39 |         if not self.es.indices.exists(self.index_name):
 40 |             self.init()
 41 | 
 42 |     def start_stats_scheduler(self):
 43 |         scheduler = BackgroundScheduler()
 44 |         scheduler.add_job(self._generate_global_stats, "interval", seconds=60 * 120)
 45 |         scheduler.start()
 46 | 
 47 |     def init(self):
 48 |         logger.info("Elasticsearch first time setup")
 49 |         if self.es.indices.exists(self.index_name):
 50 |             self.es.indices.delete(index=self.index_name)
 51 |         self.es.indices.create(index=self.index_name, body={
 52 |             "settings": {
 53 |                 "index": {
 54 |                     "number_of_shards": 50,
 55 |                     "number_of_replicas": 0,
 56 |                     "refresh_interval": "30s",
 57 |                     "codec": "best_compression"
 58 |                 },
 59 |                 "analysis": {
 60 |                     "analyzer": {
 61 |                         "my_nGram": {
 62 |                             "tokenizer": "my_nGram_tokenizer",
 63 |                             "filter": ["lowercase", "asciifolding"]
 64 |                         }
 65 |                     },
 66 |                     "tokenizer": {
 67 |                         "my_nGram_tokenizer": {
 68 |                             "type": "nGram", "min_gram": 3, "max_gram": 3
 69 |                         }
 70 |                     }
 71 |                 }
 72 |             }
 73 |         })
 74 | 
 75 |         # Index Mappings
 76 |         self.es.indices.put_mapping(body={
 77 |             "properties": {
 78 |                 "path": {"analyzer": "standard", "type": "text"},
 79 |                 "name": {"analyzer": "standard", "type": "text",
 80 |                          "fields": {"nGram": {"type": "text", "analyzer": "my_nGram"}}},
 81 |                 "mtime": {"type": "date", "format": "epoch_second"},
 82 |                 "size": {"type": "long"},
 83 |                 "website_id": {"type": "integer"},
 84 |                 "ext": {"type": "keyword"},
 85 |             },
 86 |             "_routing": {"required": True}
 87 |         }, doc_type="file", index=self.index_name, include_type_name=True)
 88 | 
 89 |         self.es.indices.open(index=self.index_name)
 90 | 
 91 |     def delete_docs(self, website_id):
 92 | 
 93 |         while True:
 94 |             try:
 95 |                 logger.debug("Deleting docs of " + str(website_id))
 96 | 
 97 |                 to_delete = helpers.scan(query={
 98 |                     "query": {
 99 |                         "term": {
100 |                             "website_id": website_id
101 |                         }
102 |                     }
103 |                 }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120, routing=website_id)
104 | 
105 |                 buf = []
106 |                 counter = 0
107 |                 for doc in to_delete:
108 |                     buf.append(doc)
109 |                     counter += 1
110 | 
111 |                     if counter >= 10000:
112 |                         self._delete(buf, website_id)
113 |                         buf.clear()
114 |                         counter = 0
115 |                 if counter > 0:
116 |                     self._delete(buf, website_id)
117 |                 break
118 | 
119 |             except Exception as e:
120 |                 logger.error("During delete: " + str(e))
121 |                 time.sleep(10)
122 | 
123 |         logger.debug("Done deleting for " + str(website_id))
124 | 
125 |     def _delete(self, docs, website_id):
126 |         bulk_string = self.create_bulk_delete_string(docs)
127 |         result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
128 |                               routing=website_id)
129 | 
130 |         if result["errors"]:
131 |             logger.error("Error in ES bulk delete: \n" + result["errors"])
132 |             raise IndexingError
133 | 
134 |     def import_json(self, in_lines, website_id: int):
135 | 
136 |         import_every = 10000
137 |         cooldown_time = 0
138 | 
139 |         docs = []
140 | 
141 |         for line in in_lines:
142 |             try:
143 |                 doc = ujson.loads(line)
144 |                 name, ext = os.path.splitext(doc["name"])
145 |                 doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else ""
146 |                 doc["name"] = name
147 |                 doc["website_id"] = website_id
148 |                 docs.append(doc)
149 |             except Exception as e:
150 |                 logger.error("Error in import_json: " + str(e) + " for line : + \n" + line)
151 | 
152 |             if len(docs) >= import_every:
153 |                 self._index(docs)
154 |                 docs.clear()
155 |                 time.sleep(cooldown_time)
156 | 
157 |         if docs:
158 |             self._index(docs)
159 | 
160 |     def _index(self, docs):
161 |         while True:
162 |             try:
163 |                 logger.debug("Indexing " + str(len(docs)) + " docs")
164 |                 bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
165 |                 self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file", request_timeout=30,
166 |                              routing=docs[0]["website_id"])
167 |                 break
168 |             except Exception as e:
169 |                 logger.error("Error in _index: " + str(e) + ", retrying")
170 |                 time.sleep(10)
171 | 
172 |     @staticmethod
173 |     def create_bulk_index_string(docs: list):
174 | 
175 |         action_string = '{"index":{}}\n'
176 |         return "\n".join("".join([action_string, ujson.dumps(doc)]) for doc in docs)
177 | 
178 |     @staticmethod
179 |     def create_bulk_delete_string(docs: list):
180 | 
181 |         return "\n".join("".join(["{\"delete\":{\"_id\":\"", doc["_id"], "\"}}"]) for doc in docs)
182 | 
183 |     def search(self, query, page, per_page, sort_order, extensions, size_min, size_max, match_all, fields, date_min,
184 |                date_max) -> {}:
185 | 
186 |         if self.filter.should_block(query):
187 |             logger.info("Search was blocked")
188 |             raise InvalidQueryException("One or more terms in your query is blocked by the search filter. "
189 |                                         "This incident has been reported.")
190 | 
191 |         filters = []
192 |         if extensions:
193 |             filters.append({"terms": {"ext": extensions}})
194 | 
195 |         if size_min > 0 or size_max:
196 |             size_filer = dict()
197 |             new_filter = {"range": {"size": size_filer}}
198 | 
199 |             if size_min > 0:
200 |                 size_filer["gte"] = size_min
201 |             if size_max:
202 |                 size_filer["lte"] = size_max
203 | 
204 |             filters.append(new_filter)
205 | 
206 |         if date_min > 0 or date_max:
207 |             date_filer = dict()
208 |             new_filter = {"range": {"mtime": date_filer}}
209 | 
210 |             if date_min > 0:
211 |                 date_filer["gte"] = date_min
212 |             if date_max:
213 |                 date_filer["lte"] = date_max
214 | 
215 |             filters.append(new_filter)
216 | 
217 |         sort_by = ElasticSearchEngine.SORT_ORDERS.get(sort_order, [])
218 | 
219 |         page = self.es.search(body={
220 |             "query": {
221 |                 "bool": {
222 |                     "must": {
223 |                         "multi_match": {
224 |                             "query": query,
225 |                             "fields": fields,
226 |                             "operator": "or" if match_all else "and"
227 |                         }
228 |                     },
229 |                     "filter": filters
230 |                 }
231 |             },
232 |             "sort": sort_by,
233 |             "highlight": {
234 |                 "fields": {
235 |                     "name": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
236 |                     "name.nGram": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]},
237 |                     "path": {"pre_tags": ["<mark>"], "post_tags": ["</mark>"]}
238 |                 }
239 |             },
240 |             "size": per_page, "from": min(page * per_page, 10000 - per_page)},
241 |             index=self.index_name, request_timeout=20)
242 | 
243 |         return page
244 | 
245 |     def get_stats(self, website_id: int, subdir: str = None):
246 | 
247 |         result = self.es.search(body={
248 |             "query": {
249 |                 "constant_score": {
250 |                     "filter": {
251 |                         "term": {"website_id": website_id}
252 |                     }
253 |                 }
254 |             },
255 |             "aggs": {
256 |                 "ext_group": {
257 |                     "terms": {
258 |                         "field": "ext",
259 |                         "size": 12
260 |                     },
261 |                     "aggs": {
262 |                         "size": {
263 |                             "sum": {
264 |                                 "field": "size"
265 |                             }
266 |                         }
267 |                     }
268 |                 },
269 |                 "total_size": {
270 |                     "sum_bucket": {
271 |                         "buckets_path": "ext_group>size"
272 |                     }
273 |                 }
274 |             },
275 |             "size": 0
276 |         }, index=self.index_name, request_timeout=30, routing=website_id)
277 | 
278 |         stats = dict()
279 |         stats["total_size"] = result["aggregations"]["total_size"]["value"]
280 |         stats["total_count"] = result["hits"]["total"]
281 |         stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
282 |                               for b in result["aggregations"]["ext_group"]["buckets"]]
283 | 
284 |         return stats
285 | 
286 |     def get_link_list(self, website_id, base_url):
287 | 
288 |         hits = helpers.scan(client=self.es,
289 |                             query={
290 |                                 "_source": {
291 |                                     "includes": ["path", "name", "ext"]
292 |                                 },
293 |                                 "query": {
294 |                                     "constant_score": {
295 |                                         "filter": {
296 |                                             "term": {"website_id": website_id}
297 |                                         }
298 |                                     }
299 |                                 },
300 |                             },
301 |                             index=self.index_name, request_timeout=20, routing=website_id)
302 |         for hit in hits:
303 |             src = hit["_source"]
304 |             yield urljoin(base_url, "/") + src["path"] + ("/" if src["path"] != "" else "") + src["name"] + \
305 |                   ("." if src["ext"] != "" else "") + src["ext"]
306 | 
307 |     @staticmethod
308 |     def get_global_stats():
309 | 
310 |         if os.path.exists("_stats.json"):
311 |             with open("_stats.json", "r") as f:
312 |                 return ujson.load(f)
313 |         else:
314 |             return None
315 | 
316 |     def _generate_global_stats(self):
317 | 
318 |         size_per_ext = self.es.search(body={
319 |             "query": {
320 |                 "bool": {
321 |                     "filter": [
322 |                         {"range": {
323 |                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB
324 |                         }}
325 |                     ]
326 |                 }
327 |             },
328 |             "aggs": {
329 |                 "ext_group": {
330 |                     "terms": {
331 |                         "field": "ext",
332 |                         "size": 40
333 |                     },
334 |                     "aggs": {
335 |                         "size": {
336 |                             "sum": {
337 |                                 "field": "size"
338 |                             }
339 |                         }
340 |                     }
341 |                 }
342 |             },
343 |             "size": 0
344 | 
345 |         }, index=self.index_name, request_timeout=240)
346 | 
347 |         total_stats = self.es.search(body={
348 |             "query": {
349 |                 "bool": {
350 |                     "filter": [
351 |                         {"range": {
352 |                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB
353 |                         }}
354 |                     ]
355 |                 }
356 |             },
357 |             "aggs": {
358 |                 "file_stats": {
359 |                     "extended_stats": {
360 |                         "field": "size",
361 |                         "sigma": 1
362 |                     }
363 |                 }
364 |             },
365 |             "size": 0
366 | 
367 |         }, index=self.index_name, request_timeout=241)
368 | 
369 |         size_and_date_histogram = self.es.search(body={
370 |             "query": {
371 |                 "bool": {
372 |                     "filter": [
373 |                         {"range": {
374 |                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB
375 |                         }},
376 |                         {"range": {
377 |                             "mtime": {
378 |                                 "gt": 0  # 1970-01-01
379 |                             }
380 |                         }}
381 |                     ]
382 |                 }
383 |             },
384 |             "aggs": {
385 |                 "sizes": {
386 |                     "histogram": {
387 |                         "field": "size",
388 |                         "interval": 100000000,  # 100Mb
389 |                         "min_doc_count": 500
390 |                     }
391 |                 },
392 |                 "dates": {
393 |                     "date_histogram": {
394 |                         "field": "mtime",
395 |                         "interval": "1y",
396 |                         "min_doc_count": 500,
397 |                         "format": "yyyy"
398 |                     }
399 |                 }
400 |             },
401 |             "size": 0
402 |         }, index=self.index_name, request_timeout=242)
403 | 
404 |         website_scatter = self.es.search(body={
405 |             "query": {
406 |                 "bool": {
407 |                     "filter": [
408 |                         {"range": {
409 |                             "size": {"gte": 0, "lte": (1000000000000 - 1)}  # 0-1TB
410 |                         }}
411 |                     ]
412 |                 }
413 |             },
414 |             "aggs": {
415 |                 "websites": {
416 |                     "terms": {
417 |                         "field": "website_id",
418 |                         "size": 600  # TODO: Figure out what size is appropriate
419 |                     },
420 |                     "aggs": {
421 |                         "size": {
422 |                             "sum": {
423 |                                 "field": "size"
424 |                             }
425 |                         }
426 |                     }
427 |                 }
428 |             },
429 |             "size": 0
430 |         }, index=self.index_name, request_timeout=243)
431 | 
432 |         es_stats = self.es.indices.stats(self.index_name, request_timeout=244)
433 | 
434 |         stats = dict()
435 |         stats["es_index_size"] = es_stats["indices"][self.index_name]["total"]["store"]["size_in_bytes"]
436 |         stats["es_search_count"] = es_stats["indices"][self.index_name]["total"]["search"]["query_total"]
437 |         stats["es_search_time"] = es_stats["indices"][self.index_name]["total"]["search"]["query_time_in_millis"]
438 |         stats["es_search_time_avg"] = stats["es_search_time"] / (
439 |             stats["es_search_count"] if stats["es_search_count"] != 0 else 1)
440 | 
441 |         stats["total_count"] = total_stats["aggregations"]["file_stats"]["count"]
442 |         stats["total_size"] = total_stats["aggregations"]["file_stats"]["sum"]
443 |         stats["size_avg"] = total_stats["aggregations"]["file_stats"]["avg"]
444 |         stats["size_std_deviation"] = total_stats["aggregations"]["file_stats"]["std_deviation"]
445 |         stats["size_std_deviation_bounds"] = total_stats["aggregations"]["file_stats"]["std_deviation_bounds"]
446 |         stats["size_variance"] = total_stats["aggregations"]["file_stats"]["variance"]
447 |         stats["ext_stats"] = [(b["size"]["value"], b["doc_count"], b["key"])
448 |                               for b in size_per_ext["aggregations"]["ext_group"]["buckets"]]
449 |         stats["sizes_histogram"] = [(b["key"], b["doc_count"])
450 |                                     for b in size_and_date_histogram["aggregations"]["sizes"]["buckets"]]
451 |         stats["dates_histogram"] = [(b["key_as_string"], b["doc_count"])
452 |                                     for b in size_and_date_histogram["aggregations"]["dates"]["buckets"]]
453 |         stats["website_scatter"] = [[b["key"], b["doc_count"], b["size"]["value"]]
454 |                                     for b in website_scatter["aggregations"]["websites"]["buckets"]]
455 |         stats["base_url"] = "entire database"
456 | 
457 |         with open("_stats.json", "w") as f:
458 |             ujson.dump(stats, f)
459 | 
460 |     def stream_all_docs(self):
461 |         return helpers.scan(query={
462 |             "query": {
463 |                 "match_all": {}
464 |             }
465 |         }, scroll="30s", client=self.es, index=self.index_name, request_timeout=30)
466 | 
467 |     def refresh(self):
468 |         self.es.indices.refresh(self.index_name)
469 | 


--------------------------------------------------------------------------------
/static/Hack-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/Hack-Regular.ttf


--------------------------------------------------------------------------------
/static/css/fa-brands.css:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
 3 |  * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
 4 |  */
 5 | @font-face {
 6 |   font-family: 'Font Awesome 5 Brands';
 7 |   font-style: normal;
 8 |   font-weight: normal;
 9 |   src: url("../webfonts/fa-brands-400.eot");
10 |   src: url("../webfonts/fa-brands-400.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-brands-400.woff2") format("woff2"), url("../webfonts/fa-brands-400.woff") format("woff"), url("../webfonts/fa-brands-400.ttf") format("truetype"), url("../webfonts/fa-brands-400.svg#fontawesome") format("svg"); }
11 | 
12 | .fab {
13 |   font-family: 'Font Awesome 5 Brands'; }
14 | 


--------------------------------------------------------------------------------
/static/css/fa-brands.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 |  * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 |  */
5 | @font-face{font-family:Font Awesome\ 5 Brands;font-style:normal;font-weight:400;src:url(../webfonts/fa-brands-400.eot);src:url(../webfonts/fa-brands-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-brands-400.woff2) format("woff2"),url(../webfonts/fa-brands-400.woff) format("woff"),url(../webfonts/fa-brands-400.ttf) format("truetype"),url(../webfonts/fa-brands-400.svg#fontawesome) format("svg")}.fab{font-family:Font Awesome\ 5 Brands}


--------------------------------------------------------------------------------
/static/css/fa-regular.css:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
 3 |  * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
 4 |  */
 5 | @font-face {
 6 |   font-family: 'Font Awesome 5 Free';
 7 |   font-style: normal;
 8 |   font-weight: 400;
 9 |   src: url("../webfonts/fa-regular-400.eot");
10 |   src: url("../webfonts/fa-regular-400.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-regular-400.woff2") format("woff2"), url("../webfonts/fa-regular-400.woff") format("woff"), url("../webfonts/fa-regular-400.ttf") format("truetype"), url("../webfonts/fa-regular-400.svg#fontawesome") format("svg"); }
11 | 
12 | .far {
13 |   font-family: 'Font Awesome 5 Free';
14 |   font-weight: 400; }
15 | 


--------------------------------------------------------------------------------
/static/css/fa-regular.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 |  * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 |  */
5 | @font-face{font-family:Font Awesome\ 5 Free;font-style:normal;font-weight:400;src:url(../webfonts/fa-regular-400.eot);src:url(../webfonts/fa-regular-400.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-regular-400.woff2) format("woff2"),url(../webfonts/fa-regular-400.woff) format("woff"),url(../webfonts/fa-regular-400.ttf) format("truetype"),url(../webfonts/fa-regular-400.svg#fontawesome) format("svg")}.far{font-family:Font Awesome\ 5 Free;font-weight:400}


--------------------------------------------------------------------------------
/static/css/fa-solid.css:
--------------------------------------------------------------------------------
 1 | /*!
 2 |  * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
 3 |  * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
 4 |  */
 5 | @font-face {
 6 |   font-family: 'Font Awesome 5 Free';
 7 |   font-style: normal;
 8 |   font-weight: 900;
 9 |   src: url("../webfonts/fa-solid-900.eot");
10 |   src: url("../webfonts/fa-solid-900.eot?#iefix") format("embedded-opentype"), url("../webfonts/fa-solid-900.woff2") format("woff2"), url("../webfonts/fa-solid-900.woff") format("woff"), url("../webfonts/fa-solid-900.ttf") format("truetype"), url("../webfonts/fa-solid-900.svg#fontawesome") format("svg"); }
11 | 
12 | .fa,
13 | .fas {
14 |   font-family: 'Font Awesome 5 Free';
15 |   font-weight: 900; }
16 | 


--------------------------------------------------------------------------------
/static/css/fa-solid.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Font Awesome Free 5.0.8 by @fontawesome - https://fontawesome.com
3 |  * License - https://fontawesome.com/license (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
4 |  */
5 | @font-face{font-family:Font Awesome\ 5 Free;font-style:normal;font-weight:900;src:url(../webfonts/fa-solid-900.eot);src:url(../webfonts/fa-solid-900.eot?#iefix) format("embedded-opentype"),url(../webfonts/fa-solid-900.woff2) format("woff2"),url(../webfonts/fa-solid-900.woff) format("woff"),url(../webfonts/fa-solid-900.ttf) format("truetype"),url(../webfonts/fa-solid-900.svg#fontawesome) format("svg")}.fa,.fas{font-family:Font Awesome\ 5 Free;font-weight:900}


--------------------------------------------------------------------------------
/static/css/ion.rangeSlider.css:
--------------------------------------------------------------------------------
  1 | /* Ion.RangeSlider
  2 | // css version 2.0.3
  3 | // © 2013-2014 Denis Ineshin | IonDen.com
  4 | // ===================================================================================================================*/
  5 | 
  6 | /* =====================================================================================================================
  7 | // RangeSlider */
  8 | 
  9 | .irs {
 10 |     position: relative; display: block;
 11 |     -webkit-touch-callout: none;
 12 |     -webkit-user-select: none;
 13 |      -khtml-user-select: none;
 14 |        -moz-user-select: none;
 15 |         -ms-user-select: none;
 16 |             user-select: none;
 17 | }
 18 |     .irs-line {
 19 |         position: relative; display: block;
 20 |         overflow: hidden;
 21 |         outline: none !important;
 22 |     }
 23 |         .irs-line-left, .irs-line-mid, .irs-line-right {
 24 |             position: absolute; display: block;
 25 |             top: 0;
 26 |         }
 27 |         .irs-line-left {
 28 |             left: 0; width: 11%;
 29 |         }
 30 |         .irs-line-mid {
 31 |             left: 9%; width: 82%;
 32 |         }
 33 |         .irs-line-right {
 34 |             right: 0; width: 11%;
 35 |         }
 36 | 
 37 |     .irs-bar {
 38 |         position: absolute; display: block;
 39 |         left: 0; width: 0;
 40 |     }
 41 |         .irs-bar-edge {
 42 |             position: absolute; display: block;
 43 |             top: 0; left: 0;
 44 |         }
 45 | 
 46 |     .irs-shadow {
 47 |         position: absolute; display: none;
 48 |         left: 0; width: 0;
 49 |     }
 50 | 
 51 |     .irs-slider {
 52 |         position: absolute; display: block;
 53 |         cursor: default;
 54 |         z-index: 1;
 55 |     }
 56 |         .irs-slider.single {
 57 | 
 58 |         }
 59 |         .irs-slider.from {
 60 | 
 61 |         }
 62 |         .irs-slider.to {
 63 | 
 64 |         }
 65 |         .irs-slider.type_last {
 66 |             z-index: 2;
 67 |         }
 68 | 
 69 |     .irs-min {
 70 |         position: absolute; display: block;
 71 |         left: 0;
 72 |         cursor: default;
 73 |     }
 74 |     .irs-max {
 75 |         position: absolute; display: block;
 76 |         right: 0;
 77 |         cursor: default;
 78 |     }
 79 | 
 80 |     .irs-from, .irs-to, .irs-single {
 81 |         position: absolute; display: block;
 82 |         top: 0; left: 0;
 83 |         cursor: default;
 84 |         white-space: nowrap;
 85 |     }
 86 | 
 87 | .irs-grid {
 88 |     position: absolute; display: none;
 89 |     bottom: 0; left: 0;
 90 |     width: 100%; height: 20px;
 91 | }
 92 | .irs-with-grid .irs-grid {
 93 |     display: block;
 94 | }
 95 |     .irs-grid-pol {
 96 |         position: absolute;
 97 |         top: 0; left: 0;
 98 |         width: 1px; height: 8px;
 99 |         background: #000;
100 |     }
101 |     .irs-grid-pol.small {
102 |         height: 4px;
103 |     }
104 |     .irs-grid-text {
105 |         position: absolute;
106 |         bottom: 0; left: 0;
107 |         white-space: nowrap;
108 |         text-align: center;
109 |         font-size: 9px; line-height: 9px;
110 |         padding: 0 3px;
111 |         color: #000;
112 |     }
113 | 
114 | .irs-disable-mask {
115 |     position: absolute; display: block;
116 |     top: 0; left: -1%;
117 |     width: 102%; height: 100%;
118 |     cursor: default;
119 |     background: rgba(0,0,0,0.0);
120 |     z-index: 2;
121 | }
122 | .irs-disabled {
123 |     opacity: 0.4;
124 | }
125 | .lt-ie9 .irs-disabled {
126 |     filter: alpha(opacity=40);
127 | }
128 | 
129 | 
130 | .irs-hidden-input {
131 |     position: absolute !important;
132 |     display: block !important;
133 |     top: 0 !important;
134 |     left: 0 !important;
135 |     width: 0 !important;
136 |     height: 0 !important;
137 |     font-size: 0 !important;
138 |     line-height: 0 !important;
139 |     padding: 0 !important;
140 |     margin: 0 !important;
141 |     outline: none !important;
142 |     z-index: -9999 !important;
143 |     background: none !important;
144 |     border-style: solid !important;
145 |     border-color: transparent !important;
146 | }
147 | 


--------------------------------------------------------------------------------
/static/css/ion.rangeSlider.skinFlat.css:
--------------------------------------------------------------------------------
  1 | /* Ion.RangeSlider, Flat UI Skin
  2 | // css version 2.0.3
  3 | // © Denis Ineshin, 2014    https://github.com/IonDen
  4 | // ===================================================================================================================*/
  5 | 
  6 | /* =====================================================================================================================
  7 | // Skin details */
  8 | 
  9 | .irs-line-mid,
 10 | .irs-line-left,
 11 | .irs-line-right,
 12 | .irs-bar,
 13 | .irs-bar-edge,
 14 | .irs-slider {
 15 |     background: url(../img/sprite-skin-flat.png) repeat-x;
 16 | }
 17 | 
 18 | .irs {
 19 |     height: 40px;
 20 | }
 21 | .irs-with-grid {
 22 |     height: 60px;
 23 | }
 24 | .irs-line {
 25 |     height: 12px; top: 25px;
 26 | }
 27 |     .irs-line-left {
 28 |         height: 12px;
 29 |         background-position: 0 -30px;
 30 |     }
 31 |     .irs-line-mid {
 32 |         height: 12px;
 33 |         background-position: 0 0;
 34 |     }
 35 |     .irs-line-right {
 36 |         height: 12px;
 37 |         background-position: 100% -30px;
 38 |     }
 39 | 
 40 | .irs-bar {
 41 |     height: 12px; top: 25px;
 42 |     background-position: 0 -60px;
 43 | }
 44 |     .irs-bar-edge {
 45 |         top: 25px;
 46 |         height: 12px; width: 9px;
 47 |         background-position: 0 -90px;
 48 |     }
 49 | 
 50 | .irs-shadow {
 51 |     height: 3px; top: 34px;
 52 |     background: #000;
 53 |     opacity: 0.25;
 54 | }
 55 | .lt-ie9 .irs-shadow {
 56 |     filter: alpha(opacity=25);
 57 | }
 58 | 
 59 | .irs-slider {
 60 |     width: 16px; height: 18px;
 61 |     top: 22px;
 62 |     background-position: 0 -120px;
 63 | }
 64 | .irs-slider.state_hover, .irs-slider:hover {
 65 |     background-position: 0 -150px;
 66 | }
 67 | 
 68 | .irs-min, .irs-max {
 69 |     color: #999;
 70 |     font-size: 10px; line-height: 1.333;
 71 |     text-shadow: none;
 72 |     top: 0; padding: 1px 3px;
 73 |     background: #e1e4e9;
 74 |     -moz-border-radius: 4px;
 75 |     border-radius: 4px;
 76 | }
 77 | 
 78 | .irs-from, .irs-to, .irs-single {
 79 |     color: #fff;
 80 |     font-size: 10px; line-height: 1.333;
 81 |     text-shadow: none;
 82 |     padding: 1px 5px;
 83 |     background: #dc7846;
 84 |     -moz-border-radius: 4px;
 85 |     border-radius: 4px;
 86 | }
 87 | .irs-from:after, .irs-to:after, .irs-single:after {
 88 |     position: absolute; display: block; content: "";
 89 |     bottom: -6px; left: 50%;
 90 |     width: 0; height: 0;
 91 |     margin-left: -3px;
 92 |     overflow: hidden;
 93 |     border: 3px solid transparent;
 94 |     border-top-color: #dc7846;
 95 | }
 96 | 
 97 | 
 98 | .irs-grid-pol {
 99 |     background: #e1e4e9;
100 | }
101 | .irs-grid-text {
102 |     color: #999;
103 | }
104 | 


--------------------------------------------------------------------------------
/static/css/main.css:
--------------------------------------------------------------------------------
  1 | a {
  2 |     border-bottom: none !important;
  3 | }
  4 | .card {
  5 |     margin-top: 1em;
  6 | }
  7 | .jumbotron {
  8 |     margin-top: 1em;
  9 | }
 10 | .list-group {
 11 |     margin-top: 1em;
 12 | }
 13 | .list-group-item {
 14 |     padding-bottom: 0.3rem;
 15 | }
 16 | .badge {
 17 |     padding-bottom: 0;
 18 | }
 19 | .table td {
 20 |     padding: 2px 0;
 21 | }
 22 | .td-numeric {
 23 |     text-align: end;
 24 |     padding-right: 1em;
 25 | }
 26 | 
 27 | .bg-application {
 28 |     background: #8FB847;
 29 |     color: #FFFFFF;
 30 | }
 31 | 
 32 | .bg-archive {
 33 |     background: #1fa32a;
 34 |     color: #FFFFFF;
 35 | }
 36 | 
 37 | .bg-audio {
 38 |     background: #009CD8;
 39 |     color: #FFFFFF;
 40 | }
 41 | 
 42 | .bg-video {
 43 |     background: #DC7D6C;
 44 |     color: #FFFFFF;
 45 | }
 46 | 
 47 | .bg-text {
 48 |     background: #E19A36;
 49 |     color: #FFFFFF;
 50 | }
 51 | 
 52 | .bg-image {
 53 |     background: #998AB5;
 54 |     color: #FFFFFF;
 55 | }
 56 | .vim-caret {
 57 |     -webkit-animation: vimCaret 1s linear infinite;
 58 |     -o-animation: vimCaret 1s linear infinite;
 59 |     animation: vimCaret 1s linear infinite; }
 60 | 
 61 | .prev-img {
 62 |     width: 100%;
 63 |     max-width: 250px;
 64 |     height: 100%;
 65 | }
 66 | 
 67 | .prev-icon {
 68 |     cursor: pointer;
 69 | }
 70 | @-webkit-keyframes vimCaret {
 71 |     0% {
 72 |         background-color: transparent; }
 73 |     49% {
 74 |         background-color: transparent; }
 75 |     50% {
 76 |         background-color: rgba(255, 255, 255, 0.6); }
 77 |     100% {
 78 |         background-color: rgba(255, 255, 255, 0.6); } }
 79 | 
 80 | @-o-keyframes vimCaret {
 81 |     0% {
 82 |         background-color: transparent; }
 83 |     49% {
 84 |         background-color: transparent; }
 85 |     50% {
 86 |         background-color: rgba(255, 255, 255, 0.6); }
 87 |     100% {
 88 |         background-color: rgba(255, 255, 255, 0.6); } }
 89 | 
 90 | @keyframes vimCaret {
 91 |     0% {
 92 |         background-color: transparent; }
 93 |     49% {
 94 |         background-color: transparent; }
 95 |     50% {
 96 |         background-color: rgba(255, 255, 255, 0.6); }
 97 |     100% {
 98 |         background-color: rgba(255, 255, 255, 0.6); } }
 99 | 
100 | mark {
101 |     background-color: rgba(255, 255, 0, 0.4);
102 |     border-radius: 0;
103 |     padding: 1px 0;
104 | }
105 | body {
106 |     color: #BBBBBB;
107 |     font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif;
108 |     background-image: url(/static/img/bg.png);
109 | }
110 | 
111 | .card {
112 |     background-color: #36393e;
113 |     border: 3px double #262626;
114 | }
115 | 
116 | .navbar {
117 |     background: #36393e;
118 |     font-family: Lato,'Helvetica Neue',Arial,Helvetica,sans-serif;
119 | }
120 | 
121 | .navbar-brand {
122 |     border: none;
123 | }
124 | 
125 | .nav-link {
126 |     color: #616161;
127 |     border-bottom: 2px solid #6c6c6c;
128 | }
129 | .navbar-toggler-icon {
130 |     background-image: url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255,255,255, 0.6)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E");
131 | }
132 | 
133 | .active {
134 |     border-color: #b3b3b3;
135 |     color: #E6E6E6;
136 | }
137 | 
138 | .nav-link:hover {
139 |     color: #c7c7c7;
140 | }
141 | 
142 | .jumbotron {
143 |     background: #36393e;
144 | }
145 | 
146 | a {
147 |     color: #fff;
148 |     border-bottom: 1px dotted #e0e0e0;
149 | }
150 | 
151 | a:hover {
152 |     color:#ddd;
153 |     text-decoration: none;
154 | }
155 | 
156 | .table a {
157 |     border: none;
158 | }
159 | 
160 | .table th, .table td {
161 |     border-top: 1px solid #666a6e;
162 | }
163 | 
164 | .table thead th {
165 |     border-bottom: 2px solid #999da1;
166 | }
167 | .form-control {
168 |     background-color: #2f3136;
169 |     color: inherit;
170 |     border: 1px solid #282b30;
171 | }
172 | 
173 | .form-control:focus {
174 |     background-color: #2f3136;
175 |     border-color: #80bdff;
176 |     color: inherit;
177 | }
178 | 
179 | .input-group-text {
180 |     border: 1px solid #282b30;
181 |     background-color: #686d75;
182 |     color: #e9ecef;
183 | }
184 | 
185 | .nav-tabs .nav-link {
186 |     border-color: transparent;
187 | }
188 | 
189 | .nav-tabs .nav-link.active {
190 |     border-color: #8e9296 #8e9296;
191 |     background-color: transparent;
192 |     color: #E6E6E6;
193 | }
194 | 
195 | .nav-tabs .nav-link:hover {
196 |     border-color: #e9ecef #e9ecef transparent #e9ecef;
197 | }
198 | 
199 | .card-header-tabs {
200 |     border-bottom: 1px solid #a1a5a9;
201 | }
202 | 
203 | * {
204 |     outline: none;
205 | }
206 | 
207 | #sizeSlider {
208 |     width: 100%;
209 | }
210 | 
211 | .irs-single, .irs-from, .irs-to {
212 |     font-size: 13px;
213 | }
214 | 
215 | .irs-slider {
216 |     cursor: col-resize;
217 | }
218 | 
219 | .custom-select {
220 |     overflow: auto;
221 | }
222 | 
223 | .irs {
224 |     margin-bottom: 1em;
225 | }
226 | 
227 | .github-banner {
228 |     position: absolute;
229 |     top: 0;
230 |     right: 0;
231 |     border: 0;
232 | }
233 | 
234 | @media (max-width: 990px) {
235 |     .github-banner {
236 |         display: none;
237 |     }
238 | }


--------------------------------------------------------------------------------
/static/downloads/README.md:
--------------------------------------------------------------------------------
1 | CSV exports of the database will be available here.


--------------------------------------------------------------------------------
/static/img/bg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/bg.png


--------------------------------------------------------------------------------
/static/img/forkme_right_white_ffffff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/forkme_right_white_ffffff.png


--------------------------------------------------------------------------------
/static/img/sprite-skin-flat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/img/sprite-skin-flat.png


--------------------------------------------------------------------------------
/static/js/popper.min.js:
--------------------------------------------------------------------------------
1 | /*
2 |  Copyright (C) Federico Zivolo 2018
3 |  Distributed under the MIT License (license terms are at http://opensource.org/licenses/MIT).
4 |  */(function(e,t){'object'==typeof exports&&'undefined'!=typeof module?module.exports=t():'function'==typeof define&&define.amd?define(t):e.Popper=t()})(this,function(){'use strict';function e(e){return e&&'[object Function]'==={}.toString.call(e)}function t(e,t){if(1!==e.nodeType)return[];var o=getComputedStyle(e,null);return t?o[t]:o}function o(e){return'HTML'===e.nodeName?e:e.parentNode||e.host}function n(e){if(!e)return document.body;switch(e.nodeName){case'HTML':case'BODY':return e.ownerDocument.body;case'#document':return e.body;}var i=t(e),r=i.overflow,p=i.overflowX,s=i.overflowY;return /(auto|scroll|overlay)/.test(r+s+p)?e:n(o(e))}function r(e){if(!e)return document.documentElement;for(var o=ie(10)?document.body:null,n=e.offsetParent;n===o&&e.nextElementSibling;)n=(e=e.nextElementSibling).offsetParent;var i=n&&n.nodeName;return i&&'BODY'!==i&&'HTML'!==i?-1!==['TD','TABLE'].indexOf(n.nodeName)&&'static'===t(n,'position')?r(n):n:e?e.ownerDocument.documentElement:document.documentElement}function p(e){var t=e.nodeName;return'BODY'!==t&&('HTML'===t||r(e.firstElementChild)===e)}function s(e){return null===e.parentNode?e:s(e.parentNode)}function d(e,t){if(!e||!e.nodeType||!t||!t.nodeType)return document.documentElement;var o=e.compareDocumentPosition(t)&Node.DOCUMENT_POSITION_FOLLOWING,n=o?e:t,i=o?t:e,a=document.createRange();a.setStart(n,0),a.setEnd(i,0);var l=a.commonAncestorContainer;if(e!==l&&t!==l||n.contains(i))return p(l)?l:r(l);var f=s(e);return f.host?d(f.host,t):d(e,s(t).host)}function a(e){var t=1<arguments.length&&void 0!==arguments[1]?arguments[1]:'top',o='top'===t?'scrollTop':'scrollLeft',n=e.nodeName;if('BODY'===n||'HTML'===n){var i=e.ownerDocument.documentElement,r=e.ownerDocument.scrollingElement||i;return r[o]}return e[o]}function l(e,t){var o=2<arguments.length&&void 0!==arguments[2]&&arguments[2],n=a(t,'top'),i=a(t,'left'),r=o?-1:1;return e.top+=n*r,e.bottom+=n*r,e.left+=i*r,e.right+=i*r,e}function f(e,t){var o='x'===t?'Left':'Top',n='Left'==o?'Right':'Bottom';return parseFloat(e['border'+o+'Width'],10)+parseFloat(e['border'+n+'Width'],10)}function m(e,t,o,n){return Q(t['offset'+e],t['scroll'+e],o['client'+e],o['offset'+e],o['scroll'+e],ie(10)?o['offset'+e]+n['margin'+('Height'===e?'Top':'Left')]+n['margin'+('Height'===e?'Bottom':'Right')]:0)}function h(){var e=document.body,t=document.documentElement,o=ie(10)&&getComputedStyle(t);return{height:m('Height',e,t,o),width:m('Width',e,t,o)}}function c(e){return de({},e,{right:e.left+e.width,bottom:e.top+e.height})}function g(e){var o={};try{if(ie(10)){o=e.getBoundingClientRect();var n=a(e,'top'),i=a(e,'left');o.top+=n,o.left+=i,o.bottom+=n,o.right+=i}else o=e.getBoundingClientRect()}catch(t){}var r={left:o.left,top:o.top,width:o.right-o.left,height:o.bottom-o.top},p='HTML'===e.nodeName?h():{},s=p.width||e.clientWidth||r.right-r.left,d=p.height||e.clientHeight||r.bottom-r.top,l=e.offsetWidth-s,m=e.offsetHeight-d;if(l||m){var g=t(e);l-=f(g,'x'),m-=f(g,'y'),r.width-=l,r.height-=m}return c(r)}function u(e,o){var i=2<arguments.length&&void 0!==arguments[2]&&arguments[2],r=ie(10),p='HTML'===o.nodeName,s=g(e),d=g(o),a=n(e),f=t(o),m=parseFloat(f.borderTopWidth,10),h=parseFloat(f.borderLeftWidth,10);i&&'HTML'===o.nodeName&&(d.top=Q(d.top,0),d.left=Q(d.left,0));var u=c({top:s.top-d.top-m,left:s.left-d.left-h,width:s.width,height:s.height});if(u.marginTop=0,u.marginLeft=0,!r&&p){var b=parseFloat(f.marginTop,10),y=parseFloat(f.marginLeft,10);u.top-=m-b,u.bottom-=m-b,u.left-=h-y,u.right-=h-y,u.marginTop=b,u.marginLeft=y}return(r&&!i?o.contains(a):o===a&&'BODY'!==a.nodeName)&&(u=l(u,o)),u}function b(e){var t=1<arguments.length&&void 0!==arguments[1]&&arguments[1],o=e.ownerDocument.documentElement,n=u(e,o),i=Q(o.clientWidth,window.innerWidth||0),r=Q(o.clientHeight,window.innerHeight||0),p=t?0:a(o),s=t?0:a(o,'left'),d={top:p-n.top+n.marginTop,left:s-n.left+n.marginLeft,width:i,height:r};return c(d)}function y(e){var n=e.nodeName;return'BODY'===n||'HTML'===n?!1:'fixed'===t(e,'position')||y(o(e))}function w(e){if(!e||!e.parentElement||ie())return document.documentElement;for(var o=e.parentElement;o&&'none'===t(o,'transform');)o=o.parentElement;return o||document.documentElement}function E(e,t,i,r){var p=4<arguments.length&&void 0!==arguments[4]&&arguments[4],s={top:0,left:0},a=p?w(e):d(e,t);if('viewport'===r)s=b(a,p);else{var l;'scrollParent'===r?(l=n(o(t)),'BODY'===l.nodeName&&(l=e.ownerDocument.documentElement)):'window'===r?l=e.ownerDocument.documentElement:l=r;var f=u(l,a,p);if('HTML'===l.nodeName&&!y(a)){var m=h(),c=m.height,g=m.width;s.top+=f.top-f.marginTop,s.bottom=c+f.top,s.left+=f.left-f.marginLeft,s.right=g+f.left}else s=f}return s.left+=i,s.top+=i,s.right-=i,s.bottom-=i,s}function v(e){var t=e.width,o=e.height;return t*o}function x(e,t,o,n,i){var r=5<arguments.length&&void 0!==arguments[5]?arguments[5]:0;if(-1===e.indexOf('auto'))return e;var p=E(o,n,r,i),s={top:{width:p.width,height:t.top-p.top},right:{width:p.right-t.right,height:p.height},bottom:{width:p.width,height:p.bottom-t.bottom},left:{width:t.left-p.left,height:p.height}},d=Object.keys(s).map(function(e){return de({key:e},s[e],{area:v(s[e])})}).sort(function(e,t){return t.area-e.area}),a=d.filter(function(e){var t=e.width,n=e.height;return t>=o.clientWidth&&n>=o.clientHeight}),l=0<a.length?a[0].key:d[0].key,f=e.split('-')[1];return l+(f?'-'+f:'')}function O(e,t,o){var n=3<arguments.length&&void 0!==arguments[3]?arguments[3]:null,i=n?w(t):d(t,o);return u(o,i,n)}function L(e){var t=getComputedStyle(e),o=parseFloat(t.marginTop)+parseFloat(t.marginBottom),n=parseFloat(t.marginLeft)+parseFloat(t.marginRight),i={width:e.offsetWidth+n,height:e.offsetHeight+o};return i}function S(e){var t={left:'right',right:'left',bottom:'top',top:'bottom'};return e.replace(/left|right|bottom|top/g,function(e){return t[e]})}function T(e,t,o){o=o.split('-')[0];var n=L(e),i={width:n.width,height:n.height},r=-1!==['right','left'].indexOf(o),p=r?'top':'left',s=r?'left':'top',d=r?'height':'width',a=r?'width':'height';return i[p]=t[p]+t[d]/2-n[d]/2,i[s]=o===s?t[s]-n[a]:t[S(s)],i}function D(e,t){return Array.prototype.find?e.find(t):e.filter(t)[0]}function C(e,t,o){if(Array.prototype.findIndex)return e.findIndex(function(e){return e[t]===o});var n=D(e,function(e){return e[t]===o});return e.indexOf(n)}function N(t,o,n){var i=void 0===n?t:t.slice(0,C(t,'name',n));return i.forEach(function(t){t['function']&&console.warn('`modifier.function` is deprecated, use `modifier.fn`!');var n=t['function']||t.fn;t.enabled&&e(n)&&(o.offsets.popper=c(o.offsets.popper),o.offsets.reference=c(o.offsets.reference),o=n(o,t))}),o}function k(){if(!this.state.isDestroyed){var e={instance:this,styles:{},arrowStyles:{},attributes:{},flipped:!1,offsets:{}};e.offsets.reference=O(this.state,this.popper,this.reference,this.options.positionFixed),e.placement=x(this.options.placement,e.offsets.reference,this.popper,this.reference,this.options.modifiers.flip.boundariesElement,this.options.modifiers.flip.padding),e.originalPlacement=e.placement,e.positionFixed=this.options.positionFixed,e.offsets.popper=T(this.popper,e.offsets.reference,e.placement),e.offsets.popper.position=this.options.positionFixed?'fixed':'absolute',e=N(this.modifiers,e),this.state.isCreated?this.options.onUpdate(e):(this.state.isCreated=!0,this.options.onCreate(e))}}function P(e,t){return e.some(function(e){var o=e.name,n=e.enabled;return n&&o===t})}function W(e){for(var t=[!1,'ms','Webkit','Moz','O'],o=e.charAt(0).toUpperCase()+e.slice(1),n=0;n<t.length;n++){var i=t[n],r=i?''+i+o:e;if('undefined'!=typeof document.body.style[r])return r}return null}function B(){return this.state.isDestroyed=!0,P(this.modifiers,'applyStyle')&&(this.popper.removeAttribute('x-placement'),this.popper.style.position='',this.popper.style.top='',this.popper.style.left='',this.popper.style.right='',this.popper.style.bottom='',this.popper.style.willChange='',this.popper.style[W('transform')]=''),this.disableEventListeners(),this.options.removeOnDestroy&&this.popper.parentNode.removeChild(this.popper),this}function H(e){var t=e.ownerDocument;return t?t.defaultView:window}function A(e,t,o,i){var r='BODY'===e.nodeName,p=r?e.ownerDocument.defaultView:e;p.addEventListener(t,o,{passive:!0}),r||A(n(p.parentNode),t,o,i),i.push(p)}function I(e,t,o,i){o.updateBound=i,H(e).addEventListener('resize',o.updateBound,{passive:!0});var r=n(e);return A(r,'scroll',o.updateBound,o.scrollParents),o.scrollElement=r,o.eventsEnabled=!0,o}function M(){this.state.eventsEnabled||(this.state=I(this.reference,this.options,this.state,this.scheduleUpdate))}function F(e,t){return H(e).removeEventListener('resize',t.updateBound),t.scrollParents.forEach(function(e){e.removeEventListener('scroll',t.updateBound)}),t.updateBound=null,t.scrollParents=[],t.scrollElement=null,t.eventsEnabled=!1,t}function R(){this.state.eventsEnabled&&(cancelAnimationFrame(this.scheduleUpdate),this.state=F(this.reference,this.state))}function U(e){return''!==e&&!isNaN(parseFloat(e))&&isFinite(e)}function Y(e,t){Object.keys(t).forEach(function(o){var n='';-1!==['width','height','top','right','bottom','left'].indexOf(o)&&U(t[o])&&(n='px'),e.style[o]=t[o]+n})}function j(e,t){Object.keys(t).forEach(function(o){var n=t[o];!1===n?e.removeAttribute(o):e.setAttribute(o,t[o])})}function q(e,t,o){var n=D(e,function(e){var o=e.name;return o===t}),i=!!n&&e.some(function(e){return e.name===o&&e.enabled&&e.order<n.order});if(!i){var r='`'+t+'`';console.warn('`'+o+'`'+' modifier is required by '+r+' modifier in order to work, be sure to include it before '+r+'!')}return i}function K(e){return'end'===e?'start':'start'===e?'end':e}function V(e){var t=1<arguments.length&&void 0!==arguments[1]&&arguments[1],o=le.indexOf(e),n=le.slice(o+1).concat(le.slice(0,o));return t?n.reverse():n}function z(e,t,o,n){var i=e.match(/((?:\-|\+)?\d*\.?\d*)(.*)/),r=+i[1],p=i[2];if(!r)return e;if(0===p.indexOf('%')){var s;switch(p){case'%p':s=o;break;case'%':case'%r':default:s=n;}var d=c(s);return d[t]/100*r}if('vh'===p||'vw'===p){var a;return a='vh'===p?Q(document.documentElement.clientHeight,window.innerHeight||0):Q(document.documentElement.clientWidth,window.innerWidth||0),a/100*r}return r}function G(e,t,o,n){var i=[0,0],r=-1!==['right','left'].indexOf(n),p=e.split(/(\+|\-)/).map(function(e){return e.trim()}),s=p.indexOf(D(p,function(e){return-1!==e.search(/,|\s/)}));p[s]&&-1===p[s].indexOf(',')&&console.warn('Offsets separated by white space(s) are deprecated, use a comma (,) instead.');var d=/\s*,\s*|\s+/,a=-1===s?[p]:[p.slice(0,s).concat([p[s].split(d)[0]]),[p[s].split(d)[1]].concat(p.slice(s+1))];return a=a.map(function(e,n){var i=(1===n?!r:r)?'height':'width',p=!1;return e.reduce(function(e,t){return''===e[e.length-1]&&-1!==['+','-'].indexOf(t)?(e[e.length-1]=t,p=!0,e):p?(e[e.length-1]+=t,p=!1,e):e.concat(t)},[]).map(function(e){return z(e,i,t,o)})}),a.forEach(function(e,t){e.forEach(function(o,n){U(o)&&(i[t]+=o*('-'===e[n-1]?-1:1))})}),i}function _(e,t){var o,n=t.offset,i=e.placement,r=e.offsets,p=r.popper,s=r.reference,d=i.split('-')[0];return o=U(+n)?[+n,0]:G(n,p,s,d),'left'===d?(p.top+=o[0],p.left-=o[1]):'right'===d?(p.top+=o[0],p.left+=o[1]):'top'===d?(p.left+=o[0],p.top-=o[1]):'bottom'===d&&(p.left+=o[0],p.top+=o[1]),e.popper=p,e}for(var X=Math.min,J=Math.floor,Q=Math.max,Z='undefined'!=typeof window&&'undefined'!=typeof document,$=['Edge','Trident','Firefox'],ee=0,te=0;te<$.length;te+=1)if(Z&&0<=navigator.userAgent.indexOf($[te])){ee=1;break}var i=Z&&window.Promise,oe=i?function(e){var t=!1;return function(){t||(t=!0,window.Promise.resolve().then(function(){t=!1,e()}))}}:function(e){var t=!1;return function(){t||(t=!0,setTimeout(function(){t=!1,e()},ee))}},ne={},ie=function(){var e=0<arguments.length&&void 0!==arguments[0]?arguments[0]:'all';return(e=e.toString(),ne.hasOwnProperty(e))?ne[e]:('11'===e?ne[e]=-1!==navigator.userAgent.indexOf('Trident'):'10'===e?ne[e]=-1!==navigator.appVersion.indexOf('MSIE 10'):'all'===e?ne[e]=-1!==navigator.userAgent.indexOf('Trident')||-1!==navigator.userAgent.indexOf('MSIE'):void 0,ne.all=ne.all||Object.keys(ne).some(function(e){return ne[e]}),ne[e])},re=function(e,t){if(!(e instanceof t))throw new TypeError('Cannot call a class as a function')},pe=function(){function e(e,t){for(var o,n=0;n<t.length;n++)o=t[n],o.enumerable=o.enumerable||!1,o.configurable=!0,'value'in o&&(o.writable=!0),Object.defineProperty(e,o.key,o)}return function(t,o,n){return o&&e(t.prototype,o),n&&e(t,n),t}}(),se=function(e,t,o){return t in e?Object.defineProperty(e,t,{value:o,enumerable:!0,configurable:!0,writable:!0}):e[t]=o,e},de=Object.assign||function(e){for(var t,o=1;o<arguments.length;o++)for(var n in t=arguments[o],t)Object.prototype.hasOwnProperty.call(t,n)&&(e[n]=t[n]);return e},ae=['auto-start','auto','auto-end','top-start','top','top-end','right-start','right','right-end','bottom-end','bottom','bottom-start','left-end','left','left-start'],le=ae.slice(3),fe={FLIP:'flip',CLOCKWISE:'clockwise',COUNTERCLOCKWISE:'counterclockwise'},me=function(){function t(o,n){var i=this,r=2<arguments.length&&void 0!==arguments[2]?arguments[2]:{};re(this,t),this.scheduleUpdate=function(){return requestAnimationFrame(i.update)},this.update=oe(this.update.bind(this)),this.options=de({},t.Defaults,r),this.state={isDestroyed:!1,isCreated:!1,scrollParents:[]},this.reference=o&&o.jquery?o[0]:o,this.popper=n&&n.jquery?n[0]:n,this.options.modifiers={},Object.keys(de({},t.Defaults.modifiers,r.modifiers)).forEach(function(e){i.options.modifiers[e]=de({},t.Defaults.modifiers[e]||{},r.modifiers?r.modifiers[e]:{})}),this.modifiers=Object.keys(this.options.modifiers).map(function(e){return de({name:e},i.options.modifiers[e])}).sort(function(e,t){return e.order-t.order}),this.modifiers.forEach(function(t){t.enabled&&e(t.onLoad)&&t.onLoad(i.reference,i.popper,i.options,t,i.state)}),this.update();var p=this.options.eventsEnabled;p&&this.enableEventListeners(),this.state.eventsEnabled=p}return pe(t,[{key:'update',value:function(){return k.call(this)}},{key:'destroy',value:function(){return B.call(this)}},{key:'enableEventListeners',value:function(){return M.call(this)}},{key:'disableEventListeners',value:function(){return R.call(this)}}]),t}();return me.Utils=('undefined'==typeof window?global:window).PopperUtils,me.placements=ae,me.Defaults={placement:'bottom',positionFixed:!1,eventsEnabled:!0,removeOnDestroy:!1,onCreate:function(){},onUpdate:function(){},modifiers:{shift:{order:100,enabled:!0,fn:function(e){var t=e.placement,o=t.split('-')[0],n=t.split('-')[1];if(n){var i=e.offsets,r=i.reference,p=i.popper,s=-1!==['bottom','top'].indexOf(o),d=s?'left':'top',a=s?'width':'height',l={start:se({},d,r[d]),end:se({},d,r[d]+r[a]-p[a])};e.offsets.popper=de({},p,l[n])}return e}},offset:{order:200,enabled:!0,fn:_,offset:0},preventOverflow:{order:300,enabled:!0,fn:function(e,t){var o=t.boundariesElement||r(e.instance.popper);e.instance.reference===o&&(o=r(o));var n=E(e.instance.popper,e.instance.reference,t.padding,o,e.positionFixed);t.boundaries=n;var i=t.priority,p=e.offsets.popper,s={primary:function(e){var o=p[e];return p[e]<n[e]&&!t.escapeWithReference&&(o=Q(p[e],n[e])),se({},e,o)},secondary:function(e){var o='right'===e?'left':'top',i=p[o];return p[e]>n[e]&&!t.escapeWithReference&&(i=X(p[o],n[e]-('right'===e?p.width:p.height))),se({},o,i)}};return i.forEach(function(e){var t=-1===['left','top'].indexOf(e)?'secondary':'primary';p=de({},p,s[t](e))}),e.offsets.popper=p,e},priority:['left','right','top','bottom'],padding:5,boundariesElement:'scrollParent'},keepTogether:{order:400,enabled:!0,fn:function(e){var t=e.offsets,o=t.popper,n=t.reference,i=e.placement.split('-')[0],r=J,p=-1!==['top','bottom'].indexOf(i),s=p?'right':'bottom',d=p?'left':'top',a=p?'width':'height';return o[s]<r(n[d])&&(e.offsets.popper[d]=r(n[d])-o[a]),o[d]>r(n[s])&&(e.offsets.popper[d]=r(n[s])),e}},arrow:{order:500,enabled:!0,fn:function(e,o){var n;if(!q(e.instance.modifiers,'arrow','keepTogether'))return e;var i=o.element;if('string'==typeof i){if(i=e.instance.popper.querySelector(i),!i)return e;}else if(!e.instance.popper.contains(i))return console.warn('WARNING: `arrow.element` must be child of its popper element!'),e;var r=e.placement.split('-')[0],p=e.offsets,s=p.popper,d=p.reference,a=-1!==['left','right'].indexOf(r),l=a?'height':'width',f=a?'Top':'Left',m=f.toLowerCase(),h=a?'left':'top',g=a?'bottom':'right',u=L(i)[l];d[g]-u<s[m]&&(e.offsets.popper[m]-=s[m]-(d[g]-u)),d[m]+u>s[g]&&(e.offsets.popper[m]+=d[m]+u-s[g]),e.offsets.popper=c(e.offsets.popper);var b=d[m]+d[l]/2-u/2,y=t(e.instance.popper),w=parseFloat(y['margin'+f],10),E=parseFloat(y['border'+f+'Width'],10),v=b-e.offsets.popper[m]-w-E;return v=Q(X(s[l]-u,v),0),e.arrowElement=i,e.offsets.arrow=(n={},se(n,m,Math.round(v)),se(n,h,''),n),e},element:'[x-arrow]'},flip:{order:600,enabled:!0,fn:function(e,t){if(P(e.instance.modifiers,'inner'))return e;if(e.flipped&&e.placement===e.originalPlacement)return e;var o=E(e.instance.popper,e.instance.reference,t.padding,t.boundariesElement,e.positionFixed),n=e.placement.split('-')[0],i=S(n),r=e.placement.split('-')[1]||'',p=[];switch(t.behavior){case fe.FLIP:p=[n,i];break;case fe.CLOCKWISE:p=V(n);break;case fe.COUNTERCLOCKWISE:p=V(n,!0);break;default:p=t.behavior;}return p.forEach(function(s,d){if(n!==s||p.length===d+1)return e;n=e.placement.split('-')[0],i=S(n);var a=e.offsets.popper,l=e.offsets.reference,f=J,m='left'===n&&f(a.right)>f(l.left)||'right'===n&&f(a.left)<f(l.right)||'top'===n&&f(a.bottom)>f(l.top)||'bottom'===n&&f(a.top)<f(l.bottom),h=f(a.left)<f(o.left),c=f(a.right)>f(o.right),g=f(a.top)<f(o.top),u=f(a.bottom)>f(o.bottom),b='left'===n&&h||'right'===n&&c||'top'===n&&g||'bottom'===n&&u,y=-1!==['top','bottom'].indexOf(n),w=!!t.flipVariations&&(y&&'start'===r&&h||y&&'end'===r&&c||!y&&'start'===r&&g||!y&&'end'===r&&u);(m||b||w)&&(e.flipped=!0,(m||b)&&(n=p[d+1]),w&&(r=K(r)),e.placement=n+(r?'-'+r:''),e.offsets.popper=de({},e.offsets.popper,T(e.instance.popper,e.offsets.reference,e.placement)),e=N(e.instance.modifiers,e,'flip'))}),e},behavior:'flip',padding:5,boundariesElement:'viewport'},inner:{order:700,enabled:!1,fn:function(e){var t=e.placement,o=t.split('-')[0],n=e.offsets,i=n.popper,r=n.reference,p=-1!==['left','right'].indexOf(o),s=-1===['top','left'].indexOf(o);return i[p?'left':'top']=r[o]-(s?i[p?'width':'height']:0),e.placement=S(t),e.offsets.popper=c(i),e}},hide:{order:800,enabled:!0,fn:function(e){if(!q(e.instance.modifiers,'hide','preventOverflow'))return e;var t=e.offsets.reference,o=D(e.instance.modifiers,function(e){return'preventOverflow'===e.name}).boundaries;if(t.bottom<o.top||t.left>o.right||t.top>o.bottom||t.right<o.left){if(!0===e.hide)return e;e.hide=!0,e.attributes['x-out-of-boundaries']=''}else{if(!1===e.hide)return e;e.hide=!1,e.attributes['x-out-of-boundaries']=!1}return e}},computeStyle:{order:850,enabled:!0,fn:function(e,t){var o=t.x,n=t.y,i=e.offsets.popper,p=D(e.instance.modifiers,function(e){return'applyStyle'===e.name}).gpuAcceleration;void 0!==p&&console.warn('WARNING: `gpuAcceleration` option moved to `computeStyle` modifier and will not be supported in future versions of Popper.js!');var s,d,a=void 0===p?t.gpuAcceleration:p,l=r(e.instance.popper),f=g(l),m={position:i.position},h={left:J(i.left),top:J(i.top),bottom:J(i.bottom),right:J(i.right)},c='bottom'===o?'top':'bottom',u='right'===n?'left':'right',b=W('transform');if(d='bottom'==c?-f.height+h.bottom:h.top,s='right'==u?-f.width+h.right:h.left,a&&b)m[b]='translate3d('+s+'px, '+d+'px, 0)',m[c]=0,m[u]=0,m.willChange='transform';else{var y='bottom'==c?-1:1,w='right'==u?-1:1;m[c]=d*y,m[u]=s*w,m.willChange=c+', '+u}var E={"x-placement":e.placement};return e.attributes=de({},E,e.attributes),e.styles=de({},m,e.styles),e.arrowStyles=de({},e.offsets.arrow,e.arrowStyles),e},gpuAcceleration:!0,x:'bottom',y:'right'},applyStyle:{order:900,enabled:!0,fn:function(e){return Y(e.instance.popper,e.styles),j(e.instance.popper,e.attributes),e.arrowElement&&Object.keys(e.arrowStyles).length&&Y(e.arrowElement,e.arrowStyles),e},onLoad:function(e,t,o,n,i){var r=O(i,t,e,o.positionFixed),p=x(o.placement,r,t,e,o.modifiers.flip.boundariesElement,o.modifiers.flip.padding);return t.setAttribute('x-placement',p),Y(t,{position:o.positionFixed?'fixed':'absolute'}),o},gpuAcceleration:void 0}}},me});


--------------------------------------------------------------------------------
/static/js/report.js:
--------------------------------------------------------------------------------
  1 | function drawWebsiteScatter(rData) {
  2 | 
  3 |     let dataSet = [];
  4 |     let labels = [];
  5 | 
  6 |     for (let i in rData["website_scatter"]) {
  7 | 
  8 |         let website = rData["website_scatter"][i];
  9 | 
 10 |         dataSet.push({x: website[1], y: website[2]});
 11 |         labels.push(website[0] + " " + website[1] + " files, " + humanFileSize(website[2]))
 12 |     }
 13 | 
 14 |     let ctx = document.getElementById('websiteScatter').getContext('2d');
 15 |     new Chart(ctx, {
 16 |         type: 'scatter',
 17 |         data: {
 18 |             datasets: [{
 19 |                 data: dataSet,
 20 |                 borderWidth: 1,
 21 |                 borderColor: "#E94700",
 22 |                 backgroundColor: "rgba(233, 71, 0, 0.6)"
 23 |             }],
 24 |             labels: labels
 25 |         },
 26 |         options: {
 27 |             title: {
 28 |                 display: true,
 29 |                 text: "Top " + labels.length + " websites",
 30 |                 fontColor: "#c6c6c6",
 31 |                 fontSize: 16,
 32 |                 fontFamily: "Lato,'Helvetica Neue',Arial,Helvetica,sans-serif"
 33 |             },
 34 |             legend: {
 35 |                 display: false
 36 |             },
 37 |             scales: {
 38 |                 xAxes: [
 39 |                     {
 40 |                         type: "logarithmic",
 41 |                         ticks: {
 42 |                             callback: function (value, index, values) {
 43 | 
 44 |                                 let log10 = Math.log10(value);
 45 | 
 46 |                                 if (Number.isInteger(log10)) {
 47 |                                     return value;
 48 |                                 }
 49 |                             }
 50 |                         },
 51 |                         scaleLabel: {
 52 |                             labelString: "File count",
 53 |                             display: true
 54 |                         }
 55 |                     }
 56 |                 ],
 57 |                 yAxes: [
 58 |                     {
 59 |                         type: "logarithmic",
 60 |                         ticks: {
 61 |                             callback: function (value, index, values) {
 62 | 
 63 |                                 let log10 = Math.log10(value);
 64 | 
 65 |                                 if (Number.isInteger(log10)) {
 66 |                                     return humanFileSize(value);
 67 |                                 }
 68 |                             }
 69 |                         }
 70 |                     }
 71 |                 ]
 72 |             },
 73 |             tooltips: {
 74 |                 callbacks: {
 75 |                     label: function (tooltipItem, data) {
 76 |                         return data.labels[tooltipItem.index];
 77 |                     }
 78 |                 }
 79 |             }
 80 |         },
 81 |     });
 82 | 
 83 | }
 84 | 
 85 | function drawSizeHistogram(rData) {
 86 | 
 87 |     let labels = [];
 88 |     let dataSet = [];
 89 | 
 90 |     for (let i in rData["sizes_histogram"]) {
 91 | 
 92 |         let slice = rData["sizes_histogram"][i];
 93 |         dataSet.push(slice[1]);
 94 |         labels.push("[" + humanFileSize(slice[0]) + ", " + humanFileSize(slice[0] + 50000000) + "]")
 95 |     }
 96 | 
 97 |     let ctx = document.getElementById('sizeHistogram').getContext('2d');
 98 |     new Chart(ctx, {
 99 |         type: 'line',
100 |         data: {
101 |             datasets: [{
102 |                 data: dataSet,
103 |                 borderWidth: 1,
104 |                 borderColor: "#E94700",
105 |                 backgroundColor: "rgba(233, 71, 0, 0.6)"
106 |             }],
107 |             labels: labels
108 |         },
109 |         options: {
110 |             title: {
111 |                 display: true,
112 |                 text: "Size histogram",
113 |                 fontColor: "#c6c6c6",
114 |                 fontSize: 16,
115 |                 fontFamily: "Lato,'Helvetica Neue',Arial,Helvetica,sans-serif"
116 |             },
117 |             legend: {
118 |                 display: false
119 |             },
120 |             scales: {
121 |                 yAxes: [
122 |                     {
123 |                         id: "count",
124 |                         type: "logarithmic",
125 |                         ticks: {
126 |                             // Include a dollar sign in the ticks
127 |                             callback: function (value, index, values) {
128 | 
129 |                                 let log10 = Math.log10(value);
130 | 
131 |                                 if (Number.isInteger(log10)) {
132 |                                     return value;
133 |                                 }
134 |                             }
135 |                         }
136 |                     }
137 |                 ]
138 |             }
139 |         }
140 |     });
141 | }
142 | 
143 | function drawDateHistogram(rData) {
144 | 
145 |     let labels = [];
146 |     let dataSet = [];
147 | 
148 |     for (let i in rData["dates_histogram"]) {
149 | 
150 |         let slice = rData["dates_histogram"][i];
151 |         dataSet.push(slice[1]);
152 |         labels.push(slice[0])
153 |     }
154 | 
155 |     let ctx = document.getElementById('dateHistogram').getContext('2d');
156 |     new Chart(ctx, {
157 |         type: 'line',
158 |         data: {
159 |             datasets: [{
160 |                 data: dataSet,
161 |                 borderWidth: 1,
162 |                 borderColor: "#E94700",
163 |                 backgroundColor: "rgba(233, 71, 0, 0.6)"
164 |             }],
165 |             labels: labels,
166 | 
167 |         },
168 |         options: {
169 |             title: {
170 |                 display: true,
171 |                 text: "Dates histogram",
172 |                 fontColor: "#c6c6c6",
173 |                 fontSize: 16,
174 |                 fontFamily: "Lato,'Helvetica Neue',Arial,Helvetica,sans-serif"
175 |             },
176 |             legend: {
177 |                 display: false
178 |             },
179 |             scales: {
180 |                 yAxes: [
181 |                     {
182 |                         id: "count",
183 |                         type: "logarithmic",
184 |                         ticks: {
185 |                             // Include a dollar sign in the ticks
186 |                             callback: function (value, index, values) {
187 | 
188 |                                 let log10 = Math.log10(value);
189 | 
190 |                                 if (Number.isInteger(log10)) {
191 |                                     return value;
192 |                                 }
193 |                             }
194 |                         }
195 |                     }
196 |                 ]
197 |             }
198 |         }
199 |     });
200 | }
201 | 
202 | function drawChart(rData) {
203 | 
204 |     let dataSetSize = [];
205 |     let dataSetCount = [];
206 |     let labels = [];
207 |     let colors = [];
208 | 
209 |     for (let ext in rData["ext_stats"]) {
210 | 
211 |         dataSetSize.push(Math.max(rData["ext_stats"][ext][0], 0));
212 |         dataSetCount.push(rData["ext_stats"][ext][1]);
213 |         labels.push(rData["ext_stats"][ext][2] + " x" + rData["ext_stats"][ext][1] + " (" + humanFileSize(rData["ext_stats"][ext][0]) + ")");
214 | 
215 |         let category = category_map.hasOwnProperty(rData["ext_stats"][ext][2]) ? category_map[rData["ext_stats"][ext][2]] : "default";
216 |         colors.push(getRandomTintOfColor(colors_map[category]));
217 |     }
218 | 
219 |     let ctx = document.getElementById('typesChart').getContext('2d');
220 | 
221 |     let fileTypePieChart = new Chart(ctx, {
222 |         type: 'doughnut',
223 |         data: {
224 |             datasets: [{
225 |                 data: dataSetSize,
226 |                 backgroundColor: colors,
227 |                 borderWidth: 1
228 |             }, {
229 |                 data: dataSetCount,
230 |                 backgroundColor: colors,
231 |                 borderWidth: 1
232 |             }],
233 |             labels: labels
234 | 
235 |         },
236 |         options: {
237 |             title: {
238 |                 display: true,
239 |                 text: "File types for " + rData["base_url"] + " - " + humanFileSize(rData["total_size"]),
240 |                 fontColor: "#c6c6c6",
241 |                 fontSize: 16,
242 |                 fontFamily: "Lato,'Helvetica Neue',Arial,Helvetica,sans-serif"
243 |             },
244 |             legend: {
245 |                 labels: {
246 |                     fontColor: "#bbbbbb",
247 |                     fontFamily: "Lato,'Helvetica Neue',Arial,Helvetica,sans-serif",
248 |                     boxWidth: 20,
249 |                 },
250 |                 position: "left"
251 |             },
252 |             cutoutPercentage: 15
253 |         }
254 |     });
255 | }
256 | 
257 | function fillWebsiteTable(rData) {
258 | 
259 |     document.getElementById("baseUrl").innerHTML = rData["base_url"];
260 |     document.getElementById("fileCount").innerHTML = rData["total_count"].hasOwnProperty("value") ? rData["total_count"]["value"] : rData["total_count"];
261 |     document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
262 |     document.getElementById("reportTime").innerHTML = rData["report_time"] + " UTC";
263 | 
264 | }
265 | 
266 | function fillDatabaseTable(rData) {
267 |     document.getElementById("esIndexSize").innerHTML = humanFileSize(rData["es_index_size"]);
268 |     document.getElementById("esSearchCount").innerHTML = rData["es_search_count"];
269 |     document.getElementById("esSearchTime").innerHTML = rData["es_search_time"] + "ms";
270 |     document.getElementById("esSearchTimeAvg").innerHTML = rData["es_search_time_avg"].toFixed(2) + "ms";
271 |     document.getElementById("totalCount").innerHTML = rData["total_count"];
272 |     document.getElementById("totalSize").innerHTML = humanFileSize(rData["total_size"]);
273 |     document.getElementById("sizeAvg").innerHTML = humanFileSize(rData["size_avg"]);
274 |     document.getElementById("sizeStdDeviation").innerHTML = humanFileSize(rData["size_std_deviation"]);
275 |     document.getElementById("sizeStdDeviationBounds").innerHTML = "[" + humanFileSize(rData["size_std_deviation_bounds"]["lower"]) +
276 |         ", " + humanFileSize(rData["size_std_deviation_bounds"]["upper"]) + "]";
277 |     document.getElementById("sizeVariance").innerHTML = humanFileSize(rData["size_variance"]);
278 | }
279 | 
280 | function getRandomTintOfColor(color) {
281 |     let p = 1,
282 |         temp,
283 |         random = Math.random(),
284 |         result = '#';
285 | 
286 |     while (p < color.length) {
287 |         temp = parseInt(color.slice(p, p += 2), 16);
288 |         temp += Math.floor((16 - temp) * random);
289 |         result += temp.toString(16).padStart(2, '0');
290 |     }
291 |     return color;
292 | }
293 | 
294 | category_map = {
295 |     // Application category
296 |     'bcpio': 'application', 'bin': 'application', 'cdf': 'application',
297 |     'csh': 'application', 'dll': 'application', 'doc': 'application',
298 |     'dot': 'application', 'dvi': 'application', 'eml': 'application',
299 |     'exe': 'application', 'hdf': 'application',
300 |     'man': 'application', 'me': 'application', 'mht': 'application',
301 |     'mhtml': 'application', 'mif': 'application', 'ms': 'application',
302 |     'nc': 'application', 'nws': 'application', 'o': 'application',
303 |     'obj': 'application', 'oda': 'application', 'p12': 'application',
304 |     'p7c': 'application', 'pfx': 'application', 'tr': 'application',
305 |     'ppa': 'application', 'pps': 'application', 'ppt': 'application',
306 |     'ps': 'application', 'pwz': 'application', 'pyc': 'application',
307 |     'pyo': 'application', 'ram': 'application', 'rdf': 'application',
308 |     'roff': 'application', 'sh': 'application', 'so': 'application',
309 |     'src': 'application', 'sv4cpio': 'application', 'sv4crc': 'application',
310 |     't': 'application', 'tcl': 'application', 'tex': 'application',
311 |     'texi': 'application', 'texinfo': 'application', 'ustar': 'application',
312 |     'wiz': 'application', 'wsdl': 'application', 'xlb': 'application',
313 |     'xls': 'application', 'xpdl': 'application', 'xsl': 'application',
314 |     'torrent': 'application', 'rpm': 'application', 'deb': 'application',
315 |     'atr': 'application', 'class': 'application', 'ttf': 'application',
316 |     'img': 'application', 'msi': 'application', 'run': 'application',
317 |     'drpm': 'application', 'udeb': 'application', 'patch': 'application',
318 |     'nes': 'application', 'ebuild': 'application', 'scr': 'application',
319 |     // Text category
320 |     'java': 'text', 'cpp': 'text', 'rb': 'text',
321 |     'bat': 'text', 'latex': 'text', 'xml': 'text',
322 |     'etx': 'text', 'htm': 'text', 'c': 'text',
323 |     'css': 'text', 'csv': 'text', 'html': 'text',
324 |     'js': 'text', 'json': 'text', 'ksh': 'text',
325 |     'pl': 'text', 'pot': 'application', 'py': 'text',
326 |     'h': 'text', 'tsv': 'text', 'rtx': 'text',
327 |     'sgm': 'text', 'sgml': 'text', 'txt': 'text',
328 |     'vcf': 'text', 'pdf': 'text', 'epub': 'text',
329 |     'srt': 'text', 'inc': 'text', 'php': 'text',
330 |     'cbz': 'text', 'docx': 'text', 'mobi': 'text',
331 |     'chm': 'text', 'xlsx': "text", 'djvu': 'text',
332 |     'rtf': 'text', 'log': 'text', 'md': 'text',
333 |     'dsc': 'text', 'info': 'text',
334 |     // Video category
335 |     '3g2': 'video', '3gp': 'video', 'asf': 'video',
336 |     'asx': 'video', 'avi': 'video', 'flv': 'video',
337 |     'swf': 'video', 'vob:': 'video', 'qt': 'video',
338 |     'webm': 'video', 'mov': 'video', 'm1v': 'video',
339 |     'm3u': 'video', 'm3u8': 'video', 'movie': 'video',
340 |     'mp4': 'video', 'mpa': 'video', 'mpe': 'video',
341 |     'mpeg': 'video', 'mpg': 'video', 'mkv': 'video',
342 |     'wmv': 'video', 'm4s': 'video', 'ogv': 'video',
343 |     'm4b': 'video', 'm4v': 'video', 'ts': 'video',
344 | 
345 |     // Audio category
346 |     'wav': 'audio', 'snd': 'audio', 'mp2': 'audio',
347 |     'aif': 'audio', 'iff': 'audio', 'm4a': 'audio',
348 |     'mid': 'audio', 'midi': 'audio', 'mp3': 'audio',
349 |     'wma': 'audio', 'ra': 'audio', 'aifc': 'audio',
350 |     'aiff': 'audio', 'au': 'audio', 'flac': 'audio',
351 |     'ogg': 'audio', 'oga': 'audio', 'mka': 'video',
352 |     'ac3': 'audio',
353 |     // Image category
354 |     'bmp': 'image', 'gif': 'image', 'jpg': 'image',
355 |     'xwd': 'image', 'tif': 'image', 'tiff': 'image',
356 |     'png': 'image', 'pnm': 'image', 'ras': 'image',
357 |     'ico': 'image', 'ief': 'image', 'pgm': 'image',
358 |     'jpe': 'image', 'pbm': 'image', 'jpeg': 'image',
359 |     'ppm': 'image', 'xpm': 'image', 'xbm': 'image',
360 |     'rgb': 'image', 'svg': 'image', 'psd': 'image',
361 |     'yuv': 'image', 'ai': 'image', 'eps': 'image',
362 |     'bw': 'image', 'hdr': 'image',
363 |     // Archive category
364 |     'ar': 'archive', 'cpio': 'archive', 'shar': 'archive',
365 |     'iso': 'archive', 'lbr': 'archive', 'mar': 'archive',
366 |     'sbx': 'archive', 'bz2': 'archive', 'f': 'archive',
367 |     'gz': 'archive', 'lz': 'archive', 'lzma': 'archive',
368 |     'lzo': 'archive', 'rz': 'archive', 'sfark': 'archive',
369 |     'sz': 'archive', 'z': 'archive', '7z': 'archive',
370 |     's7z': 'archive', 'ace': 'archive', 'afa': 'archive',
371 |     'alz': 'archive', 'apk': 'archive', 'arc': 'archive',
372 |     'arj': 'archive', 'b1': 'archive', 'b6z': 'archive',
373 |     'a': 'archive', 'bh': 'archive', 'cab': 'archive',
374 |     'car': 'archive', 'cfs': 'archive', 'cpt': 'archive',
375 |     'dar': 'archive', 'dd': 'archive', 'dgc': 'archive',
376 |     'dmg': 'archive', 'ear': 'archive', 'gca': 'archive',
377 |     'ha': 'archive', 'hki': 'archive', 'ice': 'archive',
378 |     'jar': 'archive', 'kgb': 'archive', 'lzh': 'archive',
379 |     'lha': 'archive', 'lzx': 'archive', 'pak': 'archive',
380 |     'partimg': 'archive', 'paq6': 'archive', 'paq7': 'archive',
381 |     'paq8': 'archive', 'pea': 'archive', 'pim': 'archive',
382 |     'pit': 'archive', 'qda': 'archive', 'rar': 'archive',
383 |     'rk': 'archive', 'sda': 'archive', 'sea': 'archive',
384 |     'sen': 'archive', 'sfx': 'archive', 'shk': 'archive',
385 |     'sit': 'archive', 'sitx': 'archive', 'sqx': 'archive',
386 |     'tbz2': 'archive', 'tlz': 'archive', 'xz': 'archive',
387 |     'txz': 'archive', 'uc': 'archive', 'uc0': 'archive',
388 |     'uc2': 'archive', 'ucn': 'archive', 'ur2': 'archive',
389 |     'ue2': 'archive', 'uca': 'archive', 'uha': 'archive',
390 |     'war': 'archive', 'wim': 'archive', 'xar': 'archive',
391 |     'xp3': 'archive', 'yz1': 'archive', 'zip': 'archive',
392 |     'zipx': 'archive', 'zoo': 'archive', 'zpaq': 'archive',
393 |     'zz': 'archive', 'xpi': 'archive', 'tgz': 'archive',
394 |     'tbz': 'archive', 'tar': 'archive', 'bz': 'archive',
395 |     'diz': 'archive',
396 | };
397 | 
398 | colors_map = {
399 |     "archive": "#23d630",
400 |     "application": "#8fb847",
401 |     "image": "#c55fce",
402 |     "audio": "#00a4e2",
403 |     "video": "#dc7846",
404 |     "text": "#e1ba45",
405 |     "default": "#CCCCCC"
406 | };
407 | 
408 | /**
409 |  * https://stackoverflow.com/questions/10420352
410 |  */
411 | function humanFileSize(bytes) {
412 | 
413 |     var thresh = 1000;
414 |     if (Math.abs(bytes) < thresh) {
415 |         return bytes + ' B';
416 |     }
417 |     var units = ['kB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'];
418 |     var u = -1;
419 |     do {
420 |         bytes /= thresh;
421 |         ++u;
422 |     } while (Math.abs(bytes) >= thresh && u < units.length - 1);
423 | 
424 |     return bytes.toFixed(1) + ' ' + units[u];
425 | }


--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.eot


--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.ttf


--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.woff


--------------------------------------------------------------------------------
/static/webfonts/fa-brands-400.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-brands-400.woff2


--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.eot


--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.ttf


--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.woff


--------------------------------------------------------------------------------
/static/webfonts/fa-regular-400.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-regular-400.woff2


--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.eot


--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.ttf


--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.woff


--------------------------------------------------------------------------------
/static/webfonts/fa-solid-900.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/simon987/od-database/3df667deb4ee4eeefac293c0c90b0437d942231e/static/webfonts/fa-solid-900.woff2


--------------------------------------------------------------------------------
/tasks.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | import time
  5 | import traceback
  6 | from multiprocessing.pool import ThreadPool
  7 | from tempfile import NamedTemporaryFile
  8 | from threading import Thread
  9 | from uuid import uuid4
 10 | 
 11 | import requests
 12 | import urllib3
 13 | 
 14 | import config
 15 | import database
 16 | from database import Website
 17 | from search.search import ElasticSearchEngine
 18 | from task_tracker_drone.src.tt_drone.api import TaskTrackerApi, Worker
 19 | from ws_bucket_client.api import WsBucketApi
 20 | 
 21 | urllib3.disable_warnings()
 22 | 
 23 | logger = logging.getLogger("default")
 24 | 
 25 | 
 26 | class Task:
 27 | 
 28 |     def __init__(self, website_id: int, url: str, priority: int = 1,
 29 |                  callback_type: str = None, callback_args: str = None,
 30 |                  upload_token: str = None):
 31 |         self.website_id = website_id
 32 |         self.url = url
 33 |         self.priority = priority
 34 |         self.callback_type = callback_type
 35 |         self.callback_args = json.loads(callback_args) if callback_args else {}
 36 |         self.upload_token = upload_token
 37 | 
 38 |     def to_json(self):
 39 |         return {
 40 |             "website_id": self.website_id,
 41 |             "url": self.url,
 42 |             "callback_type": self.callback_type,
 43 |             "callback_args": json.dumps(self.callback_args),
 44 |             "upload_token": self.upload_token
 45 |         }
 46 | 
 47 |     def __str__(self):
 48 |         return json.dumps(self.to_json())
 49 | 
 50 |     def __repr__(self):
 51 |         return self.__str__()
 52 | 
 53 | 
 54 | class IndexingTask:
 55 | 
 56 |     def __init__(self, website_id: int, file_path: str, callback_type: str, callback_args):
 57 |         self.website_id = website_id
 58 |         self.file_path = file_path
 59 |         self.callback_type = callback_type
 60 |         self.callback_args = callback_args
 61 | 
 62 | 
 63 | class TaskManager:
 64 | 
 65 |     def __init__(self):
 66 |         self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
 67 |         self.db = database.Database(config.DB_CONN_STR)
 68 |         self.tracker = TaskTrackerApi(config.TT_API)
 69 | 
 70 |         self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
 71 |         self._indexer_threads = list()
 72 | 
 73 |         self.worker = Worker.from_file(self.tracker)
 74 |         if not self.worker:
 75 |             self.worker = self.tracker.make_worker("$oddb_master")
 76 |             if not self.worker:
 77 |                 print("Could not create worker: %s" % traceback.format_exc())
 78 |                 return
 79 |             self.worker.dump_to_file()
 80 |             self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
 81 |             self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
 82 | 
 83 |     def start_indexer_threads(self):
 84 |         logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
 85 |         for _ in range(config.INDEXER_THREADS):
 86 |             t = Thread(target=self._do_indexing)
 87 |             t.setDaemon(True)
 88 |             self._indexer_threads.append(t)
 89 |             t.start()
 90 | 
 91 |     def _do_indexing(self):
 92 | 
 93 |         while True:
 94 |             task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)
 95 | 
 96 |             if task:
 97 |                 try:
 98 |                     recipe = task.json_recipe()
 99 |                     logger.debug("Got indexing task: " + str(recipe))
100 | 
101 |                     filename = download_file(config.WSB_API + "/slot?token=" + recipe["upload_token"])
102 | 
103 |                     self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
104 |                 except Exception as e:
105 |                     self.worker.release_task(task_id=task.id, result=1, verification=0)
106 |                 finally:
107 |                     try:
108 |                         self.worker.release_task(task_id=task.id, result=0, verification=0)
109 |                     except:
110 |                         pass
111 |             else:
112 |                 time.sleep(5)
113 | 
114 |     def _complete_task(self, file_list, task):
115 | 
116 |         self.search.delete_docs(task.website_id)
117 | 
118 |         if file_list:
119 |             def iter_lines():
120 |                 with open(file_list, "r") as f:
121 |                     line = f.readline()
122 |                     while line:
123 |                         yield line
124 |                         line = f.readline()
125 | 
126 |             self.search.import_json(iter_lines(), task.website_id)
127 |             os.remove(file_list)
128 | 
129 |         self.db.update_website_date_if_exists(task.website_id)
130 | 
131 |     def do_recrawl(self):
132 |         logger.debug("Creating re-crawl tasks")
133 |         self._generate_crawling_tasks()
134 | 
135 |     def _generate_crawling_tasks(self):
136 | 
137 |         # TODO: Insert more in-depth re-crawl logic here
138 |         websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
139 | 
140 |         def recrawl(website: Website):
141 |             crawl_task = Task(website.id, website.url,
142 |                               priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
143 |             self.queue_task(crawl_task)
144 | 
145 |         pool = ThreadPool(processes=30)
146 |         pool.map(func=recrawl, iterable=websites_to_crawl)
147 |         pool.close()
148 | 
149 |     def queue_task(self, task: Task):
150 |         max_assign_time = 24 * 4 * 3600
151 |         upload_token = uuid4().__str__()
152 | 
153 |         task.upload_token = upload_token
154 |         tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
155 |                                                    recipe=task.__str__(),
156 |                                                    priority=task.priority,
157 |                                                    max_assign_time=max_assign_time,
158 |                                                    hash64=task.website_id,
159 |                                                    verification_count=1,
160 |                                                    max_retries=3
161 |                                                    )
162 |         print(tracker_response.text)
163 |         logging.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
164 |         if not tracker_response.json()["ok"]:
165 |             return
166 | 
167 |         bucket_response = self.bucket.allocate(upload_token.__str__(),
168 |                                                21474837499,  # 20Gib
169 |                                                format_file_name(task.website_id, upload_token),
170 |                                                to_dispose_date=int(time.time() + max_assign_time),
171 |                                                upload_hook="")
172 |         logging.info("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))
173 | 
174 | 
175 | def format_file_name(website_id, token):
176 |     return "%d_%s.NDJSON" % (website_id, token,)
177 | 
178 | 
179 | def download_file(url):
180 |     r = requests.get(url, stream=True,)
181 | 
182 |     if r.status_code != 200:
183 |         raise ValueError("HTTP error %d: %s" % (r.status_code, url))
184 | 
185 |     tmp = NamedTemporaryFile(delete=False)
186 |     for chunk in r.iter_content(chunk_size=4096):
187 |         if chunk:
188 |             tmp.write(chunk)
189 |     tmp.close()
190 | 
191 |     return tmp.name
192 | 


--------------------------------------------------------------------------------
/template_filters.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import time
 3 | 
 4 | import od_util
 5 | 
 6 | 
 7 | def setup_template_filters(app):
 8 | 
 9 |     app.jinja_env.globals.update(truncate_path=od_util.truncate_path)
10 |     app.jinja_env.globals.update(get_color=od_util.get_color)
11 |     app.jinja_env.globals.update(get_mime=od_util.get_category)
12 | 
13 |     @app.template_filter("date_format")
14 |     def date_format(value, format='%Y-%m-%d'):
15 |         return time.strftime(format, time.gmtime(value))
16 | 
17 |     @app.template_filter("datetime_format")
18 |     def datetime_format(value, format='%Y-%m-%d %H:%M:%S'):
19 |         return time.strftime(format, time.gmtime(value))
20 | 
21 |     @app.template_filter("duration_format")
22 |     def duration_format(value):
23 |         delay = datetime.timedelta(seconds=value)
24 |         if delay.days > 0:
25 |             out = str(delay).replace(" days, ", ":")
26 |         else:
27 |             out = str(delay)
28 |         out_ar = out.split(':')
29 |         out_ar = ["%02d" % (int(float(x))) for x in out_ar]
30 |         out = ":".join(out_ar)
31 |         return out
32 | 
33 |     @app.template_filter("from_timestamp")
34 |     def from_timestamp(value):
35 |         return datetime.datetime.fromtimestamp(value)
36 | 


--------------------------------------------------------------------------------
/templates/admin.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "Admin login - OD-Database" %}
 3 | 
 4 | {% block body %}
 5 |     <div class="container">
 6 |         <div class="card">
 7 |             <div class="card-header">Admin login</div>
 8 |             <div class="card-body">
 9 |                 <form action="/login" method="post">
10 | 
11 |                     <div class="form-group">
12 |                         <input class="form-control" name="username" placeholder="Username">
13 |                     </div>
14 |                     <div class="form-group">
15 |                         <input class="form-control" name="password" placeholder="Password" type="password">
16 |                     </div>
17 | 
18 |                     {% if show_captcha %}
19 |                         {{ captcha.get_code()|safe }}
20 |                     {% endif %}
21 | 
22 |                     <input type="submit" value="Login">
23 | 
24 |                 </form>
25 |             </div>
26 |         </div>
27 |     </div>
28 | {% endblock body %}
29 | 


--------------------------------------------------------------------------------
/templates/contribute.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set current_page = "contribute" %}
 3 | 
 4 | {% block body %}
 5 |     <div class="container">
 6 |     <div class="card">
 7 |         <div class="card-header">How to contribute</div>
 8 |         <div class="card-body">
 9 |             <p>Fork on <a href="https://github.com/simon987/od-database">GitHub</a> or create an issue</p>
10 |             <p>Or <a href="/submit">submit a website</a></p>
11 |             <p>You can also contact me <a href="https://reddit.com/message/compose?to=Hexahedr_n">on Reddit</a></p>
12 |         </div>
13 |     </div>
14 |     </div>
15 | {% endblock body %}
16 | 


--------------------------------------------------------------------------------
/templates/dashboard.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "Dashboard - OD-Database" %}
 3 | 
 4 | {% block body %}
 5 |     <div class="container">
 6 |         <div class="card">
 7 |             <div class="card-header">Dashboard</div>
 8 |             <div class="card-body">
 9 | 
10 |                 <h3>API Keys</h3>
11 |                 <table class="table table-striped">
12 |                     <thead>
13 |                     <tr>
14 |                         <th>Name</th>
15 |                         <th>Token</th>
16 |                         <th>Action</th>
17 |                     </tr>
18 |                     </thead>
19 | 
20 |                     <tbody>
21 |                     {% for token in api_tokens %}
22 |                         <tr>
23 |                             <td>{{ token.name }}</td>
24 |                             <td><code>{{ token.token }}</code></td>
25 |                             <td>
26 |                                 <form action="/del_token" method="post">
27 |                                     <input type="hidden" value="{{ token.token }}" name="token">
28 |                                     <input type="submit" class="btn btn-danger" value="Delete">
29 |                                 </form>
30 |                             </td>
31 |                         </tr>
32 |                     {% endfor %}
33 |                     </tbody>
34 |                 </table>
35 |                 <form action="/generate_token" method="post">
36 |                     <div class="form-row">
37 |                         <div class="col col-md-10">
38 |                             <input class="form-control" name="description" placeholder="Description">
39 |                         </div>
40 |                         <div class="col col-md-2">
41 |                             <input type="submit" class="form-control btn btn-primary" value="Generate API token">
42 |                         </div>
43 |                     </div>
44 |                 </form>
45 | 
46 |                 <br>
47 |                 <hr>
48 |                 <h3>Blacklist</h3>
49 |                 <table class="table table-striped">
50 |                     <thead>
51 |                     <tr>
52 |                         <th>Netloc</th>
53 |                         <th>Action</th>
54 |                     </tr>
55 |                     </thead>
56 |                     <tbody>
57 |                     {% for item in blacklist %}
58 |                         <tr>
59 |                             <td>{{ item.netloc }}</td>
60 |                             <td><a class="btn btn-danger" href="/blacklist/{{ item.id }}/delete">Delete</a></td>
61 |                         </tr>
62 |                     {% endfor %}
63 |                     </tbody>
64 |                 </table>
65 |                 <form class="form" action="/blacklist/add" method="POST">
66 |                     <div class="form-row">
67 |                         <div class="col col-md-10">
68 |                             <input class="form-control" name="url" placeholder="Url">
69 |                         </div>
70 |                         <div class="col col-md-2">
71 |                             <input type="submit" class="form-control btn btn-primary" value="Add">
72 |                         </div>
73 |                     </div>
74 |                 </form>
75 | 
76 |                 <br>
77 |                 <hr>
78 |                 <a class="btn btn-info" href="/logout">Logout</a>
79 |             </div>
80 |         </div>
81 |     </div>
82 | {% endblock body %}
83 | 


--------------------------------------------------------------------------------
/templates/downloads.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "Downloads - OD-Database" %}
 3 | {% set current_page = "dl" %}
 4 | 
 5 | {% block body %}
 6 |     <div class="container">
 7 |         <div class="card">
 8 |             <div class="card-header">Downloads</div>
 9 |             <div class="card-body">
10 | 
11 |                 <p>Please let me know if you used the database in a project!</p>
12 |                 <p>The entire database is exported to CSV regularly</p>
13 | 
14 |                 {% if not export_file_stats %}
15 |                 <br/>
16 |                 <p><em>No files available.</em></p>
17 |                 {% else %}
18 |                 <table class="table table-striped">
19 | 
20 |                     <thead>
21 |                     <tr>
22 |                         <th>Description</th>
23 |                         <th>Size</th>
24 |                         <th>Date</th>
25 |                     </tr>
26 |                     </thead>
27 | 
28 |                     <tbody>
29 |                     {% for name, path, stat in export_file_stats %}
30 |                     <tr>
31 |                         <td><a href="{{ path }}">{{ name }}</a></td>
32 |                         <td>{{ stat.st_size |filesizeformat }}</td>
33 |                         <td>{{ stat.st_mtime|datetime_format }} UTC</td>
34 |                     </tr>
35 |                     {% endfor %}
36 |                     </tbody>
37 | 
38 |                 </table>
39 |                 {% endif %}
40 |             </div>
41 |         </div>
42 |     </div>
43 | {% endblock body %}
44 | 


--------------------------------------------------------------------------------
/templates/home.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set current_page = "home" %}
 3 | {% set title = "OD-Database - Home" %}
 4 | 
 5 | {% block body %}
 6 | 
 7 |     <div class="container">
 8 |         <div class="jumbotron">
 9 |             <h1 class="text-monospace" style="font-size: 3.5rem;">OD-Database</h1>
10 | 
11 |             {% if stats and stats["total_size"] %}
12 |                 <p class="lead">{{ stats["total_count"] }} files totalling
13 |                     ~{{ stats["total_size"] | filesizeformat }} from {{ stats["website_count"] }} websites</p>
14 |             {% else %}
15 |                 <p class="lead">We are currently experiencing a high volume of traffic. The search function
16 |                     may be unresponsive.</p>
17 |             {% endif %}
18 |             <p></p>
19 |         </div>
20 |     </div>
21 |     <div class="container">
22 | 
23 |         <div class="card">
24 |             <div class="card-header">Search</div>
25 |             <div class="card-body">
26 |                 <form action="/search" id="sfrm">
27 | 
28 |                     <div class="form-row">
29 |                         <input class="form-control" style="max-width: calc(100% - 80px);" name="q" id="q" placeholder="Query">
30 |                         <input class="btn btn-primary btn-shadow" type="submit" value="Search" style="margin-left: 3px">
31 |                     </div>
32 |                     {% if show_captcha %}
33 |                         {{ captcha.get_code()|safe }}
34 |                     {% endif %}
35 |                 </form>
36 |             </div>
37 |         </div>
38 | 
39 |         <div class="card">
40 |             <div class="card-header">About</div>
41 |             <div class="card-body">
42 |                 <p>Web frontend and backend by <a href="https://github.com/simon987">simon987</a>,
43 |                     HTTP crawler by <a href="https://github.com/terorie">terorie</a>,
44 |                     hosting provided by <a href="https://the-eye.eu/">The eye</a>
45 |                 </p>
46 |             </div>
47 |         </div>
48 | 
49 |     </div>
50 | 
51 |     <a href="https://github.com/simon987">
52 |         <img class="github-banner" src="/static/img/forkme_right_white_ffffff.png" alt="Fork me on GitHub">
53 |     </a>
54 | {% endblock body %}
55 | 


--------------------------------------------------------------------------------
/templates/layout.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <title>{{ title }}</title>
 6 | 
 7 |     <meta name='viewport' content='width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no' />
 8 |     <script src="/static/js/script.js"></script>
 9 |     <link href="/static/css/style.css" rel="stylesheet" type="text/css">
10 | </head>
11 | <body>
12 | 
13 | 
14 | <nav class="navbar navbar-expand-lg">
15 |     <a class="navbar-brand" href="/"><i class="fas fa-database"></i> OD-Database</a>
16 |     <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarSupportedContent" aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
17 |         <span class="navbar-toggler-icon"></span>
18 |     </button>
19 | 
20 |     <div class="collapse navbar-collapse" id="navbarSupportedContent">
21 |         <ul class="navbar-nav mr-auto">
22 |             <li class="nav-item">
23 |                 <a class="nav-link {{ "active" if current_page == "website" else "" }}" href="/website/">Websites</a>
24 |             </li>
25 |             <li class="nav-item">
26 |                 <a class="nav-link {{ "active" if current_page == "submit" else "" }}" href="/submit">Submit website</a>
27 |             </li>
28 |             <li class="nav-item">
29 |                 <a class="nav-link {{ "active" if current_page == "contribute" else "" }}" href="/contribute">Contribute</a>
30 |             </li>
31 |             <li class="nav-item">
32 |                 <a class="nav-link {{ "active" if current_page == "dl" else "" }}" href="/dl">Downloads</a>
33 |             </li>
34 |             <li class="nav-item">
35 |                 <a class="nav-link {{ "active" if current_page == "stats" else "" }}" href="/stats">Stats</a>
36 |             </li>
37 |         </ul>
38 |     </div>
39 | </nav>
40 | 
41 | 
42 | {% block alert_messages %}
43 | 
44 | 
45 |     {% with messages = get_flashed_messages(with_categories=true) %}
46 |         {% if messages %}
47 |             <div class="container" style="margin-top: 1em">
48 |                 {% for category, message in messages %}
49 |                     <div class="alert alert-{{ category }}">
50 |                         <a href="#" class="close" data-dismiss="alert" aria-label="fermer">&times;</a>
51 |                         {{ message | safe }}
52 |                     </div>
53 |                 {% endfor %}
54 |             </div>
55 |         {% endif %}
56 |     {% endwith %}
57 | {% endblock %}
58 | 
59 | {% block body %}
60 | {% endblock body %}
61 | 
62 | </body>
63 | </html>


--------------------------------------------------------------------------------
/templates/search.html:
--------------------------------------------------------------------------------
  1 | {% extends "layout.html" %}
  2 | {% set current_page = "search" %}
  3 | 
  4 | {% set title = "OD-Database - Search" %}
  5 | 
  6 | {% block body %}
  7 |     <div class="container">
  8 | 
  9 |         <div class="card">
 10 |             <div class="card-header">Search</div>
 11 |             <div class="card-body">
 12 |                 <form action="/search" id="sfrm">
 13 | 
 14 |                     <div class="form-row">
 15 | 
 16 |                         {# Query #}
 17 |                         <div class="input-group form-group">
 18 |                             <div class="input-group-prepend">
 19 |                                 <div class="input-group-text">
 20 |                                     <label for="matchAll" style="margin-bottom: 0">Match any word&nbsp</label>
 21 |                                     <input title="Toggle between 'match all words' and 'match any word'" type="checkbox"
 22 |                                            id="matchAll" name="all" {{ "checked" if match_all }}>
 23 |                                 </div>
 24 |                             </div>
 25 |                             <input class="form-control" name="q" id="q" placeholder="Query" value="{{ q }}">
 26 |                             <input type="hidden" name="p" id="page" value="{{ p }}">
 27 |                         </div>
 28 |                     </div>
 29 |                     {# Size #}
 30 |                     <div class="text-muted" style="text-align: center">File size</div>
 31 |                     <input title="File size" id="sizeSlider">
 32 |                     <input type="hidden" name="size_min" id="sizeMin" value="{{ size_min }}">
 33 |                     <input type="hidden" name="size_max" id="sizeMax" value="{{ size_max }}">
 34 |                     {# Date #}
 35 |                     <div class="text-muted" style="text-align: center">File date</div>
 36 |                     <input title="File date" id="dateSlider">
 37 |                     <input type="hidden" name="date_min" id="dateMin" value="{{ date_min }}">
 38 |                     <input type="hidden" name="date_max" id="dateMax" value="{{ date_max }}">
 39 | 
 40 |                     <div class="form-row">
 41 |                         {# File extension #}
 42 |                         <div class="form-group col-md-6">
 43 |                             <div class="text-muted">File extension</div>
 44 |                             <div class="input-group">
 45 |                                 <div class="input-group-prepend">
 46 |                                     <div class="input-group-text">.</div>
 47 |                                 </div>
 48 |                                 <input name="ext" placeholder="Extensions, comma-separated" class="form-control"
 49 |                                        value="{{ extensions }}">
 50 |                             </div>
 51 |                         </div>
 52 |                         {# Fields #}
 53 |                         <div class="col-md-5" style="margin-left: 1em">
 54 |                             <div class="text-muted">Search in</div>
 55 |                             <div class="form-group form-check form-check-inline">
 56 |                                 <input class="form-check-input" type="checkbox" id="field_name"
 57 |                                        name="field_name"{{ "checked" if field_name else "" }}>
 58 |                                 <label class="form-check-label" for="field_name">name</label>
 59 |                             </div>
 60 |                             <div class="form-group form-check form-check-inline">
 61 |                                 <input class="form-check-input" type="checkbox" id="field_trigram"
 62 |                                        name="field_trigram"{{ "checked" if field_trigram else "" }}>
 63 |                                 <label class="form-check-label" for="field_trigram">name.trigram</label>
 64 |                             </div>
 65 |                             <div class="form-group form-check form-check-inline">
 66 |                                 <input class="form-check-input" type="checkbox" id="field_path"
 67 |                                        name="field_path" {{ "checked" if field_path else "" }}>
 68 |                                 <label class="form-check-label" for="field_path">path</label>
 69 |                             </div>
 70 |                         </div>
 71 |                     </div>
 72 | 
 73 |                     <div class="text-muted">Display options</div>
 74 |                     <div class="form-row">
 75 | 
 76 |                         <div class="form-group col-md-3">
 77 |                             {# Sort order #}
 78 |                             <select class="form-control" name="sort_order" title="Sort order">
 79 |                                 <option disabled>Sort order</option>
 80 |                                 <option value="score" {{ "selected" if sort_order == "score" else "" }}>Relevance
 81 |                                 </option>
 82 |                                 <option value="size_asc" {{ "selected" if sort_order == "size_asc" else "" }}>Size
 83 |                                     ascending
 84 |                                 </option>
 85 |                                 <option value="size_dsc" {{ "selected" if sort_order == "size_dsc" else "" }}>Size
 86 |                                     descending
 87 |                                 </option>
 88 |                                 <option value="date_asc" {{ "selected" if sort_order == "date_asc" else "" }}>Date
 89 |                                     ascending
 90 |                                 </option>
 91 |                                 <option value="date_dsc" {{ "selected" if sort_order == "date_dsc" else "" }}>Date
 92 |                                     descending
 93 |                                 </option>
 94 |                                 <option value="none" {{ "selected" if sort_order == "none" else "" }}>No order
 95 |                                     (faster)
 96 |                                 </option>
 97 |                             </select>
 98 |                         </div>
 99 |                         {# Results per page #}
100 |                         <div class="form-group col-md-2">
101 |                             <select class="form-control" name="per_page" title="Results per page">
102 |                                 <option disabled>Results per page</option>
103 |                                 {% for results in results_set %}
104 |                                     <option{{ " selected" if per_page == results }}>{{ results }}</option>
105 |                                 {% endfor %}
106 |                             </select>
107 |                         </div>
108 | 
109 | 
110 |                         {# Search button #}
111 |                         <div class="form-group col-md-7">
112 | 
113 |                             <input id="s" class="btn btn-primary btn-shadow" type="submit" value="Search"
114 |                                    style="float: right">
115 |                         </div>
116 |                     </div>
117 |                     {% if show_captcha %}
118 |                         {{ captcha.get_code()|safe }}
119 |                     {% endif %}
120 | 
121 |                 </form>
122 |             </div>
123 |         </div>
124 | 
125 |         {% if count > 0 %}
126 |             <div class="card">
127 |                 <div class="card-body">
128 | 
129 |                     <span class="text-muted">{{ count }} result(s) in {{ results["took"] }}ms</span>
130 | 
131 |                     <div class="table-responsive">
132 |                         <table class="table">
133 |                             <tbody>
134 | 
135 |                             {% for hit in results["hits"]["hits"] %}
136 |                                 {% set src = hit["_source"] %}
137 |                                 {% if "name" in hit["highlight"] %}
138 |                                     {% set hl_name = hit["highlight"]["name"][0] %}
139 |                                 {% elif "name.nGram" in hit["highlight"] %}
140 |                                     {% set hl_name = hit["highlight"]["name.nGram"][0] %}
141 |                                 {% else %}
142 |                                     {% set hl_name = src["name"] %}
143 |                                 {% endif %}
144 | 
145 |                                 {% set hl_path = hit["highlight"]["path"][0] if "path" in hit["highlight"] else src["path"] %}
146 | 
147 |                                 <tr>
148 |                                     <td>
149 |                                         {% set category = get_mime(src["ext"]) %}
150 |                                         {% set url = src["website_url"] + "/" + src["path"] + "/" +  src["name"] + ("." if src["ext"] != "" else "") + src["ext"] %}
151 |                                         {# Preview #}
152 |                                         {% if category == "image" %}
153 |                                             <i class="fas fa-eye prev-icon" data-toggle="popover" tabindex="-1"
154 |                                                data-content='<img class="prev-img" src="{{ url }}">'></i>
155 |                                         {% endif %}
156 |                                         {# File name & link #}
157 |                                         <a href="{{ url }}"><span>{{ hl_name |safe }}{{ ("." if src["ext"] != "" else "") + src["ext"] }}</span></a>
158 |                                         {# File type badge #}
159 |                                         {% if category %}
160 |                                             <span class="badge badge-pill {{ get_color(category ) }}">
161 |                                                 {{ src["ext"] }}
162 |                                             </span>
163 |                                         {% endif %}
164 |                                         {# File path #}
165 |                                         <div class="text-muted" title="{{ path }}" style="font-size: 10px;">
166 |                                             <a title="See files from this website"
167 |                                                href="/website/{{ src["website_id"] }}">{{ src["website_url"] }}/</a>{{ hl_path|safe }}
168 |                                         </div>
169 |                                     </td>
170 |                                     {# File size & date #}
171 |                                     <td style="white-space: nowrap; vertical-align: top; text-align: right; font-size: 14px">
172 |                                         <div>{{ src["size"] | filesizeformat if src["size"] >= 0 else "?" }}</div>
173 |                                         <code>{{ src["mtime"] | date_format }}</code>
174 |                                     </td>
175 |                                 </tr>
176 |                             {% endfor %}
177 |                             </tbody>
178 |                         </table>
179 |                     </div>
180 |                     {% if count > (p + 1) * per_page %}
181 |                         <button class="btn btn-primary" onclick="nextPage()" style="float: right">Next</button>
182 |                     {% endif %}
183 |                     {% if p > 0 %}
184 |                         <button class="btn btn-primary" onclick="prevPage()">Previous</button>
185 |                     {% endif %}
186 | 
187 |                 </div>
188 |             </div>
189 |         {% else %}
190 |             <div class="card">
191 |                 <div class="card-body">
192 |                     <p>No results.</p>
193 |                     <p>For better results:</p>
194 |                     <ul>
195 |                         <li>Try checking the 'Match any word' box for a broader search.</li>
196 |                         <li>Make sure you don't include the file extension in your query (Use the appropriate field to
197 |                             filter file types)
198 |                         </li>
199 |                         <li>If you're searching for files in a particular website, use the <a href="/website">website
200 |                             search page</a></li>
201 |                     </ul>
202 |                 </div>
203 | 
204 |             </div>
205 |         {% endif %}
206 | 
207 | 
208 |         <script src="/static/js/report.js"></script>
209 |         <script>
210 |             $('[data-toggle="popover"]').popover({
211 |                 trigger: "focus",
212 |                 delay: {"show": 0, "hide": 100},
213 |                 placement: "right",
214 |                 html: true
215 |             });
216 | 
217 |             //Size slider
218 |             $("#sizeSlider").ionRangeSlider({
219 |                 type: "double",
220 |                 grid: false,
221 |                 force_edges: true,
222 |                 min: 0,
223 |                 max: 3684.03149864,
224 |                 from: {{ size_min ** (1 / 3) if size_min else 0 }},
225 |                 to: {{ (size_max ** (1 / 3)) if size_max else 3684.03149864}},
226 |                 min_interval: 5,
227 |                 drag_interval: true,
228 |                 prettify: function (num) {
229 | 
230 |                     if (num === 0) {
231 |                         return "0 B"
232 |                     } else if (num >= 3684) {
233 |                         return humanFileSize(num * num * num) + "+";
234 |                     }
235 | 
236 |                     return humanFileSize(num * num * num)
237 |                 },
238 |                 onFinish: function (e) {
239 |                     let size_min = (e.from * e.from * e.from);
240 |                     let size_max = (e.to * e.to * e.to);
241 | 
242 |                     if (e.to >= 3684) {
243 |                         size_max = null;
244 |                     }
245 | 
246 |                     document.getElementById("sizeMin").value = size_min;
247 |                     document.getElementById("sizeMax").value = size_max;
248 |                 }
249 |             });
250 |             //Date slider
251 |             $("#dateSlider").ionRangeSlider({
252 |                 type: "double",
253 |                 grid: false,
254 |                 force_edges: true,
255 |                 min: 0,
256 |                 max: Math.floor(Date.now() / 1000),
257 |                 from: {{ date_min if date_min else 0 }},
258 |                 to: {{ date_max if date_max else "self.max"}},
259 |                 min_interval: 3600 * 24 * 7,
260 |                 step: 3600 * 24,
261 |                 drag_interval: true,
262 |                 prettify: function (num) {
263 |                     let date = (new Date(num * 1000));
264 |                     return date.getUTCFullYear() + "-" + ("0" + (date.getUTCMonth() + 1)).slice(-2) + "-" + ("0" + date.getUTCDate()).slice(-2)
265 |                 },
266 |                 onFinish: function (e) {
267 |                     document.getElementById("dateMin").value = e.from;
268 |                     document.getElementById("dateMax").value = e.to >= e.max ? null : e.to;
269 |                 }
270 |             });
271 | 
272 |             //Next button
273 |             function nextPage() {
274 |                 document.getElementById("page").value = parseInt(document.getElementById("page").value) + 1;
275 |                 document.getElementById("sfrm").submit();
276 |             }
277 | 
278 |             function prevPage() {
279 |                 document.getElementById("page").value = parseInt(document.getElementById("page").value) - 1;
280 |                 document.getElementById("sfrm").submit();
281 |             }
282 | 
283 |         </script>
284 |     </div>
285 |     <!-- test -->
286 | 
287 | {% endblock body %}
288 | 


--------------------------------------------------------------------------------
/templates/stats.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "Stats - OD-Database" %}
 3 | {% set current_page = "stats" %}
 4 | 
 5 | {% block body %}
 6 |     <div class="container">
 7 | 
 8 |         <div class="card">
 9 |             <div class="card-header">Statistics</div>
10 |             <div class="card-body">
11 | 
12 |                 <div id="chart-wrapper" style="margin-bottom: 1em">
13 |                     <p id="loading-text">Calculating...</p>
14 |                     <canvas id="typesChart"></canvas>
15 |                 </div>
16 |                 <div id="chart-wrapper" style="margin-bottom: 1em">
17 |                     <canvas id="sizeHistogram"></canvas>
18 |                 </div>
19 |                 <div id="chart-wrapper" style="margin-bottom: 1em">
20 |                     <canvas id="dateHistogram"></canvas>
21 |                 </div>
22 |                 <div id="chart-wrapper" style="margin-bottom: 1em">
23 |                     <canvas id="websiteScatter"></canvas>
24 |                 </div>
25 | 
26 | 
27 |                 <h4>Database stats</h4>
28 |                 <table class="table table-striped">
29 |                     <tbody>
30 |                     <tr>
31 |                         <th>Database index size</th>
32 |                         <td id="esIndexSize"></td>
33 |                     </tr>
34 |                     <tr>
35 |                         <th>Query count</th>
36 |                         <td id="esSearchCount"></td>
37 |                     </tr>
38 |                     <tr>
39 |                         <th>Total query time</th>
40 |                         <td id="esSearchTime"></td>
41 |                     </tr>
42 |                     <tr>
43 |                         <th>Average time per query</th>
44 |                         <td id="esSearchTimeAvg"></td>
45 |                     </tr>
46 |                     <tr>
47 |                         <th>Total file count</th>
48 |                         <td id="totalCount"></td>
49 |                     </tr>
50 |                     <tr>
51 |                         <th>Size total</th>
52 |                         <td id="totalSize"></td>
53 |                     </tr>
54 |                     <tr>
55 |                         <th>Size average</th>
56 |                         <td id="sizeAvg"></td>
57 |                     </tr>
58 |                     <tr>
59 |                         <th>Size standard deviation</th>
60 |                         <td id="sizeStdDeviation"></td>
61 |                     </tr>
62 |                     <tr>
63 |                         <th>Size standard deviation bounds (σ = 1)</th>
64 |                         <td id="sizeStdDeviationBounds"></td>
65 |                     </tr>
66 |                     <tr>
67 |                         <th>Size variance</th>
68 |                         <td id="sizeVariance"></td>
69 |                     </tr>
70 |                     </tbody>
71 |                 </table>
72 |             </div>
73 |         </div>
74 | 
75 |     </div>
76 | 
77 |     <script src="/static/js/report.js"></script>
78 |     <script>
79 |         var xhttp = new XMLHttpRequest();
80 | 
81 |         xhttp.onreadystatechange = function () {
82 |             if (this.readyState === 4 && this.status === 200) {
83 | 
84 |                 let rData = JSON.parse(this.responseText);
85 | 
86 |                 drawChart(rData);
87 |                 drawSizeHistogram(rData);
88 |                 drawDateHistogram(rData);
89 |                 drawWebsiteScatter(rData);
90 |                 fillDatabaseTable(rData);
91 | 
92 |                 document.getElementById("loading-text").innerHTML = "";
93 |             }
94 |         };
95 |         xhttp.open("GET", "/stats/json_chart", true);
96 |         xhttp.send();
97 |     </script>
98 | {% endblock body %}
99 | 


--------------------------------------------------------------------------------
/templates/submit.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "OD-Database - Submit website" %}
 3 | {% set current_page = "submit" %}
 4 | 
 5 | {% block body %}
 6 |     <div class="container">
 7 |         <div class="card">
 8 |             <div class="card-header">
 9 |                 <ul class="nav nav-tabs card-header-tabs" id="myTab" role="tablist">
10 |                     <li class="nav-item">
11 |                         <a class="nav-link active" id="normal-tab" data-toggle="tab" href="#normal" role="tab"
12 |                            aria-controls="normal" aria-selected="true">Single website</a>
13 |                     </li>
14 |                     <li class="nav-item">
15 |                         <a class="nav-link" id="bulk-tab" data-toggle="tab" href="#bulk" role="tab" aria-controls="bulk"
16 |                            aria-selected="false">Bulk</a>
17 |                     </li>
18 |                 </ul>
19 |             </div>
20 |             <div class="card-body">
21 | 
22 |                 <div class="tab-content" id="myTabContent">
23 |                     <div class="tab-pane fade show active" id="normal" role="tabpanel" aria-labelledby="normal-tab">
24 |                         {# Single website #}
25 |                         <form action="/enqueue" method="POST">
26 |                             <div class="form-group">
27 |                                 <input class="form-control" name="url" id="url" placeholder="URL">
28 |                             </div>
29 |                             {% if show_captcha %}
30 |                                 <div class="form-row">
31 |                                     {{ captcha.get_code()|safe }}
32 |                                 </div>
33 |                             {% endif %}
34 |                             <input class="btn btn-primary" type="submit" value="Submit" title="Submit open directory">
35 |                         </form>
36 | 
37 |                     </div>
38 |                     <div class="tab-pane fade" id="bulk" role="tabpanel" aria-labelledby="bulk-tab">
39 |                         {# Bulk #}
40 |                         <form action="/enqueue_bulk" method="POST">
41 |                             <div class="form-group">
42 |                                 <textarea class="form-control" name="urls" id="urls" rows="10"
43 |                                           placeholder="One URL per line, max. 10"></textarea>
44 |                             </div>
45 |                             {% if show_captcha %}
46 |                                 <div class="form-row">
47 |                                     {{ captcha.get_code()|safe }}
48 |                                 </div>
49 |                             {% endif %}
50 | 
51 |                             <input class="btn btn-primary" type="submit" value="Submit"
52 |                                    title="Submit open directories">
53 |                         </form>
54 |                     </div>
55 |                 </div>
56 | 
57 | 
58 |                 <hr>
59 | 
60 |                 <p class="text-muted">By submitting this form you agree that your IP address and User Agent will be
61 |                     saved (for debugging purposes only).
62 |                 </p>
63 |             </div>
64 |         </div>
65 |     </div>
66 | {% endblock body %}
67 | 


--------------------------------------------------------------------------------
/templates/website.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "OD-Database - Website details" %}
 3 | {% set current_page = "website" %}
 4 | 
 5 | {% block body %}
 6 |     <div class="container">
 7 |         <div class="card">
 8 |             <div class="card-header">Information for {{ website.url | truncate(80) }}</div>
 9 |             <div class="card-body">
10 | 
11 |                 <div id="chart-wrapper" style="margin-bottom: 1em">
12 |                     <p id="loading-text">Calculating...</p>
13 |                     <canvas id="typesChart"></canvas>
14 |                     <script src="/static/js/Chart.min.js"></script>
15 |                     <script src="/static/js/report.js"></script>
16 |                 </div>
17 | 
18 |                 <div class="table-responsive">
19 |                     <table class="table">
20 |                         <tr>
21 |                             <th>Base url</th>
22 |                             <td id="baseUrl"></td>
23 |                         </tr>
24 | 
25 |                         <tr>
26 |                             <th>File count</th>
27 |                             <td id="fileCount"></td>
28 |                         </tr>
29 | 
30 |                         <tr>
31 |                             <th>Total size</th>
32 |                             <td id="totalSize"></td>
33 |                         </tr>
34 | 
35 |                         <tr>
36 |                             <th>Last updated</th>
37 |                             <td id="reportTime"></td>
38 |                         </tr>
39 |                     </table>
40 |                 </div>
41 | 
42 |                 <hr>
43 |                 <a href="/website/{{ website.id }}/links" class="btn btn-shadow btn-primary">Link list</a>
44 |                 <a href="/website/{{ website.id }}/json_chart" class="btn btn-shadow btn-primary">Summary (JSON)</a>
45 |                 {% if "username" in session %}
46 |                     <a href="/website/{{ website.id }}/clear" class="btn btn-danger"><i class="fas fa-exclamation"></i>
47 |                         Clear</a>
48 |                     <a href="/website/{{ website.id }}/delete" class="btn btn-danger"><i class="fas fa-trash"></i>
49 |                         Delete</a>
50 |                     <a href="/website/{{ website.id }}/rescan" class="btn btn-secondary"><i class="fas fa-redo"></i>
51 |                         rescan</a>
52 |                 {% endif %}
53 |             </div>
54 |         </div>
55 |     </div>
56 |     <script>
57 |         var xhttp = new XMLHttpRequest();
58 | 
59 |         xhttp.onreadystatechange = function () {
60 |             if (this.readyState === 4 && this.status === 200) {
61 | 
62 |                 var rData = this.responseText;
63 | 
64 |                 drawChart(JSON.parse(rData));
65 |                 fillWebsiteTable(JSON.parse(rData));
66 | 
67 |                 document.getElementById("loading-text").innerHTML = "";
68 |             }
69 |         };
70 |         xhttp.open("GET", "./json_chart", true);
71 |         xhttp.send();
72 |     </script>
73 | {% endblock body %}
74 | 


--------------------------------------------------------------------------------
/templates/websites.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}
 2 | {% set title = "OD-Database - Websites" %}
 3 | {% set current_page = "website" %}
 4 | 
 5 | 
 6 | {% block body %}
 7 |     <div class="container">
 8 |         <div class="card">
 9 |             <div class="card-header">Go to website</div>
10 |             <div class="card-body">
11 | 
12 |                 <a class="btn btn-secondary" href="/website/random">Go to random website</a>
13 |                 <hr>
14 |                 <h3>Website search</h3>
15 |                 <form>
16 |                     <div class="form-row">
17 |                         <div class="col-md-11">
18 |                             <input class="form-control" placeholder="URL" name="url" value="{{ url }}">
19 |                         </div>
20 |                         <div class="col-md-1">
21 |                             <input class="btn btn-primary" type="submit" value="Search">
22 |                         </div>
23 |                     </div>
24 |                 </form>
25 |             </div>
26 |         </div>
27 |     </div>
28 |     <div class="container">
29 |         <div class="card">
30 |             <div class="card-header">Websites</div>
31 |             <div class="card-body">
32 |                 <table class="table table-striped">
33 |                     <thead>
34 |                     <tr>
35 |                         <th>Url</th>
36 |                         <th>Last crawled</th>
37 |                     </tr>
38 |                     </thead>
39 | 
40 |                     {% for website in websites %}
41 |                         <tr>
42 |                             <td><a title="Click for stats & link list" href="/website/{{ website[0] }}">{{ website[1] | truncate(70) }}</a></td>
43 |                             <td>{{ website[2] }}</td>
44 |                         </tr>
45 |                     {% endfor %}
46 |                 </table>
47 |                 {% if websites|length == per_page %}
48 |                     <a href="/website?url={{ url }}&p={{ p + 1 }}" class="btn btn-primary" style="float: right">Next</a>
49 |                 {% endif %}
50 |                 {% if p > 0 %}
51 |                     <a href="/website?url={{ url }}&p={{ p - 1 }}" class="btn btn-primary">Previous</a>
52 |                 {% endif %}
53 |             </div>
54 |         </div>
55 |     </div>
56 | {% endblock body %}
57 | 


--------------------------------------------------------------------------------
/tt_config.yml:
--------------------------------------------------------------------------------
 1 | server:
 2 |   address: "0.0.0.0:3010"
 3 | 
 4 | database:
 5 |   conn_str: "postgres://task_tracker:changeme@tt_db/task_tracker?sslmode=disable"
 6 |   log_levels: ["error", "info", "warn"]
 7 | 
 8 | git:
 9 |   webhook_hash: "sha256"
10 |   webhook_sig_header: "X-Gogs-Signature"
11 | 
12 | log:
13 |   level: "trace"
14 | 
15 | session:
16 |   cookie_name: "tt"
17 |   expiration: "48h"
18 | 
19 | monitoring:
20 |   snapshot_interval: "120s"
21 |   history_length: "1800h"
22 | 
23 | maintenance:
24 |   reset_timed_out_tasks_interval: "10m"
25 | 


--------------------------------------------------------------------------------
/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | module = main
3 | callable = app
4 | 
5 | enable-threads = true
6 | processes = 4
7 | threads = 16
8 | 
9 | disable-logging = True


--------------------------------------------------------------------------------
/views.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | from multiprocessing.pool import Pool
  4 | from urllib.parse import urlparse
  5 | 
  6 | from flask import render_template, redirect, request, flash, abort, Response, session
  7 | from flask_caching import Cache
  8 | 
  9 | import captcha
 10 | import config
 11 | import od_util
 12 | from common import db, taskManager, searchEngine, logger, require_role
 13 | from database import Website
 14 | from search.search import InvalidQueryException
 15 | from tasks import Task
 16 | 
 17 | 
 18 | def setup_views(app):
 19 |     cache = Cache(app, config={
 20 |         "CACHE_TYPE": "redis",
 21 |         "CACHE_REDIS_HOST": config.REDIS_HOST,
 22 |         "CACHE_REDIS_PORT": config.REDIS_PORT,
 23 |     })
 24 | 
 25 |     @app.route("/dl")
 26 |     @cache.cached(120)
 27 |     def downloads():
 28 |         # Get content of downloads directory
 29 |         dl_dir = "static/downloads/"
 30 |         dir_content = os.listdir(dl_dir)
 31 | 
 32 |         # Make paths relative to working directory
 33 |         # Only allow csv files
 34 |         files = [
 35 |             (name, os.path.join(dl_dir, name))
 36 |             for name in dir_content
 37 |             if name.find(".csv") != -1
 38 |         ]
 39 | 
 40 |         # Stat files
 41 |         # Remove any dirs placed accidentally
 42 |         files = [
 43 |             (f, full, os.stat(full))
 44 |             for f, full in files
 45 |             if os.path.isfile(full)
 46 |         ]
 47 | 
 48 |         if len(files) == 0:
 49 |             logger.warning("No export file to display in /dl")
 50 | 
 51 |         return render_template("downloads.html", export_file_stats=files)
 52 | 
 53 |     @app.route("/stats")
 54 |     @cache.cached(120)
 55 |     def stats_page():
 56 |         return render_template("stats.html")
 57 | 
 58 |     @app.route("/stats/json_chart")
 59 |     @cache.cached(240)
 60 |     def stats_json():
 61 |         stats = searchEngine.get_global_stats()
 62 |         if stats:
 63 |             db.join_website_on_stats(stats)
 64 |             return Response(json.dumps(stats), mimetype="application/json")
 65 |         return abort(500)
 66 | 
 67 |     @app.route("/website/<int:website_id>/")
 68 |     def website_info(website_id):
 69 |         website = db.get_website_by_id(website_id)
 70 | 
 71 |         if website:
 72 |             return render_template("website.html", website=website)
 73 |         else:
 74 |             abort(404)
 75 | 
 76 |     @app.route("/website/<int:website_id>/json_chart")
 77 |     @cache.memoize(60)
 78 |     def website_json_chart(website_id):
 79 |         website = db.get_website_by_id(website_id)
 80 | 
 81 |         if website:
 82 |             stats = searchEngine.get_stats(website_id)
 83 |             stats["base_url"] = website.url
 84 |             stats["report_time"] = website.last_modified
 85 |             return Response(json.dumps(stats), mimetype="application/json")
 86 |         else:
 87 |             abort(404)
 88 | 
 89 |     @app.route("/website/<int:website_id>/links")
 90 |     def website_links(website_id):
 91 |         website = db.get_website_by_id(website_id)
 92 | 
 93 |         if website:
 94 |             links = searchEngine.get_link_list(website_id, website.url)
 95 |             return Response("\n".join(links), mimetype="text/plain")
 96 |         else:
 97 |             abort(404)
 98 | 
 99 |     @app.route("/website/")
100 |     def websites():
101 |         page = int(request.args.get("p")) if "p" in request.args else 0
102 |         url = request.args.get("url") if "url" in request.args else ""
103 |         if url:
104 |             parsed_url = urlparse(url)
105 |             if parsed_url.scheme:
106 |                 search_term = (parsed_url.scheme + "://" + parsed_url.netloc)
107 |             else:
108 |                 flash("Sorry, I was not able to parse this url format. "
109 |                       "Make sure you include the appropriate scheme (http/https/ftp)", "warning")
110 |                 search_term = ""
111 |         else:
112 |             search_term = url
113 | 
114 |         return render_template("websites.html",
115 |                                websites=db.get_websites(50, page, search_term),
116 |                                p=page, url=search_term, per_page=50)
117 | 
118 |     @app.route("/website/random")
119 |     def random_website():
120 |         rand_id = db.get_random_website_id()
121 |         if rand_id:
122 |             return redirect("/website/" + str())
123 |         return redirect("/website/")
124 | 
125 |     @app.route("/website/<int:website_id>/clear")
126 |     def admin_clear_website(website_id):
127 |         require_role("admin")
128 | 
129 |         searchEngine.delete_docs(website_id)
130 |         flash("Cleared all documents associated with this website", "success")
131 |         return redirect("/website/" + str(website_id))
132 | 
133 |     @app.route("/website/<int:website_id>/delete")
134 |     def admin_delete_website(website_id):
135 |         require_role("admin")
136 | 
137 |         searchEngine.delete_docs(website_id)
138 |         db.delete_website(website_id)
139 |         flash("Deleted website " + str(website_id), "success")
140 |         return redirect("/website/")
141 | 
142 |     @app.route("/website/<int:website_id>/rescan")
143 |     def admin_rescan_website(website_id):
144 |         require_role("admin")
145 |         website = db.get_website_by_id(website_id)
146 | 
147 |         if website:
148 |             priority = request.args.get("priority") if "priority" in request.args else 1
149 |             task = Task(website_id, website.url, priority)
150 |             taskManager.queue_task(task)
151 | 
152 |             flash("Enqueued rescan task", "success")
153 |         else:
154 |             flash("Website does not exist", "danger")
155 |         return redirect("/website/" + str(website_id))
156 | 
157 |     @app.route("/search")
158 |     def search():
159 |         results = 0
160 |         q = request.args.get("q") if "q" in request.args else ""
161 |         sort_order = request.args.get("sort_order") if "sort_order" in request.args else "score"
162 | 
163 |         page = request.args.get("p") if "p" in request.args else "0"
164 |         page = int(page) if page.isdigit() else 0
165 | 
166 |         per_page = request.args.get("per_page") if "per_page" in request.args else "50"
167 |         per_page = int(per_page) if per_page.isdigit() else "50"
168 |         per_page = per_page if per_page in config.RESULTS_PER_PAGE else 50
169 | 
170 |         extensions = request.args.get("ext") if "ext" in request.args else None
171 |         extensions = [ext.strip().strip(".").lower() for ext in extensions.split(",")] if extensions else []
172 | 
173 |         size_min = request.args.get("size_min") if "size_min" in request.args else "size_min"
174 |         size_min = int(size_min) if size_min.isdigit() else 0
175 |         size_max = request.args.get("size_max") if "size_max" in request.args else "size_max"
176 |         size_max = int(size_max) if size_max.isdigit() else 0
177 | 
178 |         date_min = request.args.get("date_min") if "date_min" in request.args else "date_min"
179 |         date_min = int(date_min) if date_min.isdigit() else 0
180 |         date_max = request.args.get("date_max") if "date_max" in request.args else "date_max"
181 |         date_max = int(date_max) if date_max.isdigit() else 0
182 | 
183 |         match_all = "all" in request.args
184 | 
185 |         field_name = "field_name" in request.args
186 |         field_trigram = "field_trigram" in request.args
187 |         field_path = "field_path" in request.args
188 | 
189 |         if not field_name and not field_trigram and not field_path:
190 |             # If no fields are selected, search in all
191 |             field_name = field_path = field_trigram = True
192 | 
193 |         fields = []
194 |         if field_path:
195 |             fields.append("path")
196 |         if field_name:
197 |             fields.append("name^5")
198 |         if field_trigram:
199 |             fields.append("name.nGram^2")
200 | 
201 |         if len(q) >= 3:
202 | 
203 |             blocked = False
204 |             hits = None
205 |             if not config.CAPTCHA_SEARCH or captcha.verify():
206 | 
207 |                 try:
208 |                     hits = searchEngine.search(q, page, per_page, sort_order,
209 |                                                extensions, size_min, size_max, match_all, fields, date_min, date_max)
210 |                     hits = db.join_website_on_search_result(hits)
211 |                 except InvalidQueryException as e:
212 |                     flash("<strong>Invalid query:</strong> " + str(e), "warning")
213 |                     blocked = True
214 |                 except:
215 |                     flash("Query failed, this could mean that the search server is overloaded or is not reachable. "
216 |                           "Please try again later", "danger")
217 | 
218 |                 results = hits["hits"]["total"]["value"] if not isinstance(hits["hits"]["total"], int) else \
219 |                     hits["hits"]["total"] if hits else -1
220 |                 took = hits["took"] if hits else -1
221 |                 forwarded_for = request.headers["X-Forwarded-For"] if "X-Forwarded-For" in request.headers else None
222 | 
223 |                 logger.info("SEARCH '{}' [res={}, t={}, p={}x{}, ext={}] by {}{}"
224 |                             .format(q, results, took, page, per_page, str(extensions),
225 |                                     request.remote_addr, "_" + forwarded_for if forwarded_for else ""))
226 | 
227 |                 db.log_search(request.remote_addr, forwarded_for, q, extensions, page, blocked, results, took)
228 |                 if blocked:
229 |                     return redirect("/search")
230 |             else:
231 |                 flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
232 | 
233 |         else:
234 |             hits = None
235 | 
236 |         return render_template("search.html",
237 |                                count=results,
238 |                                results=hits,
239 |                                q=q,
240 |                                p=page, per_page=per_page,
241 |                                sort_order=sort_order,
242 |                                results_set=config.RESULTS_PER_PAGE,
243 |                                extensions=",".join(extensions),
244 |                                size_min=size_min, size_max=size_max,
245 |                                match_all=match_all,
246 |                                field_trigram=field_trigram, field_path=field_path, field_name=field_name,
247 |                                date_min=date_min, date_max=date_max,
248 |                                show_captcha=config.CAPTCHA_SEARCH, captcha=captcha)
249 | 
250 |     @app.route("/contribute")
251 |     @cache.cached(600)
252 |     def contribute():
253 |         return render_template("contribute.html")
254 | 
255 |     @app.route("/")
256 |     def home():
257 |         try:
258 |             stats = searchEngine.get_global_stats()
259 |             stats["website_count"] = len(db.get_all_websites())
260 |         except:
261 |             stats = {}
262 |         return render_template("home.html", stats=stats,
263 |                                show_captcha=config.CAPTCHA_SEARCH, captcha=captcha)
264 | 
265 |     @app.route("/submit")
266 |     def submit():
267 |         return render_template("submit.html", captcha=captcha, show_captcha=config.CAPTCHA_SUBMIT)
268 | 
269 |     def try_enqueue(url):
270 |         url = os.path.join(url, "")
271 |         url = od_util.get_top_directory(url)
272 | 
273 |         if not od_util.is_valid_url(url):
274 |             return "<strong>Error:</strong> Invalid url. Make sure to include the appropriate scheme.", "warning"
275 | 
276 |         website = db.get_website_by_url(url)
277 |         if website:
278 |             return "Website already exists", "danger"
279 | 
280 |         website = db.website_exists(url)
281 |         if website:
282 |             return "A parent directory of this url has already been posted", "danger"
283 | 
284 |         if db.is_blacklisted(url):
285 |             return "<strong>Error:</strong> " \
286 |                    "Sorry, this website has been blacklisted. If you think " \
287 |                    "this is an error, please <a href='/contribute'>contact me</a>.", "danger"
288 | 
289 |         if not od_util.is_od(url):
290 |             return "<strong>Error:</strong>" \
291 |                    "The anti-spam algorithm determined that the submitted url is not " \
292 |                    "an open directory or the server is not responding. If you think " \
293 |                    "this is an error, please <a href='/contribute'>contact me</a>.", "danger"
294 | 
295 |         website_id = db.insert_website(Website(url, str(request.remote_addr + "_" +
296 |                                                         request.headers.get("X-Forwarded-For", "")),
297 |                                                request.user_agent))
298 | 
299 |         task = Task(website_id, url, priority=1)
300 |         taskManager.queue_task(task)
301 | 
302 |         return "The website has been added to the queue", "success"
303 | 
304 |     @app.route("/enqueue", methods=["POST"])
305 |     def enqueue():
306 |         if not config.CAPTCHA_SUBMIT or captcha.verify():
307 | 
308 |             url = os.path.join(request.form.get("url"), "")
309 |             message, msg_type = try_enqueue(url)
310 |             flash(message, msg_type)
311 | 
312 |             return redirect("/submit")
313 | 
314 |         else:
315 |             flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
316 |             return redirect("/submit")
317 | 
318 |     def check_url(url):
319 |         url = os.path.join(url, "")
320 |         try_enqueue(url)
321 |         return None
322 | 
323 |     @app.route("/enqueue_bulk", methods=["POST"])
324 |     def enqueue_bulk():
325 |         if not config.CAPTCHA_SUBMIT or captcha.verify():
326 | 
327 |             urls = request.form.get("urls")
328 |             if urls:
329 |                 urls = urls.split()
330 | 
331 |                 if 0 < len(urls) <= 1000:  # TODO: Load from config & adjust placeholder/messages?
332 | 
333 |                     pool = Pool(processes=6)
334 |                     pool.map(func=check_url, iterable=urls)
335 |                     pool.close()
336 | 
337 |                     flash("Submitted websites to the queue", "success")
338 | 
339 |                     return redirect("/submit")
340 | 
341 |                 else:
342 |                     flash("Too few or too many urls, please submit 1-10 urls", "danger")
343 |                     return redirect("/submit")
344 |             else:
345 |                 flash("Too few or too many urls, please submit 1-10 urls", "danger")
346 |                 return redirect("/submit")
347 |         else:
348 |             flash("<strong>Error:</strong> Invalid captcha please try again", "danger")
349 |             return redirect("/submit")
350 | 
351 |     @app.route("/admin")
352 |     def admin_login_form():
353 |         if "username" in session:
354 |             return redirect("/dashboard")
355 |         return render_template("admin.html", captcha=captcha, show_captcha=config.CAPTCHA_LOGIN)
356 | 
357 |     @app.route("/login", methods=["POST"])
358 |     def admin_login():
359 |         if not config.CAPTCHA_LOGIN or captcha.verify():
360 | 
361 |             username = request.form.get("username")
362 |             password = request.form.get("password")
363 | 
364 |             if db.check_login(username, password):
365 |                 session["username"] = username
366 |                 flash("Logged in", "success")
367 |                 return redirect("/dashboard")
368 | 
369 |             flash("Invalid username/password combo", "danger")
370 |             return redirect("/admin")
371 | 
372 |         else:
373 |             flash("Invalid captcha", "danger")
374 |             return redirect("/admin")
375 | 
376 |     @app.route("/logout")
377 |     def admin_logout():
378 |         session.clear()
379 |         flash("Logged out", "info")
380 |         return redirect("/")
381 | 
382 |     @app.route("/dashboard")
383 |     def admin_dashboard():
384 |         require_role("admin")
385 |         tokens = db.get_tokens()
386 |         blacklist = db.get_blacklist()
387 | 
388 |         return render_template("dashboard.html", api_tokens=tokens, blacklist=blacklist)
389 | 
390 |     @app.route("/blacklist/add", methods=["POST"])
391 |     def admin_blacklist_add():
392 |         require_role("admin")
393 |         url = request.form.get("url")
394 |         db.add_blacklist_website(url)
395 |         flash("Added item to blacklist", "success")
396 |         return redirect("/dashboard")
397 | 
398 |     @app.route("/blacklist/<int:blacklist_id>/delete")
399 |     def admin_blacklist_remove(blacklist_id):
400 |         require_role("admin")
401 |         db.remove_blacklist_website(blacklist_id)
402 |         flash("Removed blacklist item", "success")
403 |         return redirect("/dashboard")
404 | 
405 |     @app.route("/generate_token", methods=["POST"])
406 |     def admin_generate_token():
407 |         require_role("admin")
408 |         description = request.form.get("description")
409 | 
410 |         db.generate_api_token(description)
411 |         flash("Generated API token", "success")
412 | 
413 |         return redirect("/dashboard")
414 | 
415 |     @app.route("/del_token", methods=["POST"])
416 |     def admin_del_token():
417 |         require_role("admin")
418 |         token = request.form.get("token")
419 | 
420 |         db.delete_token(token)
421 |         flash("Deleted API token", "success")
422 |         return redirect("/dashboard")
423 | 


--------------------------------------------------------------------------------