├── .gitignore ├── ExtensionCrawler ├── __init__.py ├── archive.py ├── cdnjs_crawler.py ├── cdnjs_git.py ├── config.py ├── crx.py ├── db.py ├── dbbackend │ ├── __init__.py │ ├── mysql_backend.py │ └── mysql_process.py ├── discover.py ├── file_identifiers.py ├── js_decomposer.py ├── js_mincer.py ├── request_manager.py └── util.py ├── LICENSE ├── PermissionAnalysis └── grep-unused-permissions ├── README.md ├── analysis └── library-detector │ ├── angular │ ├── 2018-11-28-results.csv │ ├── angular.py │ ├── angularversions.txt │ ├── ideas.txt │ └── plotting.py │ └── jquery.py ├── cdnjs-git-miner ├── crawler ├── create-db ├── crx-extract ├── crx-jsinventory ├── crx-jsstrings ├── crx-tool ├── database ├── README.md ├── config │ └── my.cnf ├── queries │ ├── get_added_content_scripts.sql │ └── get_added_permissions.sql ├── schemas │ ├── category.sql │ ├── cdnjs.sql │ ├── content_script_url.sql │ ├── crx.sql │ ├── crxfile.sql │ ├── extension.sql │ ├── libdet.sql │ ├── permission.sql │ ├── reply.sql │ ├── reply_comment.sql │ ├── review.sql │ ├── review_comment.sql │ ├── status.sql │ ├── support.sql │ └── support_comment.sql ├── scripts │ ├── mariabackup-full │ ├── mariabackup-inc │ ├── mariabackup-schemas │ └── showgrants └── views │ ├── extension_most_recent.sql │ ├── extension_most_recent_small.sql │ ├── extension_most_recent_until_date.sql │ ├── extension_second_most_recent.sql │ ├── extension_second_most_recent_until_date.sql │ ├── extension_small.sql │ └── extension_update.sql ├── extgrep ├── requirements.txt ├── resources └── js_identifier.json ├── scripts ├── hpc-utilities │ └── hpc-submit ├── maintainance │ ├── maintain_archive │ └── xz.sge ├── monitoring │ ├── download-report-one-week.gp │ └── global_update_monitor.sh ├── singularity │ ├── ExtensionCrawler.def │ ├── build.sh │ ├── singularitybuilder-arch.Dockerfile │ └── singularitybuilder-arch.sh └── update │ ├── global_update.sh │ └── update_cdnjs.sh ├── setup.py ├── sge ├── create-db-cdnjs.sge ├── create-db.sge 
└── create-db.sh └── simhashbucket /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # vi 62 | *.swp 63 | 64 | # vscode 65 | .vscode 66 | 67 | archive 68 | .ropeproject 69 | ExtensionCrawler.img 70 | ExtensionCrawler-cdnjs.img 71 | 72 | .idea 73 | venv 74 | -------------------------------------------------------------------------------- /ExtensionCrawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalhacking/ExtensionCrawler/853d69d1a3478eaa3b8649f9dd754a044a561cc5/ExtensionCrawler/__init__.py -------------------------------------------------------------------------------- /ExtensionCrawler/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2016,2017 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it 
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Global configuration of the Extension Crawler and related tools."""

import os
import json


def const_sitemap_url():
    """Sitemap URL."""
    return "https://chrome.google.com/webstore/sitemap"


def const_sitemap_scheme():
    """URL of Sitemap schema."""
    return "http://www.sitemaps.org/schemas/sitemap/0.9"


def const_overview_url(ext_id):
    """URL template for the overview page of an extension."""
    return 'https://chrome.google.com/webstore/detail/{}'.format(ext_id)


def const_store_url():
    """Main URL of the Chrome store."""
    return 'https://chrome.google.com/webstore'


def const_review_url():
    """Base URL of the review page of an extension."""
    return 'https://chrome.google.com/reviews/components'


def const_review_search_url():
    """Base URL for review search."""
    return 'https://chrome.google.com/reviews/json/search'


def const_support_url():
    """Base URL for support pages."""
    return 'https://chrome.google.com/reviews/components'


def const_download_url():
    """Base download URL (format with the extension id)."""
    return ('https://clients2.google.com/service/update2/' +
            'crx?response=redirect&nacl_arch=x86-64&' +
            'prodversion=9999.0.9999.0&x=id%3D{}%26uc')


def const_categories():
    """List of known categories."""
    return [
        'extensions', 'ext/22-accessibility', 'ext/10-blogging',
        'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun',
        'ext/6-news', 'ext/28-photos', 'ext/7-productivity',
        'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication',
        'ext/13-sports'
    ]


def const_support_payload(ext_id, start, end):
    """Payload for requesting support pages."""
    return (
        'req={{ "appId":94,' + '"version":"150922",' + '"hl":"en",' +
        '"specs":[{{"type":"CommentThread",' +
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        + '"groups":"chrome_webstore_support",' + '"startindex":"{}",' +
        '"numresults":"{}",' + '"id":"379"}}],' + '"internedKeys":[],' +
        '"internedValues":[]}}').format(ext_id, start, end)


def const_review_payload(ext_id, start, end):
    """Payload for requesting review pages."""
    return (
        'req={{ "appId":94,' + '"version":"150922",' + '"hl":"en",' +
        '"specs":[{{"type":"CommentThread",' +
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        + '"groups":"chrome_webstore",' + '"sortby":"cws_qscore",' +
        '"startindex":"{}",' + '"numresults":"{}",' + '"id":"428"}}],' +
        '"internedKeys":[],' + '"internedValues":[]}}').format(
            ext_id, start, end)


def const_review_search_payload(params):
    """Payload for searches.

    `params` is an iterable of (extid, author, start, numresults, groups)
    tuples; one search spec is emitted per tuple.
    """
    pre = """req={"applicationId":94,"searchSpecs":["""
    post = """]}&requestSource=widget"""
    args = []
    for extid, author, start, numresults, groups in params:
        args += [
            """{{"requireComment":true,"entities":[{{"annotation":"""
            """{{"groups":{},"author":"{}",ize"""
            .replace("ize", "")  # noqa: guard against accidental edits
            + """"url":"http://chrome.google.com/extensions/permalink?id={}"}}}}],"""
            """"matchExtraGroups":true,"startIndex":{},"numResults":{},"""
            """"includeNicknames":true,"locale": {{"language": "en","country": "us"}}}}"""
            .format(json.dumps(groups), author, extid, start, numresults)
        ]

    return pre + ",".join(args) + post


def get_local_archive_dir(ext_id):
    """Local archive dir of extension (the first three id characters)."""
    return "{}".format(ext_id[:3])


def archive_file(archivedir, ext_id):
    """Archive tar of an extension."""
    return os.path.join(
        str(archivedir), get_local_archive_dir(ext_id), ext_id + ".tar")


def const_basedir():
    """Top-level directory for the extension crawler archive.

    Honors the EXTENSION_ARCHIVE environment variable, falling back to
    the relative directory "archive".
    """
    return os.environ.get("EXTENSION_ARCHIVE", "archive")


def const_parallel_downloads():
    """Number of parallel downloads."""
    return 36


def const_verbose():
    """Default verbosity."""
    return True


def const_use_process_pool():
    """Use ProcessPool (from module 'pebble') for concurrency."""
    return False


def const_log_format(ext_id="-"*32):
    """Log line format, embedding the given extension id."""
    return "%(process)6s %(asctime)s %(levelname)8s {} %(message)s".format(ext_id)


def const_discover():
    """Default configuration of discovery mode"""
    return False


def const_ext_timeout():
    """Timeout for downloading an individual extension (2 hours)."""
    return 2*60*60


def const_mysql_config_file():
    """Path of the MySQL client configuration file."""
    return os.path.expanduser("~/.my.cnf")


def const_mysql_maxtries():
    """Maximum number of MySQL connection attempts."""
    return 12


def const_mysql_try_wait():
    """Base wait time (seconds) between MySQL connection attempts."""
    return 300
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
"""Utility functions for working with Chrome extension archives,
i.e., *.crx files."""

import io
import os
import zipfile
import binascii
from Cryptodome.PublicKey import RSA
from Cryptodome.Hash import SHA
from Cryptodome.Signature import PKCS1_v1_5


class CrxFile:
    """Record class for storing crx files (header fields plus payload)."""

    def __init__(self, filename, magic, version, public_key_len, sig_len,
                 public_key, sig, header_len, data):
        self.file = filename
        self.magic = magic
        self.version = version
        self.public_key_len = public_key_len
        self.sig_len = sig_len
        self.public_key = public_key
        self.sig = sig
        self.header_len = header_len
        self.data = data


def is_valid_magic(magic):
    """Check magic matches the magic bytes of the crx specification."""
    return magic == b'Cr24'


def is_crxfile(filename):
    """Check magic number: crx files should start with \"Cr24\"."""
    # Use a context manager so the handle is closed even if read() raises.
    with open(filename, 'rb') as file:
        magic = file.read(4)
    return is_valid_magic(magic)


def check_signature(public_key, sig, data):
    """Check validity of signature contained in the crx file."""
    key = RSA.importKey(public_key)
    crxhash = SHA.new(data)
    return PKCS1_v1_5.new(key).verify(crxhash, sig)


def read_crx(filename):
    """Read header of an crx file (https://developer.chrome.com/extensions/crx).

    Header layout (all integers little-endian, 4 bytes each): magic,
    version, public key length, signature length, followed by the key,
    the signature, and finally the zip payload.
    """
    with open(filename, 'rb') as file:
        magic = file.read(4)
        version = int.from_bytes(file.read(4), byteorder='little')
        public_key_len = int.from_bytes(file.read(4), byteorder='little')
        sig_len = int.from_bytes(file.read(4), byteorder='little')
        public_key = file.read(public_key_len)
        sig = file.read(sig_len)
        header_len = 16 + public_key_len + sig_len
        data = file.read()
    return CrxFile(filename, magic, version, public_key_len, sig_len,
                   public_key, sig, header_len, data)


def print_crx_info(verbose, crx):
    """Print information extracted from a crx file."""
    if is_valid_magic(crx.magic):
        magic = "valid"
    else:
        magic = "invalid"
    if check_signature(crx.public_key, crx.sig, crx.data):
        sig = "valid"
    else:
        sig = "invalid"
    print("Filename:    " + crx.file)
    print("Header size: " + str(crx.header_len))
    print("Size:        " + str(crx.header_len + len(crx.data)))
    print("Magic byte:  " + str(crx.magic.decode("utf-8")) + " (" + magic +
          ")")
    print("Version:     " + str(crx.version))
    print("Signature:   " + sig)
    print("Public Key [" + str(crx.public_key_len) + "]:")
    key = RSA.importKey(crx.public_key)
    print(key.exportKey().decode("utf-8"))
    if verbose:
        print("Signature [" + str(crx.sig_len) + "]: " + str(
            binascii.hexlify(crx.sig)))
    out = io.BytesIO(crx.data)
    ziparchive = zipfile.ZipFile(out, 'r')
    print("Zip content:")
    for info in ziparchive.infolist():
        print('{:8d} {:8d}'.format(info.file_size, info.compress_size),
              info.filename)


def verify_crxfile(verbose, filename):
    """Verify integrity of a crx file.

    Returns 0 on success, -1 if the file has no valid magic bytes.
    """
    if is_crxfile(filename):
        if verbose:
            print("Found correct magic bytes.")
        print_crx_info(verbose, read_crx(filename))
        return 0
    else:
        if verbose:
            print("No valid magic bytes found")
        return -1


def extract_crxfile(verbose, force, filename, destdir):
    """Extract crxfile into specified destdir.

    With `force` set, extraction is attempted even if the magic bytes
    are invalid. Returns 0 on success, -1 otherwise.
    """
    crx = read_crx(filename)
    # `or` (short-circuit) instead of the original bitwise `|`: the intent
    # is boolean disjunction, and `|` would force-evaluate both operands.
    if is_valid_magic(crx.magic) or force:
        if not destdir:
            destdir = "."
        if filename.endswith(".crx"):
            # Strip the ".crx" suffix from the base name.
            dirname = os.path.basename(filename)[:-4]
        else:
            dirname = filename
        out = io.BytesIO(crx.data)
        ziparchive = zipfile.ZipFile(out, 'r')
        ziparchive.extractall(destdir + "/" + dirname)
        if verbose:
            print("Content extracted into: " + destdir + "/" + dirname)
        return 0
    else:
        print("Input file not valid.")
        return -1
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

import time
import datetime
from collections import OrderedDict
from random import uniform
import sys
import configparser

import MySQLdb
import MySQLdb._exceptions

import ExtensionCrawler.config as config
from ExtensionCrawler.util import log_info, log_error, log_warning


class MysqlBackend:
    """MySQL/MariaDB storage backend for crawler results.

    Rows handed to :meth:`insert`/:meth:`insertmany` are buffered per
    table and flushed in batches of at least 128 rows (and once more on
    context-manager exit). Connections are opened lazily and every DB
    operation is wrapped in :meth:`retry`, which reconnects on
    OperationalError with a randomized back-off.
    """

    def __init__(self, ext_id, delayed=False, cache_etags=False, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
                 **kwargs):
        """Store connection/behavior settings; no connection is opened yet.

        ext_id:      id of the extension this backend instance serves (may be None).
        delayed:     use "INSERT DELAYED" instead of an upsert statement.
        cache_etags: remember (extid, date) -> crx_etag for rows inserted
                     into the "extension" table (served by get_etag()).
        try_wait:    base seconds to wait between reconnection attempts.
        maxtries:    maximum number of attempts before giving up.
        kwargs:      passed verbatim to MySQLdb.connect().
        """
        self.ext_id = ext_id
        self.delayed = delayed
        self.cache_etags = cache_etags
        self.dbargs = kwargs
        self.try_wait = try_wait
        self.maxtries = maxtries
        self.cache = {}            # table name -> list of buffered row dicts
        self.crx_etag_cache = {}   # (extid, date) -> crx_etag
        self.db = None
        self.cursor = None

        # For more info, see https://jira.mariadb.org/browse/CONC-359
        self._fix_missing_host(self.dbargs)

    def _fix_missing_host(self, dbargs):
        """Copy "host" from the [client] section of the defaults file into
        dbargs when it is not given explicitly (CONC-359 workaround)."""
        if "host" in dbargs:
            return

        if "read_default_file" not in dbargs:
            return

        config = configparser.ConfigParser()
        config.read(dbargs["read_default_file"])
        if "host" not in config["client"]:
            return
        dbargs["host"] = config["client"]["host"]


    def __enter__(self):
        # We open a connection once we actually need it
        return self

    def __exit__(self, *args):
        # Flush all buffered rows, then tear down the connection.
        for table, arglist in self.cache.items():
            self._do_insert(table, arglist)
            self.cache[table] = []
        self._close_conn()

    def _get_column_names(self, table):
        """Return the column names of `table` in the current database."""
        # NOTE(review): the f-prefix is unnecessary here (no interpolation).
        self.cursor.execute(f"select column_name from information_schema.columns where table_schema=database() and table_name=%s", (table,))
        return [row[0] for row in self.cursor.fetchall()]


    def _do_insert(self, table, arglist):
        """Bulk-insert the buffered row dicts in `arglist` into `table`.

        Rows are sorted by primary key first (reduces deadlocks on
        concurrent batch upserts). Uses INSERT DELAYED when self.delayed
        is set, otherwise an INSERT ... ON DUPLICATE KEY UPDATE upsert.
        """
        if len(arglist) == 0:
            return
        sorted_arglist = self.sort_by_primary_key(table, arglist)
        args = [tuple(arg.values()) for arg in sorted_arglist]

        if self.delayed:
            query = "INSERT DELAYED INTO {}({}) VALUES ({})".format(
                table,
                ",".join(sorted_arglist[0].keys()),
                ",".join(len(args[0]) * ["%s"]))
        else:
            # If the table has a "last_modified" column, include it in the
            # update list so it is touched on duplicate-key updates.
            column_names = self.retry(lambda: self._get_column_names(table))
            if "last_modified" in column_names:
                additional_columns = ["last_modified"]
            else:
                additional_columns = []
            # Looks like this, for example:
            # INSERT INTO category VALUES(extid,date,category) (%s,%s,%s)
            # ON DUPLICATE KEY UPDATE extid=VALUES(extid),date=VALUES(date)
            # ,category=VALUES(category)
            query = "INSERT INTO {}({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}".format(
                table,
                ",".join(sorted_arglist[0].keys()),
                ",".join(len(args[0]) * ["%s"]),
                ",".join(
                    ["{c}=VALUES({c})".format(c=c) for c in list(sorted_arglist[0].keys()) + additional_columns]))
        start = time.time()
        self.retry(lambda: self.cursor.executemany(query, args))
        log_info("* Inserted {} bytes into {}, taking {}.".format(sum([sys.getsizeof(arg) for arg in args]),
                                                                  table, datetime.timedelta(seconds=int(time.time() - start))), 3)

    def _create_conn(self):
        """Open connection and cursor if not already open (idempotent)."""
        if self.db is None:
            log_info("* self.db is None, open new connection ...", 3)
            self.db = MySQLdb.connect(**self.dbargs)
            self.db.autocommit(True)
            log_info("* success", 4)
        if self.cursor is None:
            log_info("* self.cursor is None, assigning new cursor ...", 3)
            self.cursor = self.db.cursor()
            log_info("* success", 4)

    def _close_conn(self):
        """Close cursor and connection, resetting both to None."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.db is not None:
            self.db.close()
            self.db = None

    def retry(self, f):
        """Run `f()` with a fresh-if-needed connection, retrying up to
        self.maxtries times on OperationalError.

        Between attempts the connection is torn down and the method sleeps
        try_wait seconds +/- 20% (jitter avoids thundering-herd reconnects).
        Re-raises the last OperationalError once all attempts fail.
        """
        for t in range(self.maxtries):
            try:
                self._create_conn()
                return f()
            except MySQLdb._exceptions.OperationalError as e:
                last_exception = e

                try:
                    self._close_conn()
                except Exception as e2:
                    log_error("Suppressed exception: {}".format(str(e2)), 3)

                if t + 1 == self.maxtries:
                    log_error("MySQL connection eventually failed, closing connection!", 3)
                    raise last_exception
                else:
                    factor = 0.2
                    logmsg = ("Exception ({}) on mysql connection attempt "
                              "{} of {}, wating {}s +/- {}% before retrying..."
                              ).format(str(e),
                                       t + 1,
                                       self.maxtries,
                                       self.try_wait, factor * 100)
                    log_warning(logmsg, 3)
                    time.sleep(self.try_wait * uniform(
                        1 - factor, 1 + factor))

    def get_single_value(self, query, args):
        """Execute `query` and return the first column of the first row,
        or None when the query yields no rows."""
        self.retry(lambda: self.cursor.execute(query, args))

        result = self.retry(lambda: self.cursor.fetchone())
        if result is not None:
            return result[0]
        else:
            return None

    def sort_by_primary_key(self, table, arglist):
        """Sort row dicts by the table's primary key and reorder each dict
        so the primary-key columns come first.

        A deterministic insert order over the primary key reduces deadlocks
        between concurrent bulk upserts.
        """
        # Column_name is the fifth field of SHOW KEYS output.
        self.retry(lambda: self.cursor.execute(f"SHOW KEYS FROM {table} WHERE Key_name = 'PRIMARY'"))
        primary_keys = [row[4] for row in self.cursor.fetchall()]

        sorted_arglist = sorted(arglist, key=lambda x: [x[pk] for pk in primary_keys])

        def arglist_shuffler(x):
            # Primary-key columns keep their key order; all other columns
            # sort after them (stable sort preserves their relative order).
            try:
                return primary_keys.index(x)
            except ValueError:
                return len(primary_keys)
        shuffled_arglist = [OrderedDict(sorted(arg.items(), key=lambda x: arglist_shuffler(x[0]))) for arg in sorted_arglist]
        return shuffled_arglist


    def insertmany(self, table, arglist):
        """Buffer row dicts for `table`; flush once 128+ rows accumulate."""
        if table not in self.cache:
            self.cache[table] = []
        self.cache[table] += arglist
        if len(self.cache[table]) >= 128:
            self._do_insert(table, self.cache[table])
            self.cache[table] = []
        if self.cache_etags and table == "extension":
            for arg in arglist:
                self.crx_etag_cache[(arg["extid"], arg["date"])] = arg["crx_etag"]

    def insert(self, table, **kwargs):
        """Buffer a single row (given as keyword arguments) for `table`."""
        self.insertmany(table, [kwargs])

    def get_etag(self, extid, date):
        """Return the cached crx_etag for (extid, date), or None.

        Only populated when cache_etags is set and rows have been inserted
        into the "extension" table via this backend.
        """
        if (extid, date) in self.crx_etag_cache:
            return self.crx_etag_cache[(extid, date)]
        else:
            return None

    def get_cdnjs_info(self, md5):
        """Look up the cdnjs row matching the given md5 (or None)."""
        query = """SELECT library, version, filename, add_date, typ from cdnjs where md5=%s"""
        args = [md5]
        self.retry(lambda: self.cursor.execute(query, args))
        result = self.retry(lambda: self.cursor.fetchone())
        return result


def convert_date(date):
    """Drop the last six characters of a date string — presumably a
    '+00:00'-style timezone offset; TODO confirm against callers."""
    return date[:-6]
#

from multiprocessing import Process, Manager

from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
from ExtensionCrawler.util import setup_logger, log_exception

class MysqlProxy:
    """Queue-backed stand-in for MysqlBackend.

    Insert requests are serialized onto a multiprocessing queue and
    executed by the worker process started by MysqlProcessBackend.
    """
    def __init__(self, q):
        self.q = q

    def insertmany(self, table, arglist):
        """Enqueue a batch insert for the worker process."""
        self.q.put((MysqlProcessBackend.INSERT, (table, arglist)))

    def insert(self, table, **kwargs):
        """Enqueue a single-row insert for the worker process."""
        self.insertmany(table, [kwargs])

    def get_cdnjs_info(self, md5):
        # Lookups are not supported over the one-way queue.
        return None


def run(mysql_kwargs, q):
    """Worker-process loop: consume (command, data) tuples from `q` and
    forward INSERTs to a MysqlBackend until STOP is received.

    If the backend fails, the queue is drained (discarding pending
    INSERTs) so the producer is never blocked on a full queue.
    """
    setup_logger(True)
    finished = False

    try:
        with MysqlBackend(None, **mysql_kwargs) as db:
            while True:
                cmd, data = q.get()
                if cmd == MysqlProcessBackend.STOP:
                    finished = True
                    break
                if cmd == MysqlProcessBackend.INSERT:
                    db.insertmany(*data)
    # Fix: the original bare `except:` also swallowed SystemExit and
    # KeyboardInterrupt, preventing a clean worker shutdown.
    except Exception:
        log_exception("Stopping Mysql backend and emptying queue...")
    if not finished:
        while True:
            cmd, data = q.get()
            if cmd == MysqlProcessBackend.STOP:
                break
            if cmd == MysqlProcessBackend.INSERT:
                pass  # discard pending inserts; backend is gone


class MysqlProcessBackend:
    """Context manager running MysqlBackend in a separate process.

    __enter__ starts the worker and hands back a MysqlProxy; __exit__
    sends STOP and joins the worker.
    """
    STOP = "stop"
    INSERT = "insert"

    def __init__(self, ext_id, **mysql_kwargs):
        # ext_id is accepted for interface parity with MysqlBackend but is
        # not used here (the worker constructs its backend with None).
        self.mysql_kwargs = mysql_kwargs
        self.m = Manager()
        self.queue = self.m.Queue()

    def __enter__(self):
        self.p = Process(target=run, args=(self.mysql_kwargs, self.queue))
        self.p.start()
        return MysqlProxy(self.queue)

    def __exit__(self, *args):
        self.queue.put((MysqlProcessBackend.STOP, None))
        self.p.join()
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Python module providing methods for discovering extensions in the
Chrome extension store."""

from xml.etree.ElementTree import fromstring
import re
import requests
from pebble import ThreadPool
from ExtensionCrawler import config


def get_inner_elems(doc):
    """Return an iterator over the <loc> elements of a sitemap document."""
    return fromstring(doc).iterfind(r".//{{{}}}loc".format(
        config.const_sitemap_scheme()))


def is_generic_url(url):
    """Check if URL is a generic extension URL.

    The urls with a language parameter attached return a subset of the
    ids that get returned by the plain urls, therefore we skip urls with
    a language parameter.
    """
    # Fix: the original stacked four separate string literals here, of
    # which only the first was the docstring; the rest were no-op
    # expression statements. Merged into one docstring.
    return re.match(r"^{}\?shard=\d+&numshards=\d+$".format(
        config.const_sitemap_url()), url)


def iterate_shard(shard_url):
    """Yield the 32-char extension ids listed in one sitemap shard."""
    if is_generic_url(shard_url):
        shard = requests.get(shard_url, timeout=10).text
        for inner_elem in get_inner_elems(shard):
            overview_url = inner_elem.text
            # Robustness fix: guard against URLs without an id, where the
            # original `re.search(...).group(0)` raised AttributeError.
            match = re.search("[a-z]{32}", overview_url)
            if match is not None:
                yield match.group(0)


def process_shard(shard_url):
    """Materialize iterate_shard() so it can run inside a thread pool."""
    return list(iterate_shard(shard_url))


def get_new_ids(known_ids, max_ids=None):
    """Crawl extension ids available in Chrome store.

    Yields ids not contained in `known_ids`, stopping after `max_ids`
    new ids when given. Shards are fetched concurrently by a thread pool.
    """
    shard_urls = [shard_elem.text for shard_elem in get_inner_elems(
        requests.get(config.const_sitemap_url(), timeout=10).text)]
    with ThreadPool(16) as pool:
        future = pool.map(process_shard, shard_urls, chunksize=1)
        iterator = future.result()

        returned_ids = 0
        while True:
            try:
                for extid in next(iterator):
                    if extid not in known_ids:
                        yield extid
                        returned_ids += 1
                        if max_ids is not None and returned_ids >= max_ids:
                            pool.stop()
                            return
            except StopIteration:
                return
#
""" Module for obtaining (normalized) hashes for files."""

import hashlib
import os
import re
import zlib
import mimetypes
from io import StringIO
from simhash import Simhash

import cchardet as chardet
import magic

from ExtensionCrawler.js_mincer import mince_js


def is_binary_resource(mimetype_magic):
    """True for image/video/audio/pdf mime types (treated as binary)."""
    return (mimetype_magic.startswith("image/") or
            mimetype_magic.startswith("video/") or
            mimetype_magic.startswith("audio/") or
            mimetype_magic == "application/pdf")


def normalize_jsdata(str_data):
    """Compute normalized code blocks of a JavaScript file.

    Strips comments (via mince_js) and per-line whitespace; returns the
    normalized bytes and the number of code lines.
    """
    txt = ""
    loc = 0
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if block.is_code():
                for line in block.content.splitlines():
                    txt += line.strip()
                    loc += 1
    return txt.encode(), loc


def get_features(s):
    """Compute feature set of text (represented as string)."""
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return (s[i:i + width] for i in range(max(len(s) - width + 1, 1)))


def get_simhash(encoding, data):
    """Compute simhash of text."""
    if encoding is not None:
        # VISCII is not supported by python, UTF-8 parses at least the for us important parts
        if encoding == "VISCII":
            encoding = "UTF-8"
        str_data = data.decode(encoding=encoding, errors="replace")
    else:
        str_data = str(data)
    simhash = Simhash(get_features(str_data)).value
    return simhash


def compute_difference(hx, hy):
    """Compute difference (Hamming distance over 64 bits) between two simhashes."""
    assert hx.bit_length() == hy.bit_length()
    h = (hx ^ hy) & ((1 << 64) - 1)
    d = 0
    while h:
        d += 1
        h &= h - 1
    return d


def _magic_description(data):
    """Return libmagic's description of `data`.

    Works around a libmagic failure mode where the raised exception's
    message contains the real description followed by ' name use count'
    noise; any other MagicException is re-raised. (This logic was
    previously duplicated verbatim in get_data_identifiers.)
    """
    try:
        return magic.from_buffer(data)
    except magic.MagicException as exp:
        rgx = re.compile(r' name use count.*$')
        msg = str(exp.message)
        if re.search(rgx, msg):
            return re.sub(rgx, '', msg)
        raise


def get_data_identifiers(data):
    """Get basic data identifiers (size, hashes, normalized hashes, etc.)."""

    data_identifier = {
        'encoding': None,
        'description': None,
        'size': None,
        'loc': None,
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size_stripped': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_loc': None,
        'normalized_mimetype_magic': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }

    mimetype_magic = magic.from_buffer(data, mime=True)
    magic_desc = _magic_description(data)

    data_identifier['mimetype_magic'] = mimetype_magic
    data_identifier['md5'] = hashlib.md5(data).digest()
    data_identifier['sha1'] = hashlib.sha1(data).digest()
    data_identifier['sha256'] = hashlib.sha256(data).digest()
    data_identifier['size'] = len(data)
    data_identifier['description'] = magic_desc

    # We don't continue here with binary files, as that consumes too many
    # resources.
    if is_binary_resource(mimetype_magic):
        return data_identifier

    encoding = chardet.detect(data)['encoding']

    data_identifier['simhash'] = get_simhash(encoding, data)
    data_identifier['size_stripped'] = len(data.strip())
    data_identifier['loc'] = len(data.splitlines())
    data_identifier['encoding'] = encoding
    try:
        normalized_data, normalized_loc = normalize_jsdata(
            data.decode(encoding=data_identifier['encoding'], errors="replace"))
    except Exception:
        # Best effort: non-JS or undecodable content simply has no
        # normalized identifiers.
        normalized_data = None
        normalized_loc = 0

    if normalized_data is not None:
        normalized_magic_desc = _magic_description(normalized_data)
        normalized_encoding = chardet.detect(normalized_data)['encoding']
        data_identifier['normalized_encoding'] = normalized_encoding
        data_identifier['normalized_description'] = normalized_magic_desc
        data_identifier['normalized_size'] = len(normalized_data)
        data_identifier['normalized_loc'] = normalized_loc
        data_identifier['normalized_mimetype_magic'] = magic.from_buffer(normalized_data, mime=True)
        data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
        data_identifier['normalized_sha1'] = hashlib.sha1(
            normalized_data).digest()
        data_identifier['normalized_sha256'] = hashlib.sha256(
            normalized_data).digest()
        data_identifier['normalized_simhash'] = get_simhash(
            normalized_encoding, normalized_data)
    return data_identifier
def get_file_identifiers(path, data=None):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    If *data* is None the file at *path* is read; otherwise *data* is taken
    as the file's content.  When the content is gzip-compressed it is
    additionally decompressed (with a size cap against zip bombs) and the
    identifiers of the decompressed payload are reported under ``dec_``-
    prefixed keys.
    """
    # Template so that the dec_* keys exist even for non-gzip content.
    dec_data_identifier = {
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size': None,
        'size_stripped': None,
        'loc': None,
        'description': None,
        'encoding': None,
        'normalized_mimetype_magic': None,
        'normalized_loc': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }
    if data is None:
        with open(path, 'rb') as fileobj:
            data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            # Cap the decompressed output at 100x the compressed size to
            # defuse zip bombs.
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            # Fixed message: this happens during DEcompression, and the
            # original string had an unbalanced parenthesis.
            dec_data_identifier['description'] = (
                "Exception during decompression (likely zip-bomb): " + str(e))

    # File-level identifiers, plus the plain data identifiers and the
    # decompressed ones under a "dec_" prefix.
    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    file_identifier.update(data_identifier)
    file_identifier.update(
        ('dec_' + key, value) for key, value in dec_data_identifier.items())

    return file_identifier
class JsBlockType(Enum):
    """Enumeration of the different JavaScript blocks."""
    CODE_BLOCK = 1
    SINGLE_LINE_COMMENT = 2
    SINGLE_LINE_COMMENT_BLOCK = 3
    MULTI_LINE_COMMENT_BLOCK = 4
    STRING_SQ = 5
    STRING_DQ = 6


def is_string_literal_sq(state):
    """Check if block is a single quote string literal."""
    return state is JsBlockType.STRING_SQ


def is_string_literal_dq(state):
    """Check if block is a double quote string literal."""
    return state is JsBlockType.STRING_DQ


def is_string_literal(state):
    """Check if block is a quote string literal."""
    return state in (JsBlockType.STRING_SQ, JsBlockType.STRING_DQ)


def is_code(state):
    """Check if block is code (without string literals)."""
    return state is JsBlockType.CODE_BLOCK


def is_code_or_string_literal(state):
    """Check if block is code or a string literal."""
    return state in (JsBlockType.CODE_BLOCK, JsBlockType.STRING_SQ,
                     JsBlockType.STRING_DQ)


def is_comment_multi_line(state):
    """Check if block is a multi line comment."""
    return state is JsBlockType.MULTI_LINE_COMMENT_BLOCK


def is_comment_single_line(state):
    """Check if block is a single line comment."""
    return state is JsBlockType.SINGLE_LINE_COMMENT


def is_comment_single_line_block(state):
    """Check if block is a single line comment block."""
    return state is JsBlockType.SINGLE_LINE_COMMENT_BLOCK


def is_comment(state):
    """Check if block is a comment."""
    return state in (JsBlockType.SINGLE_LINE_COMMENT,
                     JsBlockType.SINGLE_LINE_COMMENT_BLOCK,
                     JsBlockType.MULTI_LINE_COMMENT_BLOCK)


def get_next_character(fileobj):
    """Yield the characters of a (text) file one at a time."""
    while True:
        char = fileobj.read(1)
        if not char:
            break
        yield char


class JsBlock:
    """A contiguous span of JavaScript, classified as code or comment."""

    def __init__(self, typ, start, end, content, string_literals=None):
        self.typ = typ                  # JsBlockType of this span
        self.start = start              # (line, cpos) where the span begins
        self.end = end                  # (line, cpos) where the span ends
        self.content = content          # raw text of the span
        # List of ((line, cpos), text) pairs for code spans; may be None
        # for comment spans.
        self.string_literals = string_literals

    def is_code(self):
        """Check if block is a code block."""
        return not is_comment(self.typ)

    def is_comment(self):
        """Check if block is a comment."""
        return is_comment(self.typ)

    def is_comment_single_line(self):
        """Check if block is a single line comment."""
        return is_comment_single_line(self.typ)

    def is_comment_single_line_block(self):
        """Check if block is single line comment block."""
        return is_comment_single_line_block(self.typ)

    def is_comment_multi_line_block(self):
        """Check if block is a multi line comment."""
        return is_comment_multi_line(self.typ)

    def __str__(self):
        rule = ("***************************************************************"
                "\n")
        literals_line = ""
        if is_code(self.typ):
            literals_line = f"** String Literals: {len(self.string_literals)}\n"
        return (rule
                + f"** Type: {self.typ.name}\n"
                + f"** Start: {self.start}\n"
                + f"** End: {self.end}\n"
                + literals_line
                + self.content.strip() + "\n"
                + rule)
def mince_js_fileobj(fileobj):
    """Mince JavaScript file object into code and comment blocks.

    Character-level state machine: tracks whether the scanner is in code,
    a single-/double-quoted string, a single-line comment, or a multi-line
    comment, and yields a JsBlock every time the classification flips
    between code (incl. string literals) and comment.  String literals
    found inside code are collected with their (line, cpos) positions.
    """
    line = 1
    cpos = 1
    escaped = False           # previous character was an unescaped backslash
    content = ""              # text accumulated for the current block
    block_start_line = line
    block_start_cpos = cpos
    state = JsBlockType.CODE_BLOCK
    string_literals = []
    current_string_literal = ""

    for char in get_next_character(fileobj):
        cpos += 1
        content += char
        suc_state = state
        if not escaped:
            if is_code_or_string_literal(state):
                if is_code(state):
                    if char == "'":
                        suc_state = JsBlockType.STRING_SQ
                    if char == '"':
                        suc_state = JsBlockType.STRING_DQ
                    if char == '/':
                        # Peek one character to tell '//' and '/*' from
                        # division.  A fresh generator is built, but it reads
                        # from the same fileobj, so this simply consumes the
                        # next character.
                        try:
                            next_char = next(get_next_character(fileobj))
                            if next_char == '/':
                                suc_state = JsBlockType.SINGLE_LINE_COMMENT
                            elif next_char == '*':
                                suc_state = JsBlockType.MULTI_LINE_COMMENT_BLOCK
                            # The two peeked characters are moved out of the
                            # current block; they become the start of the
                            # next block (consumed via next_content below).
                            # NOTE(review): when the '/' is a division
                            # operator, these two characters never make it
                            # back into `content` — a known "speed over
                            # correctness" limitation of this module.
                            next_content = content[-1] + next_char
                            content = content[:-1]
                            cpos -= 1
                            char = next_char
                        except StopIteration:
                            pass
                elif is_string_literal_dq(state):
                    if char == '"':
                        # Closing quote: record literal and return to code.
                        suc_state = JsBlockType.CODE_BLOCK
                        string_literals.append(((line, cpos),
                                                current_string_literal))
                        current_string_literal = ""
                    else:
                        current_string_literal += char
                elif is_string_literal_sq(state):
                    if char == "'":
                        suc_state = JsBlockType.CODE_BLOCK
                        string_literals.append(((line, cpos),
                                                current_string_literal))
                        current_string_literal = ""
                    else:
                        current_string_literal += char
                else:
                    raise Exception("Unknown state")
            elif is_comment(state):
                if is_comment_single_line(state):
                    if char == '\n':
                        suc_state = JsBlockType.CODE_BLOCK
                elif is_comment_multi_line(state):
                    if char == '*':
                        # Peek for the closing '/'.
                        try:
                            next_char = next(get_next_character(fileobj))
                            if next_char == '/':
                                suc_state = JsBlockType.CODE_BLOCK
                            content = content + next_char
                            cpos += 1
                            char = next_char
                        except StopIteration:
                            pass

        # Yield a block whenever the code/comment classification flips.
        if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
                is_code_or_string_literal(state) and is_comment(suc_state))):
            if content.strip():
                yield (JsBlock(state, (block_start_line, block_start_cpos),
                               (line, cpos), content, string_literals))
            if char == '\n':
                block_start_line = line + 1
                block_start_cpos = 1
            else:
                block_start_line = line
                block_start_cpos = cpos
            # next_content holds the '/X' pair that triggered a comment; it
            # is always bound here because the only way into a comment state
            # is the '/'-peek branch above, and the initial state is code.
            content = next_content
            next_content = ""
            string_literals = []

        if char == '\n':
            line += 1
            cpos = 1

        # A backslash escapes the following character (unless itself escaped).
        escaped = bool(char == '\\' and not escaped)
        state = suc_state

    # Flush the trailing block, if any.
    if content.strip():
        yield (JsBlock(state, (block_start_line, block_start_cpos),
                       (line, cpos), content, string_literals))


def mince_js_fileobj_slc_blocks(fileobj):
    """Mince JavaScript file object into code and comment blocks (join subsequent
    single line comments)."""
    for block in mince_js_fileobj(fileobj):
        if block.typ == JsBlockType.SINGLE_LINE_COMMENT:
            start = block.start
            end = block.end
            content = block.content
            single_block = False
            # NOTE(review): this inner call builds a second state machine
            # over the SAME fileobj to look ahead for further single-line
            # comments; positions restart at (1, 1) and the outer generator
            # is not advanced — confirm the intended interaction before
            # refactoring.
            for suc in mince_js_fileobj(fileobj):
                if suc.typ == JsBlockType.SINGLE_LINE_COMMENT:
                    content += suc.content
                    end = suc.end
                    single_block = True
                else:
                    if single_block:
                        yield (JsBlock(JsBlockType.SINGLE_LINE_COMMENT_BLOCK,
                                       start, end, content))
                    else:
                        yield block
                    content = ""
                    yield suc
                    break
            # Flush a trailing run of single-line comments at EOF.
            if content.strip() != "":
                yield (JsBlock(JsBlockType.SINGLE_LINE_COMMENT_BLOCK, start,
                               end, content))
        else:
            yield block
def mince_js_file(file):
    """Mince JavaScript file into code and comment blocks."""
    with open(file, encoding="utf-8") as fileobj:
        yield from mince_js_fileobj(fileobj)


def mince_js_file_slc_blocks(file):
    """Mince JavaScript file into code and comment blocks (join subsequent single
    line comments)."""
    with open(file, encoding="utf-8") as fileobj:
        yield from mince_js_fileobj_slc_blocks(fileobj)


def mince_js(file, single_line_comments_block=False):
    """Mince JavaScript file (either file name or open file object) into code and
    comment blocks. Subsequent comment line blocks can be minced into separate
    entities or merged."""
    if isinstance(file, str):
        mincer = (mince_js_file_slc_blocks
                  if single_line_comments_block else mince_js_file)
    else:
        mincer = (mince_js_fileobj_slc_blocks
                  if single_line_comments_block else mince_js_fileobj)
    return mincer(file)
    @contextmanager
    def restricted_request(self):
        """Context manager for a request that must run without concurrency.

        Drains every semaphore slot so no normal_request can be in flight,
        and rate-limits against the time of the last (normal) request.
        NOTE(review): the indentation of this copy is ambiguous; the lock is
        assumed to cover only admission (semaphore drain + sleep), not the
        yielded work — confirm against the repository original.
        """
        with self.lock:
            # Take all slots: existing normal requests finish, new ones block.
            for i in range(self.max_workers):
                self.sem.acquire()
            # Wait roughly 0.6-0.75s after the previous request (jittered).
            time.sleep(max(0.0, self.last_request.value + 0.6 + (random.random() * 0.15) - time.time()))
        try:
            yield
        except Exception as e:
            raise e
        finally:
            # Record completion times and hand the slots back.
            self.last_request.value = time.time()
            self.last_restricted_request.value = time.time()
            for i in range(self.max_workers):
                self.sem.release()
def value_of(value, default):
    """Get value, or *default* if value is None or the empty string.

    Fix: the original used ``value is not ""`` — an identity comparison
    with a literal that only works by accident of string interning (and is
    a SyntaxWarning on Python 3.8+); equality is used instead.
    """
    if value is not None and value != "":
        return value
    return default


def log_debug(msg, indent_level=0):
    """Log *msg* at DEBUG level, indented 4 spaces per level."""
    logging.debug(4 * indent_level * " " + str(msg))


def log_info(msg, indent_level=0):
    """Log *msg* at INFO level, indented 4 spaces per level."""
    logging.info(4 * indent_level * " " + str(msg))


def log_warning(msg, indent_level=0):
    """Log *msg* at WARNING level, indented 4 spaces per level."""
    logging.warning(4 * indent_level * " " + str(msg))


def log_error(msg, indent_level=0):
    """Log *msg* at ERROR level, indented 4 spaces per level."""
    logging.error(4 * indent_level * " " + str(msg))


def log_exception(msg, indent_level=0):
    """Log *msg* and the current traceback at ERROR level."""
    logging.error(4 * indent_level * " " + str(msg))
    for line in traceback.format_exc().splitlines():
        logging.error(4 * indent_level * " " + line)


def set_logger_tag(ext_id):
    """Re-format all root-logger handlers to tag records with *ext_id*."""
    logger = logging.getLogger()
    for handler in logger.handlers:
        handler.setFormatter(logging.Formatter(const_log_format(ext_id)))


def setup_logger(verbose):
    """Attach a stdout handler to the root logger.

    Level is INFO when *verbose*, else WARNING; the handler is tagged with
    a placeholder extension id until set_logger_tag is called with a real
    one.
    """
    if verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING

    logger = logging.getLogger()
    logger.setLevel(loglevel)
    ch = logging.StreamHandler(sys.stdout)
    logger.addHandler(ch)

    set_logger_tag("-" * 32)
def get_etag(headers_content):
    """Return the ETag value from a Python-repr'd HTTP headers dict.

    *headers_content* is the stored repr of the response headers; returns
    None when no ETag header is present.
    """
    headers = ast.literal_eval(headers_content)
    if "ETag" in headers:
        return headers["ETag"]
    return None
verbatim_lines += block.content.splitlines() 111 | 112 | for permission, evidences in permission_map.items(): 113 | for evidence in evidences: 114 | for line in verbatim_lines: 115 | if evidence in line: 116 | date_matches[permission] = True 117 | break 118 | 119 | if zipentry.filename == "manifest.json": 120 | with zf.open(zipentry) as m: 121 | raw_content = m.read() 122 | # There are some manifests that seem to have weird encodings... 123 | try: 124 | content = raw_content.decode("utf-8-sig") 125 | except UnicodeDecodeError: 126 | # Trying a different encoding, manifests are weird... 127 | content = raw_content.decode("latin1") 128 | 129 | manifest = json.loads(jsmin(content), strict=False) 130 | if "permissions" in manifest: 131 | for permission in manifest["permissions"]: 132 | used_permissions.add(str(permission)) 133 | 134 | if has_crx_file: 135 | line = [date, crx_etag, name, version, "+".join(categories), downloads] 136 | for permission in sorted(list(permission_map.keys())): 137 | if permission in used_permissions: 138 | if date_matches[permission]: 139 | line += ["REQ_AND_FOUND"] 140 | else: 141 | line += ["REQ_AND_NOT_FOUND"] 142 | else: 143 | if date_matches[permission]: 144 | line += ["NOT_REQ_AND_FOUND"] 145 | else: 146 | line += ["NOT_REQ_AND_NOT_FOUND"] 147 | results += [line] 148 | 149 | for result in results: 150 | csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)]) 151 | 152 | 153 | def main(conf): 154 | logger = logging.getLogger() 155 | ch = logging.StreamHandler(sys.stderr) 156 | ch.setFormatter(logging.Formatter(const_log_format())) 157 | logger.addHandler(ch) 158 | if conf.verbose: 159 | logger.setLevel(logging.DEBUG) 160 | else: 161 | logger.setLevel(logging.WARNING) 162 | 163 | with open(conf.MAP_FILE) as f: 164 | permission_map = json.load(f) 165 | 166 | with open(conf.EXTID_FILE) as f: 167 | csvwriter = csv.writer(sys.stdout, csv.unix_dialect) 168 | csvwriter.writerow(["extid", "still_in_store", 
"most_recent_crx_etag", "date", "crx_etag", "name", "version", "categories", "downloads"] 169 | + sorted(list(permission_map.keys()))) 170 | for extid in [l.strip() for l in f.readlines()]: 171 | try: 172 | handle_extid(conf, extid, permission_map, csvwriter) 173 | except Exception as e: 174 | logging.exception(f"Fatal error when handling extension '{extid}'") 175 | 176 | 177 | def build_parser(): 178 | main_parser = argparse.ArgumentParser( 179 | formatter_class=argparse.RawTextHelpFormatter, 180 | description='Search extensions for unused permissions') 181 | main_parser.add_argument( 182 | 'MAP_FILE', 183 | help='json file with permission - literal string mapping') 184 | main_parser.add_argument( 185 | 'EXTID_FILE', 186 | help='file with extension ids') 187 | main_parser.add_argument( 188 | '-v', 189 | '--verbose', 190 | action='store_true', 191 | default=False, 192 | help='increase verbosity') 193 | 194 | 195 | main_parser.add_argument( 196 | '-D', 197 | '--latest-date', 198 | metavar='DATE', 199 | type=str, 200 | help='select latest crx from tar, released before DATE.\n' + 201 | 'Together with --from-date, specifies all crx released in specified\n' + 202 | 'date range.') 203 | 204 | main_parser.add_argument( 205 | '-d', 206 | '--from-date', 207 | metavar='DATE', 208 | type=str, 209 | help='select oldest crx from tar released after DATE.\n' + 210 | 'Together with --latest-date, specifies all crx released in specified\n' + 211 | 'date range.') 212 | 213 | main_parser.add_argument( 214 | '-a', 215 | '--archive-dir', 216 | metavar='archive', 217 | type=str, 218 | default=const_basedir(), 219 | help='archive directory') 220 | 221 | return main_parser 222 | 223 | 224 | if __name__ == "__main__": 225 | main_parser = build_parser() 226 | 227 | main_conf = main_parser.parse_args() 228 | 229 | sys.exit(main(main_conf)) 230 | -------------------------------------------------------------------------------- /README.md: 
# ExtensionCrawler

A collection of utilities for downloading and analyzing browser
extensions from the Chrome Web Store.

* `crawler`: A crawler for extensions from the Chrome Web Store.
* `crx-tool`: A tool for analyzing and extracting `*.crx` files
  (i.e., Chrome extensions). Calling `crx-tool.py <path>.crx`
  will check the integrity of the extension.
* `crx-extract`: A simple tool for extracting `*.crx` files from the
  tar-based archive hierarchy.
* `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
  JavaScript decomposition analysis.
* `crx-jsstrings`: A tool for extracting code blocks, comment blocks, and
  string literals from JavaScript.
* `create-db`: A tool for updating a remote MariaDB from already
  existing extension archives.

The utilities store the extensions in the following directory
hierarchy:

```shell
archive
├── conf
│   └── forums.conf
├── data
│   └── ...
└── log
    └── ...
```

The crawler downloads the most recent extension (i.e., the `*.crx`
file) as well as the overview page. In addition, the `conf` directory
may contain one file, called `forums.conf`, that lists the ids of
extensions for which the forums and support pages should be downloaded
as well. The `data` directory will contain the downloaded extensions.

The `crawler` and `create-db` scripts will access and update a MariaDB.
They will use the host, database, and credentials found in `~/.my.cnf`.
Since they make use of various JSON features, it is recommended to use at
least version 10.2.8 of MariaDB.

All utilities are written in Python 3.7. The required modules are listed
in the file `requirements.txt`.
def execute(q, args=None):
    """Run SQL query *q* with parameters *args*, yielding rows.

    Results are cached on disk in mysqlcache.tmp; cache hits avoid the
    database entirely.  Fix: the cache was previously keyed on the query
    text alone, so the same query issued with different parameters
    returned stale rows — the key now includes the parameters (old cache
    files simply miss and are re-fetched).
    """
    cachepath = "mysqlcache.tmp"
    cache = {}
    if os.path.exists(cachepath):
        with open(cachepath, 'rb') as f:
            try:
                cache = pickle.load(f)
            except Exception as e:
                # Corrupt cache file: fall through to an empty cache.
                print(e)

    # Parameters are normalized to a hashable tuple for the cache key.
    key = (q, None if args is None else tuple(args))
    if key in cache:
        print("retrieving query results from cache...")
        for row in cache[key]:
            yield row
    else:
        print("query not in cache, contacting db ...")
        # SSCursor streams rows instead of buffering the full result set.
        db = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), cursorclass=cursors.SSCursor)
        cursor = db.cursor()
        cursor.execute(q, args)

        result = []
        for row in cursor:
            result += [row]
            yield row
        # Only a fully-consumed query gets persisted.
        cache[key] = result
        with open(cachepath, 'wb') as f:
            pickle.dump(cache, f)
        print("cache saved")
# Map each angular.js release version to the set of md5 digests of its
# normalized (comment/whitespace-stripped) distribution files in cdnjs.
for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='angular.js' and (filename in ('angular.js', 'angular.min.js'))"):
    if version not in vuln_md5s:
        vuln_md5s[version] = set()
    vuln_md5s[version].add(md5)

# (version, md5-set) pairs sorted newest-first, so lookups below prefer the
# most recent release when a digest occurs in several versions.
sorted_vuln_md5s = []
for library_version in sorted(vuln_md5s.keys(), key=LooseVersion)[::-1]:
    sorted_vuln_md5s += [(library_version, vuln_md5s[library_version])]


def get_angular_version(md5):
    # Linear scan, newest release first; returns None (implicitly) when the
    # digest matches no known angular.js file.
    for library_version, md5s in sorted_vuln_md5s:
        if md5 in md5s:
            return library_version

# For every extension, determine the bundled angular.js version per crx
# update and print only extensions whose detected version changed over time.
for extid, g in groupby(execute("select extid, crx_etag, date, md5 from extension_update_most_recent join crxfile using (crx_etag) where typ='NORMALIZED' order by extid, date, crx_etag"), lambda x: x[0]):
    result = {}

    # NOTE(review): the inner loop variable deliberately shadows the outer
    # `g`; each crx_etag group is fully consumed before the next begins.
    for crx_etag, g in groupby(map(lambda x: x[1:], g), lambda x: x[0]):
        result_version = None
        for date, md5, in map(lambda x: x[1:], g):
            version = get_angular_version(md5)
            if version is not None and (result_version is None or LooseVersion(version) > LooseVersion(result_version)):
                result_version = version
        # NOTE(review): `date` here is the last date seen in the group —
        # presumably each crx_etag group shares one date; confirm.
        result[date] = result_version

    if len(set(result.values())) > 1:
        for date in sorted(result.keys()):
            print(f"{extid}|{date}|{result[date]}")
15:12:44 +0200 13 | 1.6.5,2017-07-03 22:34:52 +0300 14 | 1.6.4,2017-03-31 10:48:25 +0200 15 | 1.6.3,2017-03-08 12:44:24 +0100 16 | 1.6.2,2017-02-05 17:58:25 +0200 17 | 1.6.1,2016-12-23 10:38:58 +0000 18 | 1.6.0,2016-12-08 11:07:52 +0000 19 | 1.6.0-rc.2,2016-11-24 21:30:56 +0000 20 | 1.6.0-rc.1,2016-11-21 13:27:47 +0000 21 | 1.6.0-rc.0,2016-10-27 20:28:09 +0100 22 | 1.5.11,2017-01-12 11:22:40 +0200 23 | 1.5.10,2016-12-16 12:27:04 +0200 24 | 1.5.9,2016-11-24 09:27:57 +0000 25 | 1.5.8,2016-07-22 16:01:46 +0100 26 | 1.5.7,2016-06-14 08:08:25 -0700 27 | 1.5.6,2016-05-25 17:00:13 +0100 28 | 1.5.5,2016-04-15 14:09:39 +0100 29 | 1.5.4,2016-04-14 09:13:48 +0100 30 | 1.5.3,2016-03-25 20:01:45 +0000 31 | 1.5.2,2016-03-18 15:37:43 -0700 32 | 1.5.1,2016-03-14 14:45:29 +0000 33 | 1.5.0,2016-02-05 10:04:17 +0000 34 | 1.5.0-rc.2,2016-01-28 09:51:01 +0000 35 | 1.5.0-rc.1,2016-01-15 20:31:08 +0000 36 | 1.5.0-rc.0,2015-12-09 13:50:58 +0000 37 | 1.5.0-beta.2,2015-11-17 15:57:27 -0800 38 | 1.5.0-beta.1,2015-09-29 13:59:34 -0700 39 | 1.5.0-beta.0,2015-09-17 13:42:10 +0100 40 | 1.4.14,2016-10-11 14:11:08 +0100 41 | 1.4.13,2016-10-10 22:02:52 +0100 42 | 1.4.12,2016-06-07 10:44:56 +0200 43 | 1.4.11,2016-05-24 16:44:11 +0200 44 | 1.4.10,2016-03-14 17:27:49 -0400 45 | 1.4.9,2016-01-20 10:11:04 -0800 46 | 1.4.8,2015-11-19 14:52:56 -0800 47 | 1.4.7,2015-09-29 13:54:51 -0700 48 | 1.4.6,2015-09-14 22:43:55 +0200 49 | 1.4.5,2015-08-28 12:06:35 -0700 50 | 1.4.4,2015-08-13 11:15:10 -0700 51 | 1.4.3,2015-07-14 18:26:10 -0700 52 | 1.4.2,2015-07-02 14:36:49 +0300 53 | 1.4.1,2015-06-15 20:50:59 +0200 54 | 1.4.0,2015-05-26 17:34:50 -0700 55 | 1.4.0-rc.2,2015-05-07 14:33:28 -0700 56 | 1.4.0-rc.1,2015-04-24 11:26:10 -0700 57 | 1.4.0-rc.0,2015-04-10 10:44:35 -0700 58 | 1.4.0-beta.6,2015-03-15 21:00:39 +0000 59 | 1.4.0-beta.5,2015-02-24 17:22:13 +0000 60 | 1.4.0-beta.4,2015-02-07 10:26:21 +0000 61 | 1.4.0-beta.3,2015-02-03 19:46:22 +0100 62 | 1.4.0-beta.2,2015-01-26 14:50:48 -0800 63 | 
1.4.0-beta.1,2015-01-20 19:42:59 +0100 64 | 1.4.0-beta.0,2015-01-14 20:44:32 +0000 65 | 1.2.32,2016-10-11 13:48:38 +0100 66 | 1.2.31,2016-10-11 07:48:26 +0100 67 | 1.2.30,2016-07-20 23:17:37 +0300 68 | 1.2.29,2015-09-29 13:18:52 -0700 69 | 1.2.28,2014-12-13 21:28:02 -0500 70 | 1.2.27,2014-11-20 14:34:26 -0800 71 | 1.2.26,2014-10-02 09:46:40 -0700 72 | 1.2.25,2014-09-16 15:05:22 -0700 73 | 1.2.24,2014-09-09 16:21:16 -0700 74 | 1.2.23,2014-08-22 15:56:49 -0700 75 | 1.2.22,2014-08-11 17:04:40 +0100 76 | 1.2.21,2014-07-25 09:01:43 -0700 77 | 1.2.20,2014-07-11 11:26:39 -0700 78 | 1.2.19,2014-06-30 16:58:15 -0700 79 | 1.2.18,2014-06-13 13:55:33 -0700 80 | 1.2.17,2014-06-06 20:13:16 +0100 81 | 1.2.16,2014-04-03 14:42:19 -0700 82 | 1.2.15,2014-03-21 14:58:48 -0700 83 | 1.3.20,2015-09-29 13:54:03 -0700 84 | 1.3.19,2015-09-15 13:34:09 +0100 85 | 1.3.18,2015-08-18 15:14:56 -0700 86 | 1.3.17,2015-07-01 12:16:14 -0700 87 | 1.3.16,2015-06-05 13:29:27 -0700 88 | 1.3.15,2015-03-15 21:01:49 +0000 89 | 1.3.14,2015-02-24 17:22:45 +0000 90 | 1.3.13,2015-02-07 19:21:53 +0100 91 | 1.3.12,2015-02-02 14:03:17 +0000 92 | 1.3.11,2015-01-26 14:20:52 -0800 93 | 1.3.10,2015-01-20 19:31:56 +0100 94 | 1.3.9,2015-01-13 14:29:29 -0500 95 | 1.3.8,2014-12-19 13:22:00 -0800 96 | 1.3.7,2014-12-15 13:46:21 +0000 97 | 1.3.6,2014-12-08 16:29:39 -0500 98 | 1.3.5,2014-12-01 19:54:14 +0100 99 | 1.3.4,2014-11-25 00:05:18 +0100 100 | 1.3.3,2014-11-17 09:32:21 -0800 101 | 1.3.2,2014-11-07 13:22:01 -0500 102 | 1.3.1,2014-10-31 12:28:58 -0400 103 | 1.3.0,2014-10-13 15:27:20 -0700 104 | 1.3.0-rc.5,2014-10-08 15:51:30 -0700 105 | 1.3.0-rc.4,2014-10-01 17:37:40 -0700 106 | 1.3.0-rc.3,2014-09-23 18:47:24 -0700 107 | 1.3.0-rc.2,2014-09-16 14:52:25 -0700 108 | 1.3.0-rc.1,2014-09-09 15:45:51 -0700 109 | 1.3.0-rc.0,2014-08-29 21:22:46 -0400 110 | 1.3.0-beta.19,2014-08-22 15:57:26 -0700 111 | 1.3.0-beta.18,2014-08-11 16:54:40 +0100 112 | 1.3.0-beta.17,2014-07-25 16:37:53 +0100 113 | 1.3.0-beta.16,2014-07-18 12:18:26 
-0700 114 | 1.3.0-beta.15,2014-07-11 11:15:42 -0700 115 | 1.3.0-beta.14,2014-06-30 09:52:32 -0700 116 | 1.3.0-beta.13,2014-06-16 10:47:09 -0700 117 | 1.3.0-beta.12,2014-06-13 13:41:18 -0700 118 | 1.3.0-beta.11,2014-06-06 20:22:50 +0100 119 | 1.3.0-beta.10,2014-05-23 15:08:36 -0700 120 | 1.3.0-beta.9,2014-05-16 15:14:12 -0700 121 | 1.3.0-beta.8,2014-05-09 14:42:26 +0100 122 | 1.3.0-beta.7,2014-04-25 15:00:17 -0700 123 | 1.3.0-beta.6,2014-04-21 15:57:08 -0700 124 | 1.3.0-beta.5,2014-04-03 14:46:15 -0700 125 | 1.3.0-beta.4,2014-03-28 17:43:17 -0400 126 | 1.3.0-beta.3,2014-03-21 11:16:35 -0700 127 | 1.3.0-beta.2,2014-03-14 16:26:40 -0700 128 | 1.3.0-beta.1,2014-03-07 16:23:14 -0800 129 | 1.2.14,2014-03-01 09:51:19 -0800 130 | 1.2.13,2014-02-14 16:41:02 -0800 131 | 1.2.12,2014-02-07 17:00:28 -0500 132 | 1.2.11,2014-02-03 09:40:03 -0800 133 | 1.2.10,2014-01-24 15:28:28 -0800 134 | 1.2.9,2014-01-15 10:02:10 -0800 135 | 1.2.8,2014-01-10 12:37:49 -0800 136 | 1.2.7,2014-01-03 10:28:30 -0800 137 | 1.2.6,2013-12-19 15:50:07 -0800 138 | 1.2.5,2013-12-13 10:52:13 -0800 139 | 1.2.4,2013-12-06 13:14:56 -0500 140 | 1.2.3,2013-11-27 10:04:59 +0000 141 | 1.2.2,2013-11-22 09:05:42 -0800 142 | 1.2.1,2013-11-14 22:33:20 -0800 143 | 1.2.0,2013-11-08 09:40:09 -0800 144 | 1.2.0-rc.3,2013-10-14 10:36:23 -0700 145 | 1.2.0-rc.2,2013-09-04 14:50:39 +0200 146 | 1.2.0rc1,2013-08-13 11:50:32 -0700 147 | 1.1.5,2013-05-22 01:05:11 -0700 148 | 1.1.4,2013-04-03 18:54:52 -0700 149 | 1.1.3,2013-02-20 12:54:44 -0800 150 | 1.1.2,2013-01-23 10:54:35 -0800 151 | 1.1.1,2012-11-27 01:45:35 +0100 152 | 1.1.0,2012-09-04 11:11:09 -0700 153 | 1.0.8,2013-08-22 11:20:23 -0700 154 | 1.0.7,2013-05-22 01:05:53 -0700 155 | 1.0.6,2013-04-04 10:48:05 -0700 156 | 1.0.5,2013-02-20 12:58:02 -0800 157 | 1.0.4,2013-01-23 10:57:51 -0800 158 | 1.0.3,2012-11-27 01:44:46 +0100 159 | 1.0.2,2012-09-04 11:08:40 -0700 160 | 1.0.1,2012-06-25 09:30:57 -0700 161 | 1.0.0,2012-06-14 10:50:22 -0700 162 | 1.0.0rc12,2012-06-12 01:46:02 
-0700 163 | 1.0.0rc11,2012-06-11 00:03:01 -0700 164 | 1.0.0rc10,2012-05-23 21:05:21 -0700 165 | 1.0.0rc9,2012-05-14 22:13:15 -0700 166 | 1.0.0rc8,2012-05-07 00:09:20 -0700 167 | 1.0.0rc7,2012-04-30 16:32:45 -0700 168 | 1.0.0rc6,2012-04-20 15:06:39 -0700 169 | 1.0.0rc5,2012-04-12 03:56:28 -0700 170 | 1.0.0rc4,2012-04-05 11:46:36 -0700 171 | 1.0.0rc3,2012-03-29 16:10:40 -0700 172 | 1.0.0rc2,2012-03-20 15:38:57 -0700 173 | g3-v1.0.0rc1,2012-03-16 12:06:29 -0700 174 | 1.0.0rc1,2012-03-14 01:00:46 -0700 175 | 0.10.6,2012-01-17 13:54:18 -0800 176 | 0.10.5,2011-11-08 04:29:07 -0800 177 | 0.10.4,2011-10-22 21:39:39 -0700 178 | 0.10.3,2011-10-14 08:31:39 -0700 179 | 0.10.2,2011-10-08 09:18:19 -0700 180 | 0.10.1,2011-09-09 01:01:46 -0700 181 | 0.10.0,2011-09-02 11:32:29 -0700 182 | 0.9.19,2011-08-21 01:12:34 -0700 183 | 0.9.18,2011-07-29 16:30:24 -0700 184 | 0.9.17,2011-06-30 09:10:59 -0700 185 | 0.9.16,2011-06-07 16:11:01 -0700 186 | 0.9.15,2011-04-11 14:23:26 -0700 187 | 0.9.14,2011-04-01 12:26:04 -0700 188 | 0.9.13,2011-03-13 22:48:26 -0700 189 | 0.9.12,2011-03-03 23:14:43 -0800 190 | 0.9.11,2011-02-08 17:47:31 -0800 191 | 0.9.10,2011-01-26 23:51:06 -0800 192 | 0.9.9,2011-01-13 22:08:27 -0800 193 | 0.9.7,2010-12-10 17:08:52 -0800 194 | 0.9.6,2010-12-06 21:11:10 -0800 195 | 0.9.5,2010-11-25 10:11:26 -0800 196 | 0.9.4,2010-11-18 22:40:01 -0800 197 | 0.9.3,2010-11-10 22:15:16 -0800 198 | 0.9.2,2010-11-03 13:06:45 -0700 199 | 0.9.1,2010-10-26 22:18:25 -0700 200 | 0.9.0,2010-10-20 15:51:36 -0700 201 | -------------------------------------------------------------------------------- /analysis/library-detector/angular/ideas.txt: -------------------------------------------------------------------------------- 1 | start with current version & never update 2 | start with outdated version & never update 3 | update frequently 4 | downgrade 5 | 6 | 7 | angular is transitive dep 8 | own dep 9 | -------------------------------------------------------------------------------- 
/analysis/library-detector/angular/plotting.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | from dateutil import parser 4 | from distutils.version import LooseVersion 5 | 6 | 7 | import numpy as np 8 | from matplotlib import pyplot as plt 9 | import matplotlib.patches as mpatches 10 | 11 | def get_cmap(n, name='hsv'): 12 | '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 13 | RGB color; the keyword argument name must be a standard mpl colormap name.''' 14 | return plt.cm.get_cmap(name, n) 15 | 16 | 17 | plt.figure(figsize=(20, 100)) 18 | 19 | data = {} 20 | with open(sys.argv[1]) as f: 21 | for line in f.readlines()[0:5000]: 22 | line = line.strip() 23 | extid, ts, vers = line.split(",") 24 | if extid not in data: 25 | data[extid] = {} 26 | data[extid][parser.parse(ts).date()] = vers 27 | 28 | startdate = datetime.date(year=2017, month=2, day=1) 29 | enddate = datetime.date(year=2018, month=12, day=13) 30 | NOT_IN_STORE = "NO DATA" 31 | 32 | 33 | 34 | converted_data = {} 35 | versions = set() 36 | for extid, tups in data.items(): 37 | days_version_tups = [(0, NOT_IN_STORE)] 38 | for ts, vers in sorted(tups.items()): 39 | if vers != "None": 40 | versions.add(vers) 41 | #if vers != days_version_tups[-1][1]: 42 | days_version_tups += [((ts - startdate).days, vers)] 43 | converted_data[extid] = days_version_tups 44 | 45 | converted_data["angular_updates"] = [(0, NOT_IN_STORE)] 46 | version_release = {} 47 | with open(sys.argv[2]) as f: 48 | for line in f.readlines(): 49 | line = line.strip() 50 | vers, ts_str = line.split(",") 51 | ts = parser.parse(ts_str).date() 52 | version_release[vers] = ts 53 | if startdate < ts and ts < enddate: 54 | converted_data["angular_updates"] += [((ts - startdate).days, vers)] 55 | 56 | converted_data["angular_updates"].sort() 57 | 58 | 59 | colors = {} 60 | for i, version in enumerate(sorted(versions, key=version_release.get)): 61 | 
#colors[version] = get_cmap(len(versions))(i) 62 | colors[version] = plt.cm.jet(1. * i / ((len(versions)) - 1)) 63 | for version, color in colors.items(): 64 | print(f"{version}: {color}") 65 | 66 | bottoms = np.arange(len(converted_data)) 67 | 68 | sorted_data = sorted(list(converted_data.items()), key=lambda x: min(map(lambda y: y[1], x[1]))) 69 | 70 | for i in range(len(converted_data.items())): 71 | extid, tups = sorted_data[i] 72 | for j in range(len(tups)): 73 | days, vers = tups[j] 74 | if j + 1 == len(tups): 75 | next_days = (enddate - startdate).days 76 | else: 77 | next_days = tups[j + 1][0] 78 | print(f"{extid}: {days}") 79 | #print(f"{vers} and {colors[vers]}") 80 | color = "w" 81 | if vers in colors: 82 | color = colors[vers] 83 | plt.bar(days, 0.8, width=next_days - days, bottom=bottoms[i], 84 | color=color, orientation="horizontal", label=vers, linewidth=1, edgecolor="black") 85 | plt.yticks(bottoms, map(lambda x: x[0], sorted(list(converted_data.items()), key=lambda x: min(map(lambda y: y[1], x[1]))))) 86 | 87 | patchList = [] 88 | for version, color in sorted(colors.items(), key=lambda x: LooseVersion(x[0])): 89 | data_key = mpatches.Patch(color=color, label=version) 90 | patchList.append(data_key) 91 | 92 | plt.legend(handles=patchList, loc="best", bbox_to_anchor=(1.0, 1.00)) 93 | 94 | 95 | plt.subplots_adjust(right=0.85) 96 | plt.savefig("out.pdf") 97 | -------------------------------------------------------------------------------- /analysis/library-detector/jquery.py: -------------------------------------------------------------------------------- 1 | import MySQLdb 2 | from MySQLdb import cursors 3 | import os 4 | from distutils.version import LooseVersion 5 | from itertools import groupby, islice 6 | import datetime 7 | import pickle 8 | 9 | def execute(q, args=None): 10 | cachepath = "mysqlcache.tmp" 11 | cache = {} 12 | if os.path.exists(cachepath): 13 | with open(cachepath, 'rb') as f: 14 | try: 15 | cache = pickle.load(f) 16 | except 
Exception as e: 17 | print(e) 18 | 19 | if q in cache: 20 | print("retrieving query results from cache...") 21 | for row in cache[q]: 22 | yield row 23 | else: 24 | print("query not in cache, contacting db ...") 25 | db = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), cursorclass=cursors.SSCursor) 26 | cursor = db.cursor() 27 | cursor.execute(q, args) 28 | 29 | result = [] 30 | for row in cursor: 31 | result += [row] 32 | yield row 33 | cache[q] = result 34 | with open(cachepath, 'wb') as f: 35 | pickle.dump(cache, f) 36 | print("cache saved") 37 | 38 | vuln_md5s = set() 39 | 40 | # for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='jquery'"): 41 | # if LooseVersion(version) < LooseVersion('1.6.3'): 42 | # vuln_md5s.add(md5) 43 | for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='angular.js'"): 44 | if LooseVersion(version) < LooseVersion('1.6.9'): 45 | vuln_md5s.add(md5) 46 | print(f"found {len(vuln_md5s)} MD5s") 47 | 48 | hits = 0 49 | still_vuln = 0 50 | for extid, g in groupby(execute("select extid, crx_etag, date, md5 from extension_update_most_recent join crxfile using (crx_etag) where typ='NORMALIZED' order by extid, date, crx_etag"), lambda x: x[0]): 51 | ext_is_vuln = False 52 | for crx_etag, g in groupby(map(lambda x: x[1:], g), lambda x: x[0]): 53 | is_vuln = False 54 | for date, md5, in map(lambda x: x[1:], g): 55 | if md5 in vuln_md5s: 56 | is_vuln = True 57 | break 58 | 59 | if not is_vuln and ext_is_vuln: 60 | print(f"{extid} got fixed in {crx_etag} on {date}!") 61 | hits += 1 62 | ext_is_vuln = is_vuln 63 | if is_vuln and date > datetime.datetime(year=2018, month=11, day=14): 64 | print(f"{extid} in {crx_etag} is still vulnerable as of {date}") 65 | still_vuln += 1 66 | 67 | print(f"# fixes: {hits}") 68 | print(f"# still vulnerable: {still_vuln}") 69 | 70 | 
-------------------------------------------------------------------------------- /cdnjs-git-miner: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2016,2017 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 17 | # 18 | # SPDX-License-Identifier: GPL-3.0-or-later 19 | """ Tool for mining the cdnjs git repository""" 20 | 21 | import getopt 22 | import logging 23 | import sys 24 | import os 25 | 26 | from ExtensionCrawler.config import (const_log_format, const_basedir) 27 | from ExtensionCrawler.cdnjs_git import (pull_and_update_db, update_db_all_libs, 28 | update_db_from_listfile) 29 | 30 | 31 | def helpmsg(): 32 | """Print help message.""" 33 | print("cdnjs-git-miner [OPTION]") 34 | print( 35 | " -i initialize/update database with all libraries in the repository" 36 | ) 37 | print(" -u update: pull repository and update database") 38 | print( 39 | " -l read list of libraries to update from file (recusively)" 40 | ) 41 | print(" -n process chunk n where n in [1,N]") 42 | print(" -N ") 43 | print(" -v verbose") 44 | print( 45 | " -c print csv format to stdout instead of writing to database" 46 | ) 47 | print(" -a= archive directory") 48 | print(" -h print this help text") 49 | 50 | 51 | def main(argv): 52 | """Main function of the extension 
crawler.""" 53 | basedir = const_basedir() 54 | verbose = False 55 | initialize = False 56 | update = False 57 | taskid = 1 58 | listfile = None 59 | maxtaskid = 1 60 | csv = False 61 | 62 | try: 63 | opts, args = getopt.getopt(argv, "hvicl:ua:p:n:N:", [ 64 | "archive=", "listupdate=", "taskid=", "maxtaskid=" 65 | ]) 66 | except getopt.GetoptError: 67 | helpmsg() 68 | sys.exit(2) 69 | for opt, arg in opts: 70 | if opt == '-h': 71 | helpmsg() 72 | sys.exit() 73 | elif opt == '-v': 74 | verbose = True 75 | elif opt in ("-l", "--listupdate"): 76 | listfile = arg 77 | elif opt in ("-a", "--archive"): 78 | basedir = arg 79 | elif opt == '-i': 80 | initialize = True 81 | elif opt == '-u': 82 | update = True 83 | elif opt == '-c': 84 | csv = True 85 | elif opt in ("-n", "--taskid"): 86 | taskid = int(arg) 87 | elif opt in ("-N", "--maxtaskid"): 88 | maxtaskid = int(arg) 89 | 90 | if verbose: 91 | loglevel = logging.INFO 92 | else: 93 | loglevel = logging.WARNING 94 | 95 | logger = logging.getLogger() 96 | ch = logging.StreamHandler(sys.stdout) 97 | ch.setFormatter(logging.Formatter(const_log_format())) 98 | logger.addHandler(ch) 99 | logger.setLevel(loglevel) 100 | 101 | cdnjs_git_path = os.path.join(os.path.join(basedir, "filedb"), "cdnjs-git") 102 | 103 | if initialize: 104 | logging.info("Starting update of all db libs") 105 | update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid) 106 | logging.info("Finished update of all db libs") 107 | if update: 108 | logging.info("Starting update of new db libs") 109 | pull_and_update_db(cdnjs_git_path, csv) 110 | logging.info("Finished update of new db libs") 111 | if listfile is not None: 112 | logging.info("Starting update from list file") 113 | update_db_from_listfile(cdnjs_git_path, listfile, csv) 114 | logging.info("Finished update from list file") 115 | 116 | logging.info("Successfully updated cdnjs table") 117 | 118 | 119 | if __name__ == "__main__": 120 | main(sys.argv[1:]) 121 | 
-------------------------------------------------------------------------------- /crawler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2016-2017 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 17 | # 18 | # SPDX-License-Identifier: GPL-3.0-or-later 19 | """ 20 | A crawler for extensions from the Chrome Web Store. 
21 | """ 22 | 23 | import sys 24 | import datetime 25 | import time 26 | import getopt 27 | import logging 28 | import itertools 29 | import multiprocessing 30 | from functools import reduce 31 | from ExtensionCrawler.discover import get_new_ids 32 | from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions 33 | from ExtensionCrawler.config import * 34 | from ExtensionCrawler.util import log_info, log_exception, setup_logger 35 | 36 | 37 | def write_log(dirname, fname, text): 38 | """Write text into the file with name fname in directory dirname.""" 39 | os.makedirs(dirname, exist_ok=True) 40 | fname = fname.replace(":", "_") 41 | with open(os.path.join(dirname, fname), 'w') as logfile: 42 | logfile.write(text) 43 | 44 | 45 | def log_failures_to_file(dirname, today, res): 46 | """Log failures during download/update in the log directory dirname.""" 47 | not_authorized = "\n".join(sorted([x.ext_id for x in res if x.not_authorized()])) 48 | write_log(dirname, today + "-not-authorized.log", not_authorized) 49 | 50 | updated = "\n".join(sorted([x.ext_id for x in res if x.is_ok() and not x.not_modified()])) 51 | write_log(dirname, today + "-updated.log", updated) 52 | 53 | has_exception = "\n".join(sorted([x.ext_id for x in res if x.has_exception()])) 54 | write_log(dirname, today + "-raised-exception.log", has_exception) 55 | 56 | raised_ddos = "\n".join(sorted([x.ext_id for x in res if x.raised_google_ddos()])) 57 | write_log(dirname, today + "-raised-ddos.log", raised_ddos) 58 | 59 | not_in_store = "\n".join(sorted([x.ext_id for x in res if x.not_in_store()])) 60 | write_log(dirname, today + "-not-in-store.log", not_in_store) 61 | 62 | new = "\n".join(sorted([x.ext_id for x in res if x.is_new()])) 63 | write_log(dirname, today + "-new-in-store.log", new) 64 | 65 | file_corruption = "\n".join(sorted([x.ext_id for x in res if x.corrupt_tar()])) 66 | write_log(dirname, today + "-file-corruption.log", file_corruption) 67 | 68 | 
sql_exception = "\n".join(sorted([x.ext_id for x in res if x.sql_exception()])) 69 | write_log(dirname, today + "-sql-exception.log", sql_exception) 70 | 71 | worker_exception = "\n".join(sorted([x.ext_id for x in res if x.worker_exception])) 72 | write_log(dirname, today + "-worker-exception.log", worker_exception) 73 | 74 | sql_fail = "\n".join(sorted([x.ext_id for x in res if not x.sql_success()])) 75 | write_log(dirname, today + "-sql-not-updated.log", sql_fail) 76 | 77 | 78 | def log_summary(res, runtime=0): 79 | """Log brief result summary.""" 80 | 81 | corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res)) 82 | 83 | log_info("Summary:") 84 | log_info(" Updated {} out of {} extensions successfully".format(str(len(list(filter(lambda x: x.is_ok(), res)))), 85 | str(len(res)))) 86 | log_info(" Updated extensions: {:8d}".format( 87 | len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res))))) 88 | log_info(" Updated SQL databases: {:8d}".format(len(list(filter(lambda x: x.sql_success(), res))))) 89 | log_info(" New extensions: {:8d}".format(len(list(filter(lambda x: x.is_new(), res))))) 90 | log_info(" Not authorized: {:8d}".format(len(list(filter(lambda x: x.not_authorized(), res))))) 91 | log_info(" Raised Google DDOS: {:8d}".format(len(list(filter(lambda x: x.raised_google_ddos(), res))))) 92 | log_info(" Not modified archives: {:8d}".format(len(list(filter(lambda x: x.not_modified(), res))))) 93 | log_info(" Extensions not in store: {:8d}".format(len(list(filter(lambda x: x.not_in_store(), res))))) 94 | log_info(" Unknown exception: {:8d}".format(len(list(filter(lambda x: x.has_exception(), res))))) 95 | log_info(" Corrupt tar archives: {:8d}".format(len(corrupt_tar_archives))) 96 | log_info(" SQL exception: {:8d}".format(len(list(filter(lambda x: x.sql_exception(), res))))) 97 | log_info( 98 | " Worker exception: {:8d}".format(len(list(filter(lambda x: x.worker_exception is not None, res))))) 99 | log_info(" Total runtime: 
{}".format(str(datetime.timedelta(seconds=int(runtime))))) 100 | 101 | if corrupt_tar_archives: 102 | log_info("") 103 | log_info("List of extensions with corrupted files/archives:") 104 | for x in corrupt_tar_archives: 105 | log_info("{}: {}".format(x.ext_id, x.exception), 1) 106 | log_info("") 107 | 108 | 109 | def helpmsg(): 110 | """Print help message.""" 111 | print("crawler [OPTION]") 112 | print(" -h print this help text") 113 | print(" -s silent (no log messages)") 114 | print(" -d discover new extensions") 115 | print(" -p number of concurrent downloads") 116 | print(" -a archive directory") 117 | print( 118 | " -t timeout for an individual extension download") 119 | print(" --max-discover discover at most N new extensions") 120 | print(" --pystuck start pystuck server for all processes") 121 | 122 | 123 | def print_config(basedir, archive_dir, conf_dir, discover, parallel, 124 | ext_timeout, start_pystuck): 125 | """Print current configuration.""" 126 | log_info("Configuration:") 127 | log_info(" Base dir: {}".format(basedir)) 128 | log_info(" Archive directory: {}".format(archive_dir)) 129 | log_info(" Configuration directory: {}".format(conf_dir)) 130 | log_info(" Discover new extensions: {}".format(discover)) 131 | log_info(" Max num. of concurrent downloads: {}".format(parallel)) 132 | log_info(" Download timeout: {}".format(ext_timeout)) 133 | log_info(" Start PyStuck: {}".format(start_pystuck)) 134 | 135 | 136 | def parse_args(argv): 137 | """Parse command line arguments. 
""" 138 | basedir = const_basedir() 139 | parallel = const_parallel_downloads() 140 | verbose = const_verbose() 141 | discover = const_discover() 142 | ext_timeout = const_ext_timeout() 143 | max_discover = None 144 | start_pystuck = False 145 | try: 146 | opts, _ = getopt.getopt( 147 | argv, "hsda:p:t:", 148 | ["timeout=", "archive=", 'parallel=', 'max-discover=', 'pystuck']) 149 | except getopt.GetoptError: 150 | helpmsg() 151 | sys.exit(2) 152 | for opt, arg in opts: 153 | if opt == '-h': 154 | helpmsg() 155 | sys.exit() 156 | elif opt in ("-a", "--archive"): 157 | basedir = arg 158 | elif opt in ("-p", "--parallel"): 159 | parallel = int(arg) 160 | elif opt in ("-t", "--timeout"): 161 | ext_timeout = int(arg) 162 | elif opt == '-s': 163 | verbose = False 164 | elif opt == '-d': 165 | discover = True 166 | elif opt == '--max-discover': 167 | discover = True 168 | max_discover = int(arg) 169 | elif opt == '--pystuck': 170 | start_pystuck = True 171 | return basedir, parallel, verbose, discover, max_discover, ext_timeout, start_pystuck 172 | 173 | 174 | def main(argv): 175 | """Main function of the extension crawler.""" 176 | 177 | today = datetime.datetime.now(datetime.timezone.utc).isoformat() 178 | basedir, parallel, verbose, discover, max_discover, ext_timeout, start_pystuck = parse_args(argv) 179 | 180 | setup_logger(verbose) 181 | 182 | if start_pystuck: 183 | import pystuck 184 | pystuck.run_server(port=10000) 185 | 186 | # Surpressing these "Starting HTTPS connection ..." 
log messages 187 | # Older versions of requests use loglevel INFO for that, newer ones DEBUG 188 | logging.getLogger("requests").setLevel(logging.WARNING) 189 | 190 | archive_dir = os.path.join(basedir, "data") 191 | os.makedirs(archive_dir, exist_ok=True) 192 | conf_dir = os.path.join(basedir, "conf") 193 | os.makedirs(conf_dir, exist_ok=True) 194 | open(os.path.join(conf_dir, "forums.conf"), 'a').close() 195 | log_dir = os.path.join(basedir, "log",datetime.datetime.today().strftime("%Y-%m")) 196 | os.makedirs(log_dir, exist_ok=True) 197 | 198 | start_time = time.time() 199 | 200 | print_config(basedir, archive_dir, conf_dir, discover, parallel, 201 | ext_timeout, start_pystuck) 202 | 203 | forum_ext_ids = get_forum_ext_ids(conf_dir) 204 | known_ids = list(set(get_existing_ids(archive_dir)) | set(forum_ext_ids)) 205 | discovered_ids = [] 206 | if discover: 207 | log_info("Discovering new ids {}...".format( 208 | "(at most {}) ".format(max_discover) if max_discover is not None else "")) 209 | try: 210 | discovered_ids = list(get_new_ids(known_ids, max_discover)) 211 | except Exception: 212 | log_exception("Exception when discovering new ids") 213 | log_info("Discovered {} new extensions".format(len(discovered_ids)), 1) 214 | 215 | ext_ids = list(set(discovered_ids) | set(known_ids)) 216 | 217 | discovered_ids = None 218 | known_ids = None 219 | 220 | res = update_extensions(archive_dir, parallel, forum_ext_ids, ext_ids, ext_timeout, verbose, start_pystuck) 221 | 222 | # We re-try (once) the extensions with unknown exceptions, as 223 | # they are often temporary 224 | has_exception = list(filter(lambda x: x.has_exception(), res)) 225 | if has_exception: 226 | log_info(" {} extensions with unknown exceptions, start another try ...".format(str(len(has_exception)))) 227 | has_exception_ids = [x.ext_id for x in has_exception] 228 | forum_ext_ids_except = list( 229 | set(forum_ext_ids).intersection(set(has_exception_ids))) 230 | ext_ids_except = sorted( 231 | 
list(set(has_exception_ids) - set(forum_ext_ids_except))) 232 | res_update = update_extensions(archive_dir, parallel, 233 | forum_ext_ids_except, ext_ids_except, ext_timeout, verbose, start_pystuck) 234 | res = list(set(res) - set(has_exception)) + res_update 235 | 236 | end_time = time.time() 237 | log_summary(res, int(end_time - start_time)) 238 | log_failures_to_file(log_dir, today, res) 239 | 240 | 241 | if __name__ == "__main__": 242 | main(sys.argv[1:]) 243 | -------------------------------------------------------------------------------- /create-db: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2016,2017 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 
17 | # 18 | 19 | import getopt 20 | import sys 21 | import tarfile 22 | import time 23 | import tempfile 24 | from functools import partial 25 | import fnmatch 26 | import multiprocessing 27 | from pebble import ProcessPool 28 | import os 29 | import datetime 30 | 31 | from ExtensionCrawler.archive import update_db_incremental 32 | from ExtensionCrawler.config import archive_file, const_basedir, const_mysql_config_file 33 | from ExtensionCrawler.util import log_info, log_exception, setup_logger, set_logger_tag 34 | 35 | from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend 36 | 37 | 38 | def print_help(): 39 | print("""create-db [OPTION]""") 40 | print(""" -h print this help text""") 41 | print(""" -a archive directory""") 42 | print(""" -p three-letter-prefix""") 43 | print(""" -e file with extension ids""") 44 | print(""" --from-date only process information gathered after""" 45 | """ this date (compared lexographically)""") 46 | print(""" --until-date only process information gathered before""" 47 | """ this date (compared lexographically)""") 48 | print(""" -t number of parallel threads""") 49 | print(""" -n process chunk n where n in [1,N]""") 50 | print(""" -N """) 51 | print(""" --delayed uses INSERT DELAYED INTO statements""") 52 | 53 | def init_process(verbose): 54 | # When not using fork, we need to setup logging again in the worker threads 55 | setup_logger(verbose) 56 | 57 | def process_id(from_date, until_date, delayed, path): 58 | start = time.time() 59 | with tempfile.TemporaryDirectory() as tmpdir: 60 | with tarfile.open(path) as t: 61 | t.extractall(tmpdir) 62 | 63 | extid = os.listdir(tmpdir)[0] 64 | set_logger_tag(extid) 65 | log_info("Start processing extension", 0) 66 | iddir = os.path.join(tmpdir, extid) 67 | 68 | try: 69 | with MysqlBackend( 70 | extid, 71 | delayed=delayed, 72 | cache_etags=True, 73 | read_default_file=const_mysql_config_file(), 74 | charset='utf8mb4') as con: 75 | for date in sorted(os.listdir(iddir)): 76 | if 
(from_date is not None and date < from_date) or \ 77 | (until_date is not None and date > until_date): 78 | log_info("* Skipping {}".format(date), 2) 79 | continue 80 | try: 81 | update_db_incremental(iddir, extid, date, con) 82 | except Exception: 83 | log_exception("Exception when handling data from {}".format(date), 0) 84 | except Exception: 85 | log_exception("Exception when handling extension", 0) 86 | log_info("Finished extension in {}".format(str(datetime.timedelta(seconds=int(time.time() - start)))), 0) 87 | 88 | 89 | def find(archive, pattern): 90 | for root, _, files in os.walk(os.path.join(archive, "data")): 91 | for file in files: 92 | if fnmatch.fnmatch(file, pattern + ".tar") or fnmatch.fnmatch(file, pattern + ".[0-9][0-9][0-9].tar.xz"): 93 | yield os.path.join(root, file) 94 | 95 | 96 | def find_from_file(archive, extidlistfile): 97 | with open(extidlistfile, 'r') as f: 98 | for line in f.readlines(): 99 | yield archive_file(os.path.join(archive, "data"), line.strip()) 100 | 101 | 102 | def parse_args(argv): 103 | archive = const_basedir() 104 | parallel = 8 105 | taskid = 1 106 | maxtaskid = 1 107 | from_date = None 108 | until_date = None 109 | delayed = False 110 | 111 | paths = [] 112 | 113 | try: 114 | opts, args = getopt.getopt(argv, "ha:p:e:t:n:N:", [ 115 | "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=", 116 | "maxtaskid=", "from-date=", "until-date=", "delayed", "help" 117 | ]) 118 | except getopt.GetoptError: 119 | print_help() 120 | sys.exit(2) 121 | for opt, arg in opts: 122 | if opt in ("-h", "--help"): 123 | print_help() 124 | sys.exit() 125 | elif opt in ("-a", "--archive"): 126 | archive = arg 127 | elif opt in ("-p", "--prefix"): 128 | paths += find(archive, arg + "*") 129 | elif opt in ("-e", "--extidlistfile"): 130 | paths += find_from_file(archive, arg) 131 | elif opt in ("-t", "--threads"): 132 | parallel = int(arg) 133 | elif opt in ("-n", "--taskid"): 134 | taskid = int(arg) 135 | elif opt in ("-N", 
"--maxtaskid"): 136 | maxtaskid = int(arg) 137 | elif opt == "--from-date": 138 | from_date = arg 139 | elif opt == "--until-date": 140 | until_date = arg 141 | elif opt == "--delayed": 142 | delayed = True 143 | 144 | if not paths: 145 | paths = list(find(archive, "*")) 146 | 147 | chunksize = int(len(paths) / maxtaskid) 148 | if taskid == maxtaskid: 149 | paths = paths[(taskid - 1) * chunksize:] 150 | else: 151 | paths = paths[(taskid - 1) * chunksize:taskid * chunksize] 152 | 153 | return paths, parallel, from_date, until_date, delayed 154 | 155 | 156 | def main(argv): 157 | multiprocessing.set_start_method("forkserver") 158 | verbose = True 159 | setup_logger(verbose) 160 | 161 | paths, parallel, from_date, until_date, delayed = parse_args(argv) 162 | 163 | with ProcessPool(max_workers=parallel, max_tasks=100, initializer=init_process, initargs=(verbose,)) as p: 164 | p.map(partial(process_id, from_date, until_date, delayed), paths) 165 | 166 | 167 | if __name__ == "__main__": 168 | main(sys.argv[1:]) 169 | -------------------------------------------------------------------------------- /crx-extract: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2017-2018 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Tool for extracting crx file from a tar archive."""

import os
import sys
import glob
import getopt
import tarfile
import datetime


def helpmsg():
    """Print help message."""
    print("crx-extract [OPTION] extid")
    print("    -h        print this help text")
    print("    -s        silent (no log messages)")
    print("    -e        use etag instead of date in output")
    print("    -w        avoid ':' in filenames (useful on Windows)")
    print("    -d=<DATE> date")
    print("    -o=<DIR>  output directory")
    print("    -a=<DIR>  archive directory")


def get_tarinfo(members, name, winfs=False, etag=None):
    """Select tarinfo object with a specified path/name.

    Yields (at most) the members whose name equals `name`, after rewriting
    the member name: with `winfs`, ':' is replaced by '-' (':' is not a
    legal filename character on Windows); with `etag`, the second-to-last
    path component (the date directory) is replaced by the etag.
    """
    for tarinfo in members:
        if tarinfo.name == name:
            if winfs:
                tarinfo.name = name.replace(":", "-")
            if etag is not None:
                (path, crx) = os.path.split(tarinfo.name)
                (path, _) = os.path.split(path)
                tarinfo.name = os.path.join(path, etag, crx)
            yield tarinfo


def main(argv):
    """Main function of the extension crawler."""
    # Third-party/project imports are deferred into main() so that the pure
    # helper get_tarinfo() can be used without these packages installed.
    import dateutil.parser
    from ExtensionCrawler.archive import last_crx, get_local_archive_dir
    from ExtensionCrawler.config import const_basedir

    basedir = const_basedir()
    verbose = True
    date = None
    useetag = False
    output = ""
    winfs = False
    try:
        opts, args = getopt.getopt(argv, "hsed:a:o:w",
                                   ["date=", "archive=", "output="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-d", "--date"):
            date = arg
        elif opt in ("-o", "--output"):
            output = arg
        elif opt in ("-w", "--winfs"):
            winfs = True
        elif opt in ("-e", "--etag"):
            useetag = True
        elif opt == '-s':
            verbose = False

    if len(args) > 0:
        extid = args[0]
    else:
        helpmsg()
        sys.exit()

    if date is not None:
        dateobj = dateutil.parser.parse(date)
        if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
            # Naive dates are interpreted as UTC, matching the archive timestamps.
            dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
        last, etag = last_crx(os.path.join(basedir, "data"), extid, dateobj)
    else:
        last, etag = last_crx(os.path.join(basedir, "data"), extid)

    if not useetag:
        etag = None
    basetar = os.path.join(basedir, "data",
                           get_local_archive_dir(extid), extid)
    tar = basetar + ".tar"

    if last != "":
        if os.path.exists(tar):
            found = []
            if verbose:
                print("Extracting " + os.path.join(output, last) + " from " + tar)
            with tarfile.open(tar, 'r') as archive:
                # BUG FIX: TarFile.extractall() returns None, so the old code
                # (files = archive.extractall(...)) could never detect a
                # successful extraction and always fell through to extract
                # from every rotated .tar.xz archive as well. Materialize the
                # matching members first and use them as the success signal.
                found = list(get_tarinfo(archive, last, winfs, etag))
                if found:
                    archive.extractall(path=output, members=found)
            archivetars = sorted(glob.glob(basetar + ".[0-9][0-9][0-9].tar.xz"))
            while not found and archivetars:
                tar = archivetars.pop()
                if verbose:
                    print("Extracting " + os.path.join(output, last) + " from " + tar)
                with tarfile.open(tar, 'r:xz') as archive:
                    found = list(get_tarinfo(archive, last, winfs, etag))
                    if found:
                        archive.extractall(path=output, members=found)
        elif verbose:
            print("Cannot find archive " + tar)
    elif verbose:
        if os.path.exists(tar):
            # Fixed missing space before the archive name.
            print("CRX not in archive " + tar)
        else:
            print("CRX does not exist: cannot find archive " + tar)


if __name__ == "__main__":
    main(sys.argv[1:])
# ------------------------------------------------------------------------------
# /crx-jsinventory:
# ------------------------------------------------------------------------------
#!/usr/bin/env python3.7
#
# Copyright (C) 2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under
# the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# SPDX-License-Identifier: GPL-3.0-or-later
"""Tool for creating a JavaScript library inventory of a crx or js file."""

import sys
import getopt
import csv
import logging
from collections import OrderedDict
from zipfile import ZipFile


def helpmsg():
    """Print help message."""
    print("crx-jsinventory [OPTION] crx-file|js-file")
    print("    -h        print this help text")
    # Typo fix: "cvs" -> "csv".
    print("    -c=<FILE> csv file (output)")
    print("    -v        verbose")
    print(
        "    -d        disable use of database with file information (not recommended)"
    )
    print("    -s        silent")


def main(argv):
    """Main function of the extension crawler."""
    # Third-party/project imports are deferred into main() so the
    # stdlib-only parts of this module remain importable without them.
    from tabulate import tabulate
    from ExtensionCrawler.js_decomposer import decompose_js
    from ExtensionCrawler.config import const_log_format

    verbose = False
    silent = False
    csvfile = None
    database = True
    try:
        # "--cvs" is kept as an alias for backwards compatibility with the
        # historical (misspelled) long-option name.
        opts, args = getopt.getopt(argv, "hvdsc:", ["csv=", "cvs="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-v':
            verbose = True
        elif opt == '-s':
            silent = True
        elif opt == '-d':
            database = False
        elif opt in ('-c', "--csv", "--cvs"):
            csvfile = arg

    if len(args) > 0:
        filename = args[0]
    else:
        helpmsg()
        sys.exit()

    if verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING

    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(loglevel)

    fieldnames = [
        'filename', 'path', 'size', 'dec_size', 'md5', 'sha1', 'mimetype',
        'description', 'encoding', 'type', 'detectionMethod',
        'detectionMethodDetails', 'lib', 'version', 'lib_filename',
        'evidenceText', 'evidenceStartPos', 'evidenceEndPos'
    ]

    brief_fieldnames = [
        'filename', 'md5', 'type', 'detectionMethod', 'lib', 'version',
        'lib_filename'
    ]

    if filename.endswith('.crx'):
        with ZipFile(filename) as crxobj:
            inventory = decompose_js(crxobj, database)
    else:
        inventory = decompose_js(filename, database)

    if not silent:
        if verbose:
            print_fieldnames = fieldnames
        else:
            print_fieldnames = brief_fieldnames

        print_inventory = []
        for item in inventory:
            tmp = {k: item[k] for k in print_fieldnames}
            # Render enum members and binary digests in human-readable form.
            if 'type' in tmp:
                tmp['type'] = tmp['type'].value
            if 'detectionMethod' in tmp:
                tmp['detectionMethod'] = tmp['detectionMethod'].value
            if 'md5' in tmp:
                tmp['md5'] = tmp['md5'].hex()
            if 'sha1' in tmp:
                tmp['sha1'] = tmp['sha1'].hex()

            print_inventory.append(
                OrderedDict(
                    sorted(
                        tmp.items(),
                        key=lambda t: print_fieldnames.index(t[0]))))
        print(tabulate(print_inventory, headers='keys'))

    if csvfile is not None:
        with open(csvfile, 'w') as csvobj:
            writer = csv.DictWriter(csvobj, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(inventory)


if __name__ == "__main__":
    main(sys.argv[1:])
# ------------------------------------------------------------------------------
# /crx-tool:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2016 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 17 | # 18 | # SPDX-License-Identifier: GPL-3.0-or-later 19 | """ A tool for analyzing and extracting `*.crx` files 20 | (i.e., Chrome extensions).""" 21 | 22 | import argparse 23 | from ExtensionCrawler.crx import extract_crxfile, verify_crxfile 24 | 25 | 26 | def main(): 27 | """Main function of the extension crawler.""" 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("file", help="chrome extension archive (*.crx)") 30 | parser.add_argument('targetdir', nargs='?', default="") 31 | parser.add_argument( 32 | "-c", 33 | "--check", 34 | help="verify format and signature of ", 35 | action="store_true") 36 | parser.add_argument( 37 | "-e", "--extract", help="extract ", action="store_true") 38 | parser.add_argument( 39 | "-f", 40 | "--force", 41 | help="apply action also to (potential) invalid files", 42 | action="store_true") 43 | parser.add_argument( 44 | "-v", "--verbose", help="increase verbosity", action="store_true") 45 | args = parser.parse_args() 46 | 47 | if args.extract: 48 | retval = extract_crxfile(args.verbose, args.force, args.file, 49 | args.targetdir) 50 | else: 51 | retval = verify_crxfile(args.verbose, args.file) 52 
| 53 | exit(retval) 54 | 55 | 56 | if __name__ == "__main__": 57 | main() 58 | -------------------------------------------------------------------------------- /database/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | The extension crawler downloads all metadata and extension files into tar files. 4 | This is great for archival, but not so great for analyzing the data. The crawler 5 | therefore also supports inserting all newly crawled information into a MariaDB 6 | database. Additionally, there exists a script to regenerate the database from 7 | old tar files. 8 | 9 | 10 | # Setting up the database 11 | 12 | ## Hardware requirements 13 | 14 | The database is meant to be setup on a (old) PC, although it should also work 15 | with common cloud offerings. 16 | 17 | The amount of data that the database needs to handle grows over time. Currently, 18 | containing ~18 months worth of data, the database requires ~150GB of space. 19 | 20 | It is recommended to have at least 16GB of RAM to keep the indices available; 21 | less RAM might work, more RAM will certainly speed queries up. It is also good 22 | to have at least 16GB of swap; while this detrimental to the performance of 23 | MariaDB, it is often better than it being killed by the OS. 24 | 25 | For storage, it is beneficial to have at least one HDD and one SSD, as the 26 | database workload can be split into sequential and random IO. 27 | 28 | 29 | ## Configuration 30 | 31 | A commented configuration file for MariaDB can be found in `config/my.cnf`. 32 | Configuration options such as pool size and storage locations will need to be 33 | adjusted. 
34 | 35 | ## Table schemas 36 | 37 | To set up the tables and schemas, make sure that you have the credentials for 38 | root in your `~/.my.cnf` file, and execute the following: 39 | ```bash 40 | mysql -e "create database extensions;" 41 | for f in schemas/*.sql; do mysql extensions < $f; done 42 | for f in views/*.sql; do mysql extensions < $f; done 43 | ``` 44 | 45 | # Maintaining the database 46 | 47 | ## Memory consumption 48 | 49 | MariaDB will, at times, use much more memory than specified for the pool size -- 50 | 100GB with a pool size of 4GB is certainly possible while regenerating the data. 51 | In these cases, the database should be restarted. The crawler and regeneration 52 | script will retry their database operations by default for around one hour. 53 | 54 | ## Backup 55 | 56 | Regenerating the whole data set can take days, if not weeks, so even though all 57 | data can be restored, having a backup speeds up recovery. For this purpose, the 58 | MariaDB binary log is enabled to allow physical backups, which are much faster 59 | than logical backups for our case. The folder `scripts/` contains scripts to do 60 | full and incremental backups, as well as scripts to backup the schemas and users 61 | (including permissions and hashed passwords). 62 | 63 | # Regenerating extension data 64 | 65 | When the crawler is changed to extract more or different data from the 66 | extensions, one will probably want to regenerate all data, i.e., ask the crawler 67 | to go through all existing tar files and re-extract the already downloaded data. 68 | In order to do so, the `create-db` or `sge/create-db.sh` (for HPCs) can be used. 69 | More information can be found when calling these scripts with `--help`. 70 | 71 | # Using the data set 72 | 73 | ## Example queries 74 | 75 | For more (commented) queries, see the `queries/` folder. 
76 | 77 | - ```sql 78 | select extid,crx_etag,count(filename) from extension_most_recent_small join crxfile using (crx_etag) where filename like '%.js' group by extid,crx_etag limit 10; 79 | ``` 80 | This query will print the number of JavaScript files per extension. 81 | 82 | ## Table schemas 83 | 84 | All schema files can be found in the `schemas/` folder. 85 | 86 | | Table name | Description | 87 | | --- | --- | 88 | | extension | General extension metadata from the store pages. One row per \ 89 | extension and crawldate (!). If you are only interested in the most recent \ 90 | *view* of the Chrome Web Store, use the `extension_most_recent` view. For \ 91 | testing your queries, suffix either table/view with *\_small* to only get \ 92 | roughly 1/256th of all extensions. | 93 | | status | The HTTP status codes for the store page and `crx` download. | 94 | | crx | General metadata of the extension file (the `crx` archive itself). Also \ 95 | contains the manifest. | 96 | | crxfile | General metadata of the extension files, e.g., the files contained \ 97 | in the `crx` archives (JavaScript files, etc.).| 98 | | category | Categories of the extensions, e.g. *productivity*, *office*, \ 99 | or *game*. | 100 | | permission | Permissions found in the manifests, e.g., *webRequest*, *tab*, but also \ 101 | host permissions such as *https://www.google.com*. | 102 | | content_script_url | Content script URLs found in the manifest. These are the \ 103 | URLs where the extensions request to have a content script executed when the \ 104 | user visits the website. | 105 | | libdet | Information about used libraries. For each file found in `crx` \ 106 | archives (identified by MD5 sums), this table stores classifications of the \ 107 | file, e.g., whether it is a certain library. | 108 | | review{,\_comment} | Post-metadata and posts from the review forum of an extension. | 109 | | support{,\_comment} | Post-metadata and posts from the support forum of an extension. 
| 110 | | reply{,\_comment} | Reply-post-metadata and posts for both the review and support forums. | 111 | 112 | ## Views 113 | 114 | All views can be found in the `views/` folder. 115 | 116 | | View name | Description | 117 | | --- | --- | 118 | | extension_small | Contains only roughly 1/256th of all extensions. | 119 | | extension_most_recent | Instead of one row for every combination of extension \ 120 | id and crawl date, this view only contains the rows from the most recent crawl \ 121 | date. | 122 | | extension_most_recent_small | Same, but roughly only 1/256th of all extensions. | 123 | | extension_second_most_recent | Similar to `extension_most_recent`, but \ 124 | contains the second-most recent entry for all extensions. This is useful for \ 125 | investigating how extensions change. | 126 | | extension_{most,second_most}_recent_until_date | Parameterized query. Only \ 127 | considers extensions crawled before a given date. Usage: \ 128 | ```sql 129 | select * from (select @until_date:='2018-05-25') foo, extension_most_recent_until_date; 130 | ``` | 131 | | extension_update | Selects all extension updates in the database. A row in the result represents \ 132 | one extension update, with the date and crx_etag when we have first seen the \ 133 | update, and the date and crx_etag when we have last seen the old version. As \ 134 | we crawl every night, the difference should be around 24 hours on average. 
| 135 | -------------------------------------------------------------------------------- /database/config/my.cnf: -------------------------------------------------------------------------------- 1 | [client] 2 | port = 3306 3 | socket = /run/mysqld/mysqld.sock 4 | 5 | [mysqld] 6 | port = 3306 7 | socket = /run/mysqld/mysqld.sock 8 | 9 | wait_timeout=1800 10 | max_connections=1000 11 | explicit_defaults_for_timestamp=1 12 | default_time_zone='+00:00' 13 | 14 | server-id = 1 15 | 16 | expire_logs_days=8 17 | log-basename=master1-bin 18 | 19 | # Ideally, the MariaDB datadir resides on a HDD, as there will be a lot of sequential IO. 20 | # After creating a database, it is best moved to a SSD, as there will be a lot of 21 | # random IO. This can be done by simply moving the directory (do NOT move individual table 22 | # files!), e.g.: cd /hdd/mysql; mv extensions /ssd/databases/; ln -s /ssd/databases/extensions 23 | datadir=/hdd/mysql 24 | 25 | # When adding indices, MariaDB uses a lot of space in /tmp. If that space is not enough, the 26 | # used tmpdir can be moved: 27 | innodb_tmpdir=/ssd/innodb_tmp 28 | 29 | # The pool size is said to be around 75% of the available RAM on db-only hosts. However, current 30 | # versions of MariaDB seem to have serious memory leaks when doing a lot of concurrent writes. 31 | # Therefore, expect MariaDB to use a lot more memory, create sufficient swap to prevent killing, 32 | # and restart MariaDB when the usage grows too high. 
33 | innodb_buffer_pool_size = 18G 34 | 35 | # General performance tweaks 36 | innodb_read_io_threads=8 37 | innodb_write_io_threads=8 38 | innodb_sort_buffer_size=67108864 39 | innodb_log_file_size=256M 40 | innodb_log_buffer_size=256M 41 | 42 | # Performance tweaks for inserts 43 | #innodb_flush_log_at_trx_commit=0 44 | #innodb_change_buffer_max_size=50 45 | #innodb_flush_method=O_DIRECT 46 | 47 | [mysqldump] 48 | quick 49 | max_allowed_packet = 16M 50 | 51 | [mysql] 52 | no-auto-rehash 53 | 54 | [myisamchk] 55 | key_buffer_size = 20M 56 | sort_buffer_size = 20M 57 | read_buffer = 2M 58 | write_buffer = 2M 59 | 60 | [mysqlhotcopy] 61 | interactive-timeout 62 | -------------------------------------------------------------------------------- /database/queries/get_added_content_scripts.sql: -------------------------------------------------------------------------------- 1 | select downloads, eu.extid, name, url, new_crx_etag 2 | from extension_update eu join extension e on eu.extid=e.extid and eu.first_date_with_new_crx_etag=e.date 3 | join content_script_url c on eu.new_crx_etag=c.crx_etag 4 | where 5 | url in ( 6 | "file://*/*", 7 | "http://*/*", 8 | "https://*/*", 9 | "*://*/*", 10 | "" 11 | ) 12 | and 13 | url not in (select url from content_script_url where crx_etag=previous_crx_etag) 14 | and 15 | first_date_with_new_crx_etag > NOW() - INTERVAL 2 DAY 16 | order by downloads desc; 17 | -------------------------------------------------------------------------------- /database/queries/get_added_permissions.sql: -------------------------------------------------------------------------------- 1 | select downloads, eu.extid, name, permission, new_crx_etag 2 | from extension_update eu join extension e on eu.extid=e.extid and eu.first_date_with_new_crx_etag=e.date 3 | join permission p on eu.new_crx_etag=p.crx_etag 4 | where 5 | permission in ( 6 | "", 7 | "http://*/*", 8 | "https://*/*", 9 | "webRequest", 10 | "webRequestBlocking" 11 | ) 12 | and 13 | permission not 
in (select permission from permission where crx_etag=previous_crx_etag) 14 | and 15 | first_date_with_new_crx_etag > NOW() - INTERVAL 2 DAY 16 | order by downloads desc; 17 | -------------------------------------------------------------------------------- /database/schemas/category.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `category` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `category`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `category` ( 24 | `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `date` datetime(6) NOT NULL, 26 | `category_md5` varbinary(16) NOT NULL, 27 | `category` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 28 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 29 | PRIMARY KEY (`extid`,`date`,`category_md5`) KEY_BLOCK_SIZE=8 30 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 31 | /*!40101 SET character_set_client = @saved_cs_client */; 32 | 33 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 34 | 35 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 36 | /*!40101 SET 
CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 37 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 38 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 39 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 40 | 41 | -- Dump completed on 2018-08-09 12:31:29 42 | -------------------------------------------------------------------------------- /database/schemas/cdnjs.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `cdnjs` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `cdnjs`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `cdnjs` ( 24 | `path` varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL, 26 | `md5` varbinary(16) NOT NULL, 27 | `filename` varchar(253) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 28 | `sha1` varbinary(20) DEFAULT NULL, 29 | `sha256` varbinary(32) DEFAULT NULL, 30 | `simhash` varbinary(64) DEFAULT NULL, 31 | `size` bigint(20) DEFAULT NULL, 32 | `loc` bigint(20) DEFAULT NULL, 33 | `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT 
NULL, 34 | `encoding` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 35 | `mimetype` varchar(126) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 36 | `add_date` datetime(6) NULL DEFAULT NULL, 37 | `library` varchar(254) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 38 | `version` varchar(30) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 39 | `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 40 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 41 | PRIMARY KEY (`path`,`typ`), 42 | KEY `cdnjs_md5_typ` (`md5`,`typ`) 43 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 44 | /*!40101 SET character_set_client = @saved_cs_client */; 45 | 46 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 47 | 48 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 52 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 53 | 54 | -- Dump completed on 2018-08-09 12:31:29 55 | -------------------------------------------------------------------------------- /database/schemas/content_script_url.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET 
TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `content_script_url` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `content_script_url`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `content_script_url` ( 24 | `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `url_md5` varbinary(16) NOT NULL, 26 | `url` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 27 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 28 | PRIMARY KEY (`crx_etag`,`url_md5`) KEY_BLOCK_SIZE=8 29 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 30 | /*!40101 SET character_set_client = @saved_cs_client */; 31 | 32 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 33 | 34 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 35 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 36 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 37 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 38 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 39 | 40 | -- Dump completed on 2018-08-09 12:31:29 41 | -------------------------------------------------------------------------------- /database/schemas/crx.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 
*/; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `crx` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `crx`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `crx` ( 24 | `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `filename` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '', 26 | `size` int(11) NOT NULL, 27 | `publickey` blob NOT NULL, 28 | `manifest` longtext /*!100301 COMPRESSED*/ CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT '', 29 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 30 | PRIMARY KEY (`crx_etag`) KEY_BLOCK_SIZE=8 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 32 | /*!40101 SET character_set_client = @saved_cs_client */; 33 | 34 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 35 | 36 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 37 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 38 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 39 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 40 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 41 | 42 | -- Dump completed on 2018-08-09 12:31:29 43 | -------------------------------------------------------------------------------- /database/schemas/crxfile.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 
SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `crxfile` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `crxfile`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `crxfile` ( 24 | `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `path` varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL, 26 | `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL, 27 | `md5` varbinary(16) DEFAULT NULL, 28 | `filename` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 29 | `sha1` varbinary(20) DEFAULT NULL, 30 | `sha256` varbinary(32) DEFAULT NULL, 31 | `simhash` varbinary(64) DEFAULT NULL, 32 | `mimetype` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 33 | `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 34 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 35 | PRIMARY KEY (`crx_etag`,`path`,`typ`), 36 | KEY `crxfile_md5_typ` (`md5`,`typ`) 37 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 38 | /*!40101 SET character_set_client = @saved_cs_client */; 39 | 40 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 41 | 42 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 43 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 44 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 45 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 46 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 47 | 48 | 
-- Dump completed on 2018-08-09 12:31:29 49 | -------------------------------------------------------------------------------- /database/schemas/extension.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `extension` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `extension`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `extension` ( 24 | `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `date` datetime(6) NOT NULL, 26 | `name` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 27 | `version` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 28 | `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 29 | `downloads` int(11) DEFAULT NULL, 30 | `rating` double DEFAULT NULL, 31 | `ratingcount` int(11) DEFAULT NULL, 32 | `fulldescription` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 33 | `offeredby` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 34 | `developer` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 35 | `itemcategory` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 36 
| `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci DEFAULT NULL, 37 | `lastupdated` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 38 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 39 | PRIMARY KEY (`extid`,`date`) KEY_BLOCK_SIZE=8, 40 | KEY `extension_crx_etag` (`crx_etag`), 41 | KEY `extension_date` (`date`), 42 | KEY `extension_date_extid` (`date`,`extid`), 43 | KEY `extension_extid_crx_etag` (`extid`,`crx_etag`) 44 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 45 | /*!40101 SET character_set_client = @saved_cs_client */; 46 | 47 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 48 | 49 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 50 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 51 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 52 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 53 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 54 | 55 | -- Dump completed on 2018-08-09 12:31:29 56 | -------------------------------------------------------------------------------- /database/schemas/libdet.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `libdet` 18 | -- 19 
| 20 | DROP TABLE IF EXISTS `libdet`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `libdet` ( 24 | `md5` varbinary(16) NOT NULL, 25 | `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL, 26 | `sha1` varbinary(20) DEFAULT NULL, 27 | `sha256` varbinary(32) DEFAULT NULL, 28 | `size` bigint(20) DEFAULT NULL, 29 | `loc` bigint(20) DEFAULT NULL, 30 | `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 31 | `encoding` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 32 | `mimetype` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 33 | `library` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 34 | `version` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 35 | `classification_type` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 36 | `detect_method` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 37 | `detect_method_details` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 38 | `evidence_start_pos` bigint(20) DEFAULT NULL, 39 | `evidence_end_pos` bigint(20) DEFAULT NULL, 40 | `evidence_text` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 41 | `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 42 | `mimetype_magic` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 43 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 44 | PRIMARY KEY (`md5`,`typ`) 45 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 46 | /*!40101 SET character_set_client = @saved_cs_client */; 47 | 48 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 49 | 50 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 51 | /*!40101 SET 
CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 52 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 53 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 54 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 55 | 56 | -- Dump completed on 2018-08-09 12:31:29 57 | -------------------------------------------------------------------------------- /database/schemas/permission.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `permission` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `permission`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `permission` ( 24 | `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `permission_md5` varbinary(16) NOT NULL, 26 | `permission` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 27 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 28 | PRIMARY KEY (`crx_etag`,`permission_md5`) KEY_BLOCK_SIZE=8 29 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 30 | /*!40101 SET character_set_client = @saved_cs_client */; 31 | 32 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 
33 | 34 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 35 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 36 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 37 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 38 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 39 | 40 | -- Dump completed on 2018-08-09 12:31:29 41 | -------------------------------------------------------------------------------- /database/schemas/reply.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `reply` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `reply`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `reply` ( 24 | `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `date` datetime(6) NOT NULL, 26 | `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL, 27 | `commentdate` datetime NOT NULL, 28 | `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 29 | `replyto` varchar(98) COLLATE utf8mb4_unicode_ci DEFAULT NULL, 30 | `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 31 | `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT 
NULL, 32 | `commentmd5` varbinary(16) DEFAULT NULL, 33 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 34 | PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8 35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 36 | /*!40101 SET character_set_client = @saved_cs_client */; 37 | 38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 39 | 40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 45 | 46 | -- Dump completed on 2018-08-09 12:31:29 47 | -------------------------------------------------------------------------------- /database/schemas/reply_comment.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `reply_comment` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `reply_comment`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `reply_comment` ( 24 | `commentmd5` varbinary(16) NOT NULL, 25 | `comment` 
text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 26 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 27 | PRIMARY KEY (`commentmd5`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 29 | /*!40101 SET character_set_client = @saved_cs_client */; 30 | 31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 32 | 33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 38 | 39 | -- Dump completed on 2018-08-09 12:31:29 40 | -------------------------------------------------------------------------------- /database/schemas/review.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `review` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `review`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `review` ( 24 | `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `date` datetime(6) NOT NULL, 26 | `author` 
varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL, 27 | `commentdate` datetime NOT NULL, 28 | `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 29 | `rating` double DEFAULT NULL, 30 | `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 31 | `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 32 | `commentmd5` varbinary(16) DEFAULT NULL, 33 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 34 | PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8 35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 36 | /*!40101 SET character_set_client = @saved_cs_client */; 37 | 38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 39 | 40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 45 | 46 | -- Dump completed on 2018-08-09 12:31:29 47 | -------------------------------------------------------------------------------- /database/schemas/review_comment.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | 
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `review_comment` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `review_comment`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `review_comment` ( 24 | `commentmd5` varbinary(16) NOT NULL, 25 | `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 26 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 27 | PRIMARY KEY (`commentmd5`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 29 | /*!40101 SET character_set_client = @saved_cs_client */; 30 | 31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 32 | 33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 38 | 39 | -- Dump completed on 2018-08-09 12:31:29 40 | -------------------------------------------------------------------------------- /database/schemas/status.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET 
@OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `status` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `status`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `status` ( 24 | `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `date` datetime(6) NOT NULL, 26 | `crx_status` int(11) DEFAULT NULL, 27 | `overview_status` int(11) DEFAULT NULL, 28 | `overview_exception` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 29 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 30 | PRIMARY KEY (`extid`,`date`) KEY_BLOCK_SIZE=8 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 32 | /*!40101 SET character_set_client = @saved_cs_client */; 33 | 34 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 35 | 36 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 37 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 38 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 39 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 40 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 41 | 42 | -- Dump completed on 2018-08-09 12:31:29 43 | -------------------------------------------------------------------------------- /database/schemas/support.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 
| /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `support` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `support`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `support` ( 24 | `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL, 25 | `date` datetime(6) NOT NULL, 26 | `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL, 27 | `commentdate` datetime NOT NULL, 28 | `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 29 | `title` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 30 | `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 31 | `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 32 | `commentmd5` varbinary(16) DEFAULT NULL, 33 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 34 | PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8 35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 36 | /*!40101 SET character_set_client = @saved_cs_client */; 37 | 38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 39 | 40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 45 | 46 | -- Dump completed on 2018-08-09 12:31:29 47 | -------------------------------------------------------------------------------- /database/schemas/support_comment.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux 
(x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Table structure for table `support_comment` 18 | -- 19 | 20 | DROP TABLE IF EXISTS `support_comment`; 21 | /*!40101 SET @saved_cs_client = @@character_set_client */; 22 | /*!40101 SET character_set_client = utf8 */; 23 | CREATE TABLE `support_comment` ( 24 | `commentmd5` varbinary(16) NOT NULL, 25 | `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL, 26 | `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), 27 | PRIMARY KEY (`commentmd5`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON'; 29 | /*!40101 SET character_set_client = @saved_cs_client */; 30 | 31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 32 | 33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 38 | 39 | -- Dump completed on 2018-08-09 12:31:29 40 | -------------------------------------------------------------------------------- /database/scripts/mariabackup-full: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | 6 | 
/usr/bin/mariabackup --backup --stream=xbstream --parallel=4 --compress --compress-threads=2 7 | -------------------------------------------------------------------------------- /database/scripts/mariabackup-inc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -o nounset 4 | set -o errexit 5 | 6 | LSN=$1 7 | if ! [[ "$LSN" =~ ^[0-9]+$ ]]; then 8 | >&2 echo "Invalid LSN: $LSN" 9 | exit 1 10 | fi 11 | 12 | /usr/bin/mariabackup --backup --stream=xbstream --parallel=4 --compress --compress-threads=2 --incremental-lsn=$LSN 13 | -------------------------------------------------------------------------------- /database/scripts/mariabackup-schemas: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | T=$(mktemp -d) 7 | for db in $(mysql -N -e "show databases" | grep -v -e "^mysql$" -e "^information_schema$" -e "^performance_schema$") 8 | do 9 | mkdir -p $T/schemas/$db 10 | mysqldump $db --no-data --single-transaction --tab=$T/schemas/$db 11 | done 12 | (cd $T; tar cz *) 13 | rm -r $T 14 | -------------------------------------------------------------------------------- /database/scripts/showgrants: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | set -o nounset 4 | 5 | mysql "" --skip-column-names -A -e"SELECT CONCAT('SHOW GRANTS FOR ''',user,'''@''',host,''';') FROM mysql.user WHERE user<>''" | mysql "" --skip-column-names -A | sed 's/$/;/g' 6 | -------------------------------------------------------------------------------- /database/views/extension_most_recent.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- 
Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Final view structure for view `extension_most_recent` 18 | -- 19 | 20 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent`*/; 21 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent`*/; 22 | /*!50001 SET @saved_cs_client = @@character_set_client */; 23 | /*!50001 SET @saved_cs_results = @@character_set_results */; 24 | /*!50001 SET @saved_col_connection = @@collation_connection */; 25 | /*!50001 SET character_set_client = utf8 */; 26 | /*!50001 SET character_set_results = utf8 */; 27 | /*!50001 SET collation_connection = utf8_general_ci */; 28 | /*!50001 CREATE ALGORITHM=UNDEFINED */ 29 | /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */ 30 | /*!50001 VIEW `extension_most_recent` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */; 31 | /*!50001 SET character_set_client = @saved_cs_client */; 32 | /*!50001 SET character_set_results = 
@saved_cs_results */; 33 | /*!50001 SET collation_connection = @saved_col_connection */; 34 | 35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 36 | 37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 42 | 43 | -- Dump completed on 2018-08-09 12:31:29 44 | -------------------------------------------------------------------------------- /database/views/extension_most_recent_small.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Final view structure for view `extension_most_recent_small` 18 | -- 19 | 20 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent_small`*/; 21 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent_small`*/; 22 | /*!50001 SET @saved_cs_client = @@character_set_client */; 23 | /*!50001 SET @saved_cs_results = @@character_set_results */; 24 | /*!50001 SET @saved_col_connection = @@collation_connection */; 25 | /*!50001 SET character_set_client = utf8 */; 26 | /*!50001 SET character_set_results = utf8 */; 27 | /*!50001 SET collation_connection = utf8_general_ci */; 28 | /*!50001 
-- Helper for the extension_most_recent_until_date view: returns the session
-- variable @until_date, letting each session choose its own cut-off date.
-- Fixes: "DEERMINISTIC" typo (must be DETERMINISTIC) and the missing
-- parameter list "()" -- MariaDB's CREATE FUNCTION requires both, so the
-- original statement failed with a syntax error.
drop function if exists until_date;
create function until_date() returns datetime NO SQL DETERMINISTIC return @until_date;
version 10.3.8-MariaDB-log 9 | 10 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 11 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 12 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 13 | /*!40101 SET NAMES utf8 */; 14 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 15 | /*!40103 SET TIME_ZONE='+00:00' */; 16 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 17 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 18 | 19 | -- 20 | -- Final view structure for view `extension_most_recent_until_date` 21 | -- 22 | 23 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent_until_date`*/; 24 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent_until_date`*/; 25 | /*!50001 SET @saved_cs_client = @@character_set_client */; 26 | /*!50001 SET @saved_cs_results = @@character_set_results */; 27 | /*!50001 SET @saved_col_connection = @@collation_connection */; 28 | /*!50001 SET character_set_client = utf8 */; 29 | /*!50001 SET character_set_results = utf8 */; 30 | /*!50001 SET collation_connection = utf8_general_ci */; 31 | /*!50001 CREATE ALGORITHM=UNDEFINED */ 32 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ 33 | /*!50001 VIEW `extension_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`, `extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS 
`last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; 34 | /*!50001 SET character_set_client = @saved_cs_client */; 35 | /*!50001 SET character_set_results = @saved_cs_results */; 36 | /*!50001 SET collation_connection = @saved_col_connection */; 37 | 38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 39 | 40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 45 | 46 | -- Dump completed on 2018-08-09 12:31:29 47 | -------------------------------------------------------------------------------- /database/views/extension_second_most_recent.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Final view structure for view `extension_second_most_recent` 18 | -- 19 | 20 | 
/*!50001 DROP TABLE IF EXISTS `extension_second_most_recent`*/; 21 | /*!50001 DROP VIEW IF EXISTS `extension_second_most_recent`*/; 22 | /*!50001 SET @saved_cs_client = @@character_set_client */; 23 | /*!50001 SET @saved_cs_results = @@character_set_results */; 24 | /*!50001 SET @saved_col_connection = @@collation_connection */; 25 | /*!50001 SET character_set_client = utf8 */; 26 | /*!50001 SET character_set_results = utf8 */; 27 | /*!50001 SET collation_connection = utf8_general_ci */; 28 | /*!50001 CREATE ALGORITHM=UNDEFINED */ 29 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ 30 | /*!50001 VIEW `extension_second_most_recent` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; 31 | /*!50001 SET 
-- Helper for the extension_second_most_recent_until_date view: returns the
-- session variable @until_date, letting each session choose its own cut-off.
-- Fixes: "DEERMINISTIC" typo (must be DETERMINISTIC) and the missing
-- parameter list "()" -- MariaDB's CREATE FUNCTION requires both, so the
-- original statement failed with a syntax error.
drop function if exists until_date;
create function until_date() returns datetime NO SQL DETERMINISTIC return @until_date;
@@character_set_results */; 27 | /*!50001 SET @saved_col_connection = @@collation_connection */; 28 | /*!50001 SET character_set_client = utf8 */; 29 | /*!50001 SET character_set_results = utf8 */; 30 | /*!50001 SET collation_connection = utf8_general_ci */; 31 | /*!50001 CREATE ALGORITHM=UNDEFINED */ 32 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */ 33 | /*!50001 VIEW `extension_second_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`, `extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() and !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */; 34 | /*!50001 SET character_set_client = @saved_cs_client */; 35 | /*!50001 SET character_set_results = @saved_cs_results */; 36 | /*!50001 SET 
collation_connection = @saved_col_connection */; 37 | 38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 39 | 40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 45 | 46 | -- Dump completed on 2018-08-09 12:31:29 47 | -------------------------------------------------------------------------------- /database/views/extension_small.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Final view structure for view `extension_small` 18 | -- 19 | 20 | /*!50001 DROP TABLE IF EXISTS `extension_small`*/; 21 | /*!50001 DROP VIEW IF EXISTS `extension_small`*/; 22 | /*!50001 SET @saved_cs_client = @@character_set_client */; 23 | /*!50001 SET @saved_cs_results = @@character_set_results */; 24 | /*!50001 SET @saved_col_connection = @@collation_connection */; 25 | /*!50001 SET character_set_client = utf8 */; 26 | /*!50001 SET character_set_results = utf8 */; 27 | /*!50001 SET collation_connection = utf8_general_ci */; 28 | /*!50001 CREATE ALGORITHM=UNDEFINED */ 29 | /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER 
*/ 30 | /*!50001 VIEW `extension_small` AS select `extension`.`extid` AS `extid`,`extension`.`date` AS `date`,`extension`.`name` AS `name`,`extension`.`version` AS `version`,`extension`.`description` AS `description`,`extension`.`downloads` AS `downloads`,`extension`.`rating` AS `rating`,`extension`.`ratingcount` AS `ratingcount`,`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`, `extension`.`developer` AS `developer`,`extension`.`itemcategory` AS `itemcategory`,`extension`.`crx_etag` AS `crx_etag`,`extension`.`lastupdated` AS `lastupdated` from `extension` where `extension`.`extid` like 'aa%' */; 31 | /*!50001 SET character_set_client = @saved_cs_client */; 32 | /*!50001 SET character_set_results = @saved_cs_results */; 33 | /*!50001 SET collation_connection = @saved_col_connection */; 34 | 35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 36 | 37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 42 | 43 | -- Dump completed on 2018-08-09 12:31:29 44 | -------------------------------------------------------------------------------- /database/views/extension_update.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: extensions 4 | -- ------------------------------------------------------ 5 | -- Server version 10.3.8-MariaDB-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 
SET TIME_ZONE='+00:00' */; 13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */; 14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 15 | 16 | -- 17 | -- Final view structure for view `extension_update` 18 | -- 19 | 20 | /*!50001 DROP TABLE IF EXISTS `extension_update`*/; 21 | /*!50001 DROP VIEW IF EXISTS `extension_update`*/; 22 | /*!50001 SET @saved_cs_client = @@character_set_client */; 23 | /*!50001 SET @saved_cs_results = @@character_set_results */; 24 | /*!50001 SET @saved_col_connection = @@collation_connection */; 25 | /*!50001 SET character_set_client = utf8 */; 26 | /*!50001 SET character_set_results = utf8 */; 27 | /*!50001 SET collation_connection = utf8_general_ci */; 28 | /*!50001 CREATE ALGORITHM=UNDEFINED */ 29 | /*!50013 DEFINER=`root`@`%` SQL SECURITY DEFINER */ 30 | /*!50001 VIEW `extension_update` AS select `e3`.`extid` AS `extid`,`e3`.`first_date_with_new_crx_etag` AS `first_date_with_new_crx_etag`,`e3`.`new_crx_etag` AS `new_crx_etag`,`e3`.`last_date_with_previous_crx_etag` AS `last_date_with_previous_crx_etag`,`e4`.`crx_etag` AS `previous_crx_etag` from (((select `e1`.`extid` AS `extid`,`e1`.`date` AS `first_date_with_new_crx_etag`,`e1`.`crx_etag` AS `new_crx_etag`,max(`e2`.`date`) AS `last_date_with_previous_crx_etag` from (((select `extensions`.`extension`.`extid` AS `extid`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,min(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`crx_etag` is not null group by `extensions`.`extension`.`extid`,`extensions`.`extension`.`crx_etag`)) `e1` join (select `extensions`.`extension`.`extid` AS `extid`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`crx_etag` is not null group by `extensions`.`extension`.`extid`,`extensions`.`extension`.`crx_etag`) `e2` on(`e1`.`extid` = `e2`.`extid`)) where `e1`.`date` > `e2`.`date` group by 
`e1`.`crx_etag`)) `e3` join `extensions`.`extension` `e4` on(`e3`.`extid` = `e4`.`extid` and `e3`.`last_date_with_previous_crx_etag` = `e4`.`date`)) */; 31 | /*!50001 SET character_set_client = @saved_cs_client */; 32 | /*!50001 SET character_set_results = @saved_cs_results */; 33 | /*!50001 SET collation_connection = @saved_col_connection */; 34 | 35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 36 | 37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 42 | 43 | -- Dump completed on 2018-08-09 12:31:29 44 | -------------------------------------------------------------------------------- /extgrep: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.7 2 | # 3 | # Copyright (C) 2019 The University of Sheffield, UK 4 | # 5 | # This program is free software: you can redistribute it and/or modify 6 | # it under the terms of the GNU General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or 8 | # (at your option) any later version. 9 | # 10 | # This program is distributed in the hope that it will be useful, 11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | # GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program. If not, see . 
def first_match_in_locations(search_tag, pattern, locations):
    """Return the first match of *pattern* found in *locations*.

    Args:
        search_tag: name of the rule that produced *pattern* (e.g. a key
            from the miner-strings file).
        pattern: regular expression, searched line by line.
        locations: iterable of ``(location_tag, lines)`` pairs.

    Returns:
        ``[[location_tag, search_tag, matched_string]]`` for the first hit,
        or ``[]`` when nothing matches.  For the ``"MINING_KEYS_REGEX"``
        rule a hit is only reported when the matched string also looks
        like a key/hash (high entropy, several digits).
    """
    for location_tag, lines in locations:
        for line in lines:
            m = re.search(pattern, line)
            if m:
                matched_string = m.group()
                # Fixed: the original used `is not`, which compares object
                # identity against a string literal and only worked by
                # accident of CPython interning (SyntaxWarning on 3.8+).
                if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
                    return [[location_tag, search_tag, matched_string]]
    return []
def main(conf):
    """Grep every extension id listed in ``conf.EXTID_FILE``.

    Sets up stderr logging (DEBUG when --verbose, WARNING otherwise),
    writes a CSV header to stdout, then processes each id in turn.
    """
    root = logging.getLogger()
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter(const_log_format()))
    root.addHandler(handler)
    root.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.EXTID_FILE) as extid_file:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow([
            "extid", "still_in_store", "most_recent_crx_etag", "date",
            "crx_etag", "name", "version", "path", "position", "tag",
            "match"])
        for extid in (line.strip() for line in extid_file):
            handle_extid(conf, extid, csvwriter)
verbosity') 197 | 198 | 199 | main_parser.add_argument( 200 | '-D', 201 | '--latest-date', 202 | metavar='DATE', 203 | type=str, 204 | help='select latest crx from tar, released before DATE.\n' + 205 | 'Together with --from-date, specifies all crx released in specified\n' + 206 | 'date range.') 207 | 208 | main_parser.add_argument( 209 | '-d', 210 | '--from-date', 211 | metavar='DATE', 212 | type=str, 213 | help='select oldest crx from tar released after DATE.\n' + 214 | 'Together with --latest-date, specifies all crx released in specified\n' + 215 | 'date range.') 216 | 217 | main_parser.add_argument( 218 | '-a', 219 | '--archive-dir', 220 | metavar='archive', 221 | type=str, 222 | default=const_basedir(), 223 | help='archive directory') 224 | 225 | return main_parser 226 | 227 | 228 | if __name__ == "__main__": 229 | main_parser = build_parser() 230 | 231 | main_conf = main_parser.parse_args() 232 | 233 | sys.exit(main(main_conf)) 234 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | colorama==0.3.9 2 | pystuck==0.8.5 3 | simhash==1.8.0 4 | tabulate==0.7.7 5 | setuptools==65.5.1 6 | cchardet==2.1.1 7 | mysqlclient==1.3.10 8 | requests==2.20.0 9 | pycryptodomex==3.4.6 10 | beautifulsoup4==4.6.0 11 | python_dateutil==2.6.1 12 | GitPython==2.1.5 13 | python_magic==0.4.13 14 | jsbeautifier==1.7.3 15 | pebble==4.3.7 16 | jsmin==2.2.2 17 | -------------------------------------------------------------------------------- /scripts/hpc-utilities/hpc-submit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | print_help() 7 | { 8 | echo "Usage: $prog [OPTION] ... -- COMMAND ... %INPUT% ..." 9 | echo "" 10 | echo "Run ..." 
# Generate the SGE job script in $workdir/job.sge.
# Unescaped variables ($prog, $jobs, $mem, ...) expand NOW, at generation
# time; escaped ones (\$SGE_TASK_ID and the \`...\` command substitution)
# are left for the cluster node to evaluate at run time.
#
# Fixed: the original used `cat < $workdir/job.sge`, which tried to READ a
# (not yet existing) file instead of writing the heredoc into it; the
# redirection is now `cat > ... <<EOF`.  The shebang is also emitted as the
# very first line so the generated script is recognised as a bash script.
mk_hpc_script(){
    local HOSTNAME=`hostname -f`
    echo "Creating HPC script."
    cat > "$workdir"/job.sge <<EOF
#!/bin/bash
## This script was generated by $prog (version: $version)
## on $timestamp
## by $USER@$HOSTNAME
## in $PWD
## using the following command:
## $invokation
##
## SGE configuration:
#$ -V
#$ -t 1-$jobs
#$ -l rmem=$mem
#$ -l h_rt=$timelimit
#$ -j yes
#$ -o "$prefix"/"$name"/output

set -o nounset
set -o errexit
set -x

export JOBINPUT="$prefix"/"$name"/input/\`printf %08d \$SGE_TASK_ID\`

/usr/bin/time -v $wrapper $prefix/$name/bin/$cmd
echo "Execution successful."
EOF
}
128 | 129 | while [ $# -gt 0 ] 130 | do 131 | case "$1" in 132 | --jobs|-j) 133 | jobs="$2"; 134 | shift;; 135 | --input|-i) 136 | input="$2"; 137 | shift;; 138 | --jobname|-n) 139 | name="$2"; 140 | shift;; 141 | --max-memory|-m) 142 | mem="$2"; 143 | shift;; 144 | --max-time|-t) 145 | timelimit="$2"; 146 | shift;; 147 | --host|-s) 148 | host="$2"; 149 | shift;; 150 | --srcdir|-d) 151 | srcdir="$2"; 152 | shift;; 153 | --wrapper|-w) 154 | wrapper="$2"; 155 | shift;; 156 | --prefix|-p) 157 | prefix="$2"; 158 | shift;; 159 | --help|-h) 160 | print_help 161 | exit 0;; 162 | --) shift; break;; 163 | *) print_help 164 | exit 1;; 165 | esac 166 | shift 167 | done 168 | cmd=`echo $(printf "%q " "$@") | sed -e 's/%INPUT%/\$JOBINPUT/'` 169 | cmdarray=("$@") 170 | srccmd=${cmdarray[0]} 171 | 172 | mk_jobdir; 173 | 174 | if [ -n "$input" ]; then 175 | if [ ! -f "$input" ]; then 176 | echo "Input file \"$input\" not found!" 177 | exit 1 178 | fi 179 | split_input; 180 | fi 181 | 182 | mk_hpc_script; 183 | 184 | mk_remote_jobdir; 185 | 186 | install_hpc_script; 187 | 188 | clean_jobdir; 189 | 190 | submit_job; 191 | 192 | -------------------------------------------------------------------------------- /scripts/maintainance/maintain_archive: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ACTION=${1:-MAIN} 4 | ARCHIVE=${2:-/srv/Shared/BrowserExtensions/archive} 5 | 6 | LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"` 7 | mkdir -p $LOGDIR 8 | LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'` 9 | LOG=${3:-$LOGPREFIX-maintain-archive-$ACTION.log} 10 | 11 | SELF=$0 12 | SRC=$4 13 | 14 | next_generation(){ 15 | local src=$1 16 | local filebase=`basename $src .tar` 17 | local dir=`dirname $src` 18 | 19 | # Check next free file name: 20 | if ls $dir/$filebase.[0-9][0-9][0-9].tar.xz &> /dev/null; then 21 | latest=`ls $dir/$filebase.[0-9][0-9][0-9].tar.xz | \ 22 | sort -r | head -1 | \ 23 | sed -e 
"s/.*\([0-9][0-9][0-9]\).tar.xz/\1/"` 24 | next=`printf %03d $((latest+1))` 25 | else 26 | next=000 27 | fi 28 | 29 | dest=$dir/$filebase.$next.tar 30 | echo "Processing: $src -> $dest" | tee -a $LOG 31 | mv -n $src $dest 32 | if [ ! -f $src ]; then 33 | tar -cf $src -T /dev/null 34 | if [ ! -f $src ]; then 35 | echo "ERROR: cannot create empty tar archive ($src)" | tee -a $LOG 36 | fi 37 | else 38 | echo "ERROR: old archive exists ($src)" | tee -a $LOG 39 | fi 40 | } 41 | 42 | zge_compress(){ 43 | mkdir -p $LOG.dir 44 | find $ARCHIVE/data/ \ 45 | -type d \ 46 | -name "[a-p][a-p][a-p]" \ 47 | -exec qsub -o $LOG.dir `dirname $SELF`/xz.sge {} \; 48 | } 49 | 50 | main(){ 51 | find $ARCHIVE/data/ \ 52 | -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].tar" \ 53 | -exec $SELF MOVE $ARCHIVE $LOG {} \; 54 | } 55 | 56 | case "$ACTION" in 57 | MAIN) 58 | main;; 59 | MOVE) 60 | next_generation $SRC;; 61 | COMPRESS) 62 | zge_compress;; 63 | esac 64 | -------------------------------------------------------------------------------- /scripts/maintainance/xz.sge: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -V 3 | #$ -l rmem=2G 4 | #$ -j yes 5 | set -o nounset 6 | set -x 7 | 8 | find $1 \ 9 | -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].[0-9][0-9][0-9].tar" \ 10 | -exec xz {} \; 11 | -------------------------------------------------------------------------------- /scripts/monitoring/download-report-one-week.gp: -------------------------------------------------------------------------------- 1 | if (!exists("monitordir")) monitordir='.' 
2 | filename="updates.csv" 3 | set terminal pngcairo size 3000,800 enhanced font 'Verdana,10' 4 | set output monitordir."/download-report-one-week.png" 5 | 6 | day="2018-04-01" 7 | # basic configuration 8 | set datafile separator ";" 9 | 10 | set autoscale x 11 | 12 | # plot last 7 days 13 | set xrange [time(0) - 7*24*60*60:] 14 | 15 | set ytics 16 | set yrange [0:400000] 17 | set ylabel "Parallel Downloads" 18 | set ytics 25000 19 | set mytics 2 20 | set y2range [0:4500] 21 | set y2label "Sequential Downloads" 22 | set y2tics 500 23 | 24 | 25 | set grid 26 | 27 | set xdata time 28 | set timefmt '%Y-%m-%d %H:%M:%S' 29 | set format x "%Y-%m-%d\n%H:%M:%S" 30 | 31 | set xtics 28800 32 | set mxtics 8 33 | 34 | set style data lines 35 | set title sprintf("Extension Downloads (Last Seven Days)") 36 | 37 | set key horiz 38 | set key out bot center 39 | 40 | # for plotting only one day, one can use: 41 | data_for_day(day,file)=sprintf("<(grep %s %s)",day, file) 42 | data=data_for_day(day, monitordir."/".filename) 43 | 44 | # for plotting all data 45 | data=monitordir."/".filename 46 | 47 | # Trick for plotting first derivative of data: 48 | # x0=NaN 49 | # y0=NaN 50 | # replot data using (dx=$1-x0,x0=$1,$1-dx/2):(dy=$6-y0,y0=$6,dy/dx) w l notitle 51 | # TODO: support time on x scale 52 | 53 | x0p=NaN 54 | y0p=NaN 55 | x0s=NaN 56 | y0s=NaN 57 | 58 | plot data using 1:4 with lines dashtype 2 lt rgb "#d07b95" axes x1y1 \ 59 | title "Parallel Downloads (Target)" ,\ 60 | data using 1:6 with lines lw 2 dashtype 1 lt rgb "#9c416e" axes x1y1 \ 61 | title "Parallel Downloads" ,\ 62 | data using (dx=timecolumn(1)-x0p,x0p=timecolumn(1),timecolumn(1)-dx/2):(dy=$6-y0p,y0p=$6,dy/dx < 0 ? 
0 : (8*60*60)*dy/dx) \ 63 | with lines dashtype 2 lt rgb "#622a55" axes x1y1 \ 64 | title "Parallel Downloads per Eight Hours",\ 65 | data using 1:5 with lines dashtype 2 lt rgb "#76eec6" axes x1y2 \ 66 | title "Sequential Downloads (Target)",\ 67 | data using 1:7 with lines lw 2 dashtype 1 lt rgb "#5ebe9e" axes x1y2 \ 68 | title "Sequential Downloads",\ 69 | data using (dx=timecolumn(1)-x0s,x0s=timecolumn(1),timecolumn(1)-dx/2):(dy=$7-y0s,y0s=$7,dy/dx < 0 ? 0 : (8*60*60)*dy/dx) \ 70 | with lines dashtype 2 lt rgb "#468e76" axes x1y2 \ 71 | title "Sequential Downloads per Eight Hours" 72 | 73 | set terminal pdfcairo size 30,8 enhanced font 'Verdana,15' 74 | set output monitordir."/download-report-one-week.pdf" 75 | replot 76 | 77 | # Plot number of extensions over time 78 | set title sprintf("Size of Extensions Archive") 79 | set terminal pngcairo size 3000,800 enhanced font 'Verdana,10' 80 | set output monitordir."/size-of-archive.png" 81 | 82 | set timefmt '%Y-%m-%d %H:%M:%S' 83 | set format x "%Y-%m-%d" 84 | 85 | set xrange ["2018-05-01":*] 86 | 87 | 88 | set yrange [150000:400000] 89 | set ylabel "Parallel Downloads" 90 | set y2range [2750:4500] 91 | 92 | 93 | set xtics 604800 94 | set mxtics 7 95 | 96 | 97 | plot data using 1:4 with lines dashtype 1 lt rgb "#d07b95" axes x1y1 \ 98 | title "Parallel Downloads" ,\ 99 | data using 1:5 with lines dashtype 1 lt rgb "#76eec6" axes x1y2 \ 100 | title "Sequential Downloads",\ 101 | data using 1:($4+$5) with lines dashtype 1 lt rgb "#000000" axes x1y1 \ 102 | title "Total Downloads" ,\ 103 | 104 | set terminal pdfcairo size 30,8 enhanced font 'Verdana,15' 105 | set output monitordir."/size-of-archive.pdf" 106 | replot 107 | 108 | -------------------------------------------------------------------------------- /scripts/monitoring/global_update_monitor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | 6 | KILL="NO" 7 | 
ARCHIVE="/srv/Shared/BrowserExtensions/archive" 8 | 9 | while [[ $# -gt 0 ]] 10 | do 11 | key="$1" 12 | case $key in 13 | -a|--ARCHIVE) 14 | ARCHIVE="$2" 15 | shift # past argument 16 | shift # past value 17 | ;; 18 | --kill) 19 | KILL=YES 20 | shift # past argument 21 | ;; 22 | *) # unknown option 23 | shift # past argument 24 | ;; 25 | esac 26 | done 27 | 28 | LATESTLOG=`ls $ARCHIVE/log/*/*0.log | tail -n 1` 29 | LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log | tail -n 1` 30 | BASEDIR=$(dirname "$0") 31 | 32 | PIDS="" 33 | echo "# Checking update status" 34 | if ps u -C global_update.sh > /dev/null; then 35 | NUM=`ps u -C global_update.sh | tail -n +2 | wc -l` 36 | echo "* $NUM instances of global_update.sh still running (WARNING)" 37 | PIDS=`ps u -C global_update.sh | tail -n +2 | awk '{print $2}' | xargs` 38 | echo " Running PIDs: $PIDS" 39 | if [[ "$KILL" == "YES" ]];then 40 | echo " KILL mode enabled, killing running global_update.sh instances" 41 | echo " (executing pkill -9 -P $PIDS)" 42 | pkill -9 -P $PIDS 43 | pkill -f "ExtensionCrawler//crawler " 44 | fi 45 | else 46 | echo "* global_update.sh not running" 47 | NUM=0 48 | fi 49 | 50 | echo "* current status" 51 | PDOWNLOADS=`grep 'Updating extension $' $LATESTLOG | wc -l` 52 | echo " * parallel downloads finished: $PDOWNLOADS" 53 | SDOWNLOADS=`grep 'Updating extension (' $LATESTLOG | wc -l` 54 | echo " * sequential downloads finished: $SDOWNLOADS" 55 | echo " * Updating info from log ($LATESTLOG):" 56 | grep 'Updating .* extensions' $LATESTLOG | sed -e 's/^.*---//' 57 | 58 | echo "" 59 | echo "## Latest log:" 60 | cat $LATESTGLOBALLOG 61 | 62 | EXTENSIONS=`grep "Updating db" $LATESTLOG | wc -l` 63 | 64 | WE=`grep WorkerException $LATESTLOG | sort -k 5,5 -u | wc -l` 65 | echo "## Worker Exceptions: $WE (out of $EXTENSIONS)" 66 | grep WorkerException $LATESTLOG | sort -k 5,5 -u | sort -k 3,3 67 | 68 | ERRORS=`grep ERROR $LATESTLOG | sort -k 5,5 -u | wc -l` 69 | echo "## ERROR LOG: $ERRORS (out of 
$EXTENSIONS)" 70 | grep ERROR $LATESTLOG | sort -k 5,5 -u | sort -k 3,3 71 | 72 | echo "# Server utilization" 73 | top b -n 1 | head -n 15 74 | 75 | DATE=`date --utc +%Y-%m-%d` 76 | TIME=`date --utc +%H:%M:%S` 77 | 78 | EXTS=`grep 'Updating .* extensions' $LATESTLOG \ 79 | | head -1 \ 80 | | sed -e 's/^.* (//' \ 81 | -e 's/ including forums, / /' \ 82 | -e 's/ excluding forums.*/ /g' \ 83 | | awk '{print $2";"$1}'` 84 | 85 | if [[ "$EXTS" == "" ]]; then 86 | EXTS=";" 87 | fi 88 | 89 | LASTPDOWNLOADS=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f8` 90 | LASTSDOWNLOADS=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f10` 91 | LASTMAIL=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f14` 92 | 93 | if [[ "$NUM" == "0" ]]; then 94 | MAIL=0 95 | else 96 | if [[ "$LASTPDOWNLOADS$LASTSDOWNLOADS" == "$PDOWNLOADS$SDOWNLOADS" ]]; then 97 | if [[ "$LASTMAIL" == "0" ]]; then 98 | echo "" | /usr/bin/mail -s "Extension Download Stalled!" ${USER:-root}; 99 | fi; 100 | MAIL=1; 101 | else 102 | MAIL=0; 103 | fi 104 | fi 105 | 106 | MEM=`free | tail -2 | awk '{print $2 " " $3 " " $4}' | xargs | sed -e 's/ /\";\"/g'` 107 | 108 | echo "\"$DATE $TIME\";\"$NUM\";\"$PIDS\";$EXTS;\"$PDOWNLOADS\";\"$SDOWNLOADS\";\"$ERRORS\";\"$MAIL\";\"$MEM\"" >> $ARCHIVE/monitor/updates.csv 109 | gnuplot -e "monitordir='$ARCHIVE/monitor'" $BASEDIR/download-report-one-week.gp 110 | 111 | -------------------------------------------------------------------------------- /scripts/singularity/ExtensionCrawler.def: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2017 The University of Sheffield, UK 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | bootstrap:debootstrap 18 | OSVersion: testing 19 | MirrorURL: https://deb.debian.org/debian 20 | 21 | %labels 22 | Maintainer The LogicalHacking Team (https://logicalhacking.com) 23 | 24 | %setup 25 | 26 | %post 27 | 28 | ################################################################### 29 | # Add Debian unstable as a secondary (lower priority) source 30 | # and update the data base of available packages. 31 | cat >> /etc/apt/sources.list << EOF 32 | deb http://ftp.us.debian.org/debian unstable main 33 | EOF 34 | 35 | cat > /etc/apt/preferences << EOF 36 | Package: * 37 | Pin: release a=testing 38 | Pin-Priority: 900 39 | 40 | Package: * 41 | Pin: release a=unstable 42 | Pin-Priority: 800 43 | EOF 44 | 45 | cat > /etc/apt/apt.conf.d/01norecommend << EOF 46 | APT::Install-Recommends "0"; 47 | APT::Install-Suggests "0"; 48 | EOF 49 | 50 | chmod go+r /etc/apt/preferences 51 | apt-get update 52 | ################################################################### 53 | 54 | ################################################################### 55 | # Add hook for apt that removes various files after installation 56 | # that are not needed at runtime. 
57 | cat > /etc/apt/apt.conf.d/99-clean << EOF 58 | DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; }; 59 | APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; }; 60 | Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache ""; 61 | EOF 62 | ################################################################### 63 | 64 | ################################################################### 65 | # Configure locales 66 | apt-get install -y locales 67 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen 68 | echo "en_GB.UTF-8 UTF-8" >> /etc/locale.gen 69 | locale-gen 70 | echo "LANG=en_US.UTF-8" > /etc/default/locale 71 | ################################################################### 72 | 73 | ################################################################### 74 | # Install the core dependencies (Python 3.6 or later) 75 | # from the Debian Testing repository 76 | apt-get install -y --no-install-recommends libpython3.7-dev python3-magic python3-minimal python3-pip python3-setuptools python3-mysqldb g++ git libmariadb-dev-compat 77 | apt-get clean 78 | rm -rf /var/lib/apt/lists/* 79 | ################################################################### 80 | 81 | ################################################################### 82 | # Create /opt for local software (mainly cloned git repositories 83 | # from logicalhacking.com 84 | mkdir -p /opt 85 | chmod 755 /opt 86 | ################################################################### 87 | 88 | ################################################################### 89 | # Add the Extension Crawler repository, for more details, visit 90 | # https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler 91 | cd /opt 92 | git clone https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler.git 93 | cd ExtensionCrawler 94 | git checkout production 95 | cd .. 
96 | pip3 install wheel # simhash needs wheel to build properly, still works without it though 97 | pip3 install --system -e ExtensionCrawler 98 | cd / 99 | chmod -R go+u-w /opt/ExtensionCrawler 100 | chmod -R go+u-w /usr/local/lib/ 101 | chmod -R go+u-w /usr/local/bin/ 102 | ################################################################### 103 | 104 | ################################################################### 105 | # Clone cdnjs repository or create a link to external archive dir 106 | ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive 107 | case ${SINGULARITY_IMAGE} in 108 | *-cdnjs.img) 109 | mkdir -p /opt/archive/filedb 110 | cd /opt/archive/filedb 111 | git clone https://github.com/cdnjs/cdnjs.git cdnjs-git 112 | cd cdnjs-git 113 | git pull 114 | ln -s ${ARCHIVE}/conf . > /dev/null 115 | ln -s ${ARCHIVE}/data > /dev/null 116 | ln -s ${ARCHIVE}/log > /dev/null 117 | ;; 118 | *) 119 | cd /opt/ 120 | ln -s ${ARCHIVE} . 121 | ;; 122 | esac 123 | chmod -R go+u /opt 124 | ################################################################### 125 | 126 | ################################################################### 127 | # Create mount/bind points for the various network drives 128 | # on SHARC (only useful when using the Singularity image on 129 | # the High-Performance Cluster of The University of Sheffield 130 | mkdir /scratch 131 | mkdir /fastdata 132 | mkdir /data 133 | mkdir /shared 134 | 135 | # Create nvidia driver directories to get rid of the singularity 136 | # warnings on sharc 137 | mkdir /nvbin 138 | mkdir /nvlib 139 | chmod go+u-w /scratch /fastdata /data /shared 140 | ################################################################### 141 | 142 | ################################################################### 143 | # Manual clean-up and removal of not strictly necessary directories 144 | yes | apt purge g++ 145 | yes | apt autoremove 146 | rm -rf /usr/share/doc || true 147 | 
################################################################### 148 | 149 | %environment 150 | 151 | export EXTENSION_ARCHIVE=/opt/archive 152 | export PATH=/opt/ExtensionCrawler/:${PATH} 153 | 154 | # We install all python modules into the container, so we do not want 155 | # to use any packages that the user might have installed in their home 156 | # directory. 157 | export PYTHONNOUSERSITE=1 158 | 159 | %runscript 160 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 161 | # this text will get copied to /singularity and will run whenever the container 162 | # is called as an executable 163 | usage() { 164 | cat < "$LOG" 2>&1 89 | sudo singularity image.expand --size ${BASESIZE} --writable ${IMAGE} ${BASE}.def > "$LOG" 2>&1 90 | else 91 | echo "Creating read-only $IMAGE using ${BASE}.def" 92 | sudo singularity build ${IMAGE} ${BASE}.def > "$LOG" 2>&1 93 | fi 94 | 95 | if [ ! -f $IMAGE ]; then 96 | echo "Image (${IMAGE}) creation failed!" 97 | exit 1 98 | else 99 | echo "Image (${IMAGE}) creation successful!" 
100 | fi 101 | 102 | if [ "$INSTALL" = "true" ]; then 103 | if [ -f $BINDIR/$IMAGE ]; then 104 | mv $BINDIR/$IMAGE $BINDIR/$IMAGE.bak 105 | fi 106 | echo "Installing ${IMAGE} into $BINDIR" 107 | mv $IMAGE $BINDIR 108 | fi 109 | -------------------------------------------------------------------------------- /scripts/singularity/singularitybuilder-arch.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM archlinux/base 2 | 3 | ARG version=2.6.1 4 | 5 | RUN curl -o /etc/pacman.d/mirrorlist "https://www.archlinux.org/mirrorlist/?country=GB&protocol=https&use_mirror_status=on" &&\ 6 | sed -i 's/^#//' /etc/pacman.d/mirrorlist &&\ 7 | pacman --noconfirm -Syyu base-devel wget python squashfs-tools debootstrap 8 | 9 | RUN mkdir /tmp/singularity &&\ 10 | cd /tmp/singularity &&\ 11 | wget "https://github.com/singularityware/singularity/releases/download/${version}/singularity-${version}.tar.gz" &&\ 12 | tar -xvzf singularity-${version}.tar.gz &&\ 13 | cd singularity-${version} &&\ 14 | ./configure --prefix=/usr/local &&\ 15 | make &&\ 16 | sudo make install 17 | -------------------------------------------------------------------------------- /scripts/singularity/singularitybuilder-arch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | set -o errexit 3 | set -o nounset 4 | 5 | if [ "$#" -lt 2 ]; then 6 | echo "Usage: $0 <imgfile> <deffile>" 7 | exit 1 8 | fi 9 | 10 | IMGFILE=$(realpath $1) 11 | IMGDIR=$(dirname "$IMGFILE") 12 | DEFFILE=$(realpath $2) 13 | DEFDIR=$(dirname "$DEFFILE") 14 | 15 | if [ -f "$IMGFILE" ]; then 16 | rm "$IMGFILE" 17 | fi 18 | 19 | docker build --tag=singularitybuilder-arch -f singularitybuilder-arch.Dockerfile . 
20 | docker run -v "$IMGDIR:$IMGDIR" -v "$DEFDIR:$DEFDIR" --privileged singularitybuilder-arch:latest singularity build "$IMGFILE" "$DEFFILE" 21 | -------------------------------------------------------------------------------- /scripts/update/global_update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # m h dom mon dow command 3 | # 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null)) 4 | # 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh 5 | 6 | ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive} 7 | CRAWLERHOME=${2:-~/ExtensionCrawler} 8 | IMAGE=${3:-/shared/brucker_research1/Shared/BrowserExtensions/bin/ExtensionCrawler.img} 9 | LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"` 10 | mkdir -p $LOGDIR 11 | LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'` 12 | LOG=$LOGPREFIX-global.log 13 | 14 | date --utc +'* Start Updating Extensions Archive (%c)' | tee $LOG 15 | 16 | # Update extensions 17 | (singularity exec --bind /srv/:/srv/ $IMAGE crawler -p 32 -d --pystuck -a $ARCHIVE > $LOGPREFIX.log ) |& ts '%Y-%m-%dT%H:%M:%S' | tee $LOGPREFIX-stderr.log 18 | 19 | date --utc +'* Update Finished (%c)' | tee -a $LOG 20 | 21 | 22 | ERRORS=`grep ERROR $LOGPREFIX.log | sort -k 5,5 -u | wc -l` 23 | EXTENSIONS=`grep "Updating db" $LOGPREFIX.log | wc -l` 24 | echo "ERROR LOG: $ERRORS (out of $EXTENSIONS)" 25 | echo "==========" 26 | grep ERROR $LOGPREFIX.log | sort -k 5,5 -u | sort -k 3,3 27 | -------------------------------------------------------------------------------- /scripts/update/update_cdnjs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive} 4 | TMPDIR=${TMPDIR:-/tmp} 5 | 6 | LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"` 7 | mkdir -p $LOGDIR 8 | LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'` 9 | 
LOG=$LOGPREFIX-cdnjs.log 10 | 11 | SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img 12 | date --utc +'* Create backup of disk image (%c)' | tee -a $LOG 13 | cp $SING_IMG $SING_IMG.bak 14 | SING_EXEC="singularity exec -w --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG" 15 | ls "$SING_IMG" > /dev/null 16 | 17 | # Update production branch of WebCrawler repository 18 | date --utc +'* Updating WebCrawler repository (%c)' | tee -a $LOG 19 | $SING_EXEC git fetch >> $LOG 20 | $SING_EXEC git checkout production >> $LOG 2>&1 21 | $SING_EXEC git pull >> $LOG 2>&1 22 | # $SING_EXEC pip3 install --system -e ../ExtensionCrawler 23 | 24 | # Update cdnjs git repository and update cdnjs data base table 25 | date --utc +'* Updating CDNJS repository (%c)' | tee -a $LOG 26 | $SING_EXEC ./cdnjs-git-miner -v -u -a /opt/archive >> $LOG 27 | date --utc +'* Successfully updated CDNJS repository (%c)' | tee -a $LOG 28 | 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open('requirements.txt') as f: 4 | requirements = f.read().splitlines() 5 | 6 | setup( 7 | name='Extension Crawler', 8 | description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.', 9 | author='Achim D. 
Brucker, Michael Herzberg', 10 | license='GPL 3.0', 11 | install_requires=requirements 12 | ) 13 | -------------------------------------------------------------------------------- /sge/create-db-cdnjs.sge: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #$ -V 3 | #$ -l rmem=4G 4 | #$ -t 1-10000 5 | #$ -j yes 6 | #$ -o /shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/log 7 | set -o nounset 8 | set -x 9 | 10 | SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img 11 | 12 | SING_EXEC="singularity exec -w --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG" 13 | 14 | printenv 15 | echo "The following parameter were passed: $*" 16 | ls "$SING_IMG" > /dev/null 17 | 18 | /usr/bin/time $SING_EXEC ./cdnjs-git-miner -v -p 1 -i -a /opt/archive -n $SGE_TASK_ID -N 10000 $* 19 | 20 | -------------------------------------------------------------------------------- /sge/create-db.sge: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o nounset 3 | set -o errexit 4 | 5 | printenv 6 | 7 | (set -x; /usr/bin/time singularity exec --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp create-db.img create-db -t 1 -n $SGE_TASK_ID $*) 8 | -------------------------------------------------------------------------------- /sge/create-db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o nounset 3 | set -o errexit 4 | 5 | REMOTE_ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive 6 | REMOTE_TARGET_DIR_PREFIX=/data/\$USER 7 | NUM_THREADS=48 8 | SGE_EXTRA_ARGS='-P rse -l h_rt=01:00:00,rmem=4G,h=\!sharc-node126 -j yes' 9 | PY_EXTRA_ARGS='' 10 | EXTENSION_IDS= 11 | 12 | usage() { 13 | echo "Usage:" 14 | echo " -a (set archive path, default: ${REMOTE_ARCHIVE})" 15 | echo " -t (set target directory, default: ${REMOTE_TARGET_DIR_PREFIX})" 16 | echo " -m (set 
degree of parallelism, default: ${NUM_THREADS})" 17 | echo " -s \"\" (add qsub arguments, default: ${SGE_EXTRA_ARGS})" 18 | echo " -p \"\" (add python script arguments, default: ${PY_EXTRA_ARGS})" 19 | echo " -e (set path to extension id list, default: crawl from archive)" 20 | echo " -l (limit number of sharc tasks, default: number of extensions)" 21 | } 22 | 23 | while getopts ":a:t:s:p:m:e:l:" o; do 24 | case "${o}" in 25 | a) 26 | REMOTE_ARCHIVE=${OPTARG} 27 | ;; 28 | t) 29 | REMOTE_TARGET_DIR_PREFIX=${OPTARG} 30 | ;; 31 | m) 32 | NUM_THREADS=${OPTARG} 33 | ;; 34 | s) 35 | SGE_EXTRA_ARGS+=" ${OPTARG}" 36 | ;; 37 | p) 38 | PY_EXTRA_ARGS+=" ${OPTARG}" 39 | ;; 40 | e) 41 | EXTENSION_IDS="${OPTARG}" 42 | ;; 43 | l) 44 | MAX_TASKS="${OPTARG}" 45 | ;; 46 | *) 47 | usage 48 | exit 1 49 | ;; 50 | esac 51 | done 52 | 53 | shift $((OPTIND-1)) 54 | 55 | BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P ) 56 | TEMP_FOLDER=$(mktemp -d) 57 | TARGETDIR="${REMOTE_TARGET_DIR_PREFIX}/create-db-$(date +%Y%m%d-%H%M%S)" 58 | 59 | echo "Using target dir: $TARGETDIR" 60 | ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs 61 | 62 | echo "Pushing sge script ..." 63 | scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge" 64 | 65 | echo "Building image..." 66 | if [ -f "$BASEDIR/scripts/singularity/create-db.img" ]; then 67 | rm -f "$BASEDIR/scripts/singularity/create-db.img" 68 | fi 69 | ( 70 | cd "$BASEDIR/scripts/singularity" 71 | if [[ "$(docker images -q singularitybuilder-arch 2> /dev/null)" == "" ]]; then 72 | docker build --tag=singularitybuilder -f singularitybuilder-arch.Dockerfile . 73 | fi 74 | docker run -it -v "$(pwd):$(pwd)" -w "$(pwd)" --privileged singularitybuilder-arch:latest singularity build create-db.img ExtensionCrawler.def 75 | ) 76 | 77 | echo "Pushing image..." 78 | scp "$BASEDIR/scripts/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img" 79 | 80 | 81 | if [[ -z $EXTENSION_IDS ]]; then 82 | echo "Gathering extension IDs..." 
83 | ssh sharc.shef.ac.uk find "${REMOTE_ARCHIVE}/data" -name "*.tar" | grep -Po "[a-p]{32}" > ${TEMP_FOLDER}/extension.ids 84 | else 85 | cp "$EXTENSION_IDS" ${TEMP_FOLDER}/extension.ids 86 | fi 87 | 88 | NO_IDS=$(cat ${TEMP_FOLDER}/extension.ids | wc -l) 89 | 90 | echo "Found $NO_IDS IDs!" 91 | if [ "$NO_IDS" = 0 ]; then 92 | echo "Nothing to do!" 93 | exit 0 94 | fi 95 | 96 | echo "Pushing extension IDs..." 97 | scp ${TEMP_FOLDER}/extension.ids sharc.shef.ac.uk:$TARGETDIR/ 98 | 99 | if [[ ! -v MAX_TASKS ]]; then 100 | MAX_TASKS=NO_IDS 101 | fi 102 | 103 | NO_BATCH_JOBS=$(((MAX_TASKS+1)/75000+1)) 104 | JOBS_PER_BATCH=$((MAX_TASKS/NO_BATCH_JOBS+1)) 105 | 106 | for run_no in $(seq 1 $NO_BATCH_JOBS); do 107 | FIRST_ID=$(((run_no-1) * $JOBS_PER_BATCH + 1)) 108 | LAST_ID=$((run_no * $JOBS_PER_BATCH)) 109 | 110 | echo "Starting job $run_no ..." 111 | (set -x; ssh sharc.shef.ac.uk qsub \ 112 | -tc $((NUM_THREADS/NO_BATCH_JOBS)) \ 113 | -t ${FIRST_ID}-${LAST_ID} \ 114 | -wd "$TARGETDIR" \ 115 | -o "$TARGETDIR/logs" \ 116 | ${SGE_EXTRA_ARGS} \ 117 | "$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $MAX_TASKS ${PY_EXTRA_ARGS}) 118 | done 119 | --------------------------------------------------------------------------------