├── .gitignore
├── ExtensionCrawler
├── __init__.py
├── archive.py
├── cdnjs_crawler.py
├── cdnjs_git.py
├── config.py
├── crx.py
├── db.py
├── dbbackend
│ ├── __init__.py
│ ├── mysql_backend.py
│ └── mysql_process.py
├── discover.py
├── file_identifiers.py
├── js_decomposer.py
├── js_mincer.py
├── request_manager.py
└── util.py
├── LICENSE
├── PermissionAnalysis
└── grep-unused-permissions
├── README.md
├── analysis
└── library-detector
│ ├── angular
│ ├── 2018-11-28-results.csv
│ ├── angular.py
│ ├── angularversions.txt
│ ├── ideas.txt
│ └── plotting.py
│ └── jquery.py
├── cdnjs-git-miner
├── crawler
├── create-db
├── crx-extract
├── crx-jsinventory
├── crx-jsstrings
├── crx-tool
├── database
├── README.md
├── config
│ └── my.cnf
├── queries
│ ├── get_added_content_scripts.sql
│ └── get_added_permissions.sql
├── schemas
│ ├── category.sql
│ ├── cdnjs.sql
│ ├── content_script_url.sql
│ ├── crx.sql
│ ├── crxfile.sql
│ ├── extension.sql
│ ├── libdet.sql
│ ├── permission.sql
│ ├── reply.sql
│ ├── reply_comment.sql
│ ├── review.sql
│ ├── review_comment.sql
│ ├── status.sql
│ ├── support.sql
│ └── support_comment.sql
├── scripts
│ ├── mariabackup-full
│ ├── mariabackup-inc
│ ├── mariabackup-schemas
│ └── showgrants
└── views
│ ├── extension_most_recent.sql
│ ├── extension_most_recent_small.sql
│ ├── extension_most_recent_until_date.sql
│ ├── extension_second_most_recent.sql
│ ├── extension_second_most_recent_until_date.sql
│ ├── extension_small.sql
│ └── extension_update.sql
├── extgrep
├── requirements.txt
├── resources
└── js_identifier.json
├── scripts
├── hpc-utilities
│ └── hpc-submit
├── maintainance
│ ├── maintain_archive
│ └── xz.sge
├── monitoring
│ ├── download-report-one-week.gp
│ └── global_update_monitor.sh
├── singularity
│ ├── ExtensionCrawler.def
│ ├── build.sh
│ ├── singularitybuilder-arch.Dockerfile
│ └── singularitybuilder-arch.sh
└── update
│ ├── global_update.sh
│ └── update_cdnjs.sh
├── setup.py
├── sge
├── create-db-cdnjs.sge
├── create-db.sge
└── create-db.sh
└── simhashbucket
/.gitignore:
--------------------------------------------------------------------------------
1 | # ---> Python
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # vi
62 | *.swp
63 |
64 | # vscode
65 | .vscode
66 |
67 | archive
68 | .ropeproject
69 | ExtensionCrawler.img
70 | ExtensionCrawler-cdnjs.img
71 |
72 | .idea
73 | venv
74 |
--------------------------------------------------------------------------------
/ExtensionCrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logicalhacking/ExtensionCrawler/853d69d1a3478eaa3b8649f9dd754a044a561cc5/ExtensionCrawler/__init__.py
--------------------------------------------------------------------------------
/ExtensionCrawler/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | """Global configuration of the Extension Crawler and related tools."""
18 |
19 | import os
20 | import json
21 |
22 |
def const_sitemap_url():
    """Return the URL of the Chrome web store sitemap."""
    sitemap = "https://chrome.google.com/webstore/sitemap"
    return sitemap
26 |
27 |
def const_sitemap_scheme():
    """Return the XML namespace URI of the sitemap schema."""
    scheme = "http://www.sitemaps.org/schemas/sitemap/0.9"
    return scheme
31 |
32 |
def const_overview_url(ext_id):
    """Return the overview-page URL for the extension with id `ext_id`."""
    return f'https://chrome.google.com/webstore/detail/{ext_id}'
36 |
37 |
def const_store_url():
    """Return the main URL of the Chrome web store."""
    store = 'https://chrome.google.com/webstore'
    return store
41 |
42 |
def const_review_url():
    """Return the base URL of the review endpoint of an extension.

    Note: this is the same endpoint as const_support_url(); reviews and
    support threads share the Google components API.
    """
    return 'https://chrome.google.com/reviews/components'
46 |
47 |
def const_review_search_url():
    """Return the base URL of the review search (JSON) endpoint."""
    endpoint = 'https://chrome.google.com/reviews/json/search'
    return endpoint
51 |
52 |
def const_support_url():
    """Return the base URL for support pages.

    Shares the components endpoint with const_review_url().
    """
    return 'https://chrome.google.com/reviews/components'
56 |
57 |
def const_download_url():
    """Return the crx download URL template ({} is the extension id)."""
    return ('https://clients2.google.com/service/update2/crx'
            '?response=redirect&nacl_arch=x86-64'
            '&prodversion=9999.0.9999.0&x=id%3D{}%26uc')
63 |
64 |
def const_categories():
    """Return the list of known Chrome store category identifiers."""
    return [
        'extensions',
        'ext/22-accessibility',
        'ext/10-blogging',
        'ext/15-by-google',
        'ext/11-web-development',
        'ext/14-fun',
        'ext/6-news',
        'ext/28-photos',
        'ext/7-productivity',
        'ext/38-search-tools',
        'ext/12-shopping',
        'ext/1-communication',
        'ext/13-sports',
    ]
74 |
75 |
def const_support_payload(ext_id, start, end):
    """Build the POST payload for fetching support-forum pages.

    :param ext_id: 32-char extension id.
    :param start: index of the first thread to fetch.
    :param end: number of results to fetch.
    """
    # Doubled braces are literal braces in the resulting JSON-ish payload.
    template = (
        'req={{ "appId":94,"version":"150922","hl":"en",'
        '"specs":[{{"type":"CommentThread",'
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        '"groups":"chrome_webstore_support",'
        '"startindex":"{}","numresults":"{}","id":"379"}}],'
        '"internedKeys":[],"internedValues":[]}}'
    )
    return template.format(ext_id, start, end)
85 |
86 |
def const_review_payload(ext_id, start, end):
    """Build the POST payload for fetching review pages.

    :param ext_id: 32-char extension id.
    :param start: index of the first review to fetch.
    :param end: number of results to fetch.
    """
    # Doubled braces are literal braces in the resulting JSON-ish payload.
    template = (
        'req={{ "appId":94,"version":"150922","hl":"en",'
        '"specs":[{{"type":"CommentThread",'
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        '"groups":"chrome_webstore","sortby":"cws_qscore",'
        '"startindex":"{}","numresults":"{}","id":"428"}}],'
        '"internedKeys":[],"internedValues":[]}}'
    )
    return template.format(ext_id, start, end)
97 |
98 |
def const_review_search_payload(params):
    """Build the POST payload for review searches.

    :param params: iterable of (extid, author, start, numresults, groups)
        tuples; one search spec is emitted per tuple.
    """
    # Doubled braces are literal braces in the resulting JSON-ish payload.
    spec_template = (
        '{{"requireComment":true,"entities":[{{"annotation":'
        '{{"groups":{},"author":"{}",'
        '"url":"http://chrome.google.com/extensions/permalink?id={}"}}}}],'
        '"matchExtraGroups":true,"startIndex":{},"numResults":{},'
        '"includeNicknames":true,"locale": {{"language": "en","country": "us"}}}}'
    )
    specs = [
        spec_template.format(json.dumps(groups), author, extid, start, numresults)
        for extid, author, start, numresults, groups in params
    ]
    return ('req={"applicationId":94,"searchSpecs":['
            + ",".join(specs)
            + ']}&requestSource=widget')
115 |
116 |
def get_local_archive_dir(ext_id):
    """Return the archive subdirectory for an extension: the first three
    characters of its id (used to shard the archive tree)."""
    return str(ext_id[:3])
120 |
121 |
def archive_file(archivedir, ext_id):
    """Return the path of the .tar archive of an extension.

    Extensions are sharded into subdirectories named after the first
    three characters of the extension id.
    """
    return os.path.join(str(archivedir), ext_id[:3], ext_id + ".tar")
126 |
127 |
def const_basedir():
    """Top-level directory for the extension crawler archive.

    Defaults to "archive", overridable via the EXTENSION_ARCHIVE
    environment variable.
    """
    return os.environ.get("EXTENSION_ARCHIVE", "archive")
134 |
135 |
def const_parallel_downloads():
    """Number of parallel extension downloads."""
    workers = 36
    return workers
139 |
140 |
def const_verbose():
    """Default verbosity of the crawler."""
    verbose_by_default = True
    return verbose_by_default
144 |
145 |
def const_use_process_pool():
    """Whether to use a ProcessPool (from module 'pebble') for concurrency."""
    use_processes = False
    return use_processes
149 |
150 |
def const_log_format(ext_id="-" * 32):
    """Return the logging format string with `ext_id` embedded.

    The %(...)s placeholders are left for the logging module to fill in;
    only the extension id is substituted here.
    """
    return f"%(process)6s %(asctime)s %(levelname)8s {ext_id} %(message)s"
153 |
154 |
def const_discover():
    """Default configuration of discovery mode (disabled by default)."""
    discover_by_default = False
    return discover_by_default
158 |
159 |
def const_ext_timeout():
    """Timeout in seconds for downloading an individual extension (2 hours)."""
    hours = 2
    return hours * 60 * 60
163 |
164 |
def const_mysql_config_file():
    """Return the path of the user's MySQL defaults file (~/.my.cnf)."""
    return os.path.expanduser("~/.my.cnf")
167 |
168 |
def const_mysql_maxtries():
    """Maximum number of attempts for a MySQL operation before giving up."""
    maxtries = 12
    return maxtries
171 |
172 |
def const_mysql_try_wait():
    """Base wait time in seconds between MySQL retries (5 minutes)."""
    return 5 * 60
175 |
--------------------------------------------------------------------------------
/ExtensionCrawler/crx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | """Utility functions for working with Chrome extensions archives,
19 | i.e., *.crx files."""
20 |
21 | import io
22 | import os
23 | import zipfile
24 | import binascii
25 | from Cryptodome.PublicKey import RSA
26 | from Cryptodome.Hash import SHA
27 | from Cryptodome.Signature import PKCS1_v1_5
28 |
29 |
class CrxFile:
    """Plain record holding the parsed header fields and zip payload of a
    .crx file (see https://developer.chrome.com/extensions/crx)."""

    def __init__(self, filename, magic, version, public_key_len, sig_len,
                 public_key, sig, header_len, data):
        self.file = filename
        # Header fields, stored in on-disk order.
        self.magic = magic                    # b'Cr24' for valid files
        self.version = version
        self.header_len = header_len          # total header size in bytes
        self.public_key_len = public_key_len
        self.public_key = public_key
        self.sig_len = sig_len
        self.sig = sig
        # Zip archive that follows the header.
        self.data = data
44 |
45 |
def is_valid_magic(magic):
    """Check whether `magic` equals the magic bytes of the crx specification."""
    expected = b'Cr24'
    return magic == expected
49 |
50 |
def is_crxfile(filename):
    """Check the magic number: crx files should start with b"Cr24".

    :param filename: path of the candidate file.
    :return: True iff the first four bytes match the crx magic.
    """
    # 'with' guarantees the handle is closed even if read() raises
    # (the original leaked the handle on error).
    with open(filename, 'rb') as file:
        magic = file.read(4)
    return is_valid_magic(magic)
57 |
58 |
def check_signature(public_key, sig, data):
    """Check validity of the PKCS#1 v1.5 signature contained in a crx file.

    :param public_key: DER-encoded RSA public key from the crx header.
    :param sig: signature bytes from the crx header.
    :param data: zip payload covered by the signature.
    :return: True iff the signature verifies.
    """
    digest = SHA.new(data)
    verifier = PKCS1_v1_5.new(RSA.importKey(public_key))
    return verifier.verify(digest, sig)
64 |
65 |
def read_crx(filename):
    """Read the header and payload of a crx file
    (https://developer.chrome.com/extensions/crx).

    :param filename: path of the crx file.
    :return: a CrxFile record with all header fields and the zip payload.
    """
    # 'with' guarantees the handle is closed even if a read raises
    # (the original leaked the handle on error).
    with open(filename, 'rb') as file:
        magic = file.read(4)
        version = int.from_bytes(file.read(4), byteorder='little')
        public_key_len = int.from_bytes(file.read(4), byteorder='little')
        sig_len = int.from_bytes(file.read(4), byteorder='little')
        public_key = file.read(public_key_len)
        sig = file.read(sig_len)
        # Fixed 16-byte preamble (magic, version, two length fields)
        # plus the variable-length key and signature.
        header_len = 16 + public_key_len + sig_len
        data = file.read()
    return CrxFile(filename, magic, version, public_key_len, sig_len,
                   public_key, sig, header_len, data)
80 |
81 |
def print_crx_info(verbose, crx):
    """Print information extracted from a crx file.

    :param verbose: additionally print the raw signature bytes.
    :param crx: a CrxFile record as returned by read_crx().
    """
    magic = "valid" if is_valid_magic(crx.magic) else "invalid"
    sig = "valid" if check_signature(crx.public_key, crx.sig, crx.data) else "invalid"
    print(f"Filename: {crx.file}")
    print(f"Header size: {crx.header_len}")
    print(f"Size: {crx.header_len + len(crx.data)}")
    print(f"Magic byte: {crx.magic.decode('utf-8')} ({magic})")
    print(f"Version: {crx.version}")
    print(f"Signature: {sig}")
    print(f"Public Key [{crx.public_key_len}]:")
    key = RSA.importKey(crx.public_key)
    print(key.exportKey().decode("utf-8"))
    if verbose:
        print(f"Signature [{crx.sig_len}]: {binascii.hexlify(crx.sig)}")
    ziparchive = zipfile.ZipFile(io.BytesIO(crx.data), 'r')
    print("Zip content:")
    for info in ziparchive.infolist():
        print(f'{info.file_size:8d} {info.compress_size:8d}', info.filename)
111 |
112 |
def verify_crxfile(verbose, filename):
    """Verify integrity of a crx file.

    :return: 0 when the magic bytes check out (info is printed), -1 otherwise.
    """
    # Guard clause: bail out early on files without the crx magic.
    if not is_crxfile(filename):
        if verbose:
            print("No valid magic bytes found")
        return -1
    if verbose:
        print("Found correct magic bytes.")
    print_crx_info(verbose, read_crx(filename))
    return 0
124 |
125 |
def extract_crxfile(verbose, force, filename, destdir):
    """Extract the zip payload of a crx file into `destdir`.

    :param verbose: print the target directory on success.
    :param force: extract even when the magic bytes are invalid.
    :param filename: path of the crx file.
    :param destdir: target directory; "" or None mean the current directory.
    :return: 0 on success, -1 if the input is not a valid crx file.
    """
    crx = read_crx(filename)
    # Logical 'or' instead of the original bitwise '|': same result on
    # booleans, but short-circuits and states the intent.
    if is_valid_magic(crx.magic) or force:
        if destdir is None or destdir == "":
            destdir = "."
        if filename.endswith(".crx"):
            # Strip the directory part and the ".crx" suffix.
            dirname = os.path.basename(filename)[:-4]
        else:
            dirname = filename
        target = destdir + "/" + dirname
        # Close the archive deterministically instead of leaking it.
        with io.BytesIO(crx.data) as out:
            with zipfile.ZipFile(out, 'r') as ziparchive:
                ziparchive.extractall(target)
        if verbose:
            print("Content extracted into: " + target)
        return 0
    print("Input file not valid.")
    return -1
146 |
--------------------------------------------------------------------------------
/ExtensionCrawler/dbbackend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logicalhacking/ExtensionCrawler/853d69d1a3478eaa3b8649f9dd754a044a561cc5/ExtensionCrawler/dbbackend/__init__.py
--------------------------------------------------------------------------------
/ExtensionCrawler/dbbackend/mysql_backend.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2017 The University of Sheffield, UK
3 | #
4 | # This program is free software: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation, either version 3 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | #
17 |
18 | import time
19 | import datetime
20 | from collections import OrderedDict
21 | from random import uniform
22 | import sys
23 | import configparser
24 |
25 | import MySQLdb
26 | import MySQLdb._exceptions
27 |
28 | import ExtensionCrawler.config as config
29 | from ExtensionCrawler.util import log_info, log_error, log_warning
30 |
31 |
class MysqlBackend:
    """Buffered, retrying MySQL/MariaDB storage backend.

    Rows handed to insert()/insertmany() are cached per table and written
    in batches (once 128 rows are pending, and on context exit).  Every
    statement is retried up to `maxtries` times on operational errors,
    sleeping roughly `try_wait` seconds (+/- 20%) between attempts.
    """

    def __init__(self, ext_id, delayed=False, cache_etags=False, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
                 **kwargs):
        self.ext_id = ext_id
        # When True, use MySQL's INSERT DELAYED instead of an upsert.
        self.delayed = delayed
        # When True, remember crx_etags of inserted "extension" rows.
        self.cache_etags = cache_etags
        # Remaining keyword arguments are passed to MySQLdb.connect().
        self.dbargs = kwargs
        self.try_wait = try_wait
        self.maxtries = maxtries
        self.cache = {}           # table name -> list of pending row dicts
        self.crx_etag_cache = {}  # (extid, date) -> crx_etag
        self.db = None
        self.cursor = None

        # For more info, see https://jira.mariadb.org/browse/CONC-359
        self._fix_missing_host(self.dbargs)

    def _fix_missing_host(self, dbargs):
        """Copy the host setting from the defaults file into `dbargs`.

        Works around https://jira.mariadb.org/browse/CONC-359, where the
        connector ignores the host given in the defaults file.
        """
        if "host" in dbargs:
            return

        if "read_default_file" not in dbargs:
            return

        # Named 'cfg' so it does not shadow the 'ExtensionCrawler.config'
        # module imported as 'config' at module level (the original did).
        cfg = configparser.ConfigParser()
        cfg.read(dbargs["read_default_file"])
        if "host" not in cfg["client"]:
            return
        dbargs["host"] = cfg["client"]["host"]

    def __enter__(self):
        # We open a connection once we actually need it
        return self

    def __exit__(self, *args):
        # Flush all pending rows before closing the connection.
        for table, arglist in self.cache.items():
            self._do_insert(table, arglist)
            self.cache[table] = []
        self._close_conn()

    def _get_column_names(self, table):
        """Return the column names of `table` in the current schema."""
        # Plain string (the original carried a spurious f-prefix); the
        # table name is passed as a bound parameter.
        self.cursor.execute("select column_name from information_schema.columns where table_schema=database() and table_name=%s", (table,))
        return [row[0] for row in self.cursor.fetchall()]

    def _do_insert(self, table, arglist):
        """Write a batch of row dicts to `table` with one executemany()."""
        if len(arglist) == 0:
            return
        # Deterministic (primary-key) order reduces deadlocks between
        # concurrent writers.
        sorted_arglist = self.sort_by_primary_key(table, arglist)
        args = [tuple(arg.values()) for arg in sorted_arglist]

        if self.delayed:
            query = "INSERT DELAYED INTO {}({}) VALUES ({})".format(
                table,
                ",".join(sorted_arglist[0].keys()),
                ",".join(len(args[0]) * ["%s"]))
        else:
            column_names = self.retry(lambda: self._get_column_names(table))
            # Also touch last_modified (when present) so the timestamp is
            # refreshed even if all other values are unchanged.
            if "last_modified" in column_names:
                additional_columns = ["last_modified"]
            else:
                additional_columns = []
            # Looks like this, for example:
            # INSERT INTO category VALUES(extid,date,category) (%s,%s,%s)
            # ON DUPLICATE KEY UPDATE extid=VALUES(extid),date=VALUES(date)
            # ,category=VALUES(category)
            query = "INSERT INTO {}({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}".format(
                table,
                ",".join(sorted_arglist[0].keys()),
                ",".join(len(args[0]) * ["%s"]),
                ",".join(
                    ["{c}=VALUES({c})".format(c=c) for c in list(sorted_arglist[0].keys()) + additional_columns]))
        start = time.time()
        self.retry(lambda: self.cursor.executemany(query, args))
        log_info("* Inserted {} bytes into {}, taking {}.".format(sum([sys.getsizeof(arg) for arg in args]),
                                                                  table, datetime.timedelta(seconds=int(time.time() - start))), 3)

    def _create_conn(self):
        """Lazily open the database connection and cursor."""
        if self.db is None:
            log_info("* self.db is None, open new connection ...", 3)
            self.db = MySQLdb.connect(**self.dbargs)
            self.db.autocommit(True)
            log_info("* success", 4)
        if self.cursor is None:
            log_info("* self.cursor is None, assigning new cursor ...", 3)
            self.cursor = self.db.cursor()
            log_info("* success", 4)

    def _close_conn(self):
        """Close cursor and connection (idempotent)."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.db is not None:
            self.db.close()
            self.db = None

    def retry(self, f):
        """Run f(), reconnecting and retrying on OperationalError.

        Retries up to self.maxtries times, sleeping self.try_wait seconds
        +/- 20% between attempts; re-raises the last error when exhausted.
        """
        for t in range(self.maxtries):
            try:
                self._create_conn()
                return f()
            except MySQLdb._exceptions.OperationalError as e:
                last_exception = e

                try:
                    self._close_conn()
                except Exception as e2:
                    log_error("Suppressed exception: {}".format(str(e2)), 3)

                if t + 1 == self.maxtries:
                    log_error("MySQL connection eventually failed, closing connection!", 3)
                    raise last_exception
                else:
                    factor = 0.2
                    # Typo fix: "wating" -> "waiting".
                    logmsg = ("Exception ({}) on mysql connection attempt "
                              "{} of {}, waiting {}s +/- {}% before retrying..."
                              ).format(str(e),
                                       t + 1,
                                       self.maxtries,
                                       self.try_wait, factor * 100)
                    log_warning(logmsg, 3)
                    time.sleep(self.try_wait * uniform(
                        1 - factor, 1 + factor))

    def get_single_value(self, query, args):
        """Execute `query` and return the first column of the first row,
        or None when the query yields no rows."""
        self.retry(lambda: self.cursor.execute(query, args))

        result = self.retry(lambda: self.cursor.fetchone())
        if result is not None:
            return result[0]
        else:
            return None

    def sort_by_primary_key(self, table, arglist):
        """Sort rows by primary key and move key columns to the front.

        A deterministic insert order avoids deadlocks; the column shuffle
        gives all rows of the batch a consistent dict ordering.
        """
        self.retry(lambda: self.cursor.execute(f"SHOW KEYS FROM {table} WHERE Key_name = 'PRIMARY'"))
        # Column 5 of SHOW KEYS is Column_name.
        primary_keys = [row[4] for row in self.cursor.fetchall()]

        sorted_arglist = sorted(arglist, key=lambda x: [x[pk] for pk in primary_keys])

        def arglist_shuffler(x):
            # Key columns sort first (by key position); all others last.
            try:
                return primary_keys.index(x)
            except ValueError:
                return len(primary_keys)
        shuffled_arglist = [OrderedDict(sorted(arg.items(), key=lambda x: arglist_shuffler(x[0]))) for arg in sorted_arglist]
        return shuffled_arglist

    def insertmany(self, table, arglist):
        """Queue row dicts for `table`; flush once 128+ rows are pending."""
        if table not in self.cache:
            self.cache[table] = []
        self.cache[table] += arglist
        if len(self.cache[table]) >= 128:
            self._do_insert(table, self.cache[table])
            self.cache[table] = []
        if self.cache_etags and table == "extension":
            for arg in arglist:
                self.crx_etag_cache[(arg["extid"], arg["date"])] = arg["crx_etag"]

    def insert(self, table, **kwargs):
        """Queue a single row given as keyword arguments."""
        self.insertmany(table, [kwargs])

    def get_etag(self, extid, date):
        """Return the cached crx_etag for (extid, date), or None."""
        if (extid, date) in self.crx_etag_cache:
            return self.crx_etag_cache[(extid, date)]
        else:
            return None

    def get_cdnjs_info(self, md5):
        """Look up the cdnjs library entry for a file hash, or None."""
        query = """SELECT library, version, filename, add_date, typ from cdnjs where md5=%s"""
        args = [md5]
        self.retry(lambda: self.cursor.execute(query, args))
        result = self.retry(lambda: self.cursor.fetchone())
        return result
207 |
208 |
def convert_date(date):
    """Drop the last six characters of a date string.

    NOTE(review): presumably strips a trailing UTC offset such as
    "+00:00" — confirm against callers.
    """
    trimmed = date[:-6]
    return trimmed
211 |
--------------------------------------------------------------------------------
/ExtensionCrawler/dbbackend/mysql_process.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2017 The University of Sheffield, UK
3 | #
4 | # This program is free software: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation, either version 3 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | #
17 |
18 | from multiprocessing import Process, Manager
19 |
20 | from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
21 | from ExtensionCrawler.util import setup_logger, log_exception
22 |
class MysqlProxy:
    """Queue-backed stand-in for MysqlBackend.

    Forwards inserts to the writer process through a queue instead of
    talking to the database directly.
    """

    def __init__(self, q):
        self.q = q

    def insertmany(self, table, arglist):
        # Ship the raw arguments to the consumer process (see run()).
        message = (MysqlProcessBackend.INSERT, (table, arglist))
        self.q.put(message)

    def insert(self, table, **kwargs):
        self.insertmany(table, [kwargs])

    def get_cdnjs_info(self, md5):
        # cdnjs lookups are not supported through the proxy.
        return None
35 |
36 |
def run(mysql_kwargs, q):
    """Writer-process loop: pop (cmd, data) tuples from `q` and forward
    inserts to a MysqlBackend until the STOP sentinel arrives.

    On error the remaining queue is drained (up to STOP) so the producer
    is never left blocked on a full queue.
    """
    setup_logger(True)
    finished = False

    try:
        with MysqlBackend(None, **mysql_kwargs) as db:
            while True:
                cmd, data = q.get()
                if cmd == MysqlProcessBackend.STOP:
                    finished = True
                    break
                if cmd == MysqlProcessBackend.INSERT:
                    db.insertmany(*data)
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # still terminate the process instead of being swallowed.
    except Exception:
        log_exception("Stopping Mysql backend and emptying queue...")
        if not finished:
            while True:
                cmd, data = q.get()
                if cmd == MysqlProcessBackend.STOP:
                    break
                if cmd == MysqlProcessBackend.INSERT:
                    # Discard pending inserts; only the STOP sentinel matters.
                    pass
59 |
60 |
class MysqlProcessBackend:
    """Context manager running a MysqlBackend in a separate process.

    __enter__ starts the writer process and returns a MysqlProxy; inserts
    travel through a managed queue and are executed asynchronously.
    __exit__ sends the STOP sentinel and waits for the writer to finish.
    """

    # Commands understood by the writer loop (see run()).
    STOP = "stop"
    INSERT = "insert"

    def __init__(self, ext_id, **mysql_kwargs):
        # NOTE(review): ext_id is accepted but never stored — presumably
        # kept for interface parity with MysqlBackend; confirm.
        self.mysql_kwargs = mysql_kwargs
        # Manager queue so the writer process and callers share it safely.
        self.m = Manager()
        self.queue = self.m.Queue()

    def __enter__(self):
        self.p = Process(target=run, args=(self.mysql_kwargs, self.queue))
        self.p.start()
        return MysqlProxy(self.queue)

    def __exit__(self, *args):
        # Ask the writer to stop, then wait until it has drained and exited.
        self.queue.put((MysqlProcessBackend.STOP, None))
        self.p.join()
78 |
--------------------------------------------------------------------------------
/ExtensionCrawler/discover.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | """Python module providing methods for discovering extensions in the
18 | Chrome extension store."""
19 |
20 | from xml.etree.ElementTree import fromstring
21 | import re
22 | import requests
23 | from pebble import ThreadPool
24 | from ExtensionCrawler import config
25 |
26 |
def get_inner_elems(doc):
    """Iterate over the namespaced <loc> elements of a sitemap document."""
    # Braces around the namespace URI follow ElementTree's {uri}tag syntax.
    xpath = r".//{{{}}}loc".format(config.const_sitemap_scheme())
    return fromstring(doc).iterfind(xpath)
31 |
32 |
def is_generic_url(url):
    """Check if URL is a generic extension URL.

    The urls with a language parameter attached return a subset of the
    ids that get returned by the plain urls, therefore we skip urls with
    a language parameter.

    (The explanation above was previously split across bare string
    statements, which are dead code, not part of the docstring.)
    """
    return re.match(r"^{}\?shard=\d+&numshards=\d+$".format(
        config.const_sitemap_url()), url)
41 |
42 |
def iterate_shard(shard_url):
    """Yield the 32-character extension ids listed in one sitemap shard.

    Non-generic shard URLs (e.g. with a language parameter) yield nothing.
    """
    if not is_generic_url(shard_url):
        return
    shard = requests.get(shard_url, timeout=10).text
    for inner_elem in get_inner_elems(shard):
        # Extension ids are 32 lowercase letters embedded in the URL.
        yield re.search("[a-z]{32}", inner_elem.text).group(0)
49 |
50 |
def process_shard(shard_url):
    """Materialize one shard's extension ids as a list (so the result can
    travel back from a pool worker)."""
    return list(iterate_shard(shard_url))
53 |
54 |
def get_new_ids(known_ids, max_ids=None):
    """Crawl extension ids available in Chrome store.

    Yields ids found in the store sitemap that are not already contained
    in known_ids; stops after max_ids new ids when max_ids is given.
    """

    # All shard URLs listed in the top-level sitemap.
    shard_urls = [shard_elem.text for shard_elem in get_inner_elems(
        requests.get(config.const_sitemap_url(), timeout=10).text)]
    with ThreadPool(16) as pool:
        # chunksize=1 so results stream back shard by shard.
        future = pool.map(process_shard, shard_urls, chunksize=1)
        iterator = future.result()

        returned_ids = 0
        while True:
            try:
                # next() raises StopIteration once all shards are consumed.
                for extid in next(iterator):
                    if extid not in known_ids:
                        yield extid
                        returned_ids += 1
                        if max_ids is not None and returned_ids >= max_ids:
                            # Abort the remaining shard downloads early.
                            pool.stop()
                            return
            except StopIteration:
                return
76 |
--------------------------------------------------------------------------------
/ExtensionCrawler/file_identifiers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | """ Module for obtaining (normalized) hashes for files."""
19 |
20 | import hashlib
21 | import os
22 | import re
23 | import zlib
24 | import mimetypes
25 | from io import StringIO
26 | from simhash import Simhash
27 |
28 | import cchardet as chardet
29 | import magic
30 |
31 | from ExtensionCrawler.js_mincer import mince_js
32 |
33 |
def is_binary_resource(mimetype_magic):
    """Return True for media/PDF mimetypes that are skipped during
    (expensive) content analysis."""
    media_prefixes = ("image/", "video/", "audio/")
    return (mimetype_magic.startswith(media_prefixes)
            or mimetype_magic == "application/pdf")
39 |
40 |
def normalize_jsdata(str_data):
    """Compute normalized code blocks of a JavaScript file.

    Keeps only code blocks produced by mince_js (comments dropped) and
    strips per-line whitespace.

    :return: tuple of (normalized text encoded as bytes, lines of code).
    """
    parts = []
    loc = 0
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if not block.is_code():
                continue
            for line in block.content.splitlines():
                parts.append(line.strip())
                loc += 1
    return "".join(parts).encode(), loc
52 |
53 |
def get_features(s):
    """Generate the 3-gram shingles of a string for simhashing.

    The string is lowercased and stripped of non-word characters first;
    for strings shorter than 3 characters a single (short) shingle is
    produced.
    """
    width = 3
    normalized = re.sub(r'[^\w]+', '', s.lower())
    span = max(len(normalized) - width + 1, 1)
    return (normalized[i:i + width] for i in range(span))
60 |
61 |
def get_simhash(encoding, data):
    """Compute the 64-bit simhash of (possibly binary) text.

    :param encoding: detected character encoding, or None for raw data.
    :param data: bytes to hash.
    """
    if encoding is None:
        str_data = str(data)
    else:
        # VISCII is not supported by python, UTF-8 parses at least the
        # for us important parts
        if encoding == "VISCII":
            encoding = "UTF-8"
        str_data = data.decode(encoding=encoding, errors="replace")
    return Simhash(get_features(str_data)).value
73 |
74 |
def compute_difference(hx, hy):
    """Compute the Hamming distance between two 64-bit simhashes."""
    assert hx.bit_length() == hy.bit_length()
    diff = (hx ^ hy) & ((1 << 64) - 1)
    # Population count of the XOR: number of differing bit positions.
    return bin(diff).count("1")
84 |
85 |
def _magic_description(data):
    """Return libmagic's textual description of `data`.

    Works around a libmagic quirk where a spurious " name use count..."
    suffix is appended to the error message; any other MagicException is
    re-raised (bare `raise` preserves the original traceback).
    """
    try:
        return magic.from_buffer(data)
    except magic.MagicException as exp:
        rgx = re.compile(r' name use count.*$')
        msg = str(exp.message)
        if re.search(rgx, msg):
            return re.sub(rgx, '', msg)
        raise


def get_data_identifiers(data):
    """Get basic data identifiers (size, hashes, normalized hashes, etc.).

    :param data: file content as bytes.
    :return: dict of identifiers; for non-binary content the
             "normalized_*" entries describe the whitespace/comment
             stripped JavaScript content (left None when not computable).
    """

    data_identifier = {
        'encoding': None,
        'description': None,
        'size': None,
        'loc': None,
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size_stripped': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_loc': None,
        'normalized_mimetype_magic': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }

    mimetype_magic = magic.from_buffer(data, mime=True)
    # The MagicException workaround was duplicated twice in the original;
    # it now lives in _magic_description.
    magic_desc = _magic_description(data)

    data_identifier['mimetype_magic'] = mimetype_magic
    data_identifier['md5'] = hashlib.md5(data).digest()
    data_identifier['sha1'] = hashlib.sha1(data).digest()
    data_identifier['sha256'] = hashlib.sha256(data).digest()
    data_identifier['size'] = len(data)
    data_identifier['description'] = magic_desc

    # We don't continue here with binary files, as that consumes too many
    # resources.
    if is_binary_resource(mimetype_magic):
        return data_identifier

    encoding = chardet.detect(data)['encoding']

    data_identifier['simhash'] = get_simhash(encoding, data)
    data_identifier['size_stripped'] = len(data.strip())
    data_identifier['loc'] = len(data.splitlines())
    data_identifier['encoding'] = encoding
    try:
        # Best-effort: fails (and is skipped) e.g. when encoding is None.
        normalized_data, normalized_loc = normalize_jsdata(
            data.decode(encoding=data_identifier['encoding'], errors="replace"))
    except Exception:
        normalized_data = None
        normalized_loc = 0

    if normalized_data is not None:
        normalized_magic_desc = _magic_description(normalized_data)
        normalized_encoding = chardet.detect(normalized_data)['encoding']
        data_identifier['normalized_encoding'] = normalized_encoding
        data_identifier['normalized_description'] = normalized_magic_desc
        data_identifier['normalized_size'] = len(normalized_data)
        data_identifier['normalized_loc'] = normalized_loc
        data_identifier['normalized_mimetype_magic'] = magic.from_buffer(normalized_data, mime=True)
        data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
        data_identifier['normalized_sha1'] = hashlib.sha1(
            normalized_data).digest()
        data_identifier['normalized_sha256'] = hashlib.sha256(
            normalized_data).digest()
        data_identifier['normalized_simhash'] = get_simhash(
            normalized_encoding, normalized_data)
    return data_identifier
174 |
175 |
def get_file_identifiers(path, data=None):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    If *data* is None, the file content is read from *path*.  For
    gzip-compressed content (as reported by libmagic) the decompressed
    payload is analyzed as well; its identifiers are stored under keys
    with a ``dec_`` prefix.
    """
    # Placeholder results for the decompressed payload; only filled in when
    # the content is gzip-compressed and decompression succeeds.
    dec_data_identifier = {
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size': None,
        'size_stripped': None,
        'loc': None,
        'description': None,
        'encoding': None,
        'normalized_mimetype_magic': None,
        'normalized_loc': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }
    if data is None:
        with open(path, 'rb') as fileobj:
            data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # Cap the decompressed output at 100x the compressed size to
            # defend against zip bombs.
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            # Record the failure instead of aborting.  (Fix: the message
            # previously read "Exception during compression (likely
            # zip-bomb:" -- wrong word and unbalanced parenthesis.)
            dec_data_identifier['description'] = (
                "Exception during decompression (likely zip-bomb): " + str(e))

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    # Export all data identifiers as-is, and the identifiers of the
    # decompressed payload under a 'dec_' prefix.
    file_identifier.update(data_identifier)
    file_identifier.update(
        ('dec_' + key, value) for key, value in dec_data_identifier.items())

    return file_identifier
300 |
--------------------------------------------------------------------------------
/ExtensionCrawler/js_mincer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | """ A mostly correct JavaScript analyzer that separates comments from code. The
18 | implementation prioritizes speed over correctness. """
19 |
20 | from enum import Enum
21 |
22 |
class JsBlockType(Enum):
    """Enumeration of the different JavaScript blocks."""
    CODE_BLOCK = 1                  # plain code outside comments and strings
    SINGLE_LINE_COMMENT = 2         # a single '//' comment
    SINGLE_LINE_COMMENT_BLOCK = 3   # several adjacent '//' comments, joined
    MULTI_LINE_COMMENT_BLOCK = 4    # a '/* ... */' comment
    STRING_SQ = 5                   # single-quoted string literal
    STRING_DQ = 6                   # double-quoted string literal
31 |
32 |
def is_string_literal_sq(state):
    """Return True iff *state* denotes a single quote string literal."""
    return state is JsBlockType.STRING_SQ
36 |
37 |
def is_string_literal_dq(state):
    """Return True iff *state* denotes a double quote string literal."""
    return state is JsBlockType.STRING_DQ
41 |
42 |
def is_string_literal(state):
    """Return True iff *state* denotes any kind of string literal."""
    return is_string_literal_dq(state) or is_string_literal_sq(state)
46 |
47 |
def is_code(state):
    """Return True iff *state* denotes plain code (not a string literal)."""
    return state is JsBlockType.CODE_BLOCK
51 |
52 |
def is_code_or_string_literal(state):
    """Return True iff *state* denotes code or a string literal."""
    return is_string_literal(state) or is_code(state)
56 |
57 |
def is_comment_multi_line(state):
    """Return True iff *state* denotes a multi line comment."""
    return state is JsBlockType.MULTI_LINE_COMMENT_BLOCK
61 |
62 |
def is_comment_single_line(state):
    """Return True iff *state* denotes a single line comment."""
    return state is JsBlockType.SINGLE_LINE_COMMENT
66 |
67 |
def is_comment_single_line_block(state):
    """Return True iff *state* denotes a joined single line comment block."""
    return state is JsBlockType.SINGLE_LINE_COMMENT_BLOCK
71 |
72 |
def is_comment(state):
    """Return True iff *state* denotes any kind of comment."""
    return (is_comment_single_line(state)
            or is_comment_single_line_block(state)
            or is_comment_multi_line(state))
77 |
78 |
def get_next_character(fileobj):
    """Yield the characters of a (text) file one at a time."""
    while True:
        char = fileobj.read(1)
        if not char:
            return
        yield char
85 |
86 |
class JsBlock:
    """Class representing JavaScript blocks.

    A block is a span of the input (code, string literal, or comment)
    delimited by (line, column) start/end positions.
    """

    def __init__(self, typ, start, end, content, string_literals=None):
        # typ: a JsBlockType value classifying this block.
        self.typ = typ
        # start, end: (line, column) tuples delimiting the block.
        self.start = start
        self.end = end
        # content: the raw text of the block.
        self.content = content
        # string_literals: ((line, col), text) pairs collected while scanning
        # code; None for comment blocks created without literals.
        self.string_literals = string_literals

    def is_code(self):
        """Check if block is a code block."""
        return not is_comment(self.typ)

    def is_comment(self):
        """Check if block is a comment."""
        return is_comment(self.typ)

    def is_comment_single_line(self):
        """Check if block is a single line comment."""
        return is_comment_single_line(self.typ)

    def is_comment_single_line_block(self):
        """Check if block is single line comment block."""
        return is_comment_single_line_block(self.typ)

    def is_comment_multi_line_block(self):
        """Check if block is a multi line comment."""
        return is_comment_multi_line(self.typ)

    def __str__(self):
        # Code blocks additionally report their string literal count.
        str_msg = ""
        if is_code(self.typ):
            str_msg = "** String Literals: " + str(len(
                self.string_literals)) + "\n"
        return (
            "***************************************************************\n"
            + "** Type: " + str(self.typ.name) + "\n" + "** Start: " + str(
                self.start) + "\n" + "** End: " + str(
                    self.end) + "\n" + str_msg + self.content.strip() + "\n" +
            "***************************************************************\n"
        )
129 |
130 |
def mince_js_fileobj(fileobj):
    """Mince JavaScript file object into code and comment blocks.

    Generator yielding JsBlock objects.  A simple character-level state
    machine tracks whether the scanner is inside code, a string literal,
    a '//' comment, or a '/* */' comment.  Blocks are only emitted on
    transitions between (code or string) and comment states; string
    literals are collected into the surrounding code block.
    """
    # Current position (1-based line/column) and scanner state.
    line = 1
    cpos = 1
    escaped = False
    content = ""
    block_start_line = line
    block_start_cpos = cpos
    state = JsBlockType.CODE_BLOCK
    string_literals = []
    current_string_literal = ""

    for char in get_next_character(fileobj):
        cpos += 1
        content += char
        suc_state = state
        if not escaped:
            if is_code_or_string_literal(state):
                if is_code(state):
                    if char == "'":
                        suc_state = JsBlockType.STRING_SQ
                    if char == '"':
                        suc_state = JsBlockType.STRING_DQ
                    if char == '/':
                        # One-character lookahead for '//' or '/*'.
                        # NOTE(review): next(get_next_character(fileobj))
                        # builds a throwaway generator but still advances the
                        # shared file object by one character.
                        try:
                            next_char = next(get_next_character(fileobj))
                            if next_char == '/':
                                suc_state = JsBlockType.SINGLE_LINE_COMMENT
                            elif next_char == '*':
                                suc_state = JsBlockType.MULTI_LINE_COMMENT_BLOCK
                            # The comment opener is moved out of the current
                            # block and becomes the start of the next one.
                            next_content = content[-1] + next_char
                            content = content[:-1]
                            cpos -= 1
                            char = next_char
                        except StopIteration:
                            pass
                elif is_string_literal_dq(state):
                    if char == '"':
                        suc_state = JsBlockType.CODE_BLOCK
                        string_literals.append(((line, cpos),
                                                current_string_literal))
                        current_string_literal = ""
                    else:
                        current_string_literal += char
                elif is_string_literal_sq(state):
                    if char == "'":
                        suc_state = JsBlockType.CODE_BLOCK
                        string_literals.append(((line, cpos),
                                                current_string_literal))
                        current_string_literal = ""
                    else:
                        current_string_literal += char
                else:
                    raise Exception("Unknown state")
            elif is_comment(state):
                if is_comment_single_line(state):
                    # '//' comments end at the newline.
                    if char == '\n':
                        suc_state = JsBlockType.CODE_BLOCK
                elif is_comment_multi_line(state):
                    if char == '*':
                        # Lookahead for the closing '*/' (same shared-file
                        # lookahead trick as above).
                        try:
                            next_char = next(get_next_character(fileobj))
                            if next_char == '/':
                                suc_state = JsBlockType.CODE_BLOCK
                            content = content + next_char
                            cpos += 1
                            char = next_char
                        except StopIteration:
                            pass

        # Emit a block exactly when crossing a comment <-> code/string
        # boundary.  `next_content` was set when the comment opener was seen,
        # so the opener chars seed the new block.
        if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
                is_code_or_string_literal(state) and is_comment(suc_state))):
            if content.strip():
                yield (JsBlock(state, (block_start_line, block_start_cpos),
                               (line, cpos), content, string_literals))
            if char == '\n':
                block_start_line = line + 1
                block_start_cpos = 1
            else:
                block_start_line = line
                block_start_cpos = cpos
            content = next_content
            next_content = ""
            string_literals = []

        if char == '\n':
            line += 1
            cpos = 1

        # A backslash escapes the following character (unless itself escaped).
        escaped = bool(char == '\\' and not escaped)
        state = suc_state

    # Flush the trailing block at EOF.
    if content.strip():
        yield (JsBlock(state, (block_start_line, block_start_cpos),
                       (line, cpos), content, string_literals))
226 |
227 |
def mince_js_fileobj_slc_blocks(fileobj):
    """Mince JavaScript file object into code and comment blocks (join subsequent
    single line comments)."""
    # NOTE(review): the inner mince_js_fileobj(fileobj) generator shares the
    # same underlying file object as the outer one, so it continues scanning
    # where the outer generator stopped (its line/column counters restart at
    # 1, though) -- confirm that this positional reset is intended.
    for block in mince_js_fileobj(fileobj):
        if block.typ == JsBlockType.SINGLE_LINE_COMMENT:
            start = block.start
            end = block.end
            content = block.content
            single_block = False
            # Collect directly following single line comments into one block.
            for suc in mince_js_fileobj(fileobj):
                if suc.typ == JsBlockType.SINGLE_LINE_COMMENT:
                    content += suc.content
                    end = suc.end
                    single_block = True
                else:
                    # A non-comment block ends the run: emit the joined
                    # comment block (or the lone original comment), then the
                    # interrupting block itself.
                    if single_block:
                        yield (JsBlock(JsBlockType.SINGLE_LINE_COMMENT_BLOCK,
                                       start, end, content))
                    else:
                        yield block
                    content = ""
                    yield suc
                    break
            # EOF reached while still collecting comments.
            if content.strip() != "":
                yield (JsBlock(JsBlockType.SINGLE_LINE_COMMENT_BLOCK, start,
                               end, content))
        else:
            yield block
256 |
257 |
def mince_js_file(file):
    """Mince JavaScript file into code and comment blocks."""
    with open(file, encoding="utf-8") as fileobj:
        yield from mince_js_fileobj(fileobj)
263 |
264 |
def mince_js_file_slc_blocks(file):
    """Mince JavaScript file into code and comment blocks (join subsequent single
    line comments)."""
    with open(file, encoding="utf-8") as fileobj:
        yield from mince_js_fileobj_slc_blocks(fileobj)
271 |
272 |
def mince_js(file, single_line_comments_block=False):
    """Mince JavaScript file (either file name or open file object) into code and
    comment blocks. Subsequent comment line blocks can be minced into separate
    entities or merged."""
    if isinstance(file, str):
        mincer = (mince_js_file_slc_blocks
                  if single_line_comments_block else mince_js_file)
    else:
        mincer = (mince_js_fileobj_slc_blocks
                  if single_line_comments_block else mince_js_fileobj)
    return mincer(file)
287 |
--------------------------------------------------------------------------------
/ExtensionCrawler/request_manager.py:
--------------------------------------------------------------------------------
1 | import time
2 | import random
3 | from contextlib import contextmanager
4 | from multiprocessing import Lock, BoundedSemaphore, Value
5 |
6 |
class RequestManager:
    """Throttle web requests shared between worker processes.

    Up to ``max_workers`` slots exist; a restricted request acquires all of
    them and therefore runs exclusively.  Both request kinds sleep until
    0.6-0.75s (with jitter) after the relevant previous request.
    """

    def __init__(self, max_workers):
        self.max_workers = max_workers
        self.lock = Lock()
        self.sem = BoundedSemaphore(max_workers)
        # Shared (cross-process) timestamps of the most recent requests.
        self.last_request = Value('d', 0.0)
        self.last_restricted_request = Value('d', 0.0)

    @contextmanager
    def normal_request(self):
        """Context manager guarding a rate-limited normal request."""
        # NOTE(review): the lock is held for the whole request body (the
        # `try` sits inside the `with`), which serializes normal requests
        # despite the semaphore -- confirm whether this is intended.
        with self.lock:
            self.sem.acquire()
            # Wait until 0.6s (+ jitter) after the last *restricted* request.
            time.sleep(max(0.0, self.last_restricted_request.value + 0.6 + (random.random() * 0.15) - time.time()))
            try:
                yield
            finally:
                self.last_request.value = time.time()
                self.sem.release()

    @contextmanager
    def restricted_request(self):
        """Context manager guarding an exclusive, rate-limited request."""
        with self.lock:
            # Take every slot so no normal request can run concurrently.
            for _ in range(self.max_workers):
                self.sem.acquire()
            # Wait until 0.6s (+ jitter) after the last request of any kind.
            time.sleep(max(0.0, self.last_request.value + 0.6 + (random.random() * 0.15) - time.time()))
            try:
                yield
            finally:
                self.last_request.value = time.time()
                self.last_restricted_request.value = time.time()
                for _ in range(self.max_workers):
                    self.sem.release()
43 |
--------------------------------------------------------------------------------
/ExtensionCrawler/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | """ Various utility methods."""
19 |
20 | import traceback
21 | import logging
22 | import sys
23 |
24 | from ExtensionCrawler.config import const_log_format
25 |
26 |
def value_of(value, default):
    """Return *value* unless it is None or the empty string, else *default*."""
    # Fix: the original used `value is not ""` -- an identity comparison
    # against a literal, which is a SyntaxWarning on Python >= 3.8 and
    # relies on string interning; use equality instead.
    if value is not None and value != "":
        return value
    return default
33 |
34 |
def log_debug(msg, indent_level=0):
    """Log *msg* at DEBUG level, indented four spaces per *indent_level*."""
    logging.debug(" " * (4 * indent_level) + str(msg))
37 |
38 |
def log_info(msg, indent_level=0):
    """Log *msg* at INFO level, indented four spaces per *indent_level*."""
    logging.info(" " * (4 * indent_level) + str(msg))
41 |
42 |
def log_warning(msg, indent_level=0):
    """Log *msg* at WARNING level, indented four spaces per *indent_level*."""
    logging.warning(" " * (4 * indent_level) + str(msg))
45 |
46 |
def log_error(msg, indent_level=0):
    """Log *msg* at ERROR level, indented four spaces per *indent_level*."""
    logging.error(" " * (4 * indent_level) + str(msg))
49 |
50 |
def log_exception(msg, indent_level=0):
    """Log *msg* at ERROR level, followed by the current traceback,
    all indented four spaces per *indent_level*."""
    indent = " " * (4 * indent_level)
    logging.error(indent + str(msg))
    for tb_line in traceback.format_exc().splitlines():
        logging.error(indent + tb_line)
55 |
56 |
def set_logger_tag(ext_id):
    """Re-format all root logger handlers so records are tagged with *ext_id*."""
    for handler in logging.getLogger().handlers:
        handler.setFormatter(logging.Formatter(const_log_format(ext_id)))
61 |
62 |
def setup_logger(verbose):
    """Configure the root logger to log to stdout (INFO if *verbose*,
    WARNING otherwise) with the default tag."""
    root = logging.getLogger()
    root.setLevel(logging.INFO if verbose else logging.WARNING)
    root.addHandler(logging.StreamHandler(sys.stdout))
    set_logger_tag("-" * 32)
75 |
--------------------------------------------------------------------------------
/PermissionAnalysis/grep-unused-permissions:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2019 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 |
20 | import argparse
21 | import io
22 | import logging
23 | import re
24 | import json
25 | import sys
26 | import csv
27 | from jsmin import jsmin
28 | import ast
29 |
30 | from zipfile import ZipFile
31 |
32 | from ExtensionCrawler.config import (const_log_format, const_basedir)
33 | from ExtensionCrawler.archive import iter_tar_entries_by_date
34 | from ExtensionCrawler.js_mincer import mince_js
35 |
36 |
def get_etag(headers_content):
    """Parse a stored headers dict literal and return its ETag (None if absent)."""
    headers = ast.literal_eval(headers_content)
    if "ETag" in headers:
        return headers["ETag"]
    return None
41 |
42 |
43 | def get_metadata(overview_contents):
44 | # Extract extension name
45 | match = re.search("""""",
46 | overview_contents)
47 | name = match.group(1) if match else None
48 |
49 | # Extract extension version
50 | match = re.search(
51 | """""", overview_contents)
52 | version = match.group(1) if match else None
53 |
54 | # Extracts extension categories
55 | match = re.search(
56 | """Attribute name="category">(.+?)""", overview_contents)
57 | categories = match.group(1).split(",") if match else []
58 |
59 | # Extracts the number of downloads
60 | match = re.search(
61 | """ 0:
103 | has_crx_file = True
104 | with ZipFile(tarfile) as zf:
105 | for zipentry in zf.infolist():
106 | if zipentry.filename.endswith(".js") or zipentry.filename.endswith(".html"):
107 | with zf.open(zipentry) as f:
108 | verbatim_lines = []
109 | for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
110 | verbatim_lines += block.content.splitlines()
111 |
112 | for permission, evidences in permission_map.items():
113 | for evidence in evidences:
114 | for line in verbatim_lines:
115 | if evidence in line:
116 | date_matches[permission] = True
117 | break
118 |
119 | if zipentry.filename == "manifest.json":
120 | with zf.open(zipentry) as m:
121 | raw_content = m.read()
122 | # There are some manifests that seem to have weird encodings...
123 | try:
124 | content = raw_content.decode("utf-8-sig")
125 | except UnicodeDecodeError:
126 | # Trying a different encoding, manifests are weird...
127 | content = raw_content.decode("latin1")
128 |
129 | manifest = json.loads(jsmin(content), strict=False)
130 | if "permissions" in manifest:
131 | for permission in manifest["permissions"]:
132 | used_permissions.add(str(permission))
133 |
134 | if has_crx_file:
135 | line = [date, crx_etag, name, version, "+".join(categories), downloads]
136 | for permission in sorted(list(permission_map.keys())):
137 | if permission in used_permissions:
138 | if date_matches[permission]:
139 | line += ["REQ_AND_FOUND"]
140 | else:
141 | line += ["REQ_AND_NOT_FOUND"]
142 | else:
143 | if date_matches[permission]:
144 | line += ["NOT_REQ_AND_FOUND"]
145 | else:
146 | line += ["NOT_REQ_AND_NOT_FOUND"]
147 | results += [line]
148 |
149 | for result in results:
150 | csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
151 |
152 |
def main(conf):
    """Search all extensions listed in conf.EXTID_FILE for unused permissions
    and write one CSV row per analyzed crx to stdout."""
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stderr)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.MAP_FILE) as f:
        permission_map = json.load(f)

    with open(conf.EXTID_FILE) as f:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "categories", "downloads"]
                           + sorted(list(permission_map.keys())))
        # Stream the id file line by line instead of materializing it with
        # readlines(); also drop the unused exception binding.
        for line in f:
            extid = line.strip()
            try:
                handle_extid(conf, extid, permission_map, csvwriter)
            except Exception:
                logging.exception(f"Fatal error when handling extension '{extid}'")
175 |
176 |
def build_parser():
    """Construct the command line parser for the unused-permission search."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Search extensions for unused permissions')

    # Positional arguments.
    parser.add_argument(
        'MAP_FILE',
        help='json file with permission - literal string mapping')
    parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')

    # Options.
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    parser.add_argument(
        '-D', '--latest-date',
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n' +
        'Together with --from-date, specifies all crx released in specified\n' +
        'date range.')
    parser.add_argument(
        '-d', '--from-date',
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n' +
        'Together with --latest-date, specifies all crx released in specified\n' +
        'date range.')
    parser.add_argument(
        '-a', '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')

    return parser
222 |
223 |
if __name__ == "__main__":
    # Parse command line arguments and run the search.
    sys.exit(main(build_parser().parse_args()))
230 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ExtensionCrawler
2 |
A collection of utilities for downloading and analyzing browser
extensions from the Chrome Web Store.
5 |
6 | * `crawler`: A crawler for extensions from the Chrome Web Store.
7 | * `crx-tool`: A tool for analyzing and extracting `*.crx` files
8 | (i.e., Chrome extensions). Calling `crx-tool.py .crx`
9 | will check the integrity of the extension.
10 | * `crx-extract`: A simple tool for extracting `*.crx` files from the
11 | tar-based archive hierarchy.
12 | * `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
13 | JavaScript decomposition analysis.
14 | * `crx-jsstrings`: A tool for extracting code blocks, comment blocks, and
15 | string literals from JavaScript.
16 | * `create-db`: A tool for updating a remote MariaDB from already
17 | existing extension archives.
18 |
19 | The utilities store the extensions in the following directory
20 | hierarchy:
21 |
22 | ```shell
23 | archive
24 | ├── conf
25 | │ └── forums.conf
26 | ├── data
27 | │ └── ...
28 | └── log
29 | └── ...
30 | ```
31 |
The crawler downloads the most recent extension (i.e., the `*.crx`
file) as well as the overview page. In addition, the `conf` directory
34 | may contain one file, called `forums.conf` that lists the ids of
35 | extensions for which the forums and support pages should be downloaded
36 | as well. The `data` directory will contain the downloaded extensions.
37 |
38 | The `crawler` and `create-db` scripts will access and update a MariaDB.
They will use the host, database, and credentials found in `~/.my.cnf`.
40 | Since they make use of various JSON features, it is recommended to use at
41 | least version 10.2.8 of MariaDB.
42 |
43 | All utilities are written in Python 3.7. The required modules are listed
44 | in the file `requirements.txt`.
45 |
46 | ## Installation
47 |
48 | Clone and use pip3 to install as a package.
49 |
50 | ```shell
51 | git clone git@logicalhacking.com:BrowserSecurity/ExtensionCrawler.git
52 | pip3 install --user -e ExtensionCrawler
53 | ```
54 |
55 | ## Team
56 |
57 | * [Achim D. Brucker](http://www.brucker.ch/)
58 | * [Michael Herzberg](http://www.dcs.shef.ac.uk/cgi-bin/makeperson?M.Herzberg)
59 |
60 | ### Contributors
61 |
62 | * Mehmet Balande
63 |
64 | ## License
65 |
66 | This project is licensed under the GPL 3.0 (or any later version).
67 |
68 | SPDX-License-Identifier: GPL-3.0-or-later
69 |
70 | ## Master Repository
71 |
72 | The master git repository for this project is hosted by the [Software
73 | Assurance & Security Research Team](https://logicalhacking.com) at
74 | .
75 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/angular.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | from MySQLdb import cursors
3 | import os
4 | from distutils.version import LooseVersion
5 | from itertools import groupby, islice
6 | import datetime
7 | import pickle
8 |
def execute(q, args=None):
    """Execute SQL query *q* with optional parameters *args*, yielding rows.

    Result rows are memoized in a local pickle file so repeated runs of the
    script skip the database round trip.
    """
    cachepath = "mysqlcache.tmp"
    cache = {}
    if os.path.exists(cachepath):
        with open(cachepath, 'rb') as f:
            try:
                cache = pickle.load(f)
            except Exception as e:
                # Best effort: a corrupt cache file just means a fresh query.
                print(e)

    # Fix: key the cache on the query AND its parameters; keying on the
    # query text alone silently returned the wrong rows whenever the same
    # query was executed with different arguments.
    key = (q, repr(args))
    if key in cache:
        print("retrieving query results from cache...")
        for row in cache[key]:
            yield row
    else:
        print("query not in cache, contacting db ...")
        db = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), cursorclass=cursors.SSCursor)
        cursor = db.cursor()
        cursor.execute(q, args)

        # Stream rows to the caller while collecting them for the cache.
        result = []
        for row in cursor:
            result += [row]
            yield row
        cache[key] = result
        with open(cachepath, 'wb') as f:
            pickle.dump(cache, f)
        print("cache saved")
37 |
# Map each angular release to the set of normalized md5s of its main files.
vuln_md5s = {}

for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='angular.js' and (filename in ('angular.js', 'angular.min.js'))"):
    vuln_md5s.setdefault(version, set()).add(md5)

# (version, md5 set) pairs ordered newest release first.
sorted_vuln_md5s = [(library_version, vuln_md5s[library_version])
                    for library_version in sorted(vuln_md5s.keys(), key=LooseVersion)[::-1]]
48 |
49 |
def get_angular_version(md5):
    """Return the newest known angular version whose md5 set contains *md5*,
    or None if the digest is unknown."""
    for library_version, md5_set in sorted_vuln_md5s:
        if md5 in md5_set:
            return library_version
    return None
54 |
# Walk all extension updates grouped first by extension id, then by crx
# archive, and print extensions whose detected angular version changed
# between updates.
# NOTE(review): the inner loop rebinds `g`, shadowing the outer group
# iterator -- this works only because the outer group is fully consumed
# by the inner groupby before the next outer iteration.
for extid, g in groupby(execute("select extid, crx_etag, date, md5 from extension_update_most_recent join crxfile using (crx_etag) where typ='NORMALIZED' order by extid, date, crx_etag"), lambda x: x[0]):
    result = {}

    for crx_etag, g in groupby(map(lambda x: x[1:], g), lambda x: x[0]):
        result_version = None
        # Keep the highest angular version detected in any file of this crx.
        for date, md5, in map(lambda x: x[1:], g):
            version = get_angular_version(md5)
            if version is not None and (result_version is None or LooseVersion(version) > LooseVersion(result_version)):
                result_version = version
        # NOTE(review): `date` leaks from the inner loop, so each crx is
        # recorded under the date of its *last* row -- confirm intended.
        result[date] = result_version

    # Report only extensions whose detected version is not constant.
    if len(set(result.values())) > 1:
        for date in sorted(result.keys()):
            print(f"{extid}|{date}|{result[date]}")
69 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/angularversions.txt:
--------------------------------------------------------------------------------
1 | 1.7.5,2018-10-04 14:59:37 +0100
2 | 1.7.4,2018-09-07 09:57:37 +0100
3 | 1.7.3,2018-08-03 13:35:40 +0200
4 | 1.7.2,2018-06-12 16:34:38 +0300
5 | 1.7.1,2018-06-08 16:26:22 +0300
6 | 1.7.0,2018-05-11 10:31:53 +0200
7 | 1.7.0-rc.0,2018-04-19 10:07:41 +0200
8 | 1.6.10,2018-04-17 18:35:33 +0200
9 | 1.6.9,2018-02-02 11:19:32 +0100
10 | 1.6.8,2017-12-18 15:17:56 +0100
11 | 1.6.7,2017-11-24 18:44:04 +0100
12 | 1.6.6,2017-08-18 15:12:44 +0200
13 | 1.6.5,2017-07-03 22:34:52 +0300
14 | 1.6.4,2017-03-31 10:48:25 +0200
15 | 1.6.3,2017-03-08 12:44:24 +0100
16 | 1.6.2,2017-02-05 17:58:25 +0200
17 | 1.6.1,2016-12-23 10:38:58 +0000
18 | 1.6.0,2016-12-08 11:07:52 +0000
19 | 1.6.0-rc.2,2016-11-24 21:30:56 +0000
20 | 1.6.0-rc.1,2016-11-21 13:27:47 +0000
21 | 1.6.0-rc.0,2016-10-27 20:28:09 +0100
22 | 1.5.11,2017-01-12 11:22:40 +0200
23 | 1.5.10,2016-12-16 12:27:04 +0200
24 | 1.5.9,2016-11-24 09:27:57 +0000
25 | 1.5.8,2016-07-22 16:01:46 +0100
26 | 1.5.7,2016-06-14 08:08:25 -0700
27 | 1.5.6,2016-05-25 17:00:13 +0100
28 | 1.5.5,2016-04-15 14:09:39 +0100
29 | 1.5.4,2016-04-14 09:13:48 +0100
30 | 1.5.3,2016-03-25 20:01:45 +0000
31 | 1.5.2,2016-03-18 15:37:43 -0700
32 | 1.5.1,2016-03-14 14:45:29 +0000
33 | 1.5.0,2016-02-05 10:04:17 +0000
34 | 1.5.0-rc.2,2016-01-28 09:51:01 +0000
35 | 1.5.0-rc.1,2016-01-15 20:31:08 +0000
36 | 1.5.0-rc.0,2015-12-09 13:50:58 +0000
37 | 1.5.0-beta.2,2015-11-17 15:57:27 -0800
38 | 1.5.0-beta.1,2015-09-29 13:59:34 -0700
39 | 1.5.0-beta.0,2015-09-17 13:42:10 +0100
40 | 1.4.14,2016-10-11 14:11:08 +0100
41 | 1.4.13,2016-10-10 22:02:52 +0100
42 | 1.4.12,2016-06-07 10:44:56 +0200
43 | 1.4.11,2016-05-24 16:44:11 +0200
44 | 1.4.10,2016-03-14 17:27:49 -0400
45 | 1.4.9,2016-01-20 10:11:04 -0800
46 | 1.4.8,2015-11-19 14:52:56 -0800
47 | 1.4.7,2015-09-29 13:54:51 -0700
48 | 1.4.6,2015-09-14 22:43:55 +0200
49 | 1.4.5,2015-08-28 12:06:35 -0700
50 | 1.4.4,2015-08-13 11:15:10 -0700
51 | 1.4.3,2015-07-14 18:26:10 -0700
52 | 1.4.2,2015-07-02 14:36:49 +0300
53 | 1.4.1,2015-06-15 20:50:59 +0200
54 | 1.4.0,2015-05-26 17:34:50 -0700
55 | 1.4.0-rc.2,2015-05-07 14:33:28 -0700
56 | 1.4.0-rc.1,2015-04-24 11:26:10 -0700
57 | 1.4.0-rc.0,2015-04-10 10:44:35 -0700
58 | 1.4.0-beta.6,2015-03-15 21:00:39 +0000
59 | 1.4.0-beta.5,2015-02-24 17:22:13 +0000
60 | 1.4.0-beta.4,2015-02-07 10:26:21 +0000
61 | 1.4.0-beta.3,2015-02-03 19:46:22 +0100
62 | 1.4.0-beta.2,2015-01-26 14:50:48 -0800
63 | 1.4.0-beta.1,2015-01-20 19:42:59 +0100
64 | 1.4.0-beta.0,2015-01-14 20:44:32 +0000
65 | 1.2.32,2016-10-11 13:48:38 +0100
66 | 1.2.31,2016-10-11 07:48:26 +0100
67 | 1.2.30,2016-07-20 23:17:37 +0300
68 | 1.2.29,2015-09-29 13:18:52 -0700
69 | 1.2.28,2014-12-13 21:28:02 -0500
70 | 1.2.27,2014-11-20 14:34:26 -0800
71 | 1.2.26,2014-10-02 09:46:40 -0700
72 | 1.2.25,2014-09-16 15:05:22 -0700
73 | 1.2.24,2014-09-09 16:21:16 -0700
74 | 1.2.23,2014-08-22 15:56:49 -0700
75 | 1.2.22,2014-08-11 17:04:40 +0100
76 | 1.2.21,2014-07-25 09:01:43 -0700
77 | 1.2.20,2014-07-11 11:26:39 -0700
78 | 1.2.19,2014-06-30 16:58:15 -0700
79 | 1.2.18,2014-06-13 13:55:33 -0700
80 | 1.2.17,2014-06-06 20:13:16 +0100
81 | 1.2.16,2014-04-03 14:42:19 -0700
82 | 1.2.15,2014-03-21 14:58:48 -0700
83 | 1.3.20,2015-09-29 13:54:03 -0700
84 | 1.3.19,2015-09-15 13:34:09 +0100
85 | 1.3.18,2015-08-18 15:14:56 -0700
86 | 1.3.17,2015-07-01 12:16:14 -0700
87 | 1.3.16,2015-06-05 13:29:27 -0700
88 | 1.3.15,2015-03-15 21:01:49 +0000
89 | 1.3.14,2015-02-24 17:22:45 +0000
90 | 1.3.13,2015-02-07 19:21:53 +0100
91 | 1.3.12,2015-02-02 14:03:17 +0000
92 | 1.3.11,2015-01-26 14:20:52 -0800
93 | 1.3.10,2015-01-20 19:31:56 +0100
94 | 1.3.9,2015-01-13 14:29:29 -0500
95 | 1.3.8,2014-12-19 13:22:00 -0800
96 | 1.3.7,2014-12-15 13:46:21 +0000
97 | 1.3.6,2014-12-08 16:29:39 -0500
98 | 1.3.5,2014-12-01 19:54:14 +0100
99 | 1.3.4,2014-11-25 00:05:18 +0100
100 | 1.3.3,2014-11-17 09:32:21 -0800
101 | 1.3.2,2014-11-07 13:22:01 -0500
102 | 1.3.1,2014-10-31 12:28:58 -0400
103 | 1.3.0,2014-10-13 15:27:20 -0700
104 | 1.3.0-rc.5,2014-10-08 15:51:30 -0700
105 | 1.3.0-rc.4,2014-10-01 17:37:40 -0700
106 | 1.3.0-rc.3,2014-09-23 18:47:24 -0700
107 | 1.3.0-rc.2,2014-09-16 14:52:25 -0700
108 | 1.3.0-rc.1,2014-09-09 15:45:51 -0700
109 | 1.3.0-rc.0,2014-08-29 21:22:46 -0400
110 | 1.3.0-beta.19,2014-08-22 15:57:26 -0700
111 | 1.3.0-beta.18,2014-08-11 16:54:40 +0100
112 | 1.3.0-beta.17,2014-07-25 16:37:53 +0100
113 | 1.3.0-beta.16,2014-07-18 12:18:26 -0700
114 | 1.3.0-beta.15,2014-07-11 11:15:42 -0700
115 | 1.3.0-beta.14,2014-06-30 09:52:32 -0700
116 | 1.3.0-beta.13,2014-06-16 10:47:09 -0700
117 | 1.3.0-beta.12,2014-06-13 13:41:18 -0700
118 | 1.3.0-beta.11,2014-06-06 20:22:50 +0100
119 | 1.3.0-beta.10,2014-05-23 15:08:36 -0700
120 | 1.3.0-beta.9,2014-05-16 15:14:12 -0700
121 | 1.3.0-beta.8,2014-05-09 14:42:26 +0100
122 | 1.3.0-beta.7,2014-04-25 15:00:17 -0700
123 | 1.3.0-beta.6,2014-04-21 15:57:08 -0700
124 | 1.3.0-beta.5,2014-04-03 14:46:15 -0700
125 | 1.3.0-beta.4,2014-03-28 17:43:17 -0400
126 | 1.3.0-beta.3,2014-03-21 11:16:35 -0700
127 | 1.3.0-beta.2,2014-03-14 16:26:40 -0700
128 | 1.3.0-beta.1,2014-03-07 16:23:14 -0800
129 | 1.2.14,2014-03-01 09:51:19 -0800
130 | 1.2.13,2014-02-14 16:41:02 -0800
131 | 1.2.12,2014-02-07 17:00:28 -0500
132 | 1.2.11,2014-02-03 09:40:03 -0800
133 | 1.2.10,2014-01-24 15:28:28 -0800
134 | 1.2.9,2014-01-15 10:02:10 -0800
135 | 1.2.8,2014-01-10 12:37:49 -0800
136 | 1.2.7,2014-01-03 10:28:30 -0800
137 | 1.2.6,2013-12-19 15:50:07 -0800
138 | 1.2.5,2013-12-13 10:52:13 -0800
139 | 1.2.4,2013-12-06 13:14:56 -0500
140 | 1.2.3,2013-11-27 10:04:59 +0000
141 | 1.2.2,2013-11-22 09:05:42 -0800
142 | 1.2.1,2013-11-14 22:33:20 -0800
143 | 1.2.0,2013-11-08 09:40:09 -0800
144 | 1.2.0-rc.3,2013-10-14 10:36:23 -0700
145 | 1.2.0-rc.2,2013-09-04 14:50:39 +0200
146 | 1.2.0rc1,2013-08-13 11:50:32 -0700
147 | 1.1.5,2013-05-22 01:05:11 -0700
148 | 1.1.4,2013-04-03 18:54:52 -0700
149 | 1.1.3,2013-02-20 12:54:44 -0800
150 | 1.1.2,2013-01-23 10:54:35 -0800
151 | 1.1.1,2012-11-27 01:45:35 +0100
152 | 1.1.0,2012-09-04 11:11:09 -0700
153 | 1.0.8,2013-08-22 11:20:23 -0700
154 | 1.0.7,2013-05-22 01:05:53 -0700
155 | 1.0.6,2013-04-04 10:48:05 -0700
156 | 1.0.5,2013-02-20 12:58:02 -0800
157 | 1.0.4,2013-01-23 10:57:51 -0800
158 | 1.0.3,2012-11-27 01:44:46 +0100
159 | 1.0.2,2012-09-04 11:08:40 -0700
160 | 1.0.1,2012-06-25 09:30:57 -0700
161 | 1.0.0,2012-06-14 10:50:22 -0700
162 | 1.0.0rc12,2012-06-12 01:46:02 -0700
163 | 1.0.0rc11,2012-06-11 00:03:01 -0700
164 | 1.0.0rc10,2012-05-23 21:05:21 -0700
165 | 1.0.0rc9,2012-05-14 22:13:15 -0700
166 | 1.0.0rc8,2012-05-07 00:09:20 -0700
167 | 1.0.0rc7,2012-04-30 16:32:45 -0700
168 | 1.0.0rc6,2012-04-20 15:06:39 -0700
169 | 1.0.0rc5,2012-04-12 03:56:28 -0700
170 | 1.0.0rc4,2012-04-05 11:46:36 -0700
171 | 1.0.0rc3,2012-03-29 16:10:40 -0700
172 | 1.0.0rc2,2012-03-20 15:38:57 -0700
173 | g3-v1.0.0rc1,2012-03-16 12:06:29 -0700
174 | 1.0.0rc1,2012-03-14 01:00:46 -0700
175 | 0.10.6,2012-01-17 13:54:18 -0800
176 | 0.10.5,2011-11-08 04:29:07 -0800
177 | 0.10.4,2011-10-22 21:39:39 -0700
178 | 0.10.3,2011-10-14 08:31:39 -0700
179 | 0.10.2,2011-10-08 09:18:19 -0700
180 | 0.10.1,2011-09-09 01:01:46 -0700
181 | 0.10.0,2011-09-02 11:32:29 -0700
182 | 0.9.19,2011-08-21 01:12:34 -0700
183 | 0.9.18,2011-07-29 16:30:24 -0700
184 | 0.9.17,2011-06-30 09:10:59 -0700
185 | 0.9.16,2011-06-07 16:11:01 -0700
186 | 0.9.15,2011-04-11 14:23:26 -0700
187 | 0.9.14,2011-04-01 12:26:04 -0700
188 | 0.9.13,2011-03-13 22:48:26 -0700
189 | 0.9.12,2011-03-03 23:14:43 -0800
190 | 0.9.11,2011-02-08 17:47:31 -0800
191 | 0.9.10,2011-01-26 23:51:06 -0800
192 | 0.9.9,2011-01-13 22:08:27 -0800
193 | 0.9.7,2010-12-10 17:08:52 -0800
194 | 0.9.6,2010-12-06 21:11:10 -0800
195 | 0.9.5,2010-11-25 10:11:26 -0800
196 | 0.9.4,2010-11-18 22:40:01 -0800
197 | 0.9.3,2010-11-10 22:15:16 -0800
198 | 0.9.2,2010-11-03 13:06:45 -0700
199 | 0.9.1,2010-10-26 22:18:25 -0700
200 | 0.9.0,2010-10-20 15:51:36 -0700
201 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/ideas.txt:
--------------------------------------------------------------------------------
1 | start with current version & never update
2 | start with outdated version & never update
3 | update frequently
4 | downgrade
5 |
6 |
angular is a transitive dependency (pulled in by another library)
angular is a direct (own) dependency of the extension
9 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/plotting.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 | from dateutil import parser
4 | from distutils.version import LooseVersion
5 |
6 |
7 | import numpy as np
8 | from matplotlib import pyplot as plt
9 | import matplotlib.patches as mpatches
10 |
def get_cmap(n, name='hsv'):
    """Return a callable mapping each index in 0..n-1 to a distinct RGB color.

    The keyword argument `name` must be the name of a standard matplotlib
    colormap.
    """
    cmap = plt.cm.get_cmap(name, n)
    return cmap
15 |
16 |
plt.figure(figsize=(20, 100))

# Parse the per-extension version history (CSV: extid,timestamp,version).
# Only the first 5000 lines are read to keep the plot manageable.
data = {}
with open(sys.argv[1]) as f:
    for line in f.readlines()[0:5000]:
        line = line.strip()
        extid, ts, vers = line.split(",")
        if extid not in data:
            data[extid] = {}
        data[extid][parser.parse(ts).date()] = vers

# Observation window of the crawl; x-axis is days since `startdate`.
startdate = datetime.date(year=2017, month=2, day=1)
enddate = datetime.date(year=2018, month=12, day=13)
NOT_IN_STORE = "NO DATA"



# Convert each extension's {date: version} map into a sorted list of
# (days-since-startdate, version) tuples, starting with a NO DATA marker.
converted_data = {}
versions = set()
for extid, tups in data.items():
    days_version_tups = [(0, NOT_IN_STORE)]
    for ts, vers in sorted(tups.items()):
        if vers != "None":
            versions.add(vers)
        #if vers != days_version_tups[-1][1]:
        days_version_tups += [((ts - startdate).days, vers)]
    converted_data[extid] = days_version_tups

# Add a synthetic row showing the official Angular release dates
# (second CLI argument: CSV of version,release-timestamp).
converted_data["angular_updates"] = [(0, NOT_IN_STORE)]
version_release = {}
with open(sys.argv[2]) as f:
    for line in f.readlines():
        line = line.strip()
        vers, ts_str = line.split(",")
        ts = parser.parse(ts_str).date()
        version_release[vers] = ts
        if startdate < ts and ts < enddate:
            converted_data["angular_updates"] += [((ts - startdate).days, vers)]

converted_data["angular_updates"].sort()


# Assign each observed version a color on the jet colormap, ordered by
# release date.  NOTE(review): versions absent from the release file make
# version_release.get return None, which breaks sorted() on Python 3, and
# the division fails if only one version was observed -- confirm inputs.
colors = {}
for i, version in enumerate(sorted(versions, key=version_release.get)):
    #colors[version] = get_cmap(len(versions))(i)
    colors[version] = plt.cm.jet(1. * i / ((len(versions)) - 1))
for version, color in colors.items():
    print(f"{version}: {color}")

bottoms = np.arange(len(converted_data))

# Order the rows so extensions with similar histories sit together.
sorted_data = sorted(list(converted_data.items()), key=lambda x: min(map(lambda y: y[1], x[1])))

# Draw one horizontal bar segment per (interval, version) per extension.
# The last segment of each row extends to the end of the window.
for i in range(len(converted_data.items())):
    extid, tups = sorted_data[i]
    for j in range(len(tups)):
        days, vers = tups[j]
        if j + 1 == len(tups):
            next_days = (enddate - startdate).days
        else:
            next_days = tups[j + 1][0]
        print(f"{extid}: {days}")
        #print(f"{vers} and {colors[vers]}")
        # Versions without a color (e.g. NO DATA) are drawn white.
        color = "w"
        if vers in colors:
            color = colors[vers]
        plt.bar(days, 0.8, width=next_days - days, bottom=bottoms[i],
                color=color, orientation="horizontal", label=vers, linewidth=1, edgecolor="black")
plt.yticks(bottoms, map(lambda x: x[0], sorted(list(converted_data.items()), key=lambda x: min(map(lambda y: y[1], x[1])))))

# One legend patch per version, sorted by version number.
patchList = []
for version, color in sorted(colors.items(), key=lambda x: LooseVersion(x[0])):
    data_key = mpatches.Patch(color=color, label=version)
    patchList.append(data_key)

plt.legend(handles=patchList, loc="best", bbox_to_anchor=(1.0, 1.00))


plt.subplots_adjust(right=0.85)
plt.savefig("out.pdf")
97 |
--------------------------------------------------------------------------------
/analysis/library-detector/jquery.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | from MySQLdb import cursors
3 | import os
4 | from distutils.version import LooseVersion
5 | from itertools import groupby, islice
6 | import datetime
7 | import pickle
8 |
def execute(q, args=None):
    """Run SQL query `q` (with optional parameters `args`) and yield rows.

    Results are memoized on disk in `mysqlcache.tmp`, a pickled
    {query: [rows]} dict, so repeated analysis runs do not hit the
    database again.  On a cache hit, rows come straight from the file;
    on a miss, the query runs against MySQL (credentials from
    ~/.my.cnf) and the full result list is appended to the cache.
    """
    cachepath = "mysqlcache.tmp"
    cache = {}
    if os.path.exists(cachepath):
        with open(cachepath, 'rb') as f:
            try:
                cache = pickle.load(f)
            except Exception as e:
                # Best effort: a corrupt or unreadable cache is ignored.
                print(e)

    if q in cache:
        print("retrieving query results from cache...")
        for row in cache[q]:
            yield row
    else:
        print("query not in cache, contacting db ...")
        db = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), cursorclass=cursors.SSCursor)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(q, args)
                result = []
                for row in cursor:
                    result += [row]
                    yield row
            finally:
                cursor.close()
        finally:
            # Bug fix: the connection and server-side cursor were never
            # closed before, leaking the connection even when the consumer
            # abandoned the generator early.
            db.close()
        cache[q] = result
        with open(cachepath, 'wb') as f:
            pickle.dump(cache, f)
        print("cache saved")
37 |
# MD5s of normalized angular.js files with versions known vulnerable
# (everything before 1.6.9).  The commented-out block is the equivalent
# jquery query (< 1.6.3); despite the file name, the active analysis
# targets angular.js.
vuln_md5s = set()

# for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='jquery'"):
# if LooseVersion(version) < LooseVersion('1.6.3'):
# vuln_md5s.add(md5)
for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='angular.js'"):
    if LooseVersion(version) < LooseVersion('1.6.9'):
        vuln_md5s.add(md5)
print(f"found {len(vuln_md5s)} MD5s")

# Walk each extension's update history and report (a) extensions that
# shipped a vulnerable file and later updated to a clean crx, and
# (b) extensions still vulnerable near the end of the crawl.
hits = 0
still_vuln = 0
for extid, g in groupby(execute("select extid, crx_etag, date, md5 from extension_update_most_recent join crxfile using (crx_etag) where typ='NORMALIZED' order by extid, date, crx_etag"), lambda x: x[0]):
    ext_is_vuln = False
    # NOTE(review): `g` is rebound here; safe because each inner group is
    # consumed before the outer groupby advances.
    for crx_etag, g in groupby(map(lambda x: x[1:], g), lambda x: x[0]):
        is_vuln = False
        for date, md5, in map(lambda x: x[1:], g):
            if md5 in vuln_md5s:
                is_vuln = True
                break

        # A previously-vulnerable extension whose current crx contains no
        # vulnerable file counts as "fixed".
        if not is_vuln and ext_is_vuln:
            print(f"{extid} got fixed in {crx_etag} on {date}!")
            hits += 1
        ext_is_vuln = is_vuln
        # Still vulnerable if a vulnerable crx was seen after 2018-11-14.
        if is_vuln and date > datetime.datetime(year=2018, month=11, day=14):
            print(f"{extid} in {crx_etag} is still vulnerable as of {date}")
            still_vuln += 1

print(f"# fixes: {hits}")
print(f"# still vulnerable: {still_vuln}")
69 |
70 |
--------------------------------------------------------------------------------
/cdnjs-git-miner:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """ Tool for mining the cdnjs git repository"""
20 |
21 | import getopt
22 | import logging
23 | import sys
24 | import os
25 |
26 | from ExtensionCrawler.config import (const_log_format, const_basedir)
27 | from ExtensionCrawler.cdnjs_git import (pull_and_update_db, update_db_all_libs,
28 | update_db_from_listfile)
29 |
30 |
def helpmsg():
    """Print the command line usage of cdnjs-git-miner."""
    print("cdnjs-git-miner [OPTION]")
    print(
        " -i initialize/update database with all libraries in the repository"
    )
    print(" -u update: pull repository and update database")
    # Typo fix: "recusively" -> "recursively".
    print(
        " -l read list of libraries to update from file (recursively)"
    )
    print(" -n process chunk n where n in [1,N]")
    print(" -N ")
    print(" -v verbose")
    print(
        " -c print csv format to stdout instead of writing to database"
    )
    print(" -a= archive directory")
    print(" -h print this help text")
49 |
50 |
def main(argv):
    """Entry point: parse options, configure logging, and run the requested
    cdnjs mining actions (full initialization, pull-and-update, and/or
    list-file driven update)."""
    basedir = const_basedir()
    verbose = False
    initialize = False
    update = False
    taskid = 1
    listfile = None
    maxtaskid = 1
    csv = False

    try:
        opts, _ = getopt.getopt(argv, "hvicl:ua:p:n:N:", [
            "archive=", "listupdate=", "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-v':
            verbose = True
        elif opt == '-i':
            initialize = True
        elif opt == '-u':
            update = True
        elif opt == '-c':
            csv = True
        elif opt in ("-l", "--listupdate"):
            listfile = arg
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # Root logger writes to stdout using the project-wide log format.
    loglevel = logging.INFO if verbose else logging.WARNING
    root_logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(const_log_format()))
    root_logger.addHandler(handler)
    root_logger.setLevel(loglevel)

    cdnjs_git_path = os.path.join(basedir, "filedb", "cdnjs-git")

    if initialize:
        logging.info("Starting update of all db libs")
        update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid)
        logging.info("Finished update of all db libs")
    if update:
        logging.info("Starting update of new db libs")
        pull_and_update_db(cdnjs_git_path, csv)
        logging.info("Finished update of new db libs")
    if listfile is not None:
        logging.info("Starting update from list file")
        update_db_from_listfile(cdnjs_git_path, listfile, csv)
        logging.info("Finished update from list file")

    logging.info("Successfully updated cdnjs table")


if __name__ == "__main__":
    main(sys.argv[1:])
121 |
--------------------------------------------------------------------------------
/crawler:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016-2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """
20 | A crawler for extensions from the Chrome Web Store.
21 | """
22 |
23 | import sys
24 | import datetime
25 | import time
26 | import getopt
27 | import logging
28 | import itertools
29 | import multiprocessing
30 | from functools import reduce
31 | from ExtensionCrawler.discover import get_new_ids
32 | from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions
33 | from ExtensionCrawler.config import *
34 | from ExtensionCrawler.util import log_info, log_exception, setup_logger
35 |
36 |
def write_log(dirname, fname, text):
    """Write *text* to the file *fname* inside directory *dirname*.

    The directory is created if missing; colons in the file name are
    replaced by underscores so the name stays portable.
    """
    os.makedirs(dirname, exist_ok=True)
    safe_name = fname.replace(":", "_")
    with open(os.path.join(dirname, safe_name), 'w') as out:
        out.write(text)
43 |
44 |
def log_failures_to_file(dirname, today, res):
    """Log failures during download/update in the log directory dirname.

    Writes one log file per result category (named <today><suffix>),
    each containing the sorted extension ids matching that category,
    one id per line.
    """
    # (log-file suffix, predicate selecting the extensions to list).
    # Refactored from ten copy-pasted write_log pairs into one table.
    categories = [
        ("-not-authorized.log", lambda x: x.not_authorized()),
        ("-updated.log", lambda x: x.is_ok() and not x.not_modified()),
        ("-raised-exception.log", lambda x: x.has_exception()),
        ("-raised-ddos.log", lambda x: x.raised_google_ddos()),
        ("-not-in-store.log", lambda x: x.not_in_store()),
        ("-new-in-store.log", lambda x: x.is_new()),
        ("-file-corruption.log", lambda x: x.corrupt_tar()),
        ("-sql-exception.log", lambda x: x.sql_exception()),
        # worker_exception is an attribute (not a method); kept truthy-tested
        # exactly as before.
        ("-worker-exception.log", lambda x: x.worker_exception),
        ("-sql-not-updated.log", lambda x: not x.sql_success()),
    ]
    for suffix, pred in categories:
        ids = "\n".join(sorted([x.ext_id for x in res if pred(x)]))
        write_log(dirname, today + suffix, ids)
76 |
77 |
def log_summary(res, runtime=0):
    """Log brief result summary.

    :param res: list of per-extension update results (each offering the
        is_ok/not_modified/... predicates used below)
    :param runtime: total runtime in seconds, printed as H:MM:SS
    """

    corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))

    # One counter per result category; the categories are not mutually
    # exclusive (e.g. an extension can be both "ok" and "not modified").
    log_info("Summary:")
    log_info("  Updated {} out of {} extensions successfully".format(str(len(list(filter(lambda x: x.is_ok(), res)))),
                                                                     str(len(res))))
    log_info("  Updated extensions: {:8d}".format(
        len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res)))))
    log_info("  Updated SQL databases: {:8d}".format(len(list(filter(lambda x: x.sql_success(), res)))))
    log_info("  New extensions: {:8d}".format(len(list(filter(lambda x: x.is_new(), res)))))
    log_info("  Not authorized: {:8d}".format(len(list(filter(lambda x: x.not_authorized(), res)))))
    log_info("  Raised Google DDOS: {:8d}".format(len(list(filter(lambda x: x.raised_google_ddos(), res)))))
    log_info("  Not modified archives: {:8d}".format(len(list(filter(lambda x: x.not_modified(), res)))))
    log_info("  Extensions not in store: {:8d}".format(len(list(filter(lambda x: x.not_in_store(), res)))))
    log_info("  Unknown exception: {:8d}".format(len(list(filter(lambda x: x.has_exception(), res)))))
    log_info("  Corrupt tar archives: {:8d}".format(len(corrupt_tar_archives)))
    log_info("  SQL exception: {:8d}".format(len(list(filter(lambda x: x.sql_exception(), res)))))
    log_info(
        "  Worker exception: {:8d}".format(len(list(filter(lambda x: x.worker_exception is not None, res)))))
    log_info("  Total runtime: {}".format(str(datetime.timedelta(seconds=int(runtime)))))

    # Corrupt archives get an extra detailed listing with the exception text.
    if corrupt_tar_archives:
        log_info("")
        log_info("List of extensions with corrupted files/archives:")
        for x in corrupt_tar_archives:
            log_info("{}: {}".format(x.ext_id, x.exception), 1)
        log_info("")
107 |
108 |
def helpmsg():
    """Print help message."""
    usage = (
        "crawler [OPTION]",
        " -h print this help text",
        " -s silent (no log messages)",
        " -d discover new extensions",
        " -p number of concurrent downloads",
        " -a archive directory",
        " -t timeout for an individual extension download",
        " --max-discover discover at most N new extensions",
        " --pystuck start pystuck server for all processes",
    )
    for line in usage:
        print(line)
121 |
122 |
def print_config(basedir, archive_dir, conf_dir, discover, parallel,
                 ext_timeout, start_pystuck):
    """Log the current crawler configuration, one setting per line."""
    settings = [
        ("  Base dir: {}", basedir),
        ("  Archive directory: {}", archive_dir),
        ("  Configuration directory: {}", conf_dir),
        ("  Discover new extensions: {}", discover),
        ("  Max num. of concurrent downloads: {}", parallel),
        ("  Download timeout: {}", ext_timeout),
        ("  Start PyStuck: {}", start_pystuck),
    ]
    log_info("Configuration:")
    for template, value in settings:
        log_info(template.format(value))
134 |
135 |
def parse_args(argv):
    """Parse command line arguments.

    Returns the tuple (basedir, parallel, verbose, discover,
    max_discover, ext_timeout, start_pystuck); defaults come from the
    project configuration constants.
    """
    basedir = const_basedir()
    parallel = const_parallel_downloads()
    verbose = const_verbose()
    discover = const_discover()
    ext_timeout = const_ext_timeout()
    max_discover = None
    start_pystuck = False

    try:
        opts, _ = getopt.getopt(
            argv, "hsda:p:t:",
            ["timeout=", "archive=", 'parallel=', 'max-discover=', 'pystuck'])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-s':
            verbose = False
        elif opt == '-d':
            discover = True
        elif opt == '--pystuck':
            start_pystuck = True
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-p", "--parallel"):
            parallel = int(arg)
        elif opt in ("-t", "--timeout"):
            ext_timeout = int(arg)
        elif opt == '--max-discover':
            # Implies discovery with an upper bound on new ids.
            discover = True
            max_discover = int(arg)

    return basedir, parallel, verbose, discover, max_discover, ext_timeout, start_pystuck
172 |
173 |
def main(argv):
    """Main function of the extension crawler.

    Parses CLI options, prepares the archive/conf/log directory layout,
    optionally discovers new extension ids, downloads/updates all known
    extensions (retrying unknown failures once), and writes summary and
    per-category failure logs.
    """

    today = datetime.datetime.now(datetime.timezone.utc).isoformat()
    basedir, parallel, verbose, discover, max_discover, ext_timeout, start_pystuck = parse_args(argv)

    setup_logger(verbose)

    # Optional remote-debugging server for the main process.
    if start_pystuck:
        import pystuck
        pystuck.run_server(port=10000)

    # Surpressing these "Starting HTTPS connection ..." log messages
    # Older versions of requests use loglevel INFO for that, newer ones DEBUG
    logging.getLogger("requests").setLevel(logging.WARNING)

    # Directory layout: <basedir>/data, <basedir>/conf, <basedir>/log/<YYYY-MM>.
    archive_dir = os.path.join(basedir, "data")
    os.makedirs(archive_dir, exist_ok=True)
    conf_dir = os.path.join(basedir, "conf")
    os.makedirs(conf_dir, exist_ok=True)
    # Ensure the forums config file exists (touch).
    open(os.path.join(conf_dir, "forums.conf"), 'a').close()
    log_dir = os.path.join(basedir, "log", datetime.datetime.today().strftime("%Y-%m"))
    os.makedirs(log_dir, exist_ok=True)

    start_time = time.time()

    print_config(basedir, archive_dir, conf_dir, discover, parallel,
                 ext_timeout, start_pystuck)

    # Known ids = already-archived ids plus ids from the forum config.
    forum_ext_ids = get_forum_ext_ids(conf_dir)
    known_ids = list(set(get_existing_ids(archive_dir)) | set(forum_ext_ids))
    discovered_ids = []
    if discover:
        log_info("Discovering new ids {}...".format(
            "(at most {}) ".format(max_discover) if max_discover is not None else ""))
        try:
            discovered_ids = list(get_new_ids(known_ids, max_discover))
        except Exception:
            # Discovery is best effort; continue with the known ids.
            log_exception("Exception when discovering new ids")
        log_info("Discovered {} new extensions".format(len(discovered_ids)), 1)

    ext_ids = list(set(discovered_ids) | set(known_ids))

    # Drop the (possibly large) intermediate id lists.
    discovered_ids = None
    known_ids = None

    res = update_extensions(archive_dir, parallel, forum_ext_ids, ext_ids, ext_timeout, verbose, start_pystuck)

    # We re-try (once) the extensions with unknown exceptions, as
    # they are often temporary
    has_exception = list(filter(lambda x: x.has_exception(), res))
    if has_exception:
        log_info(" {} extensions with unknown exceptions, start another try ...".format(str(len(has_exception))))
        has_exception_ids = [x.ext_id for x in has_exception]
        forum_ext_ids_except = list(
            set(forum_ext_ids).intersection(set(has_exception_ids)))
        ext_ids_except = sorted(
            list(set(has_exception_ids) - set(forum_ext_ids_except)))
        res_update = update_extensions(archive_dir, parallel,
                                       forum_ext_ids_except, ext_ids_except, ext_timeout, verbose, start_pystuck)
        # Replace the failed results with the retry results.
        res = list(set(res) - set(has_exception)) + res_update

    end_time = time.time()
    log_summary(res, int(end_time - start_time))
    log_failures_to_file(log_dir, today, res)


if __name__ == "__main__":
    main(sys.argv[1:])
243 |
--------------------------------------------------------------------------------
/create-db:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 |
19 | import getopt
20 | import sys
21 | import tarfile
22 | import time
23 | import tempfile
24 | from functools import partial
25 | import fnmatch
26 | import multiprocessing
27 | from pebble import ProcessPool
28 | import os
29 | import datetime
30 |
31 | from ExtensionCrawler.archive import update_db_incremental
32 | from ExtensionCrawler.config import archive_file, const_basedir, const_mysql_config_file
33 | from ExtensionCrawler.util import log_info, log_exception, setup_logger, set_logger_tag
34 |
35 | from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
36 |
37 |
def print_help():
    """Print the command line usage of create-db."""
    print("""create-db [OPTION]""")
    print(""" -h print this help text""")
    print(""" -a archive directory""")
    print(""" -p three-letter-prefix""")
    print(""" -e file with extension ids""")
    # Typo fix: "lexographically" -> "lexicographically" (both lines).
    print(""" --from-date only process information gathered after"""
          """ this date (compared lexicographically)""")
    print(""" --until-date only process information gathered before"""
          """ this date (compared lexicographically)""")
    print(""" -t number of parallel threads""")
    print(""" -n process chunk n where n in [1,N]""")
    print(""" -N """)
    print(""" --delayed uses INSERT DELAYED INTO statements""")
52 |
def init_process(verbose):
    """Worker-process initializer for the ProcessPool.

    With a non-fork start method (forkserver is used in main), logging
    configuration is not inherited and must be redone per worker.
    """
    # When not using fork, we need to setup logging again in the worker threads
    setup_logger(verbose)
56 |
def process_id(from_date, until_date, delayed, path):
    """Unpack one extension tar archive and load its crawl dates into MySQL.

    :param from_date: skip dates strictly before this (lexicographic), or None
    :param until_date: skip dates strictly after this (lexicographic), or None
    :param delayed: use INSERT DELAYED statements in the MySQL backend
    :param path: path to the extension's .tar (or .tar.xz) archive
    """
    start = time.time()
    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(path) as t:
            # NOTE(review): extractall on an untrusted tar is path-traversal
            # prone; archives here are produced by our own crawler.
            t.extractall(tmpdir)

        # The archive contains one top-level directory named after the
        # extension id.
        extid = os.listdir(tmpdir)[0]
        set_logger_tag(extid)
        log_info("Start processing extension", 0)
        iddir = os.path.join(tmpdir, extid)

        try:
            with MysqlBackend(
                    extid,
                    delayed=delayed,
                    cache_etags=True,
                    read_default_file=const_mysql_config_file(),
                    charset='utf8mb4') as con:
                # One subdirectory per crawl date; process chronologically.
                for date in sorted(os.listdir(iddir)):
                    if (from_date is not None and date < from_date) or \
                            (until_date is not None and date > until_date):
                        log_info("* Skipping {}".format(date), 2)
                        continue
                    try:
                        update_db_incremental(iddir, extid, date, con)
                    except Exception:
                        # A single bad date must not abort the whole extension.
                        log_exception("Exception when handling data from {}".format(date), 0)
        except Exception:
            log_exception("Exception when handling extension", 0)
        log_info("Finished extension in {}".format(str(datetime.timedelta(seconds=int(time.time() - start)))), 0)
87 |
88 |
def find(archive, pattern):
    """Yield paths of extension tar archives under <archive>/data that
    match *pattern*.

    Matches both plain ``<pattern>.tar`` files and xz-compressed
    three-digit split parts (``<pattern>.NNN.tar.xz``).
    """
    data_root = os.path.join(archive, "data")
    globs = (pattern + ".tar", pattern + ".[0-9][0-9][0-9].tar.xz")
    for dirpath, _, filenames in os.walk(data_root):
        for name in filenames:
            if any(fnmatch.fnmatch(name, g) for g in globs):
                yield os.path.join(dirpath, name)
94 |
95 |
def find_from_file(archive, extidlistfile):
    """Yield archive file paths for the extension ids listed (one per
    line) in *extidlistfile*."""
    data_dir = os.path.join(archive, "data")
    with open(extidlistfile, 'r') as listfile:
        for raw_line in listfile:
            yield archive_file(data_dir, raw_line.strip())
100 |
101 |
def parse_args(argv):
    """Parse create-db command line arguments.

    Returns (paths, parallel, from_date, until_date, delayed), where
    *paths* is the chunk of archive files this task (taskid of
    maxtaskid) should process.
    """
    archive = const_basedir()
    parallel = 8
    taskid = 1
    maxtaskid = 1
    from_date = None
    until_date = None
    delayed = False
    paths = []

    try:
        opts, _ = getopt.getopt(argv, "ha:p:e:t:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
            "maxtaskid=", "from-date=", "until-date=", "delayed", "help"
        ])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            paths += find(archive, arg + "*")
        elif opt in ("-e", "--extidlistfile"):
            paths += find_from_file(archive, arg)
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)
        elif opt == "--from-date":
            from_date = arg
        elif opt == "--until-date":
            until_date = arg
        elif opt == "--delayed":
            delayed = True

    # No prefix/list given: process every archive.
    if not paths:
        paths = list(find(archive, "*"))

    # Split the work into maxtaskid chunks; the last task also takes the
    # remainder of the integer division.
    chunksize = len(paths) // maxtaskid
    lower = (taskid - 1) * chunksize
    if taskid == maxtaskid:
        paths = paths[lower:]
    else:
        paths = paths[lower:lower + chunksize]

    return paths, parallel, from_date, until_date, delayed
154 |
155 |
def main(argv):
    """Entry point: fan the selected archive tars out over a process pool."""
    # forkserver avoids copying parent state (DB connections, logging locks)
    # into the workers, which plain fork would do.
    multiprocessing.set_start_method("forkserver")
    verbose = True
    setup_logger(verbose)

    paths, parallel, from_date, until_date, delayed = parse_args(argv)

    # max_tasks=100 recycles each worker after 100 tasks (limits the impact
    # of slow resource leaks during long runs).
    with ProcessPool(max_workers=parallel, max_tasks=100, initializer=init_process, initargs=(verbose,)) as p:
        p.map(partial(process_id, from_date, until_date, delayed), paths)
165 |
166 |
167 | if __name__ == "__main__":
168 | main(sys.argv[1:])
169 |
--------------------------------------------------------------------------------
/crx-extract:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2017-2018 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """Tool for extracting crx file from a tar archive."""
20 |
21 | import os
22 | import sys
23 | import glob
24 | import getopt
25 | import tarfile
26 | import datetime
27 | import dateutil
28 | import dateutil.parser
29 | from ExtensionCrawler.archive import last_crx, get_local_archive_dir
30 | from ExtensionCrawler.config import const_basedir
31 |
32 |
def helpmsg():
    """Print usage information for crx-extract."""
    print("crx-extract [OPTION] extid")
    print(" -h print this help text")
    print(" -s silent (no log messages)")
    # FIX: corrected the typo "outoput" -> "output" in the help text.
    print(" -e use etag instead of date in output")
    print(" -w avoid ':' in filenames (useful on Windows)")
    print(" -d= date")
    print(" -o= output directory")
    print(" -a= archive directory")
43 |
44 |
def get_tarinfo(members, name, winfs=False, etag=None):
    """Yield the tarinfo object whose stored path equals *name*.

    With winfs=True any ':' in the member name is rewritten to '-' (colons
    are illegal in Windows filenames).  With an *etag* given, the middle
    (date) component of the path is replaced by the etag.
    """
    for member in members:
        if member.name != name:
            continue
        if winfs:
            member.name = name.replace(":", "-")
        if etag is not None:
            parent, crx_name = os.path.split(member.name)
            grandparent = os.path.split(parent)[0]
            member.name = os.path.join(grandparent, etag, crx_name)
        yield member
56 |
57 |
def main(argv):
    """Extract the most recent (or date-bounded) crx of an extension.

    Looks the crx up via last_crx, then extracts it from the extension's
    main tar, falling back to the rolled-over .NNN.tar.xz archives
    (newest first) until it is found.
    """
    basedir = const_basedir()
    verbose = True
    date = None
    useetag = False
    output = ""
    winfs = False
    try:
        opts, args = getopt.getopt(argv, "hsed:a:o:w",
                                   ["date=", "archive=", "output="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-d", "--date"):
            date = arg
        elif opt in ("-o", "--output"):
            output = arg
        elif opt in ("-w", "--winfs"):
            winfs = True
        elif opt in ("-e", "--etag"):
            useetag = True
        elif opt == '-s':
            verbose = False

    if len(args) > 0:
        extid = args[0]
    else:
        helpmsg()
        sys.exit()

    if date is not None:
        # Normalize the user-supplied date to an aware UTC datetime so it
        # compares cleanly against the archive's stored dates.
        dateobj = dateutil.parser.parse(date)
        if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
            dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
        last, etag = last_crx(os.path.join(basedir, "data"), extid, dateobj)
    else:
        last, etag = last_crx(os.path.join(basedir, "data"), extid)

    if not useetag:
        etag = None
    basetar = os.path.join(basedir, "data",
                           get_local_archive_dir(extid), extid)
    tar = basetar + ".tar"

    if last != "":
        if os.path.exists(tar):
            # BUG FIX: TarFile.extractall() always returns None, so the old
            # code could never detect a successful extraction and always fell
            # through to every .NNN.tar.xz archive.  Materialize the matching
            # members first and use them as the success indicator.
            if verbose:
                print("Extracting " + os.path.join(output, last) + " from " + tar)
            with tarfile.open(tar, 'r') as archive:
                files = list(get_tarinfo(archive, last, winfs, etag))
                if files:
                    archive.extractall(path=output, members=files)
            archivetars = sorted(glob.glob(basetar + ".[0-9][0-9][0-9].tar.xz"))
            # Fall back to rolled-over archives, newest first, until found.
            while (not files and archivetars):
                tar = archivetars.pop()
                if verbose:
                    print("Extracting " + os.path.join(output, last) + " from " + tar)
                with tarfile.open(tar, 'r:xz') as archive:
                    files = list(get_tarinfo(archive, last, winfs, etag))
                    if files:
                        archive.extractall(path=output, members=files)
        elif verbose:
            print("Cannot find archive " + tar)
    elif verbose:
        if os.path.exists(tar):
            print("CRX not in archive" + tar)
        else:
            print("CRX does not exist: cannot find archive " + tar)
134 |
135 |
136 | if __name__ == "__main__":
137 | main(sys.argv[1:])
138 |
--------------------------------------------------------------------------------
/crx-jsinventory:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """Tool for extracting crx file from a tar archive."""
20 |
21 | import sys
22 | import getopt
23 | import csv
24 | import logging
25 | from collections import OrderedDict
26 | from zipfile import ZipFile
27 | from tabulate import tabulate
28 | from ExtensionCrawler.js_decomposer import decompose_js
29 | from ExtensionCrawler.config import (const_log_format)
30 |
31 |
def helpmsg():
    """Print usage information for crx-jsinventory.

    Note: the long option is spelled "--cvs" elsewhere in this script
    (historical typo); renaming it would break existing callers, so only
    the description text is corrected here.
    """
    print("crx-jsinventory [OPTION] crx-file|js-file")
    print(" -h print this help text")
    # FIX: corrected "cvs file" -> "csv file" in the description.
    print(" -c= csv file (output)")
    print(" -v verbose")
    print(
        " -d disable use of database with file information (not recommended)"
    )
    print(" -s silent")
42 |
43 |
def main(argv):
    """Build a JavaScript inventory of a crx archive or plain js file.

    Decomposes the input with decompose_js, prints a (brief unless -v)
    table of the findings, and optionally writes the full inventory to a
    CSV file (-c).
    """
    verbose = False
    silent = False
    csvfile = None
    database = True
    try:
        opts, args = getopt.getopt(argv, "hvdsc:", ["cvs="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-v':
            verbose = True
        elif opt == '-s':
            silent = True
        elif opt == '-d':
            database = False
        elif opt in ('-c', "--cvs"):
            csvfile = arg

    if len(args) > 0:
        filename = args[0]
    else:
        helpmsg()
        sys.exit()

    if verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING

    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(loglevel)

    # Full CSV column set; the brief set below is used for terminal output
    # unless -v is given.
    fieldnames = [
        'filename', 'path', 'size', 'dec_size', 'md5', 'sha1', 'mimetype',
        'description', 'encoding', 'type', 'detectionMethod',
        'detectionMethodDetails', 'lib', 'version', 'lib_filename',
        'evidenceText', 'evidenceStartPos', 'evidenceEndPos'
    ]

    brief_fieldnames = [
        'filename', 'md5', 'type', 'detectionMethod', 'lib', 'version',
        'lib_filename'
    ]

    if filename.endswith('.crx'):
        with ZipFile(filename) as crxobj:
            inventory = decompose_js(crxobj, database)
    else:
        inventory = decompose_js(filename, database)

    if not silent:
        if verbose:
            print_fieldnames = fieldnames
        else:
            print_fieldnames = brief_fieldnames

        print_inventory = []
        for item in inventory:
            tmp = {k: item[k] for k in print_fieldnames}
            # Render enum members and binary digests human-readably.
            if 'type' in tmp:
                tmp['type'] = tmp['type'].value
            if 'detectionMethod' in tmp:
                tmp['detectionMethod'] = tmp['detectionMethod'].value
            if 'md5' in tmp:
                tmp['md5'] = tmp['md5'].hex()
            if 'sha1' in tmp:
                tmp['sha1'] = tmp['sha1'].hex()

            print_inventory.append(
                OrderedDict(
                    sorted(
                        tmp.items(),
                        key=lambda t: print_fieldnames.index(t[0]))))
        print(tabulate(print_inventory, headers='keys'))

    if csvfile is not None:
        # BUG FIX: files handed to the csv module must be opened with
        # newline='' (per the csv docs); otherwise extra blank rows appear
        # on platforms with \r\n line endings.
        with open(csvfile, 'w', newline='') as csvobj:
            writer = csv.DictWriter(csvobj, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(inventory)
133 |
134 |
135 | if __name__ == "__main__":
136 | main(sys.argv[1:])
137 |
--------------------------------------------------------------------------------
/crx-tool:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """ A tool for analyzing and extracting `*.crx` files
20 | (i.e., Chrome extensions)."""
21 |
22 | import argparse
23 | from ExtensionCrawler.crx import extract_crxfile, verify_crxfile
24 |
25 |
def main():
    """Verify or extract a *.crx Chrome extension archive.

    Default action is verification; -e extracts into the optional targetdir.
    Exits with the return value of the selected operation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="chrome extension archive (*.crx)")
    parser.add_argument('targetdir', nargs='?', default="")
    # FIX: restored "<file>" in the help strings (the angle-bracketed
    # placeholder had been stripped, leaving dangling "of "/"extract ").
    parser.add_argument(
        "-c",
        "--check",
        help="verify format and signature of <file>",
        action="store_true")
    parser.add_argument(
        "-e", "--extract", help="extract <file>", action="store_true")
    parser.add_argument(
        "-f",
        "--force",
        help="apply action also to (potential) invalid files",
        action="store_true")
    parser.add_argument(
        "-v", "--verbose", help="increase verbosity", action="store_true")
    args = parser.parse_args()

    if args.extract:
        retval = extract_crxfile(args.verbose, args.force, args.file,
                                 args.targetdir)
    else:
        # Default action (also with -c): verification only.
        retval = verify_crxfile(args.verbose, args.file)

    # BUG FIX: exit() is injected by the `site` module and is absent under
    # `python -S`; raising SystemExit is the portable equivalent.
    raise SystemExit(retval)
53 | exit(retval)
54 |
55 |
56 | if __name__ == "__main__":
57 | main()
58 |
--------------------------------------------------------------------------------
/database/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | The extension crawler downloads all metadata and extension files into tar files.
4 | This is great for archival, but not so great for analyzing the data. The crawler
5 | therefore also supports inserting all newly crawled information into a MariaDB
6 | database. Additionally, there exists a script to regenerate the database from
7 | old tar files.
8 |
9 |
10 | # Setting up the database
11 |
12 | ## Hardware requirements
13 |
The database is meant to be set up on an (old) PC, although it should also work
with common cloud offerings.
16 |
17 | The amount of data that the database needs to handle grows over time. Currently,
18 | containing ~18 months worth of data, the database requires ~150GB of space.
19 |
20 | It is recommended to have at least 16GB of RAM to keep the indices available;
21 | less RAM might work, more RAM will certainly speed queries up. It is also good
to have at least 16GB of swap; while this is detrimental to the performance of
23 | MariaDB, it is often better than it being killed by the OS.
24 |
25 | For storage, it is beneficial to have at least one HDD and one SSD, as the
26 | database workload can be split into sequential and random IO.
27 |
28 |
29 | ## Configuration
30 |
31 | A commented configuration file for MariaDB can be found in `config/my.cnf`.
32 | Configuration options such as pool size and storage locations will need to be
33 | adjusted.
34 |
35 | ## Table schemas
36 |
37 | To set up the tables and schemas, make sure that you have the credentials for
38 | root in your `~/.my.cnf` file, and execute the following:
39 | ```bash
40 | mysql -e "create database extensions;"
41 | for f in schemas/*.sql; do mysql extensions < $f; done
42 | for f in views/*.sql; do mysql extensions < $f; done
43 | ```
44 |
45 | # Maintaining the database
46 |
47 | ## Memory consumption
48 |
49 | MariaDB will, at times, use much more memory than specified for the pool size --
50 | 100GB with a pool size of 4GB is certainly possible while regenerating the data.
51 | In these cases, the database should be restarted. The crawler and regeneration
52 | script will retry their database operations by default for around one hour.
53 |
54 | ## Backup
55 |
56 | Regenerating the whole data set can take days, if not weeks, so even though all
57 | data can be restored, having a backup speeds up recovery. For this purpose, the
58 | MariaDB binary log is enabled to allow physical backups, which are much faster
59 | than logical backups for our case. The folder `scripts/` contains scripts to do
60 | full and incremental backups, as well as scripts to backup the schemas and users
61 | (including permissions and hashed passwords).
62 |
63 | # Regenerating extension data
64 |
65 | When the crawler is changed to extract more or different data from the
66 | extensions, one will probably want to regenerate all data, i.e., ask the crawler
67 | to go through all existing tar files and re-extract the already downloaded data.
In order to do so, the `create-db` script or `sge/create-db.sh` (for HPCs) can be used.
69 | More information can be found when calling these scripts with `--help`.
70 |
71 | # Using the data set
72 |
73 | ## Example queries
74 |
75 | For more (commented) queries, see the `queries/` folder.
76 |
77 | - ```sql
78 | select extid,crx_etag,count(filename) from extension_most_recent_small join crxfile using (crx_etag) where filename like '%.js' group by extid,crx_etag limit 10;
79 | ```
80 | This query will print the number of JavaScript files per extension.
81 |
82 | ## Table schemas
83 |
84 | All schema files can be found in the `schemas/` folder.
85 |
86 | | Table name | Description |
87 | | --- | --- |
88 | | extension | General extension metadata from the store pages. One row per \
89 | extension and crawldate (!). If you are only interested in the most recent \
90 | *view* of the Chrome Web Store, use the `extension_most_recent` view. For \
91 | testing your queries, suffix either table/view with *\_small* to only get \
92 | roughly 1/256th of all extensions. |
93 | | status | The HTTP status codes for the store page and `crx` download. |
94 | | crx | General metadata of the extension file (the `crx` archive itself). Also \
95 | contains the manifest. |
96 | | crxfile | General metadata of the extension files, e.g., the files contained \
97 | in the `crx` archives (JavaScript files, etc.).|
98 | | category | Categories of the extensions, e.g. *productivity*, *office*, \
99 | or *game*. |
100 | | permission | Permissions found in the manifests, e.g., *webRequest*, *tab*, but also \
101 | host permissions such as *https://www.google.com*. |
102 | | content_script_url | Content script URLs found in the manifest. These are the \
103 | URLs where the extensions request to have a content script executed when the \
104 | user visits the website. |
105 | | libdet | Information about used libraries. For each file found in `crx` \
106 | archives (identified by MD5 sums), this table stores classifications of the \
107 | file, e.g., whether it is a certain library. |
108 | | review{,\_comment} | Post-metadata and posts from the review forum of an extension. |
109 | | support{,\_comment} | Post-metadata and posts from the support forum of an extension. |
110 | | reply{,\_comment} | Reply-post-metadata and posts for both the review and support forums. |
111 |
112 | ## Views
113 |
114 | All views can be found in the `views/` folder.
115 |
116 | | View name | Description |
117 | | --- | --- |
118 | | extension_small | Contains only roughly 1/256th of all extensions. |
119 | | extension_most_recent | Instead of one row for every combination of extension \
120 | id and crawl date, this view only contains the rows from the most recent crawl \
121 | date. |
122 | | extension_most_recent_small | Same, but roughly only 1/256th of all extensions. |
123 | | extension_second_most_recent | Similar to `extension_most_recent`, but \
124 | contains the second-most recent entry for all extensions. This is useful for \
125 | investigating how extensions change. |
126 | | extension_{most,second_most}_recent_until_date | Parameterized query. Only \
127 | considers extensions crawled before a given date. Usage: \
128 | ```sql
129 | select * from (select @until_date:='2018-05-25') foo, extension_most_recent_until_date;
130 | ``` |
131 | | extension_update | Selects all extension updates in the database. A row in the result represents \
132 | one extension update, with the date and crx_etag when we have first seen the \
133 | update, and the date and crx_etag when we have last seen the old version. As \
134 | we crawl every night, the difference should be around 24 hours on average. |
135 |
--------------------------------------------------------------------------------
/database/config/my.cnf:
--------------------------------------------------------------------------------
1 | [client]
2 | port = 3306
3 | socket = /run/mysqld/mysqld.sock
4 |
5 | [mysqld]
6 | port = 3306
7 | socket = /run/mysqld/mysqld.sock
8 |
9 | wait_timeout=1800
10 | max_connections=1000
11 | explicit_defaults_for_timestamp=1
12 | default_time_zone='+00:00'
13 |
14 | server-id = 1
15 |
16 | expire_logs_days=8
17 | log-basename=master1-bin
18 |
19 | # Ideally, the MariaDB datadir resides on a HDD, as there will be a lot of sequential IO.
20 | # After creating a database, it is best moved to a SSD, as there will be a lot of
21 | # random IO. This can be done by simply moving the directory (do NOT move individual table
22 | # files!), e.g.: cd /hdd/mysql; mv extensions /ssd/databases/; ln -s /ssd/databases/extensions
23 | datadir=/hdd/mysql
24 |
25 | # When adding indices, MariaDB uses a lot of space in /tmp. If that space is not enough, the
26 | # used tmpdir can be moved:
27 | innodb_tmpdir=/ssd/innodb_tmp
28 |
29 | # The pool size is said to be around 75% of the available RAM on db-only hosts. However, current
30 | # versions of MariaDB seem to have serious memory leaks when doing a lot of concurrent writes.
31 | # Therefore, expect MariaDB to use a lot more memory, create sufficient swap to prevent killing,
32 | # and restart MariaDB when the usage grows too high.
33 | innodb_buffer_pool_size = 18G
34 |
35 | # General performance tweaks
36 | innodb_read_io_threads=8
37 | innodb_write_io_threads=8
38 | innodb_sort_buffer_size=67108864
39 | innodb_log_file_size=256M
40 | innodb_log_buffer_size=256M
41 |
42 | # Performance tweaks for inserts
43 | #innodb_flush_log_at_trx_commit=0
44 | #innodb_change_buffer_max_size=50
45 | #innodb_flush_method=O_DIRECT
46 |
47 | [mysqldump]
48 | quick
49 | max_allowed_packet = 16M
50 |
51 | [mysql]
52 | no-auto-rehash
53 |
54 | [myisamchk]
55 | key_buffer_size = 20M
56 | sort_buffer_size = 20M
57 | read_buffer = 2M
58 | write_buffer = 2M
59 |
60 | [mysqlhotcopy]
61 | interactive-timeout
62 |
--------------------------------------------------------------------------------
/database/queries/get_added_content_scripts.sql:
--------------------------------------------------------------------------------
-- Extensions whose update crawled within the last two days added a catch-all
-- content-script URL pattern that the previous version did not register,
-- ordered by popularity (download count).
select downloads, eu.extid, name, url, new_crx_etag
from extension_update eu join extension e on eu.extid=e.extid and eu.first_date_with_new_crx_etag=e.date
join content_script_url c on eu.new_crx_etag=c.crx_etag
where
-- catch-all match patterns only
url in (
"file://*/*",
"http://*/*",
"https://*/*",
"*://*/*",
-- FIX: restored "<all_urls>" (the angle-bracketed pattern had been stripped
-- to an empty string, which never matches any stored URL).
"<all_urls>"
)
and
-- ...that the previous crx version did not already have
url not in (select url from content_script_url where crx_etag=previous_crx_etag)
and
first_date_with_new_crx_etag > NOW() - INTERVAL 2 DAY
order by downloads desc;
17 |
--------------------------------------------------------------------------------
/database/queries/get_added_permissions.sql:
--------------------------------------------------------------------------------
-- Extensions whose update crawled within the last two days added a sensitive
-- permission that the previous version did not request, ordered by
-- popularity (download count).
select downloads, eu.extid, name, permission, new_crx_etag
from extension_update eu join extension e on eu.extid=e.extid and eu.first_date_with_new_crx_etag=e.date
join permission p on eu.new_crx_etag=p.crx_etag
where
-- permissions of interest: catch-all host access and request interception
permission in (
-- FIX: restored "<all_urls>" (the angle-bracketed pattern had been stripped
-- to an empty string, which never matches any stored permission).
"<all_urls>",
"http://*/*",
"https://*/*",
"webRequest",
"webRequestBlocking"
)
and
-- ...that the previous crx version did not already request
permission not in (select permission from permission where crx_etag=previous_crx_etag)
and
first_date_with_new_crx_etag > NOW() - INTERVAL 2 DAY
order by downloads desc;
17 |
--------------------------------------------------------------------------------
/database/schemas/category.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `category`
18 | --
19 |
DROP TABLE IF EXISTS `category`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Store categories of an extension at a given crawl date.  The category
-- text is keyed by its MD5 so the compressed TEXT column can stay out of
-- the primary key.
CREATE TABLE `category` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `category_md5` varbinary(16) NOT NULL,
  `category` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`category_md5`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
31 | /*!40101 SET character_set_client = @saved_cs_client */;
32 |
33 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
34 |
35 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
36 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
37 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
38 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
39 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
40 |
41 | -- Dump completed on 2018-08-09 12:31:29
42 |
--------------------------------------------------------------------------------
/database/schemas/cdnjs.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `cdnjs`
18 | --
19 |
DROP TABLE IF EXISTS `cdnjs`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Metadata for files shipped on cdnjs, one row per (path, typ), where `typ`
-- records which normalization of the file the hashes refer to.  Presumably
-- used to match extension files against known library files via (md5, typ)
-- -- see the cdnjs_md5_typ index -- confirm against the crawler code.
CREATE TABLE `cdnjs` (
  `path` varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL,
  `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL,
  `md5` varbinary(16) NOT NULL,
  `filename` varchar(253) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `sha1` varbinary(20) DEFAULT NULL,
  `sha256` varbinary(32) DEFAULT NULL,
  `simhash` varbinary(64) DEFAULT NULL,
  `size` bigint(20) DEFAULT NULL,
  `loc` bigint(20) DEFAULT NULL,
  `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `encoding` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype` varchar(126) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `add_date` datetime(6) NULL DEFAULT NULL,
  `library` varchar(254) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `version` varchar(30) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`path`,`typ`),
  KEY `cdnjs_md5_typ` (`md5`,`typ`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
44 | /*!40101 SET character_set_client = @saved_cs_client */;
45 |
46 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
47 |
48 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
52 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
53 |
54 | -- Dump completed on 2018-08-09 12:31:29
55 |
--------------------------------------------------------------------------------
/database/schemas/content_script_url.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `content_script_url`
18 | --
19 |
DROP TABLE IF EXISTS `content_script_url`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Content-script URL patterns registered by an extension version (keyed by
-- its crx_etag).  The URL text is keyed by its MD5 so the compressed TEXT
-- column can stay out of the primary key.
CREATE TABLE `content_script_url` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `url_md5` varbinary(16) NOT NULL,
  `url` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`,`url_md5`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
30 | /*!40101 SET character_set_client = @saved_cs_client */;
31 |
32 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
33 |
34 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
35 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
36 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
37 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
38 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
39 |
40 | -- Dump completed on 2018-08-09 12:31:29
41 |
--------------------------------------------------------------------------------
/database/schemas/crx.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `crx`
18 | --
19 |
DROP TABLE IF EXISTS `crx`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- One row per distinct crx archive, keyed by its etag: archive metadata,
-- the extension's public key, and the raw manifest.
CREATE TABLE `crx` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `filename` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `size` int(11) NOT NULL,
  `publickey` blob NOT NULL,
  -- utf8mb4_bin: the manifest is JSON, compared byte-exactly.
  `manifest` longtext /*!100301 COMPRESSED*/ CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT '',
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
32 | /*!40101 SET character_set_client = @saved_cs_client */;
33 |
34 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
35 |
36 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
37 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
38 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
39 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
40 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
41 |
42 | -- Dump completed on 2018-08-09 12:31:29
43 |
--------------------------------------------------------------------------------
/database/schemas/crxfile.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `crxfile`
18 | --
19 |
DROP TABLE IF EXISTS `crxfile`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Metadata for each file inside a CRX archive.  Each file can appear in up
-- to four normalisation variants (`typ`); the hash columns (md5/sha1/sha256/
-- simhash) support content lookup, e.g. via the `crxfile_md5_typ` index.
CREATE TABLE `crxfile` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `path` varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL,
  `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL,
  `md5` varbinary(16) DEFAULT NULL,
  `filename` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `sha1` varbinary(20) DEFAULT NULL,
  `sha256` varbinary(32) DEFAULT NULL,
  `simhash` varbinary(64) DEFAULT NULL,
  `mimetype` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`,`path`,`typ`),
  KEY `crxfile_md5_typ` (`md5`,`typ`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
38 | /*!40101 SET character_set_client = @saved_cs_client */;
39 |
40 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
41 |
42 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
43 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
44 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
45 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
46 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
47 |
48 | -- Dump completed on 2018-08-09 12:31:29
49 |
--------------------------------------------------------------------------------
/database/schemas/extension.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `extension`
18 | --
19 |
DROP TABLE IF EXISTS `extension`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- One Web Store metadata snapshot per crawl of an extension, keyed by
-- (extid, date).  `crx_etag` links the snapshot to the archive stored in
-- the `crx` table.  The views extension_most_recent* select from here.
CREATE TABLE `extension` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `name` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `version` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `downloads` int(11) DEFAULT NULL,
  `rating` double DEFAULT NULL,
  `ratingcount` int(11) DEFAULT NULL,
  `fulldescription` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `offeredby` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `developer` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `itemcategory` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `lastupdated` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`) KEY_BLOCK_SIZE=8,
  KEY `extension_crx_etag` (`crx_etag`),
  KEY `extension_date` (`date`),
  KEY `extension_date_extid` (`date`,`extid`),
  KEY `extension_extid_crx_etag` (`extid`,`crx_etag`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
45 | /*!40101 SET character_set_client = @saved_cs_client */;
46 |
47 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
48 |
49 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
50 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
51 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
52 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
53 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
54 |
55 | -- Dump completed on 2018-08-09 12:31:29
56 |
--------------------------------------------------------------------------------
/database/schemas/libdet.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `libdet`
18 | --
19 |
DROP TABLE IF EXISTS `libdet`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Library-detection results keyed by file content hash and normalisation
-- variant (md5, typ) — the same key shape as crxfile's md5/typ columns, so
-- detections apply to every archive containing an identical file.
CREATE TABLE `libdet` (
  `md5` varbinary(16) NOT NULL,
  `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL,
  `sha1` varbinary(20) DEFAULT NULL,
  `sha256` varbinary(32) DEFAULT NULL,
  `size` bigint(20) DEFAULT NULL,
  `loc` bigint(20) DEFAULT NULL,
  `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `encoding` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `library` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `version` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `classification_type` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `detect_method` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `detect_method_details` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Evidence span: where in the file the detection matched, plus the text.
  `evidence_start_pos` bigint(20) DEFAULT NULL,
  `evidence_end_pos` bigint(20) DEFAULT NULL,
  `evidence_text` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_magic` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`md5`,`typ`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
46 | /*!40101 SET character_set_client = @saved_cs_client */;
47 |
48 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
49 |
50 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
51 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
52 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
53 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
54 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
55 |
56 | -- Dump completed on 2018-08-09 12:31:29
57 |
--------------------------------------------------------------------------------
/database/schemas/permission.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `permission`
18 | --
19 |
DROP TABLE IF EXISTS `permission`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- One row per permission declared by a CRX; the permission string is keyed
-- by its own md5 so arbitrarily long permission text fits in the PK.
CREATE TABLE `permission` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `permission_md5` varbinary(16) NOT NULL,
  `permission` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`,`permission_md5`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
30 | /*!40101 SET character_set_client = @saved_cs_client */;
31 |
32 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
33 |
34 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
35 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
36 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
37 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
38 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
39 |
40 | -- Dump completed on 2018-08-09 12:31:29
41 |
--------------------------------------------------------------------------------
/database/schemas/reply.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `reply`
18 | --
19 |
DROP TABLE IF EXISTS `reply`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Replies to Web Store reviews/support posts, one per (extid, crawl date,
-- author, comment date).  The comment body itself is deduplicated into
-- `reply_comment` and referenced here via `commentmd5`.
CREATE TABLE `reply` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL,
  `commentdate` datetime NOT NULL,
  `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `replyto` varchar(98) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `commentmd5` varbinary(16) DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
36 | /*!40101 SET character_set_client = @saved_cs_client */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/schemas/reply_comment.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `reply_comment`
18 | --
19 |
DROP TABLE IF EXISTS `reply_comment`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Deduplicated reply bodies, keyed by the md5 of the comment text and
-- referenced from `reply`.`commentmd5`.
CREATE TABLE `reply_comment` (
  `commentmd5` varbinary(16) NOT NULL,
  `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`commentmd5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
29 | /*!40101 SET character_set_client = @saved_cs_client */;
30 |
31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
32 |
33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
38 |
39 | -- Dump completed on 2018-08-09 12:31:29
40 |
--------------------------------------------------------------------------------
/database/schemas/review.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `review`
18 | --
19 |
DROP TABLE IF EXISTS `review`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- User reviews of an extension, one per (extid, crawl date, author,
-- comment date); same layout as `reply` but with a numeric `rating`.
-- The review text is deduplicated into `review_comment` via `commentmd5`.
CREATE TABLE `review` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL,
  `commentdate` datetime NOT NULL,
  `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `rating` double DEFAULT NULL,
  `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `commentmd5` varbinary(16) DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
36 | /*!40101 SET character_set_client = @saved_cs_client */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/schemas/review_comment.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `review_comment`
18 | --
19 |
DROP TABLE IF EXISTS `review_comment`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Deduplicated review bodies, keyed by the md5 of the comment text and
-- referenced from `review`.`commentmd5`.
CREATE TABLE `review_comment` (
  `commentmd5` varbinary(16) NOT NULL,
  `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`commentmd5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
29 | /*!40101 SET character_set_client = @saved_cs_client */;
30 |
31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
32 |
33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
38 |
39 | -- Dump completed on 2018-08-09 12:31:29
40 |
--------------------------------------------------------------------------------
/database/schemas/status.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `status`
18 | --
19 |
DROP TABLE IF EXISTS `status`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Crawl result per (extid, date): HTTP-style status of the CRX download
-- and of the overview-page fetch, plus the overview exception text if the
-- fetch raised one.
CREATE TABLE `status` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `crx_status` int(11) DEFAULT NULL,
  `overview_status` int(11) DEFAULT NULL,
  `overview_exception` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
32 | /*!40101 SET character_set_client = @saved_cs_client */;
33 |
34 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
35 |
36 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
37 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
38 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
39 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
40 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
41 |
42 | -- Dump completed on 2018-08-09 12:31:29
43 |
--------------------------------------------------------------------------------
/database/schemas/support.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `support`
18 | --
19 |
DROP TABLE IF EXISTS `support`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Support-forum posts for an extension, one per (extid, crawl date,
-- author, comment date); same layout as `review` but with a `title`
-- instead of a rating.  Body text is deduplicated into `support_comment`.
CREATE TABLE `support` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL,
  `commentdate` datetime NOT NULL,
  `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `title` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `commentmd5` varbinary(16) DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
36 | /*!40101 SET character_set_client = @saved_cs_client */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/schemas/support_comment.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `support_comment`
18 | --
19 |
DROP TABLE IF EXISTS `support_comment`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Deduplicated support-post bodies, keyed by the md5 of the comment text
-- and referenced from `support`.`commentmd5`.
CREATE TABLE `support_comment` (
  `commentmd5` varbinary(16) NOT NULL,
  `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`commentmd5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
29 | /*!40101 SET character_set_client = @saved_cs_client */;
30 |
31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
32 |
33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
38 |
39 | -- Dump completed on 2018-08-09 12:31:29
40 |
--------------------------------------------------------------------------------
/database/scripts/mariabackup-full:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Full physical backup of the local MariaDB server, streamed to stdout in
# xbstream format with quicklz compression (4 copy threads, 2 compressors).

set -eu

exec /usr/bin/mariabackup --backup \
    --stream=xbstream \
    --parallel=4 \
    --compress --compress-threads=2
7 |
--------------------------------------------------------------------------------
/database/scripts/mariabackup-inc:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Incremental physical backup of the local MariaDB server, streamed to
# stdout in xbstream format.  Usage: mariabackup-inc <LSN>
# <LSN> is the last-committed log sequence number of the previous backup;
# only pages changed since that LSN are copied.

set -o nounset
set -o errexit

# Explicit argument check: with `set -o nounset` a missing $1 would
# otherwise abort with an unhelpful "unbound variable" error.
if [ "$#" -ne 1 ]; then
    >&2 echo "Usage: $(basename "$0") <LSN>"
    exit 1
fi

LSN=$1
if ! [[ "$LSN" =~ ^[0-9]+$ ]]; then
    >&2 echo "Invalid LSN: $LSN"
    exit 1
fi

/usr/bin/mariabackup --backup --stream=xbstream --parallel=4 --compress --compress-threads=2 --incremental-lsn="$LSN"
13 |
--------------------------------------------------------------------------------
/database/scripts/mariabackup-schemas:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Dump the schema (structure only, no data) of every non-system database
# as per-table .sql files via mysqldump --tab, and emit the whole tree as
# a gzipped tar archive on stdout.

set -o errexit
set -o nounset

T=$(mktemp -d)
# Clean up the scratch directory even when a dump step fails: with
# errexit the original trailing `rm -r` would never run on error.
trap 'rm -rf "$T"' EXIT

for db in $(mysql -N -e "show databases" | grep -v -e "^mysql$" -e "^information_schema$" -e "^performance_schema$")
do
    mkdir -p "$T/schemas/$db"
    mysqldump "$db" --no-data --single-transaction --tab="$T/schemas/$db"
done
(cd "$T"; tar cz *)
14 |
--------------------------------------------------------------------------------
/database/scripts/showgrants:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -o errexit
set -o nounset

# Replayable privilege dump: the first mysql call generates one
# "SHOW GRANTS FOR 'user'@'host';" statement per non-anonymous account,
# the second executes them, and sed appends ';' to every emitted grant
# line so the output can be piped straight back into mysql to restore
# all grants.
# NOTE(review): the empty "" positional argument appears to deliberately
# select no default database — confirm before changing.
mysql "" --skip-column-names -A -e"SELECT CONCAT('SHOW GRANTS FOR ''',user,'''@''',host,''';') FROM mysql.user WHERE user<>''" | mysql "" --skip-column-names -A | sed 's/$/;/g'
6 |
--------------------------------------------------------------------------------
/database/views/extension_most_recent.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_most_recent`
18 | --
19 |
/*!50001 DROP TABLE IF EXISTS `extension_most_recent`*/;
/*!50001 DROP VIEW IF EXISTS `extension_most_recent`*/;
/*!50001 SET @saved_cs_client          = @@character_set_client */;
/*!50001 SET @saved_cs_results         = @@character_set_results */;
/*!50001 SET @saved_col_connection     = @@collation_connection */;
/*!50001 SET character_set_client      = utf8 */;
/*!50001 SET character_set_results     = utf8 */;
/*!50001 SET collation_connection      = utf8_general_ci */;
-- Latest crawl snapshot per extension: the derived table `e2` picks
-- max(date) for each extid from `extension`, then joins back to
-- `extension` (`e3`) on (extid, date) to return the full row.
/*!50001 CREATE ALGORITHM=UNDEFINED */
/*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */
/*!50001 VIEW `extension_most_recent` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_most_recent_small.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_most_recent_small`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent_small`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent_small`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
28 | /*!50001 CREATE ALGORITHM=UNDEFINED */
29 | /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */
30 | /*!50001 VIEW `extension_most_recent_small` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` where `e1`.`extid` like 'aa%' group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_most_recent_until_date.sql:
--------------------------------------------------------------------------------
-- Helper used by the *_until_date views: returns the session variable
-- @until_date so the cut-off date can be set per connection before querying.
drop function if exists until_date;
-- Fixed: function name needs a (possibly empty) parameter list, and
-- "DEERMINISTIC" was a typo for DETERMINISTIC.
create function until_date() returns datetime NO SQL DETERMINISTIC return @until_date;
3 |
4 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
5 | --
6 | -- Host: localhost Database: extensions
7 | -- ------------------------------------------------------
8 | -- Server version 10.3.8-MariaDB-log
9 |
10 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
11 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
12 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
13 | /*!40101 SET NAMES utf8 */;
14 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
15 | /*!40103 SET TIME_ZONE='+00:00' */;
16 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
17 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
18 |
19 | --
20 | -- Final view structure for view `extension_most_recent_until_date`
21 | --
22 |
23 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent_until_date`*/;
24 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent_until_date`*/;
25 | /*!50001 SET @saved_cs_client = @@character_set_client */;
26 | /*!50001 SET @saved_cs_results = @@character_set_results */;
27 | /*!50001 SET @saved_col_connection = @@collation_connection */;
28 | /*!50001 SET character_set_client = utf8 */;
29 | /*!50001 SET character_set_results = utf8 */;
30 | /*!50001 SET collation_connection = utf8_general_ci */;
31 | /*!50001 CREATE ALGORITHM=UNDEFINED */
32 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */
33 | /*!50001 VIEW `extension_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`, `extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */;
34 | /*!50001 SET character_set_client = @saved_cs_client */;
35 | /*!50001 SET character_set_results = @saved_cs_results */;
36 | /*!50001 SET collation_connection = @saved_col_connection */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/views/extension_second_most_recent.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_second_most_recent`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_second_most_recent`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_second_most_recent`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
28 | /*!50001 CREATE ALGORITHM=UNDEFINED */
29 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */
30 | /*!50001 VIEW `extension_second_most_recent` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_second_most_recent_until_date.sql:
--------------------------------------------------------------------------------
-- Helper used by the *_until_date views: returns the session variable
-- @until_date so the cut-off date can be set per connection before querying.
drop function if exists until_date;
-- Fixed: function name needs a (possibly empty) parameter list, and
-- "DEERMINISTIC" was a typo for DETERMINISTIC.
create function until_date() returns datetime NO SQL DETERMINISTIC return @until_date;
3 |
4 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
5 | --
6 | -- Host: localhost Database: extensions
7 | -- ------------------------------------------------------
8 | -- Server version 10.3.8-MariaDB-log
9 |
10 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
11 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
12 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
13 | /*!40101 SET NAMES utf8 */;
14 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
15 | /*!40103 SET TIME_ZONE='+00:00' */;
16 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
17 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
18 |
19 | --
20 | -- Final view structure for view `extension_second_most_recent_until_date`
21 | --
22 |
23 | /*!50001 DROP TABLE IF EXISTS `extension_second_most_recent_until_date`*/;
24 | /*!50001 DROP VIEW IF EXISTS `extension_second_most_recent_until_date`*/;
25 | /*!50001 SET @saved_cs_client = @@character_set_client */;
26 | /*!50001 SET @saved_cs_results = @@character_set_results */;
27 | /*!50001 SET @saved_col_connection = @@collation_connection */;
28 | /*!50001 SET character_set_client = utf8 */;
29 | /*!50001 SET character_set_results = utf8 */;
30 | /*!50001 SET collation_connection = utf8_general_ci */;
31 | /*!50001 CREATE ALGORITHM=UNDEFINED */
32 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */
33 | /*!50001 VIEW `extension_second_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`, `extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() and !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */;
34 | /*!50001 SET character_set_client = @saved_cs_client */;
35 | /*!50001 SET character_set_results = @saved_cs_results */;
36 | /*!50001 SET collation_connection = @saved_col_connection */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/views/extension_small.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_small`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_small`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_small`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
-- View over `extension` restricted to extids starting with 'aa' (a small
-- sample); consistency fix: `offeredby` was the only column qualified as
-- `extensions`.`extension`.* while all others use the bare `extension` alias.
/*!50001 CREATE ALGORITHM=UNDEFINED */
/*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */
/*!50001 VIEW `extension_small` AS select `extension`.`extid` AS `extid`,`extension`.`date` AS `date`,`extension`.`name` AS `name`,`extension`.`version` AS `version`,`extension`.`description` AS `description`,`extension`.`downloads` AS `downloads`,`extension`.`rating` AS `rating`,`extension`.`ratingcount` AS `ratingcount`,`extension`.`fulldescription` AS `fulldescription`,`extension`.`offeredby` AS `offeredby`,`extension`.`developer` AS `developer`,`extension`.`itemcategory` AS `itemcategory`,`extension`.`crx_etag` AS `crx_etag`,`extension`.`lastupdated` AS `lastupdated` from `extension` where `extension`.`extid` like 'aa%' */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_update.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_update`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_update`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_update`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
28 | /*!50001 CREATE ALGORITHM=UNDEFINED */
29 | /*!50013 DEFINER=`root`@`%` SQL SECURITY DEFINER */
30 | /*!50001 VIEW `extension_update` AS select `e3`.`extid` AS `extid`,`e3`.`first_date_with_new_crx_etag` AS `first_date_with_new_crx_etag`,`e3`.`new_crx_etag` AS `new_crx_etag`,`e3`.`last_date_with_previous_crx_etag` AS `last_date_with_previous_crx_etag`,`e4`.`crx_etag` AS `previous_crx_etag` from (((select `e1`.`extid` AS `extid`,`e1`.`date` AS `first_date_with_new_crx_etag`,`e1`.`crx_etag` AS `new_crx_etag`,max(`e2`.`date`) AS `last_date_with_previous_crx_etag` from (((select `extensions`.`extension`.`extid` AS `extid`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,min(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`crx_etag` is not null group by `extensions`.`extension`.`extid`,`extensions`.`extension`.`crx_etag`)) `e1` join (select `extensions`.`extension`.`extid` AS `extid`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`crx_etag` is not null group by `extensions`.`extension`.`extid`,`extensions`.`extension`.`crx_etag`) `e2` on(`e1`.`extid` = `e2`.`extid`)) where `e1`.`date` > `e2`.`date` group by `e1`.`crx_etag`)) `e3` join `extensions`.`extension` `e4` on(`e3`.`extid` = `e4`.`extid` and `e3`.`last_date_with_previous_crx_etag` = `e4`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/extgrep:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2019 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 |
20 | import argparse
21 | import io
22 | import logging
23 | import re
24 | import json
25 | import sys
26 | import importlib.util
27 | import csv
28 | import math
29 | import ast
30 |
31 | from zipfile import ZipFile
32 |
33 | from ExtensionCrawler.config import (const_log_format, const_basedir)
34 | from ExtensionCrawler.archive import iter_tar_entries_by_date
35 | from ExtensionCrawler.js_mincer import mince_js
36 |
37 |
def get_shannon_entropy(string):
    """Return the Shannon entropy of *string* over the [A-Za-z0-9] alphabet.

    Characters outside the alphabet are ignored.  Adapted from
    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html"
    and "git@github.com:dxa4481/truffleHog.git".
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    if not string:
        return 0
    length = len(string)
    frequencies = (string.count(symbol) / length for symbol in alphabet)
    return -sum(p * math.log(p, 2) for p in frequencies if p > 0)
53 |
54 |
def is_likely_hash(string):
    """Heuristic filter: a string looks hash/key-like when it has high
    entropy and contains more than four digits."""
    digit_count = sum(1 for ch in string if ch.isdigit())
    return digit_count > 4 and get_shannon_entropy(string) > 2.0
57 |
58 |
def import_regexs(path):
    """Dynamically load the Python file at *path* and return it as a module.

    The file is expected to define a MinerStrings class (see callers).
    """
    loader_spec = importlib.util.spec_from_file_location("MinerStrings", path)
    loaded_module = importlib.util.module_from_spec(loader_spec)
    loader_spec.loader.exec_module(loaded_module)
    return loaded_module
64 |
65 |
def get_etag(headers_content):
    """Parse a repr'd HTTP-headers dict and return its ETag, or None."""
    headers = ast.literal_eval(headers_content)
    if "ETag" not in headers:
        return None
    return headers["ETag"]
70 |
71 |
def get_name_and_version(overview_contents):
    """Extract the extension name and version from an overview.html page.

    Returns a (name, version) tuple; either element is None when the
    corresponding tag is not found.
    """
    # NOTE(review): the original regex literals were empty (markup evidently
    # stripped during extraction), so match.group(1) would raise on any hit.
    # Restored the <meta itemprop=...> patterns used by the store pages.
    match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
                      overview_contents)
    name = match.group(1) if match else None

    match = re.search(
        """<meta itemprop="version" content="(.*?)"/>""", overview_contents)
    version = match.group(1) if match else None

    return name, version
84 |
85 |
def first_match_in_locations(search_tag, pattern, locations):
    """Return the first match of *pattern* across *locations*.

    *locations* is an iterable of (location_tag, lines) pairs.  Returns a
    one-element list [[location_tag, search_tag, matched_string]] for the
    first hit, or [] when nothing matches.  For the MINING_KEYS_REGEX tag
    a hit is only accepted when it also looks like a hash/key.
    """
    for location_tag, lines in locations:
        for line in lines:
            m = re.search(pattern, line)
            if m:
                matched_string = m.group()
                # BUG FIX: was "is not" (identity comparison on strings,
                # interning-dependent); use "!=" for value comparison.
                if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
                    return [[location_tag, search_tag, matched_string]]
    return []
95 |
96 |
def handle_extid(conf, extid, csvwriter):
    """Scan every archived snapshot of extension *extid* and write one CSV
    row per regex/string match found in its .js/.html files.

    Snapshots come from iter_tar_entries_by_date, restricted to the
    [conf.from_date, conf.latest_date] window when those are set.  Each row
    is: extid, still_in_store, most-recent crx etag, snapshot date,
    crx etag, name, version, zip path, match location, search tag, match.
    """
    # Search strings/patterns loaded from the user-supplied regex file.
    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()

    results = []

    still_in_store = None
    # Seeded with None so crx_etags[-1] is defined even if no etag is seen.
    crx_etags = [None]
    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        # Skip snapshots outside the requested date window.
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        date_matches = []
        for tarentry, tarfile in tups:
            tarentry_filename = tarentry.name.split("/")[-1]

            # The .crx.headers entry holds a repr'd dict of HTTP headers.
            if tarentry_filename.endswith(".crx.headers"):
                crx_etag = get_etag(tarfile.read().decode())
                if crx_etag:
                    crx_etags += [crx_etag]

            if tarentry_filename == "overview.html":
                name, version = get_name_and_version(tarfile.read().decode())

            # A 2xx status on the overview page means the extension is
            # still listed in the store.
            if tarentry_filename == "overview.html.status":
                still_in_store = tarfile.read().decode().startswith("2")

            if tarentry_filename.endswith(".crx") and tarentry.size > 0:
                with ZipFile(tarfile) as zf:
                    for zipentry in zf.infolist():
                        file_matches = []
                        if zipentry.filename.endswith(".js") or zipentry.filename.endswith(".html"):
                            with zf.open(zipentry) as f:
                                # Collect code lines verbatim and, separately,
                                # all string literals joined per block (to
                                # catch strings assembled from fragments).
                                verbatim_lines = []
                                joined_string_lines = []
                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                    verbatim_lines += block.content.splitlines()
                                    joined_string_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()

                                # Literal string searches (regex-escaped).
                                for search_tag in miner_strings.strings.keys():
                                    for search_string in miner_strings.strings[search_tag]:
                                        for match in first_match_in_locations(search_tag, re.escape(search_string),
                                                                             [("verbatim", verbatim_lines),
                                                                              ("joined_string", joined_string_lines)]):
                                            file_matches.append(match)

                                # Regular-expression searches.
                                for search_tag in miner_strings.patterns.keys():
                                    for search_pattern in miner_strings.patterns[search_tag]:
                                        for match in first_match_in_locations(search_tag, search_pattern,
                                                                             [("verbatim", verbatim_lines),
                                                                              ("joined_string", joined_string_lines)]):
                                            file_matches.append(match)

                                for match in file_matches:
                                    date_matches.append([zipentry.filename] + match)

        for match in date_matches:
            results += [[date, crx_etag, name, version] + match]

    # Rows are emitted only at the end so still_in_store and crx_etags[-1]
    # reflect the newest snapshot seen across the whole scan.
    for result in results:
        csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
162 |
163 |
def main(conf):
    """Configure logging, then grep every extension id listed in
    conf.EXTID_FILE, writing match rows as CSV to stdout."""
    root_logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter(const_log_format()))
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.EXTID_FILE) as extid_file:
        writer = csv.writer(sys.stdout, csv.unix_dialect)
        writer.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "position", "tag", "match"])
        for extid in (line.strip() for line in extid_file):
            handle_extid(conf, extid, writer)
179 |
180 |
def build_parser():
    """Create and return the command-line argument parser for extgrep."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Grep for extensions.')

    # Positional arguments.
    parser.add_argument(
        'REGEXP_FILE',
        help='python file with regular expressions')
    parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')

    # Optional flags.
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    parser.add_argument(
        '-D', '--latest-date',
        metavar='DATE',
        type=str,
        help=('select latest crx from tar, released before DATE.\n'
              'Together with --from-date, specifies all crx released in specified\n'
              'date range.'))
    parser.add_argument(
        '-d', '--from-date',
        metavar='DATE',
        type=str,
        help=('select oldest crx from tar released after DATE.\n'
              'Together with --latest-date, specifies all crx released in specified\n'
              'date range.'))
    parser.add_argument(
        '-a', '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')

    return parser
226 |
227 |
# Script entry point: build the CLI parser, parse arguments, and exit with
# main()'s return value as the process exit status.
if __name__ == "__main__":
    main_parser = build_parser()

    main_conf = main_parser.parse_args()

    sys.exit(main(main_conf))
234 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | colorama==0.3.9
2 | pystuck==0.8.5
3 | simhash==1.8.0
4 | tabulate==0.7.7
5 | setuptools==65.5.1
6 | cchardet==2.1.1
7 | mysqlclient==1.3.10
8 | requests==2.20.0
9 | pycryptodomex==3.4.6
10 | beautifulsoup4==4.6.0
11 | python_dateutil==2.6.1
12 | GitPython==2.1.5
13 | python_magic==0.4.13
14 | jsbeautifier==1.7.3
15 | pebble==4.3.7
16 | jsmin==2.2.2
17 |
--------------------------------------------------------------------------------
/scripts/hpc-utilities/hpc-submit:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -o errexit
4 | set -o nounset
5 |
# Print usage information; echoes the current defaults where they are set.
print_help()
{
    echo "Usage: $prog [OPTION] ... -- COMMAND ... %INPUT% ..."
    echo ""
    echo "Run ..."
    echo ""
    echo " --help, -h display this help message"
    echo " --jobs, -j num number of jobs (default: $jobs)"
    echo " --input, -i file file with input data"
    echo " --prefix, -p prefix prefix path for job directory (default: $prefix)"
    echo " --jobname, -n name job name (default: $name)"
    echo " --wrapper, -w wrapper exec wrapper (default: $wrapper)"
    # Typo fix: "direcotry" -> "directory".
    echo " --copy-from, -f copy command from directory (default: src)"
    echo " --max-memory, -m mem max mem (default: $mem)"
    echo " --max-time, -t timelimit (default: $timelimit)"
    echo " --host, -s remote host (default: $host)"
    echo " --srcdir, -d src for copying binary (default: $srcdir)"
    echo " assumed to be remote, if it starts with a \":\""
    echo ""
    echo " COMMAND is the command that should be executed on the HPC cluster, where"
    echo " %INPUT% will be replaced with a file containing the job-specific input data."
}
28 |
29 |
30 |
# Create the local temporary job directory skeleton under $workdir.
mk_jobdir(){
    echo "Creating temporary job directory in $workdir."
    local sub
    for sub in bin cfg input output tmp; do
        mkdir -p "$workdir/$sub"
    done
}
39 |
40 |
# Remove the local temporary job directory (after it has been copied to the
# cluster by install_hpc_script).
clean_jobdir(){
    rm -rf "$workdir"
}
44 |
# Split $input into $jobs line-based chunks named 00000001..N under
# $workdir/input.  GNU split flags: -n l/K splits into K chunks without
# breaking lines; -e suppresses empty output files; -a 8 gives 8-digit
# suffixes matching the %08d used when reading $SGE_TASK_ID.
split_input(){
    echo "Splitting input."
    split --numeric-suffixes=1 -a 8 -e -n l/$jobs "$input" "$workdir/input/"
}
49 |
# Generate the SGE job script ($workdir/job.sge) that will be submitted on
# the cluster.  The heredoc is deliberately unquoted so local variables
# ($prog, $jobs, $mem, ...) expand now, while the escaped backticks and
# \$SGE_TASK_ID expand later, on the cluster.
mk_hpc_script(){
    local HOSTNAME=`hostname -f`
    echo "Creating HPC script."
    # BUG FIX: was "cat < $workdir/job.sge", which tries to READ the
    # not-yet-existing file (and would then execute the template lines as
    # commands).  A heredoc redirected INTO the file is what is needed.
    # Also removed the leading blank line so the shebang is the first line.
    cat <<EOF > "$workdir"/job.sge
#!/bin/bash
## This script was generated by $prog (version: $version)
## on $timestamp
## by $USER@$HOSTNAME
## in $PWD
## using the following command:
## $invokation
##
## SGE configuration:
#$ -V
#$ -t 1-$jobs
#$ -l rmem=$mem
#$ -l h_rt=$timelimit
#$ -j yes
#$ -o "$prefix"/"$name"/output


set -o nounset
set -o errexit
set -x

export JOBINPUT="$prefix"/"$name"/input/\`printf %08d \$SGE_TASK_ID\`

/usr/bin/time -v $wrapper $prefix/$name/bin/$cmd
echo "Execution successful."
EOF
}
82 |
83 |
# Create the remote working directory.  $prefix is deliberately left
# unquoted: its default is the literal string \$HOME/hpc, which should be
# expanded by the remote shell, not locally.
mk_remote_jobdir(){
    echo "Create remote working directory ($host:$prefix)."
    ssh $host mkdir -p $prefix
}
88 |
# Copy the prepared job directory to the cluster and place the command
# binary into its bin/ directory.  A $srcdir starting with ":" denotes a
# path that is already on the remote host.
install_hpc_script(){
    echo "Installing HPC Script"
    scp -q -r "$workdir" "$host":"$prefix"/"$name"

    if [[ $srcdir == ":"* ]]; then
        echo " Copying cmd from remote src."
        ssh $host cp "${srcdir:1}"/"$srccmd" "$prefix"/"$name"/bin;
    else
        echo " Copying cmd from local src."
        # BUG FIX: a stray leading "$srccmd" argument made scp copy the
        # command twice (once from the bare path, once from $srcdir).
        scp -q "$srcdir"/"$srccmd" "$host":"$prefix"/"$name"/bin;
    fi
}
101 |
# Submit the generated job script to the SGE scheduler on the remote host.
submit_job(){
    echo "Submitting job."
    ssh $host qsub "$prefix"/"$name"/job.sge
}
106 |
## global configuration
version="0.0"
# Program name = basename of $0.
prog=`echo $0 | sed 's|.*/||'`;
# Full invocation line, reproduced in the generated job script header.
invokation="$prog $(printf "%q " "$@")"
# Filesystem-safe UTC timestamp (":" and "," are replaced).
timestamp=`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
host=`hostname`
workdir=`mktemp -d`
echo $workdir

## default values
# Literal \$HOME so the path expands on the REMOTE shell, not locally.
prefix="\$HOME/hpc"
jobs=1
# NOTE: $name captures the LOCAL hostname; $host is re-assigned to the
# cluster login node below, so order matters here.
name="$host-$USER-$timestamp"
input=""
wrapper="singularity exec -B \$TMPDIR:$prefix/$name/tmp"
cmd=""
mem="2G"
timelimit="01:00:00"
# NOTE(review): "local" is assigned but never read afterwards.
local="false";
host="sharc.shef.ac.uk"
srcdir="."

# Option parsing.  NOTE(review): print_help advertises --copy-from/-f, but
# no case arm handles it — confirm whether it was dropped intentionally.
while [ $# -gt 0 ]
do
  case "$1" in
      --jobs|-j)
          jobs="$2";
          shift;;
      --input|-i)
          input="$2";
          shift;;
      --jobname|-n)
          name="$2";
          shift;;
      --max-memory|-m)
          mem="$2";
          shift;;
      --max-time|-t)
          timelimit="$2";
          shift;;
      --host|-s)
          host="$2";
          shift;;
      --srcdir|-d)
          srcdir="$2";
          shift;;
      --wrapper|-w)
          wrapper="$2";
          shift;;
      --prefix|-p)
          prefix="$2";
          shift;;
      --help|-h)
          print_help
          exit 0;;
      --) shift; break;;
      *) print_help
         exit 1;;
  esac
  shift
done
# Everything after "--" is the command; %INPUT% is rewritten to the
# per-task $JOBINPUT variable expanded on the cluster.
cmd=`echo $(printf "%q " "$@") | sed -e 's/%INPUT%/\$JOBINPUT/'`
cmdarray=("$@")
# First word of the command is the binary to copy into bin/.
srccmd=${cmdarray[0]}

mk_jobdir;

if [ -n "$input" ]; then
    if [ ! -f "$input" ]; then
        echo "Input file \"$input\" not found!"
        exit 1
    fi
    split_input;
fi

mk_hpc_script;

mk_remote_jobdir;

install_hpc_script;

clean_jobdir;

submit_job;
192 |
--------------------------------------------------------------------------------
/scripts/maintainance/maintain_archive:
--------------------------------------------------------------------------------
#!/bin/bash
# Archive maintenance driver.
#   maintain_archive [MAIN|MOVE|COMPRESS] [archive-dir] [log] [src-tar]
# MAIN walks the archive and re-invokes this script with MOVE for every
# extension tar; COMPRESS submits one xz.sge batch job per data directory.

ACTION=${1:-MAIN}
ARCHIVE=${2:-/srv/Shared/BrowserExtensions/archive}

# One log directory per UTC month; the log file name carries a UTC
# timestamp made file-name safe (":" -> "_", "," -> ".").
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=${3:-$LOGPREFIX-maintain-archive-$ACTION.log}

SELF=$0
SRC=$4   # only used by the MOVE action: the tar file to rotate
13 |
next_generation(){
    # Rotate the given extension tar to the next free numbered generation
    # ($base.NNN.tar, later xz-compressed by the COMPRESS action) and
    # leave a fresh empty tar in its place.
    local src=$1
    local filebase=`basename $src .tar`
    local dir=`dirname $src`

    # Check next free file name:
    if ls $dir/$filebase.[0-9][0-9][0-9].tar.xz &> /dev/null; then
        latest=`ls $dir/$filebase.[0-9][0-9][0-9].tar.xz | \
                sort -r | head -1 | \
                sed -e "s/.*\([0-9][0-9][0-9]\).tar.xz/\1/"`
        # BUG FIX: force base-10.  $latest carries leading zeros ("008"),
        # which bash arithmetic would parse as an (invalid) octal constant
        # and abort with "value too great for base" from 008 onwards.
        next=`printf %03d $((10#$latest+1))`
    else
        next=000
    fi

    dest=$dir/$filebase.$next.tar
    echo "Processing: $src -> $dest" | tee -a $LOG
    # -n: never overwrite an existing generation; a failed move leaves
    # $src in place and is reported below.
    mv -n $src $dest
    if [ ! -f $src ]; then
        # Re-create an empty tar so the crawler always finds an archive.
        tar -cf $src -T /dev/null
        if [ ! -f $src ]; then
            echo "ERROR: cannot create empty tar archive ($src)" | tee -a $LOG
        fi
    else
        echo "ERROR: old archive exists ($src)" | tee -a $LOG
    fi
}
41 |
zge_compress(){
    # Submit one SGE batch job (xz.sge) per three-letter extension-id
    # prefix directory; each job xz-compresses the rotated tars below it.
    # NOTE(review): "zge" looks like a typo for "sge"; kept as-is because
    # the COMPRESS dispatch below calls it by this name.
    mkdir -p $LOG.dir
    find $ARCHIVE/data/ \
         -type d \
         -name "[a-p][a-p][a-p]" \
         -exec qsub -o $LOG.dir `dirname $SELF`/xz.sge {} \;
}
49 |
main(){
    # Rotate every extension archive: the 32 x [a-p] pattern matches the
    # Chrome extension-id naming of the tar files.  Each hit re-invokes
    # this script with the MOVE action (so failures are isolated per file).
    find $ARCHIVE/data/ \
         -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].tar" \
         -exec $SELF MOVE $ARCHIVE $LOG {} \;
}

# Action dispatch (default: MAIN, see preamble).
case "$ACTION" in
    MAIN)
        main;;
    MOVE)
        next_generation $SRC;;
    COMPRESS)
        zge_compress;;
esac
64 |
--------------------------------------------------------------------------------
/scripts/maintainance/xz.sge:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -V
#$ -l rmem=2G
#$ -j yes
# SGE job body: xz-compress every rotated extension archive
# (<32-char [a-p] id>.NNN.tar) below the directory passed as $1.
set -o nounset
set -x

find $1 \
     -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].[0-9][0-9][0-9].tar" \
     -exec xz {} \;
11 |
--------------------------------------------------------------------------------
/scripts/monitoring/download-report-one-week.gp:
--------------------------------------------------------------------------------
# Seven-day download report.  Reads <monitordir>/updates.csv
# (semicolon-separated, appended by global_update_monitor.sh) and renders
# PNG + PDF.  CSV columns: 1=timestamp, 4/5=target parallel/sequential
# download counts, 6/7=finished parallel/sequential downloads.
if (!exists("monitordir")) monitordir='.'
filename="updates.csv"
set terminal pngcairo size 3000,800 enhanced font 'Verdana,10'
set output monitordir."/download-report-one-week.png"

day="2018-04-01"
# basic configuration
set datafile separator ";"

set autoscale x

# plot last 7 days
set xrange [time(0) - 7*24*60*60:]

# Left axis: parallel downloads; right axis: sequential downloads.
set ytics
set yrange [0:400000]
set ylabel "Parallel Downloads"
set ytics 25000
set mytics 2
set y2range [0:4500]
set y2label "Sequential Downloads"
set y2tics 500


set grid

set xdata time
set timefmt '%Y-%m-%d %H:%M:%S'
set format x "%Y-%m-%d\n%H:%M:%S"

# Major tick every 8 hours (28800 s), minor every hour.
set xtics 28800
set mxtics 8

set style data lines
set title sprintf("Extension Downloads (Last Seven Days)")

set key horiz
set key out bot center

# for plotting only one day, one can use:
data_for_day(day,file)=sprintf("<(grep %s %s)",day, file)
data=data_for_day(day, monitordir."/".filename)

# for plotting all data
# (this assignment deliberately overrides the single-day selection above)
data=monitordir."/".filename

# Trick for plotting first derivative of data:
# x0=NaN
# y0=NaN
# replot data using (dx=$1-x0,x0=$1,$1-dx/2):(dy=$6-y0,y0=$6,dy/dx) w l notitle
# TODO: support time on x scale

# State carried between samples for the finite-difference rate curves.
x0p=NaN
y0p=NaN
x0s=NaN
y0s=NaN

# The "per Eight Hours" curves are finite-difference rates (negative
# steps clamped to 0) scaled to an eight-hour window.
plot data using 1:4 with lines dashtype 2 lt rgb "#d07b95" axes x1y1 \
     title "Parallel Downloads (Target)" ,\
     data using 1:6 with lines lw 2 dashtype 1 lt rgb "#9c416e" axes x1y1 \
     title "Parallel Downloads" ,\
     data using (dx=timecolumn(1)-x0p,x0p=timecolumn(1),timecolumn(1)-dx/2):(dy=$6-y0p,y0p=$6,dy/dx < 0 ? 0 : (8*60*60)*dy/dx) \
     with lines dashtype 2 lt rgb "#622a55" axes x1y1 \
     title "Parallel Downloads per Eight Hours",\
     data using 1:5 with lines dashtype 2 lt rgb "#76eec6" axes x1y2 \
     title "Sequential Downloads (Target)",\
     data using 1:7 with lines lw 2 dashtype 1 lt rgb "#5ebe9e" axes x1y2 \
     title "Sequential Downloads",\
     data using (dx=timecolumn(1)-x0s,x0s=timecolumn(1),timecolumn(1)-dx/2):(dy=$7-y0s,y0s=$7,dy/dx < 0 ? 0 : (8*60*60)*dy/dx) \
     with lines dashtype 2 lt rgb "#468e76" axes x1y2 \
     title "Sequential Downloads per Eight Hours"

# Re-render the same plot as PDF.
set terminal pdfcairo size 30,8 enhanced font 'Verdana,15'
set output monitordir."/download-report-one-week.pdf"
replot
76 |
# Plot number of extensions over time (archive size is tracked via the
# target download counts, CSV columns 4/5).
set title sprintf("Size of Extensions Archive")
set terminal pngcairo size 3000,800 enhanced font 'Verdana,10'
set output monitordir."/size-of-archive.png"

set timefmt '%Y-%m-%d %H:%M:%S'
set format x "%Y-%m-%d"

set xrange ["2018-05-01":*]

set yrange [150000:400000]
set ylabel "Parallel Downloads"
set y2range [2750:4500]

# Weekly major ticks (604800 s), daily minor ticks.
set xtics 604800
set mxtics 7

# BUG FIX: the original plot command ended in a dangling " ,\" line
# continuation followed by a blank line, leaving a trailing comma that
# gnuplot rejects as an incomplete plot element.
plot data using 1:4 with lines dashtype 1 lt rgb "#d07b95" axes x1y1 \
     title "Parallel Downloads" ,\
     data using 1:5 with lines dashtype 1 lt rgb "#76eec6" axes x1y2 \
     title "Sequential Downloads",\
     data using 1:($4+$5) with lines dashtype 1 lt rgb "#000000" axes x1y1 \
     title "Total Downloads"

# Re-render the same plot as PDF.
set terminal pdfcairo size 30,8 enhanced font 'Verdana,15'
set output monitordir."/size-of-archive.pdf"
replot
107 |
108 |
--------------------------------------------------------------------------------
/scripts/monitoring/global_update_monitor.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Monitor a running global_update.sh crawl: report progress and errors,
# detect stalled downloads (mailing $USER at most once per stall), append
# one row to monitor/updates.csv and refresh the gnuplot reports.
# Usage: global_update_monitor.sh [-a|--ARCHIVE <dir>] [--kill]

set -o errexit
set -o nounset

KILL="NO"
ARCHIVE="/srv/Shared/BrowserExtensions/archive"

# Argument parsing; unknown options are silently ignored.
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
    -a|--ARCHIVE)
    ARCHIVE="$2"
    shift # past argument
    shift # past value
    ;;
    --kill)
    KILL=YES
    shift # past argument
    ;;
    *) # unknown option
    shift # past argument
    ;;
esac
done

# Newest per-run crawler log and newest global driver log (ls sorts by
# name, which is a sortable UTC timestamp).
LATESTLOG=`ls $ARCHIVE/log/*/*0.log | tail -n 1`
LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log | tail -n 1`
BASEDIR=$(dirname "$0")
31 |
PIDS=""
echo "# Checking update status"
# Detect running global_update.sh instances; in --kill mode, terminate
# their child processes and any crawler started from ExtensionCrawler.
if ps u -C global_update.sh > /dev/null; then
    NUM=`ps u -C global_update.sh | tail -n +2 | wc -l`
    echo "* $NUM instances of global_update.sh still running (WARNING)"
    PIDS=`ps u -C global_update.sh | tail -n +2 | awk '{print $2}' | xargs`
    echo " Running PIDs: $PIDS"
    if [[ "$KILL" == "YES" ]];then
        echo " KILL mode enabled, killing running global_update.sh instances"
        # BUG FIX: pkill -P expects a *comma*-separated list of parent
        # pids; with the space-separated $PIDS only the first pid was
        # honoured and the remaining ones were misparsed as a pattern.
        PPIDLIST="${PIDS// /,}"
        echo " (executing pkill -9 -P $PPIDLIST)"
        pkill -9 -P "$PPIDLIST"
        pkill -f "ExtensionCrawler//crawler "
    fi
else
    echo "* global_update.sh not running"
    NUM=0
fi
49 |
echo "* current status"
# The '$' is a regex end-of-line anchor: parallel workers end the line
# right after "Updating extension", sequential ones continue with "(".
# NOTE(review): inferred from the two patterns — confirm against the
# crawler's log format.
PDOWNLOADS=`grep 'Updating extension $' $LATESTLOG | wc -l`
echo " * parallel downloads finished: $PDOWNLOADS"
SDOWNLOADS=`grep 'Updating extension (' $LATESTLOG | wc -l`
echo " * sequential downloads finished: $SDOWNLOADS"
echo " * Updating info from log ($LATESTLOG):"
grep 'Updating .* extensions' $LATESTLOG | sed -e 's/^.*---//'

echo ""
echo "## Latest log:"
cat $LATESTGLOBALLOG

EXTENSIONS=`grep "Updating db" $LATESTLOG | wc -l`

# Exceptions/errors de-duplicated on the 5th whitespace-separated field,
# then ordered by the 3rd.
WE=`grep WorkerException $LATESTLOG | sort -k 5,5 -u | wc -l`
echo "## Worker Exceptions: $WE (out of $EXTENSIONS)"
grep WorkerException $LATESTLOG | sort -k 5,5 -u | sort -k 3,3

ERRORS=`grep ERROR $LATESTLOG | sort -k 5,5 -u | wc -l`
echo "## ERROR LOG: $ERRORS (out of $EXTENSIONS)"
grep ERROR $LATESTLOG | sort -k 5,5 -u | sort -k 3,3

echo "# Server utilization"
top b -n 1 | head -n 15

DATE=`date --utc +%Y-%m-%d`
TIME=`date --utc +%H:%M:%S`

# Extract "<total>;<with-forums>" extension counts from the first
# "Updating ... extensions" line of the crawler log.
EXTS=`grep 'Updating .* extensions' $LATESTLOG \
    | head -1 \
    | sed -e 's/^.* (//'  \
          -e 's/ including forums, / /' \
          -e 's/ excluding forums.*/ /g' \
    | awk '{print $2";"$1}'`

if [[ "$EXTS" == "" ]]; then
    EXTS=";"
fi

# Previous CSV row, used for stall detection: fields are double-quoted,
# so the values sit at the even cut positions when splitting on '"'.
LASTPDOWNLOADS=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f8`
LASTSDOWNLOADS=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f10`
LASTMAIL=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f14`

# Send a stall warning at most once: only when the crawler is running,
# the download counters did not change since the last sample, and no
# mail was sent for this stall yet (MAIL flag persisted in the CSV).
if [[ "$NUM" == "0" ]]; then
    MAIL=0
else
    if [[ "$LASTPDOWNLOADS$LASTSDOWNLOADS" == "$PDOWNLOADS$SDOWNLOADS" ]]; then
        if [[ "$LASTMAIL" == "0" ]]; then
            echo "" | /usr/bin/mail -s "Extension Download Stalled!" ${USER:-root};
        fi;
        MAIL=1;
    else
        MAIL=0;
    fi
fi

# RAM and swap figures (total used free) as quoted CSV fields.
MEM=`free | tail -2 | awk '{print $2 " " $3 " " $4}' | xargs | sed -e 's/ /\";\"/g'`

# Append one CSV row and refresh the plots.
echo "\"$DATE $TIME\";\"$NUM\";\"$PIDS\";$EXTS;\"$PDOWNLOADS\";\"$SDOWNLOADS\";\"$ERRORS\";\"$MAIL\";\"$MEM\"" >> $ARCHIVE/monitor/updates.csv
gnuplot -e "monitordir='$ARCHIVE/monitor'" $BASEDIR/download-report-one-week.gp
110 |
111 |
--------------------------------------------------------------------------------
/scripts/singularity/ExtensionCrawler.def:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Copyright 2017 The University of Sheffield, UK
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | bootstrap:debootstrap
18 | OSVersion: testing
19 | MirrorURL: https://deb.debian.org/debian
20 |
21 | %labels
22 | Maintainer The LogicalHacking Team (https://logicalhacking.com)
23 |
24 | %setup
25 |
26 | %post
27 |
28 | ###################################################################
29 | # Add Debian unstable as a secondary (lower priority) source
30 | # and update the data base of available packages.
31 | cat >> /etc/apt/sources.list << EOF
32 | deb http://ftp.us.debian.org/debian unstable main
33 | EOF
34 |
35 | cat > /etc/apt/preferences << EOF
36 | Package: *
37 | Pin: release a=testing
38 | Pin-Priority: 900
39 |
40 | Package: *
41 | Pin: release a=unstable
42 | Pin-Priority: 800
43 | EOF
44 |
45 | cat > /etc/apt/apt.conf.d/01norecommend << EOF
46 | APT::Install-Recommends "0";
47 | APT::Install-Suggests "0";
48 | EOF
49 |
50 | chmod go+r /etc/apt/preferences
51 | apt-get update
52 | ###################################################################
53 |
54 | ###################################################################
55 | # Add hook for apt that removes various files after installation
56 | # that are not needed at runtime.
57 | cat > /etc/apt/apt.conf.d/99-clean << EOF
58 | DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };
59 | APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };
60 | Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache "";
61 | EOF
62 | ###################################################################
63 |
64 | ###################################################################
65 | # Configure locales
66 | apt-get install -y locales
67 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
68 | echo "en_GB.UTF-8 UTF-8" >> /etc/locale.gen
69 | locale-gen
70 | echo "LANG=en_US.UTF-8" > /etc/default/locale
71 | ###################################################################
72 |
73 | ###################################################################
74 | # Install the core dependencies (Python 3.6 or later)
75 | # from the Debian Testing repository
76 | apt-get install -y --no-install-recommends libpython3.7-dev python3-magic python3-minimal python3-pip python3-setuptools python3-mysqldb g++ git libmariadb-dev-compat
77 | apt-get clean
78 | rm -rf /var/lib/apt/lists/*
79 | ###################################################################
80 |
81 | ###################################################################
82 | # Create /opt for local software (mainly cloned git repositories
# from logicalhacking.com)
84 | mkdir -p /opt
85 | chmod 755 /opt
86 | ###################################################################
87 |
88 | ###################################################################
89 | # Add the Extension Crawler repository, for more details, visit
90 | # https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler
91 | cd /opt
92 | git clone https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler.git
93 | cd ExtensionCrawler
94 | git checkout production
95 | cd ..
96 | pip3 install wheel # simhash needs wheel to build properly, still works without it though
97 | pip3 install --system -e ExtensionCrawler
98 | cd /
99 | chmod -R go+u-w /opt/ExtensionCrawler
100 | chmod -R go+u-w /usr/local/lib/
101 | chmod -R go+u-w /usr/local/bin/
102 | ###################################################################
103 |
104 | ###################################################################
# Clone cdnjs repository or create link to external archive dir
106 | ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive
107 | case ${SINGULARITY_IMAGE} in
108 | *-cdnjs.img)
109 | mkdir -p /opt/archive/filedb
110 | cd /opt/archive/filedb
111 | git clone https://github.com/cdnjs/cdnjs.git cdnjs-git
112 | cd cdnjs-git
113 | git pull
114 | ln -s ${ARCHIVE}/conf . > /dev/null
115 | ln -s ${ARCHIVE}/data > /dev/null
116 | ln -s ${ARCHIVE}/log > /dev/null
117 | ;;
118 | *)
119 | cd /opt/
120 | ln -s ${ARCHIVE} .
121 | ;;
122 | esac
123 | chmod -R go+u /opt
124 | ###################################################################
125 |
126 | ###################################################################
127 | # Create mount/bind points for the various network drives
128 | # on SHARC (only useful when using the Singularity image on
129 | # the High-Performance Cluster of The University of Sheffield
130 | mkdir /scratch
131 | mkdir /fastdata
132 | mkdir /data
133 | mkdir /shared
134 |
135 | # Create nvidia driver directories to get rid of the singularity
136 | # warnings on sharc
137 | mkdir /nvbin
138 | mkdir /nvlib
139 | chmod go+u-w /scratch /fastdata /data /shared
140 | ###################################################################
141 |
142 | ###################################################################
143 | # Manual clean-up and removal of not strictly necessary directories
144 | yes | apt purge g++
145 | yes | apt autoremove
146 | rm -rf /usr/share/doc || true
147 | ###################################################################
148 |
149 | %environment
150 |
151 | export EXTENSION_ARCHIVE=/opt/archive
152 | export PATH=/opt/ExtensionCrawler/:${PATH}
153 |
154 | # We install all python modules into the container, so we do not want
155 | # to use any packages that the user might have installed in their home
156 | # directory.
157 | export PYTHONNOUSERSITE=1
158 |
159 | %runscript
160 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
161 | # this text will get copied to /singularity and will run whenever the container
162 | # is called as an executable
163 | usage() {
164 | cat < "$LOG" 2>&1
89 | sudo singularity image.expand --size ${BASESIZE} --writable ${IMAGE} ${BASE}.def > "$LOG" 2>&1
90 | else
91 | echo "Creating read-only $IMAGE using ${BASE}.def"
92 | sudo singularity build ${IMAGE} ${BASE}.def > "$LOG" 2>&1
93 | fi
94 |
# Abort if singularity did not actually produce the image file.
if [ ! -f $IMAGE ]; then
    echo "Image (${IMAGE}) creation failed!"
    exit 1
else
    echo "Image (${IMAGE}) creation successful!"
fi

# Optionally install the image into $BINDIR, keeping the previously
# installed image as a .bak backup.
if [ "$INSTALL" = "true" ]; then
    if [ -f $BINDIR/$IMAGE ]; then
        mv $BINDIR/$IMAGE $BINDIR/$IMAGE.bak
    fi
    echo "Installing ${IMAGE} into $BINDIR"
    mv $IMAGE $BINDIR
fi
109 |
--------------------------------------------------------------------------------
/scripts/singularity/singularitybuilder-arch.Dockerfile:
--------------------------------------------------------------------------------
# Builder image: Arch Linux with Singularity compiled from source, used
# to build Singularity images via `docker run ... singularity build`.
FROM archlinux/base

# Singularity release to build.
ARG version=2.6.1

# Select UK HTTPS mirrors (uncommenting every mirrorlist entry) and
# install the build tool-chain plus the dependencies required by
# Singularity's debootstrap bootstrap driver.
RUN curl -o /etc/pacman.d/mirrorlist "https://www.archlinux.org/mirrorlist/?country=GB&protocol=https&use_mirror_status=on" &&\
    sed -i 's/^#//' /etc/pacman.d/mirrorlist &&\
    pacman --noconfirm -Syyu base-devel wget python squashfs-tools debootstrap

# Build and install Singularity from the upstream release tarball.
# NOTE(review): the trailing `sudo` is presumably redundant — docker RUN
# already executes as root.
RUN mkdir /tmp/singularity &&\
    cd /tmp/singularity &&\
    wget "https://github.com/singularityware/singularity/releases/download/${version}/singularity-${version}.tar.gz" &&\
    tar -xvzf singularity-${version}.tar.gz &&\
    cd singularity-${version} &&\
    ./configure --prefix=/usr/local &&\
    make &&\
    sudo make install
17 |
--------------------------------------------------------------------------------
/scripts/singularity/singularitybuilder-arch.sh:
--------------------------------------------------------------------------------
#!/usr/bin/bash
# Build a Singularity image from a definition file inside the
# singularitybuilder-arch Docker container.
# Usage: singularitybuilder-arch.sh <image-file> <definition-file>
set -o errexit
set -o nounset

if [ "$#" -lt 2 ]; then
    # BUG FIX: the usage line had lost its argument placeholders.
    echo "Usage: $0 <image-file> <definition-file>"
    exit 1
fi

IMGFILE=$(realpath "$1")
IMGDIR=$(dirname "$IMGFILE")
DEFFILE=$(realpath "$2")
DEFDIR=$(dirname "$DEFFILE")

# Remove any stale image before rebuilding.
if [ -f "$IMGFILE" ]; then
    rm "$IMGFILE"
fi

# NOTE(review): the Dockerfile is resolved relative to the *current*
# directory, so this script must be run from scripts/singularity/.
docker build --tag=singularitybuilder-arch -f singularitybuilder-arch.Dockerfile .
docker run -v "$IMGDIR:$IMGDIR" -v "$DEFDIR:$DEFDIR" --privileged singularitybuilder-arch:latest singularity build "$IMGFILE" "$DEFFILE"
21 |
--------------------------------------------------------------------------------
/scripts/update/global_update.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Nightly update of the browser-extension archive; intended to run from
# cron after the production checkout has been refreshed, e.g.:
# m h dom mon dow command
# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh

ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive}
CRAWLERHOME=${2:-~/ExtensionCrawler}
IMAGE=${3:-/shared/brucker_research1/Shared/BrowserExtensions/bin/ExtensionCrawler.img}
# Monthly log directory; file names carry a file-name-safe UTC timestamp.
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=$LOGPREFIX-global.log

date --utc +'* Start Updating Extensions Archive (%c)' | tee $LOG

# Update extensions
# Crawler stdout goes to $LOGPREFIX.log; its stderr is timestamped via
# ts(1) and captured in $LOGPREFIX-stderr.log.
(singularity exec --bind /srv/:/srv/ $IMAGE crawler -p 32 -d --pystuck -a $ARCHIVE > $LOGPREFIX.log ) |& ts '%Y-%m-%dT%H:%M:%S' | tee $LOGPREFIX-stderr.log

date --utc +'* Update Finished (%c)' | tee -a $LOG


# Summarise unique errors (de-duplicated on the 5th whitespace-separated
# field, then ordered by the 3rd).
ERRORS=`grep ERROR $LOGPREFIX.log | sort -k 5,5 -u | wc -l`
EXTENSIONS=`grep "Updating db" $LOGPREFIX.log | wc -l`
echo "ERROR LOG: $ERRORS (out of $EXTENSIONS)"
echo "=========="
grep ERROR $LOGPREFIX.log | sort -k 5,5 -u | sort -k 3,3
27 |
--------------------------------------------------------------------------------
/scripts/update/update_cdnjs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Update the cdnjs mirror and its database table, working inside the
# writable ExtensionCrawler-cdnjs Singularity image.

ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive}
TMPDIR=${TMPDIR:-/tmp}

# Monthly log directory; file names carry a file-name-safe UTC timestamp.
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=$LOGPREFIX-cdnjs.log

SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img
date --utc +'* Create backup of disk image (%c)' | tee -a $LOG
cp $SING_IMG $SING_IMG.bak
# -w: writable image (the git operations below modify it); bind $TMPDIR
# over the container's /tmp.
SING_EXEC="singularity exec -w --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG"
# Fail early with a visible error if the image is missing.
ls "$SING_IMG" > /dev/null

# Update production branch of WebCrawler repository
date --utc +'* Updating WebCrawler repository (%c)' | tee -a $LOG
$SING_EXEC git fetch >> $LOG
$SING_EXEC git checkout production >> $LOG 2>&1
$SING_EXEC git pull >> $LOG 2>&1
# $SING_EXEC pip3 install --system -e ../ExtensionCrawler

# Update cdnjs git repository and update the cdnjs database table
date --utc +'* Updating CDNJS repository (%c)' | tee -a $LOG
$SING_EXEC ./cdnjs-git-miner -v -u -a /opt/archive >> $LOG
date --utc +'* Successfully updated CDNJS repository (%c)' | tee -a $LOG
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Setuptools configuration for the ExtensionCrawler utilities."""

from setuptools import setup


def _read_requirements(path='requirements.txt'):
    """Return the requirement specifiers listed in *path*.

    Blank lines and ``#`` comment lines are skipped so that an annotated
    requirements.txt does not inject bogus entries into
    ``install_requires`` (the original ``read().splitlines()`` kept them).
    """
    with open(path) as f:
        return [
            line.strip()
            for line in f
            if line.strip() and not line.lstrip().startswith('#')
        ]


requirements = _read_requirements()

setup(
    name='Extension Crawler',
    description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
    author='Achim D. Brucker, Michael Herzberg',
    license='GPL 3.0',
    install_requires=requirements
)
13 |
--------------------------------------------------------------------------------
/sge/create-db-cdnjs.sge:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -V
#$ -l rmem=4G
#$ -t 1-10000
#$ -j yes
#$ -o /shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/log
# SGE array job (10000 tasks): index the cdnjs file database.  Each task
# processes slice $SGE_TASK_ID of 10000 inside the writable Singularity
# image; extra arguments are passed through to cdnjs-git-miner.
set -o nounset
set -x

SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img

# -w: writable image; bind the node-local $TMPDIR over the container /tmp.
SING_EXEC="singularity exec -w --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG"

# Log environment and arguments to ease debugging of failed tasks.
printenv
echo "The following parameter were passed: $*"
# Fail early with a visible error if the image is missing.
ls "$SING_IMG" > /dev/null

/usr/bin/time $SING_EXEC ./cdnjs-git-miner -v -p 1 -i -a /opt/archive -n $SGE_TASK_ID -N 10000 $*
20 |
--------------------------------------------------------------------------------
/sge/create-db.sge:
--------------------------------------------------------------------------------
#!/bin/bash
# SGE array-task body: run create-db for slice $SGE_TASK_ID inside the
# create-db.img Singularity image.  Submitted (with its arguments) by
# sge/create-db.sh from the job's remote target directory.
set -o nounset
set -o errexit

# Log the environment to ease debugging of failed tasks.
printenv

(set -x; /usr/bin/time singularity exec --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp create-db.img create-db -t 1 -n $SGE_TASK_ID $*)
8 |
--------------------------------------------------------------------------------
/sge/create-db.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build a Singularity image, push it to the ShARC cluster together with
# the extension-id list, and submit array jobs that populate the database
# from the extension archive.
set -o nounset
set -o errexit

REMOTE_ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive
# "\$USER" stays literal here; it is expanded by the remote shell.
REMOTE_TARGET_DIR_PREFIX=/data/\$USER
NUM_THREADS=48
SGE_EXTRA_ARGS='-P rse -l h_rt=01:00:00,rmem=4G,h=\!sharc-node126 -j yes'
PY_EXTRA_ARGS=''
EXTENSION_IDS=
11 |
usage() {
    # Print the option summary.  BUG FIX: the argument placeholders had
    # been lost from the help text; restored to match the getopts spec
    # ":a:t:s:p:m:e:l:" below (every option takes a value).
    echo "Usage:"
    echo "  -a <archive-path>  (set archive path, default: ${REMOTE_ARCHIVE})"
    echo "  -t <target-dir>    (set target directory, default: ${REMOTE_TARGET_DIR_PREFIX})"
    echo "  -m <num-threads>   (set degree of parallelism, default: ${NUM_THREADS})"
    echo "  -s \"<qsub-args>\"   (add qsub arguments, default: ${SGE_EXTRA_ARGS})"
    echo "  -p \"<script-args>\" (add python script arguments, default: ${PY_EXTRA_ARGS})"
    echo "  -e <id-file>       (set path to extension id list, default: crawl from archive)"
    echo "  -l <max-tasks>     (limit number of sharc tasks, default: number of extensions)"
}
22 |
# Option parsing (see usage above).
while getopts ":a:t:s:p:m:e:l:" o; do
    case "${o}" in
        a)
            REMOTE_ARCHIVE=${OPTARG}
            ;;
        t)
            REMOTE_TARGET_DIR_PREFIX=${OPTARG}
            ;;
        m)
            NUM_THREADS=${OPTARG}
            ;;
        s)
            SGE_EXTRA_ARGS+=" ${OPTARG}"
            ;;
        p)
            PY_EXTRA_ARGS+=" ${OPTARG}"
            ;;
        e)
            EXTENSION_IDS="${OPTARG}"
            ;;
        l)
            MAX_TASKS="${OPTARG}"
            ;;
        *)
            usage
            exit 1
            ;;
    esac
done

shift $((OPTIND-1))

# Repository root (this script lives in <root>/sge/).
BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P )
TEMP_FOLDER=$(mktemp -d)
# Fresh, timestamped working directory on the cluster.
TARGETDIR="${REMOTE_TARGET_DIR_PREFIX}/create-db-$(date +%Y%m%d-%H%M%S)"

echo "Using target dir: $TARGETDIR"
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs

echo "Pushing sge script ..."
scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge"

echo "Building image..."
if [ -f "$BASEDIR/scripts/singularity/create-db.img" ]; then
    rm -f "$BASEDIR/scripts/singularity/create-db.img"
fi
(
    cd "$BASEDIR/scripts/singularity"
    # Build the builder container only if it does not exist yet.
    # BUG FIX: the image was built with --tag=singularitybuilder while
    # both the existence check above and the `docker run` below use
    # "singularitybuilder-arch" (as does singularitybuilder-arch.sh), so
    # a cold build produced the wrong tag and the run failed.
    if [[ "$(docker images -q singularitybuilder-arch 2> /dev/null)" == "" ]]; then
        docker build --tag=singularitybuilder-arch -f singularitybuilder-arch.Dockerfile .
    fi
    docker run -it -v "$(pwd):$(pwd)" -w "$(pwd)" --privileged singularitybuilder-arch:latest singularity build create-db.img ExtensionCrawler.def
)

echo "Pushing image..."
scp "$BASEDIR/scripts/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img"


# Extension ids: either crawl the remote archive for *.tar names (the
# 32-char [a-p] ids) or use the list supplied with -e.
if [[ -z $EXTENSION_IDS ]]; then
    echo "Gathering extension IDs..."
    ssh sharc.shef.ac.uk find "${REMOTE_ARCHIVE}/data" -name "*.tar" | grep -Po "[a-p]{32}" > ${TEMP_FOLDER}/extension.ids
else
    cp "$EXTENSION_IDS" ${TEMP_FOLDER}/extension.ids
fi

NO_IDS=$(cat ${TEMP_FOLDER}/extension.ids | wc -l)

echo "Found $NO_IDS IDs!"
if [ "$NO_IDS" = 0 ]; then
    echo "Nothing to do!"
    exit 0
fi

echo "Pushing extension IDs..."
scp ${TEMP_FOLDER}/extension.ids sharc.shef.ac.uk:$TARGETDIR/
98 |
# Without -l, run one task per extension id.
# BUG FIX: the original assigned the literal string (MAX_TASKS=NO_IDS,
# missing "$").  Bash arithmetic below happened to resolve the name
# recursively, but the plain "-N $MAX_TASKS" expansion passed the text
# "NO_IDS" to create-db.sge.
if [[ ! -v MAX_TASKS ]]; then
    MAX_TASKS=$NO_IDS
fi

# Split into batches of at most ~75000 array tasks each (presumably the
# scheduler's array-size limit — confirm), sharing $NUM_THREADS slots.
NO_BATCH_JOBS=$(((MAX_TASKS+1)/75000+1))
JOBS_PER_BATCH=$((MAX_TASKS/NO_BATCH_JOBS+1))

for run_no in $(seq 1 $NO_BATCH_JOBS); do
    FIRST_ID=$(((run_no-1) * $JOBS_PER_BATCH + 1))
    LAST_ID=$((run_no * $JOBS_PER_BATCH))

    echo "Starting job $run_no ..."
    (set -x; ssh sharc.shef.ac.uk qsub \
        -tc $((NUM_THREADS/NO_BATCH_JOBS)) \
        -t ${FIRST_ID}-${LAST_ID} \
        -wd "$TARGETDIR" \
        -o "$TARGETDIR/logs" \
        ${SGE_EXTRA_ARGS} \
        "$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $MAX_TASKS ${PY_EXTRA_ARGS})
done
119 |
--------------------------------------------------------------------------------