├── .gitignore
├── ExtensionCrawler
├── __init__.py
├── archive.py
├── cdnjs_crawler.py
├── cdnjs_git.py
├── config.py
├── crx.py
├── db.py
├── dbbackend
│ ├── __init__.py
│ ├── mysql_backend.py
│ └── mysql_process.py
├── discover.py
├── file_identifiers.py
├── js_decomposer.py
├── js_mincer.py
├── request_manager.py
└── util.py
├── LICENSE
├── PermissionAnalysis
└── grep-unused-permissions
├── README.md
├── analysis
└── library-detector
│ ├── angular
│ ├── 2018-11-28-results.csv
│ ├── angular.py
│ ├── angularversions.txt
│ ├── ideas.txt
│ └── plotting.py
│ └── jquery.py
├── cdnjs-git-miner
├── crawler
├── create-db
├── crx-extract
├── crx-jsinventory
├── crx-jsstrings
├── crx-tool
├── database
├── README.md
├── config
│ └── my.cnf
├── queries
│ ├── get_added_content_scripts.sql
│ └── get_added_permissions.sql
├── schemas
│ ├── category.sql
│ ├── cdnjs.sql
│ ├── content_script_url.sql
│ ├── crx.sql
│ ├── crxfile.sql
│ ├── extension.sql
│ ├── libdet.sql
│ ├── permission.sql
│ ├── reply.sql
│ ├── reply_comment.sql
│ ├── review.sql
│ ├── review_comment.sql
│ ├── status.sql
│ ├── support.sql
│ └── support_comment.sql
├── scripts
│ ├── mariabackup-full
│ ├── mariabackup-inc
│ ├── mariabackup-schemas
│ └── showgrants
└── views
│ ├── extension_most_recent.sql
│ ├── extension_most_recent_small.sql
│ ├── extension_most_recent_until_date.sql
│ ├── extension_second_most_recent.sql
│ ├── extension_second_most_recent_until_date.sql
│ ├── extension_small.sql
│ └── extension_update.sql
├── extgrep
├── requirements.txt
├── resources
└── js_identifier.json
├── scripts
├── hpc-utilities
│ └── hpc-submit
├── maintainance
│ ├── maintain_archive
│ └── xz.sge
├── monitoring
│ ├── download-report-one-week.gp
│ └── global_update_monitor.sh
├── singularity
│ ├── ExtensionCrawler.def
│ ├── build.sh
│ ├── singularitybuilder-arch.Dockerfile
│ └── singularitybuilder-arch.sh
└── update
│ ├── global_update.sh
│ └── update_cdnjs.sh
├── setup.py
├── sge
├── create-db-cdnjs.sge
├── create-db.sge
└── create-db.sh
└── simhashbucket
/.gitignore:
--------------------------------------------------------------------------------
1 | # ---> Python
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # vi
62 | *.swp
63 |
64 | # vscode
65 | .vscode
66 |
67 | archive
68 | .ropeproject
69 | ExtensionCrawler.img
70 | ExtensionCrawler-cdnjs.img
71 |
72 | .idea
73 | venv
74 |
--------------------------------------------------------------------------------
/ExtensionCrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logicalhacking/ExtensionCrawler/853d69d1a3478eaa3b8649f9dd754a044a561cc5/ExtensionCrawler/__init__.py
--------------------------------------------------------------------------------
/ExtensionCrawler/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | """Global configuration of the Extension Crawler and related tools."""
18 |
19 | import os
20 | import json
21 |
22 |
def const_sitemap_url():
    """Return the URL of the Chrome web store sitemap."""
    sitemap = "https://chrome.google.com/webstore/sitemap"
    return sitemap
26 |
27 |
def const_sitemap_scheme():
    """Return the XML namespace URI of the sitemap schema."""
    scheme = "http://www.sitemaps.org/schemas/sitemap/0.9"
    return scheme
31 |
32 |
def const_overview_url(ext_id):
    """Return the overview-page URL for the extension with id `ext_id`."""
    return f'https://chrome.google.com/webstore/detail/{ext_id}'
36 |
37 |
def const_store_url():
    """Return the main URL of the Chrome web store."""
    store = 'https://chrome.google.com/webstore'
    return store
41 |
42 |
def const_review_url():
    """Return the base URL of the review endpoint of an extension.

    Note: this is the same endpoint as const_support_url(); reviews and
    support threads share the Google components API.
    """
    return 'https://chrome.google.com/reviews/components'
46 |
47 |
def const_review_search_url():
    """Return the base URL of the review search (JSON) endpoint."""
    endpoint = 'https://chrome.google.com/reviews/json/search'
    return endpoint
51 |
52 |
def const_support_url():
    """Return the base URL for support pages.

    Shares the components endpoint with const_review_url().
    """
    return 'https://chrome.google.com/reviews/components'
56 |
57 |
def const_download_url():
    """Return the crx download URL template ({} is the extension id)."""
    return ('https://clients2.google.com/service/update2/crx'
            '?response=redirect&nacl_arch=x86-64'
            '&prodversion=9999.0.9999.0&x=id%3D{}%26uc')
63 |
64 |
def const_categories():
    """Return the list of known Chrome store category identifiers."""
    return [
        'extensions',
        'ext/22-accessibility',
        'ext/10-blogging',
        'ext/15-by-google',
        'ext/11-web-development',
        'ext/14-fun',
        'ext/6-news',
        'ext/28-photos',
        'ext/7-productivity',
        'ext/38-search-tools',
        'ext/12-shopping',
        'ext/1-communication',
        'ext/13-sports',
    ]
74 |
75 |
def const_support_payload(ext_id, start, end):
    """Build the POST payload for fetching support-forum pages.

    :param ext_id: 32-char extension id.
    :param start: index of the first thread to fetch.
    :param end: number of results to fetch.
    """
    # Doubled braces are literal braces in the resulting JSON-ish payload.
    template = (
        'req={{ "appId":94,"version":"150922","hl":"en",'
        '"specs":[{{"type":"CommentThread",'
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        '"groups":"chrome_webstore_support",'
        '"startindex":"{}","numresults":"{}","id":"379"}}],'
        '"internedKeys":[],"internedValues":[]}}'
    )
    return template.format(ext_id, start, end)
85 |
86 |
def const_review_payload(ext_id, start, end):
    """Build the POST payload for fetching review pages.

    :param ext_id: 32-char extension id.
    :param start: index of the first review to fetch.
    :param end: number of results to fetch.
    """
    # Doubled braces are literal braces in the resulting JSON-ish payload.
    template = (
        'req={{ "appId":94,"version":"150922","hl":"en",'
        '"specs":[{{"type":"CommentThread",'
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        '"groups":"chrome_webstore","sortby":"cws_qscore",'
        '"startindex":"{}","numresults":"{}","id":"428"}}],'
        '"internedKeys":[],"internedValues":[]}}'
    )
    return template.format(ext_id, start, end)
97 |
98 |
def const_review_search_payload(params):
    """Build the POST payload for review searches.

    :param params: iterable of (extid, author, start, numresults, groups)
        tuples; one search spec is emitted per tuple.
    """
    # Doubled braces are literal braces in the resulting JSON-ish payload.
    spec_template = (
        '{{"requireComment":true,"entities":[{{"annotation":'
        '{{"groups":{},"author":"{}",'
        '"url":"http://chrome.google.com/extensions/permalink?id={}"}}}}],'
        '"matchExtraGroups":true,"startIndex":{},"numResults":{},'
        '"includeNicknames":true,"locale": {{"language": "en","country": "us"}}}}'
    )
    specs = [
        spec_template.format(json.dumps(groups), author, extid, start, numresults)
        for extid, author, start, numresults, groups in params
    ]
    return ('req={"applicationId":94,"searchSpecs":['
            + ",".join(specs)
            + ']}&requestSource=widget')
115 |
116 |
def get_local_archive_dir(ext_id):
    """Return the archive subdirectory for an extension: the first three
    characters of its id (used to shard the archive tree)."""
    return str(ext_id[:3])
120 |
121 |
def archive_file(archivedir, ext_id):
    """Return the path of the .tar archive of an extension.

    Extensions are sharded into subdirectories named after the first
    three characters of the extension id.
    """
    return os.path.join(str(archivedir), ext_id[:3], ext_id + ".tar")
126 |
127 |
def const_basedir():
    """Top-level directory for the extension crawler archive.

    Defaults to "archive", overridable via the EXTENSION_ARCHIVE
    environment variable.
    """
    return os.environ.get("EXTENSION_ARCHIVE", "archive")
134 |
135 |
def const_parallel_downloads():
    """Number of parallel extension downloads."""
    workers = 36
    return workers
139 |
140 |
def const_verbose():
    """Default verbosity of the crawler."""
    verbose_by_default = True
    return verbose_by_default
144 |
145 |
def const_use_process_pool():
    """Whether to use a ProcessPool (from module 'pebble') for concurrency."""
    use_processes = False
    return use_processes
149 |
150 |
def const_log_format(ext_id="-" * 32):
    """Return the logging format string with `ext_id` embedded.

    The %(...)s placeholders are left for the logging module to fill in;
    only the extension id is substituted here.
    """
    return f"%(process)6s %(asctime)s %(levelname)8s {ext_id} %(message)s"
153 |
154 |
def const_discover():
    """Default configuration of discovery mode (disabled by default)."""
    discover_by_default = False
    return discover_by_default
158 |
159 |
def const_ext_timeout():
    """Timeout in seconds for downloading an individual extension (2 hours)."""
    hours = 2
    return hours * 60 * 60
163 |
164 |
def const_mysql_config_file():
    """Return the path of the user's MySQL defaults file (~/.my.cnf)."""
    return os.path.expanduser("~/.my.cnf")
167 |
168 |
def const_mysql_maxtries():
    """Maximum number of attempts for a MySQL operation before giving up."""
    maxtries = 12
    return maxtries
171 |
172 |
def const_mysql_try_wait():
    """Base wait time in seconds between MySQL retries (5 minutes)."""
    return 5 * 60
175 |
--------------------------------------------------------------------------------
/ExtensionCrawler/crx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | """Utility functions for working with Chrome extensions archives,
19 | i.e., *.crx files."""
20 |
21 | import io
22 | import os
23 | import zipfile
24 | import binascii
25 | from Cryptodome.PublicKey import RSA
26 | from Cryptodome.Hash import SHA
27 | from Cryptodome.Signature import PKCS1_v1_5
28 |
29 |
class CrxFile:
    """Plain record holding the parsed header fields and zip payload of a
    .crx file (see https://developer.chrome.com/extensions/crx)."""

    def __init__(self, filename, magic, version, public_key_len, sig_len,
                 public_key, sig, header_len, data):
        self.file = filename
        # Header fields, stored in on-disk order.
        self.magic = magic                    # b'Cr24' for valid files
        self.version = version
        self.header_len = header_len          # total header size in bytes
        self.public_key_len = public_key_len
        self.public_key = public_key
        self.sig_len = sig_len
        self.sig = sig
        # Zip archive that follows the header.
        self.data = data
44 |
45 |
def is_valid_magic(magic):
    """Check whether `magic` equals the magic bytes of the crx specification."""
    expected = b'Cr24'
    return magic == expected
49 |
50 |
def is_crxfile(filename):
    """Check the magic number: crx files should start with b"Cr24".

    :param filename: path of the candidate file.
    :return: True iff the first four bytes match the crx magic.
    """
    # 'with' guarantees the handle is closed even if read() raises
    # (the original leaked the handle on error).
    with open(filename, 'rb') as file:
        magic = file.read(4)
    return is_valid_magic(magic)
57 |
58 |
def check_signature(public_key, sig, data):
    """Check validity of the PKCS#1 v1.5 signature contained in a crx file.

    :param public_key: DER-encoded RSA public key from the crx header.
    :param sig: signature bytes from the crx header.
    :param data: zip payload covered by the signature.
    :return: True iff the signature verifies.
    """
    digest = SHA.new(data)
    verifier = PKCS1_v1_5.new(RSA.importKey(public_key))
    return verifier.verify(digest, sig)
64 |
65 |
def read_crx(filename):
    """Read the header and payload of a crx file
    (https://developer.chrome.com/extensions/crx).

    :param filename: path of the crx file.
    :return: a CrxFile record with all header fields and the zip payload.
    """
    # 'with' guarantees the handle is closed even if a read raises
    # (the original leaked the handle on error).
    with open(filename, 'rb') as file:
        magic = file.read(4)
        version = int.from_bytes(file.read(4), byteorder='little')
        public_key_len = int.from_bytes(file.read(4), byteorder='little')
        sig_len = int.from_bytes(file.read(4), byteorder='little')
        public_key = file.read(public_key_len)
        sig = file.read(sig_len)
        # Fixed 16-byte preamble (magic, version, two length fields)
        # plus the variable-length key and signature.
        header_len = 16 + public_key_len + sig_len
        data = file.read()
    return CrxFile(filename, magic, version, public_key_len, sig_len,
                   public_key, sig, header_len, data)
80 |
81 |
def print_crx_info(verbose, crx):
    """Print information extracted from a crx file.

    :param verbose: additionally print the raw signature bytes.
    :param crx: a CrxFile record as returned by read_crx().
    """
    magic = "valid" if is_valid_magic(crx.magic) else "invalid"
    sig = "valid" if check_signature(crx.public_key, crx.sig, crx.data) else "invalid"
    print(f"Filename: {crx.file}")
    print(f"Header size: {crx.header_len}")
    print(f"Size: {crx.header_len + len(crx.data)}")
    print(f"Magic byte: {crx.magic.decode('utf-8')} ({magic})")
    print(f"Version: {crx.version}")
    print(f"Signature: {sig}")
    print(f"Public Key [{crx.public_key_len}]:")
    key = RSA.importKey(crx.public_key)
    print(key.exportKey().decode("utf-8"))
    if verbose:
        print(f"Signature [{crx.sig_len}]: {binascii.hexlify(crx.sig)}")
    ziparchive = zipfile.ZipFile(io.BytesIO(crx.data), 'r')
    print("Zip content:")
    for info in ziparchive.infolist():
        print(f'{info.file_size:8d} {info.compress_size:8d}', info.filename)
111 |
112 |
def verify_crxfile(verbose, filename):
    """Verify integrity of a crx file.

    :return: 0 when the magic bytes check out (info is printed), -1 otherwise.
    """
    # Guard clause: bail out early on files without the crx magic.
    if not is_crxfile(filename):
        if verbose:
            print("No valid magic bytes found")
        return -1
    if verbose:
        print("Found correct magic bytes.")
    print_crx_info(verbose, read_crx(filename))
    return 0
124 |
125 |
def extract_crxfile(verbose, force, filename, destdir):
    """Extract the zip payload of a crx file into `destdir`.

    :param verbose: print the target directory on success.
    :param force: extract even when the magic bytes are invalid.
    :param filename: path of the crx file.
    :param destdir: target directory; "" or None mean the current directory.
    :return: 0 on success, -1 if the input is not a valid crx file.
    """
    crx = read_crx(filename)
    # Logical 'or' instead of the original bitwise '|': same result on
    # booleans, but short-circuits and states the intent.
    if is_valid_magic(crx.magic) or force:
        if destdir is None or destdir == "":
            destdir = "."
        if filename.endswith(".crx"):
            # Strip the directory part and the ".crx" suffix.
            dirname = os.path.basename(filename)[:-4]
        else:
            dirname = filename
        target = destdir + "/" + dirname
        # Close the archive deterministically instead of leaking it.
        with io.BytesIO(crx.data) as out:
            with zipfile.ZipFile(out, 'r') as ziparchive:
                ziparchive.extractall(target)
        if verbose:
            print("Content extracted into: " + target)
        return 0
    print("Input file not valid.")
    return -1
146 |
--------------------------------------------------------------------------------
/ExtensionCrawler/dbbackend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logicalhacking/ExtensionCrawler/853d69d1a3478eaa3b8649f9dd754a044a561cc5/ExtensionCrawler/dbbackend/__init__.py
--------------------------------------------------------------------------------
/ExtensionCrawler/dbbackend/mysql_backend.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2017 The University of Sheffield, UK
3 | #
4 | # This program is free software: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation, either version 3 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | #
17 |
18 | import time
19 | import datetime
20 | from collections import OrderedDict
21 | from random import uniform
22 | import sys
23 | import configparser
24 |
25 | import MySQLdb
26 | import MySQLdb._exceptions
27 |
28 | import ExtensionCrawler.config as config
29 | from ExtensionCrawler.util import log_info, log_error, log_warning
30 |
31 |
class MysqlBackend:
    """Buffered, retrying MySQL/MariaDB storage backend.

    Rows handed to insert()/insertmany() are cached per table and written
    in batches (once 128 rows are pending, and on context exit).  Every
    statement is retried up to `maxtries` times on operational errors,
    sleeping roughly `try_wait` seconds (+/- 20%) between attempts.
    """

    def __init__(self, ext_id, delayed=False, cache_etags=False, try_wait=config.const_mysql_try_wait(), maxtries=config.const_mysql_maxtries(),
                 **kwargs):
        self.ext_id = ext_id
        # When True, use MySQL's INSERT DELAYED instead of an upsert.
        self.delayed = delayed
        # When True, remember crx_etags of inserted "extension" rows.
        self.cache_etags = cache_etags
        # Remaining keyword arguments are passed to MySQLdb.connect().
        self.dbargs = kwargs
        self.try_wait = try_wait
        self.maxtries = maxtries
        self.cache = {}           # table name -> list of pending row dicts
        self.crx_etag_cache = {}  # (extid, date) -> crx_etag
        self.db = None
        self.cursor = None

        # For more info, see https://jira.mariadb.org/browse/CONC-359
        self._fix_missing_host(self.dbargs)

    def _fix_missing_host(self, dbargs):
        """Copy the host setting from the defaults file into `dbargs`.

        Works around https://jira.mariadb.org/browse/CONC-359, where the
        connector ignores the host given in the defaults file.
        """
        if "host" in dbargs:
            return

        if "read_default_file" not in dbargs:
            return

        # Named 'cfg' so it does not shadow the 'ExtensionCrawler.config'
        # module imported as 'config' at module level (the original did).
        cfg = configparser.ConfigParser()
        cfg.read(dbargs["read_default_file"])
        if "host" not in cfg["client"]:
            return
        dbargs["host"] = cfg["client"]["host"]

    def __enter__(self):
        # We open a connection once we actually need it
        return self

    def __exit__(self, *args):
        # Flush all pending rows before closing the connection.
        for table, arglist in self.cache.items():
            self._do_insert(table, arglist)
            self.cache[table] = []
        self._close_conn()

    def _get_column_names(self, table):
        """Return the column names of `table` in the current schema."""
        # Plain string (the original carried a spurious f-prefix); the
        # table name is passed as a bound parameter.
        self.cursor.execute("select column_name from information_schema.columns where table_schema=database() and table_name=%s", (table,))
        return [row[0] for row in self.cursor.fetchall()]

    def _do_insert(self, table, arglist):
        """Write a batch of row dicts to `table` with one executemany()."""
        if len(arglist) == 0:
            return
        # Deterministic (primary-key) order reduces deadlocks between
        # concurrent writers.
        sorted_arglist = self.sort_by_primary_key(table, arglist)
        args = [tuple(arg.values()) for arg in sorted_arglist]

        if self.delayed:
            query = "INSERT DELAYED INTO {}({}) VALUES ({})".format(
                table,
                ",".join(sorted_arglist[0].keys()),
                ",".join(len(args[0]) * ["%s"]))
        else:
            column_names = self.retry(lambda: self._get_column_names(table))
            # Also touch last_modified (when present) so the timestamp is
            # refreshed even if all other values are unchanged.
            if "last_modified" in column_names:
                additional_columns = ["last_modified"]
            else:
                additional_columns = []
            # Looks like this, for example:
            # INSERT INTO category VALUES(extid,date,category) (%s,%s,%s)
            # ON DUPLICATE KEY UPDATE extid=VALUES(extid),date=VALUES(date)
            # ,category=VALUES(category)
            query = "INSERT INTO {}({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}".format(
                table,
                ",".join(sorted_arglist[0].keys()),
                ",".join(len(args[0]) * ["%s"]),
                ",".join(
                    ["{c}=VALUES({c})".format(c=c) for c in list(sorted_arglist[0].keys()) + additional_columns]))
        start = time.time()
        self.retry(lambda: self.cursor.executemany(query, args))
        log_info("* Inserted {} bytes into {}, taking {}.".format(sum([sys.getsizeof(arg) for arg in args]),
                                                                  table, datetime.timedelta(seconds=int(time.time() - start))), 3)

    def _create_conn(self):
        """Lazily open the database connection and cursor."""
        if self.db is None:
            log_info("* self.db is None, open new connection ...", 3)
            self.db = MySQLdb.connect(**self.dbargs)
            self.db.autocommit(True)
            log_info("* success", 4)
        if self.cursor is None:
            log_info("* self.cursor is None, assigning new cursor ...", 3)
            self.cursor = self.db.cursor()
            log_info("* success", 4)

    def _close_conn(self):
        """Close cursor and connection (idempotent)."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.db is not None:
            self.db.close()
            self.db = None

    def retry(self, f):
        """Run f(), reconnecting and retrying on OperationalError.

        Retries up to self.maxtries times, sleeping self.try_wait seconds
        +/- 20% between attempts; re-raises the last error when exhausted.
        """
        for t in range(self.maxtries):
            try:
                self._create_conn()
                return f()
            except MySQLdb._exceptions.OperationalError as e:
                last_exception = e

                try:
                    self._close_conn()
                except Exception as e2:
                    log_error("Suppressed exception: {}".format(str(e2)), 3)

                if t + 1 == self.maxtries:
                    log_error("MySQL connection eventually failed, closing connection!", 3)
                    raise last_exception
                else:
                    factor = 0.2
                    # Typo fix: "wating" -> "waiting".
                    logmsg = ("Exception ({}) on mysql connection attempt "
                              "{} of {}, waiting {}s +/- {}% before retrying..."
                              ).format(str(e),
                                       t + 1,
                                       self.maxtries,
                                       self.try_wait, factor * 100)
                    log_warning(logmsg, 3)
                    time.sleep(self.try_wait * uniform(
                        1 - factor, 1 + factor))

    def get_single_value(self, query, args):
        """Execute `query` and return the first column of the first row,
        or None when the query yields no rows."""
        self.retry(lambda: self.cursor.execute(query, args))

        result = self.retry(lambda: self.cursor.fetchone())
        if result is not None:
            return result[0]
        else:
            return None

    def sort_by_primary_key(self, table, arglist):
        """Sort rows by primary key and move key columns to the front.

        A deterministic insert order avoids deadlocks; the column shuffle
        gives all rows of the batch a consistent dict ordering.
        """
        self.retry(lambda: self.cursor.execute(f"SHOW KEYS FROM {table} WHERE Key_name = 'PRIMARY'"))
        # Column 5 of SHOW KEYS is Column_name.
        primary_keys = [row[4] for row in self.cursor.fetchall()]

        sorted_arglist = sorted(arglist, key=lambda x: [x[pk] for pk in primary_keys])

        def arglist_shuffler(x):
            # Key columns sort first (by key position); all others last.
            try:
                return primary_keys.index(x)
            except ValueError:
                return len(primary_keys)
        shuffled_arglist = [OrderedDict(sorted(arg.items(), key=lambda x: arglist_shuffler(x[0]))) for arg in sorted_arglist]
        return shuffled_arglist

    def insertmany(self, table, arglist):
        """Queue row dicts for `table`; flush once 128+ rows are pending."""
        if table not in self.cache:
            self.cache[table] = []
        self.cache[table] += arglist
        if len(self.cache[table]) >= 128:
            self._do_insert(table, self.cache[table])
            self.cache[table] = []
        if self.cache_etags and table == "extension":
            for arg in arglist:
                self.crx_etag_cache[(arg["extid"], arg["date"])] = arg["crx_etag"]

    def insert(self, table, **kwargs):
        """Queue a single row given as keyword arguments."""
        self.insertmany(table, [kwargs])

    def get_etag(self, extid, date):
        """Return the cached crx_etag for (extid, date), or None."""
        if (extid, date) in self.crx_etag_cache:
            return self.crx_etag_cache[(extid, date)]
        else:
            return None

    def get_cdnjs_info(self, md5):
        """Look up the cdnjs library entry for a file hash, or None."""
        query = """SELECT library, version, filename, add_date, typ from cdnjs where md5=%s"""
        args = [md5]
        self.retry(lambda: self.cursor.execute(query, args))
        result = self.retry(lambda: self.cursor.fetchone())
        return result
207 |
208 |
def convert_date(date):
    """Drop the last six characters of a date string.

    NOTE(review): presumably strips a trailing UTC offset such as
    "+00:00" — confirm against callers.
    """
    trimmed = date[:-6]
    return trimmed
211 |
--------------------------------------------------------------------------------
/ExtensionCrawler/dbbackend/mysql_process.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (C) 2017 The University of Sheffield, UK
3 | #
4 | # This program is free software: you can redistribute it and/or modify
5 | # it under the terms of the GNU General Public License as published by
6 | # the Free Software Foundation, either version 3 of the License, or
7 | # (at your option) any later version.
8 | #
9 | # This program is distributed in the hope that it will be useful,
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | # GNU General Public License for more details.
13 | #
14 | # You should have received a copy of the GNU General Public License
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | #
17 |
18 | from multiprocessing import Process, Manager
19 |
20 | from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
21 | from ExtensionCrawler.util import setup_logger, log_exception
22 |
class MysqlProxy:
    """Queue-backed stand-in for MysqlBackend.

    Forwards inserts to the writer process through a queue instead of
    talking to the database directly.
    """

    def __init__(self, q):
        self.q = q

    def insertmany(self, table, arglist):
        # Ship the raw arguments to the consumer process (see run()).
        message = (MysqlProcessBackend.INSERT, (table, arglist))
        self.q.put(message)

    def insert(self, table, **kwargs):
        self.insertmany(table, [kwargs])

    def get_cdnjs_info(self, md5):
        # cdnjs lookups are not supported through the proxy.
        return None
35 |
36 |
def run(mysql_kwargs, q):
    """Writer-process loop: pop (cmd, data) tuples from `q` and forward
    inserts to a MysqlBackend until the STOP sentinel arrives.

    On error the remaining queue is drained (up to STOP) so the producer
    is never left blocked on a full queue.
    """
    setup_logger(True)
    finished = False

    try:
        with MysqlBackend(None, **mysql_kwargs) as db:
            while True:
                cmd, data = q.get()
                if cmd == MysqlProcessBackend.STOP:
                    finished = True
                    break
                if cmd == MysqlProcessBackend.INSERT:
                    db.insertmany(*data)
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # still terminate the process instead of being swallowed.
    except Exception:
        log_exception("Stopping Mysql backend and emptying queue...")
        if not finished:
            while True:
                cmd, data = q.get()
                if cmd == MysqlProcessBackend.STOP:
                    break
                if cmd == MysqlProcessBackend.INSERT:
                    # Discard pending inserts; only the STOP sentinel matters.
                    pass
59 |
60 |
class MysqlProcessBackend:
    """Context manager running a MysqlBackend in a separate process.

    __enter__ starts the writer process and returns a MysqlProxy; inserts
    travel through a managed queue and are executed asynchronously.
    __exit__ sends the STOP sentinel and waits for the writer to finish.
    """

    # Commands understood by the writer loop (see run()).
    STOP = "stop"
    INSERT = "insert"

    def __init__(self, ext_id, **mysql_kwargs):
        # NOTE(review): ext_id is accepted but never stored — presumably
        # kept for interface parity with MysqlBackend; confirm.
        self.mysql_kwargs = mysql_kwargs
        # Manager queue so the writer process and callers share it safely.
        self.m = Manager()
        self.queue = self.m.Queue()

    def __enter__(self):
        self.p = Process(target=run, args=(self.mysql_kwargs, self.queue))
        self.p.start()
        return MysqlProxy(self.queue)

    def __exit__(self, *args):
        # Ask the writer to stop, then wait until it has drained and exited.
        self.queue.put((MysqlProcessBackend.STOP, None))
        self.p.join()
78 |
--------------------------------------------------------------------------------
/ExtensionCrawler/discover.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | """Python module providing methods for discovering extensions in the
18 | Chrome extension store."""
19 |
20 | from xml.etree.ElementTree import fromstring
21 | import re
22 | import requests
23 | from pebble import ThreadPool
24 | from ExtensionCrawler import config
25 |
26 |
def get_inner_elems(doc):
    """Iterate over the namespaced <loc> elements of a sitemap document."""
    # Braces around the namespace URI follow ElementTree's {uri}tag syntax.
    xpath = r".//{{{}}}loc".format(config.const_sitemap_scheme())
    return fromstring(doc).iterfind(xpath)
31 |
32 |
def is_generic_url(url):
    """Check if URL is a generic extension URL.

    The urls with a language parameter attached return a subset of the
    ids that get returned by the plain urls, therefore we skip urls with
    a language parameter.

    (The explanation above was previously split across bare string
    statements, which are dead code, not part of the docstring.)
    """
    return re.match(r"^{}\?shard=\d+&numshards=\d+$".format(
        config.const_sitemap_url()), url)
41 |
42 |
def iterate_shard(shard_url):
    """Yield the 32-character extension ids listed in one sitemap shard.

    Non-generic shard URLs (e.g. with a language parameter) yield nothing.
    """
    if not is_generic_url(shard_url):
        return
    shard = requests.get(shard_url, timeout=10).text
    for inner_elem in get_inner_elems(shard):
        # Extension ids are 32 lowercase letters embedded in the URL.
        yield re.search("[a-z]{32}", inner_elem.text).group(0)
49 |
50 |
def process_shard(shard_url):
    """Materialize one shard's extension ids as a list (so the result can
    travel back from a pool worker)."""
    return list(iterate_shard(shard_url))
53 |
54 |
def get_new_ids(known_ids, max_ids=None):
    """Crawl extension ids available in Chrome store.

    Yields ids found in the store sitemap that are not already contained
    in known_ids; stops after max_ids new ids when max_ids is given.
    """

    # All shard URLs listed in the top-level sitemap.
    shard_urls = [shard_elem.text for shard_elem in get_inner_elems(
        requests.get(config.const_sitemap_url(), timeout=10).text)]
    with ThreadPool(16) as pool:
        # chunksize=1 so results stream back shard by shard.
        future = pool.map(process_shard, shard_urls, chunksize=1)
        iterator = future.result()

        returned_ids = 0
        while True:
            try:
                # next() raises StopIteration once all shards are consumed.
                for extid in next(iterator):
                    if extid not in known_ids:
                        yield extid
                        returned_ids += 1
                        if max_ids is not None and returned_ids >= max_ids:
                            # Abort the remaining shard downloads early.
                            pool.stop()
                            return
            except StopIteration:
                return
76 |
--------------------------------------------------------------------------------
/ExtensionCrawler/file_identifiers.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | """ Module for obtaining (normalized) hashes for files."""
19 |
20 | import hashlib
21 | import os
22 | import re
23 | import zlib
24 | import mimetypes
25 | from io import StringIO
26 | from simhash import Simhash
27 |
28 | import cchardet as chardet
29 | import magic
30 |
31 | from ExtensionCrawler.js_mincer import mince_js
32 |
33 |
def is_binary_resource(mimetype_magic):
    """Return True for media/PDF mimetypes that are skipped during
    (expensive) content analysis."""
    media_prefixes = ("image/", "video/", "audio/")
    return (mimetype_magic.startswith(media_prefixes)
            or mimetype_magic == "application/pdf")
39 |
40 |
def normalize_jsdata(str_data):
    """Compute normalized code blocks of a JavaScript file.

    Keeps only code blocks produced by mince_js (comments dropped) and
    strips per-line whitespace.

    :return: tuple of (normalized text encoded as bytes, lines of code).
    """
    parts = []
    loc = 0
    with StringIO(str_data) as str_obj:
        for block in mince_js(str_obj):
            if not block.is_code():
                continue
            for line in block.content.splitlines():
                parts.append(line.strip())
                loc += 1
    return "".join(parts).encode(), loc
52 |
53 |
def get_features(s):
    """Generate the 3-gram shingles of a string for simhashing.

    The string is lowercased and stripped of non-word characters first;
    for strings shorter than 3 characters a single (short) shingle is
    produced.
    """
    width = 3
    normalized = re.sub(r'[^\w]+', '', s.lower())
    span = max(len(normalized) - width + 1, 1)
    return (normalized[i:i + width] for i in range(span))
60 |
61 |
def get_simhash(encoding, data):
    """Compute the 64-bit simhash of (possibly binary) text.

    :param encoding: detected character encoding, or None for raw data.
    :param data: bytes to hash.
    """
    if encoding is None:
        str_data = str(data)
    else:
        # VISCII is not supported by python, UTF-8 parses at least the
        # for us important parts
        if encoding == "VISCII":
            encoding = "UTF-8"
        str_data = data.decode(encoding=encoding, errors="replace")
    return Simhash(get_features(str_data)).value
73 |
74 |
def compute_difference(hx, hy):
    """Compute the Hamming distance between two 64-bit simhashes."""
    assert hx.bit_length() == hy.bit_length()
    diff = (hx ^ hy) & ((1 << 64) - 1)
    # Population count of the XOR: number of differing bit positions.
    return bin(diff).count("1")
84 |
85 |
def _magic_description(data):
    """Return libmagic's textual description of `data`.

    Works around a libmagic quirk where a spurious " name use count..."
    suffix is appended to the error message; any other MagicException is
    re-raised (bare `raise` preserves the original traceback).
    """
    try:
        return magic.from_buffer(data)
    except magic.MagicException as exp:
        rgx = re.compile(r' name use count.*$')
        msg = str(exp.message)
        if re.search(rgx, msg):
            return re.sub(rgx, '', msg)
        raise


def get_data_identifiers(data):
    """Get basic data identifiers (size, hashes, normalized hashes, etc.).

    :param data: file content as bytes.
    :return: dict of identifiers; for non-binary content the
             "normalized_*" entries describe the whitespace/comment
             stripped JavaScript content (left None when not computable).
    """

    data_identifier = {
        'encoding': None,
        'description': None,
        'size': None,
        'loc': None,
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size_stripped': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_loc': None,
        'normalized_mimetype_magic': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }

    mimetype_magic = magic.from_buffer(data, mime=True)
    # The MagicException workaround was duplicated twice in the original;
    # it now lives in _magic_description.
    magic_desc = _magic_description(data)

    data_identifier['mimetype_magic'] = mimetype_magic
    data_identifier['md5'] = hashlib.md5(data).digest()
    data_identifier['sha1'] = hashlib.sha1(data).digest()
    data_identifier['sha256'] = hashlib.sha256(data).digest()
    data_identifier['size'] = len(data)
    data_identifier['description'] = magic_desc

    # We don't continue here with binary files, as that consumes too many
    # resources.
    if is_binary_resource(mimetype_magic):
        return data_identifier

    encoding = chardet.detect(data)['encoding']

    data_identifier['simhash'] = get_simhash(encoding, data)
    data_identifier['size_stripped'] = len(data.strip())
    data_identifier['loc'] = len(data.splitlines())
    data_identifier['encoding'] = encoding
    try:
        # Best-effort: fails (and is skipped) e.g. when encoding is None.
        normalized_data, normalized_loc = normalize_jsdata(
            data.decode(encoding=data_identifier['encoding'], errors="replace"))
    except Exception:
        normalized_data = None
        normalized_loc = 0

    if normalized_data is not None:
        normalized_magic_desc = _magic_description(normalized_data)
        normalized_encoding = chardet.detect(normalized_data)['encoding']
        data_identifier['normalized_encoding'] = normalized_encoding
        data_identifier['normalized_description'] = normalized_magic_desc
        data_identifier['normalized_size'] = len(normalized_data)
        data_identifier['normalized_loc'] = normalized_loc
        data_identifier['normalized_mimetype_magic'] = magic.from_buffer(normalized_data, mime=True)
        data_identifier['normalized_md5'] = hashlib.md5(
            normalized_data).digest()
        data_identifier['normalized_sha1'] = hashlib.sha1(
            normalized_data).digest()
        data_identifier['normalized_sha256'] = hashlib.sha256(
            normalized_data).digest()
        data_identifier['normalized_simhash'] = get_simhash(
            normalized_encoding, normalized_data)
    return data_identifier
174 |
175 |
def get_file_identifiers(path, data=None):
    """Get basic file identifiers (path, filename, etc.) and data identifiers.

    If *data* is None, the file content is read from *path*.  For
    gzip-compressed content (as reported by libmagic) the decompressed
    payload is analyzed as well; its identifiers are stored under keys
    with a ``dec_`` prefix.
    """
    # Placeholder results for the decompressed payload; only filled in when
    # the content is gzip-compressed and decompression succeeds.
    dec_data_identifier = {
        'mimetype_magic': None,
        'md5': None,
        'sha1': None,
        'sha256': None,
        'simhash': None,
        'size': None,
        'size_stripped': None,
        'loc': None,
        'description': None,
        'encoding': None,
        'normalized_mimetype_magic': None,
        'normalized_loc': None,
        'normalized_encoding': None,
        'normalized_description': None,
        'normalized_size': None,
        'normalized_md5': None,
        'normalized_sha1': None,
        'normalized_sha256': None,
        'normalized_simhash': None
    }
    if data is None:
        with open(path, 'rb') as fileobj:
            data = fileobj.read()

    data_identifier = get_data_identifiers(data)
    if data_identifier['description'].startswith('gzip'):
        try:
            dec = zlib.decompressobj(zlib.MAX_WBITS | 16)
            # Cap the decompressed output at 100x the compressed size to
            # defend against zip bombs.
            dec_data = dec.decompress(data, 100 * data_identifier['size'])
            dec_data_identifier = get_data_identifiers(dec_data)
            del dec_data
        except Exception as e:
            # Record the failure instead of aborting.  (Fix: the message
            # previously read "Exception during compression (likely
            # zip-bomb:" -- wrong word and unbalanced parenthesis.)
            dec_data_identifier['description'] = (
                "Exception during decompression (likely zip-bomb): " + str(e))

    file_identifier = {
        'filename': os.path.basename(path),
        'path': path,
        'mimetype': mimetypes.guess_type(path),
    }
    # Export all data identifiers as-is, and the identifiers of the
    # decompressed payload under a 'dec_' prefix.
    file_identifier.update(data_identifier)
    file_identifier.update(
        ('dec_' + key, value) for key, value in dec_data_identifier.items())

    return file_identifier
300 |
--------------------------------------------------------------------------------
/ExtensionCrawler/js_mincer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | """ A mostly correct JavaScript analyzer that separates comments from code. The
18 | implementation prioritizes speed over correctness. """
19 |
20 | from enum import Enum
21 |
22 |
class JsBlockType(Enum):
    """Enumeration of the different JavaScript blocks."""
    CODE_BLOCK = 1                  # plain code outside comments and strings
    SINGLE_LINE_COMMENT = 2         # a single '//' comment
    SINGLE_LINE_COMMENT_BLOCK = 3   # several adjacent '//' comments, joined
    MULTI_LINE_COMMENT_BLOCK = 4    # a '/* ... */' comment
    STRING_SQ = 5                   # single-quoted string literal
    STRING_DQ = 6                   # double-quoted string literal
31 |
32 |
def is_string_literal_sq(state):
    """Return True iff *state* denotes a single quote string literal."""
    return state is JsBlockType.STRING_SQ
36 |
37 |
def is_string_literal_dq(state):
    """Return True iff *state* denotes a double quote string literal."""
    return state is JsBlockType.STRING_DQ
41 |
42 |
def is_string_literal(state):
    """Return True iff *state* denotes any kind of string literal."""
    return is_string_literal_dq(state) or is_string_literal_sq(state)
46 |
47 |
def is_code(state):
    """Return True iff *state* denotes plain code (not a string literal)."""
    return state is JsBlockType.CODE_BLOCK
51 |
52 |
def is_code_or_string_literal(state):
    """Return True iff *state* denotes code or a string literal."""
    return is_string_literal(state) or is_code(state)
56 |
57 |
def is_comment_multi_line(state):
    """Return True iff *state* denotes a multi line comment."""
    return state is JsBlockType.MULTI_LINE_COMMENT_BLOCK
61 |
62 |
def is_comment_single_line(state):
    """Return True iff *state* denotes a single line comment."""
    return state is JsBlockType.SINGLE_LINE_COMMENT
66 |
67 |
def is_comment_single_line_block(state):
    """Return True iff *state* denotes a joined single line comment block."""
    return state is JsBlockType.SINGLE_LINE_COMMENT_BLOCK
71 |
72 |
def is_comment(state):
    """Return True iff *state* denotes any kind of comment."""
    return (is_comment_single_line(state)
            or is_comment_single_line_block(state)
            or is_comment_multi_line(state))
77 |
78 |
def get_next_character(fileobj):
    """Yield the characters of a (text) file one at a time."""
    while True:
        char = fileobj.read(1)
        if not char:
            return
        yield char
85 |
86 |
class JsBlock:
    """Class representing JavaScript blocks.

    A block is a span of the input (code, string literal, or comment)
    delimited by (line, column) start/end positions.
    """

    def __init__(self, typ, start, end, content, string_literals=None):
        # typ: a JsBlockType value classifying this block.
        self.typ = typ
        # start, end: (line, column) tuples delimiting the block.
        self.start = start
        self.end = end
        # content: the raw text of the block.
        self.content = content
        # string_literals: ((line, col), text) pairs collected while scanning
        # code; None for comment blocks created without literals.
        self.string_literals = string_literals

    def is_code(self):
        """Check if block is a code block."""
        return not is_comment(self.typ)

    def is_comment(self):
        """Check if block is a comment."""
        return is_comment(self.typ)

    def is_comment_single_line(self):
        """Check if block is a single line comment."""
        return is_comment_single_line(self.typ)

    def is_comment_single_line_block(self):
        """Check if block is single line comment block."""
        return is_comment_single_line_block(self.typ)

    def is_comment_multi_line_block(self):
        """Check if block is a multi line comment."""
        return is_comment_multi_line(self.typ)

    def __str__(self):
        # Code blocks additionally report their string literal count.
        str_msg = ""
        if is_code(self.typ):
            str_msg = "** String Literals: " + str(len(
                self.string_literals)) + "\n"
        return (
            "***************************************************************\n"
            + "** Type: " + str(self.typ.name) + "\n" + "** Start: " + str(
                self.start) + "\n" + "** End: " + str(
                    self.end) + "\n" + str_msg + self.content.strip() + "\n" +
            "***************************************************************\n"
        )
129 |
130 |
def mince_js_fileobj(fileobj):
    """Mince JavaScript file object into code and comment blocks.

    Generator yielding JsBlock objects.  A simple character-level state
    machine tracks whether the scanner is inside code, a string literal,
    a '//' comment, or a '/* */' comment.  Blocks are only emitted on
    transitions between (code or string) and comment states; string
    literals are collected into the surrounding code block.
    """
    # Current position (1-based line/column) and scanner state.
    line = 1
    cpos = 1
    escaped = False
    content = ""
    block_start_line = line
    block_start_cpos = cpos
    state = JsBlockType.CODE_BLOCK
    string_literals = []
    current_string_literal = ""

    for char in get_next_character(fileobj):
        cpos += 1
        content += char
        suc_state = state
        if not escaped:
            if is_code_or_string_literal(state):
                if is_code(state):
                    if char == "'":
                        suc_state = JsBlockType.STRING_SQ
                    if char == '"':
                        suc_state = JsBlockType.STRING_DQ
                    if char == '/':
                        # One-character lookahead for '//' or '/*'.
                        # NOTE(review): next(get_next_character(fileobj))
                        # builds a throwaway generator but still advances the
                        # shared file object by one character.
                        try:
                            next_char = next(get_next_character(fileobj))
                            if next_char == '/':
                                suc_state = JsBlockType.SINGLE_LINE_COMMENT
                            elif next_char == '*':
                                suc_state = JsBlockType.MULTI_LINE_COMMENT_BLOCK
                            # The comment opener is moved out of the current
                            # block and becomes the start of the next one.
                            next_content = content[-1] + next_char
                            content = content[:-1]
                            cpos -= 1
                            char = next_char
                        except StopIteration:
                            pass
                elif is_string_literal_dq(state):
                    if char == '"':
                        suc_state = JsBlockType.CODE_BLOCK
                        string_literals.append(((line, cpos),
                                                current_string_literal))
                        current_string_literal = ""
                    else:
                        current_string_literal += char
                elif is_string_literal_sq(state):
                    if char == "'":
                        suc_state = JsBlockType.CODE_BLOCK
                        string_literals.append(((line, cpos),
                                                current_string_literal))
                        current_string_literal = ""
                    else:
                        current_string_literal += char
                else:
                    raise Exception("Unknown state")
            elif is_comment(state):
                if is_comment_single_line(state):
                    # '//' comments end at the newline.
                    if char == '\n':
                        suc_state = JsBlockType.CODE_BLOCK
                elif is_comment_multi_line(state):
                    if char == '*':
                        # Lookahead for the closing '*/' (same shared-file
                        # lookahead trick as above).
                        try:
                            next_char = next(get_next_character(fileobj))
                            if next_char == '/':
                                suc_state = JsBlockType.CODE_BLOCK
                            content = content + next_char
                            cpos += 1
                            char = next_char
                        except StopIteration:
                            pass

        # Emit a block exactly when crossing a comment <-> code/string
        # boundary.  `next_content` was set when the comment opener was seen,
        # so the opener chars seed the new block.
        if ((is_comment(state) and is_code_or_string_literal(suc_state)) or (
                is_code_or_string_literal(state) and is_comment(suc_state))):
            if content.strip():
                yield (JsBlock(state, (block_start_line, block_start_cpos),
                               (line, cpos), content, string_literals))
            if char == '\n':
                block_start_line = line + 1
                block_start_cpos = 1
            else:
                block_start_line = line
                block_start_cpos = cpos
            content = next_content
            next_content = ""
            string_literals = []

        if char == '\n':
            line += 1
            cpos = 1

        # A backslash escapes the following character (unless itself escaped).
        escaped = bool(char == '\\' and not escaped)
        state = suc_state

    # Flush the trailing block at EOF.
    if content.strip():
        yield (JsBlock(state, (block_start_line, block_start_cpos),
                       (line, cpos), content, string_literals))
226 |
227 |
def mince_js_fileobj_slc_blocks(fileobj):
    """Mince JavaScript file object into code and comment blocks (join subsequent
    single line comments)."""
    # NOTE(review): the inner mince_js_fileobj(fileobj) generator shares the
    # same underlying file object as the outer one, so it continues scanning
    # where the outer generator stopped (its line/column counters restart at
    # 1, though) -- confirm that this positional reset is intended.
    for block in mince_js_fileobj(fileobj):
        if block.typ == JsBlockType.SINGLE_LINE_COMMENT:
            start = block.start
            end = block.end
            content = block.content
            single_block = False
            # Collect directly following single line comments into one block.
            for suc in mince_js_fileobj(fileobj):
                if suc.typ == JsBlockType.SINGLE_LINE_COMMENT:
                    content += suc.content
                    end = suc.end
                    single_block = True
                else:
                    # A non-comment block ends the run: emit the joined
                    # comment block (or the lone original comment), then the
                    # interrupting block itself.
                    if single_block:
                        yield (JsBlock(JsBlockType.SINGLE_LINE_COMMENT_BLOCK,
                                       start, end, content))
                    else:
                        yield block
                    content = ""
                    yield suc
                    break
            # EOF reached while still collecting comments.
            if content.strip() != "":
                yield (JsBlock(JsBlockType.SINGLE_LINE_COMMENT_BLOCK, start,
                               end, content))
        else:
            yield block
256 |
257 |
def mince_js_file(file):
    """Mince JavaScript file into code and comment blocks."""
    with open(file, encoding="utf-8") as fileobj:
        yield from mince_js_fileobj(fileobj)
263 |
264 |
def mince_js_file_slc_blocks(file):
    """Mince JavaScript file into code and comment blocks (join subsequent single
    line comments)."""
    with open(file, encoding="utf-8") as fileobj:
        yield from mince_js_fileobj_slc_blocks(fileobj)
271 |
272 |
def mince_js(file, single_line_comments_block=False):
    """Mince JavaScript file (either file name or open file object) into code and
    comment blocks. Subsequent comment line blocks can be minced into separate
    entities or merged."""
    if isinstance(file, str):
        mincer = (mince_js_file_slc_blocks
                  if single_line_comments_block else mince_js_file)
    else:
        mincer = (mince_js_fileobj_slc_blocks
                  if single_line_comments_block else mince_js_fileobj)
    return mincer(file)
287 |
--------------------------------------------------------------------------------
/ExtensionCrawler/request_manager.py:
--------------------------------------------------------------------------------
1 | import time
2 | import random
3 | from contextlib import contextmanager
4 | from multiprocessing import Lock, BoundedSemaphore, Value
5 |
6 |
class RequestManager:
    """Throttle web requests shared between worker processes.

    Up to ``max_workers`` slots exist; a restricted request acquires all of
    them and therefore runs exclusively.  Both request kinds sleep until
    0.6-0.75s (with jitter) after the relevant previous request.
    """

    def __init__(self, max_workers):
        self.max_workers = max_workers
        self.lock = Lock()
        self.sem = BoundedSemaphore(max_workers)
        # Shared (cross-process) timestamps of the most recent requests.
        self.last_request = Value('d', 0.0)
        self.last_restricted_request = Value('d', 0.0)

    @contextmanager
    def normal_request(self):
        """Context manager guarding a rate-limited normal request."""
        # NOTE(review): the lock is held for the whole request body (the
        # `try` sits inside the `with`), which serializes normal requests
        # despite the semaphore -- confirm whether this is intended.
        with self.lock:
            self.sem.acquire()
            # Wait until 0.6s (+ jitter) after the last *restricted* request.
            time.sleep(max(0.0, self.last_restricted_request.value + 0.6 + (random.random() * 0.15) - time.time()))
            try:
                yield
            finally:
                self.last_request.value = time.time()
                self.sem.release()

    @contextmanager
    def restricted_request(self):
        """Context manager guarding an exclusive, rate-limited request."""
        with self.lock:
            # Take every slot so no normal request can run concurrently.
            for _ in range(self.max_workers):
                self.sem.acquire()
            # Wait until 0.6s (+ jitter) after the last request of any kind.
            time.sleep(max(0.0, self.last_request.value + 0.6 + (random.random() * 0.15) - time.time()))
            try:
                yield
            finally:
                self.last_request.value = time.time()
                self.last_restricted_request.value = time.time()
                for _ in range(self.max_workers):
                    self.sem.release()
43 |
--------------------------------------------------------------------------------
/ExtensionCrawler/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | """ Various utility methods."""
19 |
20 | import traceback
21 | import logging
22 | import sys
23 |
24 | from ExtensionCrawler.config import const_log_format
25 |
26 |
def value_of(value, default):
    """Return *value* unless it is None or the empty string, else *default*."""
    # Fix: the original used `value is not ""` -- an identity comparison
    # against a literal, which is a SyntaxWarning on Python >= 3.8 and
    # relies on string interning; use equality instead.
    if value is not None and value != "":
        return value
    return default
33 |
34 |
def log_debug(msg, indent_level=0):
    """Log *msg* at DEBUG level, indented four spaces per *indent_level*."""
    logging.debug(" " * (4 * indent_level) + str(msg))
37 |
38 |
def log_info(msg, indent_level=0):
    """Log *msg* at INFO level, indented four spaces per *indent_level*."""
    logging.info(" " * (4 * indent_level) + str(msg))
41 |
42 |
def log_warning(msg, indent_level=0):
    """Log *msg* at WARNING level, indented four spaces per *indent_level*."""
    logging.warning(" " * (4 * indent_level) + str(msg))
45 |
46 |
def log_error(msg, indent_level=0):
    """Log *msg* at ERROR level, indented four spaces per *indent_level*."""
    logging.error(" " * (4 * indent_level) + str(msg))
49 |
50 |
def log_exception(msg, indent_level=0):
    """Log *msg* at ERROR level, followed by the current traceback,
    all indented four spaces per *indent_level*."""
    indent = " " * (4 * indent_level)
    logging.error(indent + str(msg))
    for tb_line in traceback.format_exc().splitlines():
        logging.error(indent + tb_line)
55 |
56 |
def set_logger_tag(ext_id):
    """Re-format all root logger handlers so records are tagged with *ext_id*."""
    for handler in logging.getLogger().handlers:
        handler.setFormatter(logging.Formatter(const_log_format(ext_id)))
61 |
62 |
def setup_logger(verbose):
    """Configure the root logger to log to stdout (INFO if *verbose*,
    WARNING otherwise) with the default tag."""
    root = logging.getLogger()
    root.setLevel(logging.INFO if verbose else logging.WARNING)
    root.addHandler(logging.StreamHandler(sys.stdout))
    set_logger_tag("-" * 32)
75 |
--------------------------------------------------------------------------------
/PermissionAnalysis/grep-unused-permissions:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2019 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 |
20 | import argparse
21 | import io
22 | import logging
23 | import re
24 | import json
25 | import sys
26 | import csv
27 | from jsmin import jsmin
28 | import ast
29 |
30 | from zipfile import ZipFile
31 |
32 | from ExtensionCrawler.config import (const_log_format, const_basedir)
33 | from ExtensionCrawler.archive import iter_tar_entries_by_date
34 | from ExtensionCrawler.js_mincer import mince_js
35 |
36 |
def get_etag(headers_content):
    """Parse a stored headers dict literal and return its ETag (None if absent)."""
    headers = ast.literal_eval(headers_content)
    if "ETag" in headers:
        return headers["ETag"]
    return None
41 |
42 |
43 | def get_metadata(overview_contents):
44 | # Extract extension name
45 | match = re.search("""""",
46 | overview_contents)
47 | name = match.group(1) if match else None
48 |
49 | # Extract extension version
50 | match = re.search(
51 | """""", overview_contents)
52 | version = match.group(1) if match else None
53 |
54 | # Extracts extension categories
55 | match = re.search(
56 | """Attribute name="category">(.+?)""", overview_contents)
57 | categories = match.group(1).split(",") if match else []
58 |
59 | # Extracts the number of downloads
60 | match = re.search(
61 | """ 0:
103 | has_crx_file = True
104 | with ZipFile(tarfile) as zf:
105 | for zipentry in zf.infolist():
106 | if zipentry.filename.endswith(".js") or zipentry.filename.endswith(".html"):
107 | with zf.open(zipentry) as f:
108 | verbatim_lines = []
109 | for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
110 | verbatim_lines += block.content.splitlines()
111 |
112 | for permission, evidences in permission_map.items():
113 | for evidence in evidences:
114 | for line in verbatim_lines:
115 | if evidence in line:
116 | date_matches[permission] = True
117 | break
118 |
119 | if zipentry.filename == "manifest.json":
120 | with zf.open(zipentry) as m:
121 | raw_content = m.read()
122 | # There are some manifests that seem to have weird encodings...
123 | try:
124 | content = raw_content.decode("utf-8-sig")
125 | except UnicodeDecodeError:
126 | # Trying a different encoding, manifests are weird...
127 | content = raw_content.decode("latin1")
128 |
129 | manifest = json.loads(jsmin(content), strict=False)
130 | if "permissions" in manifest:
131 | for permission in manifest["permissions"]:
132 | used_permissions.add(str(permission))
133 |
134 | if has_crx_file:
135 | line = [date, crx_etag, name, version, "+".join(categories), downloads]
136 | for permission in sorted(list(permission_map.keys())):
137 | if permission in used_permissions:
138 | if date_matches[permission]:
139 | line += ["REQ_AND_FOUND"]
140 | else:
141 | line += ["REQ_AND_NOT_FOUND"]
142 | else:
143 | if date_matches[permission]:
144 | line += ["NOT_REQ_AND_FOUND"]
145 | else:
146 | line += ["NOT_REQ_AND_NOT_FOUND"]
147 | results += [line]
148 |
149 | for result in results:
150 | csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
151 |
152 |
def main(conf):
    """Search all extensions listed in conf.EXTID_FILE for unused permissions
    and write one CSV row per analyzed crx to stdout."""
    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stderr)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.MAP_FILE) as f:
        permission_map = json.load(f)

    with open(conf.EXTID_FILE) as f:
        csvwriter = csv.writer(sys.stdout, csv.unix_dialect)
        csvwriter.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "categories", "downloads"]
                           + sorted(list(permission_map.keys())))
        # Stream the id file line by line instead of materializing it with
        # readlines(); also drop the unused exception binding.
        for line in f:
            extid = line.strip()
            try:
                handle_extid(conf, extid, permission_map, csvwriter)
            except Exception:
                logging.exception(f"Fatal error when handling extension '{extid}'")
175 |
176 |
def build_parser():
    """Construct the command line parser for the unused-permission search."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Search extensions for unused permissions')

    # Positional arguments.
    parser.add_argument(
        'MAP_FILE',
        help='json file with permission - literal string mapping')
    parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')

    # Options.
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    parser.add_argument(
        '-D', '--latest-date',
        metavar='DATE',
        type=str,
        help='select latest crx from tar, released before DATE.\n' +
        'Together with --from-date, specifies all crx released in specified\n' +
        'date range.')
    parser.add_argument(
        '-d', '--from-date',
        metavar='DATE',
        type=str,
        help='select oldest crx from tar released after DATE.\n' +
        'Together with --latest-date, specifies all crx released in specified\n' +
        'date range.')
    parser.add_argument(
        '-a', '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')

    return parser
222 |
223 |
if __name__ == "__main__":
    # Parse command line arguments and run the search.
    sys.exit(main(build_parser().parse_args()))
230 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ExtensionCrawler
2 |
A collection of utilities for downloading and analyzing browser
extensions from the Chrome Web Store.
5 |
6 | * `crawler`: A crawler for extensions from the Chrome Web Store.
7 | * `crx-tool`: A tool for analyzing and extracting `*.crx` files
8 | (i.e., Chrome extensions). Calling `crx-tool.py .crx`
9 | will check the integrity of the extension.
10 | * `crx-extract`: A simple tool for extracting `*.crx` files from the
11 | tar-based archive hierarchy.
12 | * `crx-jsinventory`: Build a JavaScript inventory of a `*.crx` file using a
13 | JavaScript decomposition analysis.
14 | * `crx-jsstrings`: A tool for extracting code blocks, comment blocks, and
15 | string literals from JavaScript.
16 | * `create-db`: A tool for updating a remote MariaDB from already
17 | existing extension archives.
18 |
19 | The utilities store the extensions in the following directory
20 | hierarchy:
21 |
22 | ```shell
23 | archive
24 | ├── conf
25 | │ └── forums.conf
26 | ├── data
27 | │ └── ...
28 | └── log
29 | └── ...
30 | ```
31 |
The crawler downloads the most recent extension (i.e., the `*.crx`
file) as well as the overview page. In addition, the `conf` directory
34 | may contain one file, called `forums.conf` that lists the ids of
35 | extensions for which the forums and support pages should be downloaded
36 | as well. The `data` directory will contain the downloaded extensions.
37 |
38 | The `crawler` and `create-db` scripts will access and update a MariaDB.
They will use the host, database, and credentials found in `~/.my.cnf`.
40 | Since they make use of various JSON features, it is recommended to use at
41 | least version 10.2.8 of MariaDB.
42 |
43 | All utilities are written in Python 3.7. The required modules are listed
44 | in the file `requirements.txt`.
45 |
46 | ## Installation
47 |
48 | Clone and use pip3 to install as a package.
49 |
50 | ```shell
51 | git clone git@logicalhacking.com:BrowserSecurity/ExtensionCrawler.git
52 | pip3 install --user -e ExtensionCrawler
53 | ```
54 |
55 | ## Team
56 |
57 | * [Achim D. Brucker](http://www.brucker.ch/)
58 | * [Michael Herzberg](http://www.dcs.shef.ac.uk/cgi-bin/makeperson?M.Herzberg)
59 |
60 | ### Contributors
61 |
62 | * Mehmet Balande
63 |
64 | ## License
65 |
66 | This project is licensed under the GPL 3.0 (or any later version).
67 |
68 | SPDX-License-Identifier: GPL-3.0-or-later
69 |
70 | ## Master Repository
71 |
72 | The master git repository for this project is hosted by the [Software
73 | Assurance & Security Research Team](https://logicalhacking.com) at
74 | .
75 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/angular.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | from MySQLdb import cursors
3 | import os
4 | from distutils.version import LooseVersion
5 | from itertools import groupby, islice
6 | import datetime
7 | import pickle
8 |
def execute(q, args=None):
    """Execute SQL query *q* with optional parameters *args*, yielding rows.

    Result rows are memoized in a local pickle file so repeated runs of the
    script skip the database round trip.
    """
    cachepath = "mysqlcache.tmp"
    cache = {}
    if os.path.exists(cachepath):
        with open(cachepath, 'rb') as f:
            try:
                cache = pickle.load(f)
            except Exception as e:
                # Best effort: a corrupt cache file just means a fresh query.
                print(e)

    # Fix: key the cache on the query AND its parameters; keying on the
    # query text alone silently returned the wrong rows whenever the same
    # query was executed with different arguments.
    key = (q, repr(args))
    if key in cache:
        print("retrieving query results from cache...")
        for row in cache[key]:
            yield row
    else:
        print("query not in cache, contacting db ...")
        db = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), cursorclass=cursors.SSCursor)
        cursor = db.cursor()
        cursor.execute(q, args)

        # Stream rows to the caller while collecting them for the cache.
        result = []
        for row in cursor:
            result += [row]
            yield row
        cache[key] = result
        with open(cachepath, 'wb') as f:
            pickle.dump(cache, f)
        print("cache saved")
37 |
# Map each angular release to the set of normalized md5s of its main files.
vuln_md5s = {}

for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='angular.js' and (filename in ('angular.js', 'angular.min.js'))"):
    vuln_md5s.setdefault(version, set()).add(md5)

# (version, md5 set) pairs ordered newest release first.
sorted_vuln_md5s = [(library_version, vuln_md5s[library_version])
                    for library_version in sorted(vuln_md5s.keys(), key=LooseVersion)[::-1]]
48 |
49 |
def get_angular_version(md5):
    """Return the newest known angular version whose md5 set contains *md5*,
    or None if the digest is unknown."""
    for library_version, md5_set in sorted_vuln_md5s:
        if md5 in md5_set:
            return library_version
    return None
54 |
# Walk all extension updates grouped first by extension id, then by crx
# archive, and print extensions whose detected angular version changed
# between updates.
# NOTE(review): the inner loop rebinds `g`, shadowing the outer group
# iterator -- this works only because the outer group is fully consumed
# by the inner groupby before the next outer iteration.
for extid, g in groupby(execute("select extid, crx_etag, date, md5 from extension_update_most_recent join crxfile using (crx_etag) where typ='NORMALIZED' order by extid, date, crx_etag"), lambda x: x[0]):
    result = {}

    for crx_etag, g in groupby(map(lambda x: x[1:], g), lambda x: x[0]):
        result_version = None
        # Keep the highest angular version detected in any file of this crx.
        for date, md5, in map(lambda x: x[1:], g):
            version = get_angular_version(md5)
            if version is not None and (result_version is None or LooseVersion(version) > LooseVersion(result_version)):
                result_version = version
        # NOTE(review): `date` leaks from the inner loop, so each crx is
        # recorded under the date of its *last* row -- confirm intended.
        result[date] = result_version

    # Report only extensions whose detected version is not constant.
    if len(set(result.values())) > 1:
        for date in sorted(result.keys()):
            print(f"{extid}|{date}|{result[date]}")
69 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/angularversions.txt:
--------------------------------------------------------------------------------
1 | 1.7.5,2018-10-04 14:59:37 +0100
2 | 1.7.4,2018-09-07 09:57:37 +0100
3 | 1.7.3,2018-08-03 13:35:40 +0200
4 | 1.7.2,2018-06-12 16:34:38 +0300
5 | 1.7.1,2018-06-08 16:26:22 +0300
6 | 1.7.0,2018-05-11 10:31:53 +0200
7 | 1.7.0-rc.0,2018-04-19 10:07:41 +0200
8 | 1.6.10,2018-04-17 18:35:33 +0200
9 | 1.6.9,2018-02-02 11:19:32 +0100
10 | 1.6.8,2017-12-18 15:17:56 +0100
11 | 1.6.7,2017-11-24 18:44:04 +0100
12 | 1.6.6,2017-08-18 15:12:44 +0200
13 | 1.6.5,2017-07-03 22:34:52 +0300
14 | 1.6.4,2017-03-31 10:48:25 +0200
15 | 1.6.3,2017-03-08 12:44:24 +0100
16 | 1.6.2,2017-02-05 17:58:25 +0200
17 | 1.6.1,2016-12-23 10:38:58 +0000
18 | 1.6.0,2016-12-08 11:07:52 +0000
19 | 1.6.0-rc.2,2016-11-24 21:30:56 +0000
20 | 1.6.0-rc.1,2016-11-21 13:27:47 +0000
21 | 1.6.0-rc.0,2016-10-27 20:28:09 +0100
22 | 1.5.11,2017-01-12 11:22:40 +0200
23 | 1.5.10,2016-12-16 12:27:04 +0200
24 | 1.5.9,2016-11-24 09:27:57 +0000
25 | 1.5.8,2016-07-22 16:01:46 +0100
26 | 1.5.7,2016-06-14 08:08:25 -0700
27 | 1.5.6,2016-05-25 17:00:13 +0100
28 | 1.5.5,2016-04-15 14:09:39 +0100
29 | 1.5.4,2016-04-14 09:13:48 +0100
30 | 1.5.3,2016-03-25 20:01:45 +0000
31 | 1.5.2,2016-03-18 15:37:43 -0700
32 | 1.5.1,2016-03-14 14:45:29 +0000
33 | 1.5.0,2016-02-05 10:04:17 +0000
34 | 1.5.0-rc.2,2016-01-28 09:51:01 +0000
35 | 1.5.0-rc.1,2016-01-15 20:31:08 +0000
36 | 1.5.0-rc.0,2015-12-09 13:50:58 +0000
37 | 1.5.0-beta.2,2015-11-17 15:57:27 -0800
38 | 1.5.0-beta.1,2015-09-29 13:59:34 -0700
39 | 1.5.0-beta.0,2015-09-17 13:42:10 +0100
40 | 1.4.14,2016-10-11 14:11:08 +0100
41 | 1.4.13,2016-10-10 22:02:52 +0100
42 | 1.4.12,2016-06-07 10:44:56 +0200
43 | 1.4.11,2016-05-24 16:44:11 +0200
44 | 1.4.10,2016-03-14 17:27:49 -0400
45 | 1.4.9,2016-01-20 10:11:04 -0800
46 | 1.4.8,2015-11-19 14:52:56 -0800
47 | 1.4.7,2015-09-29 13:54:51 -0700
48 | 1.4.6,2015-09-14 22:43:55 +0200
49 | 1.4.5,2015-08-28 12:06:35 -0700
50 | 1.4.4,2015-08-13 11:15:10 -0700
51 | 1.4.3,2015-07-14 18:26:10 -0700
52 | 1.4.2,2015-07-02 14:36:49 +0300
53 | 1.4.1,2015-06-15 20:50:59 +0200
54 | 1.4.0,2015-05-26 17:34:50 -0700
55 | 1.4.0-rc.2,2015-05-07 14:33:28 -0700
56 | 1.4.0-rc.1,2015-04-24 11:26:10 -0700
57 | 1.4.0-rc.0,2015-04-10 10:44:35 -0700
58 | 1.4.0-beta.6,2015-03-15 21:00:39 +0000
59 | 1.4.0-beta.5,2015-02-24 17:22:13 +0000
60 | 1.4.0-beta.4,2015-02-07 10:26:21 +0000
61 | 1.4.0-beta.3,2015-02-03 19:46:22 +0100
62 | 1.4.0-beta.2,2015-01-26 14:50:48 -0800
63 | 1.4.0-beta.1,2015-01-20 19:42:59 +0100
64 | 1.4.0-beta.0,2015-01-14 20:44:32 +0000
65 | 1.2.32,2016-10-11 13:48:38 +0100
66 | 1.2.31,2016-10-11 07:48:26 +0100
67 | 1.2.30,2016-07-20 23:17:37 +0300
68 | 1.2.29,2015-09-29 13:18:52 -0700
69 | 1.2.28,2014-12-13 21:28:02 -0500
70 | 1.2.27,2014-11-20 14:34:26 -0800
71 | 1.2.26,2014-10-02 09:46:40 -0700
72 | 1.2.25,2014-09-16 15:05:22 -0700
73 | 1.2.24,2014-09-09 16:21:16 -0700
74 | 1.2.23,2014-08-22 15:56:49 -0700
75 | 1.2.22,2014-08-11 17:04:40 +0100
76 | 1.2.21,2014-07-25 09:01:43 -0700
77 | 1.2.20,2014-07-11 11:26:39 -0700
78 | 1.2.19,2014-06-30 16:58:15 -0700
79 | 1.2.18,2014-06-13 13:55:33 -0700
80 | 1.2.17,2014-06-06 20:13:16 +0100
81 | 1.2.16,2014-04-03 14:42:19 -0700
82 | 1.2.15,2014-03-21 14:58:48 -0700
83 | 1.3.20,2015-09-29 13:54:03 -0700
84 | 1.3.19,2015-09-15 13:34:09 +0100
85 | 1.3.18,2015-08-18 15:14:56 -0700
86 | 1.3.17,2015-07-01 12:16:14 -0700
87 | 1.3.16,2015-06-05 13:29:27 -0700
88 | 1.3.15,2015-03-15 21:01:49 +0000
89 | 1.3.14,2015-02-24 17:22:45 +0000
90 | 1.3.13,2015-02-07 19:21:53 +0100
91 | 1.3.12,2015-02-02 14:03:17 +0000
92 | 1.3.11,2015-01-26 14:20:52 -0800
93 | 1.3.10,2015-01-20 19:31:56 +0100
94 | 1.3.9,2015-01-13 14:29:29 -0500
95 | 1.3.8,2014-12-19 13:22:00 -0800
96 | 1.3.7,2014-12-15 13:46:21 +0000
97 | 1.3.6,2014-12-08 16:29:39 -0500
98 | 1.3.5,2014-12-01 19:54:14 +0100
99 | 1.3.4,2014-11-25 00:05:18 +0100
100 | 1.3.3,2014-11-17 09:32:21 -0800
101 | 1.3.2,2014-11-07 13:22:01 -0500
102 | 1.3.1,2014-10-31 12:28:58 -0400
103 | 1.3.0,2014-10-13 15:27:20 -0700
104 | 1.3.0-rc.5,2014-10-08 15:51:30 -0700
105 | 1.3.0-rc.4,2014-10-01 17:37:40 -0700
106 | 1.3.0-rc.3,2014-09-23 18:47:24 -0700
107 | 1.3.0-rc.2,2014-09-16 14:52:25 -0700
108 | 1.3.0-rc.1,2014-09-09 15:45:51 -0700
109 | 1.3.0-rc.0,2014-08-29 21:22:46 -0400
110 | 1.3.0-beta.19,2014-08-22 15:57:26 -0700
111 | 1.3.0-beta.18,2014-08-11 16:54:40 +0100
112 | 1.3.0-beta.17,2014-07-25 16:37:53 +0100
113 | 1.3.0-beta.16,2014-07-18 12:18:26 -0700
114 | 1.3.0-beta.15,2014-07-11 11:15:42 -0700
115 | 1.3.0-beta.14,2014-06-30 09:52:32 -0700
116 | 1.3.0-beta.13,2014-06-16 10:47:09 -0700
117 | 1.3.0-beta.12,2014-06-13 13:41:18 -0700
118 | 1.3.0-beta.11,2014-06-06 20:22:50 +0100
119 | 1.3.0-beta.10,2014-05-23 15:08:36 -0700
120 | 1.3.0-beta.9,2014-05-16 15:14:12 -0700
121 | 1.3.0-beta.8,2014-05-09 14:42:26 +0100
122 | 1.3.0-beta.7,2014-04-25 15:00:17 -0700
123 | 1.3.0-beta.6,2014-04-21 15:57:08 -0700
124 | 1.3.0-beta.5,2014-04-03 14:46:15 -0700
125 | 1.3.0-beta.4,2014-03-28 17:43:17 -0400
126 | 1.3.0-beta.3,2014-03-21 11:16:35 -0700
127 | 1.3.0-beta.2,2014-03-14 16:26:40 -0700
128 | 1.3.0-beta.1,2014-03-07 16:23:14 -0800
129 | 1.2.14,2014-03-01 09:51:19 -0800
130 | 1.2.13,2014-02-14 16:41:02 -0800
131 | 1.2.12,2014-02-07 17:00:28 -0500
132 | 1.2.11,2014-02-03 09:40:03 -0800
133 | 1.2.10,2014-01-24 15:28:28 -0800
134 | 1.2.9,2014-01-15 10:02:10 -0800
135 | 1.2.8,2014-01-10 12:37:49 -0800
136 | 1.2.7,2014-01-03 10:28:30 -0800
137 | 1.2.6,2013-12-19 15:50:07 -0800
138 | 1.2.5,2013-12-13 10:52:13 -0800
139 | 1.2.4,2013-12-06 13:14:56 -0500
140 | 1.2.3,2013-11-27 10:04:59 +0000
141 | 1.2.2,2013-11-22 09:05:42 -0800
142 | 1.2.1,2013-11-14 22:33:20 -0800
143 | 1.2.0,2013-11-08 09:40:09 -0800
144 | 1.2.0-rc.3,2013-10-14 10:36:23 -0700
145 | 1.2.0-rc.2,2013-09-04 14:50:39 +0200
146 | 1.2.0rc1,2013-08-13 11:50:32 -0700
147 | 1.1.5,2013-05-22 01:05:11 -0700
148 | 1.1.4,2013-04-03 18:54:52 -0700
149 | 1.1.3,2013-02-20 12:54:44 -0800
150 | 1.1.2,2013-01-23 10:54:35 -0800
151 | 1.1.1,2012-11-27 01:45:35 +0100
152 | 1.1.0,2012-09-04 11:11:09 -0700
153 | 1.0.8,2013-08-22 11:20:23 -0700
154 | 1.0.7,2013-05-22 01:05:53 -0700
155 | 1.0.6,2013-04-04 10:48:05 -0700
156 | 1.0.5,2013-02-20 12:58:02 -0800
157 | 1.0.4,2013-01-23 10:57:51 -0800
158 | 1.0.3,2012-11-27 01:44:46 +0100
159 | 1.0.2,2012-09-04 11:08:40 -0700
160 | 1.0.1,2012-06-25 09:30:57 -0700
161 | 1.0.0,2012-06-14 10:50:22 -0700
162 | 1.0.0rc12,2012-06-12 01:46:02 -0700
163 | 1.0.0rc11,2012-06-11 00:03:01 -0700
164 | 1.0.0rc10,2012-05-23 21:05:21 -0700
165 | 1.0.0rc9,2012-05-14 22:13:15 -0700
166 | 1.0.0rc8,2012-05-07 00:09:20 -0700
167 | 1.0.0rc7,2012-04-30 16:32:45 -0700
168 | 1.0.0rc6,2012-04-20 15:06:39 -0700
169 | 1.0.0rc5,2012-04-12 03:56:28 -0700
170 | 1.0.0rc4,2012-04-05 11:46:36 -0700
171 | 1.0.0rc3,2012-03-29 16:10:40 -0700
172 | 1.0.0rc2,2012-03-20 15:38:57 -0700
173 | g3-v1.0.0rc1,2012-03-16 12:06:29 -0700
174 | 1.0.0rc1,2012-03-14 01:00:46 -0700
175 | 0.10.6,2012-01-17 13:54:18 -0800
176 | 0.10.5,2011-11-08 04:29:07 -0800
177 | 0.10.4,2011-10-22 21:39:39 -0700
178 | 0.10.3,2011-10-14 08:31:39 -0700
179 | 0.10.2,2011-10-08 09:18:19 -0700
180 | 0.10.1,2011-09-09 01:01:46 -0700
181 | 0.10.0,2011-09-02 11:32:29 -0700
182 | 0.9.19,2011-08-21 01:12:34 -0700
183 | 0.9.18,2011-07-29 16:30:24 -0700
184 | 0.9.17,2011-06-30 09:10:59 -0700
185 | 0.9.16,2011-06-07 16:11:01 -0700
186 | 0.9.15,2011-04-11 14:23:26 -0700
187 | 0.9.14,2011-04-01 12:26:04 -0700
188 | 0.9.13,2011-03-13 22:48:26 -0700
189 | 0.9.12,2011-03-03 23:14:43 -0800
190 | 0.9.11,2011-02-08 17:47:31 -0800
191 | 0.9.10,2011-01-26 23:51:06 -0800
192 | 0.9.9,2011-01-13 22:08:27 -0800
193 | 0.9.7,2010-12-10 17:08:52 -0800
194 | 0.9.6,2010-12-06 21:11:10 -0800
195 | 0.9.5,2010-11-25 10:11:26 -0800
196 | 0.9.4,2010-11-18 22:40:01 -0800
197 | 0.9.3,2010-11-10 22:15:16 -0800
198 | 0.9.2,2010-11-03 13:06:45 -0700
199 | 0.9.1,2010-10-26 22:18:25 -0700
200 | 0.9.0,2010-10-20 15:51:36 -0700
201 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/ideas.txt:
--------------------------------------------------------------------------------
1 | start with current version & never update
2 | start with outdated version & never update
3 | update frequently
4 | downgrade
5 |
6 |
angular is a transitive dependency (pulled in by another library)
angular is a direct (own) dependency of the extension
9 |
--------------------------------------------------------------------------------
/analysis/library-detector/angular/plotting.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime
3 | from dateutil import parser
4 | from distutils.version import LooseVersion
5 |
6 |
7 | import numpy as np
8 | from matplotlib import pyplot as plt
9 | import matplotlib.patches as mpatches
10 |
def get_cmap(n, name='hsv'):
    """Return a callable mapping each index in 0..n-1 to a distinct RGB color.

    The keyword argument `name` must be the name of a standard matplotlib
    colormap.
    """
    cmap = plt.cm.get_cmap(name, n)
    return cmap
15 |
16 |
plt.figure(figsize=(20, 100))

# Parse the per-extension version history (CSV: extid,timestamp,version).
# Only the first 5000 lines are read to keep the plot manageable.
data = {}
with open(sys.argv[1]) as f:
    for line in f.readlines()[0:5000]:
        line = line.strip()
        extid, ts, vers = line.split(",")
        if extid not in data:
            data[extid] = {}
        data[extid][parser.parse(ts).date()] = vers

# Observation window of the crawl; x-axis is days since `startdate`.
startdate = datetime.date(year=2017, month=2, day=1)
enddate = datetime.date(year=2018, month=12, day=13)
NOT_IN_STORE = "NO DATA"



# Convert each extension's {date: version} map into a sorted list of
# (days-since-startdate, version) tuples, starting with a NO DATA marker.
converted_data = {}
versions = set()
for extid, tups in data.items():
    days_version_tups = [(0, NOT_IN_STORE)]
    for ts, vers in sorted(tups.items()):
        if vers != "None":
            versions.add(vers)
        #if vers != days_version_tups[-1][1]:
        days_version_tups += [((ts - startdate).days, vers)]
    converted_data[extid] = days_version_tups

# Add a synthetic row showing the official Angular release dates
# (second CLI argument: CSV of version,release-timestamp).
converted_data["angular_updates"] = [(0, NOT_IN_STORE)]
version_release = {}
with open(sys.argv[2]) as f:
    for line in f.readlines():
        line = line.strip()
        vers, ts_str = line.split(",")
        ts = parser.parse(ts_str).date()
        version_release[vers] = ts
        if startdate < ts and ts < enddate:
            converted_data["angular_updates"] += [((ts - startdate).days, vers)]

converted_data["angular_updates"].sort()


# Assign each observed version a color on the jet colormap, ordered by
# release date.  NOTE(review): versions absent from the release file make
# version_release.get return None, which breaks sorted() on Python 3, and
# the division fails if only one version was observed -- confirm inputs.
colors = {}
for i, version in enumerate(sorted(versions, key=version_release.get)):
    #colors[version] = get_cmap(len(versions))(i)
    colors[version] = plt.cm.jet(1. * i / ((len(versions)) - 1))
for version, color in colors.items():
    print(f"{version}: {color}")

bottoms = np.arange(len(converted_data))

# Order the rows so extensions with similar histories sit together.
sorted_data = sorted(list(converted_data.items()), key=lambda x: min(map(lambda y: y[1], x[1])))

# Draw one horizontal bar segment per (interval, version) per extension.
# The last segment of each row extends to the end of the window.
for i in range(len(converted_data.items())):
    extid, tups = sorted_data[i]
    for j in range(len(tups)):
        days, vers = tups[j]
        if j + 1 == len(tups):
            next_days = (enddate - startdate).days
        else:
            next_days = tups[j + 1][0]
        print(f"{extid}: {days}")
        #print(f"{vers} and {colors[vers]}")
        # Versions without a color (e.g. NO DATA) are drawn white.
        color = "w"
        if vers in colors:
            color = colors[vers]
        plt.bar(days, 0.8, width=next_days - days, bottom=bottoms[i],
                color=color, orientation="horizontal", label=vers, linewidth=1, edgecolor="black")
plt.yticks(bottoms, map(lambda x: x[0], sorted(list(converted_data.items()), key=lambda x: min(map(lambda y: y[1], x[1])))))

# One legend patch per version, sorted by version number.
patchList = []
for version, color in sorted(colors.items(), key=lambda x: LooseVersion(x[0])):
    data_key = mpatches.Patch(color=color, label=version)
    patchList.append(data_key)

plt.legend(handles=patchList, loc="best", bbox_to_anchor=(1.0, 1.00))


plt.subplots_adjust(right=0.85)
plt.savefig("out.pdf")
97 |
--------------------------------------------------------------------------------
/analysis/library-detector/jquery.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | from MySQLdb import cursors
3 | import os
4 | from distutils.version import LooseVersion
5 | from itertools import groupby, islice
6 | import datetime
7 | import pickle
8 |
def execute(q, args=None):
    """Run SQL query `q` (with optional parameters `args`) and yield rows.

    Results are memoized on disk in `mysqlcache.tmp`, a pickled
    {query: [rows]} dict, so repeated analysis runs do not hit the
    database again.  On a cache hit, rows come straight from the file;
    on a miss, the query runs against MySQL (credentials from
    ~/.my.cnf) and the full result list is appended to the cache.
    """
    cachepath = "mysqlcache.tmp"
    cache = {}
    if os.path.exists(cachepath):
        with open(cachepath, 'rb') as f:
            try:
                cache = pickle.load(f)
            except Exception as e:
                # Best effort: a corrupt or unreadable cache is ignored.
                print(e)

    if q in cache:
        print("retrieving query results from cache...")
        for row in cache[q]:
            yield row
    else:
        print("query not in cache, contacting db ...")
        db = MySQLdb.connect(read_default_file=os.path.expanduser("~/.my.cnf"), cursorclass=cursors.SSCursor)
        try:
            cursor = db.cursor()
            try:
                cursor.execute(q, args)
                result = []
                for row in cursor:
                    result += [row]
                    yield row
            finally:
                cursor.close()
        finally:
            # Bug fix: the connection and server-side cursor were never
            # closed before, leaking the connection even when the consumer
            # abandoned the generator early.
            db.close()
        cache[q] = result
        with open(cachepath, 'wb') as f:
            pickle.dump(cache, f)
        print("cache saved")
37 |
# MD5s of normalized angular.js files with versions known vulnerable
# (everything before 1.6.9).  The commented-out block is the equivalent
# jquery query (< 1.6.3); despite the file name, the active analysis
# targets angular.js.
vuln_md5s = set()

# for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='jquery'"):
# if LooseVersion(version) < LooseVersion('1.6.3'):
# vuln_md5s.add(md5)
for version, md5 in execute("select version, md5 from cdnjs where typ='NORMALIZED' and path like '%.js' and library='angular.js'"):
    if LooseVersion(version) < LooseVersion('1.6.9'):
        vuln_md5s.add(md5)
print(f"found {len(vuln_md5s)} MD5s")

# Walk each extension's update history and report (a) extensions that
# shipped a vulnerable file and later updated to a clean crx, and
# (b) extensions still vulnerable near the end of the crawl.
hits = 0
still_vuln = 0
for extid, g in groupby(execute("select extid, crx_etag, date, md5 from extension_update_most_recent join crxfile using (crx_etag) where typ='NORMALIZED' order by extid, date, crx_etag"), lambda x: x[0]):
    ext_is_vuln = False
    # NOTE(review): `g` is rebound here; safe because each inner group is
    # consumed before the outer groupby advances.
    for crx_etag, g in groupby(map(lambda x: x[1:], g), lambda x: x[0]):
        is_vuln = False
        for date, md5, in map(lambda x: x[1:], g):
            if md5 in vuln_md5s:
                is_vuln = True
                break

        # A previously-vulnerable extension whose current crx contains no
        # vulnerable file counts as "fixed".
        if not is_vuln and ext_is_vuln:
            print(f"{extid} got fixed in {crx_etag} on {date}!")
            hits += 1
        ext_is_vuln = is_vuln
        # Still vulnerable if a vulnerable crx was seen after 2018-11-14.
        if is_vuln and date > datetime.datetime(year=2018, month=11, day=14):
            print(f"{extid} in {crx_etag} is still vulnerable as of {date}")
            still_vuln += 1

print(f"# fixes: {hits}")
print(f"# still vulnerable: {still_vuln}")
69 |
70 |
--------------------------------------------------------------------------------
/cdnjs-git-miner:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """ Tool for mining the cdnjs git repository"""
20 |
21 | import getopt
22 | import logging
23 | import sys
24 | import os
25 |
26 | from ExtensionCrawler.config import (const_log_format, const_basedir)
27 | from ExtensionCrawler.cdnjs_git import (pull_and_update_db, update_db_all_libs,
28 | update_db_from_listfile)
29 |
30 |
def helpmsg():
    """Print the command line usage of cdnjs-git-miner."""
    print("cdnjs-git-miner [OPTION]")
    print(
        " -i initialize/update database with all libraries in the repository"
    )
    print(" -u update: pull repository and update database")
    # Typo fix: "recusively" -> "recursively".
    print(
        " -l read list of libraries to update from file (recursively)"
    )
    print(" -n process chunk n where n in [1,N]")
    print(" -N ")
    print(" -v verbose")
    print(
        " -c print csv format to stdout instead of writing to database"
    )
    print(" -a= archive directory")
    print(" -h print this help text")
49 |
50 |
def main(argv):
    """Entry point: parse options, configure logging, and run the requested
    cdnjs mining actions (full initialization, pull-and-update, and/or
    list-file driven update)."""
    basedir = const_basedir()
    verbose = False
    initialize = False
    update = False
    taskid = 1
    listfile = None
    maxtaskid = 1
    csv = False

    try:
        opts, _ = getopt.getopt(argv, "hvicl:ua:p:n:N:", [
            "archive=", "listupdate=", "taskid=", "maxtaskid="
        ])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-v':
            verbose = True
        elif opt == '-i':
            initialize = True
        elif opt == '-u':
            update = True
        elif opt == '-c':
            csv = True
        elif opt in ("-l", "--listupdate"):
            listfile = arg
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)

    # Root logger writes to stdout using the project-wide log format.
    loglevel = logging.INFO if verbose else logging.WARNING
    root_logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter(const_log_format()))
    root_logger.addHandler(handler)
    root_logger.setLevel(loglevel)

    cdnjs_git_path = os.path.join(basedir, "filedb", "cdnjs-git")

    if initialize:
        logging.info("Starting update of all db libs")
        update_db_all_libs(cdnjs_git_path, csv, taskid, maxtaskid)
        logging.info("Finished update of all db libs")
    if update:
        logging.info("Starting update of new db libs")
        pull_and_update_db(cdnjs_git_path, csv)
        logging.info("Finished update of new db libs")
    if listfile is not None:
        logging.info("Starting update from list file")
        update_db_from_listfile(cdnjs_git_path, listfile, csv)
        logging.info("Finished update from list file")

    logging.info("Successfully updated cdnjs table")


if __name__ == "__main__":
    main(sys.argv[1:])
121 |
--------------------------------------------------------------------------------
/crawler:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016-2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """
20 | A crawler for extensions from the Chrome Web Store.
21 | """
22 |
23 | import sys
24 | import datetime
25 | import time
26 | import getopt
27 | import logging
28 | import itertools
29 | import multiprocessing
30 | from functools import reduce
31 | from ExtensionCrawler.discover import get_new_ids
32 | from ExtensionCrawler.archive import get_forum_ext_ids, get_existing_ids, update_extensions
33 | from ExtensionCrawler.config import *
34 | from ExtensionCrawler.util import log_info, log_exception, setup_logger
35 |
36 |
def write_log(dirname, fname, text):
    """Write *text* to the file *fname* inside directory *dirname*.

    The directory is created if missing; colons in the file name are
    replaced by underscores so the name stays portable.
    """
    os.makedirs(dirname, exist_ok=True)
    safe_name = fname.replace(":", "_")
    with open(os.path.join(dirname, safe_name), 'w') as out:
        out.write(text)
43 |
44 |
def log_failures_to_file(dirname, today, res):
    """Log failures during download/update in the log directory dirname.

    Writes one log file per result category (named <today><suffix>),
    each containing the sorted extension ids matching that category,
    one id per line.
    """
    # (log-file suffix, predicate selecting the extensions to list).
    # Refactored from ten copy-pasted write_log pairs into one table.
    categories = [
        ("-not-authorized.log", lambda x: x.not_authorized()),
        ("-updated.log", lambda x: x.is_ok() and not x.not_modified()),
        ("-raised-exception.log", lambda x: x.has_exception()),
        ("-raised-ddos.log", lambda x: x.raised_google_ddos()),
        ("-not-in-store.log", lambda x: x.not_in_store()),
        ("-new-in-store.log", lambda x: x.is_new()),
        ("-file-corruption.log", lambda x: x.corrupt_tar()),
        ("-sql-exception.log", lambda x: x.sql_exception()),
        # worker_exception is an attribute (not a method); kept truthy-tested
        # exactly as before.
        ("-worker-exception.log", lambda x: x.worker_exception),
        ("-sql-not-updated.log", lambda x: not x.sql_success()),
    ]
    for suffix, pred in categories:
        ids = "\n".join(sorted([x.ext_id for x in res if pred(x)]))
        write_log(dirname, today + suffix, ids)
76 |
77 |
def log_summary(res, runtime=0):
    """Log brief result summary.

    :param res: list of per-extension update results (each offering the
        is_ok/not_modified/... predicates used below)
    :param runtime: total runtime in seconds, printed as H:MM:SS
    """

    corrupt_tar_archives = list(filter(lambda x: x.corrupt_tar(), res))

    # One counter per result category; the categories are not mutually
    # exclusive (e.g. an extension can be both "ok" and "not modified").
    log_info("Summary:")
    log_info("  Updated {} out of {} extensions successfully".format(str(len(list(filter(lambda x: x.is_ok(), res)))),
                                                                     str(len(res))))
    log_info("  Updated extensions: {:8d}".format(
        len(list(filter(lambda x: x.is_ok() and not x.not_modified(), res)))))
    log_info("  Updated SQL databases: {:8d}".format(len(list(filter(lambda x: x.sql_success(), res)))))
    log_info("  New extensions: {:8d}".format(len(list(filter(lambda x: x.is_new(), res)))))
    log_info("  Not authorized: {:8d}".format(len(list(filter(lambda x: x.not_authorized(), res)))))
    log_info("  Raised Google DDOS: {:8d}".format(len(list(filter(lambda x: x.raised_google_ddos(), res)))))
    log_info("  Not modified archives: {:8d}".format(len(list(filter(lambda x: x.not_modified(), res)))))
    log_info("  Extensions not in store: {:8d}".format(len(list(filter(lambda x: x.not_in_store(), res)))))
    log_info("  Unknown exception: {:8d}".format(len(list(filter(lambda x: x.has_exception(), res)))))
    log_info("  Corrupt tar archives: {:8d}".format(len(corrupt_tar_archives)))
    log_info("  SQL exception: {:8d}".format(len(list(filter(lambda x: x.sql_exception(), res)))))
    log_info(
        "  Worker exception: {:8d}".format(len(list(filter(lambda x: x.worker_exception is not None, res)))))
    log_info("  Total runtime: {}".format(str(datetime.timedelta(seconds=int(runtime)))))

    # Corrupt archives get an extra detailed listing with the exception text.
    if corrupt_tar_archives:
        log_info("")
        log_info("List of extensions with corrupted files/archives:")
        for x in corrupt_tar_archives:
            log_info("{}: {}".format(x.ext_id, x.exception), 1)
        log_info("")
107 |
108 |
def helpmsg():
    """Print help message."""
    usage = (
        "crawler [OPTION]",
        " -h print this help text",
        " -s silent (no log messages)",
        " -d discover new extensions",
        " -p number of concurrent downloads",
        " -a archive directory",
        " -t timeout for an individual extension download",
        " --max-discover discover at most N new extensions",
        " --pystuck start pystuck server for all processes",
    )
    for line in usage:
        print(line)
121 |
122 |
def print_config(basedir, archive_dir, conf_dir, discover, parallel,
                 ext_timeout, start_pystuck):
    """Log the current crawler configuration, one setting per line."""
    settings = [
        ("  Base dir: {}", basedir),
        ("  Archive directory: {}", archive_dir),
        ("  Configuration directory: {}", conf_dir),
        ("  Discover new extensions: {}", discover),
        ("  Max num. of concurrent downloads: {}", parallel),
        ("  Download timeout: {}", ext_timeout),
        ("  Start PyStuck: {}", start_pystuck),
    ]
    log_info("Configuration:")
    for template, value in settings:
        log_info(template.format(value))
134 |
135 |
def parse_args(argv):
    """Parse command line arguments.

    Returns the tuple (basedir, parallel, verbose, discover,
    max_discover, ext_timeout, start_pystuck); defaults come from the
    project configuration constants.
    """
    basedir = const_basedir()
    parallel = const_parallel_downloads()
    verbose = const_verbose()
    discover = const_discover()
    ext_timeout = const_ext_timeout()
    max_discover = None
    start_pystuck = False

    try:
        opts, _ = getopt.getopt(
            argv, "hsda:p:t:",
            ["timeout=", "archive=", 'parallel=', 'max-discover=', 'pystuck'])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-s':
            verbose = False
        elif opt == '-d':
            discover = True
        elif opt == '--pystuck':
            start_pystuck = True
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-p", "--parallel"):
            parallel = int(arg)
        elif opt in ("-t", "--timeout"):
            ext_timeout = int(arg)
        elif opt == '--max-discover':
            # Implies discovery with an upper bound on new ids.
            discover = True
            max_discover = int(arg)

    return basedir, parallel, verbose, discover, max_discover, ext_timeout, start_pystuck
172 |
173 |
def main(argv):
    """Main function of the extension crawler.

    Parses CLI options, prepares the archive/conf/log directory layout,
    optionally discovers new extension ids, downloads/updates all known
    extensions (retrying unknown failures once), and writes summary and
    per-category failure logs.
    """

    today = datetime.datetime.now(datetime.timezone.utc).isoformat()
    basedir, parallel, verbose, discover, max_discover, ext_timeout, start_pystuck = parse_args(argv)

    setup_logger(verbose)

    # Optional remote-debugging server for the main process.
    if start_pystuck:
        import pystuck
        pystuck.run_server(port=10000)

    # Surpressing these "Starting HTTPS connection ..." log messages
    # Older versions of requests use loglevel INFO for that, newer ones DEBUG
    logging.getLogger("requests").setLevel(logging.WARNING)

    # Directory layout: <basedir>/data, <basedir>/conf, <basedir>/log/<YYYY-MM>.
    archive_dir = os.path.join(basedir, "data")
    os.makedirs(archive_dir, exist_ok=True)
    conf_dir = os.path.join(basedir, "conf")
    os.makedirs(conf_dir, exist_ok=True)
    # Ensure the forums config file exists (touch).
    open(os.path.join(conf_dir, "forums.conf"), 'a').close()
    log_dir = os.path.join(basedir, "log", datetime.datetime.today().strftime("%Y-%m"))
    os.makedirs(log_dir, exist_ok=True)

    start_time = time.time()

    print_config(basedir, archive_dir, conf_dir, discover, parallel,
                 ext_timeout, start_pystuck)

    # Known ids = already-archived ids plus ids from the forum config.
    forum_ext_ids = get_forum_ext_ids(conf_dir)
    known_ids = list(set(get_existing_ids(archive_dir)) | set(forum_ext_ids))
    discovered_ids = []
    if discover:
        log_info("Discovering new ids {}...".format(
            "(at most {}) ".format(max_discover) if max_discover is not None else ""))
        try:
            discovered_ids = list(get_new_ids(known_ids, max_discover))
        except Exception:
            # Discovery is best effort; continue with the known ids.
            log_exception("Exception when discovering new ids")
        log_info("Discovered {} new extensions".format(len(discovered_ids)), 1)

    ext_ids = list(set(discovered_ids) | set(known_ids))

    # Drop the (possibly large) intermediate id lists.
    discovered_ids = None
    known_ids = None

    res = update_extensions(archive_dir, parallel, forum_ext_ids, ext_ids, ext_timeout, verbose, start_pystuck)

    # We re-try (once) the extensions with unknown exceptions, as
    # they are often temporary
    has_exception = list(filter(lambda x: x.has_exception(), res))
    if has_exception:
        log_info(" {} extensions with unknown exceptions, start another try ...".format(str(len(has_exception))))
        has_exception_ids = [x.ext_id for x in has_exception]
        forum_ext_ids_except = list(
            set(forum_ext_ids).intersection(set(has_exception_ids)))
        ext_ids_except = sorted(
            list(set(has_exception_ids) - set(forum_ext_ids_except)))
        res_update = update_extensions(archive_dir, parallel,
                                       forum_ext_ids_except, ext_ids_except, ext_timeout, verbose, start_pystuck)
        # Replace the failed results with the retry results.
        res = list(set(res) - set(has_exception)) + res_update

    end_time = time.time()
    log_summary(res, int(end_time - start_time))
    log_failures_to_file(log_dir, today, res)


if __name__ == "__main__":
    main(sys.argv[1:])
243 |
--------------------------------------------------------------------------------
/create-db:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016,2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see .
17 | #
18 |
19 | import getopt
20 | import sys
21 | import tarfile
22 | import time
23 | import tempfile
24 | from functools import partial
25 | import fnmatch
26 | import multiprocessing
27 | from pebble import ProcessPool
28 | import os
29 | import datetime
30 |
31 | from ExtensionCrawler.archive import update_db_incremental
32 | from ExtensionCrawler.config import archive_file, const_basedir, const_mysql_config_file
33 | from ExtensionCrawler.util import log_info, log_exception, setup_logger, set_logger_tag
34 |
35 | from ExtensionCrawler.dbbackend.mysql_backend import MysqlBackend
36 |
37 |
def print_help():
    """Print the command line usage of create-db."""
    print("""create-db [OPTION]""")
    print(""" -h print this help text""")
    print(""" -a archive directory""")
    print(""" -p three-letter-prefix""")
    print(""" -e file with extension ids""")
    # Typo fix: "lexographically" -> "lexicographically" (both lines).
    print(""" --from-date only process information gathered after"""
          """ this date (compared lexicographically)""")
    print(""" --until-date only process information gathered before"""
          """ this date (compared lexicographically)""")
    print(""" -t number of parallel threads""")
    print(""" -n process chunk n where n in [1,N]""")
    print(""" -N """)
    print(""" --delayed uses INSERT DELAYED INTO statements""")
52 |
def init_process(verbose):
    """Worker-process initializer for the ProcessPool.

    With a non-fork start method (forkserver is used in main), logging
    configuration is not inherited and must be redone per worker.
    """
    # When not using fork, we need to setup logging again in the worker threads
    setup_logger(verbose)
56 |
def process_id(from_date, until_date, delayed, path):
    """Unpack one extension tar archive and load its crawl dates into MySQL.

    :param from_date: skip dates strictly before this (lexicographic), or None
    :param until_date: skip dates strictly after this (lexicographic), or None
    :param delayed: use INSERT DELAYED statements in the MySQL backend
    :param path: path to the extension's .tar (or .tar.xz) archive
    """
    start = time.time()
    with tempfile.TemporaryDirectory() as tmpdir:
        with tarfile.open(path) as t:
            # NOTE(review): extractall on an untrusted tar is path-traversal
            # prone; archives here are produced by our own crawler.
            t.extractall(tmpdir)

        # The archive contains one top-level directory named after the
        # extension id.
        extid = os.listdir(tmpdir)[0]
        set_logger_tag(extid)
        log_info("Start processing extension", 0)
        iddir = os.path.join(tmpdir, extid)

        try:
            with MysqlBackend(
                    extid,
                    delayed=delayed,
                    cache_etags=True,
                    read_default_file=const_mysql_config_file(),
                    charset='utf8mb4') as con:
                # One subdirectory per crawl date; process chronologically.
                for date in sorted(os.listdir(iddir)):
                    if (from_date is not None and date < from_date) or \
                            (until_date is not None and date > until_date):
                        log_info("* Skipping {}".format(date), 2)
                        continue
                    try:
                        update_db_incremental(iddir, extid, date, con)
                    except Exception:
                        # A single bad date must not abort the whole extension.
                        log_exception("Exception when handling data from {}".format(date), 0)
        except Exception:
            log_exception("Exception when handling extension", 0)
        log_info("Finished extension in {}".format(str(datetime.timedelta(seconds=int(time.time() - start)))), 0)
87 |
88 |
def find(archive, pattern):
    """Yield paths of extension tar archives under <archive>/data that
    match *pattern*.

    Matches both plain ``<pattern>.tar`` files and xz-compressed
    three-digit split parts (``<pattern>.NNN.tar.xz``).
    """
    data_root = os.path.join(archive, "data")
    globs = (pattern + ".tar", pattern + ".[0-9][0-9][0-9].tar.xz")
    for dirpath, _, filenames in os.walk(data_root):
        for name in filenames:
            if any(fnmatch.fnmatch(name, g) for g in globs):
                yield os.path.join(dirpath, name)
94 |
95 |
def find_from_file(archive, extidlistfile):
    """Yield archive file paths for the extension ids listed (one per
    line) in *extidlistfile*."""
    data_dir = os.path.join(archive, "data")
    with open(extidlistfile, 'r') as listfile:
        for raw_line in listfile:
            yield archive_file(data_dir, raw_line.strip())
100 |
101 |
def parse_args(argv):
    """Parse create-db command line arguments.

    Returns (paths, parallel, from_date, until_date, delayed), where
    *paths* is the chunk of archive files this task (taskid of
    maxtaskid) should process.
    """
    archive = const_basedir()
    parallel = 8
    taskid = 1
    maxtaskid = 1
    from_date = None
    until_date = None
    delayed = False
    paths = []

    try:
        opts, _ = getopt.getopt(argv, "ha:p:e:t:n:N:", [
            "archive=", "prefix=", "extidlistfile=", "threads=", "taskid=",
            "maxtaskid=", "from-date=", "until-date=", "delayed", "help"
        ])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print_help()
            sys.exit()
        elif opt in ("-a", "--archive"):
            archive = arg
        elif opt in ("-p", "--prefix"):
            paths += find(archive, arg + "*")
        elif opt in ("-e", "--extidlistfile"):
            paths += find_from_file(archive, arg)
        elif opt in ("-t", "--threads"):
            parallel = int(arg)
        elif opt in ("-n", "--taskid"):
            taskid = int(arg)
        elif opt in ("-N", "--maxtaskid"):
            maxtaskid = int(arg)
        elif opt == "--from-date":
            from_date = arg
        elif opt == "--until-date":
            until_date = arg
        elif opt == "--delayed":
            delayed = True

    # No prefix/list given: process every archive.
    if not paths:
        paths = list(find(archive, "*"))

    # Split the work into maxtaskid chunks; the last task also takes the
    # remainder of the integer division.
    chunksize = len(paths) // maxtaskid
    lower = (taskid - 1) * chunksize
    if taskid == maxtaskid:
        paths = paths[lower:]
    else:
        paths = paths[lower:lower + chunksize]

    return paths, parallel, from_date, until_date, delayed
154 |
155 |
def main(argv):
    """Entry point: fan the selected archive tars out over a process pool."""
    # forkserver avoids copying parent state (DB connections, logging locks)
    # into the workers, which plain fork would do.
    multiprocessing.set_start_method("forkserver")
    verbose = True
    setup_logger(verbose)

    paths, parallel, from_date, until_date, delayed = parse_args(argv)

    # max_tasks=100 recycles each worker after 100 tasks (limits the impact
    # of slow resource leaks during long runs).
    with ProcessPool(max_workers=parallel, max_tasks=100, initializer=init_process, initargs=(verbose,)) as p:
        p.map(partial(process_id, from_date, until_date, delayed), paths)
165 |
166 |
167 | if __name__ == "__main__":
168 | main(sys.argv[1:])
169 |
--------------------------------------------------------------------------------
/crx-extract:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2017-2018 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """Tool for extracting crx file from a tar archive."""
20 |
21 | import os
22 | import sys
23 | import glob
24 | import getopt
25 | import tarfile
26 | import datetime
27 | import dateutil
28 | import dateutil.parser
29 | from ExtensionCrawler.archive import last_crx, get_local_archive_dir
30 | from ExtensionCrawler.config import const_basedir
31 |
32 |
def helpmsg():
    """Print usage information for crx-extract."""
    print("crx-extract [OPTION] extid")
    print(" -h print this help text")
    print(" -s silent (no log messages)")
    # FIX: corrected the typo "outoput" -> "output" in the help text.
    print(" -e use etag instead of date in output")
    print(" -w avoid ':' in filenames (useful on Windows)")
    print(" -d= date")
    print(" -o= output directory")
    print(" -a= archive directory")
43 |
44 |
def get_tarinfo(members, name, winfs=False, etag=None):
    """Yield the tarinfo object whose stored path equals *name*.

    With winfs=True any ':' in the member name is rewritten to '-' (colons
    are illegal in Windows filenames).  With an *etag* given, the middle
    (date) component of the path is replaced by the etag.
    """
    for member in members:
        if member.name != name:
            continue
        if winfs:
            member.name = name.replace(":", "-")
        if etag is not None:
            parent, crx_name = os.path.split(member.name)
            grandparent = os.path.split(parent)[0]
            member.name = os.path.join(grandparent, etag, crx_name)
        yield member
56 |
57 |
def main(argv):
    """Extract the most recent (or date-bounded) crx of an extension.

    Looks the crx up via last_crx, then extracts it from the extension's
    main tar, falling back to the rolled-over .NNN.tar.xz archives
    (newest first) until it is found.
    """
    basedir = const_basedir()
    verbose = True
    date = None
    useetag = False
    output = ""
    winfs = False
    try:
        opts, args = getopt.getopt(argv, "hsed:a:o:w",
                                   ["date=", "archive=", "output="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt in ("-a", "--archive"):
            basedir = arg
        elif opt in ("-d", "--date"):
            date = arg
        elif opt in ("-o", "--output"):
            output = arg
        elif opt in ("-w", "--winfs"):
            winfs = True
        elif opt in ("-e", "--etag"):
            useetag = True
        elif opt == '-s':
            verbose = False

    if len(args) > 0:
        extid = args[0]
    else:
        helpmsg()
        sys.exit()

    if date is not None:
        # Normalize the user-supplied date to an aware UTC datetime so it
        # compares cleanly against the archive's stored dates.
        dateobj = dateutil.parser.parse(date)
        if dateobj.tzinfo is None or dateobj.tzinfo.utcoffset(dateobj) is None:
            dateobj = dateobj.replace(tzinfo=datetime.timezone.utc)
        last, etag = last_crx(os.path.join(basedir, "data"), extid, dateobj)
    else:
        last, etag = last_crx(os.path.join(basedir, "data"), extid)

    if not useetag:
        etag = None
    basetar = os.path.join(basedir, "data",
                           get_local_archive_dir(extid), extid)
    tar = basetar + ".tar"

    if last != "":
        if os.path.exists(tar):
            # BUG FIX: TarFile.extractall() always returns None, so the old
            # code could never detect a successful extraction and always fell
            # through to every .NNN.tar.xz archive.  Materialize the matching
            # members first and use them as the success indicator.
            if verbose:
                print("Extracting " + os.path.join(output, last) + " from " + tar)
            with tarfile.open(tar, 'r') as archive:
                files = list(get_tarinfo(archive, last, winfs, etag))
                if files:
                    archive.extractall(path=output, members=files)
            archivetars = sorted(glob.glob(basetar + ".[0-9][0-9][0-9].tar.xz"))
            # Fall back to rolled-over archives, newest first, until found.
            while (not files and archivetars):
                tar = archivetars.pop()
                if verbose:
                    print("Extracting " + os.path.join(output, last) + " from " + tar)
                with tarfile.open(tar, 'r:xz') as archive:
                    files = list(get_tarinfo(archive, last, winfs, etag))
                    if files:
                        archive.extractall(path=output, members=files)
        elif verbose:
            print("Cannot find archive " + tar)
    elif verbose:
        if os.path.exists(tar):
            print("CRX not in archive" + tar)
        else:
            print("CRX does not exist: cannot find archive " + tar)
134 |
135 |
136 | if __name__ == "__main__":
137 | main(sys.argv[1:])
138 |
--------------------------------------------------------------------------------
/crx-jsinventory:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2017 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """Tool for extracting crx file from a tar archive."""
20 |
21 | import sys
22 | import getopt
23 | import csv
24 | import logging
25 | from collections import OrderedDict
26 | from zipfile import ZipFile
27 | from tabulate import tabulate
28 | from ExtensionCrawler.js_decomposer import decompose_js
29 | from ExtensionCrawler.config import (const_log_format)
30 |
31 |
def helpmsg():
    """Print usage information for crx-jsinventory.

    Note: the long option is spelled "--cvs" elsewhere in this script
    (historical typo); renaming it would break existing callers, so only
    the description text is corrected here.
    """
    print("crx-jsinventory [OPTION] crx-file|js-file")
    print(" -h print this help text")
    # FIX: corrected "cvs file" -> "csv file" in the description.
    print(" -c= csv file (output)")
    print(" -v verbose")
    print(
        " -d disable use of database with file information (not recommended)"
    )
    print(" -s silent")
42 |
43 |
def main(argv):
    """Build a JavaScript inventory of a crx archive or plain js file.

    Decomposes the input with decompose_js, prints a (brief unless -v)
    table of the findings, and optionally writes the full inventory to a
    CSV file (-c).
    """
    verbose = False
    silent = False
    csvfile = None
    database = True
    try:
        opts, args = getopt.getopt(argv, "hvdsc:", ["cvs="])
    except getopt.GetoptError:
        helpmsg()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            helpmsg()
            sys.exit()
        elif opt == '-v':
            verbose = True
        elif opt == '-s':
            silent = True
        elif opt == '-d':
            database = False
        elif opt in ('-c', "--cvs"):
            csvfile = arg

    if len(args) > 0:
        filename = args[0]
    else:
        helpmsg()
        sys.exit()

    if verbose:
        loglevel = logging.INFO
    else:
        loglevel = logging.WARNING

    logger = logging.getLogger()
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(logging.Formatter(const_log_format()))
    logger.addHandler(ch)
    logger.setLevel(loglevel)

    # Full CSV column set; the brief set below is used for terminal output
    # unless -v is given.
    fieldnames = [
        'filename', 'path', 'size', 'dec_size', 'md5', 'sha1', 'mimetype',
        'description', 'encoding', 'type', 'detectionMethod',
        'detectionMethodDetails', 'lib', 'version', 'lib_filename',
        'evidenceText', 'evidenceStartPos', 'evidenceEndPos'
    ]

    brief_fieldnames = [
        'filename', 'md5', 'type', 'detectionMethod', 'lib', 'version',
        'lib_filename'
    ]

    if filename.endswith('.crx'):
        with ZipFile(filename) as crxobj:
            inventory = decompose_js(crxobj, database)
    else:
        inventory = decompose_js(filename, database)

    if not silent:
        if verbose:
            print_fieldnames = fieldnames
        else:
            print_fieldnames = brief_fieldnames

        print_inventory = []
        for item in inventory:
            tmp = {k: item[k] for k in print_fieldnames}
            # Render enum members and binary digests human-readably.
            if 'type' in tmp:
                tmp['type'] = tmp['type'].value
            if 'detectionMethod' in tmp:
                tmp['detectionMethod'] = tmp['detectionMethod'].value
            if 'md5' in tmp:
                tmp['md5'] = tmp['md5'].hex()
            if 'sha1' in tmp:
                tmp['sha1'] = tmp['sha1'].hex()

            print_inventory.append(
                OrderedDict(
                    sorted(
                        tmp.items(),
                        key=lambda t: print_fieldnames.index(t[0]))))
        print(tabulate(print_inventory, headers='keys'))

    if csvfile is not None:
        # BUG FIX: files handed to the csv module must be opened with
        # newline='' (per the csv docs); otherwise extra blank rows appear
        # on platforms with \r\n line endings.
        with open(csvfile, 'w', newline='') as csvobj:
            writer = csv.DictWriter(csvobj, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(inventory)
133 |
134 |
135 | if __name__ == "__main__":
136 | main(sys.argv[1:])
137 |
--------------------------------------------------------------------------------
/crx-tool:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2016 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 | """ A tool for analyzing and extracting `*.crx` files
20 | (i.e., Chrome extensions)."""
21 |
22 | import argparse
23 | from ExtensionCrawler.crx import extract_crxfile, verify_crxfile
24 |
25 |
def main():
    """Verify or extract a *.crx Chrome extension archive.

    Default action is verification; -e extracts into the optional targetdir.
    Exits with the return value of the selected operation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("file", help="chrome extension archive (*.crx)")
    parser.add_argument('targetdir', nargs='?', default="")
    # FIX: restored "<file>" in the help strings (the angle-bracketed
    # placeholder had been stripped, leaving dangling "of "/"extract ").
    parser.add_argument(
        "-c",
        "--check",
        help="verify format and signature of <file>",
        action="store_true")
    parser.add_argument(
        "-e", "--extract", help="extract <file>", action="store_true")
    parser.add_argument(
        "-f",
        "--force",
        help="apply action also to (potential) invalid files",
        action="store_true")
    parser.add_argument(
        "-v", "--verbose", help="increase verbosity", action="store_true")
    args = parser.parse_args()

    if args.extract:
        retval = extract_crxfile(args.verbose, args.force, args.file,
                                 args.targetdir)
    else:
        # Default action (also with -c): verification only.
        retval = verify_crxfile(args.verbose, args.file)

    # BUG FIX: exit() is injected by the `site` module and is absent under
    # `python -S`; raising SystemExit is the portable equivalent.
    raise SystemExit(retval)
53 | exit(retval)
54 |
55 |
56 | if __name__ == "__main__":
57 | main()
58 |
--------------------------------------------------------------------------------
/database/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | The extension crawler downloads all metadata and extension files into tar files.
4 | This is great for archival, but not so great for analyzing the data. The crawler
5 | therefore also supports inserting all newly crawled information into a MariaDB
6 | database. Additionally, there exists a script to regenerate the database from
7 | old tar files.
8 |
9 |
10 | # Setting up the database
11 |
12 | ## Hardware requirements
13 |
The database is meant to be set up on an (old) PC, although it should also work
with common cloud offerings.
16 |
17 | The amount of data that the database needs to handle grows over time. Currently,
18 | containing ~18 months worth of data, the database requires ~150GB of space.
19 |
20 | It is recommended to have at least 16GB of RAM to keep the indices available;
21 | less RAM might work, more RAM will certainly speed queries up. It is also good
to have at least 16GB of swap; while this is detrimental to the performance of
23 | MariaDB, it is often better than it being killed by the OS.
24 |
25 | For storage, it is beneficial to have at least one HDD and one SSD, as the
26 | database workload can be split into sequential and random IO.
27 |
28 |
29 | ## Configuration
30 |
31 | A commented configuration file for MariaDB can be found in `config/my.cnf`.
32 | Configuration options such as pool size and storage locations will need to be
33 | adjusted.
34 |
35 | ## Table schemas
36 |
37 | To set up the tables and schemas, make sure that you have the credentials for
38 | root in your `~/.my.cnf` file, and execute the following:
39 | ```bash
40 | mysql -e "create database extensions;"
41 | for f in schemas/*.sql; do mysql extensions < $f; done
42 | for f in views/*.sql; do mysql extensions < $f; done
43 | ```
44 |
45 | # Maintaining the database
46 |
47 | ## Memory consumption
48 |
49 | MariaDB will, at times, use much more memory than specified for the pool size --
50 | 100GB with a pool size of 4GB is certainly possible while regenerating the data.
51 | In these cases, the database should be restarted. The crawler and regeneration
52 | script will retry their database operations by default for around one hour.
53 |
54 | ## Backup
55 |
56 | Regenerating the whole data set can take days, if not weeks, so even though all
57 | data can be restored, having a backup speeds up recovery. For this purpose, the
58 | MariaDB binary log is enabled to allow physical backups, which are much faster
59 | than logical backups for our case. The folder `scripts/` contains scripts to do
60 | full and incremental backups, as well as scripts to backup the schemas and users
61 | (including permissions and hashed passwords).
62 |
63 | # Regenerating extension data
64 |
65 | When the crawler is changed to extract more or different data from the
66 | extensions, one will probably want to regenerate all data, i.e., ask the crawler
67 | to go through all existing tar files and re-extract the already downloaded data.
In order to do so, the `create-db` script or `sge/create-db.sh` (for HPCs) can be used.
69 | More information can be found when calling these scripts with `--help`.
70 |
71 | # Using the data set
72 |
73 | ## Example queries
74 |
75 | For more (commented) queries, see the `queries/` folder.
76 |
77 | - ```sql
78 | select extid,crx_etag,count(filename) from extension_most_recent_small join crxfile using (crx_etag) where filename like '%.js' group by extid,crx_etag limit 10;
79 | ```
80 | This query will print the number of JavaScript files per extension.
81 |
82 | ## Table schemas
83 |
84 | All schema files can be found in the `schemas/` folder.
85 |
86 | | Table name | Description |
87 | | --- | --- |
88 | | extension | General extension metadata from the store pages. One row per \
89 | extension and crawldate (!). If you are only interested in the most recent \
90 | *view* of the Chrome Web Store, use the `extension_most_recent` view. For \
91 | testing your queries, suffix either table/view with *\_small* to only get \
92 | roughly 1/256th of all extensions. |
93 | | status | The HTTP status codes for the store page and `crx` download. |
94 | | crx | General metadata of the extension file (the `crx` archive itself). Also \
95 | contains the manifest. |
96 | | crxfile | General metadata of the extension files, e.g., the files contained \
97 | in the `crx` archives (JavaScript files, etc.).|
98 | | category | Categories of the extensions, e.g. *productivity*, *office*, \
99 | or *game*. |
100 | | permission | Permissions found in the manifests, e.g., *webRequest*, *tab*, but also \
101 | host permissions such as *https://www.google.com*. |
102 | | content_script_url | Content script URLs found in the manifest. These are the \
103 | URLs where the extensions request to have a content script executed when the \
104 | user visits the website. |
105 | | libdet | Information about used libraries. For each file found in `crx` \
106 | archives (identified by MD5 sums), this table stores classifications of the \
107 | file, e.g., whether it is a certain library. |
108 | | review{,\_comment} | Post-metadata and posts from the review forum of an extension. |
109 | | support{,\_comment} | Post-metadata and posts from the support forum of an extension. |
110 | | reply{,\_comment} | Reply-post-metadata and posts for both the review and support forums. |
111 |
112 | ## Views
113 |
114 | All views can be found in the `views/` folder.
115 |
116 | | View name | Description |
117 | | --- | --- |
118 | | extension_small | Contains only roughly 1/256th of all extensions. |
119 | | extension_most_recent | Instead of one row for every combination of extension \
120 | id and crawl date, this view only contains the rows from the most recent crawl \
121 | date. |
122 | | extension_most_recent_small | Same, but roughly only 1/256th of all extensions. |
123 | | extension_second_most_recent | Similar to `extension_most_recent`, but \
124 | contains the second-most recent entry for all extensions. This is useful for \
125 | investigating how extensions change. |
126 | | extension_{most,second_most}_recent_until_date | Parameterized query. Only \
127 | considers extensions crawled before a given date. Usage: \
128 | ```sql
129 | select * from (select @until_date:='2018-05-25') foo, extension_most_recent_until_date;
130 | ``` |
131 | | extension_update | Selects all extension updates in the database. A row in the result represents \
132 | one extension update, with the date and crx_etag when we have first seen the \
133 | update, and the date and crx_etag when we have last seen the old version. As \
134 | we crawl every night, the difference should be around 24 hours on average. |
135 |
--------------------------------------------------------------------------------
/database/config/my.cnf:
--------------------------------------------------------------------------------
1 | [client]
2 | port = 3306
3 | socket = /run/mysqld/mysqld.sock
4 |
5 | [mysqld]
6 | port = 3306
7 | socket = /run/mysqld/mysqld.sock
8 |
9 | wait_timeout=1800
10 | max_connections=1000
11 | explicit_defaults_for_timestamp=1
12 | default_time_zone='+00:00'
13 |
14 | server-id = 1
15 |
16 | expire_logs_days=8
17 | log-basename=master1-bin
18 |
19 | # Ideally, the MariaDB datadir resides on a HDD, as there will be a lot of sequential IO.
20 | # After creating a database, it is best moved to a SSD, as there will be a lot of
21 | # random IO. This can be done by simply moving the directory (do NOT move individual table
22 | # files!), e.g.: cd /hdd/mysql; mv extensions /ssd/databases/; ln -s /ssd/databases/extensions
23 | datadir=/hdd/mysql
24 |
25 | # When adding indices, MariaDB uses a lot of space in /tmp. If that space is not enough, the
26 | # used tmpdir can be moved:
27 | innodb_tmpdir=/ssd/innodb_tmp
28 |
29 | # The pool size is said to be around 75% of the available RAM on db-only hosts. However, current
30 | # versions of MariaDB seem to have serious memory leaks when doing a lot of concurrent writes.
31 | # Therefore, expect MariaDB to use a lot more memory, create sufficient swap to prevent killing,
32 | # and restart MariaDB when the usage grows too high.
33 | innodb_buffer_pool_size = 18G
34 |
35 | # General performance tweaks
36 | innodb_read_io_threads=8
37 | innodb_write_io_threads=8
38 | innodb_sort_buffer_size=67108864
39 | innodb_log_file_size=256M
40 | innodb_log_buffer_size=256M
41 |
42 | # Performance tweaks for inserts
43 | #innodb_flush_log_at_trx_commit=0
44 | #innodb_change_buffer_max_size=50
45 | #innodb_flush_method=O_DIRECT
46 |
47 | [mysqldump]
48 | quick
49 | max_allowed_packet = 16M
50 |
51 | [mysql]
52 | no-auto-rehash
53 |
54 | [myisamchk]
55 | key_buffer_size = 20M
56 | sort_buffer_size = 20M
57 | read_buffer = 2M
58 | write_buffer = 2M
59 |
60 | [mysqlhotcopy]
61 | interactive-timeout
62 |
--------------------------------------------------------------------------------
/database/queries/get_added_content_scripts.sql:
--------------------------------------------------------------------------------
-- Extensions whose update crawled within the last two days added a catch-all
-- content-script URL pattern that the previous version did not register,
-- ordered by popularity (download count).
select downloads, eu.extid, name, url, new_crx_etag
from extension_update eu join extension e on eu.extid=e.extid and eu.first_date_with_new_crx_etag=e.date
join content_script_url c on eu.new_crx_etag=c.crx_etag
where
-- catch-all match patterns only
url in (
"file://*/*",
"http://*/*",
"https://*/*",
"*://*/*",
-- FIX: restored "<all_urls>" (the angle-bracketed pattern had been stripped
-- to an empty string, which never matches any stored URL).
"<all_urls>"
)
and
-- ...that the previous crx version did not already have
url not in (select url from content_script_url where crx_etag=previous_crx_etag)
and
first_date_with_new_crx_etag > NOW() - INTERVAL 2 DAY
order by downloads desc;
17 |
--------------------------------------------------------------------------------
/database/queries/get_added_permissions.sql:
--------------------------------------------------------------------------------
-- Extensions whose update crawled within the last two days added a sensitive
-- permission that the previous version did not request, ordered by
-- popularity (download count).
select downloads, eu.extid, name, permission, new_crx_etag
from extension_update eu join extension e on eu.extid=e.extid and eu.first_date_with_new_crx_etag=e.date
join permission p on eu.new_crx_etag=p.crx_etag
where
-- permissions of interest: catch-all host access and request interception
permission in (
-- FIX: restored "<all_urls>" (the angle-bracketed pattern had been stripped
-- to an empty string, which never matches any stored permission).
"<all_urls>",
"http://*/*",
"https://*/*",
"webRequest",
"webRequestBlocking"
)
and
-- ...that the previous crx version did not already request
permission not in (select permission from permission where crx_etag=previous_crx_etag)
and
first_date_with_new_crx_etag > NOW() - INTERVAL 2 DAY
order by downloads desc;
17 |
--------------------------------------------------------------------------------
/database/schemas/category.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `category`
18 | --
19 |
DROP TABLE IF EXISTS `category`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Store categories of an extension at a given crawl date.  The category
-- text is keyed by its MD5 so the compressed TEXT column can stay out of
-- the primary key.
CREATE TABLE `category` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `category_md5` varbinary(16) NOT NULL,
  `category` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`category_md5`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
31 | /*!40101 SET character_set_client = @saved_cs_client */;
32 |
33 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
34 |
35 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
36 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
37 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
38 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
39 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
40 |
41 | -- Dump completed on 2018-08-09 12:31:29
42 |
--------------------------------------------------------------------------------
/database/schemas/cdnjs.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `cdnjs`
18 | --
19 |
DROP TABLE IF EXISTS `cdnjs`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Metadata for files shipped on cdnjs, one row per (path, typ), where `typ`
-- records which normalization of the file the hashes refer to.  Presumably
-- used to match extension files against known library files via (md5, typ)
-- -- see the cdnjs_md5_typ index -- confirm against the crawler code.
CREATE TABLE `cdnjs` (
  `path` varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL,
  `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL,
  `md5` varbinary(16) NOT NULL,
  `filename` varchar(253) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `sha1` varbinary(20) DEFAULT NULL,
  `sha256` varbinary(32) DEFAULT NULL,
  `simhash` varbinary(64) DEFAULT NULL,
  `size` bigint(20) DEFAULT NULL,
  `loc` bigint(20) DEFAULT NULL,
  `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `encoding` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype` varchar(126) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `add_date` datetime(6) NULL DEFAULT NULL,
  `library` varchar(254) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `version` varchar(30) /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`path`,`typ`),
  KEY `cdnjs_md5_typ` (`md5`,`typ`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
44 | /*!40101 SET character_set_client = @saved_cs_client */;
45 |
46 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
47 |
48 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
52 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
53 |
54 | -- Dump completed on 2018-08-09 12:31:29
55 |
--------------------------------------------------------------------------------
/database/schemas/content_script_url.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `content_script_url`
18 | --
19 |
DROP TABLE IF EXISTS `content_script_url`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Content-script URL patterns registered by an extension version (keyed by
-- its crx_etag).  The URL text is keyed by its MD5 so the compressed TEXT
-- column can stay out of the primary key.
CREATE TABLE `content_script_url` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `url_md5` varbinary(16) NOT NULL,
  `url` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`,`url_md5`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
30 | /*!40101 SET character_set_client = @saved_cs_client */;
31 |
32 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
33 |
34 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
35 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
36 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
37 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
38 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
39 |
40 | -- Dump completed on 2018-08-09 12:31:29
41 |
--------------------------------------------------------------------------------
/database/schemas/crx.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `crx`
18 | --
19 |
DROP TABLE IF EXISTS `crx`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- One row per distinct crx archive, keyed by its etag: archive metadata,
-- the extension's public key, and the raw manifest.
CREATE TABLE `crx` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `filename` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '',
  `size` int(11) NOT NULL,
  `publickey` blob NOT NULL,
  -- utf8mb4_bin: the manifest is JSON, compared byte-exactly.
  `manifest` longtext /*!100301 COMPRESSED*/ CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT '',
  -- Auto-maintained row-modification timestamp.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
32 | /*!40101 SET character_set_client = @saved_cs_client */;
33 |
34 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
35 |
36 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
37 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
38 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
39 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
40 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
41 |
42 | -- Dump completed on 2018-08-09 12:31:29
43 |
--------------------------------------------------------------------------------
/database/schemas/crxfile.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `crxfile`
18 | --
19 |
DROP TABLE IF EXISTS `crxfile`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Metadata for each file inside a CRX archive.  Each file can appear in up
-- to four normalisation variants (`typ`); the hash columns (md5/sha1/sha256/
-- simhash) support content lookup, e.g. via the `crxfile_md5_typ` index.
CREATE TABLE `crxfile` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `path` varchar(512) COLLATE utf8mb4_unicode_ci NOT NULL,
  `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL,
  `md5` varbinary(16) DEFAULT NULL,
  `filename` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `sha1` varbinary(20) DEFAULT NULL,
  `sha256` varbinary(32) DEFAULT NULL,
  `simhash` varbinary(64) DEFAULT NULL,
  `mimetype` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`,`path`,`typ`),
  KEY `crxfile_md5_typ` (`md5`,`typ`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
38 | /*!40101 SET character_set_client = @saved_cs_client */;
39 |
40 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
41 |
42 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
43 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
44 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
45 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
46 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
47 |
48 | -- Dump completed on 2018-08-09 12:31:29
49 |
--------------------------------------------------------------------------------
/database/schemas/extension.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `extension`
18 | --
19 |
DROP TABLE IF EXISTS `extension`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- One Web Store metadata snapshot per crawl of an extension, keyed by
-- (extid, date).  `crx_etag` links the snapshot to the archive stored in
-- the `crx` table.  The views extension_most_recent* select from here.
CREATE TABLE `extension` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `name` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `version` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `downloads` int(11) DEFAULT NULL,
  `rating` double DEFAULT NULL,
  `ratingcount` int(11) DEFAULT NULL,
  `fulldescription` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `offeredby` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `developer` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `itemcategory` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `lastupdated` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`) KEY_BLOCK_SIZE=8,
  KEY `extension_crx_etag` (`crx_etag`),
  KEY `extension_date` (`date`),
  KEY `extension_date_extid` (`date`,`extid`),
  KEY `extension_extid_crx_etag` (`extid`,`crx_etag`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
45 | /*!40101 SET character_set_client = @saved_cs_client */;
46 |
47 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
48 |
49 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
50 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
51 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
52 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
53 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
54 |
55 | -- Dump completed on 2018-08-09 12:31:29
56 |
--------------------------------------------------------------------------------
/database/schemas/libdet.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `libdet`
18 | --
19 |
DROP TABLE IF EXISTS `libdet`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Library-detection results keyed by file content hash and normalisation
-- variant (md5, typ) — the same key shape as crxfile's md5/typ columns, so
-- detections apply to every archive containing an identical file.
CREATE TABLE `libdet` (
  `md5` varbinary(16) NOT NULL,
  `typ` enum('AS_IS','NORMALIZED','DECOMPRESSED','DECOMPRESSED_NORMALIZED') COLLATE utf8mb4_unicode_ci NOT NULL,
  `sha1` varbinary(20) DEFAULT NULL,
  `sha256` varbinary(32) DEFAULT NULL,
  `size` bigint(20) DEFAULT NULL,
  `loc` bigint(20) DEFAULT NULL,
  `description` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `encoding` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `library` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `version` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `classification_type` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `detect_method` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `detect_method_details` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Evidence span: where in the file the detection matched, plus the text.
  `evidence_start_pos` bigint(20) DEFAULT NULL,
  `evidence_end_pos` bigint(20) DEFAULT NULL,
  `evidence_text` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_detail` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `mimetype_magic` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`md5`,`typ`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
46 | /*!40101 SET character_set_client = @saved_cs_client */;
47 |
48 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
49 |
50 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
51 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
52 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
53 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
54 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
55 |
56 | -- Dump completed on 2018-08-09 12:31:29
57 |
--------------------------------------------------------------------------------
/database/schemas/permission.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `permission`
18 | --
19 |
DROP TABLE IF EXISTS `permission`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- One row per permission declared by a CRX; the permission string is keyed
-- by its own md5 so arbitrarily long permission text fits in the PK.
CREATE TABLE `permission` (
  `crx_etag` varchar(44) COLLATE utf8mb4_unicode_ci NOT NULL,
  `permission_md5` varbinary(16) NOT NULL,
  `permission` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`crx_etag`,`permission_md5`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
30 | /*!40101 SET character_set_client = @saved_cs_client */;
31 |
32 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
33 |
34 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
35 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
36 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
37 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
38 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
39 |
40 | -- Dump completed on 2018-08-09 12:31:29
41 |
--------------------------------------------------------------------------------
/database/schemas/reply.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `reply`
18 | --
19 |
DROP TABLE IF EXISTS `reply`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Replies to Web Store reviews/support posts, one per (extid, crawl date,
-- author, comment date).  The comment body itself is deduplicated into
-- `reply_comment` and referenced here via `commentmd5`.
CREATE TABLE `reply` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL,
  `commentdate` datetime NOT NULL,
  `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `replyto` varchar(98) COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `commentmd5` varbinary(16) DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
36 | /*!40101 SET character_set_client = @saved_cs_client */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/schemas/reply_comment.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `reply_comment`
18 | --
19 |
DROP TABLE IF EXISTS `reply_comment`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Deduplicated reply bodies, keyed by the md5 of the comment text and
-- referenced from `reply`.`commentmd5`.
CREATE TABLE `reply_comment` (
  `commentmd5` varbinary(16) NOT NULL,
  `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`commentmd5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
29 | /*!40101 SET character_set_client = @saved_cs_client */;
30 |
31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
32 |
33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
38 |
39 | -- Dump completed on 2018-08-09 12:31:29
40 |
--------------------------------------------------------------------------------
/database/schemas/review.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `review`
18 | --
19 |
DROP TABLE IF EXISTS `review`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- User reviews of an extension, one per (extid, crawl date, author,
-- comment date); same layout as `reply` but with a numeric `rating`.
-- The review text is deduplicated into `review_comment` via `commentmd5`.
CREATE TABLE `review` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL,
  `commentdate` datetime NOT NULL,
  `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `rating` double DEFAULT NULL,
  `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `commentmd5` varbinary(16) DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
36 | /*!40101 SET character_set_client = @saved_cs_client */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/schemas/review_comment.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `review_comment`
18 | --
19 |
DROP TABLE IF EXISTS `review_comment`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Deduplicated review bodies, keyed by the md5 of the comment text and
-- referenced from `review`.`commentmd5`.
CREATE TABLE `review_comment` (
  `commentmd5` varbinary(16) NOT NULL,
  `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`commentmd5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
29 | /*!40101 SET character_set_client = @saved_cs_client */;
30 |
31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
32 |
33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
38 |
39 | -- Dump completed on 2018-08-09 12:31:29
40 |
--------------------------------------------------------------------------------
/database/schemas/status.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `status`
18 | --
19 |
DROP TABLE IF EXISTS `status`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Crawl result per (extid, date): HTTP-style status of the CRX download
-- and of the overview-page fetch, plus the overview exception text if the
-- fetch raised one.
CREATE TABLE `status` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `crx_status` int(11) DEFAULT NULL,
  `overview_status` int(11) DEFAULT NULL,
  `overview_exception` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
32 | /*!40101 SET character_set_client = @saved_cs_client */;
33 |
34 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
35 |
36 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
37 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
38 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
39 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
40 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
41 |
42 | -- Dump completed on 2018-08-09 12:31:29
43 |
--------------------------------------------------------------------------------
/database/schemas/support.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `support`
18 | --
19 |
DROP TABLE IF EXISTS `support`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Support-forum posts for an extension, one per (extid, crawl date,
-- author, comment date); same layout as `review` but with a `title`
-- instead of a rating.  Body text is deduplicated into `support_comment`.
CREATE TABLE `support` (
  `extid` varchar(32) COLLATE utf8mb4_unicode_ci NOT NULL,
  `date` datetime(6) NOT NULL,
  `author` varchar(98) COLLATE utf8mb4_unicode_ci NOT NULL,
  `commentdate` datetime NOT NULL,
  `displayname` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `title` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `language` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `shortauthor` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  `commentmd5` varbinary(16) DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`extid`,`date`,`author`,`commentdate`) KEY_BLOCK_SIZE=8
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
36 | /*!40101 SET character_set_client = @saved_cs_client */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/schemas/support_comment.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Table structure for table `support_comment`
18 | --
19 |
DROP TABLE IF EXISTS `support_comment`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
-- Deduplicated support-post bodies, keyed by the md5 of the comment text
-- and referenced from `support`.`commentmd5`.
CREATE TABLE `support_comment` (
  `commentmd5` varbinary(16) NOT NULL,
  `comment` text /*!100301 COMPRESSED*/ COLLATE utf8mb4_unicode_ci DEFAULT NULL,
  -- Maintained by the server: set on insert, refreshed on every update.
  `last_modified` datetime NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
  PRIMARY KEY (`commentmd5`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci `PAGE_COMPRESSED`='ON';
29 | /*!40101 SET character_set_client = @saved_cs_client */;
30 |
31 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
32 |
33 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
34 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
35 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
36 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
37 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
38 |
39 | -- Dump completed on 2018-08-09 12:31:29
40 |
--------------------------------------------------------------------------------
/database/scripts/mariabackup-full:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Full physical backup of the local MariaDB server, streamed to stdout in
# xbstream format with quicklz compression (4 copy threads, 2 compressors).

set -eu

exec /usr/bin/mariabackup --backup \
    --stream=xbstream \
    --parallel=4 \
    --compress --compress-threads=2
7 |
--------------------------------------------------------------------------------
/database/scripts/mariabackup-inc:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Incremental physical backup of the local MariaDB server, streamed to
# stdout in xbstream format.  Usage: mariabackup-inc <LSN>
# <LSN> is the last-committed log sequence number of the previous backup;
# only pages changed since that LSN are copied.

set -o nounset
set -o errexit

# Explicit argument check: with `set -o nounset` a missing $1 would
# otherwise abort with an unhelpful "unbound variable" error.
if [ "$#" -ne 1 ]; then
    >&2 echo "Usage: $(basename "$0") <LSN>"
    exit 1
fi

LSN=$1
if ! [[ "$LSN" =~ ^[0-9]+$ ]]; then
    >&2 echo "Invalid LSN: $LSN"
    exit 1
fi

/usr/bin/mariabackup --backup --stream=xbstream --parallel=4 --compress --compress-threads=2 --incremental-lsn="$LSN"
13 |
--------------------------------------------------------------------------------
/database/scripts/mariabackup-schemas:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Dump the schema (structure only, no data) of every non-system database
# as per-table .sql files via mysqldump --tab, and emit the whole tree as
# a gzipped tar archive on stdout.

set -o errexit
set -o nounset

T=$(mktemp -d)
# Clean up the scratch directory even when a dump step fails: with
# errexit the original trailing `rm -r` would never run on error.
trap 'rm -rf "$T"' EXIT

for db in $(mysql -N -e "show databases" | grep -v -e "^mysql$" -e "^information_schema$" -e "^performance_schema$")
do
    mkdir -p "$T/schemas/$db"
    mysqldump "$db" --no-data --single-transaction --tab="$T/schemas/$db"
done
(cd "$T"; tar cz *)
14 |
--------------------------------------------------------------------------------
/database/scripts/showgrants:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -o errexit
set -o nounset

# Replayable privilege dump: the first mysql call generates one
# "SHOW GRANTS FOR 'user'@'host';" statement per non-anonymous account,
# the second executes them, and sed appends ';' to every emitted grant
# line so the output can be piped straight back into mysql to restore
# all grants.
# NOTE(review): the empty "" positional argument appears to deliberately
# select no default database — confirm before changing.
mysql "" --skip-column-names -A -e"SELECT CONCAT('SHOW GRANTS FOR ''',user,'''@''',host,''';') FROM mysql.user WHERE user<>''" | mysql "" --skip-column-names -A | sed 's/$/;/g'
6 |
--------------------------------------------------------------------------------
/database/views/extension_most_recent.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_most_recent`
18 | --
19 |
/*!50001 DROP TABLE IF EXISTS `extension_most_recent`*/;
/*!50001 DROP VIEW IF EXISTS `extension_most_recent`*/;
/*!50001 SET @saved_cs_client          = @@character_set_client */;
/*!50001 SET @saved_cs_results         = @@character_set_results */;
/*!50001 SET @saved_col_connection     = @@collation_connection */;
/*!50001 SET character_set_client      = utf8 */;
/*!50001 SET character_set_results     = utf8 */;
/*!50001 SET collation_connection      = utf8_general_ci */;
-- Latest crawl snapshot per extension: the derived table `e2` picks
-- max(date) for each extid from `extension`, then joins back to
-- `extension` (`e3`) on (extid, date) to return the full row.
/*!50001 CREATE ALGORITHM=UNDEFINED */
/*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */
/*!50001 VIEW `extension_most_recent` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_most_recent_small.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_most_recent_small`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent_small`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent_small`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
28 | /*!50001 CREATE ALGORITHM=UNDEFINED */
29 | /*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */
30 | /*!50001 VIEW `extension_most_recent_small` AS select `e3`.`extid` AS `extid`,`e3`.`date` AS `date`,`e3`.`name` AS `name`,`e3`.`version` AS `version`,`e3`.`description` AS `description`,`e3`.`downloads` AS `downloads`,`e3`.`rating` AS `rating`,`e3`.`ratingcount` AS `ratingcount`,`e3`.`fulldescription` AS `fulldescription`,`e3`.`offeredby` AS `offeredby`,`e3`.`developer` AS `developer`,`e3`.`itemcategory` AS `itemcategory`,`e3`.`crx_etag` AS `crx_etag`,`e3`.`lastupdated` AS `lastupdated` from (((select `e1`.`extid` AS `extid`,max(`e1`.`date`) AS `date` from `extensions`.`extension` `e1` where `e1`.`extid` like 'aa%' group by `e1`.`extid`)) `e2` join `extensions`.`extension` `e3` on(`e2`.`extid` = `e3`.`extid` and `e2`.`date` = `e3`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_most_recent_until_date.sql:
--------------------------------------------------------------------------------
-- Helper used by the *_until_date views: returns the session variable
-- @until_date so the cut-off date can be set per connection before querying.
drop function if exists until_date;
-- Fixed: function name needs a (possibly empty) parameter list, and
-- "DEERMINISTIC" was a typo for DETERMINISTIC.
create function until_date() returns datetime NO SQL DETERMINISTIC return @until_date;
3 |
4 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
5 | --
6 | -- Host: localhost Database: extensions
7 | -- ------------------------------------------------------
8 | -- Server version 10.3.8-MariaDB-log
9 |
10 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
11 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
12 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
13 | /*!40101 SET NAMES utf8 */;
14 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
15 | /*!40103 SET TIME_ZONE='+00:00' */;
16 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
17 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
18 |
19 | --
20 | -- Final view structure for view `extension_most_recent_until_date`
21 | --
22 |
23 | /*!50001 DROP TABLE IF EXISTS `extension_most_recent_until_date`*/;
24 | /*!50001 DROP VIEW IF EXISTS `extension_most_recent_until_date`*/;
25 | /*!50001 SET @saved_cs_client = @@character_set_client */;
26 | /*!50001 SET @saved_cs_results = @@character_set_results */;
27 | /*!50001 SET @saved_col_connection = @@collation_connection */;
28 | /*!50001 SET character_set_client = utf8 */;
29 | /*!50001 SET character_set_results = utf8 */;
30 | /*!50001 SET collation_connection = utf8_general_ci */;
31 | /*!50001 CREATE ALGORITHM=UNDEFINED */
32 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */
33 | /*!50001 VIEW `extension_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`, `extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */;
34 | /*!50001 SET character_set_client = @saved_cs_client */;
35 | /*!50001 SET character_set_results = @saved_cs_results */;
36 | /*!50001 SET collation_connection = @saved_col_connection */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/views/extension_second_most_recent.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_second_most_recent`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_second_most_recent`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_second_most_recent`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
28 | /*!50001 CREATE ALGORITHM=UNDEFINED */
29 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */
30 | /*!50001 VIEW `extension_second_most_recent` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`,`extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_second_most_recent_until_date.sql:
--------------------------------------------------------------------------------
-- Helper used by the *_until_date views: returns the session variable
-- @until_date so the cut-off date can be set per connection before querying.
drop function if exists until_date;
-- Fixed: function name needs a (possibly empty) parameter list, and
-- "DEERMINISTIC" was a typo for DETERMINISTIC.
create function until_date() returns datetime NO SQL DETERMINISTIC return @until_date;
3 |
4 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
5 | --
6 | -- Host: localhost Database: extensions
7 | -- ------------------------------------------------------
8 | -- Server version 10.3.8-MariaDB-log
9 |
10 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
11 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
12 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
13 | /*!40101 SET NAMES utf8 */;
14 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
15 | /*!40103 SET TIME_ZONE='+00:00' */;
16 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
17 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
18 |
19 | --
20 | -- Final view structure for view `extension_second_most_recent_until_date`
21 | --
22 |
23 | /*!50001 DROP TABLE IF EXISTS `extension_second_most_recent_until_date`*/;
24 | /*!50001 DROP VIEW IF EXISTS `extension_second_most_recent_until_date`*/;
25 | /*!50001 SET @saved_cs_client = @@character_set_client */;
26 | /*!50001 SET @saved_cs_results = @@character_set_results */;
27 | /*!50001 SET @saved_col_connection = @@collation_connection */;
28 | /*!50001 SET character_set_client = utf8 */;
29 | /*!50001 SET character_set_results = utf8 */;
30 | /*!50001 SET collation_connection = utf8_general_ci */;
31 | /*!50001 CREATE ALGORITHM=UNDEFINED */
32 | /*!50013 DEFINER=`root`@`localhost` SQL SECURITY DEFINER */
33 | /*!50001 VIEW `extension_second_most_recent_until_date` AS select `e1`.`extid` AS `extid`,`e1`.`date` AS `date`,`extensions`.`extension`.`name` AS `name`,`extensions`.`extension`.`version` AS `version`,`extensions`.`extension`.`description` AS `description`,`extensions`.`extension`.`downloads` AS `downloads`,`extensions`.`extension`.`rating` AS `rating`,`extensions`.`extension`.`ratingcount` AS `ratingcount`,`extensions`.`extension`.`fulldescription` AS `fulldescription`,`extensions`.`extension`.`offeredby` AS `offeredby`,`extensions`.`extension`.`developer` AS `developer`, `extensions`.`extension`.`itemcategory` AS `itemcategory`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,`extensions`.`extension`.`lastupdated` AS `lastupdated`,`extensions`.`extension`.`last_modified` AS `last_modified` from (((select `extensions`.`extension`.`extid` AS `extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() and !((`extensions`.`extension`.`extid`,`extensions`.`extension`.`date`) in (select `extensions`.`extension`.`extid`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`date` <= `until_date`() group by `extensions`.`extension`.`extid`)) group by `extensions`.`extension`.`extid`)) `e1` join `extensions`.`extension` on(`e1`.`extid` = `extensions`.`extension`.`extid` and `e1`.`date` = `extensions`.`extension`.`date`)) */;
34 | /*!50001 SET character_set_client = @saved_cs_client */;
35 | /*!50001 SET character_set_results = @saved_cs_results */;
36 | /*!50001 SET collation_connection = @saved_col_connection */;
37 |
38 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
39 |
40 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
41 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
42 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
43 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
44 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
45 |
46 | -- Dump completed on 2018-08-09 12:31:29
47 |
--------------------------------------------------------------------------------
/database/views/extension_small.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_small`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_small`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_small`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
-- View over `extension` restricted to extids starting with 'aa' (a small
-- sample); consistency fix: `offeredby` was the only column qualified as
-- `extensions`.`extension`.* while all others use the bare `extension` alias.
/*!50001 CREATE ALGORITHM=UNDEFINED */
/*!50013 DEFINER=`writer`@`%` SQL SECURITY DEFINER */
/*!50001 VIEW `extension_small` AS select `extension`.`extid` AS `extid`,`extension`.`date` AS `date`,`extension`.`name` AS `name`,`extension`.`version` AS `version`,`extension`.`description` AS `description`,`extension`.`downloads` AS `downloads`,`extension`.`rating` AS `rating`,`extension`.`ratingcount` AS `ratingcount`,`extension`.`fulldescription` AS `fulldescription`,`extension`.`offeredby` AS `offeredby`,`extension`.`developer` AS `developer`,`extension`.`itemcategory` AS `itemcategory`,`extension`.`crx_etag` AS `crx_etag`,`extension`.`lastupdated` AS `lastupdated` from `extension` where `extension`.`extid` like 'aa%' */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/database/views/extension_update.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.16 Distrib 10.3.8-MariaDB, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: extensions
4 | -- ------------------------------------------------------
5 | -- Server version 10.3.8-MariaDB-log
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='' */;
14 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
15 |
16 | --
17 | -- Final view structure for view `extension_update`
18 | --
19 |
20 | /*!50001 DROP TABLE IF EXISTS `extension_update`*/;
21 | /*!50001 DROP VIEW IF EXISTS `extension_update`*/;
22 | /*!50001 SET @saved_cs_client = @@character_set_client */;
23 | /*!50001 SET @saved_cs_results = @@character_set_results */;
24 | /*!50001 SET @saved_col_connection = @@collation_connection */;
25 | /*!50001 SET character_set_client = utf8 */;
26 | /*!50001 SET character_set_results = utf8 */;
27 | /*!50001 SET collation_connection = utf8_general_ci */;
28 | /*!50001 CREATE ALGORITHM=UNDEFINED */
29 | /*!50013 DEFINER=`root`@`%` SQL SECURITY DEFINER */
30 | /*!50001 VIEW `extension_update` AS select `e3`.`extid` AS `extid`,`e3`.`first_date_with_new_crx_etag` AS `first_date_with_new_crx_etag`,`e3`.`new_crx_etag` AS `new_crx_etag`,`e3`.`last_date_with_previous_crx_etag` AS `last_date_with_previous_crx_etag`,`e4`.`crx_etag` AS `previous_crx_etag` from (((select `e1`.`extid` AS `extid`,`e1`.`date` AS `first_date_with_new_crx_etag`,`e1`.`crx_etag` AS `new_crx_etag`,max(`e2`.`date`) AS `last_date_with_previous_crx_etag` from (((select `extensions`.`extension`.`extid` AS `extid`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,min(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`crx_etag` is not null group by `extensions`.`extension`.`extid`,`extensions`.`extension`.`crx_etag`)) `e1` join (select `extensions`.`extension`.`extid` AS `extid`,`extensions`.`extension`.`crx_etag` AS `crx_etag`,max(`extensions`.`extension`.`date`) AS `date` from `extensions`.`extension` where `extensions`.`extension`.`crx_etag` is not null group by `extensions`.`extension`.`extid`,`extensions`.`extension`.`crx_etag`) `e2` on(`e1`.`extid` = `e2`.`extid`)) where `e1`.`date` > `e2`.`date` group by `e1`.`crx_etag`)) `e3` join `extensions`.`extension` `e4` on(`e3`.`extid` = `e4`.`extid` and `e3`.`last_date_with_previous_crx_etag` = `e4`.`date`)) */;
31 | /*!50001 SET character_set_client = @saved_cs_client */;
32 | /*!50001 SET character_set_results = @saved_cs_results */;
33 | /*!50001 SET collation_connection = @saved_col_connection */;
34 |
35 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
36 |
37 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
38 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
39 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
40 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
41 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
42 |
43 | -- Dump completed on 2018-08-09 12:31:29
44 |
--------------------------------------------------------------------------------
/extgrep:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3.7
2 | #
3 | # Copyright (C) 2019 The University of Sheffield, UK
4 | #
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
17 | #
18 | # SPDX-License-Identifier: GPL-3.0-or-later
19 |
20 | import argparse
21 | import io
22 | import logging
23 | import re
24 | import json
25 | import sys
26 | import importlib.util
27 | import csv
28 | import math
29 | import ast
30 |
31 | from zipfile import ZipFile
32 |
33 | from ExtensionCrawler.config import (const_log_format, const_basedir)
34 | from ExtensionCrawler.archive import iter_tar_entries_by_date
35 | from ExtensionCrawler.js_mincer import mince_js
36 |
37 |
def get_shannon_entropy(string):
    """Return the Shannon entropy of *string* over the [A-Za-z0-9] alphabet.

    Characters outside the alphabet are ignored.  Adapted from
    "http://blog.dkbza.org/2007/05/scanning-data-for-entropy-anomalies.html"
    and "git@github.com:dxa4481/truffleHog.git".
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    if not string:
        return 0
    length = len(string)
    frequencies = (string.count(symbol) / length for symbol in alphabet)
    return -sum(p * math.log(p, 2) for p in frequencies if p > 0)
53 |
54 |
def is_likely_hash(string):
    """Heuristic filter: a string looks hash/key-like when it has high
    entropy and contains more than four digits."""
    digit_count = sum(1 for ch in string if ch.isdigit())
    return digit_count > 4 and get_shannon_entropy(string) > 2.0
57 |
58 |
def import_regexs(path):
    """Dynamically load the Python file at *path* and return it as a module.

    The file is expected to define a MinerStrings class (see callers).
    """
    loader_spec = importlib.util.spec_from_file_location("MinerStrings", path)
    loaded_module = importlib.util.module_from_spec(loader_spec)
    loader_spec.loader.exec_module(loaded_module)
    return loaded_module
64 |
65 |
def get_etag(headers_content):
    """Parse a repr'd HTTP-headers dict and return its ETag, or None."""
    headers = ast.literal_eval(headers_content)
    if "ETag" not in headers:
        return None
    return headers["ETag"]
70 |
71 |
def get_name_and_version(overview_contents):
    """Extract the extension name and version from an overview.html page.

    Returns a (name, version) tuple; either element is None when the
    corresponding tag is not found.
    """
    # NOTE(review): the original regex literals were empty (markup evidently
    # stripped during extraction), so match.group(1) would raise on any hit.
    # Restored the <meta itemprop=...> patterns used by the store pages.
    match = re.search("""<meta itemprop="name" content="(.*?)"/>""",
                      overview_contents)
    name = match.group(1) if match else None

    match = re.search(
        """<meta itemprop="version" content="(.*?)"/>""", overview_contents)
    version = match.group(1) if match else None

    return name, version
84 |
85 |
def first_match_in_locations(search_tag, pattern, locations):
    """Return the first match of *pattern* across *locations*.

    *locations* is an iterable of (location_tag, lines) pairs.  Returns a
    one-element list [[location_tag, search_tag, matched_string]] for the
    first hit, or [] when nothing matches.  For the MINING_KEYS_REGEX tag
    a hit is only accepted when it also looks like a hash/key.
    """
    for location_tag, lines in locations:
        for line in lines:
            m = re.search(pattern, line)
            if m:
                matched_string = m.group()
                # BUG FIX: was "is not" (identity comparison on strings,
                # interning-dependent); use "!=" for value comparison.
                if search_tag != "MINING_KEYS_REGEX" or is_likely_hash(matched_string):
                    return [[location_tag, search_tag, matched_string]]
    return []
95 |
96 |
def handle_extid(conf, extid, csvwriter):
    """Scan every archived snapshot of extension *extid* and write one CSV
    row per regex/string match found in its .js/.html files.

    Snapshots come from iter_tar_entries_by_date, restricted to the
    [conf.from_date, conf.latest_date] window when those are set.  Each row
    is: extid, still_in_store, most-recent crx etag, snapshot date,
    crx etag, name, version, zip path, match location, search tag, match.
    """
    # Search strings/patterns loaded from the user-supplied regex file.
    miner_strings = import_regexs(conf.REGEXP_FILE).MinerStrings()

    results = []

    still_in_store = None
    # Seeded with None so crx_etags[-1] is defined even if no etag is seen.
    crx_etags = [None]
    for date, tups in iter_tar_entries_by_date(conf.archive_dir, extid):
        # Skip snapshots outside the requested date window.
        if conf.from_date and not (conf.from_date <= date):
            continue
        if conf.latest_date and not (date <= conf.latest_date):
            continue

        crx_etag = None
        name = None
        version = None
        date_matches = []
        for tarentry, tarfile in tups:
            tarentry_filename = tarentry.name.split("/")[-1]

            # The .crx.headers entry holds a repr'd dict of HTTP headers.
            if tarentry_filename.endswith(".crx.headers"):
                crx_etag = get_etag(tarfile.read().decode())
                if crx_etag:
                    crx_etags += [crx_etag]

            if tarentry_filename == "overview.html":
                name, version = get_name_and_version(tarfile.read().decode())

            # A 2xx status on the overview page means the extension is
            # still listed in the store.
            if tarentry_filename == "overview.html.status":
                still_in_store = tarfile.read().decode().startswith("2")

            if tarentry_filename.endswith(".crx") and tarentry.size > 0:
                with ZipFile(tarfile) as zf:
                    for zipentry in zf.infolist():
                        file_matches = []
                        if zipentry.filename.endswith(".js") or zipentry.filename.endswith(".html"):
                            with zf.open(zipentry) as f:
                                # Collect code lines verbatim and, separately,
                                # all string literals joined per block (to
                                # catch strings assembled from fragments).
                                verbatim_lines = []
                                joined_string_lines = []
                                for block in mince_js(io.TextIOWrapper(f, encoding="utf-8", errors="surrogateescape")):
                                    verbatim_lines += block.content.splitlines()
                                    joined_string_lines += "".join(map(lambda x: x[1], block.string_literals)).splitlines()

                                # Literal string searches (regex-escaped).
                                for search_tag in miner_strings.strings.keys():
                                    for search_string in miner_strings.strings[search_tag]:
                                        for match in first_match_in_locations(search_tag, re.escape(search_string),
                                                                             [("verbatim", verbatim_lines),
                                                                              ("joined_string", joined_string_lines)]):
                                            file_matches.append(match)

                                # Regular-expression searches.
                                for search_tag in miner_strings.patterns.keys():
                                    for search_pattern in miner_strings.patterns[search_tag]:
                                        for match in first_match_in_locations(search_tag, search_pattern,
                                                                             [("verbatim", verbatim_lines),
                                                                              ("joined_string", joined_string_lines)]):
                                            file_matches.append(match)

                                for match in file_matches:
                                    date_matches.append([zipentry.filename] + match)

        for match in date_matches:
            results += [[date, crx_etag, name, version] + match]

    # Rows are emitted only at the end so still_in_store and crx_etags[-1]
    # reflect the newest snapshot seen across the whole scan.
    for result in results:
        csvwriter.writerow([str(x) for x in ([extid, still_in_store, crx_etags[-1]] + result)])
162 |
163 |
def main(conf):
    """Configure logging, then grep every extension id listed in
    conf.EXTID_FILE, writing match rows as CSV to stdout."""
    root_logger = logging.getLogger()
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter(const_log_format()))
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.DEBUG if conf.verbose else logging.WARNING)

    with open(conf.EXTID_FILE) as extid_file:
        writer = csv.writer(sys.stdout, csv.unix_dialect)
        writer.writerow(["extid", "still_in_store", "most_recent_crx_etag", "date", "crx_etag", "name", "version", "path", "position", "tag", "match"])
        for extid in (line.strip() for line in extid_file):
            handle_extid(conf, extid, writer)
179 |
180 |
def build_parser():
    """Create and return the command-line argument parser for extgrep."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Grep for extensions.')

    # Positional arguments.
    parser.add_argument(
        'REGEXP_FILE',
        help='python file with regular expressions')
    parser.add_argument(
        'EXTID_FILE',
        help='file with extension ids')

    # Optional flags.
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        default=False,
        help='increase verbosity')
    parser.add_argument(
        '-D', '--latest-date',
        metavar='DATE',
        type=str,
        help=('select latest crx from tar, released before DATE.\n'
              'Together with --from-date, specifies all crx released in specified\n'
              'date range.'))
    parser.add_argument(
        '-d', '--from-date',
        metavar='DATE',
        type=str,
        help=('select oldest crx from tar released after DATE.\n'
              'Together with --latest-date, specifies all crx released in specified\n'
              'date range.'))
    parser.add_argument(
        '-a', '--archive-dir',
        metavar='archive',
        type=str,
        default=const_basedir(),
        help='archive directory')

    return parser
226 |
227 |
# Script entry point: build the CLI parser, parse arguments, and exit with
# main()'s return value as the process exit status.
if __name__ == "__main__":
    main_parser = build_parser()

    main_conf = main_parser.parse_args()

    sys.exit(main(main_conf))
234 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | colorama==0.3.9
2 | pystuck==0.8.5
3 | simhash==1.8.0
4 | tabulate==0.7.7
5 | setuptools==65.5.1
6 | cchardet==2.1.1
7 | mysqlclient==1.3.10
8 | requests==2.20.0
9 | pycryptodomex==3.4.6
10 | beautifulsoup4==4.6.0
11 | python_dateutil==2.6.1
12 | GitPython==2.1.5
13 | python_magic==0.4.13
14 | jsbeautifier==1.7.3
15 | pebble==4.3.7
16 | jsmin==2.2.2
17 |
--------------------------------------------------------------------------------
/scripts/hpc-utilities/hpc-submit:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -o errexit
4 | set -o nounset
5 |
# Print usage information; echoes the current defaults where they are set.
print_help()
{
    echo "Usage: $prog [OPTION] ... -- COMMAND ... %INPUT% ..."
    echo ""
    echo "Run ..."
    echo ""
    echo " --help, -h display this help message"
    echo " --jobs, -j num number of jobs (default: $jobs)"
    echo " --input, -i file file with input data"
    echo " --prefix, -p prefix prefix path for job directory (default: $prefix)"
    echo " --jobname, -n name job name (default: $name)"
    echo " --wrapper, -w wrapper exec wrapper (default: $wrapper)"
    # Typo fix: "direcotry" -> "directory".
    echo " --copy-from, -f copy command from directory (default: src)"
    echo " --max-memory, -m mem max mem (default: $mem)"
    echo " --max-time, -t timelimit (default: $timelimit)"
    echo " --host, -s remote host (default: $host)"
    echo " --srcdir, -d src for copying binary (default: $srcdir)"
    echo " assumed to be remote, if it starts with a \":\""
    echo ""
    echo " COMMAND is the command that should be executed on the HPC cluster, where"
    echo " %INPUT% will be replaced with a file containing the job-specific input data."
}
28 |
29 |
30 |
# Create the local temporary job directory skeleton under $workdir.
mk_jobdir(){
    echo "Creating temporary job directory in $workdir."
    local sub
    for sub in bin cfg input output tmp; do
        mkdir -p "$workdir/$sub"
    done
}
39 |
40 |
# Remove the local temporary job directory (after it has been copied to the
# cluster by install_hpc_script).
clean_jobdir(){
    rm -rf "$workdir"
}
44 |
# Split $input into $jobs line-based chunks named 00000001..N under
# $workdir/input.  GNU split flags: -n l/K splits into K chunks without
# breaking lines; -e suppresses empty output files; -a 8 gives 8-digit
# suffixes matching the %08d used when reading $SGE_TASK_ID.
split_input(){
    echo "Splitting input."
    split --numeric-suffixes=1 -a 8 -e -n l/$jobs "$input" "$workdir/input/"
}
49 |
# Generate the SGE job script ($workdir/job.sge) that will be submitted on
# the cluster.  The heredoc is deliberately unquoted so local variables
# ($prog, $jobs, $mem, ...) expand now, while the escaped backticks and
# \$SGE_TASK_ID expand later, on the cluster.
mk_hpc_script(){
    local HOSTNAME=`hostname -f`
    echo "Creating HPC script."
    # BUG FIX: was "cat < $workdir/job.sge", which tries to READ the
    # not-yet-existing file (and would then execute the template lines as
    # commands).  A heredoc redirected INTO the file is what is needed.
    # Also removed the leading blank line so the shebang is the first line.
    cat <<EOF > "$workdir"/job.sge
#!/bin/bash
## This script was generated by $prog (version: $version)
## on $timestamp
## by $USER@$HOSTNAME
## in $PWD
## using the following command:
## $invokation
##
## SGE configuration:
#$ -V
#$ -t 1-$jobs
#$ -l rmem=$mem
#$ -l h_rt=$timelimit
#$ -j yes
#$ -o "$prefix"/"$name"/output


set -o nounset
set -o errexit
set -x

export JOBINPUT="$prefix"/"$name"/input/\`printf %08d \$SGE_TASK_ID\`

/usr/bin/time -v $wrapper $prefix/$name/bin/$cmd
echo "Execution successful."
EOF
}
82 |
83 |
# Create the remote working directory.  $prefix is deliberately left
# unquoted: its default is the literal string \$HOME/hpc, which should be
# expanded by the remote shell, not locally.
mk_remote_jobdir(){
    echo "Create remote working directory ($host:$prefix)."
    ssh $host mkdir -p $prefix
}
88 |
# Copy the prepared job directory to the cluster and place the command
# binary into its bin/ directory.  A $srcdir starting with ":" denotes a
# path that is already on the remote host.
install_hpc_script(){
    echo "Installing HPC Script"
    scp -q -r "$workdir" "$host":"$prefix"/"$name"

    if [[ $srcdir == ":"* ]]; then
        echo " Copying cmd from remote src."
        ssh $host cp "${srcdir:1}"/"$srccmd" "$prefix"/"$name"/bin;
    else
        echo " Copying cmd from local src."
        # BUG FIX: a stray leading "$srccmd" argument made scp copy the
        # command twice (once from the bare path, once from $srcdir).
        scp -q "$srcdir"/"$srccmd" "$host":"$prefix"/"$name"/bin;
    fi
}
101 |
# Submit the generated job script to the SGE scheduler on the remote host.
submit_job(){
    echo "Submitting job."
    ssh $host qsub "$prefix"/"$name"/job.sge
}
106 |
## global configuration
version="0.0"
# Program name = basename of $0.
prog=`echo $0 | sed 's|.*/||'`;
# Full invocation line, reproduced in the generated job script header.
invokation="$prog $(printf "%q " "$@")"
# Filesystem-safe UTC timestamp (":" and "," are replaced).
timestamp=`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
host=`hostname`
workdir=`mktemp -d`
echo $workdir

## default values
# Literal \$HOME so the path expands on the REMOTE shell, not locally.
prefix="\$HOME/hpc"
jobs=1
# NOTE: $name captures the LOCAL hostname; $host is re-assigned to the
# cluster login node below, so order matters here.
name="$host-$USER-$timestamp"
input=""
wrapper="singularity exec -B \$TMPDIR:$prefix/$name/tmp"
cmd=""
mem="2G"
timelimit="01:00:00"
# NOTE(review): "local" is assigned but never read afterwards.
local="false";
host="sharc.shef.ac.uk"
srcdir="."

# Option parsing.  NOTE(review): print_help advertises --copy-from/-f, but
# no case arm handles it — confirm whether it was dropped intentionally.
while [ $# -gt 0 ]
do
  case "$1" in
      --jobs|-j)
          jobs="$2";
          shift;;
      --input|-i)
          input="$2";
          shift;;
      --jobname|-n)
          name="$2";
          shift;;
      --max-memory|-m)
          mem="$2";
          shift;;
      --max-time|-t)
          timelimit="$2";
          shift;;
      --host|-s)
          host="$2";
          shift;;
      --srcdir|-d)
          srcdir="$2";
          shift;;
      --wrapper|-w)
          wrapper="$2";
          shift;;
      --prefix|-p)
          prefix="$2";
          shift;;
      --help|-h)
          print_help
          exit 0;;
      --) shift; break;;
      *) print_help
         exit 1;;
  esac
  shift
done
# Everything after "--" is the command; %INPUT% is rewritten to the
# per-task $JOBINPUT variable expanded on the cluster.
cmd=`echo $(printf "%q " "$@") | sed -e 's/%INPUT%/\$JOBINPUT/'`
cmdarray=("$@")
# First word of the command is the binary to copy into bin/.
srccmd=${cmdarray[0]}

mk_jobdir;

if [ -n "$input" ]; then
    if [ ! -f "$input" ]; then
        echo "Input file \"$input\" not found!"
        exit 1
    fi
    split_input;
fi

mk_hpc_script;

mk_remote_jobdir;

install_hpc_script;

clean_jobdir;

submit_job;
192 |
--------------------------------------------------------------------------------
/scripts/maintainance/maintain_archive:
--------------------------------------------------------------------------------
#!/bin/bash
# Archive maintenance driver.
#   maintain_archive [MAIN|MOVE|COMPRESS] [archive-dir] [log] [src-tar]
# MAIN walks the archive and re-invokes this script with MOVE for every
# extension tar; COMPRESS submits one xz.sge batch job per data directory.

ACTION=${1:-MAIN}
ARCHIVE=${2:-/srv/Shared/BrowserExtensions/archive}

# One log directory per UTC month; the log file name carries a UTC
# timestamp made file-name safe (":" -> "_", "," -> ".").
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=${3:-$LOGPREFIX-maintain-archive-$ACTION.log}

SELF=$0
SRC=$4   # only used by the MOVE action: the tar file to rotate
13 |
next_generation(){
    # Rotate the given extension tar to the next free numbered generation
    # ($base.NNN.tar, later xz-compressed by the COMPRESS action) and
    # leave a fresh empty tar in its place.
    local src=$1
    local filebase=`basename $src .tar`
    local dir=`dirname $src`

    # Check next free file name:
    if ls $dir/$filebase.[0-9][0-9][0-9].tar.xz &> /dev/null; then
        latest=`ls $dir/$filebase.[0-9][0-9][0-9].tar.xz | \
                sort -r | head -1 | \
                sed -e "s/.*\([0-9][0-9][0-9]\).tar.xz/\1/"`
        # BUG FIX: force base-10.  $latest carries leading zeros ("008"),
        # which bash arithmetic would parse as an (invalid) octal constant
        # and abort with "value too great for base" from 008 onwards.
        next=`printf %03d $((10#$latest+1))`
    else
        next=000
    fi

    dest=$dir/$filebase.$next.tar
    echo "Processing: $src -> $dest" | tee -a $LOG
    # -n: never overwrite an existing generation; a failed move leaves
    # $src in place and is reported below.
    mv -n $src $dest
    if [ ! -f $src ]; then
        # Re-create an empty tar so the crawler always finds an archive.
        tar -cf $src -T /dev/null
        if [ ! -f $src ]; then
            echo "ERROR: cannot create empty tar archive ($src)" | tee -a $LOG
        fi
    else
        echo "ERROR: old archive exists ($src)" | tee -a $LOG
    fi
}
41 |
zge_compress(){
    # Submit one SGE batch job (xz.sge) per three-letter extension-id
    # prefix directory; each job xz-compresses the rotated tars below it.
    # NOTE(review): "zge" looks like a typo for "sge"; kept as-is because
    # the COMPRESS dispatch below calls it by this name.
    mkdir -p $LOG.dir
    find $ARCHIVE/data/ \
         -type d \
         -name "[a-p][a-p][a-p]" \
         -exec qsub -o $LOG.dir `dirname $SELF`/xz.sge {} \;
}
49 |
main(){
    # Rotate every extension archive: the 32 x [a-p] pattern matches the
    # Chrome extension-id naming of the tar files.  Each hit re-invokes
    # this script with the MOVE action (so failures are isolated per file).
    find $ARCHIVE/data/ \
         -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].tar" \
         -exec $SELF MOVE $ARCHIVE $LOG {} \;
}

# Action dispatch (default: MAIN, see preamble).
case "$ACTION" in
    MAIN)
        main;;
    MOVE)
        next_generation $SRC;;
    COMPRESS)
        zge_compress;;
esac
64 |
--------------------------------------------------------------------------------
/scripts/maintainance/xz.sge:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -V
#$ -l rmem=2G
#$ -j yes
# SGE job body: xz-compress every rotated extension archive
# (<32-char [a-p] id>.NNN.tar) below the directory passed as $1.
set -o nounset
set -x

find $1 \
     -name "[a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p][a-p].[0-9][0-9][0-9].tar" \
     -exec xz {} \;
11 |
--------------------------------------------------------------------------------
/scripts/monitoring/download-report-one-week.gp:
--------------------------------------------------------------------------------
# Seven-day download report.  Reads <monitordir>/updates.csv
# (semicolon-separated, appended by global_update_monitor.sh) and renders
# PNG + PDF.  CSV columns: 1=timestamp, 4/5=target parallel/sequential
# download counts, 6/7=finished parallel/sequential downloads.
if (!exists("monitordir")) monitordir='.'
filename="updates.csv"
set terminal pngcairo size 3000,800 enhanced font 'Verdana,10'
set output monitordir."/download-report-one-week.png"

day="2018-04-01"
# basic configuration
set datafile separator ";"

set autoscale x

# plot last 7 days
set xrange [time(0) - 7*24*60*60:]

# Left axis: parallel downloads; right axis: sequential downloads.
set ytics
set yrange [0:400000]
set ylabel "Parallel Downloads"
set ytics 25000
set mytics 2
set y2range [0:4500]
set y2label "Sequential Downloads"
set y2tics 500


set grid

set xdata time
set timefmt '%Y-%m-%d %H:%M:%S'
set format x "%Y-%m-%d\n%H:%M:%S"

# Major tick every 8 hours (28800 s), minor every hour.
set xtics 28800
set mxtics 8

set style data lines
set title sprintf("Extension Downloads (Last Seven Days)")

set key horiz
set key out bot center

# for plotting only one day, one can use:
data_for_day(day,file)=sprintf("<(grep %s %s)",day, file)
data=data_for_day(day, monitordir."/".filename)

# for plotting all data
# (this assignment deliberately overrides the single-day selection above)
data=monitordir."/".filename

# Trick for plotting first derivative of data:
# x0=NaN
# y0=NaN
# replot data using (dx=$1-x0,x0=$1,$1-dx/2):(dy=$6-y0,y0=$6,dy/dx) w l notitle
# TODO: support time on x scale

# State carried between samples for the finite-difference rate curves.
x0p=NaN
y0p=NaN
x0s=NaN
y0s=NaN

# The "per Eight Hours" curves are finite-difference rates (negative
# steps clamped to 0) scaled to an eight-hour window.
plot data using 1:4 with lines dashtype 2 lt rgb "#d07b95" axes x1y1 \
     title "Parallel Downloads (Target)" ,\
     data using 1:6 with lines lw 2 dashtype 1 lt rgb "#9c416e" axes x1y1 \
     title "Parallel Downloads" ,\
     data using (dx=timecolumn(1)-x0p,x0p=timecolumn(1),timecolumn(1)-dx/2):(dy=$6-y0p,y0p=$6,dy/dx < 0 ? 0 : (8*60*60)*dy/dx) \
     with lines dashtype 2 lt rgb "#622a55" axes x1y1 \
     title "Parallel Downloads per Eight Hours",\
     data using 1:5 with lines dashtype 2 lt rgb "#76eec6" axes x1y2 \
     title "Sequential Downloads (Target)",\
     data using 1:7 with lines lw 2 dashtype 1 lt rgb "#5ebe9e" axes x1y2 \
     title "Sequential Downloads",\
     data using (dx=timecolumn(1)-x0s,x0s=timecolumn(1),timecolumn(1)-dx/2):(dy=$7-y0s,y0s=$7,dy/dx < 0 ? 0 : (8*60*60)*dy/dx) \
     with lines dashtype 2 lt rgb "#468e76" axes x1y2 \
     title "Sequential Downloads per Eight Hours"

# Re-render the same plot as PDF.
set terminal pdfcairo size 30,8 enhanced font 'Verdana,15'
set output monitordir."/download-report-one-week.pdf"
replot
76 |
# Plot number of extensions over time (archive size is tracked via the
# target download counts, CSV columns 4/5).
set title sprintf("Size of Extensions Archive")
set terminal pngcairo size 3000,800 enhanced font 'Verdana,10'
set output monitordir."/size-of-archive.png"

set timefmt '%Y-%m-%d %H:%M:%S'
set format x "%Y-%m-%d"

set xrange ["2018-05-01":*]

set yrange [150000:400000]
set ylabel "Parallel Downloads"
set y2range [2750:4500]

# Weekly major ticks (604800 s), daily minor ticks.
set xtics 604800
set mxtics 7

# BUG FIX: the original plot command ended in a dangling " ,\" line
# continuation followed by a blank line, leaving a trailing comma that
# gnuplot rejects as an incomplete plot element.
plot data using 1:4 with lines dashtype 1 lt rgb "#d07b95" axes x1y1 \
     title "Parallel Downloads" ,\
     data using 1:5 with lines dashtype 1 lt rgb "#76eec6" axes x1y2 \
     title "Sequential Downloads",\
     data using 1:($4+$5) with lines dashtype 1 lt rgb "#000000" axes x1y1 \
     title "Total Downloads"

# Re-render the same plot as PDF.
set terminal pdfcairo size 30,8 enhanced font 'Verdana,15'
set output monitordir."/size-of-archive.pdf"
replot
107 |
108 |
--------------------------------------------------------------------------------
/scripts/monitoring/global_update_monitor.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Monitor a running global_update.sh crawl: report progress and errors,
# detect stalled downloads (mailing $USER at most once per stall), append
# one row to monitor/updates.csv and refresh the gnuplot reports.
# Usage: global_update_monitor.sh [-a|--ARCHIVE <dir>] [--kill]

set -o errexit
set -o nounset

KILL="NO"
ARCHIVE="/srv/Shared/BrowserExtensions/archive"

# Argument parsing; unknown options are silently ignored.
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
    -a|--ARCHIVE)
    ARCHIVE="$2"
    shift # past argument
    shift # past value
    ;;
    --kill)
    KILL=YES
    shift # past argument
    ;;
    *) # unknown option
    shift # past argument
    ;;
esac
done

# Newest per-run crawler log and newest global driver log (ls sorts by
# name, which is a sortable UTC timestamp).
LATESTLOG=`ls $ARCHIVE/log/*/*0.log | tail -n 1`
LATESTGLOBALLOG=`ls $ARCHIVE/log/*/*-global.log | tail -n 1`
BASEDIR=$(dirname "$0")
31 |
PIDS=""
echo "# Checking update status"
# Detect running global_update.sh instances; in --kill mode, terminate
# their child processes and any crawler started from ExtensionCrawler.
if ps u -C global_update.sh > /dev/null; then
    NUM=`ps u -C global_update.sh | tail -n +2 | wc -l`
    echo "* $NUM instances of global_update.sh still running (WARNING)"
    PIDS=`ps u -C global_update.sh | tail -n +2 | awk '{print $2}' | xargs`
    echo " Running PIDs: $PIDS"
    if [[ "$KILL" == "YES" ]];then
        echo " KILL mode enabled, killing running global_update.sh instances"
        # BUG FIX: pkill -P expects a *comma*-separated list of parent
        # pids; with the space-separated $PIDS only the first pid was
        # honoured and the remaining ones were misparsed as a pattern.
        PPIDLIST="${PIDS// /,}"
        echo " (executing pkill -9 -P $PPIDLIST)"
        pkill -9 -P "$PPIDLIST"
        pkill -f "ExtensionCrawler//crawler "
    fi
else
    echo "* global_update.sh not running"
    NUM=0
fi
49 |
echo "* current status"
# The '$' is a regex end-of-line anchor: parallel workers end the line
# right after "Updating extension", sequential ones continue with "(".
# NOTE(review): inferred from the two patterns — confirm against the
# crawler's log format.
PDOWNLOADS=`grep 'Updating extension $' $LATESTLOG | wc -l`
echo " * parallel downloads finished: $PDOWNLOADS"
SDOWNLOADS=`grep 'Updating extension (' $LATESTLOG | wc -l`
echo " * sequential downloads finished: $SDOWNLOADS"
echo " * Updating info from log ($LATESTLOG):"
grep 'Updating .* extensions' $LATESTLOG | sed -e 's/^.*---//'

echo ""
echo "## Latest log:"
cat $LATESTGLOBALLOG

EXTENSIONS=`grep "Updating db" $LATESTLOG | wc -l`

# Exceptions/errors de-duplicated on the 5th whitespace-separated field,
# then ordered by the 3rd.
WE=`grep WorkerException $LATESTLOG | sort -k 5,5 -u | wc -l`
echo "## Worker Exceptions: $WE (out of $EXTENSIONS)"
grep WorkerException $LATESTLOG | sort -k 5,5 -u | sort -k 3,3

ERRORS=`grep ERROR $LATESTLOG | sort -k 5,5 -u | wc -l`
echo "## ERROR LOG: $ERRORS (out of $EXTENSIONS)"
grep ERROR $LATESTLOG | sort -k 5,5 -u | sort -k 3,3

echo "# Server utilization"
top b -n 1 | head -n 15

DATE=`date --utc +%Y-%m-%d`
TIME=`date --utc +%H:%M:%S`

# Extract "<total>;<with-forums>" extension counts from the first
# "Updating ... extensions" line of the crawler log.
EXTS=`grep 'Updating .* extensions' $LATESTLOG \
    | head -1 \
    | sed -e 's/^.* (//'  \
          -e 's/ including forums, / /' \
          -e 's/ excluding forums.*/ /g' \
    | awk '{print $2";"$1}'`

if [[ "$EXTS" == "" ]]; then
    EXTS=";"
fi

# Previous CSV row, used for stall detection: fields are double-quoted,
# so the values sit at the even cut positions when splitting on '"'.
LASTPDOWNLOADS=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f8`
LASTSDOWNLOADS=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f10`
LASTMAIL=`tail -1 $ARCHIVE/monitor/updates.csv | cut -d'"' -f14`

# Send a stall warning at most once: only when the crawler is running,
# the download counters did not change since the last sample, and no
# mail was sent for this stall yet (MAIL flag persisted in the CSV).
if [[ "$NUM" == "0" ]]; then
    MAIL=0
else
    if [[ "$LASTPDOWNLOADS$LASTSDOWNLOADS" == "$PDOWNLOADS$SDOWNLOADS" ]]; then
        if [[ "$LASTMAIL" == "0" ]]; then
            echo "" | /usr/bin/mail -s "Extension Download Stalled!" ${USER:-root};
        fi;
        MAIL=1;
    else
        MAIL=0;
    fi
fi

# RAM and swap figures (total used free) as quoted CSV fields.
MEM=`free | tail -2 | awk '{print $2 " " $3 " " $4}' | xargs | sed -e 's/ /\";\"/g'`

# Append one CSV row and refresh the plots.
echo "\"$DATE $TIME\";\"$NUM\";\"$PIDS\";$EXTS;\"$PDOWNLOADS\";\"$SDOWNLOADS\";\"$ERRORS\";\"$MAIL\";\"$MEM\"" >> $ARCHIVE/monitor/updates.csv
gnuplot -e "monitordir='$ARCHIVE/monitor'" $BASEDIR/download-report-one-week.gp
110 |
111 |
--------------------------------------------------------------------------------
/scripts/singularity/ExtensionCrawler.def:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Copyright 2017 The University of Sheffield, UK
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 |
17 | bootstrap:debootstrap
18 | OSVersion: testing
19 | MirrorURL: https://deb.debian.org/debian
20 |
21 | %labels
22 | Maintainer The LogicalHacking Team (https://logicalhacking.com)
23 |
24 | %setup
25 |
26 | %post
27 |
28 | ###################################################################
29 | # Add Debian unstable as a secondary (lower priority) source
30 | # and update the data base of available packages.
31 | cat >> /etc/apt/sources.list << EOF
32 | deb http://ftp.us.debian.org/debian unstable main
33 | EOF
34 |
35 | cat > /etc/apt/preferences << EOF
36 | Package: *
37 | Pin: release a=testing
38 | Pin-Priority: 900
39 |
40 | Package: *
41 | Pin: release a=unstable
42 | Pin-Priority: 800
43 | EOF
44 |
45 | cat > /etc/apt/apt.conf.d/01norecommend << EOF
46 | APT::Install-Recommends "0";
47 | APT::Install-Suggests "0";
48 | EOF
49 |
50 | chmod go+r /etc/apt/preferences
51 | apt-get update
52 | ###################################################################
53 |
54 | ###################################################################
55 | # Add hook for apt that removes various files after installation
56 | # that are not needed at runtime.
57 | cat > /etc/apt/apt.conf.d/99-clean << EOF
58 | DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };
59 | APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };
60 | Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache "";
61 | EOF
62 | ###################################################################
63 |
64 | ###################################################################
65 | # Configure locales
66 | apt-get install -y locales
67 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
68 | echo "en_GB.UTF-8 UTF-8" >> /etc/locale.gen
69 | locale-gen
70 | echo "LANG=en_US.UTF-8" > /etc/default/locale
71 | ###################################################################
72 |
73 | ###################################################################
74 | # Install the core dependencies (Python 3.6 or later)
75 | # from the Debian Testing repository
76 | apt-get install -y --no-install-recommends libpython3.7-dev python3-magic python3-minimal python3-pip python3-setuptools python3-mysqldb g++ git libmariadb-dev-compat
77 | apt-get clean
78 | rm -rf /var/lib/apt/lists/*
79 | ###################################################################
80 |
81 | ###################################################################
82 | # Create /opt for local software (mainly cloned git repositories
# from logicalhacking.com)
84 | mkdir -p /opt
85 | chmod 755 /opt
86 | ###################################################################
87 |
88 | ###################################################################
89 | # Add the Extension Crawler repository, for more details, visit
90 | # https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler
91 | cd /opt
92 | git clone https://git.logicalhacking.com/BrowserSecurity/ExtensionCrawler.git
93 | cd ExtensionCrawler
94 | git checkout production
95 | cd ..
96 | pip3 install wheel # simhash needs wheel to build properly, still works without it though
97 | pip3 install --system -e ExtensionCrawler
98 | cd /
99 | chmod -R go+u-w /opt/ExtensionCrawler
100 | chmod -R go+u-w /usr/local/lib/
101 | chmod -R go+u-w /usr/local/bin/
102 | ###################################################################
103 |
104 | ###################################################################
# Clone cdnjs repository or create link to external archive dir
106 | ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive
107 | case ${SINGULARITY_IMAGE} in
108 | *-cdnjs.img)
109 | mkdir -p /opt/archive/filedb
110 | cd /opt/archive/filedb
111 | git clone https://github.com/cdnjs/cdnjs.git cdnjs-git
112 | cd cdnjs-git
113 | git pull
114 | ln -s ${ARCHIVE}/conf . > /dev/null
115 | ln -s ${ARCHIVE}/data > /dev/null
116 | ln -s ${ARCHIVE}/log > /dev/null
117 | ;;
118 | *)
119 | cd /opt/
120 | ln -s ${ARCHIVE} .
121 | ;;
122 | esac
123 | chmod -R go+u /opt
124 | ###################################################################
125 |
126 | ###################################################################
127 | # Create mount/bind points for the various network drives
128 | # on SHARC (only useful when using the Singularity image on
129 | # the High-Performance Cluster of The University of Sheffield
130 | mkdir /scratch
131 | mkdir /fastdata
132 | mkdir /data
133 | mkdir /shared
134 |
135 | # Create nvidia driver directories to get rid of the singularity
136 | # warnings on sharc
137 | mkdir /nvbin
138 | mkdir /nvlib
139 | chmod go+u-w /scratch /fastdata /data /shared
140 | ###################################################################
141 |
142 | ###################################################################
143 | # Manual clean-up and removal of not strictly necessary directories
144 | yes | apt purge g++
145 | yes | apt autoremove
146 | rm -rf /usr/share/doc || true
147 | ###################################################################
148 |
149 | %environment
150 |
151 | export EXTENSION_ARCHIVE=/opt/archive
152 | export PATH=/opt/ExtensionCrawler/:${PATH}
153 |
154 | # We install all python modules into the container, so we do not want
155 | # to use any packages that the user might have installed in their home
156 | # directory.
157 | export PYTHONNOUSERSITE=1
158 |
159 | %runscript
160 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
161 | # this text will get copied to /singularity and will run whenever the container
162 | # is called as an executable
163 | usage() {
164 | cat < "$LOG" 2>&1
89 | sudo singularity image.expand --size ${BASESIZE} --writable ${IMAGE} ${BASE}.def > "$LOG" 2>&1
90 | else
91 | echo "Creating read-only $IMAGE using ${BASE}.def"
92 | sudo singularity build ${IMAGE} ${BASE}.def > "$LOG" 2>&1
93 | fi
94 |
# Abort if singularity did not actually produce the image file.
if [ ! -f $IMAGE ]; then
    echo "Image (${IMAGE}) creation failed!"
    exit 1
else
    echo "Image (${IMAGE}) creation successful!"
fi

# Optionally install the image into $BINDIR, keeping the previously
# installed image as a .bak backup.
if [ "$INSTALL" = "true" ]; then
    if [ -f $BINDIR/$IMAGE ]; then
        mv $BINDIR/$IMAGE $BINDIR/$IMAGE.bak
    fi
    echo "Installing ${IMAGE} into $BINDIR"
    mv $IMAGE $BINDIR
fi
109 |
--------------------------------------------------------------------------------
/scripts/singularity/singularitybuilder-arch.Dockerfile:
--------------------------------------------------------------------------------
# Builder image: Arch Linux with Singularity compiled from source, used
# to build Singularity images via `docker run ... singularity build`.
FROM archlinux/base

# Singularity release to build.
ARG version=2.6.1

# Select UK HTTPS mirrors (uncommenting every mirrorlist entry) and
# install the build tool-chain plus the dependencies required by
# Singularity's debootstrap bootstrap driver.
RUN curl -o /etc/pacman.d/mirrorlist "https://www.archlinux.org/mirrorlist/?country=GB&protocol=https&use_mirror_status=on" &&\
    sed -i 's/^#//' /etc/pacman.d/mirrorlist &&\
    pacman --noconfirm -Syyu base-devel wget python squashfs-tools debootstrap

# Build and install Singularity from the upstream release tarball.
# NOTE(review): the trailing `sudo` is presumably redundant — docker RUN
# already executes as root.
RUN mkdir /tmp/singularity &&\
    cd /tmp/singularity &&\
    wget "https://github.com/singularityware/singularity/releases/download/${version}/singularity-${version}.tar.gz" &&\
    tar -xvzf singularity-${version}.tar.gz &&\
    cd singularity-${version} &&\
    ./configure --prefix=/usr/local &&\
    make &&\
    sudo make install
17 |
--------------------------------------------------------------------------------
/scripts/singularity/singularitybuilder-arch.sh:
--------------------------------------------------------------------------------
#!/usr/bin/bash
# Build a Singularity image from a definition file inside the
# singularitybuilder-arch Docker container.
# Usage: singularitybuilder-arch.sh <image-file> <definition-file>
set -o errexit
set -o nounset

if [ "$#" -lt 2 ]; then
    # BUG FIX: the usage line had lost its argument placeholders.
    echo "Usage: $0 <image-file> <definition-file>"
    exit 1
fi

IMGFILE=$(realpath "$1")
IMGDIR=$(dirname "$IMGFILE")
DEFFILE=$(realpath "$2")
DEFDIR=$(dirname "$DEFFILE")

# Remove any stale image before rebuilding.
if [ -f "$IMGFILE" ]; then
    rm "$IMGFILE"
fi

# NOTE(review): the Dockerfile is resolved relative to the *current*
# directory, so this script must be run from scripts/singularity/.
docker build --tag=singularitybuilder-arch -f singularitybuilder-arch.Dockerfile .
docker run -v "$IMGDIR:$IMGDIR" -v "$DEFDIR:$DEFDIR" --privileged singularitybuilder-arch:latest singularity build "$IMGFILE" "$DEFFILE"
21 |
--------------------------------------------------------------------------------
/scripts/update/global_update.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Nightly update of the browser-extension archive; intended to run from
# cron after the production checkout has been refreshed, e.g.:
# m h dom mon dow command
# 15 01 * * * (cd ~/ExtensionCrawler; ((git fetch ; git checkout production; git pull) &> /dev/null))
# 07 02 * * * ~/ExtensionCrawler/scripts/global_update.sh

ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive}
CRAWLERHOME=${2:-~/ExtensionCrawler}
IMAGE=${3:-/shared/brucker_research1/Shared/BrowserExtensions/bin/ExtensionCrawler.img}
# Monthly log directory; file names carry a file-name-safe UTC timestamp.
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=$LOGPREFIX-global.log

date --utc +'* Start Updating Extensions Archive (%c)' | tee $LOG

# Update extensions
# Crawler stdout goes to $LOGPREFIX.log; its stderr is timestamped via
# ts(1) and captured in $LOGPREFIX-stderr.log.
(singularity exec --bind /srv/:/srv/ $IMAGE crawler -p 32 -d --pystuck -a $ARCHIVE > $LOGPREFIX.log ) |& ts '%Y-%m-%dT%H:%M:%S' | tee $LOGPREFIX-stderr.log

date --utc +'* Update Finished (%c)' | tee -a $LOG


# Summarise unique errors (de-duplicated on the 5th whitespace-separated
# field, then ordered by the 3rd).
ERRORS=`grep ERROR $LOGPREFIX.log | sort -k 5,5 -u | wc -l`
EXTENSIONS=`grep "Updating db" $LOGPREFIX.log | wc -l`
echo "ERROR LOG: $ERRORS (out of $EXTENSIONS)"
echo "=========="
grep ERROR $LOGPREFIX.log | sort -k 5,5 -u | sort -k 3,3
27 |
--------------------------------------------------------------------------------
/scripts/update/update_cdnjs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Update the cdnjs mirror and its database table, working inside the
# writable ExtensionCrawler-cdnjs Singularity image.

ARCHIVE=${1:-/srv/Shared/BrowserExtensions/archive}
TMPDIR=${TMPDIR:-/tmp}

# Monthly log directory; file names carry a file-name-safe UTC timestamp.
LOGDIR=$ARCHIVE/log/`date --utc +"%Y-%m"`
mkdir -p $LOGDIR
LOGPREFIX=$LOGDIR/`date --utc --iso-8601=ns | sed -e 's/:/_/g' -e 's/,/./'`
LOG=$LOGPREFIX-cdnjs.log

SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img
date --utc +'* Create backup of disk image (%c)' | tee -a $LOG
cp $SING_IMG $SING_IMG.bak
# -w: writable image (the git operations below modify it); bind $TMPDIR
# over the container's /tmp.
SING_EXEC="singularity exec -w --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG"
# Fail early with a visible error if the image is missing.
ls "$SING_IMG" > /dev/null

# Update production branch of WebCrawler repository
date --utc +'* Updating WebCrawler repository (%c)' | tee -a $LOG
$SING_EXEC git fetch >> $LOG
$SING_EXEC git checkout production >> $LOG 2>&1
$SING_EXEC git pull >> $LOG 2>&1
# $SING_EXEC pip3 install --system -e ../ExtensionCrawler

# Update cdnjs git repository and update the cdnjs database table
date --utc +'* Updating CDNJS repository (%c)' | tee -a $LOG
$SING_EXEC ./cdnjs-git-miner -v -u -a /opt/archive >> $LOG
date --utc +'* Successfully updated CDNJS repository (%c)' | tee -a $LOG
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Setuptools configuration for the ExtensionCrawler utilities."""

from setuptools import setup


def _read_requirements(path='requirements.txt'):
    """Return the requirement specifiers listed in *path*.

    Blank lines and ``#`` comment lines are skipped so that an annotated
    requirements.txt does not inject bogus entries into
    ``install_requires`` (the original ``read().splitlines()`` kept them).
    """
    with open(path) as f:
        return [
            line.strip()
            for line in f
            if line.strip() and not line.lstrip().startswith('#')
        ]


requirements = _read_requirements()

setup(
    name='Extension Crawler',
    description='A collection of utilities for downloading and analyzing browser extension from the Chrome Web store.',
    author='Achim D. Brucker, Michael Herzberg',
    license='GPL 3.0',
    install_requires=requirements
)
13 |
--------------------------------------------------------------------------------
/sge/create-db-cdnjs.sge:
--------------------------------------------------------------------------------
#!/bin/bash
#$ -V
#$ -l rmem=4G
#$ -t 1-10000
#$ -j yes
#$ -o /shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/log
# SGE array job (10000 tasks): index the cdnjs file database.  Each task
# processes slice $SGE_TASK_ID of 10000 inside the writable Singularity
# image; extra arguments are passed through to cdnjs-git-miner.
set -o nounset
set -x

SING_IMG=/shared/brucker_research1/Shared/BrowserExtensions/archive/filedb/ExtensionCrawler-cdnjs.img

# -w: writable image; bind the node-local $TMPDIR over the container /tmp.
SING_EXEC="singularity exec -w --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp $SING_IMG"

# Log environment and arguments to ease debugging of failed tasks.
printenv
echo "The following parameter were passed: $*"
# Fail early with a visible error if the image is missing.
ls "$SING_IMG" > /dev/null

/usr/bin/time $SING_EXEC ./cdnjs-git-miner -v -p 1 -i -a /opt/archive -n $SGE_TASK_ID -N 10000 $*
20 |
--------------------------------------------------------------------------------
/sge/create-db.sge:
--------------------------------------------------------------------------------
#!/bin/bash
# SGE array-task body: run create-db for slice $SGE_TASK_ID inside the
# create-db.img Singularity image.  Submitted (with its arguments) by
# sge/create-db.sh from the job's remote target directory.
set -o nounset
set -o errexit

# Log the environment to ease debugging of failed tasks.
printenv

(set -x; /usr/bin/time singularity exec --pwd /opt/ExtensionCrawler -B $TMPDIR:/tmp create-db.img create-db -t 1 -n $SGE_TASK_ID $*)
8 |
--------------------------------------------------------------------------------
/sge/create-db.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build a Singularity image, push it to the ShARC cluster together with
# the extension-id list, and submit array jobs that populate the database
# from the extension archive.
set -o nounset
set -o errexit

REMOTE_ARCHIVE=/shared/brucker_research1/Shared/BrowserExtensions/archive
# "\$USER" stays literal here; it is expanded by the remote shell.
REMOTE_TARGET_DIR_PREFIX=/data/\$USER
NUM_THREADS=48
SGE_EXTRA_ARGS='-P rse -l h_rt=01:00:00,rmem=4G,h=\!sharc-node126 -j yes'
PY_EXTRA_ARGS=''
EXTENSION_IDS=
11 |
usage() {
    # Print the option summary.  BUG FIX: the argument placeholders had
    # been lost from the help text; restored to match the getopts spec
    # ":a:t:s:p:m:e:l:" below (every option takes a value).
    echo "Usage:"
    echo "  -a <archive-path>  (set archive path, default: ${REMOTE_ARCHIVE})"
    echo "  -t <target-dir>    (set target directory, default: ${REMOTE_TARGET_DIR_PREFIX})"
    echo "  -m <num-threads>   (set degree of parallelism, default: ${NUM_THREADS})"
    echo "  -s \"<qsub-args>\"   (add qsub arguments, default: ${SGE_EXTRA_ARGS})"
    echo "  -p \"<script-args>\" (add python script arguments, default: ${PY_EXTRA_ARGS})"
    echo "  -e <id-file>       (set path to extension id list, default: crawl from archive)"
    echo "  -l <max-tasks>     (limit number of sharc tasks, default: number of extensions)"
}
22 |
# Option parsing (see usage above).
while getopts ":a:t:s:p:m:e:l:" o; do
    case "${o}" in
        a)
            REMOTE_ARCHIVE=${OPTARG}
            ;;
        t)
            REMOTE_TARGET_DIR_PREFIX=${OPTARG}
            ;;
        m)
            NUM_THREADS=${OPTARG}
            ;;
        s)
            SGE_EXTRA_ARGS+=" ${OPTARG}"
            ;;
        p)
            PY_EXTRA_ARGS+=" ${OPTARG}"
            ;;
        e)
            EXTENSION_IDS="${OPTARG}"
            ;;
        l)
            MAX_TASKS="${OPTARG}"
            ;;
        *)
            usage
            exit 1
            ;;
    esac
done

shift $((OPTIND-1))

# Repository root (this script lives in <root>/sge/).
BASEDIR=$( cd $(dirname "$0"); cd ..; pwd -P )
TEMP_FOLDER=$(mktemp -d)
# Fresh, timestamped working directory on the cluster.
TARGETDIR="${REMOTE_TARGET_DIR_PREFIX}/create-db-$(date +%Y%m%d-%H%M%S)"

echo "Using target dir: $TARGETDIR"
ssh sharc.shef.ac.uk mkdir -p $TARGETDIR/logs

echo "Pushing sge script ..."
scp "$BASEDIR/sge/create-db.sge" sharc.shef.ac.uk:"$TARGETDIR/create-db.sge"

echo "Building image..."
if [ -f "$BASEDIR/scripts/singularity/create-db.img" ]; then
    rm -f "$BASEDIR/scripts/singularity/create-db.img"
fi
(
    cd "$BASEDIR/scripts/singularity"
    # Build the builder container only if it does not exist yet.
    # BUG FIX: the image was built with --tag=singularitybuilder while
    # both the existence check above and the `docker run` below use
    # "singularitybuilder-arch" (as does singularitybuilder-arch.sh), so
    # a cold build produced the wrong tag and the run failed.
    if [[ "$(docker images -q singularitybuilder-arch 2> /dev/null)" == "" ]]; then
        docker build --tag=singularitybuilder-arch -f singularitybuilder-arch.Dockerfile .
    fi
    docker run -it -v "$(pwd):$(pwd)" -w "$(pwd)" --privileged singularitybuilder-arch:latest singularity build create-db.img ExtensionCrawler.def
)

echo "Pushing image..."
scp "$BASEDIR/scripts/singularity/create-db.img" sharc.shef.ac.uk:"$TARGETDIR/create-db.img"


# Extension ids: either crawl the remote archive for *.tar names (the
# 32-char [a-p] ids) or use the list supplied with -e.
if [[ -z $EXTENSION_IDS ]]; then
    echo "Gathering extension IDs..."
    ssh sharc.shef.ac.uk find "${REMOTE_ARCHIVE}/data" -name "*.tar" | grep -Po "[a-p]{32}" > ${TEMP_FOLDER}/extension.ids
else
    cp "$EXTENSION_IDS" ${TEMP_FOLDER}/extension.ids
fi

NO_IDS=$(cat ${TEMP_FOLDER}/extension.ids | wc -l)

echo "Found $NO_IDS IDs!"
if [ "$NO_IDS" = 0 ]; then
    echo "Nothing to do!"
    exit 0
fi

echo "Pushing extension IDs..."
scp ${TEMP_FOLDER}/extension.ids sharc.shef.ac.uk:$TARGETDIR/
98 |
# Without -l, run one task per extension id.
# BUG FIX: the original assigned the literal string (MAX_TASKS=NO_IDS,
# missing "$").  Bash arithmetic below happened to resolve the name
# recursively, but the plain "-N $MAX_TASKS" expansion passed the text
# "NO_IDS" to create-db.sge.
if [[ ! -v MAX_TASKS ]]; then
    MAX_TASKS=$NO_IDS
fi

# Split into batches of at most ~75000 array tasks each (presumably the
# scheduler's array-size limit — confirm), sharing $NUM_THREADS slots.
NO_BATCH_JOBS=$(((MAX_TASKS+1)/75000+1))
JOBS_PER_BATCH=$((MAX_TASKS/NO_BATCH_JOBS+1))

for run_no in $(seq 1 $NO_BATCH_JOBS); do
    FIRST_ID=$(((run_no-1) * $JOBS_PER_BATCH + 1))
    LAST_ID=$((run_no * $JOBS_PER_BATCH))

    echo "Starting job $run_no ..."
    (set -x; ssh sharc.shef.ac.uk qsub \
        -tc $((NUM_THREADS/NO_BATCH_JOBS)) \
        -t ${FIRST_ID}-${LAST_ID} \
        -wd "$TARGETDIR" \
        -o "$TARGETDIR/logs" \
        ${SGE_EXTRA_ARGS} \
        "$TARGETDIR/create-db.sge" -a "$REMOTE_ARCHIVE" -e "${TARGETDIR}/extension.ids" -N $MAX_TASKS ${PY_EXTRA_ARGS})
done
119 |
--------------------------------------------------------------------------------