├── .babelrc ├── .eslintignore ├── .eslintrc.yml ├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── bin └── launch-fetcher ├── browsinglab ├── __init__.py ├── cli.py ├── connector.py ├── connlist.py ├── db.py ├── subenvvars.py └── urlcol.py ├── dev-requirements.txt ├── docs ├── LICENSE ├── activity-schema.md ├── future-design.md └── screencast-fetcher.gif ├── extension ├── .eslintrc.js ├── activityTracker.js ├── backgroundOnMessage.js ├── browser-polyfill.js ├── browserId.js ├── buildSettings.js.tmpl ├── catcher.js ├── communication.js ├── contentLoader.js ├── contentWatcher.js ├── controller.js ├── controls │ ├── popup.css │ ├── popup.html │ └── popup.jsx ├── elementToSelector.js ├── icon-live.svg ├── icon.svg ├── log.js ├── manifest.json ├── rssFinder.js ├── scraper │ ├── Readability.js │ ├── extractor-worker.js │ ├── make-static-html.js │ └── scrapeTab.js └── util.js ├── install.sh ├── package.json ├── python ├── README.md ├── analyze_classnames.ipynb ├── document_summary.ipynb ├── named_entities.ipynb ├── nn_readable.ipynb ├── pha │ ├── __init__.py │ ├── __main__.py │ ├── glovehelper.py │ ├── htmltools.py │ ├── notebooktools.py │ ├── saver.py │ ├── schema.sql │ ├── search.py │ ├── searchquery.py │ └── summarytools.py ├── requirements.txt ├── search_example.ipynb └── setup.py ├── setup.py └── test ├── .eslintrc.js ├── commands.js ├── driver-setup.js ├── random-walk.js ├── static ├── blank.html ├── debug.html ├── search-destination.html ├── search-results.html ├── search.html └── style.css ├── test-utils.js ├── test.js └── walk-configs ├── default.json └── news.json /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": ["transform-react-jsx"], 3 | } 4 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | /extension/scraper/Readability.js 2 | /test/test-data 3 | /dev-data 4 | /walk-data 5 | /extension/browser-polyfill.js 6 | /StickyProfile 7 | /Profile 8 | build 9 | build-walk 10 | -------------------------------------------------------------------------------- /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | env: 2 | browser: true 3 | es6: true 4 | node: true 5 | 6 | extends: 7 | - eslint:recommended 8 | - plugin:react/recommended 9 | - plugin:mozilla/recommended 10 | 11 | parserOptions: 12 | ecmaVersion: 8 13 | sourceType: module 14 | 15 | plugins: 16 | - mozilla 17 | - promise 18 | - react 19 | 20 | root: true 21 | 22 | rules: 23 | consistent-return: error 24 | eqeqeq: error 25 | no-console: warn 26 | prefer-const: off # TODO: change to "error" 27 | quotes: [error, double] 28 | 29 | promise/always-return: off 30 | promise/avoid-new: off 31 | promise/catch-or-return: error 32 | promise/no-callback-in-promise: off 33 | promise/no-native: off 34 | promise/no-nesting: off 35 | promise/no-promise-in-callback: off 36 | promise/param-names: error 37 | react/prop-types: off 38 | 39 | settings: 40 | react: 41 | version: 16 42 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=E501 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | /node_modules 3 | /Profile 4 | 
/StickyProfile 5 | /pages 6 | /jobs 7 | *.sqlite 8 | /package-lock.json 9 | .DS_Store 10 | /python/pha.egg-info 11 | .ipynb_checkpoints 12 | /.vscode 13 | tmp 14 | /python/data 15 | build 16 | /test/test-data 17 | /addon.log 18 | /dev-data 19 | /data 20 | /.venv 21 | /test/build-walk 22 | /walk-data 23 | __pycache__ 24 | /blab 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # personal-history-archive 2 | 3 | Creating a dump of your personal browser history for analysis. This is a tool for people who want to research browsing behavior and content, starting with the only dataset you'll really be able to create: data about yourself. 4 | 5 | ## Motivation 6 | 7 | This is for creating a *browsing corpus* for later analysis. It's not a feasible end-user tool, and it collects information that can't normally be shared. But if you are interested in browsing behavior and web content analysis, then this is the package for you! 8 | 9 | The data collected here is specifically what you see and do via the browser. Unlike spidering or fetching documents via the command-line, you get fully rendered and personalized pages. This will help you include information in your corpus that specifically isn't available on the open web. 10 | 11 | ## Features 12 | 13 | Using this tool you can: 14 | 15 | * Extract your history from multiple browsers into a database 16 | * Fetch high quality versions of your history items: 17 | * Get frozen pages from the browser (no worries about JavaScript) 18 | * Fetch pages using your cookies and authentication (get personal and personalized versions of pages) 19 | * All HTML is well-formed, links are made absolute 20 | * HTML can be re-rendered easily 21 | * The frozen HTML has additional annotations to make it easier to interpret: 22 | * Hidden elements are marked as such 23 | * Elements whose `display` style is changed are marked as such (useful if you want to look for any block-like element) 24 | * The [Readability](https://github.com/mozilla/readability) library is used to extract a "readable" form 25 | * Elements in the original document that form the readable view are marked as such 26 | * The natural/rendered sizes of images are included 27 | * A first-page screenshot is taken, and a full-length thumbnail 28 | * Track ongoing browsing; collecting additional information not in normal browsing history: 29 | * Reliably track what page leads to the next page 30 | * Track what link click lead to the next page 31 | * Track how often and for how long the page was the active tab 32 | * [And more!](./docs/activity-schema.md) 33 | * A [Python library](./python/#readme) is included to help interpret your results: 34 | * Load and query history items and pages 35 | * Parse pages (using [lxml](http://lxml.de/)) 36 | * A [growing list of miscellany](./python#helpers)... 37 | 38 | ## Examples 39 | 40 | ## Overview 41 | 42 | This consists of two parts: 43 | 44 | * A [browser extension](./extension#readme) (for Firefox and Chrome) to save your history and activity 45 | * A [python library](./python#readme) to use and analyze the history 46 | 47 | ## Installation 48 | 49 | You must check out this repository to use the package. 50 | 51 | Run `npm install` to install the necessary packages, and to setup the Python **3** environment. 
(A virtualenv environment is created in `.venv/`) 52 | 53 | After installation you must restart your Firefox browser (Chrome support is iffy right now), go to `about:debugging` and manually install the extension from `build/extension/` 54 | 55 | Data will begin to be collected in `data/` 56 | 57 | 58 | ## Fetching history 59 | 60 | ![image](./docs/screencast-fetcher.gif) 61 | 62 | Once you have history uploaded, you may want to fetch static versions of your old history (from before you installed the extension). 63 | 64 | **Note:** these instructions are incorrect, and need updating after [#57](https://github.com/ianb/personal-history-archive/issues/57) is fixed. 65 | 66 | Use `./bin/launch-fetcher` to launch a Firefox instance dedicated to that fetching. Probably use `./bin/launch-fetcher --use-profile "Profile Name"` to use a *copy* of an existing profile (after doing that once, the profile copy will be kept for later launches). You'll want to use a profile that is logged into your services, so that you can get personalized versions of your pages. 67 | 68 | The page `http://localhost:11180/` will be loaded automatically in the fetcher browser instance, and that lets you start fetching pages. 69 | 70 | You may want to review `http://localhost:11180/viewer/redirected` to see pages that get redirects. These are often pages that required missing authentication. You can login to the pages, then delete the fetched page so it can be re-fetched. 71 | 72 | ## Python library 73 | 74 | There's a Python **3** library in [the `python/` subdirectory](https://github.com/ianb/personal-history-archive/tree/master/python). It gets automatically installed into the `.venv/` virtualenv, but you could install it elsewhere too. 75 | 76 | You can install it like: 77 | 78 | ```sh 79 | $ cd python 80 | $ pip install -e . 81 | # Optional packages: 82 | $ pip install -r requirements.txt 83 | ``` 84 | 85 | This adds a package called `pha`. There is some information [in the subdirectory](python/), and the notebooks (`*.ipynb`) show many examples (though as of March 2018, they are out of date due to refactorings). 86 | 87 | ## Random walk 88 | 89 | There's a script that will do random activity in the browser, saving data to `test/walk-data/`. Run: 90 | 91 | ```sh 92 | $ npm run walk 93 | # Or if you want to try a configuration in test/walk-configs/news.json that goes to news sites: 94 | $ CONFIG=news npm run walk 95 | ``` 96 | 97 | ## Testing 98 | 99 | The tests are in [`test/`](./test/). To run the tests: 100 | 101 | ```sh 102 | $ npm test 103 | ``` 104 | 105 | You can use `NO_CLOSE=1` to leave the browser open after the test completes (this can be helpful to understand failures). Use `TEST_ARGS="..."` to add [Mocha command-line arguments](https://mochajs.org/#usage) such as `TEST_ARGS='-g 404s' npm test` to run tests with "404s" in the test description. 106 | 107 | The temporary data will be in `test/test-data/` and you may find `test/test-data/addon.log` particularly interesting, as the Browser Console isn't very accessible from the test environment. 108 | 109 | ## Development 110 | 111 | If you want to run it interactively in a fresh profile, use: 112 | 113 | ```sh 114 | $ npm start 115 | ``` 116 | 117 | This will run a new browser profile, with data going into `dev-data/` (and logs in `dev-data/addon.log`). Changes are not automatically picked up, so you have to restart the browser after changes. There is no migration, so you may have to wipe out `dev-data/` after changes to the schema. 
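If you want to poke at the collected data directly from Python, the SQLObject classes in `browsinglab/db.py` can open an archive. This is a minimal sketch, not a supported interface; it assumes the `browsinglab` package and its dependencies are installed and that your archive lives in `dev-data/` (adjust the path to wherever your data actually is):

```python
# Sketch: open an archive created by the extension and see what it holds.
from browsinglab.db import Archive, Activity, Page

archive = Archive("dev-data")   # opens dev-data/history.sqlite (creates tables if missing)
print("archive:", archive.path, "title:", archive.title)

for activity in Activity.select()[:10]:          # first ten recorded visits
    print(activity.loadTime, str(activity.url), activity.title)

print("pages stored:", Page.select().count())
```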
118 | 119 | ## Collaborating 120 | 121 | If you have a question, probably the best thing is to [open a ticket](https://github.com/ianb/personal-history-archive/issues/new). If you are interested in implementing something, it would also be great to open a ticket so we can discuss. 122 | 123 | If you'd like to chat, I've created a channel `#pha` on irc.mozilla.org. I (`ianbicking`) am usually only online during business hours, Central Time/UTC-6. 124 | 125 | ## Credits 126 | 127 | The icon comes from [Open Iconic](https://useiconic.com/open) 128 | -------------------------------------------------------------------------------- /bin/launch-fetcher: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | cd "$(dirname ${BASH_SOURCE[0]})/.." 5 | base="$(pwd)" 6 | PATH="node_modules/.bin:$PATH" 7 | webext="$base/node_modules/.bin/web-ext" 8 | 9 | binary= 10 | firefoxes=" 11 | /Applications/FirefoxNightly.app 12 | /Applications/FirefoxDeveloperEdition.app 13 | /Applications/FirefoxAurora.app 14 | $(which firefox || true) 15 | " 16 | use_profile= 17 | use_scratch= 18 | 19 | for firefox in $firefoxes ; do 20 | if [[ -e "$firefox" ]] ; then 21 | binary="$firefox" 22 | break 23 | fi 24 | done 25 | 26 | help () { 27 | echo "Usage: $(basename $0) [OPTIONS]" 28 | echo " Options:" 29 | echo " -b or --binary BINARY" 30 | echo " Use BINARY as the Firefox to run (default $binary)" 31 | echo " --use-profile PROFILE_NAME" 32 | echo " Use an existing profile based on the named profile; the profile will be copied to StickyProfile" 33 | echo " --use-scratch" 34 | echo " Use a scratch profile. Good for testing." 35 | } 36 | 37 | while [[ -n "$1" ]] ; do 38 | case "$1" in 39 | help|-h|--help) 40 | help 41 | exit 42 | ;; 43 | -b|--binary) 44 | binary="$2" 45 | shift 46 | shift 47 | ;; 48 | --use-profile) 49 | use_profile="$2" 50 | shift 51 | shift 52 | ;; 53 | --use-scratch) 54 | use_scratch=1 55 | shift 56 | ;; 57 | *) 58 | echo "Unknown option: $1" 59 | help 60 | exit 2 61 | ;; 62 | esac 63 | done 64 | 65 | if [[ -n "$use_profile" ]] ; then 66 | if [[ -e ./StickyProfile ]] ; then 67 | echo "An existing profile already exists. It was created from:" 68 | echo " $(cat ./StickyProfile/pha-orig-profile-name.txt)" 69 | echo "Do you want to overwrite it with:" 70 | echo " $use_profile" 71 | echo "(losing any changes you may have made in it)" 72 | echo -n "[y/N] ? " 73 | read answer 74 | if [[ "$answer" != "y" ]] ; then 75 | echo "Aborting." 76 | exit 77 | fi 78 | rm -rf ./StickyProfile/ 79 | fi 80 | echo "Copying profile '$use_profile' into ./StickyProfile" 81 | ./node_modules/.bin/firefox-profile -p "$use_profile" -o ./StickyProfile/ 82 | echo -n "$use_profile" > ./StickyProfile/pha-orig-profile-name.txt 83 | elif [[ -z "$use_scratch" ]] && [[ ! 
-e ./StickyProfile/ ]] ; then 84 | echo "You haven't used --use-profile before (which creates ./StickyProfile/)" 85 | echo "You must profile that option once, or use --use-scratch" 86 | echo 87 | help 88 | exit 2 89 | fi 90 | 91 | prefs="--pref=dom.webaudio.enabled=false --pref=media.autoplay.enabled=false --pref=dom.disable_beforeunload=true" 92 | 93 | for name in permissions.default.camera permissions.default.desktop-notification permissions.default.geo permissions.default.microphone permissions.default.shortcuts ; do 94 | prefs="$prefs --pref=$name=2" 95 | done 96 | 97 | for name in capability.policy.default.Window.alert capability.policy.default.Window.confirm capability.policy.default.Window.prompt ; do 98 | prefs="$prefs --pref=$name=noAccess" 99 | done 100 | 101 | 102 | run_webext() { 103 | echo "Running Firefox." 104 | if [[ -n "$use_scratch" ]] ; then 105 | $webext run $prefs --firefox "$binary" --source-dir ./extension/ --start-url http://localhost:11180/fetcher.html 106 | else 107 | echo " Using profile $(cat ./StickyProfile/pha-orig-profile-name.txt)" 108 | $webext run $prefs --firefox "$binary" --source-dir ./extension/ \ 109 | --keep-profile-changes --firefox-profile ./StickyProfile/ --start-url http://localhost:11180/fetcher.html 110 | fi 111 | } 112 | 113 | run_webext 114 | -------------------------------------------------------------------------------- /browsinglab/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ianb/personal-history-archive/68168eac7876a8827ec566fb4882a7ab5804d87b/browsinglab/__init__.py -------------------------------------------------------------------------------- /browsinglab/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import json 4 | import sys 5 | 6 | 7 | @click.group() 8 | def cli(): 9 | pass 10 | 11 | 12 | @cli.command() 13 | def install(native_name="browsinglab.connector"): 14 | """Install what is necessary for the browser connection""" 15 | # FIXME: support Windows 16 | manifest_path = os.path.abspath(os.path.join(__file__, "../../extension/manifest.json")) 17 | script_location = os.path.join(sys.prefix, "bin", "browser-connector") 18 | with open(manifest_path) as fp: 19 | manifest = json.load(fp) 20 | manifest_id = manifest["applications"]["gecko"]["id"] 21 | native_manifest = { 22 | "name": native_name, 23 | "description": "Saves information from the Browsing Lab extension", 24 | "path": script_location, 25 | "type": "stdio", 26 | "allowed_extensions": [manifest_id] 27 | } 28 | if sys.platform == "darwin": 29 | filename = os.path.expanduser("~/Library/Application Support/Mozilla/NativeMessagingHosts/%s.json" % native_name) 30 | elif sys.platform.startswith("linux"): 31 | filename = os.path.expanduser("~/.mozilla/native-messaging-hosts/%s.json" % native_name) 32 | else: 33 | raise Exception("Not a supported platform") 34 | dir = os.path.dirname(filename) 35 | if not os.path.exists(dir): 36 | os.makedirs(dir) 37 | with open(filename, "wb") as fp: 38 | fp.write(json.dumps(native_manifest, indent=2).encode("UTF-8")) 39 | click.echo("Connector installed to:") 40 | click.secho(" %s" % filename, bold=True) 41 | click.echo("Script located in:") 42 | click.secho(" %s" % script_location, bold=True) 43 | -------------------------------------------------------------------------------- /browsinglab/connector.py: -------------------------------------------------------------------------------- 1 | """ 
2 | Implements saving information into the database/files 3 | """ 4 | 5 | import os 6 | import re 7 | import json 8 | import sys 9 | import struct 10 | import time 11 | import pprint 12 | import traceback 13 | import uuid 14 | import atexit 15 | from .db import Page, Archive, Activity, ActivityLink, Browser, BrowserSession 16 | from . import connlist 17 | 18 | message_handlers = {} 19 | 20 | active_archive = None 21 | active_browser = None 22 | 23 | @atexit.register 24 | def end(): 25 | if active_browser: 26 | active_browser.connected = False 27 | 28 | 29 | def addon(func): 30 | message_handlers[func.__name__] = func 31 | return func 32 | 33 | 34 | @addon 35 | def add_history_list(archive, *, browserId, sessionId, historyItems): 36 | visits_to_ids = {} 37 | for history in historyItems.values(): 38 | for visitId, visit in history["visits"].items(): 39 | visits_to_ids[visitId] = visit["activity_id"] = str(uuid.uuid1()) 40 | for historyId, history in historyItems.items(): 41 | c = archive.conn.cursor() 42 | for visitId, visit in history["visits"].items(): 43 | c.execute(""" 44 | DELETE FROM activity WHERE browserVisitId = ? 45 | """, (visitId,)) 46 | sourceId = None 47 | if visit.get("referringVisitId"): 48 | sourceId = visits_to_ids.get(visit["referringVisitId"]) 49 | if not sourceId: 50 | c.execute(""" 51 | SELECT id FROM activity WHERE browserVisitId = ? 52 | """, (visit["referringVisitId"],)) 53 | row = c.fetchone() 54 | if row: 55 | sourceId = row.id 56 | c.execute(""" 57 | INSERT INTO activity ( 58 | id, 59 | title, 60 | browserId, 61 | sessionId, 62 | url, 63 | browserHistoryId, 64 | browserVisitId, 65 | loadTime, 66 | transitionType, 67 | browserReferringVisitId, 68 | sourceId 69 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 70 | """, ( 71 | visit["activity_id"], 72 | history["title"], 73 | browserId, 74 | sessionId, 75 | history["url"], 76 | historyId, 77 | visitId, 78 | visit["visitTime"], 79 | visit["transition"], 80 | visit["referringVisitId"], 81 | sourceId)) 82 | archive.conn.commit() 83 | c = archive.conn.cursor() 84 | c.execute(""" 85 | UPDATE browser 86 | SET 87 | newestHistory = (SELECT MAX(loadTime) 88 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL), 89 | oldestHistory = (SELECT MIN(loadTime) 90 | FROM activity WHERE browserId = ? 
AND browserHistoryId IS NOT NULL) 91 | """, (browserId, browserId)) 92 | archive.conn.commit() 93 | 94 | 95 | @addon 96 | def add_activity_list(archive, *, browserId, sessionId, activityItems): 97 | sqlBrowserId = Browser.getID(browserId) 98 | sqlSessionId = BrowserSession.getID(sessionId) 99 | for activity in activityItems: 100 | linkInformation = activity.pop("linkInformation", []) 101 | uuid = activity.pop("id") 102 | activity["browserID"] = sqlBrowserId 103 | activity.pop("sessionId", None) 104 | activity["sessionID"] = sqlSessionId 105 | activity["sourceID"] = Activity.getID(activity.pop("sourceId", None), default=None) 106 | activity["initialLoadID"] = Activity.getID(activity.pop("initialLoadId", None), default=None) 107 | a = Activity.replaceUuid(uuid, **activity) 108 | log(archive, a) 109 | ActivityLink.deleteMany(ActivityLink.activity==a) 110 | for link in linkInformation or []: 111 | link = ActivityLink(**link) 112 | 113 | 114 | @addon 115 | def check_page_needed(archive, url): 116 | return Page.urlExists(url) 117 | 118 | 119 | @addon 120 | def register_browser(archive, *, browserId, userAgent, devicePixelRatio=1): 121 | global active_browser 122 | b = Browser.replaceUuid(browserId, userAgent=userAgent, devicePixelRatio=devicePixelRatio, connected=True) 123 | active_browser = b 124 | 125 | 126 | @addon 127 | def register_session(archive, sessionId, browserId, timezoneOffset): 128 | BrowserSession.replaceUuid( 129 | sessionId, 130 | browserID=Browser.getID(browserId), 131 | timezoneOffset=timezoneOffset, 132 | startTime=int(time.time() * 1000)) 133 | 134 | 135 | @addon 136 | def add_fetched_page(archive, id, url, page): 137 | redirectUrl = page["url"].split("#")[0] 138 | origUrl = url.split("#")[0] 139 | page["originalUrl"] = url 140 | if redirectUrl == origUrl: 141 | redirectUrl = None 142 | else: 143 | redirectUrl = page["url"] 144 | if redirectUrl: 145 | # Removes the YouTube start time we add 146 | redirectUrl = redirectUrl.replace("&start=86400", "") 147 | if page.get("activityId"): 148 | page["activityId"] = Activity.getID(page["activityId"], default=None) 149 | Page.replaceUuid( 150 | id, 151 | url=url, 152 | activityId=page.get("activityId"), 153 | timeToFetch=page["timeToFetch"], 154 | redirectUrl=redirectUrl, 155 | scrapeData=page, 156 | ) 157 | 158 | def substitute_location(path): 159 | path = path.replace("__prefix__", sys.prefix) 160 | path = os.path.expanduser(path) 161 | path = os.path.abspath(path) 162 | return path 163 | 164 | @addon 165 | def set_active_archive(archive, archiveLocation): 166 | global withheld_log_messages 167 | archiveLocation = substitute_location(archiveLocation) 168 | global active_archive 169 | if active_archive: 170 | active_archive.close() 171 | active_archive = Archive(archiveLocation) 172 | if withheld_log_messages: 173 | filename = os.path.join(active_archive.path, "addon.log") 174 | with open(filename, "a") as fp: 175 | fp.write("\n".join(withheld_log_messages)) 176 | withheld_log_messages = [] 177 | return archiveLocation 178 | 179 | set_active_archive.archive_optional = True 180 | 181 | @addon 182 | def unset_active_archive(archive): 183 | global active_archive 184 | active_archive.close() 185 | active_archive = None 186 | 187 | @addon 188 | def get_archive_info(archive): 189 | if not archive: 190 | return None 191 | return {"path": archive.path, "title": archive.title} 192 | 193 | get_archive_info.archive_optional = True 194 | 195 | 196 | @addon 197 | def set_archive_title(archive, title): 198 | archive.title = title 199 | 200 | 201 
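# NOTE: an illustrative example of the message flow the handlers above take
# part in (the id and title here are hypothetical, not from the original code).
# The extension sends a length-prefixed JSON message over native messaging, e.g.:
#     {"id": 7, "name": "set_archive_title", "kwargs": {"title": "My archive"}}
# connect() below looks up message_handlers["set_archive_title"] and calls
#     set_archive_title(active_archive, title="My archive")
# then frames the reply {"id": 7, "result": None} back with send_message().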
| @addon 202 | def list_archives(archive): 203 | return connlist.list_archives() 204 | 205 | list_archives.archive_optional = True 206 | 207 | withheld_log_messages = [] 208 | 209 | @addon 210 | def log(archive, *args, level='log', stack=None): 211 | lines = [] 212 | if stack: 213 | log_location = stack.splitlines()[0] 214 | log_location = re.sub(r'moz-extension://[a-f0-9-]+/', '/', log_location) 215 | else: 216 | log_location = "" 217 | lines.append("Log/{: <5} {} {}".format(level, int(time.time() * 1000), log_location)) 218 | if len(str(args)) < 70 and len(args) > 1: 219 | args = (args,) 220 | for arg in args: 221 | if isinstance(arg, str): 222 | s = arg 223 | else: 224 | s = pprint.pformat(arg, compact=True) 225 | if isinstance(arg, tuple): 226 | s = s[1:-1] 227 | s = s.splitlines() 228 | for line in s: 229 | lines.append(" %s" % line) 230 | if not args: 231 | lines.append(" (no arguments)") 232 | text = "\n".join(lines) + "\n" 233 | if not archive: 234 | filename = os.path.join(sys.prefix, "../addon.log") 235 | withheld_log_messages.append(text) 236 | else: 237 | filename = os.path.join(archive.path, "addon.log") 238 | with open(filename, "a") as fp: 239 | fp.write(text) 240 | 241 | log.archive_optional = True 242 | 243 | class LogPrinter: 244 | 245 | def __init__(self): 246 | self._cache = "" 247 | 248 | def write(self, s): 249 | sys.stderr.write(s) 250 | self._cache += s 251 | if self._cache.endswith("\n") or len(self._cache.splitlines()) > 1: 252 | log(active_archive, "print: %s" % self._cache.rstrip()) 253 | self._cache = "" 254 | 255 | def flush(self): 256 | sys.stderr.flush() 257 | 258 | 259 | def write_page(archive, url, data): 260 | pages = list(Page.selectBy(url=url, orderBy="-fetched", limit=1)) 261 | if not pages: 262 | raise Exception("No page found with url %r" % url) 263 | pages[0].scrapeData = data 264 | 265 | 266 | def connect(): 267 | print("Running browsing-connector from %s" % __file__, file=sys.stderr) 268 | sys.stdout = LogPrinter() 269 | while True: 270 | m_name = "(unknown)" 271 | try: 272 | message = get_message() 273 | m_name = "%(name)s(%(args)s%(kwargs)s)" % dict( 274 | name=message["name"], 275 | args=", ".join(json.dumps(s) for s in message.get("args", [])), 276 | kwargs=", ".join("%s=%s" % (name, json.dumps(value)) for name, value in message.get("kwargs", {}).items()), 277 | ) 278 | if len(m_name) > 100: 279 | m_name = m_name[:60] + " ... 
" + m_name[-10:] 280 | # print("Message:", m_name, file=sys.stderr) 281 | handler = message_handlers.get(message["name"]) 282 | if not handler: 283 | print("Error: got unexpected message name: %r" % message["name"], file=sys.stderr) 284 | continue 285 | if active_archive is None and not getattr(handler, "archive_optional", False): 286 | raise Exception("Attempted to send message before setting archive: %s()" % m_name) 287 | result = handler(active_archive, *message.get("args", ()), **message.get("kwargs", {})) 288 | send_message({"id": message["id"], "result": result}) 289 | except Exception as e: 290 | tb = traceback.format_exc() 291 | log(active_archive, "Error processing message %s(): %s" % (m_name, e), tb, level='s_err') 292 | send_message({"id": message["id"], "error": str(e), "traceback": tb}) 293 | 294 | 295 | def get_message(): 296 | length = sys.stdin.buffer.read(4) 297 | if len(length) == 0: 298 | sys.exit(0) 299 | length = struct.unpack('@I', length)[0] 300 | message = sys.stdin.buffer.read(length).decode('utf-8') 301 | message = json.loads(message) 302 | return message 303 | 304 | 305 | def encode_message(message): 306 | content = json.dumps(message).encode('utf-8') 307 | length = struct.pack('@I', len(content)) 308 | return length + content 309 | 310 | 311 | def send_message(message): 312 | sys.__stdout__.buffer.write(encode_message(message)) 313 | sys.__stdout__.buffer.flush() 314 | -------------------------------------------------------------------------------- /browsinglab/connlist.py: -------------------------------------------------------------------------------- 1 | """ 2 | This handles keeping all the archives on disk registered 3 | """ 4 | import os 5 | 6 | 7 | DIR_LOCATION = os.path.expanduser("~/.browsinglab") 8 | LOCATIONS = os.path.join(DIR_LOCATION, "locations.txt") 9 | 10 | if not os.path.exists(DIR_LOCATION): 11 | os.makedirs(DIR_LOCATION) 12 | 13 | 14 | def get_locations(): 15 | if not os.path.exists(LOCATIONS): 16 | return [] 17 | with open(LOCATIONS) as fp: 18 | lines = fp.readlines() 19 | locations = [l.strip() for l in lines if l.strip() and not l.strip().startswith("#")] 20 | locations = [l for l in locations if os.path.isdir(l)] 21 | return locations 22 | 23 | 24 | def list_archives(): 25 | result = [] 26 | for l in get_locations(): 27 | title = None 28 | title_path = os.path.join(l, "title.txt") 29 | if os.path.exists(title_path): 30 | with open(title_path) as fp: 31 | title = fp.read().strip() or None 32 | result.append({ 33 | "path": l, 34 | "title": title, 35 | }); 36 | return result 37 | 38 | 39 | def add_location(l): 40 | l = os.path.abspath(l) 41 | if l in get_locations(): 42 | return 43 | with open(LOCATIONS, "a") as fp: 44 | fp.write("%s\n" % l) 45 | -------------------------------------------------------------------------------- /browsinglab/db.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sqlobject import ( 3 | sqlhub, SQLObject, connectionForURI, 4 | StringCol, DateTimeCol, FloatCol, IntCol, ForeignKey, BoolCol, JSONCol, 5 | ) 6 | from .urlcol import URLCol 7 | from . 
import connlist 8 | 9 | conn_init = False 10 | 11 | class Mixin: 12 | 13 | @classmethod 14 | def replaceUuid(cls, uuid, **kw): 15 | existing = list(cls.selectBy(uuid=uuid)) 16 | if existing: 17 | instance = existing[0] 18 | instance.set(**kw) 19 | else: 20 | instance = cls(uuid=uuid, **kw) 21 | return instance 22 | 23 | @classmethod 24 | def getID(cls, uuid, default="no default"): 25 | if uuid is None: 26 | if default == "no default": 27 | raise Exception("%s.getID() id of None" % cls.__name__) 28 | return default 29 | results = list(cls.selectBy(uuid=uuid)) 30 | if not results: 31 | if default == "no default": 32 | raise Exception("No %s found by uuid" % cls.__name__) 33 | return default 34 | return results[0].id 35 | 36 | class Archive: 37 | """ 38 | Represents one archive. It exists in some location on disk 39 | """ 40 | def __init__(self, path): 41 | global conn_init 42 | if conn_init: 43 | raise Exception("Two archives can't yet coexist") 44 | if not os.path.exists(path): 45 | os.makedirs(path) 46 | connlist.add_location(path) 47 | self.path = path 48 | self.sqlite_path = os.path.join(path, 'history.sqlite') 49 | exists = os.path.exists(self.sqlite_path) 50 | conn_init = True 51 | import sys 52 | print("Location", 'sqlite:%s/history.sqlite' % self.path, exists) 53 | sqlhub.processConnection = connectionForURI('sqlite:%s/history.sqlite' % self.path) 54 | print("Creating tables") 55 | create_tables() 56 | 57 | def __repr__(self): 58 | return "" % (self.path,) 59 | 60 | @property 61 | def title(self): 62 | title_path = os.path.join(self.path, "title.txt") 63 | if os.path.exists(title_path): 64 | with open(title_path) as fp: 65 | return fp.read().strip() or None 66 | return None 67 | 68 | @title.setter 69 | def title(self, value): 70 | title_path = os.path.join(self.path, "title.txt") 71 | if value: 72 | with open(title_path, "w") as fp: 73 | fp.write(value) 74 | elif os.path.exists(title_path): 75 | os.unlink(title_path) 76 | 77 | def close(self): 78 | global conn_init 79 | conn_init = False 80 | if sqlhub.processConnection: 81 | sqlhub.processConnection.close() 82 | sqlhub.processConnection = None 83 | self.path = None 84 | self.sqlite_path = None 85 | 86 | 87 | class Browser(SQLObject, Mixin): 88 | uuid = StringCol() 89 | created = DateTimeCol(default=DateTimeCol.now) 90 | userAgent = StringCol() 91 | devicePixelRatio = FloatCol() 92 | connected = BoolCol(default=False, notNone=True) 93 | 94 | 95 | class BrowserSession(SQLObject, Mixin): 96 | uuid = StringCol() 97 | browser = ForeignKey('Browser') 98 | startTime = IntCol(default=None) 99 | endTime = IntCol(default=None) 100 | timezoneOffset = IntCol(default=None) 101 | 102 | 103 | class Page(SQLObject, Mixin): 104 | uuid = StringCol() 105 | url = URLCol(notNone=True) 106 | fetched = DateTimeCol(default=DateTimeCol.now) 107 | activity = ForeignKey('Activity') 108 | timeToFetch = IntCol() 109 | redirectUrl = URLCol() 110 | redirectOk = BoolCol(default=False, notNone=True) 111 | scrapeData = JSONCol() 112 | 113 | @classmethod 114 | def urlExists(cls, url): 115 | return bool(list(cls.selectBy(url=url))) 116 | 117 | 118 | class Activity(SQLObject, Mixin): 119 | uuid = StringCol() 120 | browser = ForeignKey('Browser') 121 | session = ForeignKey('BrowserSession') 122 | url = URLCol(notNone=True) 123 | title = StringCol() 124 | ogTitle = StringCol() 125 | loadTime = IntCol() 126 | unloadTime = IntCol() 127 | transitionType = StringCol() 128 | sourceClickText = StringCol() 129 | sourceClickHref = StringCol() # FIXME: URL 130 | clientRedirect = 
BoolCol(default=False, notNone=True) 131 | serverRedirect = BoolCol(default=False, notNone=True) 132 | forwardBack = BoolCol(default=False, notNone=True) 133 | fromAddressBar = BoolCol(default=False, notNone=True) 134 | source = ForeignKey('Activity') 135 | browserReferringVisitId = StringCol(default=None) 136 | initialLoad = ForeignKey('Activity') 137 | newTab = BoolCol() # was opened in new tab? 138 | activeCount = IntCol() # Count of times it was "activated" 139 | activeTime = IntCol() # Millisecond active time 140 | closedReason = StringCol() 141 | method = StringCol() # HTTP request method 142 | statusCode = IntCol() # HTTP status code 143 | contentType = StringCol() # HTTP Content-Type 144 | hasSetCookie = BoolCol() # has Set-Cookie response header 145 | hasCookie = BoolCol() # has Cookie request header 146 | copyEvents = JSONCol() 147 | formControlInteraction = IntCol() # count of form interactions 148 | formTextInteraction = IntCol() # count of form interactions 149 | isHashChange = BoolCol() 150 | maxScroll = IntCol() # pixel Y location 151 | documentHeight = IntCol() # pixel height 152 | hashPointsToElement = BoolCol() 153 | zoomLevel = FloatCol() # 1.0 means 100% zoom 154 | canonicalUrl = URLCol() # URL 155 | mainFeedUrl = URLCol() # URL 156 | allFeeds = JSONCol() 157 | 158 | 159 | class ActivityLink(SQLObject): 160 | activity = ForeignKey('Activity') 161 | url = URLCol(notNone=True) 162 | text = StringCol(notNone=True) 163 | rel = StringCol() 164 | target = StringCol() 165 | elementId = StringCol() 166 | 167 | 168 | def create_tables(): 169 | classes = [Browser, BrowserSession, Activity, Page, ActivityLink] 170 | for cls in classes: 171 | cls.createTable(ifNotExists=True) 172 | -------------------------------------------------------------------------------- /browsinglab/subenvvars.py: -------------------------------------------------------------------------------- 1 | """Simple script for substituting environmental variables in a template-ish file""" 2 | 3 | import re 4 | import sys 5 | import os 6 | import json 7 | 8 | env_re = re.compile(r'process\.env\.([a-zA-Z0-9_]+)') 9 | 10 | 11 | def matcher(m): 12 | value = os.environ.get(m.group(1)) or "" 13 | return json.dumps(value) 14 | 15 | 16 | input = sys.stdin.read() 17 | output = env_re.sub(matcher, input) 18 | 19 | sys.stdout.write(output) 20 | -------------------------------------------------------------------------------- /browsinglab/urlcol.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | from sqlobject.col import StringValidator, SOStringCol, StringCol 3 | 4 | __all__ = ["URLCol"] 5 | 6 | class URLValidator(StringValidator): 7 | 8 | def to_python(self, value, state): 9 | if value is None: 10 | return None 11 | return URL(value) 12 | 13 | def from_python(self, value, state): 14 | if value is None: 15 | return None 16 | return str(value) 17 | 18 | class SOURLCol(SOStringCol): 19 | 20 | def createValidators(self): 21 | return [URLValidator(name=self.name)] + \ 22 | super(SOURLCol, self).createValidators() 23 | 24 | class URLCol(StringCol): 25 | baseClass = SOURLCol 26 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | pylint 2 | flake8 3 | prospector 4 | -------------------------------------------------------------------------------- /docs/activity-schema.md: 
-------------------------------------------------------------------------------- 1 | ## Activity Schema 2 | 3 | This describes the schema of browsing activity and pages. The schema is intended to be encoded in JSON, but could also end up in a database. 4 | 5 | Note: everything marked TODO needs to be added, or maybe adjusted. 6 | 7 | ### Data Types 8 | 9 | **Date / times**: these are represented as milliseconds from the epoch, i.e., the same as what `Date.now()` returns. 10 | 11 | **Unknown values**: as far as possible we use `null` as "unknown" values or sometimes "not applicable". Information that can affirmatively be known not to exist should use a different value. 12 | 13 | **IDs**: we try to use UUIDs as IDs as often as possible. There may be external IDs (such as history item IDs), and in those cases we use those as secondary IDs. 14 | 15 | ### Browser 16 | 17 | Because people use multiple browsers and profiles, we typically map activity to a specific browser: 18 | 19 | `id`: a UUID for the browser 20 | 21 | `userAgent`: the User Agent string for the browser 22 | 23 | `devicePixelRatio`: the base value of `window.devicePixelRatio` (typically 1 for a normal screen, 2 for a High-DPI/Retina display) 24 | 25 | `created`: when we first saw this browser 26 | 27 | `testing`: if true, then this browser profile was created specifically for testing. Hopefully these browsers shouldn't show up in your normal data! 28 | 29 | `autofetch`: if true, then this browser profile was created or cloned specifically to autofetch pages. It probably has valid cookies/etc, but its behavior isn't "real". Typically we keep these browsers from producing activity, but they *do* create pages (on purpose!) (TODO: need to set `$AUTOFETCH` while building for autofetch; also need to fix autofetch) 30 | 31 | #### Session 32 | 33 | Browsers also have sessions: 34 | 35 | `id`: a UUID for this session (changes each time the browser is restarted) 36 | 37 | `startTime`: timestamp when it was started 38 | 39 | `endTime`: timestamp when it was closed (often null, because we can't always catch this; may be derived from last saved visit once a new session starts). (TODO: nothing sets this) 40 | 41 | `timezoneOffset`: the value of `(new Date()).getTimezoneOffset()`, which is minutes-from-UTC. 42 | 43 | #### Derived: 44 | 45 | Coming from history: 46 | 47 | `oldestHistory`: the time of the oldest history item we've seen 48 | 49 | `newestHistory`: the time of the newest history item we've seen 50 | 51 | ### Activity 52 | 53 | There can be two sources of activity: activity created retroactively from browser history, and activity created by the extension. 54 | 55 | Browser history typically uses two concepts: the [HistoryItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/HistoryItem) and the [VisitItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/VisitItem). In our model we use the VisitItem, augment it with some information from HistoryItem, and there is no one-to-one equivalent of HistoryItem. 56 | 57 | `id`: a UUID representing this visit 58 | 59 | `browserId`: the browser this is associated with 60 | 61 | `sessionId`: the browser session (changed each time the browser is restarted) 62 | 63 | `url`: this is the full URL, including the hash. 64 | 65 | `title`: the title of the page, null if unknown, `""` if there is no title. (TODO: make sure it's "") 66 | 67 | `loadTime`: when the page was loaded 68 | 69 | `unloadTime`: when the page was unloaded. 
This will be null when unknown (browser history does not keep good track of this).
70 | 
71 | `browserHistoryId`: the ID of the associated [HistoryItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/HistoryItem). This won't be unique at all, as many visits are associated with the same HistoryItem.
72 | 
73 | `browserVisitId`: the ID of the associated [VisitItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/VisitItem). This will probably be unique, if it is set.
74 | 
75 | `sourceId`: the id of the visit that led to this visit. This may come from the VisitItem.referringVisitId (but won't match that ID, as we don't use the browserVisitId as our primary key).
76 | 
77 | `browserReferringVisitId`: from VisitItem.referringVisitId, this should point to another record's `browserVisitId`. Note we try to keep `sourceId` updated, and it's better, but this is kept just in case we need to fix things up later.
78 | 
79 | `sourceClickHref`: the URL the user clicked on that led to this page, as from `a.href`. Null if unknown or no link appeared to be the source.
80 | 
81 | `sourceClickText`: if a click led to this page, the `a.textContent` of that link. Null if unknown or no link appeared to be the source. May be `""`.
82 | 
83 | `transition`: a string from [TransitionType](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/TransitionType): `link`, `typed`, `auto_bookmark`, `auto_subframe` (unlikely, as we don't track frames), `manual_subframe` (also unlikely), `generated`, `auto_toplevel`, `form_submit`, `reload`, `keyword`, `keyword_generated`.
84 | 
85 | `client_redirect`: a boolean (or null if unknown) from [TransitionQualifier](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/webNavigation/transitionQualifier)
86 | 
87 | `server_redirect`: a boolean (or null if unknown) from TransitionQualifier
88 | 
89 | `forward_back`: a boolean (or null if unknown) from TransitionQualifier
90 | 
91 | `from_address_bar`: a boolean (or null if unknown) from TransitionQualifier
92 | 
93 | `initialId`: the id of the activity that initiated this. For instance, if you go to `page_1`, click on a link to get to `page_2`, then click on a table of contents to get to `page_2#section2`, then the last item would have a `sourceId` pointing to `page_1`, but an `initialId` pointing to `page_2`. You have to sort on `loadTime` to see the exact order of hash changes.
94 | 
95 | `newTab`: if this page was opened in a new tab. Typically `sourceId` should be set in this case. It will be null if unknown (for instance VisitItem doesn't record this).
96 | 
97 | `activeCount`: the number of times this page was made active, for more than a second. If you open a tab in the background, then close it without ever looking at it, then this should be 0. If you interact normally and don't change tabs it would be 1. Higher numbers mean it was revisited several times.
98 | 
99 | `activeTime`: time in milliseconds that the page was active. Note that if a window goes into the background we keep counting, so this might not always be correct. Like with `activeCount`, we ignore when a tab is active for less than a second, assuming that it means the tab was passed over on the way to another tab. If the user goes idle (no keypresses or mouse movement) for 30 seconds, then we stop incrementing the time until there is activity again.
100 | 
101 | `unloadReason`: a string indicating why the page was unloaded: `tabClose`, `navigation`. Null if unknown.
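The timing fields above (`loadTime`, `activeCount`, `activeTime`) are usually the starting point for analysis. As a minimal sketch (assuming an archive produced by this tool in `data/` and the `browsinglab` package from this repository; adjust the path as needed), total active time per URL could be tallied like this:

```python
# Sketch: sum the activeTime field (milliseconds) per URL across all activity.
from collections import defaultdict

from browsinglab.db import Archive, Activity

Archive("data")                      # opens data/history.sqlite
totals = defaultdict(int)
for activity in Activity.select():
    if activity.activeTime:
        totals[str(activity.url)] += activity.activeTime

for url, ms in sorted(totals.items(), key=lambda kv: -kv[1])[:10]:
    print("%8.1fs  %s" % (ms / 1000, url))
```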
102 | 
103 | `hashPointsToElement`: if the URL has a hash (e.g., `page.html#section1`), then does some element with `id="section1"` exist?
104 | 
105 | `isHashChange`: if the new activity was an in-page change of the hash/fragment (no actual page loading), then this is true. Null if unknown.
106 | 
107 | `method`: the HTTP method that loaded the page (usually GET, of course). We do not track the POST destination if it results in an immediate redirect. (TODO: confirm POST behavior)
108 | 
109 | `statusCode`: the integer status code of the response. E.g., 200, 404.
110 | 
111 | `contentType`: the Content-Type of the response. Note most URLs are *displayed* as a DOM page of some sort, but the underlying resource might not be text/html. In a case like `text/html; charset="UTF-8"` we remove the charset (and anything after `;`).
112 | 
113 | `hasSetCookie`: the response contained a `Set-Cookie` header.
114 | 
115 | `hasCookie`: the request contained a `Cookie` header.
116 | 
117 | `maxScroll`: the greatest pixel location that this document was scrolled to. Null if unknown, 0 if not scrolled.
118 | 
119 | `documentHeight`: the pixel height of the document. Null if unknown or if never scrolled.
120 | 
121 | `copyEvents`: this is a JSON list that represents all the clipboard copies taken from the page. Each event looks like: `{text, startLocation, endLocation, time}`, where start and end location are CSS selectors (`endLocation` may be omitted if it is the same as `startLocation`).
122 | 
123 | `formControlInteraction`: a count of the number of times a non-text form field was changed. Will be null if we weren't watching.
124 | 
125 | `formTextInteraction`: a count of the number of times a text form field was changed. Will be null if we weren't watching. This is incremented when the `change` event occurs, so typically you have to unfocus the text field for this to get incremented.
126 | 
127 | `zoomLevel`: the zoom level, if we can calculate it. Typically 1, null if we didn't determine it. 1.1 means, for example, a 110% zoom.
128 | 
129 | `canonicalUrl`: if the page has `<link rel="canonical">`, this gives the URL it points to.
130 | 
131 | `mainFeedUrl`: if the page has an RSS (or similar) feed, what we think is the main feed URL.
132 | 
133 | `allFeeds`: all the feeds found in the page. This is a list of `[{href, title, type}]`.
134 | 
135 | `linkInformation`: a list of links found in the document. A list that looks like `[{url, text, rel, target, elementId}]` where `rel`, `target`, and `elementId` are optional (depending on the presence of those attributes), and `url` is the full URL, or if it's a page-internal link then it looks like `"#anchor"`.
136 | 
137 | #### Derived:
138 | 
139 | This information can be calculated from the above information... (All TODO)
140 | 
141 | `domain`: the domain, without port, and without leading `www.` or `wwwN.`.
142 | 
143 | `canonicalUrl`: the URL with UTM and other cruft removed, with query string sorted, and if `containsHash` is true then with the hash removed.
144 | 
145 | `urlPattern`: a rough pattern of the URL, based on `canonicalUrl`. This helps distinguish homepages from article pages on the same site, for instance. (This heuristic will need some ongoing work.)
146 | 
147 | `query`: if this was a search result, what was the query string associated?
148 | 
149 | ### Pages
150 | 
151 | These are full dumps of a page's DOM. They may be associated with a visit, or loaded retroactively to fill in past history.
Typically the system does not pull in repeated dumps of pages when they are re-visited (though we may try to do that in the future based on some heuristics).
152 | 
153 | `id`: a UUID for this *fetch* of a page
154 | 
155 | `url`: the URL fetched
156 | 
157 | `loadTime`: the timestamp when we serialized this page (TODO: rename)
158 | 
159 | `serializeVersion`: a version indicating the serializer. This gets bumped sometimes, so old pages can be re-fetched or updated in place. ([TODO](https://github.com/ianb/personal-history-archive/issues/5))
160 | 
161 | `autofetched`: true if this was created by an autofetch, as opposed to collected while browsing (TODO)
162 | 
163 | `activityId`: if this was fetched during browsing, and associated with a specific activity, then the ID of that activity.
164 | 
165 | `redirectUrl`: if fetching the URL redirected to some other URL, then what URL? This is the URL that is actually displayed in the URL bar when we serialized the page. Will be null if this matches `url`.
166 | 
167 | `redirectOk`: if `redirectUrl` exists, but someone decided the redirect is OK, then this will be true. These can be used to review autofetch redirects, and remove pages that were redirected to login pages.
168 | 
169 | `documentSize.width` and `documentSize.height`: height and width of the entire document (not just the visible portion).
170 | 
171 | `docTitle`: the title as given by `document.title`
172 | 
173 | `passwordFields`: a list of password fields found
174 | 
175 | `passwordFields[i].name`: the name attribute of a password field
176 | 
177 | `passwordFields[i].id`: the id of a password field
178 | 
179 | `passwordFields[i].hasValue`: true if the field has something entered (e.g., by a password manager)
180 | 
181 | `passwordFields[i].isHidden`: if the field appears not to be visible
182 | 
183 | `openGraph`: attributes from Open Graph (i.e., `og:` metadata). From the list: title, type, url, image, audio, description, determiner, locale, site\_name, video, image:secure\_url, image:type, image:width, image:height, video:secure\_url, video:type, video:width, video:height, audio:secure\_url, audio:type, article:published\_time, article:modified\_time, article:expiration\_time, article:author, article:section, article:tag, book:author, book:isbn, book:release\_date, book:tag, profile:first\_name, profile:last\_name, profile:username, profile:gender
184 | 
185 | `twitterCard`: attributes from Twitter Cards. From the list: card, site, title, description, image, player, player:width, player:height, player:stream, player:stream:content_type
186 | 
187 | `images`: a list of images in the page. Excludes small images (smaller than 250x200).
188 | 
189 | `images[i].url`: URL of image
190 | 
191 | `images[i].dimensions`: `{x: width, y: height}` of the image, as displayed in the document
192 | 
193 | `images[i].title`: the `title` attribute
194 | 
195 | `images[i].alt`: the `alt` attribute
196 | 
197 | `images[i].isReadable`: does the image appear in the Readability version of the document?
198 | 
199 | `readable`: information extracted with the [Readability](https://github.com/mozilla/readability) library. Null if this didn't appear to be an article or otherwise parseable.
200 | 
201 | `readable.title`: the title as determined
202 | 
203 | `readable.content`: an HTML string with the content (not processed like other HTML content)
204 | 
205 | `readable.textContent`: a text-only version of the content
206 | 
207 | `readable.length`: the length of the content, in characters
208 | 
209 | `readable.excerpt`: an excerpt
210 | 
211 | `readable.byline`: author metadata
212 | 
213 | `readable.dir`: content direction
214 | 
215 | #### DOM
216 | 
217 | These page records give the actual frozen page part of the fetched pages:
218 | 
219 | `body`: a string of everything *inside* `<body>`.
220 | 
221 | `head`: a string of everything *inside* `<head>`.
222 | 
223 | `bodyAttrs`: the attributes in the body tag, like `[["class", "foobar"], ...]`
224 | 
225 | `headAttrs`: same for head.
226 | 
227 | `htmlAttrs`: same for `<html>`.
228 | 
229 | `resources`: links to embedded resources in the page are replaced with UUIDs. `resources` is `{id: description}` for all of these resources.
230 | 
231 | `resources[id].url`: the fully resolved URL that this points to
232 | 
233 | `resources[id].tag`: if the URL is embedded in a tag, the name of the tag, like `"LINK"`.
234 | 
235 | `resources[id].elId`: if the containing element has an id attribute, then it's here
236 | 
237 | `resources[id].selector`: a selector pointing to the element.
238 | 
239 | `resources[id].attr`: the attribute name where the URL was found
240 | 
241 | `resources[id].rel`: in the case of `<link>`, the value of `rel`.
242 | 
243 | `screenshots`: any screenshots taken. Each screenshot has a name. Specifically `screenshots.visible` (what shows in the browser window, "above the fold"), and `screenshots.fullPage` (the entire document).
244 | 
245 | `screenshots.type.captureType`: how it was captured (typically matches `type`)
246 | 
247 | `screenshots.type.originalDimensions`: a box of `{top, bottom, left, right}` showing what was captured
248 | 
249 | `screenshots.type.size`: a value of `{height, width}` of what it was sized to (screenshots are all sized down)
250 | 
251 | `screenshots.type.image`: a `data:` URL of the image
252 | 
253 | #### DOM Annotations
254 | 
255 | The DOM is annotated with some attributes to help understand the DOM without rendering it:
256 | 
257 | `data-width` and `data-height`: these are added to all images
258 | 
259 | `data-hidden="true"`: this is added to any element that doesn't appear to be visible (e.g., `display: none`).
260 | 
261 | `data-display="block"`: or some other value, if `.style.display` (or calculated) is not what you'd expect given the element. E.g., if a normally block-level element has a style making it display as `inline-block`, then this attribute would be added
262 | 
263 | `value`: this is set to the *actual* form value, not the one in the original HTML.
264 | 
265 | ### Feeds
266 | 
267 | In addition to the feed-related metadata captured as Activity, we also fetch the actual feeds alongside the page. By doing this we can match up timely feed information against a page.
268 | 
269 | `feeds`: this is a list of all discovered feeds, listed in the order they appeared in the page.
270 | 
271 | `feeds[i].url`: the URL of the feed (where it was fetched from)
272 | 
273 | `feeds[i].redirectUrl`: if the feed redirected, then this is the destination URL
274 | 
275 | `feeds[i].body`: the text body of the feed.
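As a minimal sketch of how these captured feeds might be consumed (here `page_record` is assumed to be the stored JSON for one fetched page, e.g. the `scrapeData` of a `Page` row; the field names follow the list above), the entries in each feed body can be counted with the standard library alone:

```python
# Sketch: count entries in the feeds captured alongside one page.
import xml.etree.ElementTree as ET

for feed in page_record.get("feeds", []):
    if feed.get("error"):      # failed fetches carry an error message (see below)
        print("failed:", feed["url"], feed["error"])
        continue
    root = ET.fromstring(feed["body"])
    # Handles both RSS (<item>) and Atom (<entry>) documents.
    entries = root.findall(".//item") + root.findall(".//{http://www.w3.org/2005/Atom}entry")
    print(feed["url"], "->", len(entries), "entries")
```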
276 | 277 | `feeds[i].contentType`: the HTTP Content-Type given 278 | 279 | `feeds[i].lastModified`: the timestamp of the HTTP Last-Modified header 280 | 281 | `feeds[i].fetchStart`: the timestamp when we started fetching the feed 282 | 283 | `feeds[i].fetchTime`: the number of milliseconds it took to fetch the feed 284 | 285 | `feeds[i].error`: if the feed failed to fetch, this text error message describes why. Other error information: 286 | 287 | `feeds[i].statusCode`: if the feed failed to fetch because of an HTTP error, this gives the status code 288 | 289 | `feeds[i].status`: and this gives the status text 290 | 291 | `feeds[i].errorStack`: if there was an exception fetching the feed, this gives the traceback. 292 | 293 | #### Errored pages 294 | 295 | `url`: the URL that was attempted to be fetched (we don't store historical failures, so the URL is the primary key). 296 | 297 | `attempted`: a timestamp when the error occurred. 298 | 299 | `errorMessage`: the error message. 300 | -------------------------------------------------------------------------------- /docs/future-design.md: -------------------------------------------------------------------------------- 1 | # Future Design 2 | 3 | PHA has turned into a conglomeration of a bunch of use cases and techniques, and it's become downright confusing. 4 | 5 | ## Issues 6 | 7 | * The name is long 8 | * It's unclear where control happens: what makes things happen? 9 | * There's no clear interface 10 | * Mixed patterns 11 | * Build process is all wonky and weird 12 | 13 | ## Name 14 | 15 | What would be a good name for this? The essential aspects: 16 | 17 | 1. It collects browsing information 18 | 2. It makes that information easy to work with 19 | 3. It finds higher-level information about the pages 20 | 4. It can drive the browser 21 | 22 | Obviously "browsing" shows up a lot. Other phrases: 23 | 24 | * Navigation 25 | * Web 26 | * HTML / pages 27 | * Session 28 | * Dataset 29 | 30 | Candidate names: 31 | 32 | * Browser-dataset 33 | * Personal-web-dataset 34 | * Webnav-dataset 35 | * Webnav-collector 36 | * Webnav-archiver 37 | * Browser-archiver 38 | * barchive 39 | * firefox-dataset 40 | * browser-data 41 | * webnav-data 42 | * browserdump 43 | * navdump 44 | * pagedump 45 | * **browserdump** 46 | * Browser Science (also used in 2013, site is still up but inactive) 47 | * Browser Lab (was used in 2013) 48 | * Navlab 49 | * Browsing Lab 50 | 51 | Some dataset concepts ([from](https://medium.com/datadriveninvestor/the-50-best-public-datasets-for-machine-learning-d80e9f030279)): 52 | 53 | * A dataset should not be messy, because you do not want to spend a lot of time cleaning data. 54 | * A dataset should not have too many rows or columns, so it is easy to work with. 55 | * The cleaner the data, the better — cleaning a large data set can be very time consuming. 56 | * There should be an interesting question, which in turn can be answered with data. 57 | 58 | ## Query interface 59 | 60 | Right now we have: 61 | 62 | 1. `Archive`: this represents one set of data, a run, dev-vs-live, test-vs-dev, etc. Represents a database *and* a set of JSON files. 63 | 2. `Browser`: a browser *profile* 64 | 3. `BrowserSession`: a particular run of a browser. Belongs to a Browser. 65 | 4. `Activity`: a browsing activity, typically a navigation. Can include in-page navigations, like changing the hash of a page. 
Has a relation to [browser.tabs.onUpdated](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/tabs/onUpdated), though not a 1:1 mapping (not every onUpdated event turns into an activity). Belongs to a BrowserSession. 66 | 5. `ActivityLink`: links found in a page 67 | 6. `Page`: a page, with a URL, that belongs to a session (FIXME: doesn't currently map to a session), and has a time in place. It's more like a "page load". Belongs to an activity. 68 | 69 | What are we missing? 70 | 71 | 1. A "job" of some sort, such as a fetching of a list of stuff. 72 | 2. Combining found history with pages and activity. 73 | 3. The HTTP response that led to a page. 74 | 4. Filling in data like Common Crawl or Wikipedia data. 75 | 5. Using CSP to speed up activity (but also noting that it happened) 76 | 6. Any use of [Containers](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/contextualIdentities). Probably cookieStorageId needs to be added to everything, or maybe just to Browser. 77 | 7. Annotations on any of this data. (These may be very ad hoc and hard to implement, but maybe a naive approach would be good enough?) 78 | 79 | ## Interface 80 | 81 | There's a couple ways to start this: 82 | 83 | 1. Add it explicitly to an existing profile as a Temporary Installation 84 | 2. Have it run with `web-ext` and a scratch profile 85 | 3. Have it run with `web-ext` and an existing profile 86 | 4. Have it run with `web-ext` and a long-lived dev-only profile 87 | 5. Have it run via Selenium 88 | 89 | It uses multiple Native Connect names for handling some of these cases. I think that's good *for testing*, where we want good isolation between any old code, running code, production code, and the filesystem. Otherwise I think the archive location should be coded into the add-on storage. 90 | 91 | # Proposal 92 | 93 | 1. Make this an installable package. Lead with the Python side. Will include node_modules/etc as well. 94 | 2. The package includes an XPI, that you install in your browser (usually, some use cases might involve web-ext) 95 | 3. There's a script that you can use on an Archive to trigger activity (i.e., drive the browser) 96 | 4. Use an ORM, maybe SQLObject? 97 | 5. History will get extracted, but only informationally. You'll have to use the trigger to revisit history in some fashion. 98 | 6. We'll need a database view of the live browser connections. This both registers those connections, and is a queue to allow incoming connections. 99 | 100 | ## User experience: 101 | 102 | 1. Install the application (probably start with pip install + npm install, or a downloadable installation script) 103 | 2. Put the XPI in some known location 104 | 3. Install the special files for Native Connect 105 | 4. Maybe include something like `blab http` to open a local server that gives instructions and a link to the XPI 106 | 5. With the XPI installed, there's a button that controls the add-on 107 | 6. You can turn it on and off, with different icons 108 | 7. You can enable it just for some containers 109 | 8. There are instructions about using browser profiles and `about:profiles` 110 | 9. Create a script launcher, `blab browse --Profile` etc? 111 | 10. Create a central place to list known archives, in `~/.browserdump/` - just to make it easy to list 112 | 11. Archives should have names (user assignable) 113 | 12. The browser interface should be allowed to connect to different archives 114 | 13. You should be able to "remember" recording decisions. 
But if you don't, then on restart probably don't reconnect. 115 | 14. Offer a quick summary of what's happened in the archive. 116 | 15. Give a default archive path of something like `$HOME/browserdump-archive` 117 | 16. Connect browserdump script to a running browser with `blab connect` 118 | 17. Offer simple commands, like opening a list of pages. 119 | 18. Something with Jupyter? 120 | -------------------------------------------------------------------------------- /docs/screencast-fetcher.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ianb/personal-history-archive/68168eac7876a8827ec566fb4882a7ab5804d87b/docs/screencast-fetcher.gif -------------------------------------------------------------------------------- /extension/.eslintrc.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | module.exports = { 4 | "env": { 5 | "webextensions": true 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /extension/backgroundOnMessage.js: -------------------------------------------------------------------------------- 1 | /* globals log */ 2 | 3 | this.backgroundOnMessage = (function() { 4 | let exports = {}; 5 | 6 | const handlers = {}; 7 | 8 | browser.runtime.onMessage.addListener((message, sender) => { 9 | let type = message.type; 10 | message.senderTabId = sender.tab && sender.tab.id; 11 | message.senderUrl = sender.url; 12 | message.senderFrameId = sender.frameId; 13 | if (!handlers[type]) { 14 | log.error("Got unexpected message type:", type, "from", message); 15 | return Promise.reject(new Error(`Unexpected message type: ${type}`)); 16 | } 17 | try { 18 | let result = handlers[type](message); 19 | return Promise.resolve(result); 20 | } catch (error) { 21 | return Promise.reject(error); 22 | } 23 | }); 24 | 25 | exports.register = function(type, handler) { 26 | if (handlers[type]) { 27 | throw new Error(`Attempt to reregister message type ${type}`); 28 | } 29 | handlers[type] = handler; 30 | }; 31 | 32 | exports.registerListener = function(type, handler) { 33 | let existing = handlers[type]; 34 | if (!existing) { 35 | handlers[type] = handler; 36 | } else if (Array.isArray(existing)) { 37 | existing.push(handler); 38 | } else { 39 | handlers[type] = [existing, handler]; 40 | } 41 | }; 42 | 43 | exports.unregister = function(type, handler) { 44 | let existing = handlers[type]; 45 | if (!existing) { 46 | throw new Error(`Attempt to unregister handler that has no handlers: ${type}`); 47 | } 48 | if (Array.isArray(existing)) { 49 | if (!existing.includes(handler)) { 50 | throw new Error(`Attempt to unregister handler that hasn't been registered: ${type}`); 51 | } 52 | handlers[type] = existing.filter(x => x !== handler); 53 | if (handlers[type].length === 1) { 54 | handlers[type] = handlers[type][0]; 55 | } 56 | } else { 57 | if (existing !== handler) { 58 | throw new Error(`Attempt to unregister handler that hasn't been registered: ${type}`); 59 | } 60 | delete handlers[type]; 61 | } 62 | }; 63 | 64 | return exports; 65 | })(); 66 | -------------------------------------------------------------------------------- /extension/browserId.js: -------------------------------------------------------------------------------- 1 | /* globals util, log, communication, catcher */ 2 | 3 | this.browserId = null; 4 | this.sessionId = null; 5 | this.browserIdPromise = catcher.watchPromise(browser.storage.local.get(["browserId"]).then(async (result)
=> { 6 | if (!result || !result.browserId) { 7 | browserId = util.makeUuid(); 8 | await browser.storage.local.set({browserId}).catch((error) => { 9 | log.error("Error setting browserId", error); 10 | }); 11 | } else { 12 | browserId = result.browserId; 13 | } 14 | sessionId = util.makeUuid(); 15 | })); 16 | -------------------------------------------------------------------------------- /extension/buildSettings.js.tmpl: -------------------------------------------------------------------------------- 1 | this.buildSettings = (function() { 2 | function toBoolean(n, defaultValue) { 3 | if (n !== 0 && !n) { 4 | return defaultValue; 5 | } 6 | if (typeof n === "string") { 7 | n = n.toLowerCase(); 8 | } 9 | if (n === "false" || n === "0" || n === 0 || n === "off" || n === "no") { 10 | return false; 11 | } 12 | return true; 13 | } 14 | 15 | return { 16 | nativeScriptName: process.env.NATIVE_SCRIPT || "browsinglab.connector", 17 | logLevel: process.env.LOG_LEVEL || "info", 18 | serverLogLevel: process.env.SERVER_LOG_LEVEL || "warn", 19 | setFreezeMarker: toBoolean(process.env.SET_FREEZE_MARKER, false), 20 | notifyError: true, 21 | testingBrowser: process.env.TESTING_BROWSER || false, 22 | cspRestrict: toBoolean(process.env.CSP_RESTRICT, false), 23 | defaultArchiveLocation: process.env.DEFAULT_ARCHIVE_LOCATION || null, 24 | updateServerPeriod: 10000, // 60 * 60 * 1000, // 1 hour 25 | temporaryArchiveLocation: process.env.TEMPORARY_ARCHIVE_LOCATION || "__prefix__/../dev-data", 26 | temporaryUpdateServerPeriod: 10000, // 10 seconds 27 | }; 28 | })(); 29 | -------------------------------------------------------------------------------- /extension/catcher.js: -------------------------------------------------------------------------------- 1 | /* globals log, buildSettings, util, backgroundOnMessage */ 2 | 3 | this.catcher = (function() { 4 | let exports = {}; 5 | 6 | exports.watchFunction = function(func) { 7 | return function(...args) { 8 | try { 9 | let result = func(...args); 10 | if (result && "then" in result && result.then) { 11 | return exports.watchPromise(result); 12 | } 13 | return result; 14 | } catch (error) { 15 | report(error); 16 | throw error; 17 | } 18 | }; 19 | }; 20 | 21 | exports.watchPromise = function(promise) { 22 | return promise.catch((error) => { 23 | report(error); 24 | throw error; 25 | }); 26 | }; 27 | 28 | const report = exports.report = function(error) { 29 | log.error("Error:", error); 30 | if (buildSettings.notifyError) { 31 | if (typeof backgroundOnMessage === "undefined") { 32 | // Then we are in a worker context 33 | browser.runtime.sendMessage({type: "reportError", error: String(error)}); 34 | } else { 35 | exports.notifyError(error); 36 | } 37 | } 38 | }; 39 | 40 | exports.notifyError = function(error) { 41 | error = String(error); 42 | let id = util.makeUuid(); 43 | browser.notifications.create(id, { 44 | type: "basic", 45 | title: "Browsing Lab Error", 46 | message: error 47 | }); 48 | }; 49 | 50 | if (typeof backgroundOnMessage !== "undefined") { 51 | backgroundOnMessage.register("reportError", (message) => { 52 | exports.notifyError(message.error); 53 | }); 54 | } 55 | 56 | return exports; 57 | })(); 58 | -------------------------------------------------------------------------------- /extension/communication.js: -------------------------------------------------------------------------------- 1 | /* globals buildSettings, log, browserId, sessionId */ 2 | 3 | /** Routines to communicate with the backend via native connection */ 4 | this.communication = 
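// The module below is a small RPC layer over the native-messaging port: portCall()
// posts {name, args, kwargs, id} to the Python connector and stores a promise in
// `responders` keyed by id; the matching reply ({id, result} or {id, error, traceback})
// resolves or rejects that promise. Calls made before a session id exists or before an
// archive is selected are queued in `callCache` and replayed once set_active_archive()
// succeeds.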
(function() { 5 | let exports = {}; 6 | let port = browser.runtime.connectNative(buildSettings.nativeScriptName); 7 | let responderId = 1; 8 | let responders = new Map(); 9 | let hasActiveArchive = false; 10 | let callCache = []; 11 | const CALL_CACHE_LIMIT = 10; 12 | 13 | function portCall(name, args, kwargs, withoutArchive = false) { 14 | if (!sessionId) { 15 | // Stuff really hasn't initialized yet! 16 | log.warn(`Calling ${name}() before sessionId is set`); 17 | return new Promise((resolve, reject) => { 18 | callCache.push({name, args, kwargs, resolve, reject}); 19 | }); 20 | } 21 | if (!withoutArchive && !hasActiveArchive) { 22 | if (callCache.length > CALL_CACHE_LIMIT) { 23 | throw new Error("Attempted to send too many messages before setting archive"); 24 | } 25 | log.info(`Deferring message: ${name}()`); 26 | return new Promise((resolve, reject) => { 27 | callCache.push({name, args, kwargs, resolve, reject}); 28 | }); 29 | } 30 | args = args || []; 31 | kwargs = kwargs || {}; 32 | let id = responderId++; 33 | for (let i = 0; i < args.length; i++) { 34 | if (args[i] && typeof args[i] === "object" && "toJSON" in args[i]) { 35 | args[i] = args[i].toJSON(); 36 | } 37 | } 38 | for (let name in (kwargs || {})) { 39 | if (kwargs[name] && typeof kwargs[name] === "object" && "toJSON" in kwargs[name]) { 40 | kwargs[name] = kwargs[name].toJSON(); 41 | } 42 | } 43 | port.postMessage({name, args, kwargs, id}); 44 | return new Promise((resolve, reject) => { 45 | responders.set(id, {resolve, reject, name}); 46 | }); 47 | } 48 | 49 | port.onMessage.addListener((message) => { 50 | let id = message.id; 51 | let responder = responders.get(id); 52 | if ("result" in message) { 53 | responder.resolve(message.result); 54 | } else if (message.error) { 55 | // Using console.error so we don't ever send this back to the server: 56 | // 57 | console.error("Error calling", responder.name, ":", message.error, message.traceback); // eslint-disable-line no-console 58 | responder.reject(new Error(`Backend error: ${message.error}`)); 59 | } else { 60 | log.warn("Response without result/error:", message); 61 | } 62 | responders.delete(id); 63 | }); 64 | 65 | function setHasActiveArchive() { 66 | hasActiveArchive = true; 67 | for (let item of callCache) { 68 | portCall(item.name, item.args, item.kwargs).then(item.resolve).catch(item.reject); 69 | } 70 | callCache = []; 71 | } 72 | 73 | /* Each of these exported functions is a function in browsinglab.connect: */ 74 | 75 | exports.add_activity_list = function(activityItems) { 76 | if (!hasActiveArchive) { 77 | // Just throw it away then 78 | log.warn("Disposing of activity", hasActiveArchive); 79 | return null; 80 | } 81 | return portCall("add_activity_list", [], {browserId, sessionId, activityItems}); 82 | }; 83 | 84 | exports.register_browser = function() { 85 | return portCall("register_browser", [], { 86 | browserId, 87 | userAgent: navigator.userAgent, 88 | devicePixelRatio: window.devicePixelRatio, 89 | }); 90 | }; 91 | 92 | exports.register_session = function() { 93 | return portCall("register_session", [sessionId, browserId, (new Date()).getTimezoneOffset()]); 94 | }; 95 | 96 | exports.check_page_needed = function(url) { 97 | return portCall("check_page_needed", [url]); 98 | }; 99 | 100 | // FIXME: should be (url, pageData) but needs updating in saver.py 101 | exports.add_fetched_page = function(id, url, page) { 102 | return portCall("add_fetched_page", [id, url, page]); 103 | }; 104 | 105 | exports.log = function({level, args, stack}) { 106 | return 
portCall("log", args, {level, stack}, true); 107 | }; 108 | 109 | exports.set_active_archive = async function(path) { 110 | await portCall("set_active_archive", [path], {}, true); 111 | setHasActiveArchive(); 112 | await exports.register_browser(); 113 | await exports.register_session(); 114 | }; 115 | 116 | exports.unset_active_archive = async function() { 117 | hasActiveArchive = false; 118 | await portCall("unset_active_archive"); 119 | }; 120 | 121 | exports.set_archive_title = function(title) { 122 | return portCall("set_archive_title", [title]); 123 | }; 124 | 125 | exports.get_archive_info = function() { 126 | return portCall("get_archive_info", [], {}, true); 127 | }; 128 | 129 | exports.get_all_archives = function() { 130 | return portCall("get_all_archives", [], {}, true); 131 | }; 132 | 133 | return exports; 134 | })(); 135 | -------------------------------------------------------------------------------- /extension/contentLoader.js: -------------------------------------------------------------------------------- 1 | this.contentLoader = (function() { 2 | const exports = {}; 3 | 4 | const SCRIPTS = [ 5 | "browser-polyfill.js", 6 | "build/buildSettings.js", 7 | "log.js", 8 | "catcher.js", 9 | "util.js", 10 | "elementToSelector.js", 11 | "rssFinder.js", 12 | "contentWatcher.js", 13 | ]; 14 | 15 | exports.loadScripts = async function(tabId) { 16 | for (const script of SCRIPTS) { 17 | await browser.tabs.executeScript(tabId, { 18 | file: script, 19 | runAt: "document_idle", 20 | }); 21 | } 22 | }; 23 | 24 | exports.trackTabs = function() { 25 | let callback = (tab) => { 26 | console.log("tab update", tab.id, tab.status); 27 | if (tab.status === "loading") { 28 | exports.loadScripts(tab.id); 29 | } 30 | }; 31 | browser.tabs.onUpdated.addListener(callback, { 32 | properties: ["status"], 33 | }); 34 | let cancel = () => { 35 | browser.tabs.onUpdated.removeListener(callback); 36 | }; 37 | return cancel; 38 | }; 39 | 40 | return exports; 41 | })(); 42 | -------------------------------------------------------------------------------- /extension/contentWatcher.js: -------------------------------------------------------------------------------- 1 | /* globals elementToSelector, rssFinder */ 2 | 3 | this.contentWatcher = (function() { 4 | 5 | const IDLE_TIME = 30000; 6 | const LINK_TEXT_LIMIT = 80; 7 | 8 | document.addEventListener("click", (event) => { 9 | let target = event.target; 10 | if (target.tagName === "A") { 11 | browser.runtime.sendMessage({ 12 | type: "anchorClick", 13 | text: target.textContent, 14 | href: target.href 15 | }); 16 | } 17 | }); 18 | 19 | document.addEventListener("copy", (event) => { 20 | let selection = window.getSelection(); 21 | let startLocation; 22 | let endLocation; 23 | if (selection.anchorNode) { 24 | startLocation = elementToSelector(selection.anchorNode); 25 | } 26 | if (selection.focusNode && selection.focusNode !== selection.anchorNode) { 27 | endLocation = elementToSelector(selection.focusNode); 28 | } 29 | browser.runtime.sendMessage({ 30 | type: "copy", 31 | text: window.getSelection().toString(), 32 | startLocation, 33 | endLocation, 34 | }); 35 | }); 36 | 37 | document.addEventListener("change", (event) => { 38 | let changed = event.target; 39 | let isText = changed.tagName === "TEXTAREA"; 40 | if (changed.tagName === "INPUT") { 41 | let type = (changed.getAttribute("text") || "").toLowerCase(); 42 | let textyTypes = [ 43 | "", "text", "password", "email", "number", "search", "tel", "url", 44 | ]; 45 | if (textyTypes.includes(type)) { 46 | isText = 
true; 47 | } 48 | } 49 | browser.runtime.sendMessage({ 50 | type: "change", 51 | isText 52 | }); 53 | }); 54 | 55 | let maxScroll = 0; 56 | let sendScrollTimeout = null; 57 | 58 | window.addEventListener("scroll", function(event) { 59 | let position = window.scrollY; 60 | if (position > maxScroll) { 61 | maxScroll = position; 62 | if (!sendScrollTimeout) { 63 | sendScrollTimeout = setTimeout(() => { 64 | sendScrollTimeout = null; 65 | let documentHeight = Math.max( 66 | document.documentElement.clientHeight, 67 | document.body.clientHeight, 68 | document.documentElement.scrollHeight, 69 | document.body.scrollHeight); 70 | browser.runtime.sendMessage({ 71 | type: "scroll", 72 | maxScroll, 73 | documentHeight 74 | }); 75 | }, 100); 76 | } 77 | } 78 | }); 79 | 80 | window.addEventListener("hashchange", (event) => { 81 | let newHash = (new URL(event.newURL)).hash; 82 | if (!newHash || newHash === "#") { 83 | return; 84 | } 85 | newHash = newHash.substr(1); 86 | let element = document.getElementById(newHash); 87 | if (element) { 88 | browser.runtime.sendMessage({ 89 | type: "hashchange", 90 | hash: newHash, 91 | hasElement: !!element 92 | }); 93 | } 94 | }); 95 | 96 | let activityTimer; 97 | let lastActivity; 98 | let isActive = true; 99 | 100 | function updateActivity() { 101 | lastActivity = Date.now(); 102 | if (!isActive) { 103 | browser.runtime.sendMessage({ 104 | type: "activity" 105 | }); 106 | isActive = true; 107 | } 108 | if (activityTimer) { 109 | clearTimeout(activityTimer); 110 | } 111 | activityTimer = setTimeout(() => { 112 | browser.runtime.sendMessage({ 113 | type: "idle", 114 | lastActivity 115 | }); 116 | activityTimer = null; 117 | isActive = false; 118 | }, IDLE_TIME); 119 | } 120 | 121 | function watchForActivity() { 122 | document.addEventListener("mousemove", updateActivity); 123 | document.addEventListener("keypress", updateActivity); 124 | updateActivity(); 125 | } 126 | 127 | function unwatchForActivity() { 128 | document.removeEventListener("mousemove", updateActivity); 129 | document.removeEventListener("keypress", updateActivity); 130 | if (!isActive) { 131 | isActive = true; 132 | } 133 | clearTimeout(activityTimer); 134 | } 135 | 136 | document.addEventListener("visibilitychange", () => { 137 | if (document.hidden) { 138 | unwatchForActivity(); 139 | } else { 140 | watchForActivity(); 141 | } 142 | }); 143 | 144 | if (!document.hidden) { 145 | watchForActivity(); 146 | } 147 | 148 | function sendDevicePixelRatio() { 149 | browser.runtime.sendMessage({ 150 | type: "devicePixelRatio", 151 | devicePixelRatio: window.devicePixelRatio 152 | }); 153 | } 154 | 155 | window.addEventListener("resize", () => { 156 | sendDevicePixelRatio(); 157 | }); 158 | 159 | function sendBasicMetadata() { 160 | let message = { 161 | type: "basicPageMetadata", 162 | title: document.title 163 | }; 164 | let el = document.querySelector("link[rel=canonical]"); 165 | if (el) { 166 | message.canonicalUrl = el.href; 167 | } 168 | let ogTitleEl = document.querySelector("meta[name='og:title'], meta[name='twitter:title']"); 169 | if (ogTitleEl) { 170 | message.ogTitle = ogTitleEl.getAttribute("content"); 171 | } 172 | browser.runtime.sendMessage(message); 173 | } 174 | 175 | function sendFeedInformation() { 176 | let info = rssFinder(); 177 | if (info) { 178 | browser.runtime.sendMessage({ 179 | type: "feedInformation", 180 | mainFeedUrl: info.mainFeedUrl, 181 | allFeeds: info.allFeeds, 182 | }); 183 | } 184 | } 185 | 186 | function sendLinkInformation() { 187 | let links = 
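// Collect every anchor that has an href, skipping bare "#" links; each entry keeps the
// link text (truncated to LINK_TEXT_LIMIT) plus rel/target/id when present, and links
// back into the current page are reduced to their "#fragment" form before being sent
// to the background script.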
Array.from(document.querySelectorAll("a[href]")); 188 | links = links.filter(el => el.getAttribute("href") !== "#"); 189 | let linkInformation = links.map((el) => { 190 | let info = { 191 | url: el.href 192 | }; 193 | let text = el.textContent; 194 | if (text.length > LINK_TEXT_LIMIT) { 195 | text = text.substr(0, LINK_TEXT_LIMIT) + "..."; 196 | } 197 | info.text = text; 198 | if (el.href.startsWith(location.href.split("#")[0] + "#")) { 199 | info.url = "#" + el.href.split("#")[1]; 200 | } 201 | if (el.rel) { 202 | info.rel = el.rel; 203 | } 204 | if (el.target) { 205 | info.target = el.target; 206 | } 207 | if (el.id) { 208 | info.elementId = el.id; 209 | } 210 | return info; 211 | }); 212 | browser.runtime.sendMessage({ 213 | type: "linkInformation", 214 | linkInformation 215 | }); 216 | } 217 | 218 | sendDevicePixelRatio(); 219 | sendBasicMetadata(); 220 | setTimeout(sendFeedInformation); 221 | setTimeout(sendLinkInformation); 222 | 223 | })(); 224 | -------------------------------------------------------------------------------- /extension/controller.js: -------------------------------------------------------------------------------- 1 | /* globals backgroundOnMessage, buildSettings, communication, activityTracker, browserIdPromise, log */ 2 | 3 | this.controller = (function() { 4 | const exports = {}; 5 | let tracker; 6 | let model = { 7 | selectContainers: false, 8 | selectedContainers: new Set(), 9 | track: false, 10 | archive: { 11 | title: null, 12 | path: null, 13 | } 14 | }; 15 | 16 | const onInstalled = new Promise((resolve) => { 17 | browser.runtime.onInstalled.addListener(resolve); 18 | }); 19 | 20 | async function init() { 21 | let { temporary } = await onInstalled; 22 | if (temporary) { 23 | if (!model.archive.path) { 24 | model.archive.path = buildSettings.temporaryArchiveLocation; 25 | model.track = true; 26 | openTracker(); 27 | } 28 | } 29 | } 30 | 31 | backgroundOnMessage.register("updateArchive", (info) => { 32 | if (tracker) { 33 | if (model.track && !info.track) { 34 | closeTracker(); 35 | } else if (model.archive.path !== info.archive.path) { 36 | closeTracker(); 37 | } 38 | } 39 | model.selectContainers = info.selectContainers; 40 | model.selectedContainers = new Set(info.selectedContainers); 41 | model.track = info.track; 42 | model.archive = info.archive; 43 | if (model.track && model.archive.path) { 44 | openTracker(); 45 | } 46 | }); 47 | 48 | backgroundOnMessage.register("requestUpdateArchive", () => { 49 | browser.runtime.sendMessage({ 50 | type: "updateArchive", 51 | selectContainers: model.selectContainers, 52 | selectedContainers: Array.from(model.selectedContainers.values()), 53 | track: model.track, 54 | archive: model.archive, 55 | }); 56 | }); 57 | 58 | function closeTracker() { 59 | tracker.uninit(); 60 | tracker = null; 61 | communication.unset_active_archive(); 62 | } 63 | 64 | async function openTracker() { 65 | await communication.set_active_archive(model.archive.path); 66 | await communication.set_archive_title(model.archive.title); 67 | tracker = new activityTracker.Tracker(); 68 | tracker.init(); 69 | } 70 | 71 | browserIdPromise.then(async () => { 72 | await init(); 73 | }).catch((e) => { 74 | log.error("Error initializing:", String(e), e, e.stack); 75 | }); 76 | 77 | return exports; 78 | })(); 79 | -------------------------------------------------------------------------------- /extension/controls/popup.css: -------------------------------------------------------------------------------- 1 | #container { 2 | padding: 1em; 3 | } 4 | 
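/* #container is the mount point that controls/popup.jsx renders into via ReactDOM. */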
-------------------------------------------------------------------------------- /extension/controls/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Popup 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /extension/controls/popup.jsx: -------------------------------------------------------------------------------- 1 | /* globals React, ReactDOM */ 2 | 3 | const model = { 4 | selectContainers: false, 5 | selectedContainers: new Set(), 6 | track: false, 7 | archive: { 8 | title: null, 9 | path: null, 10 | }, 11 | }; 12 | 13 | class Interface extends React.Component { 14 | render() { 15 | return
16 | 17 | 18 | 23 |
; 24 | } 25 | } 26 | 27 | class ArchiveSelector extends React.Component { 28 | render() { 29 | return
30 | 39 | 48 |
; 49 | } 50 | 51 | changeArchiveTitle(event) { 52 | let title = event.target.value; 53 | model.archive.title = title; 54 | render(); 55 | } 56 | 57 | changeArchivePath(event) { 58 | let path = event.target.value; 59 | model.archive.path = path; 60 | render(); 61 | } 62 | } 63 | 64 | class GeneralControl extends React.Component { 65 | render() { 66 | return
67 | 71 |
; 72 | } 73 | 74 | onCheck(event) { 75 | model.track = event.target.checked; 76 | browser.runtime.sendMessage({ 77 | type: "track", 78 | value: model.track, 79 | }); 80 | render(); 81 | } 82 | } 83 | 84 | class ContainerSelector extends React.Component { 85 | render() { 86 | return
87 |
    88 |
  • 93 | { this.props.containers.map(c => { 94 | return
  • ; 99 | })} 100 |
101 |
; 102 | } 103 | 104 | onCheckSelectContainers(event) { 105 | model.selectContainers = !!event.target.checked; 106 | sendModel(); 107 | render(); 108 | } 109 | 110 | onCheckContainer(c, event) { 111 | if (event.target.checked) { 112 | model.selectedContainers.add(c.name); 113 | } else { 114 | model.selectedContainers.delete(c.name); 115 | } 116 | sendModel(); 117 | render(); 118 | } 119 | } 120 | 121 | function sendModel() { 122 | browser.runtime.sendMessage({ 123 | type: "updateArchive", 124 | selectContainers: model.selectContainers, 125 | selectedContainers: Array.from(model.selectedContainers.values()), 126 | track: model.track, 127 | archive: model.archive, 128 | }); 129 | } 130 | 131 | browser.runtime.onMessage.addListener((message) => { 132 | if (message.type !== "updateArchive") { 133 | return; 134 | } 135 | if ("selectContainers" in message) { 136 | model.selectContainers = message.selectContainers; 137 | } 138 | if ("selectedContainers" in message) { 139 | model.selectedContainers = new Set(message.selectedContainers); 140 | } 141 | if ("track" in message) { 142 | model.track = !!message.track; 143 | } 144 | if ("archive" in message) { 145 | model.archive = message.archive; 146 | } 147 | render(); 148 | }); 149 | 150 | browser.runtime.sendMessage({ 151 | type: "requestUpdateArchive", 152 | }); 153 | 154 | async function render() { 155 | let containers = await browser.contextualIdentities.query({}); 156 | let page = ; 157 | ReactDOM.render(page, document.getElementById("container")); 158 | } 159 | 160 | render(); 161 | -------------------------------------------------------------------------------- /extension/elementToSelector.js: -------------------------------------------------------------------------------- 1 | this.elementToSelector = function elementToSelector(el) { 2 | let singletons = {BODY: true, HEAD: true}; 3 | let parts = []; 4 | for (;;) { 5 | if (singletons[el.tagName]) { 6 | parts.unshift(el.tagName.toLowerCase()); 7 | break; 8 | } 9 | if (el.id) { 10 | parts.unshift(`#${el.id}`); 11 | break; 12 | } 13 | let parent = el.parentNode; 14 | let position = Array.from(parent.childNodes).indexOf(el); 15 | parts.unshift(`*:nth-child(${position + 1})`); 16 | el = parent; 17 | } 18 | return parts.join(" > "); 19 | }; 20 | null; 21 | -------------------------------------------------------------------------------- /extension/icon-live.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | -------------------------------------------------------------------------------- /extension/icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | -------------------------------------------------------------------------------- /extension/log.js: -------------------------------------------------------------------------------- 1 | /* globals buildSettings, communication, backgroundOnMessage */ 2 | /* eslint-disable no-console */ 3 | 4 | "use strict"; 5 | 6 | this.log = (function() { 7 | const exports = {}; 8 | 9 | const levels = ["debug", "info", "warn", "error"]; 10 | if (!levels.includes(buildSettings.logLevel)) { 11 | console.warn("Invalid buildSettings.logLevel:", buildSettings.logLevel); 12 | } 13 | const shouldLog = {}; 14 | const shouldLogServer = {}; 15 | 16 | { 17 | let startLogging = false; 18 | let startServerLogging = false; 19 | for (const level of levels) { 20 | if (buildSettings.logLevel === level) { 21 | startLogging = true; 22 | } 23 | if (buildSettings.serverLogLevel === level) { 24 | 
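// Mirrors the console-logging threshold above: once the configured serverLogLevel is
// reached in the ordered `levels` list, this level and every more severe one is also
// forwarded to the backend (via communication.log, or a "log" runtime message from
// content scripts).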
startServerLogging = true; 25 | } 26 | if (startLogging) { 27 | shouldLog[level] = true; 28 | } 29 | if (startServerLogging) { 30 | shouldLogServer[level] = true; 31 | } 32 | } 33 | } 34 | 35 | function logger(level) { 36 | return function(...args) { 37 | logWithLevel(level, args); 38 | }; 39 | } 40 | 41 | function logWithLevel(level, args, stack) { 42 | if (shouldLog[level]) { 43 | let newArgs = []; 44 | for (let arg of args) { 45 | newArgs.push(arg); 46 | if (arg instanceof Error) { 47 | newArgs.push(String(arg)); 48 | } 49 | } 50 | console[level](...newArgs); 51 | } 52 | if (shouldLogServer[level]) { 53 | let newArgs = []; 54 | if (!stack) { 55 | let stackLines = (new Error()).stack.split("\n"); 56 | while (stackLines[0] && /\/log.js:/.test(stackLines[0])) { 57 | stackLines.shift(); 58 | } 59 | stack = stackLines.join("\n"); 60 | } 61 | for (let arg of args) { 62 | if (arg instanceof Error) { 63 | newArgs.push(String(arg)); 64 | newArgs.push(arg.stack); 65 | } else { 66 | newArgs.push(arg); 67 | } 68 | } 69 | if (typeof communication !== "undefined") { 70 | communication.log({level, args: newArgs, stack}); 71 | } else { 72 | browser.runtime.sendMessage({type: "log", level, args: newArgs, stack}); 73 | } 74 | } 75 | } 76 | 77 | if (typeof backgroundOnMessage !== "undefined") { 78 | backgroundOnMessage.register("log", (message) => { 79 | logWithLevel(message.level, message.args, message.stack); 80 | }); 81 | } 82 | 83 | exports.debug = logger("debug"); 84 | exports.info = logger("info"); 85 | exports.warn = logger("warn"); 86 | exports.error = logger("error"); 87 | 88 | return exports; 89 | })(); 90 | -------------------------------------------------------------------------------- /extension/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "name": "Browsing Lab", 4 | "version": "0.1.0", 5 | "description": "Collects browsing information for later study", 6 | "author": "Ian Bicking ", 7 | "homepage_url": "https://github.com/ianb/personal-history-archive", 8 | "applications": { 9 | "gecko": { 10 | "id": "browsing-lab@ianbicking.org" 11 | } 12 | }, 13 | "browser_action": { 14 | "default_icon": { 15 | "32": "icon.svg" 16 | }, 17 | "default_title": "Browsing Lab", 18 | "browser_style": true, 19 | "default_popup": "controls/popup.html" 20 | }, 21 | "background": { 22 | "scripts": [ 23 | "browser-polyfill.js", 24 | "build/buildSettings.js", 25 | "backgroundOnMessage.js", 26 | "log.js", 27 | "catcher.js", 28 | "util.js", 29 | "browserId.js", 30 | "communication.js", 31 | "scraper/scrapeTab.js", 32 | "contentLoader.js", 33 | "activityTracker.js", 34 | "controller.js" 35 | ] 36 | }, 37 | "permissions": [ 38 | "", 39 | "history", 40 | "storage", 41 | "tabs", 42 | "webNavigation", 43 | "webRequest", 44 | "nativeMessaging", 45 | "notifications", 46 | "webRequestBlocking", 47 | "contextualIdentities" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /extension/rssFinder.js: -------------------------------------------------------------------------------- 1 | this.rssFinder = (function() { 2 | 3 | const urlPatterns = [ 4 | /^\/feeds?$/, 5 | /^\/feeds?\/[a-zA-Z0-9]+$/, 6 | /\.xml$/, 7 | /\/feed\/?$/, 8 | /$\/(rss|atom)/, 9 | /\/rss\//, 10 | /[./]rss2?$/, 11 | // Business Insider: 12 | /rss.*\.cms$/, 13 | // The Philly Inquirer and others: 14 | /rss\.html$/, 15 | // Seattle PI: 16 | /collectionRss/, 17 | ]; 18 | 19 | const domainPatterns = [ 20 | /^feeds\./, 21 | // USA 
Today: 22 | /^rss(feeds)?\./, 23 | /^feeds[0-9]?\.feedburner\.com/, 24 | ]; 25 | 26 | const queryStringPatterns = [ 27 | // Miami Herald: 28 | /getXmlFeed/, 29 | /rssfeed/, 30 | // Sun Times: 31 | /template=rss/, 32 | // St Louis Post-Dispatch: 33 | /f=rss/, 34 | /feed=rss/, 35 | ]; 36 | 37 | // FIXME: use these 38 | const hintPatterns = [ 39 | /^https?:\/\/add.my.yahoo.com\/rss\?url=([^&]+)/, 40 | /^https?:\/\/feedly.com\/#subscription\/feed\/(.*)/, 41 | /https?:\/\/reader.aol.com\/#subscription\/(.*)/, 42 | ]; 43 | 44 | function isMaybeRssLink(url) { 45 | let urlObj = new URL(url); 46 | for (let pat of urlPatterns) { 47 | if (pat.test(urlObj.pathname)) { 48 | return true; 49 | } 50 | } 51 | for (let pat of domainPatterns) { 52 | if (pat.test(urlObj.hostname)) { 53 | return true; 54 | } 55 | } 56 | for (let pat of queryStringPatterns) { 57 | if (pat.test(urlObj.search)) { 58 | return true; 59 | } 60 | } 61 | return false; 62 | } 63 | 64 | function rssFinder() { 65 | let contentTypes = [ 66 | "application/rss+xml", 67 | "application/atom+xml", 68 | "application/rdf+xml", 69 | "application/rss", 70 | "application/atom", 71 | "application/rdf", 72 | "text/rss+xml", 73 | "text/atom+xml", 74 | "text/rdf+xml", 75 | "text/rss", 76 | "text/atom", 77 | "text/rdf", 78 | ]; 79 | let selector = contentTypes.map((t) => `link[rel=alternate][type="${t}"]`).join(", "); 80 | let feeds = document.querySelectorAll(selector); 81 | if (!feeds.length) { 82 | return null; 83 | } 84 | let mainFeedUrl = feeds[0].href; 85 | let allFeeds = Array.from(feeds).map(el => ({type: el.type, href: el.href, title: el.title})); 86 | let speculativeFeedLinks = Array.from(document.querySelectorAll("a[href]")); 87 | speculativeFeedLinks = speculativeFeedLinks.filter(a => a.href && isMaybeRssLink(a.href)); 88 | speculativeFeedLinks = speculativeFeedLinks.map(a => { 89 | return { 90 | href: a.href, 91 | anchorText: a.textContent.substr(0, 100), 92 | }; 93 | }); 94 | // Never keep more than 40 links, just in case: 95 | speculativeFeedLinks.splice(40); 96 | if (!speculativeFeedLinks.length) { 97 | speculativeFeedLinks = undefined; 98 | } 99 | return { 100 | mainFeedUrl, 101 | allFeeds, 102 | speculativeFeedLinks, 103 | }; 104 | } 105 | 106 | return rssFinder; 107 | 108 | })(); 109 | null; 110 | -------------------------------------------------------------------------------- /extension/scraper/extractor-worker.js: -------------------------------------------------------------------------------- 1 | /* globals Readability, document, console, location, makeStaticHtml, log, util, buildSettings */ 2 | 3 | /** extractor-worker is a content worker that is attached to a page when 4 | making a shot 5 | 6 | extractData() does the main work 7 | */ 8 | 9 | var extractorWorker = (function() { // eslint-disable-line no-unused-vars 10 | /** Extracts data: 11 | - Gets the Readability version of the page (`.readable`) 12 | - Finds images in roughly the preferred order (`.images`) 13 | */ 14 | let exports = {}; 15 | 16 | exports.extractData = function() { 17 | let start = Date.now(); 18 | let readableDiv; 19 | let readable; 20 | if (typeof Readability !== "undefined") { 21 | let result = extractReadable(); 22 | if (result) { 23 | readable = result; 24 | } else { 25 | readable = null; 26 | } 27 | } else { 28 | log.info("Skipping readability: not installed"); 29 | } 30 | let images = findImages([ 31 | {element: document.head, isReadable: false}, 32 | {element: readableDiv, isReadable: true}, 33 | {element: document.body, isReadable: false}]); 34 | 
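// Image candidates come from the head (og:image / twitter:image meta tags), the
// readable article content when available, and finally the body; findImages() below
// de-duplicates them by URL and skips <img> elements smaller than
// MIN_IMAGE_WIDTH x MIN_IMAGE_HEIGHT.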
log.info(`Image time: ${Date.now() - start}ms`); 35 | let siteName = findSiteName(); 36 | log.info(`extractData time: ${Date.now() - start}ms`); 37 | return { 38 | readable, 39 | images, 40 | siteName 41 | }; 42 | }; 43 | 44 | function extractReadable() { 45 | // Readability is destructive, so we have to run it on a copy 46 | let loc = document.location; 47 | let uri = { 48 | spec: loc.href, 49 | host: loc.host, 50 | prePath: loc.protocol + "//" + loc.host, 51 | scheme: loc.protocol.substr(0, loc.protocol.indexOf(":")), 52 | pathBase: loc.protocol + "//" + loc.host + loc.pathname.substr(0, loc.pathname.lastIndexOf("/") + 1) 53 | }; 54 | let article; 55 | let id = util.makeUuid(); 56 | let index = 1; 57 | for (let el of document.getElementsByTagName("*")) { 58 | el.setAttribute("data-tmp-id", `${id}-${index}`); 59 | index++; 60 | } 61 | var documentClone = document.cloneNode(true); 62 | try { 63 | article = new Readability(uri, documentClone).parse(); 64 | if (article) { 65 | let newDiv = document.createElement("div"); 66 | newDiv.innerHTML = article.content; // eslint-disable-line no-unsanitized/property 67 | for (let el of newDiv.querySelectorAll("*[data-tmp-id]")) { 68 | let id = el.getAttribute("data-tmp-id"); 69 | let origEl = document.querySelector(`*[data-tmp-id='${id}']`); 70 | let found = false; 71 | let parent = origEl.parentNode; 72 | while (parent) { 73 | if (parent.getAttribute && parent.getAttribute("data-isreadable")) { 74 | found = true; 75 | break; 76 | } 77 | parent = parent.parentNode; 78 | } 79 | if (!found) { 80 | origEl.setAttribute("data-isreadable", "1"); 81 | } 82 | } 83 | } 84 | } catch (e) { 85 | log.warn("Exception getting readable version:", e); 86 | article = {error: String(e), errorStack: e.stack}; 87 | } 88 | for (let el of document.getElementsByTagName("*")) { 89 | el.removeAttribute("data-tmp-id"); 90 | } 91 | return article; 92 | } 93 | 94 | // Images smaller than either of these sizes are skipped: 95 | let MIN_IMAGE_WIDTH = 250; 96 | let MIN_IMAGE_HEIGHT = 200; 97 | 98 | /** Finds images in any of the given elements, avoiding duplicates 99 | Looks for Open Graph og:image, then img elements, sorting img 100 | elements by width (largest preferred) */ 101 | function findImages(elements) { 102 | let images = []; 103 | let found = {}; 104 | function addImage(imgData) { 105 | if (!(imgData && imgData.url)) { 106 | return; 107 | } 108 | if (found[imgData.url]) { 109 | return; 110 | } 111 | images.push(imgData); 112 | found[imgData.url] = true; 113 | } 114 | for (let i = 0; i < elements.length; i++) { 115 | let el = elements[i].element; 116 | if (!el) { 117 | continue; 118 | } 119 | let isReadable = elements[i].isReadable; 120 | let ogs = el.querySelectorAll("meta[property='og:image'], meta[name='twitter:image']"); 121 | let j; 122 | for (j = 0; j < ogs.length; j++) { 123 | let src = ogs[j].getAttribute("content"); 124 | let a = document.createElement("a"); 125 | a.href = src; 126 | src = a.href; 127 | if (src.search(/^https?/i) === -1) { 128 | continue; 129 | } 130 | addImage({ 131 | url: src 132 | }); 133 | } 134 | let imgs = el.querySelectorAll("img"); 135 | imgs = Array.prototype.slice.call(imgs); 136 | // Widest images first: 137 | imgs.sort(function(a, b) { 138 | if (a.width > b.width) { 139 | return -1; 140 | } 141 | return 1; 142 | }); 143 | for (j = 0; j < imgs.length; j++) { 144 | let img = imgs[j]; 145 | if ((!img.src) || (img.src.search(/^https?/i) === -1)) { 146 | continue; 147 | } 148 | if (img.width >= MIN_IMAGE_WIDTH && img.height >= 
MIN_IMAGE_HEIGHT) { 149 | addImage({ 150 | url: img.src, 151 | dimensions: {x: img.width, y: img.height}, 152 | title: img.getAttribute("title") || null, 153 | alt: img.getAttribute("alt") || null, 154 | isReadable 155 | }); 156 | } 157 | } 158 | } 159 | return images; 160 | } 161 | 162 | function findSiteName() { 163 | let el = document.querySelector("meta[property='og:site_name']"); 164 | if (el) { 165 | return el.getAttribute("content"); 166 | } 167 | // nytimes.com uses this property: 168 | el = document.querySelector("meta[name='cre']"); 169 | if (el) { 170 | return el.getAttribute("content"); 171 | } 172 | return null; 173 | } 174 | 175 | exports.documentStaticJson = async function() { 176 | let json = {}; 177 | Object.assign(json, exports.extractData()); 178 | Object.assign(json, await makeStaticHtml.documentStaticData()); 179 | if (buildSettings.setFreezeMarker) { 180 | let el = document.createElement("span"); 181 | el.id = "browsinglab-completed-freeze"; 182 | el.style.display = "none"; 183 | document.body.appendChild(el); 184 | } 185 | return json; 186 | }; 187 | 188 | return exports; 189 | 190 | })(); 191 | -------------------------------------------------------------------------------- /extension/scraper/scrapeTab.js: -------------------------------------------------------------------------------- 1 | /* globals util, log, buildSettings */ 2 | 3 | this.scrapeTab = (function() { 4 | 5 | let restrictiveCsp = "font-src 'none'; frame-src 'self' data:; object-src 'none'; worker-src 'none'; manifest-src 'none'"; 6 | 7 | let rssContentTypes = [ 8 | "application/rss+xml", 9 | "application/atom+xml", 10 | "application/rdf+xml", 11 | "application/rss", 12 | "application/atom", 13 | "application/rdf", 14 | "text/rss+xml", 15 | "text/atom+xml", 16 | "text/rdf+xml", 17 | "text/rss", 18 | "text/atom", 19 | "text/rdf", 20 | ]; 21 | 22 | async function scrapeTab(tabId, requireUrl) { 23 | let scraped = await scrapeTabDOM(tabId, requireUrl); 24 | await addRss(scraped); 25 | return scraped; 26 | } 27 | 28 | async function scrapeTabDOM(tabId, requireUrl) { 29 | let start = Date.now(); 30 | let foundUrl = await waitForStableTab(tabId); 31 | if (foundUrl !== requireUrl) { 32 | log.debug("Change", requireUrl, "to", foundUrl); 33 | throw new Error("URL changed from what was expected"); 34 | } 35 | for (let file of ["build/buildSettings.js", "log.js", "util.js", "elementToSelector.js", "rssFinder.js", "scraper/make-static-html.js", "scraper/Readability.js", "scraper/extractor-worker.js"]) { 36 | await browser.tabs.executeScript(tabId, {file}); 37 | } 38 | let resultList = await browser.tabs.executeScript(tabId, { 39 | code: "extractorWorker.documentStaticJson()" 40 | }); 41 | resultList[0].timeToFetch = Date.now() - start; 42 | return resultList[0]; 43 | } 44 | 45 | async function addRss(scraped) { 46 | if (scraped.allFeeds) { 47 | scraped.feeds = []; 48 | for (let feed of scraped.allFeeds) { 49 | scraped.feeds.push(await getFeed(feed, true)); 50 | } 51 | log.info("Scraped feeds:", scraped.feeds.length, "bytes:", JSON.stringify(scraped.feeds).length); 52 | } 53 | if (scraped.speculativeFeedLinks) { 54 | let found = 0; 55 | for (let feed of scraped.speculativeFeedLinks) { 56 | let fetched = await getFeed(feed, false); 57 | if (fetched) { 58 | found++; 59 | scraped.feeds.push(fetched); 60 | } else { 61 | feed.shouldDelete = true; 62 | } 63 | } 64 | log.info("Scraped feed links:", found, "of potential", scraped.speculativeFeedLinks.length); 65 | scraped.speculativeFeedLinks = 
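// Speculative links are only URL-pattern guesses; any whose response did not carry a
// recognized feed content type were marked shouldDelete above, so they are filtered
// out here (and the whole list is dropped if nothing survives).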
scraped.speculativeFeedLinks.filter(f => !f.shouldDelete); 66 | if (!scraped.speculativeFeedLinks.length) { 67 | delete scraped.speculativeFeedLinks; 68 | } 69 | } 70 | } 71 | 72 | async function getFeed(feed, ignoreContentType) { 73 | let start = Date.now(); 74 | let result = { 75 | url: feed.href, 76 | fetchStart: start, 77 | }; 78 | try { 79 | let resp = await fetch(feed.href); 80 | if (!resp.ok) { 81 | result.error = "Response error"; 82 | result.status = resp.status; 83 | result.statusCode = resp.statusCode; 84 | } else { 85 | result.body = await resp.text(); 86 | result.contentType = resp.headers.get("Content-Type").split(";")[0]; 87 | if (!ignoreContentType && !rssContentTypes.includes(result.contentType)) { 88 | return null; 89 | } 90 | result.lastModified = (new Date(resp.headers.get("Last-Modified"))).getTime(); 91 | } 92 | result.fetchTime = Date.now() - start; 93 | if (resp.url !== feed.href) { 94 | result.redirectUrl = resp.url; 95 | } 96 | return result; 97 | } catch (e) { 98 | log.error("Got error fetching feed", feed, e); 99 | result.fetchTime = Date.now() - start; 100 | result.error = String(e); 101 | result.errorStack = e.stack; 102 | return result; 103 | } 104 | } 105 | 106 | async function waitForStableTab(tabId, attempts = 3) { 107 | let originalUrl; 108 | let tab = await browser.tabs.get(tabId); 109 | originalUrl = tab.url; 110 | await waitForIdle(tabId); 111 | if (!attempts) { 112 | return tab.url; 113 | } 114 | await util.sleep(buildSettings.idleWaitTime); 115 | tab = await browser.tabs.get(tabId); 116 | if (tab.url !== originalUrl) { 117 | return waitForStableTab(tabId, attempts - 1); 118 | } 119 | return tab.url; 120 | } 121 | 122 | function waitForIdle(tabId) { 123 | return browser.tabs.executeScript(tabId, { 124 | code: "null", 125 | runAt: "document_start" 126 | }); 127 | } 128 | 129 | function installCsp() { 130 | let options = ["blocking", "responseHeaders"]; 131 | let filter = { 132 | types: ["main_frame"], 133 | urls: ["http://*/*", "https://*/*"], 134 | }; 135 | browser.webRequest.onHeadersReceived.addListener( 136 | cspHeaderRewriter, 137 | filter, 138 | options, 139 | ); 140 | return () => { 141 | browser.webRequest.onHeadersReceived.removeListener( 142 | cspHeaderRewriter, 143 | filter, 144 | options, 145 | ); 146 | }; 147 | } 148 | 149 | function cspHeaderRewriter(info) { 150 | let headers = info.responseHeaders; 151 | for (let i = 0; i < headers.length; i++) { 152 | let name = headers[i].name.toLowerCase(); 153 | if (name === "content-security-policy" || name === "content-security-policy-report-only") { 154 | headers.splice(i, 1); 155 | i--; 156 | } 157 | } 158 | headers.push({ 159 | name: "Content-Security-Policy", 160 | value: restrictiveCsp, 161 | }); 162 | return {"responseHeaders": headers}; 163 | } 164 | 165 | if (buildSettings.cspRestrict) { 166 | installCsp(); 167 | log.info("Installed CSP adder for all requests"); 168 | } 169 | 170 | return scrapeTab; 171 | })(); 172 | -------------------------------------------------------------------------------- /extension/util.js: -------------------------------------------------------------------------------- 1 | this.util = (function() { 2 | let exports = {}; 3 | 4 | exports.sleep = function(time) { 5 | return new Promise((resolve) => { 6 | setTimeout(resolve, time); 7 | }); 8 | }; 9 | 10 | exports.makeUuid = function() { // eslint-disable-line no-unused-vars 11 | // get sixteen unsigned 8 bit random values 12 | let randomValues = window 13 | .crypto 14 | .getRandomValues(new Uint8Array(36)); 15 | 16 | 
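// Each "x"/"y" placeholder is replaced using the random byte at the same string offset
// (the second-to-last argument passed to the replace callback); the literal "4" in the
// template supplies the UUID version, and "y" is masked to 8-b for the variant bits.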
return "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, function(c) { 17 | let i = Array.prototype.slice.call(arguments).slice(-2)[0]; // grab the `offset` parameter 18 | let r = randomValues[i] % 16|0, v = c === "x" ? r : (r & 0x3 | 0x8); 19 | return v.toString(16); 20 | }); 21 | }; 22 | 23 | return exports; 24 | })(); 25 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python3 -m venv .venv 4 | ./.venv/bin/pip install --upgrade pip 5 | ./.venv/bin/pip install -e . 6 | ./.venv/bin/pip install -r ./dev-requirements.txt 7 | if [[ ! -e blab ]] ; then 8 | ln -s ./.venv/bin/blab . 9 | fi 10 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "personal-history-archive", 3 | "description": "A server and browser extension for saving a personal archive", 4 | "version": "0.1.0", 5 | "author": "Ian Bicking ", 6 | "bugs": { 7 | "url": "https://github.com/ianb/personal-history-archive/issues" 8 | }, 9 | "dependencies": { 10 | "react": "^16.3.1", 11 | "react-dom": "^16.3.1", 12 | "readability": "git+https://github.com/mozilla/readability.git", 13 | "webextension-polyfill": "^0.2.1" 14 | }, 15 | "devDependencies": { 16 | "babel-cli": "^6.26.0", 17 | "babel-preset-env": "^1.6.1", 18 | "babel-preset-react": "^6.24.1", 19 | "cookie-parser": "^1.4.3", 20 | "eslint-plugin-mozilla": "^0.4.10", 21 | "eslint-plugin-no-unsanitized": "^2.0.2", 22 | "eslint-plugin-promise": "^3.6.0", 23 | "eslint-plugin-react": "^7.12.4", 24 | "firefox-profile": "^1.2.0", 25 | "geckodriver": "^1.14.1", 26 | "mocha": "^5.0.4", 27 | "node-feedparser": "^1.0.1", 28 | "random-seed": "^0.3.0", 29 | "selenium-webdriver": "^4.0.0-alpha.1", 30 | "web-ext": "^2.9.3" 31 | }, 32 | "homepage": "https://github.com/ianb/personal-history-archive", 33 | "license": "MPL-2.0", 34 | "repository": { 35 | "type": "git", 36 | "url": "git+https://github.com/ianb/personal-history-archive.git" 37 | }, 38 | "scripts": { 39 | "start": "npm run build:dev && web-ext run -f \"${FIREFOX:-nightly}\" -s extension/ --keep-profile-changes --firefox-profile dev-data/Profile --browser-console", 40 | "build:dev": "mkdir -p dev-data && npm run build:deps && LOG_LEVEL=debug SERVER_LOG_LEVEL=debug TESTING_BROWSER=1 DEST=dev-data npm run build:ext && mkdir -p dev-data/Profile", 41 | "lint:js": "eslint .", 42 | "build:zip": "web-ext build --source-dir ${DEST:-build}/extension/ --ignore-files '**/README.md' --ignore-files '**/*.template' --overwrite-dest && mv web-ext-artifacts/personal_history_saver*.zip ${DEST:-build}/extension.zip && rmdir web-ext-artifacts || true", 43 | "build:ext": "mkdir -p extension/build && .venv/bin/python -m browsinglab.subenvvars < extension/buildSettings.js.tmpl > extension/build/buildSettings.js", 44 | "build:deps": "mkdir -p extension/build/ && cp node_modules/react/umd/react.production.min.js node_modules/react-dom/umd/react-dom.production.min.js node_modules/readability/Readability.js extension/build/ && babel --retain-lines extension/controls/popup.jsx > extension/build/popup.js", 45 | "test": "npm run test:build-data && npm run test:build-ext && npm run test:selenium", 46 | "test:build-data": "rm -rf test/test-data/ && mkdir -p test/test-data/", 47 | "test:build-ext": "NATIVE_SCRIPT=pha.saver.test IDLE_WAIT_TIME=0 HISTORY_PAUSE=0 
LOG_LEVEL=debug SERVER_LOG_LEVEL=debug TESTING_BROWSER=1 DEST=test/build/ npm run build:ext && DEST=test/build/ npm run build:zip && .venv/bin/python -m pha.saver --script-location test/test-data/pha-saver-script.py --native-name pha.saver.test test/test-data/", 48 | "test:selenium": "PATH=$PATH:/Applications/FirefoxNightly.app/Contents/MacOS/ mocha test/test.js $TEST_ARGS" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Python Library 2 | 3 | To install: 4 | 5 | ```sh 6 | $ pip install -e python/ 7 | # Optional requirements: 8 | $ pip install -r python/requirements.txt 9 | ``` 10 | 11 | ## Usage 12 | 13 | You'll probably want to get an instance of Archive: 14 | 15 | ``` 16 | from pha import Archive 17 | archive = Archive.default_location() 18 | ``` 19 | 20 | Or `Archive(path)`, but normal installation always puts the data into the `data/` directory. 21 | 22 | The key objects are all implemented in [`__init__.py`](./pha/__init__.py): `Archive`, `Activity`, and `Page`. 23 | 24 | * `Activity` is one visit in the browser. This includes any changes to the location hash. This represents both old activity fetched from browser history (from [`HistoryItem`](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/HistoryItem) and [`VisitItem`](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/VisitItem)), as well as new activity (with more complete information available). 25 | * `Page` is a fetched page. By default only one version a page will be created for a given URL (though the code/database allows for multiple pages fetched over time). A page is both stored in the database, as well as in a JSON file in `data/pages/` (the library tries to be resilient when the two sources don't match). 26 | 27 | Note that URLs *do* include the fragment/hash, so `http://example.com/` and `http://example.com/#header` are treated as different. 28 | 29 | Typically you'll call: 30 | 31 | * `archive.get_activity(url)`: get a list of activities for the URL 32 | * `archive.activity()`: get a list of ALL activities 33 | * `archive.activity_with_page()`: get a list of all activity that also have a fetched page 34 | * `archive.sample_activity_with_page(number, unique_url=True, unique_domain=False)`: fetch a random sample of pages. Because there tend to be *lots* of pages from some domains (e.g., gmail.com) this tries to get a sampling of "unique" pages. If you ask for `unique_url` then it will look at the entire URL, normalize segments of the URL, and treat number and non-number segments differently. So it would include a homepage and an article page, but probably not multiple article pages from the same site. `unique_domain` gets only one page per domain. 35 | * `archive.get_activity_by_source(activity.id)`: get every activity that came from the given activity (typically through navigation). 36 | 37 | ### Pages 38 | 39 | You might spend most of your time with the Page objects, at least if you are interested in content parsing and interpretation. 40 | 41 | A few highlights: 42 | 43 | * `page.html`: returns a viewable HTML representation of the page. 44 | * `page.lxml`: returns the page, having been parsed with [lxml.html](http://lxml.de/lxmlhtml.html). 45 | * `page.full_text`: tries to get the text of page. 
46 | * `page.readable_text`: if the page was parseable with [Readability](https://github.com/mozilla/readability) then this will contain the text extracted as part of the article view (excluding navigation, etc). 47 | * `page.readable_html`: an HTML view of the readable portion of the page. 48 | * `page.display_page()`: run in a Jupyter Notebook, this will show the page in an iframe (see also `notebooktools`). 49 | 50 | ## Helpers 51 | 52 | There's several helper modules: 53 | 54 | * [`glovehelper`](./pha/glovehelper.py): helps with calling [GloVe](https://nlp.stanford.edu/projects/glove/). You must install and build the code from that site. The helper lets you pass in a sequence of strings and get vectors back. See [the analyze_classnames notebook](./analyze_classnames.ipynb) for an example. 55 | * [`htmltools`](./pha/htmltools.py): this includes various little functions to help you work with the HTML. Look at [analyze_classnames](./analyze_classnames.ipynb) for examples. 56 | * [`notebooktools`](./pha/notebooktools.py): other tools for working in Jupyter Notebooks. It's used to show inline HTML. 57 | * [`search`](./pha/search.py): creates a search index of your pages. You need the SQLite [FTS5](https://sqlite.org/fts5.html) extension installed. See [the search_example notebook](./search_example.ipynb) for more. 58 | * [`summarytools`](./pha/summarytools.py): some small helpers for doing document summarization. See [the document_summary notebook](./document_summary.ipynb) for more. 59 | 60 | ## Notebooks 61 | 62 | I'm collecting notebooks in this directory as examples, and hopefully they'll grow into simultaneously documentation and interesting data interpretation. It would be cool to have more! 63 | -------------------------------------------------------------------------------- /python/analyze_classnames.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%autoreload\n", 27 | "import pha\n", 28 | "import pha.htmltools\n", 29 | "archive = pha.Archive.default_location()\n", 30 | "print(archive)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "14995\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "histories = archive.histories_with_page()\n", 48 | "print(len(histories))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 6, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "" 60 | ] 61 | }, 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "histories[0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 7, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from collections import Counter\n", 78 | "\n", 79 | "def count_classes(doc):\n", 80 | " counter = Counter()\n", 81 | " for el in doc.cssselect(\"*[class]\"):\n", 82 | " for phrase in pha.htmltools.normalize_classes(el):\n", 83 | " counter[phrase] += 1\n", 84 | " return counter" 85 | ] 86 | }, 87 | { 
88 | "cell_type": "code", 89 | "execution_count": 8, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "base_counter = Counter()\n", 94 | "by_doc = Counter()\n", 95 | "for history in histories:\n", 96 | " c = count_classes(history.page.lxml)\n", 97 | " base_counter.update(c)\n", 98 | " by_doc.update(c.keys())" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 9, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "(280151, 280151)" 110 | ] 111 | }, 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "len(base_counter), len(by_doc)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Total counts: [('blank-may', 115345), ('noncollaps', 122856), ('reportform', 127379), ('thing', 127395), ('child', 127401), ('entri', 127438), ('parent', 127660), ('flat-list', 128492), ('taglin', 128495), ('-gb', 143091), ('button', 163053), ('bylink', 169683), ('arrow', 205311), ('scope-style', 215843), ('ctrl-f-no', 253178), ('clearleft', 254758), ('score', 299354), ('unvot', 328883), ('access-requir', 503082), ('login-requir', 602254)]\n", 131 | "By document: [('clear', 1513), ('js', 1548), ('hidden', 1553), ('undefin', 1604), ('comment', 1707), ('md', 1751), ('col', 1752), ('link', 1784), ('activ', 1858), ('titl', 1948), ('author', 2014), ('dropdown', 2113), ('footer', 2136), ('button', 2155), ('select', 2354), ('fit-shrink-to', 2396), ('btn', 2534), ('contain', 2539), ('icon', 2632), ('content', 3173)]\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "print(\"Total counts:\", sorted(base_counter.items(), key=lambda x: x[1])[-20:])\n", 137 | "print(\"By document:\", sorted(by_doc.items(), key=lambda x: x[1])[-20:])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## Prepare classes to be vectorized\n", 145 | "\n", 146 | "This creates one long file that has all the concatenated stemmed class names for all documents. 
This is reasonable for training different embedding vectors (mapping class names to vectors of floats):" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 52, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%autoreload\n", 156 | "import pha.glovehelper\n", 157 | "pha.glovehelper.set_glove_path(\"/Users/ianbicking/src/personal-history-archive/tmp/GloVe\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 53, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "#: 23005752 Mb: 232\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "%autoreload\n", 175 | "import pha.htmltools\n", 176 | "import random\n", 177 | "shuffled_histories = list(histories)\n", 178 | "random.shuffle(shuffled_histories)\n", 179 | "all_classes = []\n", 180 | "for history in shuffled_histories:\n", 181 | " for el in history.page.lxml.iter():\n", 182 | " classes = pha.htmltools.normalize_classes(el, shuffle=True)\n", 183 | " if classes:\n", 184 | " all_classes.extend(classes)\n", 185 | " else:\n", 186 | " all_classes.append(\"no-class\")\n", 187 | "print(\"#:\", len(all_classes), \"Mb:\", len(\" \".join(all_classes)) // 1000000)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 54, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "['dtlwc-report-t', 'bqe-id-t', 'nj', 'drjof-id-t', '--c-waypoint-waypoint-xsk', 'wi', 'amphtml-i-interfac-video', 'navig-target', 'i', 'aafa-sx']\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "from pha.glovehelper import vectorize\n", 205 | "class_vectors = vectorize(\n", 206 | " all_classes, 50)\n", 207 | "print(list(class_vectors.keys())[:10])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 55, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "#: 15403930 Mb: 60\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "tag_shuffled_histories = list(histories)\n", 225 | "random.shuffle(tag_shuffled_histories)\n", 226 | "all_tags = []\n", 227 | "for history in shuffled_histories:\n", 228 | " for el in history.page.lxml.iter():\n", 229 | " all_tags.append(el.tag)\n", 230 | "print(\"#:\", len(all_tags), \"Mb:\", len(\" \".join(all_tags)) // 1000000)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 56, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Number of tags: 588\n", 243 | "Removed 223 tags, with: 365 left\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "from collections import Counter\n", 249 | "tag_counter = Counter(all_tags)\n", 250 | "print(\"Number of tags:\", len(tag_counter))\n", 251 | "removed = 0\n", 252 | "for tag, count in tag_counter.most_common():\n", 253 | " if count <= 5:\n", 254 | " removed += 1\n", 255 | " all_tags.remove(tag)\n", 256 | " del tag_counter[tag]\n", 257 | "print(\"Removed\", removed, \"tags, with:\", len(tag_counter), \"left\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 57, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "tag_vectors = vectorize(\n", 267 | " all_tags, 20)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 58, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "import json\n", 277 | 
"json.dump({\"classes\": class_vectors, \"tags\": tag_vectors}, open(\"html-vectors.json\", \"w\"))" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "Python 3", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.5.1" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /python/pha/__main__.py: -------------------------------------------------------------------------------- 1 | from . import Archive 2 | 3 | if __name__ == "__main__": 4 | import sys 5 | archive = Archive.default_location() 6 | print("Archive:", archive) 7 | if sys.argv[1:]: 8 | history = archive.get_history(sys.argv[1]) 9 | page = history.page 10 | print("History:", history, history.visits) 11 | print("Page:", page) 12 | print("HTML:\n", page.html) 13 | -------------------------------------------------------------------------------- /python/pha/glovehelper.py: -------------------------------------------------------------------------------- 1 | """Simple wrapper for GloVe: https://nlp.stanford.edu/projects/glove/ 2 | 3 | Runs the scripts and produces vector output""" 4 | 5 | import tempfile 6 | import os 7 | import subprocess 8 | 9 | default_glove_path = None 10 | 11 | 12 | def set_glove_path(value): 13 | """ 14 | Sets the path where we can find GloVe installed, for all future calls to vectorize. 15 | """ 16 | global default_glove_path 17 | default_glove_path = value 18 | 19 | 20 | def vectorize( 21 | corpus, 22 | vector_size=50, 23 | *, 24 | glove_path=None, 25 | debug_print=False, 26 | vocab_min_count=5, 27 | window_size=15): 28 | """ 29 | Takes a corpus (list of words, or one big string with spaces separating words) and creates a mapping from words to vectors. 30 | 31 | This calls the scripts in GloVe and processes the results, it doesn't implement any vectorization itself. 
32 | """ 33 | glove_path = glove_path or default_glove_path 34 | if not os.path.exists(glove_path): 35 | raise OSError("No such directory: %s" % glove_path) 36 | if os.path.exists(os.path.join(glove_path, "build")): 37 | glove_path = os.path.join(glove_path, "build") 38 | if not isinstance(corpus, (str, bytes)): 39 | corpus = " ".join(corpus) 40 | if isinstance(corpus, str): 41 | corpus = corpus.encode("UTF-8") 42 | with tempfile.TemporaryDirectory() as dirname: 43 | if debug_print: 44 | print("Temporary directory:", dirname) 45 | vocab_file = os.path.join(dirname, "vocab.txt") 46 | with open(vocab_file, "wb") as fp: 47 | proc = _exec([ 48 | os.path.join(glove_path, "vocab_count"), 49 | "-min-count", str(vocab_min_count), 50 | "-verbose", "2"], 51 | input=corpus, 52 | debug_print=debug_print) 53 | fp.write(proc.stdout) 54 | proc = _exec([ 55 | os.path.join(glove_path, "cooccur"), 56 | "-memory", "4.0", 57 | "-vocab-file", vocab_file, 58 | "-window-size", str(window_size)], 59 | input=corpus, 60 | debug_print=debug_print) 61 | cooccur_data = proc.stdout 62 | cooccur_file = os.path.join(dirname, "coocur.txt") 63 | with open(cooccur_file, "wb") as fp: 64 | proc = _exec([ 65 | os.path.join(glove_path, "shuffle"), 66 | "-memory", "4.0"], 67 | input=cooccur_data, 68 | debug_print=debug_print) 69 | fp.write(proc.stdout) 70 | save_file = os.path.join(dirname, "vectors.txt") 71 | proc = _exec([ 72 | os.path.join(glove_path, "glove"), 73 | "-save-file", os.path.splitext(save_file)[0], 74 | "-threads", "8", 75 | "-input-file", cooccur_file, 76 | "-x-max", "10", 77 | "-iter", "15", 78 | "-vector-size", str(vector_size), 79 | "-binary", "2", 80 | "-vocab-file", vocab_file], 81 | debug_print=debug_print) 82 | result = {} 83 | with open(save_file, "r", encoding="UTF-8") as fp: 84 | for line in fp.readlines(): 85 | line = line.strip().split() 86 | name = line[0] 87 | result[name] = [float(n) for n in line[1:]] 88 | return result 89 | 90 | 91 | def _exec(command, input=None, debug_print=False): 92 | if isinstance(input, str): 93 | input = input.encode("UTF-8") 94 | proc = subprocess.run(command, check=True, input=input, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 95 | if debug_print: 96 | print(" ".join(command)) 97 | if input: 98 | print("Input: %s bytes" % len(input)) 99 | print("Output: %s bytes" % len(proc.stdout)) 100 | if proc.stderr: 101 | print(proc.stderr.decode("UTF-8").rstrip()) 102 | print("---------------------------------------------") 103 | return proc 104 | -------------------------------------------------------------------------------- /python/pha/htmltools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some helpers for use with HTML. 
3 | 4 | Mostly normalize_classes() 5 | """ 6 | import re 7 | import random 8 | from nltk.stem import PorterStemmer 9 | import lxml 10 | from urllib.parse import urlparse, parse_qsl 11 | 12 | mixed_regex = re.compile(r'([a-z])([A-Z])') 13 | non_char_regex = re.compile(r'[^a-z\-]', re.I) 14 | stemmer = PorterStemmer() 15 | 16 | 17 | def wordify_class(c): 18 | """Changes a class into a set of words""" 19 | c = mixed_regex.sub(r"\1-\2", c) 20 | c = c.replace("_", "-") 21 | c = non_char_regex.sub("", c) 22 | c = c.strip("-") 23 | return "-".join(c.lower().split("-")) 24 | 25 | 26 | def stem_words(c): 27 | return "-".join([stemmer.stem(w) for w in c.split("-")]) 28 | 29 | 30 | def sort_words(c): 31 | return "-".join(sorted(c.split("-"))) 32 | 33 | 34 | def normalize_classes(c, shuffle=False): 35 | """Takes an HTML class attribute (or element) and returns a normalized form of the classes: 36 | 37 | * Each class name is split into "words", either based on dashes or mixed case 38 | * Numbers are removed 39 | * Each word is stemmed 40 | * The words are sorted 41 | * They are combined back using dashes. 42 | 43 | If `shuffle` is true, then (if there is more than one class), the classes will be randomly shuffled. 44 | """ 45 | if isinstance(c, lxml.etree.ElementBase): 46 | c = c.get("class") 47 | if not c: 48 | return [] 49 | if isinstance(c, str): 50 | c = c.split() 51 | result = list(filter(None, [sort_words(stem_words(wordify_class(a_class))) for a_class in c])) 52 | if shuffle and len(result) > 1: 53 | random.shuffle(result) 54 | return result 55 | 56 | 57 | www_regex = re.compile(r'^www[0-9]*\.') 58 | number_regex = re.compile(r'^[0-9]+$') 59 | hex_only = re.compile(r'^[a-f0-9]+$', re.I) 60 | 61 | 62 | def _url_ignore_word(w): 63 | return w.strip() and number_regex.search(w) or (len(w) > 10 and hex_only.search(w)) 64 | 65 | 66 | def url_words(url): 67 | """ 68 | Tries to reduce a URL to a set of "words" that define the URL. This leaves out numbers, 69 | things that look like hex tokens, and the TLD. 70 | 71 | Typically used for searchable full text indexing of the URL. 
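    For example, with a made-up URL:

        url_words("https://www.example.com/2019/some-article?tag=python")
        # keeps words like "example", "some-article", "tag" and "python",
        # while the "www." prefix, the numeric year, and the ".com" TLD are dropped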
72 | """ 73 | result = [] 74 | parsed = urlparse(url) 75 | hostname = parsed.hostname 76 | hostname = www_regex.sub("", hostname) 77 | hostname_parts = hostname.split(".") 78 | if len(hostname_parts) > 1: 79 | # Strip the TLD 80 | hostname_parts = hostname_parts[:-1] 81 | result.extend(hostname_parts) 82 | path = parsed.path.split("/") 83 | path = [p for p in path if not _url_ignore_word(p)] 84 | result.extend(path) 85 | if not _url_ignore_word(parsed.fragment or ""): 86 | result.append(parsed.fragment) 87 | query = parse_qsl(parsed.query) 88 | for name, value in query: 89 | if not _url_ignore_word(value): 90 | result.extend([name, value]) 91 | return result 92 | 93 | 94 | DEFAULT_DISPLAY = { 95 | "a": "inline", 96 | "applet": "inline", 97 | "article": "block", 98 | "area": "none", 99 | "audio": "none", 100 | "base": "none", 101 | "basefont": "none", 102 | "bgsound": "inline", 103 | "blockquote": "block", 104 | "body": "flex", 105 | "br": "inline", 106 | "button": "inline-block", 107 | "canvas": "inline", 108 | "col": "table-column", 109 | "colgroup": "table-column-group", 110 | "del": "inline", 111 | "details": "block", 112 | "dir": "block", 113 | "div": "block", 114 | "dl": "block", 115 | "embed": "inline", 116 | "fieldset": "block", 117 | "footer": "block", 118 | "font": "inline", 119 | "form": "block", 120 | "frame": "inline", 121 | "frameset": "block", 122 | "h1": "block", 123 | "h2": "block", 124 | "h3": "block", 125 | "h4": "block", 126 | "h5": "block", 127 | "h6": "block", 128 | "head": "none", 129 | "hr": "block", 130 | "iframe": "inline", 131 | "img": "inline", 132 | "input": "inline", 133 | "ins": "inline", 134 | "isindex": "inline", 135 | "label": "inline", 136 | "li": "list-item", 137 | "link": "none", 138 | "nav": "block", 139 | "map": "inline", 140 | "marquee": "inline-block", 141 | "menu": "block", 142 | "meta": "none", 143 | "meter": "inline-block", 144 | "object": "inline", 145 | "ol": "block", 146 | "optgroup": "block", 147 | "option": "block", 148 | "output": "inline", 149 | "p": "block", 150 | "param": "none", 151 | "pre": "block", 152 | "progress": "inline-block", 153 | "q": "inline", 154 | "script": "none", 155 | "select": "inline-block", 156 | "source": "inline", 157 | "span": "inline", 158 | "style": "none", 159 | "table": "table", 160 | "tbody": "table-row-group", 161 | "td": "table-cell", 162 | "textarea": "inline", 163 | "tfoot": "table-footer-group", 164 | "title": "none", 165 | "th": "table-cell", 166 | "thead": "table-header-group", 167 | "time": "inline", 168 | "tr": "table-row", 169 | "track": "inline", 170 | "ul": "block", 171 | "video": "inline" 172 | } 173 | 174 | blockish_display_values = ["block", "table-cell", "table", "flex", "list-item"] 175 | 176 | 177 | def _make_blockish_selector(): 178 | blockish_elements = set() 179 | for tagname, display_value in DEFAULT_DISPLAY.items(): 180 | if display_value in blockish_display_values: 181 | blockish_elements.add(tagname) 182 | blockish_selectors = ', '.join( 183 | '%s:not([data-display])' % tagname for tagname in sorted(blockish_elements)) 184 | extra_selectors = ', '.join( 185 | "*[data-display='%s']" % display for display in sorted(blockish_display_values)) 186 | return "%s, %s" % (blockish_selectors, extra_selectors) 187 | 188 | 189 | blockish_selector = _make_blockish_selector() 190 | 191 | 192 | def iter_block_level_elements(el): 193 | return el.cssselect(blockish_selector) 194 | 195 | 196 | def iter_block_level_text(el): 197 | """ 198 | Goes through the document, returning `[(text, element), ...]` for 
block-level elements. 199 | When block-level elements are nested, the text of the outer element only includes text that 200 | isn't in an inner element. Elements that have no text or only whitespace text are omitted. 201 | """ 202 | for child in el.iter(): 203 | if not is_blockish(child): 204 | continue 205 | text_chunks = get_unblockish_text(child) 206 | text_chunks = [s.strip() for s in text_chunks if s and s.strip()] 207 | if text_chunks: 208 | yield (' '.join(text_chunks), child) 209 | 210 | 211 | def is_blockish(el): 212 | display = el.get("data-display") or DEFAULT_DISPLAY.get(el.tag, "block") 213 | return display in blockish_display_values 214 | 215 | 216 | def get_unblockish_text(el): 217 | chunks = [el.text] 218 | for child in el: 219 | if not is_blockish(child): 220 | chunks.extend(get_unblockish_text(child)) 221 | chunks.append(child.tail) 222 | return chunks 223 | 224 | 225 | def element_to_css(el): 226 | """ 227 | Create a CSS selector that will select the given element 228 | """ 229 | singleton_elements = ["body", "head"] 230 | parts = [] 231 | context = el 232 | while True: 233 | if context.tag in singleton_elements: 234 | parts.insert(0, context.tag) 235 | break 236 | if context.get("id"): 237 | parts.insert(0, "#" + context.get("id")) 238 | break 239 | parent = context.getparent() 240 | position = parent.index(context) 241 | parts.insert(0, "*:nth-child(%s)" % (position + 1)) 242 | context = parent 243 | return " > ".join(parts) 244 | -------------------------------------------------------------------------------- /python/pha/notebooktools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for use in Jupyter Notebooks, especially display_html() 3 | """ 4 | import base64 5 | from IPython.core.display import display, HTML 6 | from cgi import escape as html_escape 7 | import lxml.etree 8 | import time 9 | import os 10 | import shutil 11 | from urllib.request import urlopen 12 | 13 | 14 | def make_data_url(content_type, content): 15 | encoded = base64.b64encode(content.encode('UTF-8')).decode('ASCII') 16 | return 'data:%s;base64,%s' % (content_type, encoded.replace('\n', '')) 17 | 18 | 19 | def display_html(html_page, header='', footer='', height="12em", title=None, link=None, link_title=None): 20 | """ 21 | Display an HTML page inline in a Jupyter notebook. 22 | 23 | The page will go in an iframe. The header and footer are optional extra HTML. The title, link, and link_title are all used as part of a header. 24 | """ 25 | if isinstance(html_page, lxml.etree.ElementBase): 26 | html_page = lxml.html.tostring(html_page) 27 | if isinstance(html_page, bytes): 28 | html_page = html_page.decode("UTF-8") 29 | literal_data = make_data_url("text/html", html_page) 30 | if title: 31 | if link and not link_title: 32 | title = '<strong>%s</strong> <a href="%s" target="_blank">link</a>' % ( 33 | html_escape(title), html_escape(link)) 34 | elif link: 35 | title = '<strong>%s</strong> <a href="%s" target="_blank">%s</a>' % ( 36 | html_escape(title), html_escape(link), html_escape(link_title)) 37 | else: 38 | title = '<strong>%s</strong>' % html_escape(title) 39 | header = title + "\n" + header 40 | if header: 41 | header = '
<div>%s</div>' % header 42 | if footer: 43 | footer = '<div>%s</div>' % footer 44 | html = ''' 45 | <div> 46 | %s 47 | <iframe style="width: 100%%; height: %s; border: none" src="%s"></iframe> 48 | %s 49 | </div>
50 | ''' % (header, html_escape(height), literal_data, footer) 51 | display(HTML(html)) 52 | 53 | 54 | def tag(t, c=None, **attrs): 55 | content = c or None 56 | for key, value in list(attrs.items()): 57 | if value is None: 58 | continue 59 | if key.startswith("style_"): 60 | name = key[len("style_"):] 61 | name = name.replace("_", "-") 62 | existing = attrs.get("style") 63 | if existing: 64 | attrs["style"] = "%s; %s: %s" % (existing, name, value) 65 | else: 66 | attrs["style"] = "%s: %s" % (name, value) 67 | del attrs[key] 68 | attrs = [ 69 | ' %s="%s"' % (html_escape(key), html_escape(str(value))) 70 | for key, value in sorted(attrs.items()) 71 | if value is not None 72 | ] 73 | start = '<%s%s' % ( 74 | t, 75 | "".join(attrs), 76 | ) 77 | if content: 78 | if isinstance(content, (list, tuple)): 79 | content = "".join(content) 80 | return "%s>%s" % (start, content, t) 81 | else: 82 | return "%s />" % (start) 83 | 84 | 85 | class Image: 86 | 87 | def __init__(self, src_or_metadata, max_height='100px'): 88 | if isinstance(src_or_metadata, str): 89 | self.metadata = {"href": src_or_metadata} 90 | else: 91 | self.metadata = src_or_metadata 92 | self.max_height = max_height 93 | 94 | def _repr_html_(self): 95 | src = self.metadata.get("src") or self.metadata.get("href") 96 | return tag( 97 | "img", 98 | src=src, 99 | alt=src, 100 | style_max_height=self.max_height, 101 | style_width="auto", 102 | width=self.metadata.get("width"), 103 | height=self.metadata.get("height"), 104 | ) 105 | 106 | 107 | class Link: 108 | 109 | def __init__(self, url, title=None, domain=False): 110 | if domain is True: 111 | from . import domain 112 | if title: 113 | title = "%s (%s)" % (title, domain(url)) 114 | else: 115 | title = domain(url) 116 | if not title: 117 | title = url 118 | self.url = url 119 | self.title = title 120 | 121 | def _repr_html_(self): 122 | return tag("a", href=self.url, target="_blank", c=html_escape(self.title)) 123 | 124 | 125 | class Table: 126 | 127 | def __init__(self, rows, header=None, max_height=None): 128 | self.rows = list(rows) 129 | self.header = header 130 | self.max_height = max_height 131 | if rows and self.header is None: 132 | first_row = rows[0] 133 | if isinstance(first_row, dict): 134 | self.header = sorted(first_row.keys()) 135 | 136 | def _repr_html_(self): 137 | if not self.rows: 138 | return '(No records)' 139 | rows = [] 140 | if self.header: 141 | rows.append(tag("tr", [ 142 | tag("th", c=h) for h in self.header 143 | ])) 144 | for row in self.rows: 145 | if isinstance(row, dict): 146 | row = [row[h] for h in self.header] 147 | values = [ 148 | c._repr_html_() if hasattr(c, "_repr_html_") else html_escape(str(c)) 149 | for c in row 150 | ] 151 | rows.append(tag("tr", [ 152 | tag("td", v) for v in values 153 | ])) 154 | table = tag("table", style_overflow="scroll-y", style_max_height=self.max_height, c=rows) 155 | if self.max_height: 156 | return tag( 157 | "div", 158 | table, 159 | style_overflow="scroll", 160 | style_max_height=self.max_height, 161 | style_border="box-shadow: 5px 10px 18px #888888", 162 | style_border_radius="3px", 163 | ) 164 | return table 165 | 166 | 167 | chooser_id = int(time.time()) 168 | 169 | 170 | def display_chooser(links, height="12em"): 171 | display(HTML(display_chooser_html(links, height=height))) 172 | 173 | 174 | def display_chooser_html(links, height="12em"): 175 | global chooser_id 176 | if not links: 177 | return '
<div>Nothing to choose from</div>' 178 | chooser_id, my_id = chooser_id + 1, "chooser-%s" % chooser_id 179 | links_html = [] 180 | for link in links: 181 | if isinstance(link, str): 182 | link = {"src": link} 183 | if not link.get("title"): 184 | link["title"] = link["src"] 185 | links_html.append(''' 186 | <button onclick="document.getElementById('%s').src = %s">%s</button> 187 | ''' % ( 188 | my_id, 189 | html_escape(repr(link["src"])), 190 | html_escape(link["title"]), 191 | )) 192 | return '''\ 193 | <div> 194 | %(links)s 195 | <iframe id="%(id)s" style="width: 100%%; height: %(height)s; border: none"></iframe> 196 | </div>
197 | ''' % dict( 198 | id=my_id, 199 | links=' '.join(links_html), 200 | height=height, 201 | ) 202 | 203 | 204 | def lazyget(url, filename): 205 | if os.path.exists(filename): 206 | if os.path.getsize(filename): 207 | print("File", filename, "already exists") 208 | return 209 | else: 210 | print("File", filename, "is empty; overwriting") 211 | dirname = os.path.dirname(filename) 212 | if not os.path.exists(dirname): 213 | print("Creating directory %s/" % dirname) 214 | os.makedirs(dirname) 215 | with urlopen(url) as resp: 216 | try: 217 | length = int(resp.getheader("Content-Length")) // 1000 218 | length = "%skb" % length 219 | except ValueError: 220 | length = "unknown size" 221 | print("Reading %s into %s..." % (length, filename), end="") 222 | with open(filename, "wb") as fp: 223 | shutil.copyfileobj(resp, fp) 224 | print(" done.") 225 | -------------------------------------------------------------------------------- /python/pha/saver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements saving information into the database/files 3 | """ 4 | 5 | import os 6 | import re 7 | import stat 8 | import json 9 | import sys 10 | import struct 11 | import time 12 | import pprint 13 | import traceback 14 | import uuid 15 | from . import Page 16 | 17 | message_handlers = {} 18 | 19 | 20 | def addon(func): 21 | message_handlers[func.__name__] = func 22 | return func 23 | 24 | 25 | @addon 26 | def add_history_list(archive, *, browserId, sessionId, historyItems): 27 | visits_to_ids = {} 28 | for history in historyItems.values(): 29 | for visitId, visit in history["visits"].items(): 30 | visits_to_ids[visitId] = visit["activity_id"] = str(uuid.uuid1()) 31 | for historyId, history in historyItems.items(): 32 | c = archive.conn.cursor() 33 | for visitId, visit in history["visits"].items(): 34 | c.execute(""" 35 | DELETE FROM activity WHERE browserVisitId = ? 36 | """, (visitId,)) 37 | sourceId = None 38 | if visit.get("referringVisitId"): 39 | sourceId = visits_to_ids.get(visit["referringVisitId"]) 40 | if not sourceId: 41 | c.execute(""" 42 | SELECT id FROM activity WHERE browserVisitId = ? 43 | """, (visit["referringVisitId"],)) 44 | row = c.fetchone() 45 | if row: 46 | sourceId = row.id 47 | c.execute(""" 48 | INSERT INTO activity ( 49 | id, 50 | title, 51 | browserId, 52 | sessionId, 53 | url, 54 | browserHistoryId, 55 | browserVisitId, 56 | loadTime, 57 | transitionType, 58 | browserReferringVisitId, 59 | sourceId 60 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 61 | """, ( 62 | visit["activity_id"], 63 | history["title"], 64 | browserId, 65 | sessionId, 66 | history["url"], 67 | historyId, 68 | visitId, 69 | visit["visitTime"], 70 | visit["transition"], 71 | visit["referringVisitId"], 72 | sourceId)) 73 | archive.conn.commit() 74 | c = archive.conn.cursor() 75 | c.execute(""" 76 | UPDATE browser 77 | SET 78 | newestHistory = (SELECT MAX(loadTime) 79 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL), 80 | oldestHistory = (SELECT MIN(loadTime) 81 | FROM activity WHERE browserId = ? 
AND browserHistoryId IS NOT NULL) 82 | """, (browserId, browserId)) 83 | archive.conn.commit() 84 | 85 | 86 | @addon 87 | def add_activity_list(archive, *, browserId, activityItems): 88 | for activity in activityItems: 89 | c = archive.conn.cursor() 90 | columns = """ 91 | id 92 | browserId 93 | sessionId 94 | url 95 | title 96 | ogTitle 97 | loadTime 98 | unloadTime 99 | transitionType 100 | sourceClickText 101 | sourceClickHref 102 | client_redirect 103 | server_redirect 104 | forward_back 105 | from_address_bar 106 | sourceId 107 | initialLoadId 108 | newTab 109 | activeCount 110 | activeTime 111 | closedReason 112 | method 113 | statusCode 114 | contentType 115 | hasSetCookie 116 | hasCookie 117 | copyEvents 118 | formControlInteraction 119 | formTextInteraction 120 | isHashChange 121 | maxScroll 122 | documentHeight 123 | hashPointsToElement 124 | zoomLevel 125 | canonicalUrl 126 | mainFeedUrl 127 | allFeeds 128 | """.strip().split() 129 | for null_default in "sourceId transitionType".split(): 130 | activity.setdefault(null_default, None) 131 | marks = ["?"] * len(columns) 132 | activity["browserId"] = browserId 133 | linkInformation = activity["linkInformation"] 134 | del activity["linkInformation"] 135 | if activity["copyEvents"]: 136 | activity["copyEvents"] = json.dumps(activity["copyEvents"]) 137 | else: 138 | activity["copyEvents"] = None 139 | if activity["allFeeds"]: 140 | activity["allFeeds"] = json.dumps(activity["allFeeds"]) 141 | else: 142 | activity["allFeeds"] = None 143 | log(archive, activity) 144 | values = [activity[column] for column in columns] 145 | unused = set(activity).difference(columns) 146 | if unused: 147 | raise Exception("Unused keys in activity submission: {}".format(unused)) 148 | c.execute(""" 149 | INSERT OR REPLACE INTO activity ( 150 | %s 151 | ) VALUES (%s) 152 | """ % (", ".join(columns), ", ".join(marks)), values) 153 | c.execute(""" 154 | DELETE FROM activity_link WHERE activity_id = ? 155 | """, (activity["id"],)) 156 | for link in linkInformation or []: 157 | c.execute(""" 158 | INSERT INTO activity_link ( 159 | url, 160 | text, 161 | rel, 162 | target, 163 | elementId 164 | ) VALUES (?, ?, ?, ?, ?) 165 | """, (link["url"], link["text"], link.get("rel"), link.get("target"), link.get("elementId"))) 166 | archive.conn.commit() 167 | 168 | 169 | @addon 170 | def register_browser(archive, *, browserId, userAgent, testing=False, autofetch=False, devicePixelRatio=1): 171 | c = archive.conn.cursor() 172 | c.execute(""" 173 | INSERT OR REPLACE INTO browser (id, userAgent, testing, autofetch, devicePixelRatio) 174 | VALUES (?, ?, ?, ?, ?) 175 | """, (browserId, userAgent, testing, autofetch, devicePixelRatio)) 176 | c.execute(""" 177 | UPDATE browser 178 | SET 179 | newestHistory = (SELECT MAX(loadTime) 180 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL), 181 | oldestHistory = (SELECT MIN(loadTime) 182 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL) 183 | """, (browserId, browserId)) 184 | archive.conn.commit() 185 | 186 | 187 | @addon 188 | def register_session(archive, sessionId, browserId, timezoneOffset): 189 | c = archive.conn.cursor() 190 | c.execute(""" 191 | INSERT OR REPLACE INTO browser_session (id, browserId, startTime, timezoneOffset) 192 | VALUES (?, ?, CURRENT_TIMESTAMP, ?) 
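        -- startTime is filled in by SQLite itself via CURRENT_TIMESTAMP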
193 | """, (sessionId, browserId, timezoneOffset)) 194 | archive.conn.commit() 195 | 196 | 197 | @addon 198 | def get_needed_pages(archive, limit=100): 199 | c = archive.conn.cursor() 200 | rows = c.execute(""" 201 | SELECT history.url, fetch_error.errorMessage FROM history 202 | LEFT JOIN page 203 | ON page.url = history.url 204 | LEFT JOIN fetch_error 205 | ON fetch_error.url = history.url 206 | WHERE page.url IS NULL 207 | ORDER BY fetch_error.url IS NULL DESC, lastVisitTime DESC 208 | LIMIT ? 209 | """, (limit,)) 210 | return [{"url": row["url"], "lastError": row["errorMessage"]} for row in rows] 211 | 212 | 213 | @addon 214 | def check_page_needed(archive, url): 215 | c = archive.conn.cursor() 216 | c.execute(""" 217 | SELECT COUNT(*) AS counter FROM page WHERE page.url = ? 218 | """, (url,)) 219 | return not c.fetchone()[0] 220 | 221 | 222 | @addon 223 | def add_fetched_page(archive, id, url, page): 224 | redirectUrl = page["url"].split("#")[0] 225 | origUrl = url.split("#")[0] 226 | page["originalUrl"] = url 227 | if redirectUrl == origUrl: 228 | redirectUrl = None 229 | else: 230 | redirectUrl = page["url"] 231 | if redirectUrl: 232 | # Removes the YouTube start time we add 233 | redirectUrl = redirectUrl.replace("&start=86400", "") 234 | c = archive.conn.cursor() 235 | c.execute(""" 236 | INSERT OR REPLACE INTO page (id, url, activityId, fetched, redirectUrl, timeToFetch) 237 | VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?) 238 | """, (id, url, page.get("activityId"), redirectUrl, page["timeToFetch"])) 239 | c.execute(""" 240 | DELETE FROM fetch_error 241 | WHERE url = ? 242 | """, (url,)) 243 | archive.conn.commit() 244 | write_page(archive, url, page) 245 | 246 | 247 | @addon 248 | def add_fetch_failure(archive, url, errorMessage): 249 | c = archive.conn.cursor() 250 | c.execute(""" 251 | INSERT OR REPLACE INTO fetch_error (url, errorMessage) 252 | VALUES (?, ?) 253 | """, (url, errorMessage)) 254 | archive.conn.commit() 255 | 256 | 257 | @addon 258 | def status(archive, browserId): 259 | c = archive.conn.cursor() 260 | c.execute(""" 261 | SELECT 262 | (SELECT COUNT(*) FROM activity) AS activity_count, 263 | (SELECT newestHistory FROM browser WHERE id = ?) AS latest, 264 | (SELECT oldestHistory FROM browser WHERE id = ?) AS oldest, 265 | (SELECT COUNT(*) FROM page) AS fetched_count 266 | """, (browserId, browserId)) 267 | row = c.fetchone() 268 | return dict(row) 269 | 270 | 271 | @addon 272 | def log(archive, *args, level='log', stack=None): 273 | filename = os.path.join(archive.path, "addon.log") 274 | with open(filename, "a") as fp: 275 | if stack: 276 | log_location = stack.splitlines()[0] 277 | log_location = re.sub(r'moz-extension://[a-f0-9-]+/', '/', log_location) 278 | else: 279 | log_location = "" 280 | print("Log/{: <5} {} {}".format(level, int(time.time() * 1000), log_location), file=fp) 281 | if len(str(args)) < 70 and len(args) > 1: 282 | args = (args,) 283 | for arg in args: 284 | if isinstance(arg, str): 285 | s = arg 286 | else: 287 | s = pprint.pformat(arg, compact=True) 288 | if isinstance(arg, tuple): 289 | s = s[1:-1] 290 | s = s.splitlines() 291 | for line in s: 292 | print(" ", line, file=fp) 293 | if not args: 294 | print(" (no arguments)", file=fp) 295 | print(file=fp) 296 | 297 | 298 | def write_page(archive, url, data): 299 | filename = Page.json_filename(archive, url) 300 | with open(filename, "wb") as fp: 301 | fp.write(json.dumps(data).encode("UTF-8")) 302 | 303 | 304 | def run_saver(storage_directory=None): 305 | from . 
import Archive 306 | if not storage_directory: 307 | archive = Archive.default_location() 308 | else: 309 | archive = Archive(storage_directory) 310 | while True: 311 | m_name = "(unknown)" 312 | try: 313 | message = get_message() 314 | m_name = "%(name)s(%(args)s%(kwargs)s)" % dict( 315 | name=message["name"], 316 | args=", ".join(json.dumps(s) for s in message.get("args", [])), 317 | kwargs=", ".join("%s=%s" % (name, json.dumps(value)) for name, value in message.get("kwargs", {}).items()), 318 | ) 319 | if len(m_name) > 100: 320 | m_name = m_name[:60] + " ... " + m_name[-10:] 321 | print("Message:", m_name, file=sys.stderr) 322 | handler = message_handlers.get(message["name"]) 323 | if not handler: 324 | print("Error: got unexpected message name: %r" % message["name"], file=sys.stderr) 325 | continue 326 | result = handler(archive, *message.get("args", ()), **message.get("kwargs", {})) 327 | send_message({"id": message["id"], "result": result}) 328 | except Exception as e: 329 | tb = traceback.format_exc() 330 | log(archive, "Error processing message %s(): %s" % (m_name, e), tb, level='s_err') 331 | send_message({"id": message["id"], "error": str(e), "traceback": tb}) 332 | 333 | 334 | def get_message(): 335 | length = sys.stdin.buffer.read(4) 336 | if len(length) == 0: 337 | sys.exit(0) 338 | length = struct.unpack('@I', length)[0] 339 | message = sys.stdin.buffer.read(length).decode('utf-8') 340 | message = json.loads(message) 341 | return message 342 | 343 | 344 | def encode_message(message): 345 | content = json.dumps(message).encode('utf-8') 346 | length = struct.pack('@I', len(content)) 347 | return length + content 348 | 349 | 350 | def send_message(message): 351 | sys.stdout.buffer.write(encode_message(message)) 352 | sys.stdout.buffer.flush() 353 | 354 | 355 | def install_json_command(): 356 | import argparse 357 | default_location = os.path.abspath(os.path.join(os.path.abspath(__file__), "../../../data")) 358 | script_location = os.path.join(default_location, ".pha-starter.py") 359 | parser = argparse.ArgumentParser() 360 | parser.add_argument("storage_directory", help="Location for storing the database and files", default=default_location) 361 | parser.add_argument("--script-location", "-s", help="Location to keep the connection script", default=script_location) 362 | parser.add_argument("--native-name", help="Name this will be registered for", default="pha.saver") 363 | args = parser.parse_args() 364 | print("Using the storage directory", args.storage_directory) 365 | print("Writing a connector script to", args.script_location) 366 | install_json_file(args.storage_directory, args.script_location, args.native_name) 367 | 368 | 369 | def install_json_file(storage_directory, script_location, native_name): 370 | # FIXME: support Windows 371 | manifest_path = os.path.abspath(os.path.join(__file__, "../../../extension/manifest.json")) 372 | script_location = os.path.abspath(script_location) 373 | with open(manifest_path) as fp: 374 | manifest = json.load(fp) 375 | manifest_id = manifest["applications"]["gecko"]["id"] 376 | with open(script_location, "w") as fp: 377 | # This script should support a Windows .BAT file 378 | fp.write("""\ 379 | #!%s 380 | storage_directory = %r 381 | from pha.saver import run_saver 382 | run_saver(storage_directory) 383 | """ % (sys.executable, os.path.abspath(storage_directory))) 384 | st = os.stat(script_location) 385 | os.chmod(script_location, st.st_mode | stat.S_IEXEC) 386 | native_manifest = { 387 | "name": native_name, 388 | "description": 
"Saves information from the personal-history-archive extension", 389 | "path": script_location, 390 | "type": "stdio", 391 | "allowed_extensions": [manifest_id] 392 | } 393 | if sys.platform == "darwin": 394 | filename = os.path.expanduser("~/Library/Application Support/Mozilla/NativeMessagingHosts/%s.json" % native_name) 395 | elif sys.platform.startswith("linux"): 396 | filename = os.path.expanduser("~/.mozilla/native-messaging-hosts/%s.json" % native_name) 397 | else: 398 | raise Exception("Not a supported platform") 399 | dir = os.path.dirname(filename) 400 | if not os.path.exists(dir): 401 | os.makedirs(dir) 402 | with open(filename, "wb") as fp: 403 | fp.write(json.dumps(native_manifest, indent=2).encode("UTF-8")) 404 | 405 | 406 | if __name__ == "__main__": 407 | install_json_command() 408 | -------------------------------------------------------------------------------- /python/pha/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS browser ( 2 | id TEXT PRIMARY KEY, 3 | created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 4 | oldestHistory INT, 5 | newestHistory INT, 6 | userAgent TEXT, 7 | testing BOOLEAN, 8 | autofetch BOOLEAN, 9 | devicePixelRatio FLOAT 10 | ); 11 | 12 | CREATE TABLE IF NOT EXISTS browser_session ( 13 | id TEXT PRIMARY KEY, 14 | browserId TEXT REFERENCES browser (id) ON DELETE CASCADE, 15 | startTime INT, 16 | endTime INT, 17 | timezoneOffset INT 18 | ); 19 | 20 | CREATE TABLE IF NOT EXISTS page ( 21 | id TEXT PRIMARY KEY, 22 | url TEXT NOT NULL, 23 | fetched TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 24 | activityId TEXT REFERENCES activity (id) ON DELETE SET NULL, 25 | timeToFetch INT, 26 | redirectUrl TEXT, 27 | redirectOk BOOLEAN DEFAULT FALSE 28 | ); 29 | 30 | CREATE TABLE IF NOT EXISTS fetch_error ( 31 | url TEXT PRIMARY KEY, 32 | attempted TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 33 | errorMessage TEXT 34 | ); 35 | 36 | CREATE TABLE IF NOT EXISTS activity ( 37 | id TEXT PRIMARY KEY, 38 | browserId TEXT REFERENCES browser (id) ON DELETE CASCADE, 39 | sessionId TEXT REFERENCES browser_session (id) ON DELETE CASCADE, 40 | url TEXT NOT NULL, 41 | title TEXT, 42 | ogTitle TEXT, 43 | browserHistoryId TEXT, 44 | browserVisitId TEXT, 45 | loadTime INT, 46 | unloadTime INT, 47 | transitionType TEXT, 48 | sourceClickText TEXT, 49 | sourceClickHref TEXT, 50 | client_redirect BOOLEAN DEFAULT FALSE, 51 | server_redirect BOOLEAN DEFAULT FALSE, 52 | forward_back BOOLEAN DEFAULT FALSE, 53 | from_address_bar BOOLEAN DEFAULT FALSE, 54 | sourceId TEXT REFERENCES activity (id) ON DELETE SET NULL, 55 | browserReferringVisitId TEXT, 56 | initialLoadId TEXT REFERENCES activity (id) ON DELETE SET NULL, 57 | newTab BOOLEAN, -- was opened in new tab? 
58 | activeCount INT, -- Count of times it was "activated" 59 | activeTime INT, -- Millisecond active time 60 | closedReason TEXT, 61 | method TEXT, -- HTTP request method 62 | statusCode INT, -- HTTP status code 63 | contentType TEXT, -- HTTP Content-Type 64 | hasSetCookie BOOLEAN, -- has Set-Cookie response header 65 | hasCookie BOOLEAN, -- has Cookie request header 66 | copyEvents TEXT, -- Actually JSON 67 | formControlInteraction INT, -- count of form interactions 68 | formTextInteraction INT, -- count of form interactions 69 | isHashChange BOOLEAN, 70 | maxScroll INT, -- pixel Y location 71 | documentHeight INT, -- pixel height 72 | hashPointsToElement BOOLEAN, 73 | zoomLevel FLOAT, -- 1.0 means 100% zoom 74 | canonicalUrl TEXT, -- URL 75 | mainFeedUrl TEXT, -- URL 76 | allFeeds TEXT -- JSON 77 | ); 78 | 79 | CREATE TABLE IF NOT EXISTS activity_link ( 80 | activity_id TEXT REFERENCES activity (id) ON DELETE CASCADE, 81 | url TEXT NOT NULL, 82 | text TEXT NOT NULL, 83 | rel TEXT, 84 | target TEXT, 85 | elementId TEXT 86 | ); 87 | -------------------------------------------------------------------------------- /python/pha/search.py: -------------------------------------------------------------------------------- 1 | """ 2 | See this for SQLite FTS5/full text installation instructions: https://sqlite.org/fts5.html 3 | 4 | Or: brew upgrade sqlite3 --with-fts5 5 | 6 | Use: `python -m pha.search` to create a fresh index. 7 | 8 | Use: `python -m pha.search entities` to create an entity index 9 | """ 10 | import re 11 | from urllib.parse import quote as url_quote 12 | from . import htmltools 13 | from . import domain 14 | from collections.abc import Sequence 15 | import time 16 | import random 17 | 18 | 19 | def create_index(archive, purge=True): 20 | """ 21 | Creates an index of all pages, in a SQLite table. 22 | 23 | If `purge` is true, then throw away any past index. 24 | """ 25 | c = archive.conn.cursor() 26 | c.execute(""" 27 | CREATE VIRTUAL TABLE IF NOT EXISTS search_index 28 | USING FTS5 ( 29 | url UNINDEXED, 30 | url_words, 31 | title, 32 | readable, 33 | readable_byline, 34 | readable_excerpt, 35 | meta_description, 36 | full_text 37 | ) 38 | """) 39 | existing = set() 40 | if purge: 41 | c.execute(""" 42 | DELETE FROM search_index; 43 | """) 44 | else: 45 | rows = c.execute(""" 46 | SELECT url FROM search_index 47 | """) 48 | for (url,) in rows: 49 | existing.add(url) 50 | count = 0 51 | for history in archive.histories_with_page(): 52 | if history.url in existing: 53 | continue 54 | count += 1 55 | page = history.page 56 | url_words = " ".join(htmltools.url_words(page.url)) 57 | title = page.title 58 | readable = page.readable_text 59 | full_text = page.full_text 60 | r = page.data.get("readable") or {} 61 | readable_byline = r.get("byline") 62 | readable_excerpt = r.get("excerpt") 63 | meta_description = "" # FIXME: do this 64 | c.execute(""" 65 | INSERT INTO search_index 66 | (url, url_words, title, readable, readable_byline, readable_excerpt, meta_description, full_text) 67 | VALUES 68 | (?, ?, ?, ?, ?, ?, ?, ?) 69 | """, (page.url, url_words, title, readable, readable_byline, readable_excerpt, meta_description, full_text)) 70 | c.close() 71 | archive.conn.commit() 72 | return count 73 | 74 | 75 | def search(archive, query): 76 | """ 77 | Searches pages from an archive. Returns a list-like object. 78 | """ 79 | c = archive.conn.cursor() 80 | rows = c.execute(""" 81 | SELECT url FROM search_index WHERE search_index MATCH ? 
ORDER BY rank 82 | """, (query,)) 83 | urls = [row[0] for row in rows] 84 | return SearchResult(archive, query, urls) 85 | 86 | 87 | class SearchResult(Sequence): 88 | 89 | def __init__(self, archive, query, urls): 90 | self.archive = archive 91 | self.query = query 92 | self.urls = urls 93 | self.fetched_histories = {} 94 | 95 | def __repr__(self): 96 | return '<SearchResult %r: %s results>' % (self.query, len(self.urls)) 97 | 98 | def __getitem__(self, i): 99 | url = self.urls[i] 100 | history = self.fetched_histories.get(url) 101 | if history is None: 102 | history = self.fetched_histories[url] = self.archive.get_history(url) 103 | return history 104 | 105 | def __len__(self): 106 | return len(self.urls) 107 | 108 | 109 | def create_entity_index(archive, purge=True, verbose=False): 110 | from .summarytools import find_entities 111 | c = archive.conn.cursor() 112 | c.execute(""" 113 | CREATE TABLE IF NOT EXISTS entity_index ( 114 | entity TEXT, 115 | entity_label TEXT, 116 | url TEXT REFERENCES page (url) ON DELETE CASCADE, 117 | selector TEXT 118 | ) 119 | """) 120 | if verbose: 121 | print("Created table") 122 | existing = set() 123 | if purge: 124 | c.execute(""" 125 | DELETE FROM entity_index; 126 | """) 127 | if verbose: 128 | print("Removed any previous entries") 129 | else: 130 | rows = c.execute(""" 131 | SELECT DISTINCT url FROM entity_index; 132 | """) 133 | for (url,) in rows: 134 | existing.add(url) 135 | if verbose: 136 | print("Left", len(existing), "existing entries") 137 | c.close() 138 | archive.conn.commit() 139 | histories = [h for h in archive.histories_with_page() if h.url not in existing] 140 | loop_start = time.time() 141 | for count, history in enumerate(histories): 142 | start = time.time() 143 | c = archive.conn.cursor() 144 | page = history.page 145 | body = page.lxml.find("body") 146 | entities = list(find_entities(body)) 147 | if not entities: 148 | entities = [("no-entity", None, body)] 149 | for entity, entity_label, element in entities: 150 | selector = htmltools.element_to_css(element) 151 | c.execute(""" 152 | INSERT INTO entity_index (entity, entity_label, url, selector) 153 | VALUES (?, ?, ?, ?)
154 | """, (entity, entity_label, page.url, selector)) 155 | if verbose: 156 | print("Indexed %6i/%6i %s" % (count + 1, len(histories), page.url)) 157 | print(" entities: %i in %i elements" % (len(entities), len(set(el for ent, ent_label, el in entities)))) 158 | print(" time %is; total %s; eta %s" % ( 159 | time.time() - start, 160 | format_time(time.time() - loop_start), 161 | format_time((time.time() - loop_start) * len(histories) / (count + 1)), 162 | )) 163 | random.shuffle(entities) 164 | entities_string = ", ".join(["%r:%s" % (ent, ent_label) for ent, ent_label, el in entities]) 165 | print(" entities: %s" % entities_string[:145]) 166 | print() 167 | c.close() 168 | archive.conn.commit() 169 | if verbose: 170 | print("Inserted a total of", count, "pages") 171 | return count + 1 172 | 173 | 174 | def format_time(seconds): 175 | if seconds < 60: 176 | return '%is' % seconds 177 | minutes = seconds / 60 178 | if minutes < 60: 179 | return '%im' % minutes 180 | hours, minutes = minutes // 60, minutes % 60 181 | return '%ih%im' % (hours, minutes) 182 | 183 | 184 | def summarize_entities(archive, most_common=0): 185 | c = archive.conn.cursor() 186 | c.execute(""" 187 | SELECT 188 | (SELECT COUNT(DISTINCT entity) FROM entity_index) AS distinct_entities, 189 | (SELECT COUNT(*) FROM entity_index) AS total_entities, 190 | (SELECT COUNT(DISTINCT url) from entity_index) AS distinct_urls, 191 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'PER') AS total_label_per, 192 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'LOC') AS total_label_loc, 193 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'ORG') AS total_label_org, 194 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'MISC') AS total_label_misc, 195 | (SELECT COUNT(*) FROM entity_index WHERE entity_label IS NULL OR entity_label NOT IN ('PER', 'LOC', 'ORG', 'MISC')) AS total_label_unknown 196 | """) 197 | row = c.fetchone() 198 | result = { 199 | "distinct_entities": row[0], 200 | "total_entities": row[1], 201 | "distinct_urls": row[2], 202 | "total_labels": { 203 | "per": row[3], 204 | "loc": row[4], 205 | "org": row[5], 206 | "misc": row[6], 207 | "unknown": row[7], 208 | } 209 | } 210 | if most_common: 211 | c.execute(""" 212 | SELECT entity, COUNT(url) 213 | FROM entity_index 214 | GROUP BY entity 215 | ORDER BY COUNT(url) DESC 216 | LIMIT ? 217 | """, (most_common,)) 218 | result["most_common_entities"] = m = [] 219 | for row in c: 220 | m.append((row[0], row[1])) 221 | return result 222 | 223 | 224 | def search_entities(archive, entity, entity_label=None, wildcard=False): 225 | c = archive.conn.cursor() 226 | entity_arg = (entity,) 227 | entity_query = 'entity = ?' 228 | if wildcard: 229 | entity_query = 'LOWER(entity) LIKE ?' 230 | entity_arg = ('%' + entity.lower() + '%',) 231 | if entity_label: 232 | entity_query += " AND entity_label = ?" 
233 | entity_arg += (entity_label,) 234 | rows = c.execute(""" 235 | SELECT entity, entity_label, url, selector 236 | FROM entity_index 237 | WHERE %s 238 | """ % entity_query, entity_arg) 239 | rows = [(row[0], row[1], row[2], row[3]) for row in rows] 240 | return EntitySearchResult(archive, entity, rows, wildcard=wildcard) 241 | 242 | 243 | class EntitySearchResult(Sequence): 244 | def __init__(self, archive, entity, rows, wildcard=False): 245 | self.archive = archive 246 | self.entity = entity 247 | self.wildcard = wildcard 248 | self.rows = rows 249 | self.fetched_results = {} 250 | 251 | def __repr__(self): 252 | return '<EntitySearchResult %s%r: %s results>' % ('like ' if self.wildcard else '', self.entity, len(self.rows)) 253 | 254 | def __getitem__(self, i): 255 | if isinstance(i, slice): 256 | return self.__class__(self.archive, self.entity, self.rows[i], wildcard=self.wildcard) 257 | row = self.rows[i] 258 | result = self.fetched_results.get(row) 259 | if result is None: 260 | result = self.fetched_results[row] = EntityResult(self.archive, *row) 261 | return result 262 | 263 | def __len__(self): 264 | return len(self.rows) 265 | 266 | 267 | class EntityResult: 268 | def __init__(self, archive, entity, entity_label, url, selector): 269 | self.archive = archive 270 | self.entity = entity 271 | self.entity_label = entity_label 272 | self.url = url 273 | self.selector = selector 274 | 275 | def __repr__(self): 276 | return '<EntityResult %s %s %r (%s)>' % (self.url, self.selector, self.entity, self.entity_label) 277 | 278 | @property 279 | def page(self): 280 | if not hasattr(self, "_page"): 281 | self._page = self.archive.get_history(self.url).page 282 | return self._page 283 | 284 | @property 285 | def data_url(self): 286 | from .notebooktools import make_data_url 287 | url = make_data_url("text/html", self.page.html) 288 | if re.search(r"^#[^:]+$", self.selector): 289 | url += "#" + self.selector[1:] 290 | else: 291 | url += "#css=" + url_quote(self.selector) 292 | return url 293 | 294 | @property 295 | def domain(self): 296 | return domain(self.url) 297 | 298 | 299 | if __name__ == "__main__": 300 | import sys 301 | arg = sys.argv[1] if sys.argv[1:] else None 302 | import pha 303 | archive = pha.Archive.default_location() 304 | try: 305 | if arg == "entities": 306 | print(create_entity_index(archive, verbose=True, purge=False), "pages entity indexed") 307 | else: 308 | print(create_index(archive), "pages full text indexed") 309 | except KeyboardInterrupt: 310 | print(" aborted") 311 | -------------------------------------------------------------------------------- /python/pha/searchquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for finding search/query-related pages in history 3 | """ 4 | 5 | 6 | def find_queries(archive): 7 | activities = archive.get_activity_by_url(like='%google.com%') 8 | actual = [] 9 | for a in activities: 10 | q = a.query.get('q') 11 | if not q: 12 | continue 13 | q = q[0] 14 | actual.append((q, a)) 15 | archive.set_all_activity_from_sources([a for q, a in actual]) 16 | return actual 17 | -------------------------------------------------------------------------------- /python/pha/summarytools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for summarization, using either textteaser or sumy 3 | """ 4 | import re 5 | 6 | text_teaser_instance = None 7 | 8 | 9 | def textteaser_summary(page, *, try_readable=True): 10 | """Uses TextTeaser 
(https://github.com/IndigoResearch/textteaser/tree/master/textteaser) to summarize 11 | the page into a list of sentences 12 | """ 13 | global text_teaser_instance 14 | if text_teaser_instance is None: 15 | from textteaser import TextTeaser 16 | text_teaser_instance = TextTeaser() 17 | text = (try_readable and page.readable_text) or page.full_text 18 | return text_teaser_instance.summarize(page.title, text) 19 | 20 | 21 | def normalize_sentences(sentences, sep=" "): 22 | sentences = [normalize_sentence(s) for s in sentences] 23 | return sep.join(sentences) 24 | 25 | 26 | def normalize_sentence(sentence): 27 | return re.sub(r'\s+', ' ', str(sentence).replace("\n", " ")) 28 | 29 | 30 | def sumy_summary(page, sentence_count=5, *, language="english"): 31 | from sumy.parsers.html import HtmlParser 32 | from sumy.nlp.tokenizers import Tokenizer 33 | from sumy.summarizers.lsa import LsaSummarizer as Summarizer 34 | from sumy.nlp.stemmers import Stemmer 35 | from sumy.utils import get_stop_words 36 | parser = HtmlParser.from_string(page.html, page.url, Tokenizer(language)) 37 | stemmer = Stemmer(language) 38 | summarizer = Summarizer(stemmer) 39 | summarizer.stop_words = get_stop_words(language) 40 | return summarizer(parser.document, sentence_count) 41 | 42 | 43 | _has_letter_re = re.compile(r"[a-zA-Z]") 44 | 45 | 46 | def is_good_entity(e): 47 | """ 48 | Is this a plausible entity? For some reason scapy select entities like '-' or '\\n ' 49 | """ 50 | return _has_letter_re.search(e) 51 | 52 | 53 | _whitespace_re = re.compile(r"\s\s+", re.S) 54 | 55 | 56 | def find_entities(page_element): 57 | """ 58 | Uses SpaCy to find entities in the page element. Returns `[(entity_text, entity_label, element), ...]` 59 | """ 60 | import xx_ent_wiki_sm 61 | from .htmltools import iter_block_level_text 62 | nlp = xx_ent_wiki_sm.load() 63 | for text, element in iter_block_level_text(page_element): 64 | text = _whitespace_re.sub(" ", text) 65 | doc = nlp(text) 66 | seen = set() 67 | for entity in doc.ents: 68 | if entity.text in seen: 69 | continue 70 | seen.add(entity.text) 71 | if not is_good_entity(entity.text): 72 | continue 73 | yield entity.text, entity.label_, element 74 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are not strict requirements, but useful libraries that can be used with pha, and are used in the notebooks: 2 | jupyter 3 | jupyterlab 4 | 5 | # Some JupyterLab extensions: 6 | jupyterlab_iframe 7 | jupyterlab_templates 8 | 9 | 10 | # Some general machine learning libraries... 
11 | numpy 12 | keras 13 | tensorflow 14 | pandas 15 | 16 | # This is a fork of a simple NLP library to support Python 3 17 | -e git+https://www.github.com/ianb/textteaser.git#egg=TextTeaser 18 | 19 | # Used for entity search and NLP: 20 | spacy 21 | # This is the english entity database for SpaCy: 22 | https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-2.0.0/xx_ent_wiki_sm-2.0.0.tar.gz#egg=xx_ent_wiki_sm 23 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import setup, find_packages 7 | 8 | requirements = [ 9 | "lxml", 10 | "cssselect", 11 | "nltk", 12 | "sumy", 13 | "feedparser", 14 | ] 15 | 16 | setup_requirements = [ 17 | # 'pytest-runner', 18 | # TODO(ianb): put setup requirements (distutils extensions, etc.) here 19 | ] 20 | 21 | test_requirements = [ 22 | # 'pytest', 23 | # TODO: put package test requirements here 24 | ] 25 | 26 | setup( 27 | name='pha', 28 | version='0.1.0', 29 | description="Library to access the Personal History Archive", 30 | # long_description=readme + '\n\n' + history, 31 | author="Ian Bicking", 32 | author_email='ian@ianbicking.org', 33 | url='https://github.com/ianb/personal-history-archive', 34 | packages=find_packages(include=['pha']), 35 | include_package_data=True, 36 | install_requires=requirements, 37 | license="MIT license", 38 | zip_safe=True, 39 | # keywords='', 40 | classifiers=[ 41 | 'Development Status :: 2 - Pre-Alpha', 42 | 'Intended Audience :: Developers', 43 | 'License :: OSI Approved :: MIT License', 44 | 'Natural Language :: English', 45 | 'Programming Language :: Python :: 3', 46 | 'Programming Language :: Python :: 3.3', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Programming Language :: Python :: 3.5', 49 | ], 50 | # test_suite='tests', 51 | # tests_require=test_requirements, 52 | setup_requires=setup_requirements, 53 | ) 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import setup, find_packages 7 | 8 | requirements = [ 9 | "lxml", 10 | "cssselect", 11 | "nltk", 12 | "sumy", 13 | "feedparser", 14 | "click", 15 | "sqlobject", 16 | "colorama", 17 | "yarl", 18 | ] 19 | 20 | setup( 21 | name='browsinglab', 22 | version='0.1.0', 23 | description="Generate and access data about browsing history", 24 | # long_description=readme + '\n\n' + history, 25 | author="Ian Bicking", 26 | author_email='ian@ianbicking.org', 27 | url='https://github.com/ianb/personal-history-archive', 28 | packages=find_packages(include=['browsinglab']), 29 | include_package_data=True, 30 | install_requires=requirements, 31 | license="MIT license", 32 | zip_safe=True, 33 | # keywords='', 34 | classifiers=[ 35 | 'Development Status :: 2 - Pre-Alpha', 36 | 'Intended Audience :: Developers', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Natural Language :: English', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | ], 44 | entry_points=''' 45 | [console_scripts] 46 | blab=browsinglab.cli:cli 47 | 
browser-connector=browsinglab.connector:connect 48 | ''', 49 | # test_suite='tests', 50 | # tests_require=test_requirements, 51 | # setup_requires=setup_requirements, 52 | ) 53 | -------------------------------------------------------------------------------- /test/.eslintrc.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | module.exports = { 4 | "rules": { 5 | "no-console": "off" 6 | } 7 | }; 8 | 9 | -------------------------------------------------------------------------------- /test/commands.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const fs = require("fs"); 3 | const crypto = require("crypto"); 4 | const feedparser = require("node-feedparser"); 5 | const { By, until, Key } = require("selenium-webdriver"); 6 | const { promiseTimeout, eitherPromise } = require("./test-utils"); 7 | 8 | const LOAD_TIMEOUT = 20000; 9 | 10 | exports.fetchPage = async function(driver, url, base) { 11 | let timer = setTimeout(() => { 12 | console.log(" Sending ESCAPE key"); 13 | driver.findElement(By.tagName("body")).sendKeys(Key.ESCAPE); 14 | }, LOAD_TIMEOUT); 15 | await driver.get(url); 16 | clearTimeout(timer); 17 | let result = await eitherPromise( 18 | driver.wait(until.elementLocated(By.css("#pha-completed-freeze"))).then(() => true), 19 | promiseTimeout(30000).then(() => false) 20 | ); 21 | url = await driver.getCurrentUrl(); 22 | if (!result) { 23 | console.log("Freezing page timed out"); 24 | return null; 25 | } 26 | await promiseTimeout(500); 27 | let filename = filenameForUrl(base, url); 28 | let json = await readJson(filename, null); 29 | if (json && json.feeds) { 30 | json.parsedFeeds = []; 31 | for (let feed of json.feeds) { 32 | let parsed = await parseFeed(feed.body); 33 | if (parsed) { 34 | json.parsedFeeds.push(parsed); 35 | } 36 | } 37 | } 38 | return json; 39 | }; 40 | 41 | exports.pageExists = function(url, base) { 42 | let filename = filenameForUrl(base, url); 43 | return new Promise((resolve, reject) => { 44 | fs.access(filename, (error) => { 45 | resolve(!error); 46 | }); 47 | }); 48 | }; 49 | 50 | function filenameForUrl(base, url) { 51 | let name = encodeURIComponent(url); 52 | if (name.length > 200) { 53 | let sha1 = crypto.createHash("sha1"); 54 | let hash = sha1.digest(url).toString("hex"); 55 | name = `${name.substr(0, 100)}-${hash}-trunc`; 56 | } 57 | return path.join(base, "pages", name + "-page.json"); 58 | } 59 | 60 | function readJson(filename, defaultValue) { 61 | return new Promise((resolve, reject) => { 62 | fs.readFile(filename, {encoding: "UTF-8"}, (error, data) => { 63 | if (error && error.code === "ENOENT") { 64 | resolve(defaultValue); 65 | return; 66 | } else if (error) { 67 | reject(error); 68 | return; 69 | } 70 | let json; 71 | try { 72 | json = JSON.parse(data); 73 | } catch (e) { 74 | console.error("Error parsing JSON from", filename, ":", e); 75 | console.error(e.stack); 76 | console.error("text:", JSON.stringify(data)); 77 | reject(e); 78 | return; 79 | } 80 | resolve(json); 81 | }); 82 | }); 83 | } 84 | 85 | function parseFeed(feedBody) { 86 | return new Promise((resolve, reject) => { 87 | feedparser(feedBody, (error, result) => { 88 | if (error) { 89 | console.log("Got a bad field:", error); 90 | console.log(error.stack); 91 | resolve(null); 92 | } else { 93 | resolve(result); 94 | } 95 | }); 96 | }); 97 | } 98 | -------------------------------------------------------------------------------- /test/driver-setup.js: 
-------------------------------------------------------------------------------- 1 | const firefox = require("selenium-webdriver/firefox"); 2 | const webdriver = require("selenium-webdriver"); 3 | 4 | exports.getDriver = function(addonFileLocation) { 5 | const channel = process.env.FIREFOX_CHANNEL || "NIGHTLY"; 6 | if (!(channel in firefox.Channel)) { 7 | throw new Error(`Unknown channel: "${channel}"`); 8 | } 9 | 10 | const options = new firefox.Options() 11 | .setBinary(firefox.Channel[channel]) 12 | // Let our unsigned add-on be installed: 13 | .setPreference("xpinstall.signatures.required", false) 14 | // Try to keep audio from playing (doesn't work): 15 | .setPreference("dom.webaudio.enabled", false) 16 | // Try to keep videos from auto-playing (doesn't work that well): 17 | .setPreference("media.autoplay.enabled", false) 18 | // Don't let pages do something before unloading: 19 | .setPreference("dom.disable_beforeunload", true) 20 | // Automatically deny all these permission prompts: 21 | .setPreference("permissions.default.camera", 2) 22 | .setPreference("permissions.default.desktop-notification", 2) 23 | .setPreference("permissions.default.geo", 2) 24 | .setPreference("permissions.default.microphone", 2) 25 | .setPreference("permissions.default.shortcuts", 2) 26 | // Don't let pages make popups: 27 | .setPreference("capability.policy.default.Window.alert", "noAccess") 28 | .setPreference("capability.policy.default.Window.confirm", "noAccess") 29 | .setPreference("capability.policy.default.Window.prompt", "noAccess") 30 | // Tracking protection blocks some nice thing to block: 31 | .setPreference("privacy.trackingprotection.enabled", true) 32 | .setPreference("privacy.trackingprotection.introCount", 20) 33 | // Time out requests after 20 seconds: 34 | .setPreference("network.http.response.timeout", 20) 35 | .setPreference("network.http.connection-timeout", 20); 36 | 37 | const driver = new webdriver.Builder() 38 | .withCapabilities({"moz:webdriverClick": true}) 39 | .forBrowser("firefox") 40 | .setFirefoxOptions(options) 41 | .build(); 42 | 43 | driver.installAddon(addonFileLocation); 44 | 45 | return driver; 46 | }; 47 | 48 | exports.closeBrowser = async function(driver) { 49 | // This works around some geckodriver bugs in driver.quit() 50 | let handles = await driver.getAllWindowHandles(); 51 | for (let handle of handles) { 52 | await driver.switchTo().window(handle); 53 | await driver.close(); 54 | } 55 | try { 56 | driver.quit(); 57 | } catch (error) { 58 | // Ignore it (probably the browser is closed by now) 59 | } 60 | }; 61 | -------------------------------------------------------------------------------- /test/random-walk.js: -------------------------------------------------------------------------------- 1 | const { getDriver, closeBrowser } = require("./driver-setup"); 2 | const { By, until } = require("selenium-webdriver"); 3 | const { promiseTimeout, eitherPromise } = require("./test-utils"); 4 | const fs = require("fs"); 5 | const path = require("path"); 6 | const RandomGenerator = require("random-seed"); 7 | 8 | let seed = process.env.SEED || Date.now(); 9 | 10 | const randomGenerator = RandomGenerator.create(seed); 11 | const random = randomGenerator.random.bind(randomGenerator); 12 | 13 | const addonFileLocation = path.join(process.cwd(), "test", "build-walk", "extension.zip"); 14 | 15 | function choose(options) { 16 | return options[Math.floor(options.length * random())]; 17 | } 18 | 19 | function weightedChoice(options) { 20 | let sum = 0; 21 | for (let pair of 
options) { 22 | sum += pair[1]; 23 | } 24 | let choice = sum * random(); 25 | let pos = 0; 26 | for (let pair of options) { 27 | pos += pair[1]; 28 | if (pos >= choice) { 29 | return pair[0]; 30 | } 31 | } 32 | throw new Error("Weight choice returned nothing, how?"); 33 | } 34 | 35 | function chooseDestination(destinations, seenUrls) { 36 | let chooseOptions = destinations.filter(u => !seenUrls.has(u)); 37 | if (!chooseOptions.length) { 38 | chooseOptions = destinations; 39 | } 40 | return choose(chooseOptions); 41 | } 42 | 43 | function chooseQuery(queries, url) { 44 | let choices = []; 45 | for (let prefix in queries) { 46 | if (!url.startsWith(prefix)) { 47 | continue; 48 | } 49 | for (let selector in queries[prefix]) { 50 | choices.push([selector, queries[prefix][selector]]); 51 | } 52 | } 53 | if (!choices.length) { 54 | return null; 55 | } 56 | return weightedChoice(choices); 57 | } 58 | 59 | function chooseSearchTerm(terms) { 60 | let wordCount = choose([1, 2, 3]); 61 | let words = []; 62 | while (words.length < wordCount) { 63 | let w = choose(terms); 64 | if (!words.includes(w)) { 65 | words.push(w); 66 | } 67 | } 68 | return words.join(" "); 69 | } 70 | 71 | let driver; 72 | 73 | async function walk(config) { 74 | console.log(""); 75 | console.log(""); 76 | console.log("======================== RANDOM WALK ========================"); 77 | console.log(""); 78 | driver = await getDriver(addonFileLocation); 79 | // Give the add-on a moment to load: 80 | await promiseTimeout(1000); 81 | let seenUrls = new Set(); 82 | let steps = 0; 83 | let lastWasSearch = false; 84 | for (;;) { 85 | await promiseTimeout(500); 86 | steps++; 87 | let url = await driver.getCurrentUrl(); 88 | seenUrls.add(url); 89 | console.log("---Running step", steps, "url:", url); 90 | if (url.startsWith("http")) { 91 | let result = await eitherPromise( 92 | driver.wait(until.elementLocated(By.css("#pha-completed-freeze"))).then(() => true), 93 | promiseTimeout(30000).then(() => false) 94 | ); 95 | if (!result) { 96 | console.log("Freezing page timed out"); 97 | } 98 | } else { 99 | console.log("Unfreezable page"); 100 | } 101 | let queryElement = chooseQuery(config.queries, url); 102 | if (queryElement && !lastWasSearch) { 103 | let term = chooseSearchTerm(config.searchTerms); 104 | console.log("Doing search on", queryElement, "term:", term); 105 | await driver.findElement(By.css(queryElement)).sendKeys(term + "\n"); 106 | await promiseTimeout(100); 107 | lastWasSearch = true; 108 | continue; 109 | } 110 | lastWasSearch = false; 111 | if (url === "about:blank" || random() < config.destinations.frequency) { 112 | let dest = chooseDestination(config.destinations.urls, seenUrls); 113 | // Just in case a redirect happens and this exact URL isn't added: 114 | seenUrls.add(dest); 115 | await driver.get(dest); 116 | await promiseTimeout(100); 117 | continue; 118 | } 119 | let anchors = await driver.findElements(By.css("a")); 120 | let anchor = choose(anchors); 121 | if (!anchor) { 122 | console.log("Warning: no anchor found in page", url); 123 | continue; 124 | } 125 | let anchorUrl = await anchor.getAttribute("href"); 126 | if (!anchorUrl || anchorUrl.startsWith("mailto:")) { 127 | console.log("Chose bad anchor:", anchorUrl); 128 | continue; 129 | } 130 | if (!anchor) { 131 | console.log("Got no anchor"); 132 | continue; 133 | } 134 | try { 135 | await anchor.click(); 136 | } catch (e) { 137 | if (e.name === "ElementNotInteractableError") { 138 | console.log("Could not interact with anchor", anchorUrl); 139 | } else if 
(e.name === "ElementClickInterceptedError") { 140 | console.log("Could not interact with anchor due to cover", anchorUrl); 141 | } else { 142 | console.log("Error interacting with anchor:", anchorUrl, e); 143 | } 144 | continue; 145 | } 146 | await promiseTimeout(100); 147 | } 148 | } 149 | 150 | async function fetchPages(pages) { 151 | const { fetchPage, pageExists } = require("./commands"); 152 | console.log(""); 153 | console.log(""); 154 | console.log("======================== FETCHER ========================"); 155 | console.log(""); 156 | driver = await getDriver(addonFileLocation); 157 | // Give the add-on a moment to load: 158 | await promiseTimeout(1000); 159 | let base = process.env.PHA_DATA || path.join(__dirname, "../walk-data"); 160 | let seenUrls = new Set(); 161 | for (let page of pages) { 162 | console.log("-----------------------", page); 163 | if (await pageExists(page, base)) { 164 | console.log(" ...already exists."); 165 | continue; 166 | } 167 | let result = await fetchPage(driver, page, base); 168 | if (!result) { 169 | console.log(" ...loaded but not fetched."); 170 | continue; 171 | } 172 | console.log(" ...fetched."); 173 | for (let feed of (result.parsedFeeds || [])) { 174 | for (let item of feed.items) { 175 | if (!seenUrls.has(item.link)) { 176 | seenUrls.add(item.link); 177 | console.log(" fetching", item.link); 178 | let feedResult = await fetchPage(driver, item.link, base); 179 | if (feedResult) { 180 | console.log(" ...fetched."); 181 | } else { 182 | console.log(" ...loaded but not fetched."); 183 | } 184 | } else { 185 | console.log(" skipping", item.link); 186 | } 187 | } 188 | } 189 | } 190 | } 191 | 192 | async function main() { 193 | let names = ["default.json"]; 194 | if (process.env.CONFIG) { 195 | names.push(process.env.CONFIG); 196 | } 197 | let config = await loadConfig(names); 198 | console.log("config:", config); 199 | try { 200 | await walk(config); 201 | } catch (e) { 202 | console.log("Error:", e); 203 | console.log(e.stack); 204 | } 205 | console.log("---- closing"); 206 | await closeBrowser(driver); 207 | } 208 | 209 | async function mainFetchPages() { 210 | let names = ["default.json"]; 211 | if (process.env.CONFIG) { 212 | names.push(process.env.CONFIG); 213 | } 214 | let config = await loadConfig(names); 215 | console.log("config:", config); 216 | try { 217 | await fetchPages(config.destinations.urls); 218 | } catch (e) { 219 | console.log("Error:", e); 220 | console.log(e.stack); 221 | } 222 | // await closeBrowser(driver); 223 | } 224 | 225 | async function loadConfig(names) { 226 | let configs = []; 227 | for (let name of names) { 228 | if (!name.endsWith(".json")) { 229 | name += ".json"; 230 | } 231 | if (!fs.existsSync(name)) { 232 | name = path.join(__dirname, "walk-configs", name); 233 | } 234 | let data = fs.readFileSync(name, {encoding: "UTF-8"}); 235 | data = JSON.parse(data); 236 | if (typeof data.searchTerms === "string") { 237 | data.searchTerms = data.searchTerms.trim().split(/[\s\n]+/g); 238 | } 239 | configs.push(data); 240 | } 241 | let result = { 242 | destinations: { 243 | urls: [], 244 | frequency: 0.05 245 | }, 246 | queries: {}, 247 | searchTerms: [] 248 | }; 249 | for (let config of configs) { 250 | let newUrls = config.destinations && config.destinations.urls; 251 | if (!newUrls) { 252 | newUrls = result.destinations.urls; 253 | } else if (newUrls.includes("*")) { 254 | newUrls = result.destinations.urls.concat(newUrls.filter(u => u !== "*")); 255 | } 256 | let newSearchTerms = config.searchTerms; 257 | if 
(!newSearchTerms) { 258 | newSearchTerms = result.searchTerms; 259 | } else if (newSearchTerms.includes("*")) { 260 | newSearchTerms = result.searchTerms.concat(newSearchTerms.filter(u => u !== "*")); 261 | } 262 | Object.assign(result, config); 263 | result.destinations.urls = newUrls; 264 | result.searchTerms = newSearchTerms; 265 | } 266 | return result; 267 | } 268 | 269 | if (require.main === module) { 270 | if (process.argv[2] === "fetch") { 271 | mainFetchPages(); 272 | } else { 273 | main(); 274 | } 275 | } 276 | -------------------------------------------------------------------------------- /test/static/blank.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /test/static/debug.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug page 6 | 7 | 8 |

[debug.html body: markup lost in extraction; the visible text is a note that this page is used in testing to send information to and from the add-on, a "Status" section, and a "Controls" section with a "Time:" field]
-------------------------------------------------------------------------------- /test/static/search-destination.html: --------------------------------------------------------------------------------
[search-destination.html: markup lost in extraction; the visible text is the title and heading "Pretend destination", "Look at this table of contents!" with the entries "first place" and "second place", then "First paragraph" and "Second paragraph"]
-------------------------------------------------------------------------------- /test/static/search-results.html: --------------------------------------------------------------------------------
[search-results.html: markup lost in extraction; the visible text is the title and heading "Search results"]
-------------------------------------------------------------------------------- /test/static/search.html: --------------------------------------------------------------------------------
[search.html: markup lost in extraction; the visible text is the title "Pretend Search", the heading "Search", and a "Search" button]
16 | 17 | 18 | -------------------------------------------------------------------------------- /test/static/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: sans-serif; 3 | } 4 | -------------------------------------------------------------------------------- /test/test-utils.js: -------------------------------------------------------------------------------- 1 | exports.promiseTimeout = function(time) { 2 | return new Promise((resolve) => { 3 | setTimeout(resolve, time); 4 | }); 5 | }; 6 | 7 | exports.eitherPromise = function(...promises) { 8 | return new Promise((resolve, reject) => { 9 | function sendResolve(value) { 10 | if (resolve) { 11 | resolve(value); 12 | resolve = null; 13 | } 14 | } 15 | function sendReject(error) { 16 | if (reject) { 17 | reject(error); 18 | reject = null; 19 | } 20 | } 21 | for (let promise of promises) { 22 | promise.then(sendResolve).catch(sendReject); 23 | } 24 | }); 25 | }; 26 | -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | /* globals describe, it, before, after */ 2 | 3 | /* Environmental variables that help control this test: 4 | 5 | FIREFOX_CHANNEL = empty (default NIGHTLY) 6 | NIGHTLY 7 | AURORA (often Developer Edition) 8 | BETA 9 | RELEASE 10 | 11 | NO_CLOSE = if not empty then when the test is finished, the browser will not be closed 12 | 13 | */ 14 | 15 | const assert = require("assert"); 16 | const webdriver = require("selenium-webdriver"); 17 | const express = require("express"); 18 | const cookieParser = require("cookie-parser"); 19 | const http = require("http"); 20 | const { By, until, Key } = webdriver; 21 | const path = require("path"); 22 | const fs = require("fs"); 23 | const { getDriver, closeBrowser } = require("./driver-setup"); 24 | const { promiseTimeout } = require("./test-utils"); 25 | 26 | const PORT = 11180; 27 | const SERVER = `http://localhost:${PORT}`; 28 | const SERVER_STATIC = `${SERVER}/test-static`; 29 | const COMMAND_MOD = process.platform === "darwin" ? 
Key.COMMAND : Key.CONTROL; 30 | const addonFileLocation = path.join(process.cwd(), "test", "build", "extension.zip"); 31 | 32 | let server; 33 | 34 | function startServer() { 35 | if (server) { 36 | server.close(); 37 | } 38 | const app = express(); 39 | app.use(cookieParser()); 40 | app.get("/cookie", (req, res) => { 41 | if (req.query.remove) { 42 | res.cookie("testCookie", "", {maxAge: 0}); 43 | } else { 44 | res.cookie("testCookie", "test value", {maxAge: 3600000}); 45 | } 46 | res.send("OK"); 47 | }); 48 | app.use("/test-static", express.static(path.join(__dirname, "static"), { 49 | index: ["index.html"], 50 | maxAge: null 51 | })); 52 | server = http.createServer(app); 53 | server.listen(PORT); 54 | } 55 | 56 | function stopServer() { 57 | server.close(); 58 | server = null; 59 | } 60 | 61 | function filenameForUrl(url) { 62 | // FIXME: this won't work for long pages 63 | return path.join(__dirname, "test-data", "pages", encodeURIComponent(url) + "-page.json"); 64 | } 65 | 66 | async function collectInformation(driver) { 67 | await driver.get(`${SERVER}/test-static/debug.html`); 68 | await driver.wait(until.elementLocated(By.css("#status"))); 69 | let result = await driver.findElement(By.css("#status")).getAttribute("value"); 70 | result = JSON.parse(result); 71 | await driver.findElement(By.css("#flush")).click(); 72 | let status = await driver.findElement(By.css("#flush-status")); 73 | await driver.wait(until.elementTextContains(status, "finished")); 74 | return result; 75 | } 76 | 77 | describe("Test history collection", function() { 78 | this.timeout(120000); 79 | let driver; 80 | 81 | before(async function() { 82 | startServer(); 83 | driver = await getDriver(addonFileLocation); 84 | // Give the add-on a moment to load: 85 | await promiseTimeout(1000); 86 | }); 87 | 88 | after(async function() { 89 | stopServer(); 90 | if (!process.env.NO_CLOSE) { 91 | closeBrowser(driver); 92 | return null; 93 | } 94 | console.info("Note: leaving browser open"); 95 | return null; 96 | }); 97 | 98 | it("will browse about", async function() { 99 | this.timeout(15000); 100 | await driver.get(`${SERVER_STATIC}/search.html`); 101 | await driver.findElement(By.name("q")).sendKeys("test query\n"); 102 | await driver.findElement(By.css("button")).click(); 103 | await driver.wait(until.titleIs("Search results")); 104 | await driver.wait(until.elementLocated(By.css("a.result"))); 105 | await driver.findElement(By.css("a.result")).click(); 106 | await driver.wait(async () => { 107 | let url = await driver.getCurrentUrl(); 108 | return !url.includes("search-results.html"); 109 | }); 110 | await driver.wait(until.elementLocated(By.css("#first-link"))); 111 | await driver.findElement(By.css("#first-link")).click(); 112 | await driver.navigate().back(); 113 | await driver.navigate().back(); 114 | await driver.wait(until.elementLocated(By.css("a.result"))); 115 | let selectLinkOpeninNewTab = Key.chord(COMMAND_MOD, Key.RETURN); 116 | await driver.findElement(By.css("a.result")).sendKeys(selectLinkOpeninNewTab); 117 | // We want to be sure the Cmd+click opens a tab before we do the next step: 118 | await promiseTimeout(1000); 119 | 120 | /** ********************* 121 | * fetch the results */ 122 | let result = await collectInformation(driver); 123 | 124 | /** ********************** 125 | * analyze the results */ 126 | let pages = result.currentPages.concat(result.pendingPages); 127 | pages.sort((a, b) => a.loadTime > b.loadTime ? 
1 : -1); 128 | if (pages[0].url === "about:blank") { 129 | // Sometimes about:blank shows up in the history, and sometimes it doesn't (presumably related 130 | // to load time), so we remove it if it is the first 131 | pages.shift(); 132 | } 133 | function idToIndex(id) { 134 | return pages.map(p => p.id).indexOf(id); 135 | } 136 | function property(name) { 137 | return pages.map(p => p[name]); 138 | } 139 | let urls = pages.map(p => p.url); 140 | let expectedUrls = [ 141 | `${SERVER_STATIC}/search.html`, 142 | `${SERVER_STATIC}/search-results.html?q=test+query`, 143 | `${SERVER_STATIC}/search-destination.html`, 144 | `${SERVER_STATIC}/search-destination.html#first`, 145 | `${SERVER_STATIC}/search-destination.html`, 146 | `${SERVER_STATIC}/search-results.html?q=test+query`, 147 | `${SERVER_STATIC}/search-destination.html`, 148 | `${SERVER}/test-static/debug.html`, 149 | ]; 150 | assert.deepEqual(urls, expectedUrls); 151 | // Apparently driver.get() doesn't act like from_address_bar 152 | assert.deepEqual(property("from_address_bar"), [ 153 | false, false, false, false, false, false, false, false 154 | ], "from_address_bar"); 155 | // We went "back" to the 4th item (the google search) 156 | assert.deepEqual(property("forward_back"), [ 157 | false, false, false, false, true, true, false, false 158 | ], "forward_back"); 159 | assert.deepEqual(property("transitionType"), [ 160 | "link", 161 | "form_submit", // search result 162 | "link", // clicked on search result 163 | "link", // clicked on anchor link 164 | "link", // clicked on back...? 165 | "link", // clicked on back again 166 | undefined, // apparently open in new window is misunderstood 167 | "link", // driver.get looks like link? 168 | ], "transitionType"); 169 | assert.deepEqual(pages.map(p => idToIndex(p.sourceId)), [ 170 | -1, // Didn't come from anywhere, about:blank 171 | 0, // search page 172 | 1, // search result 173 | 2, // click on link 174 | 3, // went "back" to this page... FIXME: is this right? 175 | 4, // came from previous search result, 176 | 5, // something else... 
177 | 5, // mysterious extra copy of a page 178 | ]); 179 | assert.deepEqual(property("newTab"), [ 180 | false, false, false, false, false, false, true, false, 181 | ], "newTab"); 182 | assert.deepEqual(property("sourceClickText"), [ 183 | null, 184 | null, 185 | "A pretend destination", 186 | "first place", 187 | null, 188 | null, 189 | null, 190 | "A pretend destination", 191 | ], "sourceClickText"); 192 | assert.deepEqual(pages.map(p => !!p.unloadTime), [ 193 | true, true, true, true, true, true, 194 | false, false, // only the last two pages are still loaded 195 | ], "is unloaded"); 196 | assert.deepEqual(pages.map(p => typeof p.activeTime), [ 197 | "number", "number", "number", "number", "number", "number", "number", "number", 198 | ]); 199 | assert.deepEqual(property("closedReason"), [ 200 | "navigation", 201 | "navigation", 202 | "navigation", 203 | "navigation", 204 | "navigation", 205 | "navigation", 206 | null, 207 | null, // Only the last two pages haven't been redirected away 208 | ], "closedReason"); 209 | assert.deepEqual(property("title"), [ 210 | "Pretend Search", 211 | "Search results", 212 | "Pretend destination", 213 | null, 214 | null, 215 | "Search results", 216 | "Pretend destination", 217 | null, 218 | ], "captured title"); 219 | let searchResultLinks = [{ 220 | text: "A pretend destination", 221 | url: "http://localhost:11180/test-static/search-destination.html", 222 | }]; 223 | let destinationLinks = [ 224 | { 225 | elementId: "first-link", 226 | text: "first place", 227 | url: "#first", 228 | }, 229 | { 230 | elementId: "second-link", 231 | text: "second place", 232 | url: "#second", 233 | } 234 | ]; 235 | assert.deepEqual(property("linkInformation"), [ 236 | [], 237 | searchResultLinks, 238 | destinationLinks, 239 | null, // I'm not sure why these are null, probably because there isn't time to get the information? 240 | null, // that's not a good reason for null values, might be fragile in the future 241 | searchResultLinks, 242 | null, 243 | null, 244 | ]); 245 | return true; 246 | }); 247 | 248 | it("Will detect 404s", async function() { 249 | this.timeout(10000); 250 | let url = `${SERVER_STATIC}/does-not-exist.html`; 251 | await driver.get(url); 252 | await promiseTimeout(5000); 253 | let result = await collectInformation(driver); 254 | let page = result.pendingPages.filter(p => p.url.endsWith("does-not-exist.html"))[0]; 255 | assert.equal(page.statusCode, 404, `Status code not 404: ${page.statusCode}`); 256 | assert(page.contentType.startsWith("text/html"), `contentType: ${page.contentType}`); 257 | let filename = filenameForUrl(url); 258 | let pageData = JSON.parse(fs.readFileSync(filename, {encoding: "UTF-8"})); 259 | assert.equal(pageData.statusCode, 404); 260 | return true; 261 | }); 262 | 263 | it("Will detect cookies", async function() { 264 | this.timeout(10000); 265 | let url = `${SERVER}/cookie`; 266 | await driver.get(url); 267 | await promiseTimeout(500); 268 | await driver.get(url + "?remove=1"); 269 | await promiseTimeout(500); 270 | let result = await collectInformation(driver); 271 | let pages = result.currentPages.concat(result.pendingPages); 272 | pages.sort((a, b) => a.loadTime > b.loadTime ? 
1 : -1); 273 | // Depending on previous tests, there might be other pages before the one we care about 274 | for (let i = 0; i < pages.length; i++) { 275 | if (pages[i].url.endsWith("cookie")) { 276 | // The page we want to start with 277 | pages.splice(0, i); 278 | break; 279 | } 280 | } 281 | assert.deepEqual(pages.map(p => [p.hasCookie, p.hasSetCookie]), [ 282 | [false, true], // has no cookie, but did set one 283 | [true, true], // has no cookie, but did set the deleting cookie 284 | [false, false], // the debug page, sets no cookie, and cookie has been deleted 285 | ]); 286 | }); 287 | 288 | }); 289 | -------------------------------------------------------------------------------- /test/walk-configs/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "destinations": { 3 | "urls": [ 4 | "https://www.google.com", 5 | "https://news.ycombinator.com", 6 | "https://news.google.com", 7 | "https://www.reddit.com" 8 | ], 9 | "frequency": 0.05 10 | }, 11 | "queries": { 12 | "https://www.google.com": { 13 | "input[name='q']": 1.0 14 | }, 15 | "https://www.reddit.com": { 16 | "input[name='q']": 0.3 17 | }, 18 | "https://news.google.com": { 19 | "input[aria-label='Search']": 0.3 20 | } 21 | }, 22 | "searchTerms": "tornado watch associated press united states winter storm dallas cowboys real fake news oklahoma yodeling atlanta braves tennis los angeles cleveland interstate michigan missouri new york mets lake dancing minnesota" 23 | } 24 | -------------------------------------------------------------------------------- /test/walk-configs/news.json: -------------------------------------------------------------------------------- 1 | { 2 | "destinations": { 3 | "urls": [ 4 | "https://www.huffingtonpost.com/", 5 | "https://www.washingtonpost.com/", 6 | "https://www.wired.com/about/rss_feeds/", 7 | "https://www.telegraph.co.uk/", 8 | "https://www.bloomberg.com/", 9 | "https://www.reuters.com/", 10 | "http://www.dailymail.co.uk/", 11 | "https://www.usatoday.com/", 12 | "http://time.com/", 13 | "http://www.latimes.com/", 14 | "http://www.bbc.com/", 15 | "http://www.businessinsider.com/", 16 | "https://www.wired.com/", 17 | "https://www.nationalgeographic.com/", 18 | "https://www.buzzfeed.com/", 19 | "https://www.theatlantic.com/", 20 | "https://www.cbsnews.com/", 21 | "http://www.foxnews.com/", 22 | "https://www.nature.com/", 23 | "https://techcrunch.com/", 24 | "https://mashable.com/", 25 | "http://www.cbc.ca/", 26 | "https://www.ft.com/", 27 | "https://www.usnews.com/", 28 | "http://www.chicagotribune.com/", 29 | "https://www.sfgate.com/", 30 | "https://www.newyorker.com/", 31 | "http://www.abc.net.au/", 32 | "http://cbslocal.com/", 33 | "https://slate.com/", 34 | "https://gizmodo.com/", 35 | "https://www.engadget.com/", 36 | "http://www.nydailynews.com/", 37 | "https://www.vice.com/", 38 | "http://fortune.com/", 39 | "https://www.theverge.com/", 40 | "https://www.sciencedaily.com/", 41 | "https://www.bloomberg.com/businessweek", 42 | "https://www.psychologytoday.com/", 43 | "https://nypost.com/", 44 | "https://www.indiatimes.com/", 45 | "https://www.marketwatch.com/", 46 | "https://www.fastcompany.com/", 47 | "https://www.scientificamerican.com/", 48 | "https://www.thetimes.co.uk/", 49 | "https://www.theglobeandmail.com/", 50 | "https://www.entrepreneur.com/", 51 | "http://www.ox.ac.uk/", 52 | "https://hbr.org/", 53 | "https://www.inc.com/", 54 | "http://www.sciencemag.org/", 55 | "http://www.newsweek.com/", 56 | "https://www.rollingstone.com/", 57 
| "http://nymag.com/", 58 | "https://www.politico.com/", 59 | "https://www.hollywoodreporter.com/", 60 | "https://academic.oup.com/journals/", 61 | "https://www.ap.org/", 62 | "https://www.vox.com/", 63 | "http://www.adweek.com/", 64 | "http://thehill.com/", 65 | "https://venturebeat.com/", 66 | "https://www.today.com/", 67 | "http://www.mtv.com/", 68 | 69 | "https://www.cnn.com/", 70 | "http://www.msnbc.com/", 71 | "https://www.aol.com/news/", 72 | "https://www.nytimes.com/", 73 | "http://www.drudgereport.com/", 74 | "http://abcnews.go.com/", 75 | "https://www.wsj.com/", 76 | "http://www.mcclatchydc.com/", 77 | "http://talkingpointsmemo.com", 78 | 79 | "https://www.npr.org/", 80 | "https://www.nbcnews.com/", 81 | "http://www.breitbart.com/", 82 | "https://www.denverpost.com/", 83 | "http://newyork.cbslocal.com/", 84 | "http://losangeles.cbslocal.com/", 85 | "http://chicago.cbslocal.com/", 86 | "http://dfw.cbslocal.com/", 87 | "https://www.wusa9.com/", 88 | "http://washington.cbslocal.com/", 89 | "http://boston.cbslocal.com/", 90 | "http://philadelphia.cbslocal.com/", 91 | "http://minnesota.cbslocal.com/", 92 | "http://sanfrancisco.cbslocal.com/", 93 | "http://detroit.cbslocal.com/", 94 | "http://tampa.cbslocal.com/", 95 | "https://www.newsmax.com/", 96 | "https://www.washingtontimes.com/", 97 | "https://www.boston.com/", 98 | "https://www.mercurynews.com/", 99 | "http://www.philly.com/", 100 | "https://www.seattletimes.com/", 101 | "http://www.miamiherald.com/", 102 | "http://observer.com/", 103 | "http://www.stltoday.com/", 104 | "http://gothamist.com/", 105 | "http://ktla.com/", 106 | "https://www.seattlepi.com/", 107 | "https://www.newsday.com/", 108 | "https://chicago.suntimes.com/", 109 | "http://www.laweekly.com/", 110 | "http://abc13.com/", 111 | "https://wtop.com/", 112 | "http://www.bostonherald.com/", 113 | "https://www.nbcnewyork.com/", 114 | "http://wgntv.com/", 115 | "http://abc7news.com/", 116 | "http://www.autonews.com/", 117 | "http://kdvr.com/", 118 | "http://www.miaminewtimes.com/", 119 | "https://www.twincities.com/", 120 | "http://www.kxan.com/", 121 | "https://www.nbcchicago.com/", 122 | "https://www.nbcwashington.com/", 123 | "http://fox2now.com/", 124 | "http://longisland.news12.com/", 125 | "https://www.nbclosangeles.com/", 126 | "http://pix11.com/", 127 | "http://www.phillyvoice.com/", 128 | "https://www.villagevoice.com/", 129 | "http://www.westword.com/", 130 | "http://www.houstonpress.com/", 131 | "http://www.kron4.com/", 132 | "https://www.nbcphiladelphia.com/", 133 | "http://www.dailyherald.com/", 134 | "https://www.nbcsandiego.com/", 135 | "https://www.nbcdfw.com/", 136 | "http://www.phoenixnewtimes.com/", 137 | "http://arlington.wickedlocal.com/", 138 | "https://www.amny.com/", 139 | "http://www.chicagobusiness.com/", 140 | "https://www.pe.com/", 141 | "https://www.wxyz.com/", 142 | "https://whdh.com/", 143 | "http://www.wfla.com/", 144 | "http://fox5sandiego.com/", 145 | "https://www.nbcmiami.com/", 146 | "https://wsvn.com/", 147 | "https://www.riverfronttimes.com/", 148 | "https://www.abcactionnews.com/", 149 | "http://www.wivb.com/", 150 | "https://www.chicagoreader.com/", 151 | "https://www.minnpost.com/", 152 | "http://www.news10.com/", 153 | "https://www.metrotimes.com/", 154 | "https://www.texasobserver.org/", 155 | "https://billypenn.com/", 156 | "https://timesofsandiego.com/", 157 | "https://www.nysun.com/", 158 | "http://www.laobserved.com/", 159 | "https://citylimits.org/", 160 | "http://www.miamitodaynews.com/", 161 | "http://kplr11.com/", 162 | 
"https://atlantaintownpaper.com/", 163 | "http://heartlandnewsfeed.com/", 164 | "http://laindependent.com/", 165 | 166 | "http://www.startribune.com/", 167 | "https://weather.com/", 168 | "https://www.forbes.com/", 169 | "https://www.cnbc.com/", 170 | "https://www.theguardian.com/", 171 | "https://www.salon.com/", 172 | "http://dailycaller.com/", 173 | "https://www.theblaze.com/", 174 | 175 | "https://news.google.com", 176 | "https://www.yahoo.com/news/", 177 | "https://medium.com/", 178 | 179 | "https://www.businessinsider.in/rss_feeds.cms", 180 | "http://www.bbc.com/news/10628494", 181 | "http://www.latimes.com/la-rssinfopage-htmlstory.html", 182 | "https://www.huffingtonpost.com/syndication", 183 | "https://talkingpointsmemo.com/feeds", 184 | "https://archive.nytimes.com/www.nytimes.com/services/xml/rss/index.html?8dpc", 185 | "https://blog.feedspot.com/nytimes_rss_feeds/", 186 | "https://www.reuters.com/tools/rss", 187 | "https://www.huffingtonpost.com/syndication", 188 | "https://www.washingtontimes.com/feeds/", 189 | "http://www.dailymail.co.uk/home/article-2684527/RSS-Feeds.html", 190 | "https://www.usatoday.com/rss/", 191 | "http://content.time.com/time/rss/", 192 | "https://www.buzzfeed.com/rss", 193 | "http://dailycaller.com/rss-feeds/", 194 | "https://www.theguardian.com/help/feeds", 195 | "https://www.cnbc.com/rss-feeds/", 196 | "https://developer.yahoo.com/finance/?guccounter=1", 197 | "https://developer.yahoo.com/rss/", 198 | "http://www.startribune.com/rss-index/112994779/", 199 | "http://www.startribune.com/rss-index/112994779/", 200 | "http://www.miamiherald.com/site-services/rss/", 201 | "http://www.laobserved.com/pages/feeds.php", 202 | "https://www.nysun.com/rss.php", 203 | "https://www.metrotimes.com/detroit/Syndication", 204 | "https://www.minnpost.com/rss-feed-list", 205 | "https://www.chicagoreader.com/chicago/Syndication/Page", 206 | "http://www.dailyherald.com/rss/", 207 | "http://www.phillyvoice.com/rss-feeds/", 208 | "http://www.autonews.com/section/syndication", 209 | "http://www.laweekly.com/arts/rss-feeds-page-2143509", 210 | "http://www.thesuntimes.com/section/feed", 211 | "https://www.seattlepi.com/rss/", 212 | "http://www.stltoday.com/rss/", 213 | "http://observer.com/rss-feeds/", 214 | "https://www.seattletimes.com/rss-feeds/#all-content", 215 | "https://www.seattletimes.com/rss-feeds/#local-news", 216 | "https://www.seattletimes.com/rss-feeds/#nation-and-world", 217 | "https://www.seattletimes.com/rss-feeds/#business", 218 | "https://www.seattletimes.com/rss-feeds/#sports", 219 | "https://www.seattletimes.com/rss-feeds/#entertainment", 220 | "https://www.seattletimes.com/rss-feeds/#life", 221 | "https://www.seattletimes.com/rss-feeds/#opinion", 222 | "https://www.seattletimes.com/rss-feeds/#photo-and-video", 223 | "http://www.philly.com/philly/about/rss_index/", 224 | "https://www.boston.com/rss-feeds", 225 | "https://www.washingtontimes.com/feeds/", 226 | "https://www.newsmax.com/rss/", 227 | "https://www.denverpost.com/web-feeds/", 228 | "http://www.mcclatchydc.com/customer-service/rss/", 229 | "http://www.wsj.com/public/page/rss_news_and_feeds_podcast.html", 230 | "http://www.wsj.com/public/page/rss_news_and_feeds_videos.html", 231 | "http://www.wsj.com/public/page/rss_news_and_feeds_blogs.html", 232 | "http://www.wsj.com/public/page/rss_news_and_feeds.html", 233 | "http://abcnews.go.com/Site/page/rss--3520115", 234 | "https://archive.nytimes.com/www.nytimes.com/services/xml/rss/index.html?8dpc", 235 | "http://www.cnn.com/services/rss/", 236 | 
"http://thehill.com/resources/rss-feeds", 237 | "https://www.politico.com/rss", 238 | "https://www.rollingstone.com/services/rss", 239 | "http://www.newsweek.com/rss", 240 | "http://www.sciencemag.org/about/email-alerts-and-rss-feeds", 241 | "https://hbphelp.zendesk.com/hc/en-us/articles/215259487-RSS-Feed", 242 | "http://www.rssmix.com/uk-news-feeds", 243 | "https://www.marketwatch.com/rss/", 244 | "https://timesofindia.indiatimes.com/rss.cms", 245 | "https://nypost.com/rssfeeds/", 246 | "https://www.sciencedaily.com/newsfeeds.htm", 247 | "http://www.nydailynews.com/services/feeds", 248 | "http://minnesota.cbslocal.com/rss-feeds/", 249 | "http://abcnews.go.com/Site/page/rss--3520115", 250 | "https://www.newyorker.com/about/feeds", 251 | "https://www.sfgate.com/rss/", 252 | "http://www.chicagotribune.com/cs-rssfeeds-htmlstory.html", 253 | "https://www.usnews.com/info/features/rss-feeds", 254 | "http://www.cbc.ca/rss/", 255 | "https://www.nature.com/webfeeds/index.html", 256 | "http://www.foxnews.com/about/rss/", 257 | "https://www.cbsnews.com/rss/", 258 | "https://www.theatlantic.com/follow-the-atlantic/" 259 | ], 260 | "frequency": 0.10 261 | }, 262 | "queries": { 263 | "https://news.google.com": { 264 | "input[aria-label='Search']": 0.3 265 | }, 266 | "https://www.yahoo.com/news": { 267 | "input[aria-label='Search']": 0.3 268 | } 269 | }, 270 | "searchTerms": "tornado watch associated press united states winter storm dallas cowboys real fake news oklahoma yodeling atlanta braves tennis los angeles cleveland interstate michigan missouri new york mets lake dancing minnesota" 271 | } 272 | --------------------------------------------------------------------------------
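A note on the walk configs above: loadConfig() in test/random-walk.js always reads walk-configs/default.json first and then merges the config named by the CONFIG environment variable over it, with a literal "*" entry in destinations.urls or searchTerms splicing in the values accumulated so far instead of replacing them. The sketch below illustrates that merge; the my-news.json name and its contents are hypothetical, not part of the repository:

    // Illustrative sketch only: how a hypothetical walk-configs/my-news.json
    // would be layered over default.json by loadConfig() in test/random-walk.js.
    const defaults = require("./walk-configs/default.json");

    const myNews = {
      destinations: {
        // "*" keeps the URLs merged so far (here: default.json's) and appends new ones:
        urls: ["*", "https://example.com/"],
        frequency: 0.2,
      },
      // A string value is split on whitespace into individual search terms:
      searchTerms: "solar eclipse transit strike",
    };

    // The merged result, following loadConfig()'s rules:
    const merged = {
      destinations: {
        urls: defaults.destinations.urls.concat(
          myNews.destinations.urls.filter(u => u !== "*")),
        frequency: myNews.destinations.frequency,
      },
      queries: defaults.queries, // not overridden by my-news.json, so the defaults remain
      searchTerms: myNews.searchTerms.trim().split(/[\s\n]+/g),
    };

The walk and fetcher entry points are driven by environment variables rather than flags: SEED fixes the random seed, CONFIG names the extra walk config (resolved against test/walk-configs/ when it is not an existing path, with ".json" appended if missing), and PHA_DATA points the fetcher at a data directory; `node test/random-walk.js` starts a walk, while `node test/random-walk.js fetch` fetches the configured destination URLs. FIREFOX_CHANNEL and NO_CLOSE play the analogous role for test/test.js.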