├── .babelrc ├── .eslintignore ├── .eslintrc.yml ├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── bin └── launch-fetcher ├── browsinglab ├── __init__.py ├── cli.py ├── connector.py ├── connlist.py ├── db.py ├── subenvvars.py └── urlcol.py ├── dev-requirements.txt ├── docs ├── LICENSE ├── activity-schema.md ├── future-design.md └── screencast-fetcher.gif ├── extension ├── .eslintrc.js ├── activityTracker.js ├── backgroundOnMessage.js ├── browser-polyfill.js ├── browserId.js ├── buildSettings.js.tmpl ├── catcher.js ├── communication.js ├── contentLoader.js ├── contentWatcher.js ├── controller.js ├── controls │ ├── popup.css │ ├── popup.html │ └── popup.jsx ├── elementToSelector.js ├── icon-live.svg ├── icon.svg ├── log.js ├── manifest.json ├── rssFinder.js ├── scraper │ ├── Readability.js │ ├── extractor-worker.js │ ├── make-static-html.js │ └── scrapeTab.js └── util.js ├── install.sh ├── package.json ├── python ├── README.md ├── analyze_classnames.ipynb ├── document_summary.ipynb ├── named_entities.ipynb ├── nn_readable.ipynb ├── pha │ ├── __init__.py │ ├── __main__.py │ ├── glovehelper.py │ ├── htmltools.py │ ├── notebooktools.py │ ├── saver.py │ ├── schema.sql │ ├── search.py │ ├── searchquery.py │ └── summarytools.py ├── requirements.txt ├── search_example.ipynb └── setup.py ├── setup.py └── test ├── .eslintrc.js ├── commands.js ├── driver-setup.js ├── random-walk.js ├── static ├── blank.html ├── debug.html ├── search-destination.html ├── search-results.html ├── search.html └── style.css ├── test-utils.js ├── test.js └── walk-configs ├── default.json └── news.json /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": ["transform-react-jsx"], 3 | } 4 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | /extension/scraper/Readability.js 2 | /test/test-data 3 | /dev-data 4 | /walk-data 5 | /extension/browser-polyfill.js 6 | /StickyProfile 7 | /Profile 8 | build 9 | build-walk 10 | -------------------------------------------------------------------------------- /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | env: 2 | browser: true 3 | es6: true 4 | node: true 5 | 6 | extends: 7 | - eslint:recommended 8 | - plugin:react/recommended 9 | - plugin:mozilla/recommended 10 | 11 | parserOptions: 12 | ecmaVersion: 8 13 | sourceType: module 14 | 15 | plugins: 16 | - mozilla 17 | - promise 18 | - react 19 | 20 | root: true 21 | 22 | rules: 23 | consistent-return: error 24 | eqeqeq: error 25 | no-console: warn 26 | prefer-const: off # TODO: change to "error" 27 | quotes: [error, double] 28 | 29 | promise/always-return: off 30 | promise/avoid-new: off 31 | promise/catch-or-return: error 32 | promise/no-callback-in-promise: off 33 | promise/no-native: off 34 | promise/no-nesting: off 35 | promise/no-promise-in-callback: off 36 | promise/param-names: error 37 | react/prop-types: off 38 | 39 | settings: 40 | react: 41 | version: 16 42 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore=E501 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | /node_modules 3 | /Profile 4 | 
/StickyProfile 5 | /pages 6 | /jobs 7 | *.sqlite 8 | /package-lock.json 9 | .DS_Store 10 | /python/pha.egg-info 11 | .ipynb_checkpoints 12 | /.vscode 13 | tmp 14 | /python/data 15 | build 16 | /test/test-data 17 | /addon.log 18 | /dev-data 19 | /data 20 | /.venv 21 | /test/build-walk 22 | /walk-data 23 | __pycache__ 24 | /blab 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # personal-history-archive 2 | 3 | Creating a dump of your personal browser history for analysis. This is a tool for people who want to research browsing behavior and content, starting with the only dataset you'll really be able to create: data about yourself. 4 | 5 | ## Motivation 6 | 7 | This is for creating a *browsing corpus* for later analysis. It's not a feasible end-user tool, and it collects information that can't normally be shared. But if you are interested in browsing behavior and web content analysis, then this is the package for you! 8 | 9 | The data collected here is specifically what you see and do via the browser. Unlike spidering or fetching documents via the command-line, you get fully rendered and personalized pages. This will help you include information in your corpus that specifically isn't available on the open web. 10 | 11 | ## Features 12 | 13 | Using this tool you can: 14 | 15 | * Extract your history from multiple browsers into a database 16 | * Fetch high quality versions of your history items: 17 | * Get frozen pages from the browser (no worries about JavaScript) 18 | * Fetch pages using your cookies and authentication (get personal and personalized versions of pages) 19 | * All HTML is well-formed, links are made absolute 20 | * HTML can be re-rendered easily 21 | * The frozen HTML has additional annotations to make it easier to interpret: 22 | * Hidden elements are marked as such 23 | * Elements whose `display` style is changed are marked as such (useful if you want to look for any block-like element) 24 | * The [Readability](https://github.com/mozilla/readability) library is used to extract a "readable" form 25 | * Elements in the original document that form the readable view are marked as such 26 | * The natural/rendered sizes of images are included 27 | * A first-page screenshot is taken, and a full-length thumbnail 28 | * Track ongoing browsing; collecting additional information not in normal browsing history: 29 | * Reliably track what page leads to the next page 30 | * Track what link click lead to the next page 31 | * Track how often and for how long the page was the active tab 32 | * [And more!](./docs/activity-schema.md) 33 | * A [Python library](./python/#readme) is included to help interpret your results: 34 | * Load and query history items and pages 35 | * Parse pages (using [lxml](http://lxml.de/)) 36 | * A [growing list of miscellany](./python#helpers)... 37 | 38 | ## Examples 39 | 40 | ## Overview 41 | 42 | This consists of two parts: 43 | 44 | * A [browser extension](./extension#readme) (for Firefox and Chrome) to save your history and activity 45 | * A [python library](./python#readme) to use and analyze the history 46 | 47 | ## Installation 48 | 49 | You must check out this repository to use the package. 50 | 51 | Run `npm install` to install the necessary packages, and to setup the Python **3** environment. 
(A virtualenv environment is created in `.venv/`) 52 | 53 | After installation you must restart your Firefox browser (Chrome support is iffy right now), go to `about:debugging` and manually install the extension from `build/extension/` 54 | 55 | Data will begin to be collected in `data/` 56 | 57 | 58 | ## Fetching history 59 | 60 | ![image](./docs/screencast-fetcher.gif) 61 | 62 | Once you have history uploaded, you may want to fetch static versions of your old history (from before you installed the extension). 63 | 64 | **Note:** these instructions are incorrect, and need updating after [#57](https://github.com/ianb/personal-history-archive/issues/57) is fixed. 65 | 66 | Use `./bin/launch-fetcher` to launch a Firefox instance dedicated to that fetching. Probably use `./bin/launch-fetcher --use-profile "Profile Name"` to use a *copy* of an existing profile (after doing that once, the profile copy will be kept for later launches). You'll want to use a profile that is logged into your services, so that you can get personalized versions of your pages. 67 | 68 | The page `http://localhost:11180/` will be loaded automatically in the fetcher browser instance, and that lets you start fetching pages. 69 | 70 | You may want to review `http://localhost:11180/viewer/redirected` to see pages that get redirects. These are often pages that required missing authentication. You can login to the pages, then delete the fetched page so it can be re-fetched. 71 | 72 | ## Python library 73 | 74 | There's a Python **3** library in [the `python/` subdirectory](https://github.com/ianb/personal-history-archive/tree/master/python). It gets automatically installed into the `.venv/` virtualenv, but you could install it elsewhere too. 75 | 76 | You can install it like: 77 | 78 | ```sh 79 | $ cd python 80 | $ pip install -e . 81 | # Optional packages: 82 | $ pip install -r requirements.txt 83 | ``` 84 | 85 | This adds a package called `pha`. There is some information [in the subdirectory](python/), and the notebooks (`*.ipynb`) show many examples (though as of March 2018, they are out of date due to refactorings). 86 | 87 | ## Random walk 88 | 89 | There's a script that will do random activity in the browser, saving data to `test/walk-data/`. Run: 90 | 91 | ```sh 92 | $ npm run walk 93 | # Or if you want to try a configuration in test/walk-configs/news.json that goes to news sites: 94 | $ CONFIG=news npm run walk 95 | ``` 96 | 97 | ## Testing 98 | 99 | The tests are in [`test/`](./test/). To run the tests: 100 | 101 | ```sh 102 | $ npm test 103 | ``` 104 | 105 | You can use `NO_CLOSE=1` to leave the browser open after the test completes (this can be helpful to understand failures). Use `TEST_ARGS="..."` to add [Mocha command-line arguments](https://mochajs.org/#usage) such as `TEST_ARGS='-g 404s' npm test` to run tests with "404s" in the test description. 106 | 107 | The temporary data will be in `test/test-data/` and you may find `test/test-data/addon.log` particularly interesting, as the Browser Console isn't very accessible from the test environment. 108 | 109 | ## Development 110 | 111 | If you want to run it interactively in a fresh profile, use: 112 | 113 | ```sh 114 | $ npm start 115 | ``` 116 | 117 | This will run a new browser profile, with data going into `dev-data/` (and logs in `dev-data/addon.log`). Changes are not automatically picked up, so you have to restart the browser after changes. There is no migration, so you may have to wipe out `dev-data/` after changes to the schema. 
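If you want to poke at the collected data directly from Python, the SQLObject classes in `browsinglab/db.py` can open an archive. This is a minimal sketch, not a supported interface; it assumes the `browsinglab` package and its dependencies are installed and that your archive lives in `dev-data/` (adjust the path to wherever your data actually is):

```python
# Sketch: open an archive created by the extension and see what it holds.
from browsinglab.db import Archive, Activity, Page

archive = Archive("dev-data")   # opens dev-data/history.sqlite (creates tables if missing)
print("archive:", archive.path, "title:", archive.title)

for activity in Activity.select()[:10]:          # first ten recorded visits
    print(activity.loadTime, str(activity.url), activity.title)

print("pages stored:", Page.select().count())
```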
118 | 119 | ## Collaborating 120 | 121 | If you have a question, probably the best thing is to [open a ticket](https://github.com/ianb/personal-history-archive/issues/new). If you are interested in implementing something, it would also be great to open a ticket so we can discuss. 122 | 123 | If you'd like to chat, I've created a channel `#pha` on irc.mozilla.org. I (`ianbicking`) am usually only online during business hours, Central Time/UTC-6. 124 | 125 | ## Credits 126 | 127 | The icon comes from [Open Iconic](https://useiconic.com/open) 128 | -------------------------------------------------------------------------------- /bin/launch-fetcher: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | cd "$(dirname ${BASH_SOURCE[0]})/.." 5 | base="$(pwd)" 6 | PATH="node_modules/.bin:$PATH" 7 | webext="$base/node_modules/.bin/web-ext" 8 | 9 | binary= 10 | firefoxes=" 11 | /Applications/FirefoxNightly.app 12 | /Applications/FirefoxDeveloperEdition.app 13 | /Applications/FirefoxAurora.app 14 | $(which firefox || true) 15 | " 16 | use_profile= 17 | use_scratch= 18 | 19 | for firefox in $firefoxes ; do 20 | if [[ -e "$firefox" ]] ; then 21 | binary="$firefox" 22 | break 23 | fi 24 | done 25 | 26 | help () { 27 | echo "Usage: $(basename $0) [OPTIONS]" 28 | echo " Options:" 29 | echo " -b or --binary BINARY" 30 | echo " Use BINARY as the Firefox to run (default $binary)" 31 | echo " --use-profile PROFILE_NAME" 32 | echo " Use an existing profile based on the named profile; the profile will be copied to StickyProfile" 33 | echo " --use-scratch" 34 | echo " Use a scratch profile. Good for testing." 35 | } 36 | 37 | while [[ -n "$1" ]] ; do 38 | case "$1" in 39 | help|-h|--help) 40 | help 41 | exit 42 | ;; 43 | -b|--binary) 44 | binary="$2" 45 | shift 46 | shift 47 | ;; 48 | --use-profile) 49 | use_profile="$2" 50 | shift 51 | shift 52 | ;; 53 | --use-scratch) 54 | use_scratch=1 55 | shift 56 | ;; 57 | *) 58 | echo "Unknown option: $1" 59 | help 60 | exit 2 61 | ;; 62 | esac 63 | done 64 | 65 | if [[ -n "$use_profile" ]] ; then 66 | if [[ -e ./StickyProfile ]] ; then 67 | echo "An existing profile already exists. It was created from:" 68 | echo " $(cat ./StickyProfile/pha-orig-profile-name.txt)" 69 | echo "Do you want to overwrite it with:" 70 | echo " $use_profile" 71 | echo "(losing any changes you may have made in it)" 72 | echo -n "[y/N] ? " 73 | read answer 74 | if [[ "$answer" != "y" ]] ; then 75 | echo "Aborting." 76 | exit 77 | fi 78 | rm -rf ./StickyProfile/ 79 | fi 80 | echo "Copying profile '$use_profile' into ./StickyProfile" 81 | ./node_modules/.bin/firefox-profile -p "$use_profile" -o ./StickyProfile/ 82 | echo -n "$use_profile" > ./StickyProfile/pha-orig-profile-name.txt 83 | elif [[ -z "$use_scratch" ]] && [[ ! 
-e ./StickyProfile/ ]] ; then 84 | echo "You haven't used --use-profile before (which creates ./StickyProfile/)" 85 | echo "You must profile that option once, or use --use-scratch" 86 | echo 87 | help 88 | exit 2 89 | fi 90 | 91 | prefs="--pref=dom.webaudio.enabled=false --pref=media.autoplay.enabled=false --pref=dom.disable_beforeunload=true" 92 | 93 | for name in permissions.default.camera permissions.default.desktop-notification permissions.default.geo permissions.default.microphone permissions.default.shortcuts ; do 94 | prefs="$prefs --pref=$name=2" 95 | done 96 | 97 | for name in capability.policy.default.Window.alert capability.policy.default.Window.confirm capability.policy.default.Window.prompt ; do 98 | prefs="$prefs --pref=$name=noAccess" 99 | done 100 | 101 | 102 | run_webext() { 103 | echo "Running Firefox." 104 | if [[ -n "$use_scratch" ]] ; then 105 | $webext run $prefs --firefox "$binary" --source-dir ./extension/ --start-url http://localhost:11180/fetcher.html 106 | else 107 | echo " Using profile $(cat ./StickyProfile/pha-orig-profile-name.txt)" 108 | $webext run $prefs --firefox "$binary" --source-dir ./extension/ \ 109 | --keep-profile-changes --firefox-profile ./StickyProfile/ --start-url http://localhost:11180/fetcher.html 110 | fi 111 | } 112 | 113 | run_webext 114 | -------------------------------------------------------------------------------- /browsinglab/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ianb/personal-history-archive/68168eac7876a8827ec566fb4882a7ab5804d87b/browsinglab/__init__.py -------------------------------------------------------------------------------- /browsinglab/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | import os 3 | import json 4 | import sys 5 | 6 | 7 | @click.group() 8 | def cli(): 9 | pass 10 | 11 | 12 | @cli.command() 13 | def install(native_name="browsinglab.connector"): 14 | """Install what is necessary for the browser connection""" 15 | # FIXME: support Windows 16 | manifest_path = os.path.abspath(os.path.join(__file__, "../../extension/manifest.json")) 17 | script_location = os.path.join(sys.prefix, "bin", "browser-connector") 18 | with open(manifest_path) as fp: 19 | manifest = json.load(fp) 20 | manifest_id = manifest["applications"]["gecko"]["id"] 21 | native_manifest = { 22 | "name": native_name, 23 | "description": "Saves information from the Browsing Lab extension", 24 | "path": script_location, 25 | "type": "stdio", 26 | "allowed_extensions": [manifest_id] 27 | } 28 | if sys.platform == "darwin": 29 | filename = os.path.expanduser("~/Library/Application Support/Mozilla/NativeMessagingHosts/%s.json" % native_name) 30 | elif sys.platform.startswith("linux"): 31 | filename = os.path.expanduser("~/.mozilla/native-messaging-hosts/%s.json" % native_name) 32 | else: 33 | raise Exception("Not a supported platform") 34 | dir = os.path.dirname(filename) 35 | if not os.path.exists(dir): 36 | os.makedirs(dir) 37 | with open(filename, "wb") as fp: 38 | fp.write(json.dumps(native_manifest, indent=2).encode("UTF-8")) 39 | click.echo("Connector installed to:") 40 | click.secho(" %s" % filename, bold=True) 41 | click.echo("Script located in:") 42 | click.secho(" %s" % script_location, bold=True) 43 | -------------------------------------------------------------------------------- /browsinglab/connector.py: -------------------------------------------------------------------------------- 1 | """ 
2 | Implements saving information into the database/files 3 | """ 4 | 5 | import os 6 | import re 7 | import json 8 | import sys 9 | import struct 10 | import time 11 | import pprint 12 | import traceback 13 | import uuid 14 | import atexit 15 | from .db import Page, Archive, Activity, ActivityLink, Browser, BrowserSession 16 | from . import connlist 17 | 18 | message_handlers = {} 19 | 20 | active_archive = None 21 | active_browser = None 22 | 23 | @atexit.register 24 | def end(): 25 | if active_browser: 26 | active_browser.connected = False 27 | 28 | 29 | def addon(func): 30 | message_handlers[func.__name__] = func 31 | return func 32 | 33 | 34 | @addon 35 | def add_history_list(archive, *, browserId, sessionId, historyItems): 36 | visits_to_ids = {} 37 | for history in historyItems.values(): 38 | for visitId, visit in history["visits"].items(): 39 | visits_to_ids[visitId] = visit["activity_id"] = str(uuid.uuid1()) 40 | for historyId, history in historyItems.items(): 41 | c = archive.conn.cursor() 42 | for visitId, visit in history["visits"].items(): 43 | c.execute(""" 44 | DELETE FROM activity WHERE browserVisitId = ? 45 | """, (visitId,)) 46 | sourceId = None 47 | if visit.get("referringVisitId"): 48 | sourceId = visits_to_ids.get(visit["referringVisitId"]) 49 | if not sourceId: 50 | c.execute(""" 51 | SELECT id FROM activity WHERE browserVisitId = ? 52 | """, (visit["referringVisitId"],)) 53 | row = c.fetchone() 54 | if row: 55 | sourceId = row.id 56 | c.execute(""" 57 | INSERT INTO activity ( 58 | id, 59 | title, 60 | browserId, 61 | sessionId, 62 | url, 63 | browserHistoryId, 64 | browserVisitId, 65 | loadTime, 66 | transitionType, 67 | browserReferringVisitId, 68 | sourceId 69 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 70 | """, ( 71 | visit["activity_id"], 72 | history["title"], 73 | browserId, 74 | sessionId, 75 | history["url"], 76 | historyId, 77 | visitId, 78 | visit["visitTime"], 79 | visit["transition"], 80 | visit["referringVisitId"], 81 | sourceId)) 82 | archive.conn.commit() 83 | c = archive.conn.cursor() 84 | c.execute(""" 85 | UPDATE browser 86 | SET 87 | newestHistory = (SELECT MAX(loadTime) 88 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL), 89 | oldestHistory = (SELECT MIN(loadTime) 90 | FROM activity WHERE browserId = ? 
AND browserHistoryId IS NOT NULL) 91 | """, (browserId, browserId)) 92 | archive.conn.commit() 93 | 94 | 95 | @addon 96 | def add_activity_list(archive, *, browserId, sessionId, activityItems): 97 | sqlBrowserId = Browser.getID(browserId) 98 | sqlSessionId = BrowserSession.getID(sessionId) 99 | for activity in activityItems: 100 | linkInformation = activity.pop("linkInformation", []) 101 | uuid = activity.pop("id") 102 | activity["browserID"] = sqlBrowserId 103 | activity.pop("sessionId", None) 104 | activity["sessionID"] = sqlSessionId 105 | activity["sourceID"] = Activity.getID(activity.pop("sourceId", None), default=None) 106 | activity["initialLoadID"] = Activity.getID(activity.pop("initialLoadId", None), default=None) 107 | a = Activity.replaceUuid(uuid, **activity) 108 | log(archive, a) 109 | ActivityLink.deleteMany(ActivityLink.activity==a) 110 | for link in linkInformation or []: 111 | link = ActivityLink(**link) 112 | 113 | 114 | @addon 115 | def check_page_needed(archive, url): 116 | return Page.urlExists(url) 117 | 118 | 119 | @addon 120 | def register_browser(archive, *, browserId, userAgent, devicePixelRatio=1): 121 | global active_browser 122 | b = Browser.replaceUuid(browserId, userAgent=userAgent, devicePixelRatio=devicePixelRatio, connected=True) 123 | active_browser = b 124 | 125 | 126 | @addon 127 | def register_session(archive, sessionId, browserId, timezoneOffset): 128 | BrowserSession.replaceUuid( 129 | sessionId, 130 | browserID=Browser.getID(browserId), 131 | timezoneOffset=timezoneOffset, 132 | startTime=int(time.time() * 1000)) 133 | 134 | 135 | @addon 136 | def add_fetched_page(archive, id, url, page): 137 | redirectUrl = page["url"].split("#")[0] 138 | origUrl = url.split("#")[0] 139 | page["originalUrl"] = url 140 | if redirectUrl == origUrl: 141 | redirectUrl = None 142 | else: 143 | redirectUrl = page["url"] 144 | if redirectUrl: 145 | # Removes the YouTube start time we add 146 | redirectUrl = redirectUrl.replace("&start=86400", "") 147 | if page.get("activityId"): 148 | page["activityId"] = Activity.getID(page["activityId"], default=None) 149 | Page.replaceUuid( 150 | id, 151 | url=url, 152 | activityId=page.get("activityId"), 153 | timeToFetch=page["timeToFetch"], 154 | redirectUrl=redirectUrl, 155 | scrapeData=page, 156 | ) 157 | 158 | def substitute_location(path): 159 | path = path.replace("__prefix__", sys.prefix) 160 | path = os.path.expanduser(path) 161 | path = os.path.abspath(path) 162 | return path 163 | 164 | @addon 165 | def set_active_archive(archive, archiveLocation): 166 | global withheld_log_messages 167 | archiveLocation = substitute_location(archiveLocation) 168 | global active_archive 169 | if active_archive: 170 | active_archive.close() 171 | active_archive = Archive(archiveLocation) 172 | if withheld_log_messages: 173 | filename = os.path.join(active_archive.path, "addon.log") 174 | with open(filename, "a") as fp: 175 | fp.write("\n".join(withheld_log_messages)) 176 | withheld_log_messages = [] 177 | return archiveLocation 178 | 179 | set_active_archive.archive_optional = True 180 | 181 | @addon 182 | def unset_active_archive(archive): 183 | global active_archive 184 | active_archive.close() 185 | active_archive = None 186 | 187 | @addon 188 | def get_archive_info(archive): 189 | if not archive: 190 | return None 191 | return {"path": archive.path, "title": archive.title} 192 | 193 | get_archive_info.archive_optional = True 194 | 195 | 196 | @addon 197 | def set_archive_title(archive, title): 198 | archive.title = title 199 | 200 | 201 
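# NOTE: an illustrative example of the message flow the handlers above take
# part in (the id and title here are hypothetical, not from the original code).
# The extension sends a length-prefixed JSON message over native messaging, e.g.:
#     {"id": 7, "name": "set_archive_title", "kwargs": {"title": "My archive"}}
# connect() below looks up message_handlers["set_archive_title"] and calls
#     set_archive_title(active_archive, title="My archive")
# then frames the reply {"id": 7, "result": None} back with send_message().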
| @addon 202 | def list_archives(archive): 203 | return connlist.list_archives() 204 | 205 | list_archives.archive_optional = True 206 | 207 | withheld_log_messages = [] 208 | 209 | @addon 210 | def log(archive, *args, level='log', stack=None): 211 | lines = [] 212 | if stack: 213 | log_location = stack.splitlines()[0] 214 | log_location = re.sub(r'moz-extension://[a-f0-9-]+/', '/', log_location) 215 | else: 216 | log_location = "" 217 | lines.append("Log/{: <5} {} {}".format(level, int(time.time() * 1000), log_location)) 218 | if len(str(args)) < 70 and len(args) > 1: 219 | args = (args,) 220 | for arg in args: 221 | if isinstance(arg, str): 222 | s = arg 223 | else: 224 | s = pprint.pformat(arg, compact=True) 225 | if isinstance(arg, tuple): 226 | s = s[1:-1] 227 | s = s.splitlines() 228 | for line in s: 229 | lines.append(" %s" % line) 230 | if not args: 231 | lines.append(" (no arguments)") 232 | text = "\n".join(lines) + "\n" 233 | if not archive: 234 | filename = os.path.join(sys.prefix, "../addon.log") 235 | withheld_log_messages.append(text) 236 | else: 237 | filename = os.path.join(archive.path, "addon.log") 238 | with open(filename, "a") as fp: 239 | fp.write(text) 240 | 241 | log.archive_optional = True 242 | 243 | class LogPrinter: 244 | 245 | def __init__(self): 246 | self._cache = "" 247 | 248 | def write(self, s): 249 | sys.stderr.write(s) 250 | self._cache += s 251 | if self._cache.endswith("\n") or len(self._cache.splitlines()) > 1: 252 | log(active_archive, "print: %s" % self._cache.rstrip()) 253 | self._cache = "" 254 | 255 | def flush(self): 256 | sys.stderr.flush() 257 | 258 | 259 | def write_page(archive, url, data): 260 | pages = list(Page.selectBy(url=url, orderBy="-fetched", limit=1)) 261 | if not pages: 262 | raise Exception("No page found with url %r" % url) 263 | pages[0].scrapeData = data 264 | 265 | 266 | def connect(): 267 | print("Running browsing-connector from %s" % __file__, file=sys.stderr) 268 | sys.stdout = LogPrinter() 269 | while True: 270 | m_name = "(unknown)" 271 | try: 272 | message = get_message() 273 | m_name = "%(name)s(%(args)s%(kwargs)s)" % dict( 274 | name=message["name"], 275 | args=", ".join(json.dumps(s) for s in message.get("args", [])), 276 | kwargs=", ".join("%s=%s" % (name, json.dumps(value)) for name, value in message.get("kwargs", {}).items()), 277 | ) 278 | if len(m_name) > 100: 279 | m_name = m_name[:60] + " ... 
" + m_name[-10:] 280 | # print("Message:", m_name, file=sys.stderr) 281 | handler = message_handlers.get(message["name"]) 282 | if not handler: 283 | print("Error: got unexpected message name: %r" % message["name"], file=sys.stderr) 284 | continue 285 | if active_archive is None and not getattr(handler, "archive_optional", False): 286 | raise Exception("Attempted to send message before setting archive: %s()" % m_name) 287 | result = handler(active_archive, *message.get("args", ()), **message.get("kwargs", {})) 288 | send_message({"id": message["id"], "result": result}) 289 | except Exception as e: 290 | tb = traceback.format_exc() 291 | log(active_archive, "Error processing message %s(): %s" % (m_name, e), tb, level='s_err') 292 | send_message({"id": message["id"], "error": str(e), "traceback": tb}) 293 | 294 | 295 | def get_message(): 296 | length = sys.stdin.buffer.read(4) 297 | if len(length) == 0: 298 | sys.exit(0) 299 | length = struct.unpack('@I', length)[0] 300 | message = sys.stdin.buffer.read(length).decode('utf-8') 301 | message = json.loads(message) 302 | return message 303 | 304 | 305 | def encode_message(message): 306 | content = json.dumps(message).encode('utf-8') 307 | length = struct.pack('@I', len(content)) 308 | return length + content 309 | 310 | 311 | def send_message(message): 312 | sys.__stdout__.buffer.write(encode_message(message)) 313 | sys.__stdout__.buffer.flush() 314 | -------------------------------------------------------------------------------- /browsinglab/connlist.py: -------------------------------------------------------------------------------- 1 | """ 2 | This handles keeping all the archives on disk registered 3 | """ 4 | import os 5 | 6 | 7 | DIR_LOCATION = os.path.expanduser("~/.browsinglab") 8 | LOCATIONS = os.path.join(DIR_LOCATION, "locations.txt") 9 | 10 | if not os.path.exists(DIR_LOCATION): 11 | os.makedirs(DIR_LOCATION) 12 | 13 | 14 | def get_locations(): 15 | if not os.path.exists(LOCATIONS): 16 | return [] 17 | with open(LOCATIONS) as fp: 18 | lines = fp.readlines() 19 | locations = [l.strip() for l in lines if l.strip() and not l.strip().startswith("#")] 20 | locations = [l for l in locations if os.path.isdir(l)] 21 | return locations 22 | 23 | 24 | def list_archives(): 25 | result = [] 26 | for l in get_locations(): 27 | title = None 28 | title_path = os.path.join(l, "title.txt") 29 | if os.path.exists(title_path): 30 | with open(title_path) as fp: 31 | title = fp.read().strip() or None 32 | result.append({ 33 | "path": l, 34 | "title": title, 35 | }); 36 | return result 37 | 38 | 39 | def add_location(l): 40 | l = os.path.abspath(l) 41 | if l in get_locations(): 42 | return 43 | with open(LOCATIONS, "a") as fp: 44 | fp.write("%s\n" % l) 45 | -------------------------------------------------------------------------------- /browsinglab/db.py: -------------------------------------------------------------------------------- 1 | import os 2 | from sqlobject import ( 3 | sqlhub, SQLObject, connectionForURI, 4 | StringCol, DateTimeCol, FloatCol, IntCol, ForeignKey, BoolCol, JSONCol, 5 | ) 6 | from .urlcol import URLCol 7 | from . 
import connlist 8 | 9 | conn_init = False 10 | 11 | class Mixin: 12 | 13 | @classmethod 14 | def replaceUuid(cls, uuid, **kw): 15 | existing = list(cls.selectBy(uuid=uuid)) 16 | if existing: 17 | instance = existing[0] 18 | instance.set(**kw) 19 | else: 20 | instance = cls(uuid=uuid, **kw) 21 | return instance 22 | 23 | @classmethod 24 | def getID(cls, uuid, default="no default"): 25 | if uuid is None: 26 | if default == "no default": 27 | raise Exception("%s.getID() id of None" % cls.__name__) 28 | return default 29 | results = list(cls.selectBy(uuid=uuid)) 30 | if not results: 31 | if default == "no default": 32 | raise Exception("No %s found by uuid" % cls.__name__) 33 | return default 34 | return results[0].id 35 | 36 | class Archive: 37 | """ 38 | Represents one archive. It exists in some location on disk 39 | """ 40 | def __init__(self, path): 41 | global conn_init 42 | if conn_init: 43 | raise Exception("Two archives can't yet coexist") 44 | if not os.path.exists(path): 45 | os.makedirs(path) 46 | connlist.add_location(path) 47 | self.path = path 48 | self.sqlite_path = os.path.join(path, 'history.sqlite') 49 | exists = os.path.exists(self.sqlite_path) 50 | conn_init = True 51 | import sys 52 | print("Location", 'sqlite:%s/history.sqlite' % self.path, exists) 53 | sqlhub.processConnection = connectionForURI('sqlite:%s/history.sqlite' % self.path) 54 | print("Creating tables") 55 | create_tables() 56 | 57 | def __repr__(self): 58 | return "" % (self.path,) 59 | 60 | @property 61 | def title(self): 62 | title_path = os.path.join(self.path, "title.txt") 63 | if os.path.exists(title_path): 64 | with open(title_path) as fp: 65 | return fp.read().strip() or None 66 | return None 67 | 68 | @title.setter 69 | def title(self, value): 70 | title_path = os.path.join(self.path, "title.txt") 71 | if value: 72 | with open(title_path, "w") as fp: 73 | fp.write(value) 74 | elif os.path.exists(title_path): 75 | os.unlink(title_path) 76 | 77 | def close(self): 78 | global conn_init 79 | conn_init = False 80 | if sqlhub.processConnection: 81 | sqlhub.processConnection.close() 82 | sqlhub.processConnection = None 83 | self.path = None 84 | self.sqlite_path = None 85 | 86 | 87 | class Browser(SQLObject, Mixin): 88 | uuid = StringCol() 89 | created = DateTimeCol(default=DateTimeCol.now) 90 | userAgent = StringCol() 91 | devicePixelRatio = FloatCol() 92 | connected = BoolCol(default=False, notNone=True) 93 | 94 | 95 | class BrowserSession(SQLObject, Mixin): 96 | uuid = StringCol() 97 | browser = ForeignKey('Browser') 98 | startTime = IntCol(default=None) 99 | endTime = IntCol(default=None) 100 | timezoneOffset = IntCol(default=None) 101 | 102 | 103 | class Page(SQLObject, Mixin): 104 | uuid = StringCol() 105 | url = URLCol(notNone=True) 106 | fetched = DateTimeCol(default=DateTimeCol.now) 107 | activity = ForeignKey('Activity') 108 | timeToFetch = IntCol() 109 | redirectUrl = URLCol() 110 | redirectOk = BoolCol(default=False, notNone=True) 111 | scrapeData = JSONCol() 112 | 113 | @classmethod 114 | def urlExists(cls, url): 115 | return bool(list(cls.selectBy(url=url))) 116 | 117 | 118 | class Activity(SQLObject, Mixin): 119 | uuid = StringCol() 120 | browser = ForeignKey('Browser') 121 | session = ForeignKey('BrowserSession') 122 | url = URLCol(notNone=True) 123 | title = StringCol() 124 | ogTitle = StringCol() 125 | loadTime = IntCol() 126 | unloadTime = IntCol() 127 | transitionType = StringCol() 128 | sourceClickText = StringCol() 129 | sourceClickHref = StringCol() # FIXME: URL 130 | clientRedirect = 
BoolCol(default=False, notNone=True) 131 | serverRedirect = BoolCol(default=False, notNone=True) 132 | forwardBack = BoolCol(default=False, notNone=True) 133 | fromAddressBar = BoolCol(default=False, notNone=True) 134 | source = ForeignKey('Activity') 135 | browserReferringVisitId = StringCol(default=None) 136 | initialLoad = ForeignKey('Activity') 137 | newTab = BoolCol() # was opened in new tab? 138 | activeCount = IntCol() # Count of times it was "activated" 139 | activeTime = IntCol() # Millisecond active time 140 | closedReason = StringCol() 141 | method = StringCol() # HTTP request method 142 | statusCode = IntCol() # HTTP status code 143 | contentType = StringCol() # HTTP Content-Type 144 | hasSetCookie = BoolCol() # has Set-Cookie response header 145 | hasCookie = BoolCol() # has Cookie request header 146 | copyEvents = JSONCol() 147 | formControlInteraction = IntCol() # count of form interactions 148 | formTextInteraction = IntCol() # count of form interactions 149 | isHashChange = BoolCol() 150 | maxScroll = IntCol() # pixel Y location 151 | documentHeight = IntCol() # pixel height 152 | hashPointsToElement = BoolCol() 153 | zoomLevel = FloatCol() # 1.0 means 100% zoom 154 | canonicalUrl = URLCol() # URL 155 | mainFeedUrl = URLCol() # URL 156 | allFeeds = JSONCol() 157 | 158 | 159 | class ActivityLink(SQLObject): 160 | activity = ForeignKey('Activity') 161 | url = URLCol(notNone=True) 162 | text = StringCol(notNone=True) 163 | rel = StringCol() 164 | target = StringCol() 165 | elementId = StringCol() 166 | 167 | 168 | def create_tables(): 169 | classes = [Browser, BrowserSession, Activity, Page, ActivityLink] 170 | for cls in classes: 171 | cls.createTable(ifNotExists=True) 172 | -------------------------------------------------------------------------------- /browsinglab/subenvvars.py: -------------------------------------------------------------------------------- 1 | """Simple script for substituting environmental variables in a template-ish file""" 2 | 3 | import re 4 | import sys 5 | import os 6 | import json 7 | 8 | env_re = re.compile(r'process\.env\.([a-zA-Z0-9_]+)') 9 | 10 | 11 | def matcher(m): 12 | value = os.environ.get(m.group(1)) or "" 13 | return json.dumps(value) 14 | 15 | 16 | input = sys.stdin.read() 17 | output = env_re.sub(matcher, input) 18 | 19 | sys.stdout.write(output) 20 | -------------------------------------------------------------------------------- /browsinglab/urlcol.py: -------------------------------------------------------------------------------- 1 | from yarl import URL 2 | from sqlobject.col import StringValidator, SOStringCol, StringCol 3 | 4 | __all__ = ["URLCol"] 5 | 6 | class URLValidator(StringValidator): 7 | 8 | def to_python(self, value, state): 9 | if value is None: 10 | return None 11 | return URL(value) 12 | 13 | def from_python(self, value, state): 14 | if value is None: 15 | return None 16 | return str(value) 17 | 18 | class SOURLCol(SOStringCol): 19 | 20 | def createValidators(self): 21 | return [URLValidator(name=self.name)] + \ 22 | super(SOURLCol, self).createValidators() 23 | 24 | class URLCol(StringCol): 25 | baseClass = SOURLCol 26 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | pylint 2 | flake8 3 | prospector 4 | -------------------------------------------------------------------------------- /docs/activity-schema.md: 
-------------------------------------------------------------------------------- 1 | ## Activity Schema 2 | 3 | This describes the schema of browsing activity and pages. The schema is intended to be encoded in JSON, but could also end up in a database. 4 | 5 | Note: everything marked TODO needs to be added, or maybe adjusted. 6 | 7 | ### Data Types 8 | 9 | **Date / times**: these are represented as milliseconds from the epoch, i.e., the same as what `Date.now()` returns. 10 | 11 | **Unknown values**: as far as possible we use `null` as "unknown" values or sometimes "not applicable". Information that can affirmatively be known not to exist should use a different value. 12 | 13 | **IDs**: we try to use UUIDs as IDs as often as possible. There may be external IDs (such as history item IDs), and in those cases we use those as secondary IDs. 14 | 15 | ### Browser 16 | 17 | Because people use multiple browsers and profiles, we typically map activity to a specific browser: 18 | 19 | `id`: a UUID for the browser 20 | 21 | `userAgent`: the User Agent string for the browser 22 | 23 | `devicePixelRatio`: the base value of `window.devicePixelRatio` (typically 1 for a normal screen, 2 for a High-DPI/Retina display) 24 | 25 | `created`: when we first saw this browser 26 | 27 | `testing`: if true, then this browser profile was created specifically for testing. Hopefully these browsers shouldn't show up in your normal data! 28 | 29 | `autofetch`: if true, then this browser profile was created or cloned specifically to autofetch pages. It probably has valid cookies/etc, but its behavior isn't "real". Typically we keep these browsers from producing activity, but they *do* create pages (on purpose!) (TODO: need to set `$AUTOFETCH` while building for autofetch; also need to fix autofetch) 30 | 31 | #### Session 32 | 33 | Browsers also have sessions: 34 | 35 | `id`: a UUID for this session (changes each time the browser is restarted) 36 | 37 | `startTime`: timestamp when it was started 38 | 39 | `endTime`: timestamp when it was closed (often null, because we can't always catch this; may be derived from last saved visit once a new session starts). (TODO: nothing sets this) 40 | 41 | `timezoneOffset`: the value of `(new Date()).getTimezoneOffset()`, which is minutes-from-UTC. 42 | 43 | #### Derived: 44 | 45 | Coming from history: 46 | 47 | `oldestHistory`: the time of the oldest history item we've seen 48 | 49 | `newestHistory`: the time of the newest history item we've seen 50 | 51 | ### Activity 52 | 53 | There can be two sources of activity: activity created retroactively from browser history, and activity created by the extension. 54 | 55 | Browser history typically uses two concepts: the [HistoryItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/HistoryItem) and the [VisitItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/VisitItem). In our model we use the VisitItem, augment it with some information from HistoryItem, and there is no one-to-one equivalent of HistoryItem. 56 | 57 | `id`: a UUID representing this visit 58 | 59 | `browserId`: the browser this is associated with 60 | 61 | `sessionId`: the browser session (changed each time the browser is restarted) 62 | 63 | `url`: this is the full URL, including the hash. 64 | 65 | `title`: the title of the page, null if unknown, `""` if there is no title. (TODO: make sure it's "") 66 | 67 | `loadTime`: when the page was loaded 68 | 69 | `unloadTime`: when the page was unloaded. 
This will be null when unknown (browser history does not keep good track of this).
70 | 
71 | `browserHistoryId`: the ID of the associated [HistoryItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/HistoryItem). This won't be unique at all, as many visits are associated with the same HistoryItem.
72 | 
73 | `browserVisitId`: the ID of the associated [VisitItem](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/VisitItem). This will probably be unique, if it is set.
74 | 
75 | `sourceId`: the id of the visit that led to this visit. This may come from the VisitItem.referringVisitId (but won't match that ID, as we don't use the browserVisitId as our primary key).
76 | 
77 | `browserReferringVisitId`: from VisitItem.referringVisitId, this should point to another record's `browserVisitId`. Note we try to keep `sourceId` updated, and it's better, but this is kept just in case we need to fix things up later.
78 | 
79 | `sourceClickHref`: the URL the user clicked on that led to this page, as from `a.href`. Null if unknown or no link appeared to be the source.
80 | 
81 | `sourceClickText`: if a click led to this page, the `a.textContent` of that link. Null if unknown or no link appeared to be the source. May be `""`.
82 | 
83 | `transition`: a string from [TransitionType](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/TransitionType): `link`, `typed`, `auto_bookmark`, `auto_subframe` (unlikely, as we don't track frames), `manual_subframe` (also unlikely), `generated`, `auto_toplevel`, `form_submit`, `reload`, `keyword`, `keyword_generated`.
84 | 
85 | `client_redirect`: a boolean (or null if unknown) from [TransitionQualifier](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/webNavigation/transitionQualifier)
86 | 
87 | `server_redirect`: a boolean (or null if unknown) from TransitionQualifier
88 | 
89 | `forward_back`: a boolean (or null if unknown) from TransitionQualifier
90 | 
91 | `from_address_bar`: a boolean (or null if unknown) from TransitionQualifier
92 | 
93 | `initialId`: the id of the activity that initiated this. For instance, if you go to `page_1`, click on a link to get to `page_2`, then click on a table of contents to get to `page_2#section2`, then the last item would have a `sourceId` pointing to `page_1`, but an `initialId` pointing to `page_2`. You have to sort on `loadTime` to see the exact order of hash changes.
94 | 
95 | `newTab`: if this page was opened in a new tab. Typically `sourceId` should be set in this case. It will be null if unknown (for instance VisitItem doesn't record this).
96 | 
97 | `activeCount`: the number of times this page was made active, for more than a second. If you open a tab in the background, then close it without ever looking at it, then this should be 0. If you interact normally and don't change tabs it would be 1. Higher numbers mean it was revisited several times.
98 | 
99 | `activeTime`: time in milliseconds that the page was active. Note that if a window goes into the background we keep counting, so this might not always be correct. Like with `activeCount`, we ignore when a tab is active for less than a second, assuming that it means the tab was passed over on the way to another tab. If the user goes idle (no keypresses or mouse movement) for 30 seconds, then we stop incrementing the time until there is activity again.
100 | 
101 | `unloadReason`: a string indicating why the page was unloaded: `tabClose`, `navigation`. Null if unknown.
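The timing fields above (`loadTime`, `activeCount`, `activeTime`) are usually the starting point for analysis. As a minimal sketch (assuming an archive produced by this tool in `data/` and the `browsinglab` package from this repository; adjust the path as needed), total active time per URL could be tallied like this:

```python
# Sketch: sum the activeTime field (milliseconds) per URL across all activity.
from collections import defaultdict

from browsinglab.db import Archive, Activity

Archive("data")                      # opens data/history.sqlite
totals = defaultdict(int)
for activity in Activity.select():
    if activity.activeTime:
        totals[str(activity.url)] += activity.activeTime

for url, ms in sorted(totals.items(), key=lambda kv: -kv[1])[:10]:
    print("%8.1fs  %s" % (ms / 1000, url))
```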
102 | 
103 | `hashPointsToElement`: if the URL has a hash (e.g., `page.html#section1`), then does some element with `id="section1"` exist?
104 | 
105 | `isHashChange`: if the new activity was an in-page change of the hash/fragment (no actual page loading), then this is true. Null if unknown.
106 | 
107 | `method`: the HTTP method that loaded the page (usually GET, of course). We do not track the POST destination if it results in an immediate redirect. (TODO: confirm POST behavior)
108 | 
109 | `statusCode`: the integer status code of the response. E.g., 200, 404.
110 | 
111 | `contentType`: the Content-Type of the response. Note most URLs are *displayed* as a DOM page of some sort, but the underlying resource might not be text/html. In a case like `text/html; charset="UTF-8"` we remove the charset (and anything after `;`).
112 | 
113 | `hasSetCookie`: the response contained a `Set-Cookie` header.
114 | 
115 | `hasCookie`: the request contained a `Cookie` header.
116 | 
117 | `maxScroll`: the greatest pixel location that this document was scrolled to. Null if unknown, 0 if not scrolled.
118 | 
119 | `documentHeight`: the pixel height of the document. Null if unknown or if never scrolled.
120 | 
121 | `copyEvents`: this is a JSON list that represents all the clipboard copies taken from the page. Each event looks like: `{text, startLocation, endLocation, time}`, where start and end location are CSS selectors (`endLocation` may be omitted if it is the same as `startLocation`).
122 | 
123 | `formControlInteraction`: a count of the number of times a non-text form field was changed. Will be null if we weren't watching.
124 | 
125 | `formTextInteraction`: a count of the number of times a text form field was changed. Will be null if we weren't watching. This is incremented when the `change` event occurs, so typically you have to unfocus the text field for this to get incremented.
126 | 
127 | `zoomLevel`: the zoom level, if we can calculate it. Typically 1, null if we didn't determine it. 1.1 means, for example, a 110% zoom.
128 | 
129 | `canonicalUrl`: if the page has `<link rel="canonical">`, this gives the URL it points to.
130 | 
131 | `mainFeedUrl`: if the page has an RSS (or similar) feed, what we think is the main feed URL.
132 | 
133 | `allFeeds`: all the feeds found in the page. This is a list of `[{href, title, type}]`.
134 | 
135 | `linkInformation`: a list of links found in the document. A list that looks like `[{url, text, rel, target, elementId}]` where `rel`, `target`, and `elementId` are optional (depending on the presence of those attributes), and `url` is the full URL, or if it's a page-internal link then it looks like `"#anchor"`.
136 | 
137 | #### Derived:
138 | 
139 | This information can be calculated from the above information... (All TODO)
140 | 
141 | `domain`: the domain, without port, and without leading `www.` or `wwwN.`.
142 | 
143 | `canonicalUrl`: the URL with UTM and other cruft removed, with query string sorted, and if `containsHash` is true then with the hash removed.
144 | 
145 | `urlPattern`: a rough pattern of the URL, based on `canonicalUrl`. This helps distinguish homepages from article pages on the same site, for instance. (This heuristic will need some ongoing work.)
146 | 
147 | `query`: if this was a search result, what was the query string associated?
148 | 
149 | ### Pages
150 | 
151 | These are full dumps of a page's DOM. They may be associated with a visit, or loaded retroactively to fill in past history.
Typically the system does not pull in repeated dumps of pages when they are re-visited (though we may try to do that in the future based on some heuristics).
152 | 
153 | `id`: a UUID for this *fetch* of a page
154 | 
155 | `url`: the URL fetched
156 | 
157 | `loadTime`: the timestamp when we serialized this page (TODO: rename)
158 | 
159 | `serializeVersion`: a version indicating the serializer. This gets bumped sometimes, so old pages can be re-fetched or updated in place. ([TODO](https://github.com/ianb/personal-history-archive/issues/5))
160 | 
161 | `autofetched`: true if this was created by an autofetch, as opposed to collected while browsing (TODO)
162 | 
163 | `activityId`: if this was fetched during browsing, and associated with a specific activity, then the ID of that activity.
164 | 
165 | `redirectUrl`: if fetching the URL redirected to some other URL, then what URL? This is the URL that is actually displayed in the URL bar when we serialized the page. Will be null if this matches `url`.
166 | 
167 | `redirectOk`: if `redirectUrl` exists, but someone decided the redirect is OK, then this will be true. These can be used to review autofetch redirects, and remove pages that were redirected to login pages.
168 | 
169 | `documentSize.width` and `documentSize.height`: height and width of the entire document (not just the visible portion).
170 | 
171 | `docTitle`: the title as given by `document.title`
172 | 
173 | `passwordFields`: a list of password fields found
174 | 
175 | `passwordFields[i].name`: the name attribute of a password field
176 | 
177 | `passwordFields[i].id`: the id of a password field
178 | 
179 | `passwordFields[i].hasValue`: true if the field has something entered (e.g., by a password manager)
180 | 
181 | `passwordFields[i].isHidden`: if the field appears not to be visible
182 | 
183 | `openGraph`: attributes from Open Graph (i.e., `og:` metadata). From the list: title, type, url, image, audio, description, determiner, locale, site\_name, video, image:secure\_url, image:type, image:width, image:height, video:secure\_url, video:type, video:width, video:height, audio:secure\_url, audio:type, article:published\_time, article:modified\_time, article:expiration\_time, article:author, article:section, article:tag, book:author, book:isbn, book:release\_date, book:tag, profile:first\_name, profile:last\_name, profile:username, profile:gender
184 | 
185 | `twitterCard`: attributes from Twitter Cards. From the list: card, site, title, description, image, player, player:width, player:height, player:stream, player:stream:content_type
186 | 
187 | `images`: a list of images in the page. Excludes small images (smaller than 250x200).
188 | 
189 | `images[i].url`: URL of image
190 | 
191 | `images[i].dimensions`: `{x: width, y: height}` of the image, as displayed in the document
192 | 
193 | `images[i].title`: the `title` attribute
194 | 
195 | `images[i].alt`: the `alt` attribute
196 | 
197 | `images[i].isReadable`: does the image appear in the Readability version of the document?
198 | 
199 | `readable`: information extracted with the [Readability](https://github.com/mozilla/readability) library. Null if this didn't appear to be an article or otherwise parseable.
200 | 
201 | `readable.title`: the title as determined
202 | 
203 | `readable.content`: an HTML string with the content (not processed like other HTML content)
204 | 
205 | `readable.textContent`: a text-only version of the content
206 | 
207 | `readable.length`: the length of the content, in characters
208 | 
209 | `readable.excerpt`: an excerpt
210 | 
211 | `readable.byline`: author metadata
212 | 
213 | `readable.dir`: content direction
214 | 
215 | #### DOM
216 | 
217 | These page records give the actual frozen page part of the fetched pages:
218 | 
219 | `body`: a string of everything *inside* `<body>`.
220 | 
221 | `head`: a string of everything *inside* `<head>`.
222 | 
223 | `bodyAttrs`: the attributes in the body tag, like `[["class", "foobar"], ...]`
224 | 
225 | `headAttrs`: same for head.
226 | 
227 | `htmlAttrs`: same for `<html>`.
228 | 
229 | `resources`: links to embedded resources in the page are replaced with UUIDs. `resources` is `{id: description}` for all of these resources.
230 | 
231 | `resources[id].url`: the fully resolved URL that this points to
232 | 
233 | `resources[id].tag`: if the URL is embedded in a tag, the name of the tag, like `"LINK"`.
234 | 
235 | `resources[id].elId`: if the containing element has an id attribute, then it's here
236 | 
237 | `resources[id].selector`: a selector pointing to the element.
238 | 
239 | `resources[id].attr`: the attribute name where the URL was found
240 | 
241 | `resources[id].rel`: in the case of `<link>`, the value of `rel`.
242 | 
243 | `screenshots`: any screenshots taken. Each screenshot has a name. Specifically `screenshots.visible` (what shows in the browser window, "above the fold"), and `screenshots.fullPage` (the entire document).
244 | 
245 | `screenshots.type.captureType`: how it was captured (typically matches `type`)
246 | 
247 | `screenshots.type.originalDimensions`: a box of `{top, bottom, left, right}` showing what was captured
248 | 
249 | `screenshots.type.size`: a value of `{height, width}` of what it was sized to (screenshots are all sized down)
250 | 
251 | `screenshots.type.image`: a `data:` URL of the image
252 | 
253 | #### DOM Annotations
254 | 
255 | The DOM is annotated with some attributes to help understand the DOM without rendering it:
256 | 
257 | `data-width` and `data-height`: these are added to all images
258 | 
259 | `data-hidden="true"`: this is added to any element that doesn't appear to be visible (e.g., `display: none`).
260 | 
261 | `data-display="block"`: or some other value, if `.style.display` (or calculated) is not what you'd expect given the element. E.g., if a normally block-level element has a style making it display as `inline-block`, then this attribute would be added
262 | 
263 | `value`: this is set to the *actual* form value, not the one in the original HTML.
264 | 
265 | ### Feeds
266 | 
267 | In addition to the feed-related metadata captured as Activity, we also fetch the actual feeds alongside the page. By doing this we can match up timely feed information against a page.
268 | 
269 | `feeds`: this is a list of all discovered feeds, listed in the order they appeared in the page.
270 | 
271 | `feeds[i].url`: the URL of the feed (where it was fetched from)
272 | 
273 | `feeds[i].redirectUrl`: if the feed redirected, then this is the destination URL
274 | 
275 | `feeds[i].body`: the text body of the feed.
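As a minimal sketch of how these captured feeds might be consumed (here `page_record` is assumed to be the stored JSON for one fetched page, e.g. the `scrapeData` of a `Page` row; the field names follow the list above), the entries in each feed body can be counted with the standard library alone:

```python
# Sketch: count entries in the feeds captured alongside one page.
import xml.etree.ElementTree as ET

for feed in page_record.get("feeds", []):
    if feed.get("error"):      # failed fetches carry an error message (see below)
        print("failed:", feed["url"], feed["error"])
        continue
    root = ET.fromstring(feed["body"])
    # Handles both RSS (<item>) and Atom (<entry>) documents.
    entries = root.findall(".//item") + root.findall(".//{http://www.w3.org/2005/Atom}entry")
    print(feed["url"], "->", len(entries), "entries")
```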
276 | 277 | `feeds[i].contentType`: the HTTP Content-Type given 278 | 279 | `feeds[i].lastModified`: the timestamp of the HTTP Last-Modified header 280 | 281 | `feeds[i].fetchStart`: the timestamp when we started fetching the feed 282 | 283 | `feeds[i].fetchTime`: the number of milliseconds it took to fetch the feed 284 | 285 | `feeds[i].error`: if the feed failed to fetch, this text error message describes why. Other error information: 286 | 287 | `feeds[i].statusCode`: if the feed failed to fetch because of an HTTP error, this gives the status code 288 | 289 | `feeds[i].status`: and this gives the status text 290 | 291 | `feeds[i].errorStack`: if there was an exception fetching the feed, this gives the traceback. 292 | 293 | #### Errored pages 294 | 295 | `url`: the URL that was attempted to be fetched (we don't store historical failures, so the URL is the primary key). 296 | 297 | `attempted`: a timestamp when the error occurred. 298 | 299 | `errorMessage`: the error message. 300 | -------------------------------------------------------------------------------- /docs/future-design.md: -------------------------------------------------------------------------------- 1 | # Future Design 2 | 3 | PHA has turned into a conglomeration of a bunch of use cases and techniques, and it's become downright confusing. 4 | 5 | ## Issues 6 | 7 | * The name is long 8 | * It's unclear where control happens: what makes things happen? 9 | * There's no clear interface 10 | * Mixed patterns 11 | * Build process is all wonky and weird 12 | 13 | ## Name 14 | 15 | What would be a good name for this? The essential aspects: 16 | 17 | 1. It collects browsing information 18 | 2. It makes that information easy to work with 19 | 3. It finds higher-level information about the pages 20 | 4. It can drive the browser 21 | 22 | Obviously "browsing" shows up a lot. Other phrases: 23 | 24 | * Navigation 25 | * Web 26 | * HTML / pages 27 | * Session 28 | * Dataset 29 | 30 | Candidate names: 31 | 32 | * Browser-dataset 33 | * Personal-web-dataset 34 | * Webnav-dataset 35 | * Webnav-collector 36 | * Webnav-archiver 37 | * Browser-archiver 38 | * barchive 39 | * firefox-dataset 40 | * browser-data 41 | * webnav-data 42 | * browserdump 43 | * navdump 44 | * pagedump 45 | * **browserdump** 46 | * Browser Science (also used in 2013, site is still up but inactive) 47 | * Browser Lab (was used in 2013) 48 | * Navlab 49 | * Browsing Lab 50 | 51 | Some dataset concepts ([from](https://medium.com/datadriveninvestor/the-50-best-public-datasets-for-machine-learning-d80e9f030279)): 52 | 53 | * A dataset should not be messy, because you do not want to spend a lot of time cleaning data. 54 | * A dataset should not have too many rows or columns, so it is easy to work with. 55 | * The cleaner the data, the better — cleaning a large data set can be very time consuming. 56 | * There should be an interesting question, which in turn can be answered with data. 57 | 58 | ## Query interface 59 | 60 | Right now we have: 61 | 62 | 1. `Archive`: this represents one set of data, a run, dev-vs-live, test-vs-dev, etc. Represents a database *and* a set of JSON files. 63 | 2. `Browser`: a browser *profile* 64 | 3. `BrowserSession`: a particular run of a browser. Belongs to a Browser. 65 | 4. `Activity`: a browsing activity, typically a navigation. Can include in-page navigations, like changing the hash of a page. 
Has a relation to [browser.tabs.onUpdated](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/tabs/onUpdated), though not a 1:1 mapping (not every onUpdated event turns into an activity). Belongs to a BrowserSession. 66 | 5. `ActivityLink`: links found in a page 67 | 6. `Page`: a page, with a URL, that belongs to a session (FIXME: doesn't currently map to a session), and has a time in place. It's more like a "page load". Belongs to an activity. 68 | 69 | What are we missing? 70 | 71 | 1. A "job" of some sort, such as a fetching of a list of stuff. 72 | 2. Combining found history with pages and activity. 73 | 3. The HTTP response that led to a page. 74 | 4. Filling in data like Common Crawl or Wikipedia data. 75 | 5. Using CSP to speed up activity (but also noting that it happened) 76 | 6. Any use of [Containers](https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/contextualIdentities). Probably cookieStorageId needs to be added to everything, or maybe just to Browser. 77 | 7. Annotations on any of this data. (These may be very ad hoc and hard to implement, but maybe a naive approach would be good enough?) 78 | 79 | ## Interface 80 | 81 | There's a couple ways to start this: 82 | 83 | 1. Add it explicitly to an existing profile as a Temporary Installation 84 | 2. Have it run with `web-ext` and a scratch profile 85 | 3. Have it run with `web-ext` and an existing profile 86 | 4. Have it run with `web-ext` and a long-lived dev-only profile 87 | 5. Have it run via Selenium 88 | 89 | It uses multiple Native Connect names for handling some of these cases. I think that's good *for testing*, where we want good isolation between any old code, running code, production code, and the filesystem. Otherwise I think the archive location should be coded into the add-on storage. 90 | 91 | # Proposal 92 | 93 | 1. Make this an installable package. Lead with the Python side. Will include node_modules/etc as well. 94 | 2. The package includes an XPI, that you install in your browser (usually, some use cases might involve web-ext) 95 | 3. There's a script that you can use on an Archive to trigger activity (i.e., drive the browser) 96 | 4. Use an ORM, maybe SQLObject? 97 | 5. History will get extracted, but only informationally. You'll have to use the trigger to revisit history in some fashion. 98 | 6. We'll need a database view of the live browser connections. This both registers those connections, and is a queue to allow incoming connections. 99 | 100 | ## User experience: 101 | 102 | 1. Install the application (probably start with pip install + npm install, or a downloadable installation script) 103 | 2. Put the XPI in some known location 104 | 3. Install the special files for Native Connect 105 | 4. Maybe include something like `blab http` to open a local server that gives instructions and a link to the XPI 106 | 5. With the XPI installed, there's a button that controls the add-on 107 | 6. You can turn it on and off, with different icons 108 | 7. You can enable it just for some containers 109 | 8. There are instructions about using browser profiles and `about:profiles` 110 | 9. Create a script launcher, `blab browse --Profile` etc? 111 | 10. Create a central place to list known archives, in `~/.browserdump/` - just to make it easy to list 112 | 11. Archives should have names (user assignable) 113 | 12. The browser interface should be allowed to connect to different archives 114 | 13. You should be able to "remember" recording decisions. 
But if you don't, then on restart probably don't reconnect. 115 | 14. Offer a quick summary of what's happened in the archive. 116 | 15. Give a default archive path of something like `$HOME/browserdump-archive` 117 | 16. Connect browserdump script to a running browser with `blab connect` 118 | 17. Offer simple commands, like opening a list of pages. 119 | 18. Something with Jupyter? 120 | -------------------------------------------------------------------------------- /docs/screencast-fetcher.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ianb/personal-history-archive/68168eac7876a8827ec566fb4882a7ab5804d87b/docs/screencast-fetcher.gif -------------------------------------------------------------------------------- /extension/.eslintrc.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | module.exports = { 4 | "env": { 5 | "webextensions": true 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /extension/backgroundOnMessage.js: -------------------------------------------------------------------------------- 1 | /* globals log */ 2 | 3 | this.backgroundOnMessage = (function() { 4 | let exports = {}; 5 | 6 | const handlers = {}; 7 | 8 | browser.runtime.onMessage.addListener((message, sender) => { 9 | let type = message.type; 10 | message.senderTabId = sender.tab && sender.tab.id; 11 | message.senderUrl = sender.url; 12 | message.senderFrameId = sender.frameId; 13 | if (!handlers[type]) { 14 | log.error("Got unexpected message type:", type, "from", message); 15 | return Promise.reject(new Error(`Unexpected message type: ${type}`)); 16 | } 17 | try { 18 | let result = handlers[type](message); 19 | return Promise.resolve(result); 20 | } catch (error) { 21 | return Promise.reject(error); 22 | } 23 | }); 24 | 25 | exports.register = function(type, handler) { 26 | if (handlers[type]) { 27 | throw new Error(`Attempt to reregister message type ${type}`); 28 | } 29 | handlers[type] = handler; 30 | }; 31 | 32 | exports.registerListener = function(type, handler) { 33 | let existing = handlers[type]; 34 | if (!existing) { 35 | handlers[type] = handler; 36 | } else if (Array.isArray(existing)) { 37 | existing.push(handler); 38 | } else { 39 | handlers[type] = [existing, handler]; 40 | } 41 | }; 42 | 43 | exports.unregister = function(type, handler) { 44 | let existing = handlers[type]; 45 | if (!existing) { 46 | throw new Error(`Attempt to unregister handler that has no handlers: ${type}`); 47 | } 48 | if (Array.isArray(existing)) { 49 | if (!existing.includes(handler)) { 50 | throw new Error(`Attempt to unregister handler that hasn't been registered: ${type}`); 51 | } 52 | handlers[type] = existing.filter(x => x !== handler); 53 | if (handlers[type].length === 1) { 54 | handlers[type] = handlers[type][0]; 55 | } 56 | } else { 57 | if (existing !== handler) { 58 | throw new Error(`Attempt to unregister handler that hasn't been registered: ${type}`); 59 | } 60 | delete handlers[type]; 61 | } 62 | }; 63 | 64 | return exports; 65 | })(); 66 | -------------------------------------------------------------------------------- /extension/browserId.js: -------------------------------------------------------------------------------- 1 | /* globals util, log, communication, catcher */ 2 | 3 | this.browserId = null; 4 | this.sessionId = null; 5 | this.browserIdPromise = catcher.watchPromise(browser.storage.local.get(["browserId"]).then(async (result)
=> { 6 | if (!result || !result.browserId) { 7 | browserId = util.makeUuid(); 8 | await browser.storage.local.set({browserId}).catch((error) => { 9 | log.error("Error setting browserId", error); 10 | }); 11 | } else { 12 | browserId = result.browserId; 13 | } 14 | sessionId = util.makeUuid(); 15 | })); 16 | -------------------------------------------------------------------------------- /extension/buildSettings.js.tmpl: -------------------------------------------------------------------------------- 1 | this.buildSettings = (function() { 2 | function toBoolean(n, defaultValue) { 3 | if (n !== 0 && !n) { 4 | return defaultValue; 5 | } 6 | if (typeof n === "string") { 7 | n = n.toLowerCase(); 8 | } 9 | if (n === "false" || n === "0" || n === 0 || n === "off" || n === "no") { 10 | return false; 11 | } 12 | return true; 13 | } 14 | 15 | return { 16 | nativeScriptName: process.env.NATIVE_SCRIPT || "browsinglab.connector", 17 | logLevel: process.env.LOG_LEVEL || "info", 18 | serverLogLevel: process.env.SERVER_LOG_LEVEL || "warn", 19 | setFreezeMarker: toBoolean(process.env.SET_FREEZE_MARKER, false), 20 | notifyError: true, 21 | testingBrowser: process.env.TESTING_BROWSER || false, 22 | cspRestrict: toBoolean(process.env.CSP_RESTRICT, false), 23 | defaultArchiveLocation: process.env.DEFAULT_ARCHIVE_LOCATION || null, 24 | updateServerPeriod: 10000, // 60 * 60 * 1000, // 1 hour 25 | temporaryArchiveLocation: process.env.TEMPORARY_ARCHIVE_LOCATION || "__prefix__/../dev-data", 26 | temporaryUpdateServerPeriod: 10000, // 10 seconds 27 | }; 28 | })(); 29 | -------------------------------------------------------------------------------- /extension/catcher.js: -------------------------------------------------------------------------------- 1 | /* globals log, buildSettings, util, backgroundOnMessage */ 2 | 3 | this.catcher = (function() { 4 | let exports = {}; 5 | 6 | exports.watchFunction = function(func) { 7 | return function(...args) { 8 | try { 9 | let result = func(...args); 10 | if (result && "then" in result && result.then) { 11 | return exports.watchPromise(result); 12 | } 13 | return result; 14 | } catch (error) { 15 | report(error); 16 | throw error; 17 | } 18 | }; 19 | }; 20 | 21 | exports.watchPromise = function(promise) { 22 | return promise.catch((error) => { 23 | report(error); 24 | throw error; 25 | }); 26 | }; 27 | 28 | const report = exports.report = function(error) { 29 | log.error("Error:", error); 30 | if (buildSettings.notifyError) { 31 | if (typeof backgroundOnMessage === "undefined") { 32 | // Then we are in a worker context 33 | browser.runtime.sendMessage({type: "reportError", error: String(error)}); 34 | } else { 35 | exports.notifyError(error); 36 | } 37 | } 38 | }; 39 | 40 | exports.notifyError = function(error) { 41 | error = String(error); 42 | let id = util.makeUuid(); 43 | browser.notifications.create(id, { 44 | type: "basic", 45 | title: "Browsing Lab Error", 46 | message: error 47 | }); 48 | }; 49 | 50 | if (typeof backgroundOnMessage !== "undefined") { 51 | backgroundOnMessage.register("reportError", (message) => { 52 | exports.notifyError(message.error); 53 | }); 54 | } 55 | 56 | return exports; 57 | })(); 58 | -------------------------------------------------------------------------------- /extension/communication.js: -------------------------------------------------------------------------------- 1 | /* globals buildSettings, log, browserId, sessionId */ 2 | 3 | /** Routines to communicate with the backend via native connection */ 4 | this.communication = 
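// The module below is a small RPC layer over the native-messaging port: portCall()
// posts {name, args, kwargs, id} to the Python connector and stores a promise in
// `responders` keyed by id; the matching reply ({id, result} or {id, error, traceback})
// resolves or rejects that promise. Calls made before a session id exists or before an
// archive is selected are queued in `callCache` and replayed once set_active_archive()
// succeeds.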
(function() { 5 | let exports = {}; 6 | let port = browser.runtime.connectNative(buildSettings.nativeScriptName); 7 | let responderId = 1; 8 | let responders = new Map(); 9 | let hasActiveArchive = false; 10 | let callCache = []; 11 | const CALL_CACHE_LIMIT = 10; 12 | 13 | function portCall(name, args, kwargs, withoutArchive = false) { 14 | if (!sessionId) { 15 | // Stuff really hasn't initialized yet! 16 | log.warn(`Calling ${name}() before sessionId is set`); 17 | return new Promise((resolve, reject) => { 18 | callCache.push({name, args, kwargs, resolve, reject}); 19 | }); 20 | } 21 | if (!withoutArchive && !hasActiveArchive) { 22 | if (callCache.length > CALL_CACHE_LIMIT) { 23 | throw new Error("Attempted to send too many messages before setting archive"); 24 | } 25 | log.info(`Deferring message: ${name}()`); 26 | return new Promise((resolve, reject) => { 27 | callCache.push({name, args, kwargs, resolve, reject}); 28 | }); 29 | } 30 | args = args || []; 31 | kwargs = kwargs || {}; 32 | let id = responderId++; 33 | for (let i = 0; i < args.length; i++) { 34 | if (args[i] && typeof args[i] === "object" && "toJSON" in args[i]) { 35 | args[i] = args[i].toJSON(); 36 | } 37 | } 38 | for (let name in (kwargs || {})) { 39 | if (kwargs[name] && typeof kwargs[name] === "object" && "toJSON" in kwargs[name]) { 40 | kwargs[name] = kwargs[name].toJSON(); 41 | } 42 | } 43 | port.postMessage({name, args, kwargs, id}); 44 | return new Promise((resolve, reject) => { 45 | responders.set(id, {resolve, reject, name}); 46 | }); 47 | } 48 | 49 | port.onMessage.addListener((message) => { 50 | let id = message.id; 51 | let responder = responders.get(id); 52 | if ("result" in message) { 53 | responder.resolve(message.result); 54 | } else if (message.error) { 55 | // Using console.error so we don't ever send this back to the server: 56 | // 57 | console.error("Error calling", responder.name, ":", message.error, message.traceback); // eslint-disable-line no-console 58 | responder.reject(new Error(`Backend error: ${message.error}`)); 59 | } else { 60 | log.warn("Response without result/error:", message); 61 | } 62 | responders.delete(id); 63 | }); 64 | 65 | function setHasActiveArchive() { 66 | hasActiveArchive = true; 67 | for (let item of callCache) { 68 | portCall(item.name, item.args, item.kwargs).then(item.resolve).catch(item.reject); 69 | } 70 | callCache = []; 71 | } 72 | 73 | /* Each of these exported functions is a function in browsinglab.connect: */ 74 | 75 | exports.add_activity_list = function(activityItems) { 76 | if (!hasActiveArchive) { 77 | // Just throw it away then 78 | log.warn("Disposing of activity", hasActiveArchive); 79 | return null; 80 | } 81 | return portCall("add_activity_list", [], {browserId, sessionId, activityItems}); 82 | }; 83 | 84 | exports.register_browser = function() { 85 | return portCall("register_browser", [], { 86 | browserId, 87 | userAgent: navigator.userAgent, 88 | devicePixelRatio: window.devicePixelRatio, 89 | }); 90 | }; 91 | 92 | exports.register_session = function() { 93 | return portCall("register_session", [sessionId, browserId, (new Date()).getTimezoneOffset()]); 94 | }; 95 | 96 | exports.check_page_needed = function(url) { 97 | return portCall("check_page_needed", [url]); 98 | }; 99 | 100 | // FIXME: should be (url, pageData) but needs updating in saver.py 101 | exports.add_fetched_page = function(id, url, page) { 102 | return portCall("add_fetched_page", [id, url, page]); 103 | }; 104 | 105 | exports.log = function({level, args, stack}) { 106 | return 
portCall("log", args, {level, stack}, true); 107 | }; 108 | 109 | exports.set_active_archive = async function(path) { 110 | await portCall("set_active_archive", [path], {}, true); 111 | setHasActiveArchive(); 112 | await exports.register_browser(); 113 | await exports.register_session(); 114 | }; 115 | 116 | exports.unset_active_archive = async function() { 117 | hasActiveArchive = false; 118 | await portCall("unset_active_archive"); 119 | }; 120 | 121 | exports.set_archive_title = function(title) { 122 | return portCall("set_archive_title", [title]); 123 | }; 124 | 125 | exports.get_archive_info = function() { 126 | return portCall("get_archive_info", [], {}, true); 127 | }; 128 | 129 | exports.get_all_archives = function() { 130 | return portCall("get_all_archives", [], {}, true); 131 | }; 132 | 133 | return exports; 134 | })(); 135 | -------------------------------------------------------------------------------- /extension/contentLoader.js: -------------------------------------------------------------------------------- 1 | this.contentLoader = (function() { 2 | const exports = {}; 3 | 4 | const SCRIPTS = [ 5 | "browser-polyfill.js", 6 | "build/buildSettings.js", 7 | "log.js", 8 | "catcher.js", 9 | "util.js", 10 | "elementToSelector.js", 11 | "rssFinder.js", 12 | "contentWatcher.js", 13 | ]; 14 | 15 | exports.loadScripts = async function(tabId) { 16 | for (const script of SCRIPTS) { 17 | await browser.tabs.executeScript(tabId, { 18 | file: script, 19 | runAt: "document_idle", 20 | }); 21 | } 22 | }; 23 | 24 | exports.trackTabs = function() { 25 | let callback = (tab) => { 26 | console.log("tab update", tab.id, tab.status); 27 | if (tab.status === "loading") { 28 | exports.loadScripts(tab.id); 29 | } 30 | }; 31 | browser.tabs.onUpdated.addListener(callback, { 32 | properties: ["status"], 33 | }); 34 | let cancel = () => { 35 | browser.tabs.onUpdated.removeListener(callback); 36 | }; 37 | return cancel; 38 | }; 39 | 40 | return exports; 41 | })(); 42 | -------------------------------------------------------------------------------- /extension/contentWatcher.js: -------------------------------------------------------------------------------- 1 | /* globals elementToSelector, rssFinder */ 2 | 3 | this.contentWatcher = (function() { 4 | 5 | const IDLE_TIME = 30000; 6 | const LINK_TEXT_LIMIT = 80; 7 | 8 | document.addEventListener("click", (event) => { 9 | let target = event.target; 10 | if (target.tagName === "A") { 11 | browser.runtime.sendMessage({ 12 | type: "anchorClick", 13 | text: target.textContent, 14 | href: target.href 15 | }); 16 | } 17 | }); 18 | 19 | document.addEventListener("copy", (event) => { 20 | let selection = window.getSelection(); 21 | let startLocation; 22 | let endLocation; 23 | if (selection.anchorNode) { 24 | startLocation = elementToSelector(selection.anchorNode); 25 | } 26 | if (selection.focusNode && selection.focusNode !== selection.anchorNode) { 27 | endLocation = elementToSelector(selection.focusNode); 28 | } 29 | browser.runtime.sendMessage({ 30 | type: "copy", 31 | text: window.getSelection().toString(), 32 | startLocation, 33 | endLocation, 34 | }); 35 | }); 36 | 37 | document.addEventListener("change", (event) => { 38 | let changed = event.target; 39 | let isText = changed.tagName === "TEXTAREA"; 40 | if (changed.tagName === "INPUT") { 41 | let type = (changed.getAttribute("text") || "").toLowerCase(); 42 | let textyTypes = [ 43 | "", "text", "password", "email", "number", "search", "tel", "url", 44 | ]; 45 | if (textyTypes.includes(type)) { 46 | isText = 
true; 47 | } 48 | } 49 | browser.runtime.sendMessage({ 50 | type: "change", 51 | isText 52 | }); 53 | }); 54 | 55 | let maxScroll = 0; 56 | let sendScrollTimeout = null; 57 | 58 | window.addEventListener("scroll", function(event) { 59 | let position = window.scrollY; 60 | if (position > maxScroll) { 61 | maxScroll = position; 62 | if (!sendScrollTimeout) { 63 | sendScrollTimeout = setTimeout(() => { 64 | sendScrollTimeout = null; 65 | let documentHeight = Math.max( 66 | document.documentElement.clientHeight, 67 | document.body.clientHeight, 68 | document.documentElement.scrollHeight, 69 | document.body.scrollHeight); 70 | browser.runtime.sendMessage({ 71 | type: "scroll", 72 | maxScroll, 73 | documentHeight 74 | }); 75 | }, 100); 76 | } 77 | } 78 | }); 79 | 80 | window.addEventListener("hashchange", (event) => { 81 | let newHash = (new URL(event.newURL)).hash; 82 | if (!newHash || newHash === "#") { 83 | return; 84 | } 85 | newHash = newHash.substr(1); 86 | let element = document.getElementById(newHash); 87 | if (element) { 88 | browser.runtime.sendMessage({ 89 | type: "hashchange", 90 | hash: newHash, 91 | hasElement: !!element 92 | }); 93 | } 94 | }); 95 | 96 | let activityTimer; 97 | let lastActivity; 98 | let isActive = true; 99 | 100 | function updateActivity() { 101 | lastActivity = Date.now(); 102 | if (!isActive) { 103 | browser.runtime.sendMessage({ 104 | type: "activity" 105 | }); 106 | isActive = true; 107 | } 108 | if (activityTimer) { 109 | clearTimeout(activityTimer); 110 | } 111 | activityTimer = setTimeout(() => { 112 | browser.runtime.sendMessage({ 113 | type: "idle", 114 | lastActivity 115 | }); 116 | activityTimer = null; 117 | isActive = false; 118 | }, IDLE_TIME); 119 | } 120 | 121 | function watchForActivity() { 122 | document.addEventListener("mousemove", updateActivity); 123 | document.addEventListener("keypress", updateActivity); 124 | updateActivity(); 125 | } 126 | 127 | function unwatchForActivity() { 128 | document.removeEventListener("mousemove", updateActivity); 129 | document.removeEventListener("keypress", updateActivity); 130 | if (!isActive) { 131 | isActive = true; 132 | } 133 | clearTimeout(activityTimer); 134 | } 135 | 136 | document.addEventListener("visibilitychange", () => { 137 | if (document.hidden) { 138 | unwatchForActivity(); 139 | } else { 140 | watchForActivity(); 141 | } 142 | }); 143 | 144 | if (!document.hidden) { 145 | watchForActivity(); 146 | } 147 | 148 | function sendDevicePixelRatio() { 149 | browser.runtime.sendMessage({ 150 | type: "devicePixelRatio", 151 | devicePixelRatio: window.devicePixelRatio 152 | }); 153 | } 154 | 155 | window.addEventListener("resize", () => { 156 | sendDevicePixelRatio(); 157 | }); 158 | 159 | function sendBasicMetadata() { 160 | let message = { 161 | type: "basicPageMetadata", 162 | title: document.title 163 | }; 164 | let el = document.querySelector("link[rel=canonical]"); 165 | if (el) { 166 | message.canonicalUrl = el.href; 167 | } 168 | let ogTitleEl = document.querySelector("meta[name='og:title'], meta[name='twitter:title']"); 169 | if (ogTitleEl) { 170 | message.ogTitle = ogTitleEl.getAttribute("content"); 171 | } 172 | browser.runtime.sendMessage(message); 173 | } 174 | 175 | function sendFeedInformation() { 176 | let info = rssFinder(); 177 | if (info) { 178 | browser.runtime.sendMessage({ 179 | type: "feedInformation", 180 | mainFeedUrl: info.mainFeedUrl, 181 | allFeeds: info.allFeeds, 182 | }); 183 | } 184 | } 185 | 186 | function sendLinkInformation() { 187 | let links = 
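// Collect every anchor that has an href, skipping bare "#" links; each entry keeps the
// link text (truncated to LINK_TEXT_LIMIT) plus rel/target/id when present, and links
// back into the current page are reduced to their "#fragment" form before being sent
// to the background script.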
Array.from(document.querySelectorAll("a[href]")); 188 | links = links.filter(el => el.getAttribute("href") !== "#"); 189 | let linkInformation = links.map((el) => { 190 | let info = { 191 | url: el.href 192 | }; 193 | let text = el.textContent; 194 | if (text.length > LINK_TEXT_LIMIT) { 195 | text = text.substr(0, LINK_TEXT_LIMIT) + "..."; 196 | } 197 | info.text = text; 198 | if (el.href.startsWith(location.href.split("#")[0] + "#")) { 199 | info.url = "#" + el.href.split("#")[1]; 200 | } 201 | if (el.rel) { 202 | info.rel = el.rel; 203 | } 204 | if (el.target) { 205 | info.target = el.target; 206 | } 207 | if (el.id) { 208 | info.elementId = el.id; 209 | } 210 | return info; 211 | }); 212 | browser.runtime.sendMessage({ 213 | type: "linkInformation", 214 | linkInformation 215 | }); 216 | } 217 | 218 | sendDevicePixelRatio(); 219 | sendBasicMetadata(); 220 | setTimeout(sendFeedInformation); 221 | setTimeout(sendLinkInformation); 222 | 223 | })(); 224 | -------------------------------------------------------------------------------- /extension/controller.js: -------------------------------------------------------------------------------- 1 | /* globals backgroundOnMessage, buildSettings, communication, activityTracker, browserIdPromise, log */ 2 | 3 | this.controller = (function() { 4 | const exports = {}; 5 | let tracker; 6 | let model = { 7 | selectContainers: false, 8 | selectedContainers: new Set(), 9 | track: false, 10 | archive: { 11 | title: null, 12 | path: null, 13 | } 14 | }; 15 | 16 | const onInstalled = new Promise((resolve) => { 17 | browser.runtime.onInstalled.addListener(resolve); 18 | }); 19 | 20 | async function init() { 21 | let { temporary } = await onInstalled; 22 | if (temporary) { 23 | if (!model.archive.path) { 24 | model.archive.path = buildSettings.temporaryArchiveLocation; 25 | model.track = true; 26 | openTracker(); 27 | } 28 | } 29 | } 30 | 31 | backgroundOnMessage.register("updateArchive", (info) => { 32 | if (tracker) { 33 | if (model.track && !info.track) { 34 | closeTracker(); 35 | } else if (model.archive.path !== info.archive.path) { 36 | closeTracker(); 37 | } 38 | } 39 | model.selectContainers = info.selectContainers; 40 | model.selectedContainers = new Set(info.selectedContainers); 41 | model.track = info.track; 42 | model.archive = info.archive; 43 | if (model.track && model.archive.path) { 44 | openTracker(); 45 | } 46 | }); 47 | 48 | backgroundOnMessage.register("requestUpdateArchive", () => { 49 | browser.runtime.sendMessage({ 50 | type: "updateArchive", 51 | selectContainers: model.selectContainers, 52 | selectedContainers: Array.from(model.selectedContainers.values()), 53 | track: model.track, 54 | archive: model.archive, 55 | }); 56 | }); 57 | 58 | function closeTracker() { 59 | tracker.uninit(); 60 | tracker = null; 61 | communication.unset_active_archive(); 62 | } 63 | 64 | async function openTracker() { 65 | await communication.set_active_archive(model.archive.path); 66 | await communication.set_archive_title(model.archive.title); 67 | tracker = new activityTracker.Tracker(); 68 | tracker.init(); 69 | } 70 | 71 | browserIdPromise.then(async () => { 72 | await init(); 73 | }).catch((e) => { 74 | log.error("Error initializing:", String(e), e, e.stack); 75 | }); 76 | 77 | return exports; 78 | })(); 79 | -------------------------------------------------------------------------------- /extension/controls/popup.css: -------------------------------------------------------------------------------- 1 | #container { 2 | padding: 1em; 3 | } 4 | 
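/* #container is the mount point that controls/popup.jsx renders into via ReactDOM. */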
-------------------------------------------------------------------------------- /extension/controls/popup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Popup 6 | 7 | 8 | 9 |
10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /extension/controls/popup.jsx: -------------------------------------------------------------------------------- 1 | /* globals React, ReactDOM */ 2 | 3 | const model = { 4 | selectContainers: false, 5 | selectedContainers: new Set(), 6 | track: false, 7 | archive: { 8 | title: null, 9 | path: null, 10 | }, 11 | }; 12 | 13 | class Interface extends React.Component { 14 | render() { 15 | return
16 | 17 | 18 | 23 |
; 24 | } 25 | } 26 | 27 | class ArchiveSelector extends React.Component { 28 | render() { 29 | return
30 | 39 | 48 |
; 49 | } 50 | 51 | changeArchiveTitle(event) { 52 | let title = event.target.value; 53 | model.archive.title = title; 54 | render(); 55 | } 56 | 57 | changeArchivePath(event) { 58 | let path = event.target.value; 59 | model.archive.path = path; 60 | render(); 61 | } 62 | } 63 | 64 | class GeneralControl extends React.Component { 65 | render() { 66 | return
67 | 71 |
; 72 | } 73 | 74 | onCheck(event) { 75 | model.track = event.target.checked; 76 | browser.runtime.sendMessage({ 77 | type: "track", 78 | value: model.track, 79 | }); 80 | render(); 81 | } 82 | } 83 | 84 | class ContainerSelector extends React.Component { 85 | render() { 86 | return
87 |
    88 |
  • 93 | { this.props.containers.map(c => { 94 | return
  • ; 99 | })} 100 |
101 |
; 102 | } 103 | 104 | onCheckSelectContainers(event) { 105 | model.selectContainers = !!event.target.checked; 106 | sendModel(); 107 | render(); 108 | } 109 | 110 | onCheckContainer(c, event) { 111 | if (event.target.checked) { 112 | model.selectedContainers.add(c.name); 113 | } else { 114 | model.selectedContainers.delete(c.name); 115 | } 116 | sendModel(); 117 | render(); 118 | } 119 | } 120 | 121 | function sendModel() { 122 | browser.runtime.sendMessage({ 123 | type: "updateArchive", 124 | selectContainers: model.selectContainers, 125 | selectedContainers: Array.from(model.selectedContainers.values()), 126 | track: model.track, 127 | archive: model.archive, 128 | }); 129 | } 130 | 131 | browser.runtime.onMessage.addListener((message) => { 132 | if (message.type !== "updateArchive") { 133 | return; 134 | } 135 | if ("selectContainers" in message) { 136 | model.selectContainers = message.selectContainers; 137 | } 138 | if ("selectedContainers" in message) { 139 | model.selectedContainers = new Set(message.selectedContainers); 140 | } 141 | if ("track" in message) { 142 | model.track = !!message.track; 143 | } 144 | if ("archive" in message) { 145 | model.archive = message.archive; 146 | } 147 | render(); 148 | }); 149 | 150 | browser.runtime.sendMessage({ 151 | type: "requestUpdateArchive", 152 | }); 153 | 154 | async function render() { 155 | let containers = await browser.contextualIdentities.query({}); 156 | let page = ; 157 | ReactDOM.render(page, document.getElementById("container")); 158 | } 159 | 160 | render(); 161 | -------------------------------------------------------------------------------- /extension/elementToSelector.js: -------------------------------------------------------------------------------- 1 | this.elementToSelector = function elementToSelector(el) { 2 | let singletons = {BODY: true, HEAD: true}; 3 | let parts = []; 4 | for (;;) { 5 | if (singletons[el.tagName]) { 6 | parts.unshift(el.tagName.toLowerCase()); 7 | break; 8 | } 9 | if (el.id) { 10 | parts.unshift(`#${el.id}`); 11 | break; 12 | } 13 | let parent = el.parentNode; 14 | let position = Array.from(parent.childNodes).indexOf(el); 15 | parts.unshift(`*:nth-child(${position + 1})`); 16 | el = parent; 17 | } 18 | return parts.join(" > "); 19 | }; 20 | null; 21 | -------------------------------------------------------------------------------- /extension/icon-live.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | -------------------------------------------------------------------------------- /extension/icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | -------------------------------------------------------------------------------- /extension/log.js: -------------------------------------------------------------------------------- 1 | /* globals buildSettings, communication, backgroundOnMessage */ 2 | /* eslint-disable no-console */ 3 | 4 | "use strict"; 5 | 6 | this.log = (function() { 7 | const exports = {}; 8 | 9 | const levels = ["debug", "info", "warn", "error"]; 10 | if (!levels.includes(buildSettings.logLevel)) { 11 | console.warn("Invalid buildSettings.logLevel:", buildSettings.logLevel); 12 | } 13 | const shouldLog = {}; 14 | const shouldLogServer = {}; 15 | 16 | { 17 | let startLogging = false; 18 | let startServerLogging = false; 19 | for (const level of levels) { 20 | if (buildSettings.logLevel === level) { 21 | startLogging = true; 22 | } 23 | if (buildSettings.serverLogLevel === level) { 24 | 
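// Mirrors the console-logging threshold above: once the configured serverLogLevel is
// reached in the ordered `levels` list, this level and every more severe one is also
// forwarded to the backend (via communication.log, or a "log" runtime message from
// content scripts).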
startServerLogging = true; 25 | } 26 | if (startLogging) { 27 | shouldLog[level] = true; 28 | } 29 | if (startServerLogging) { 30 | shouldLogServer[level] = true; 31 | } 32 | } 33 | } 34 | 35 | function logger(level) { 36 | return function(...args) { 37 | logWithLevel(level, args); 38 | }; 39 | } 40 | 41 | function logWithLevel(level, args, stack) { 42 | if (shouldLog[level]) { 43 | let newArgs = []; 44 | for (let arg of args) { 45 | newArgs.push(arg); 46 | if (arg instanceof Error) { 47 | newArgs.push(String(arg)); 48 | } 49 | } 50 | console[level](...newArgs); 51 | } 52 | if (shouldLogServer[level]) { 53 | let newArgs = []; 54 | if (!stack) { 55 | let stackLines = (new Error()).stack.split("\n"); 56 | while (stackLines[0] && /\/log.js:/.test(stackLines[0])) { 57 | stackLines.shift(); 58 | } 59 | stack = stackLines.join("\n"); 60 | } 61 | for (let arg of args) { 62 | if (arg instanceof Error) { 63 | newArgs.push(String(arg)); 64 | newArgs.push(arg.stack); 65 | } else { 66 | newArgs.push(arg); 67 | } 68 | } 69 | if (typeof communication !== "undefined") { 70 | communication.log({level, args: newArgs, stack}); 71 | } else { 72 | browser.runtime.sendMessage({type: "log", level, args: newArgs, stack}); 73 | } 74 | } 75 | } 76 | 77 | if (typeof backgroundOnMessage !== "undefined") { 78 | backgroundOnMessage.register("log", (message) => { 79 | logWithLevel(message.level, message.args, message.stack); 80 | }); 81 | } 82 | 83 | exports.debug = logger("debug"); 84 | exports.info = logger("info"); 85 | exports.warn = logger("warn"); 86 | exports.error = logger("error"); 87 | 88 | return exports; 89 | })(); 90 | -------------------------------------------------------------------------------- /extension/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest_version": 2, 3 | "name": "Browsing Lab", 4 | "version": "0.1.0", 5 | "description": "Collects browsing information for later study", 6 | "author": "Ian Bicking ", 7 | "homepage_url": "https://github.com/ianb/personal-history-archive", 8 | "applications": { 9 | "gecko": { 10 | "id": "browsing-lab@ianbicking.org" 11 | } 12 | }, 13 | "browser_action": { 14 | "default_icon": { 15 | "32": "icon.svg" 16 | }, 17 | "default_title": "Browsing Lab", 18 | "browser_style": true, 19 | "default_popup": "controls/popup.html" 20 | }, 21 | "background": { 22 | "scripts": [ 23 | "browser-polyfill.js", 24 | "build/buildSettings.js", 25 | "backgroundOnMessage.js", 26 | "log.js", 27 | "catcher.js", 28 | "util.js", 29 | "browserId.js", 30 | "communication.js", 31 | "scraper/scrapeTab.js", 32 | "contentLoader.js", 33 | "activityTracker.js", 34 | "controller.js" 35 | ] 36 | }, 37 | "permissions": [ 38 | "", 39 | "history", 40 | "storage", 41 | "tabs", 42 | "webNavigation", 43 | "webRequest", 44 | "nativeMessaging", 45 | "notifications", 46 | "webRequestBlocking", 47 | "contextualIdentities" 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /extension/rssFinder.js: -------------------------------------------------------------------------------- 1 | this.rssFinder = (function() { 2 | 3 | const urlPatterns = [ 4 | /^\/feeds?$/, 5 | /^\/feeds?\/[a-zA-Z0-9]+$/, 6 | /\.xml$/, 7 | /\/feed\/?$/, 8 | /$\/(rss|atom)/, 9 | /\/rss\//, 10 | /[./]rss2?$/, 11 | // Business Insider: 12 | /rss.*\.cms$/, 13 | // The Philly Inquirer and others: 14 | /rss\.html$/, 15 | // Seattle PI: 16 | /collectionRss/, 17 | ]; 18 | 19 | const domainPatterns = [ 20 | /^feeds\./, 21 | // USA 
Today: 22 | /^rss(feeds)?\./, 23 | /^feeds[0-9]?\.feedburner\.com/, 24 | ]; 25 | 26 | const queryStringPatterns = [ 27 | // Miami Herald: 28 | /getXmlFeed/, 29 | /rssfeed/, 30 | // Sun Times: 31 | /template=rss/, 32 | // St Louis Post-Dispatch: 33 | /f=rss/, 34 | /feed=rss/, 35 | ]; 36 | 37 | // FIXME: use these 38 | const hintPatterns = [ 39 | /^https?:\/\/add.my.yahoo.com\/rss\?url=([^&]+)/, 40 | /^https?:\/\/feedly.com\/#subscription\/feed\/(.*)/, 41 | /https?:\/\/reader.aol.com\/#subscription\/(.*)/, 42 | ]; 43 | 44 | function isMaybeRssLink(url) { 45 | let urlObj = new URL(url); 46 | for (let pat of urlPatterns) { 47 | if (pat.test(urlObj.pathname)) { 48 | return true; 49 | } 50 | } 51 | for (let pat of domainPatterns) { 52 | if (pat.test(urlObj.hostname)) { 53 | return true; 54 | } 55 | } 56 | for (let pat of queryStringPatterns) { 57 | if (pat.test(urlObj.search)) { 58 | return true; 59 | } 60 | } 61 | return false; 62 | } 63 | 64 | function rssFinder() { 65 | let contentTypes = [ 66 | "application/rss+xml", 67 | "application/atom+xml", 68 | "application/rdf+xml", 69 | "application/rss", 70 | "application/atom", 71 | "application/rdf", 72 | "text/rss+xml", 73 | "text/atom+xml", 74 | "text/rdf+xml", 75 | "text/rss", 76 | "text/atom", 77 | "text/rdf", 78 | ]; 79 | let selector = contentTypes.map((t) => `link[rel=alternate][type="${t}"]`).join(", "); 80 | let feeds = document.querySelectorAll(selector); 81 | if (!feeds.length) { 82 | return null; 83 | } 84 | let mainFeedUrl = feeds[0].href; 85 | let allFeeds = Array.from(feeds).map(el => ({type: el.type, href: el.href, title: el.title})); 86 | let speculativeFeedLinks = Array.from(document.querySelectorAll("a[href]")); 87 | speculativeFeedLinks = speculativeFeedLinks.filter(a => a.href && isMaybeRssLink(a.href)); 88 | speculativeFeedLinks = speculativeFeedLinks.map(a => { 89 | return { 90 | href: a.href, 91 | anchorText: a.textContent.substr(0, 100), 92 | }; 93 | }); 94 | // Never keep more than 40 links, just in case: 95 | speculativeFeedLinks.splice(40); 96 | if (!speculativeFeedLinks.length) { 97 | speculativeFeedLinks = undefined; 98 | } 99 | return { 100 | mainFeedUrl, 101 | allFeeds, 102 | speculativeFeedLinks, 103 | }; 104 | } 105 | 106 | return rssFinder; 107 | 108 | })(); 109 | null; 110 | -------------------------------------------------------------------------------- /extension/scraper/extractor-worker.js: -------------------------------------------------------------------------------- 1 | /* globals Readability, document, console, location, makeStaticHtml, log, util, buildSettings */ 2 | 3 | /** extractor-worker is a content worker that is attached to a page when 4 | making a shot 5 | 6 | extractData() does the main work 7 | */ 8 | 9 | var extractorWorker = (function() { // eslint-disable-line no-unused-vars 10 | /** Extracts data: 11 | - Gets the Readability version of the page (`.readable`) 12 | - Finds images in roughly the preferred order (`.images`) 13 | */ 14 | let exports = {}; 15 | 16 | exports.extractData = function() { 17 | let start = Date.now(); 18 | let readableDiv; 19 | let readable; 20 | if (typeof Readability !== "undefined") { 21 | let result = extractReadable(); 22 | if (result) { 23 | readable = result; 24 | } else { 25 | readable = null; 26 | } 27 | } else { 28 | log.info("Skipping readability: not installed"); 29 | } 30 | let images = findImages([ 31 | {element: document.head, isReadable: false}, 32 | {element: readableDiv, isReadable: true}, 33 | {element: document.body, isReadable: false}]); 34 | 
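// Image candidates come from the head (og:image / twitter:image meta tags), the
// readable article content when available, and finally the body; findImages() below
// de-duplicates them by URL and skips <img> elements smaller than
// MIN_IMAGE_WIDTH x MIN_IMAGE_HEIGHT.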
log.info(`Image time: ${Date.now() - start}ms`); 35 | let siteName = findSiteName(); 36 | log.info(`extractData time: ${Date.now() - start}ms`); 37 | return { 38 | readable, 39 | images, 40 | siteName 41 | }; 42 | }; 43 | 44 | function extractReadable() { 45 | // Readability is destructive, so we have to run it on a copy 46 | let loc = document.location; 47 | let uri = { 48 | spec: loc.href, 49 | host: loc.host, 50 | prePath: loc.protocol + "//" + loc.host, 51 | scheme: loc.protocol.substr(0, loc.protocol.indexOf(":")), 52 | pathBase: loc.protocol + "//" + loc.host + loc.pathname.substr(0, loc.pathname.lastIndexOf("/") + 1) 53 | }; 54 | let article; 55 | let id = util.makeUuid(); 56 | let index = 1; 57 | for (let el of document.getElementsByTagName("*")) { 58 | el.setAttribute("data-tmp-id", `${id}-${index}`); 59 | index++; 60 | } 61 | var documentClone = document.cloneNode(true); 62 | try { 63 | article = new Readability(uri, documentClone).parse(); 64 | if (article) { 65 | let newDiv = document.createElement("div"); 66 | newDiv.innerHTML = article.content; // eslint-disable-line no-unsanitized/property 67 | for (let el of newDiv.querySelectorAll("*[data-tmp-id]")) { 68 | let id = el.getAttribute("data-tmp-id"); 69 | let origEl = document.querySelector(`*[data-tmp-id='${id}']`); 70 | let found = false; 71 | let parent = origEl.parentNode; 72 | while (parent) { 73 | if (parent.getAttribute && parent.getAttribute("data-isreadable")) { 74 | found = true; 75 | break; 76 | } 77 | parent = parent.parentNode; 78 | } 79 | if (!found) { 80 | origEl.setAttribute("data-isreadable", "1"); 81 | } 82 | } 83 | } 84 | } catch (e) { 85 | log.warn("Exception getting readable version:", e); 86 | article = {error: String(e), errorStack: e.stack}; 87 | } 88 | for (let el of document.getElementsByTagName("*")) { 89 | el.removeAttribute("data-tmp-id"); 90 | } 91 | return article; 92 | } 93 | 94 | // Images smaller than either of these sizes are skipped: 95 | let MIN_IMAGE_WIDTH = 250; 96 | let MIN_IMAGE_HEIGHT = 200; 97 | 98 | /** Finds images in any of the given elements, avoiding duplicates 99 | Looks for Open Graph og:image, then img elements, sorting img 100 | elements by width (largest preferred) */ 101 | function findImages(elements) { 102 | let images = []; 103 | let found = {}; 104 | function addImage(imgData) { 105 | if (!(imgData && imgData.url)) { 106 | return; 107 | } 108 | if (found[imgData.url]) { 109 | return; 110 | } 111 | images.push(imgData); 112 | found[imgData.url] = true; 113 | } 114 | for (let i = 0; i < elements.length; i++) { 115 | let el = elements[i].element; 116 | if (!el) { 117 | continue; 118 | } 119 | let isReadable = elements[i].isReadable; 120 | let ogs = el.querySelectorAll("meta[property='og:image'], meta[name='twitter:image']"); 121 | let j; 122 | for (j = 0; j < ogs.length; j++) { 123 | let src = ogs[j].getAttribute("content"); 124 | let a = document.createElement("a"); 125 | a.href = src; 126 | src = a.href; 127 | if (src.search(/^https?/i) === -1) { 128 | continue; 129 | } 130 | addImage({ 131 | url: src 132 | }); 133 | } 134 | let imgs = el.querySelectorAll("img"); 135 | imgs = Array.prototype.slice.call(imgs); 136 | // Widest images first: 137 | imgs.sort(function(a, b) { 138 | if (a.width > b.width) { 139 | return -1; 140 | } 141 | return 1; 142 | }); 143 | for (j = 0; j < imgs.length; j++) { 144 | let img = imgs[j]; 145 | if ((!img.src) || (img.src.search(/^https?/i) === -1)) { 146 | continue; 147 | } 148 | if (img.width >= MIN_IMAGE_WIDTH && img.height >= 
MIN_IMAGE_HEIGHT) { 149 | addImage({ 150 | url: img.src, 151 | dimensions: {x: img.width, y: img.height}, 152 | title: img.getAttribute("title") || null, 153 | alt: img.getAttribute("alt") || null, 154 | isReadable 155 | }); 156 | } 157 | } 158 | } 159 | return images; 160 | } 161 | 162 | function findSiteName() { 163 | let el = document.querySelector("meta[property='og:site_name']"); 164 | if (el) { 165 | return el.getAttribute("content"); 166 | } 167 | // nytimes.com uses this property: 168 | el = document.querySelector("meta[name='cre']"); 169 | if (el) { 170 | return el.getAttribute("content"); 171 | } 172 | return null; 173 | } 174 | 175 | exports.documentStaticJson = async function() { 176 | let json = {}; 177 | Object.assign(json, exports.extractData()); 178 | Object.assign(json, await makeStaticHtml.documentStaticData()); 179 | if (buildSettings.setFreezeMarker) { 180 | let el = document.createElement("span"); 181 | el.id = "browsinglab-completed-freeze"; 182 | el.style.display = "none"; 183 | document.body.appendChild(el); 184 | } 185 | return json; 186 | }; 187 | 188 | return exports; 189 | 190 | })(); 191 | -------------------------------------------------------------------------------- /extension/scraper/scrapeTab.js: -------------------------------------------------------------------------------- 1 | /* globals util, log, buildSettings */ 2 | 3 | this.scrapeTab = (function() { 4 | 5 | let restrictiveCsp = "font-src 'none'; frame-src 'self' data:; object-src 'none'; worker-src 'none'; manifest-src 'none'"; 6 | 7 | let rssContentTypes = [ 8 | "application/rss+xml", 9 | "application/atom+xml", 10 | "application/rdf+xml", 11 | "application/rss", 12 | "application/atom", 13 | "application/rdf", 14 | "text/rss+xml", 15 | "text/atom+xml", 16 | "text/rdf+xml", 17 | "text/rss", 18 | "text/atom", 19 | "text/rdf", 20 | ]; 21 | 22 | async function scrapeTab(tabId, requireUrl) { 23 | let scraped = await scrapeTabDOM(tabId, requireUrl); 24 | await addRss(scraped); 25 | return scraped; 26 | } 27 | 28 | async function scrapeTabDOM(tabId, requireUrl) { 29 | let start = Date.now(); 30 | let foundUrl = await waitForStableTab(tabId); 31 | if (foundUrl !== requireUrl) { 32 | log.debug("Change", requireUrl, "to", foundUrl); 33 | throw new Error("URL changed from what was expected"); 34 | } 35 | for (let file of ["build/buildSettings.js", "log.js", "util.js", "elementToSelector.js", "rssFinder.js", "scraper/make-static-html.js", "scraper/Readability.js", "scraper/extractor-worker.js"]) { 36 | await browser.tabs.executeScript(tabId, {file}); 37 | } 38 | let resultList = await browser.tabs.executeScript(tabId, { 39 | code: "extractorWorker.documentStaticJson()" 40 | }); 41 | resultList[0].timeToFetch = Date.now() - start; 42 | return resultList[0]; 43 | } 44 | 45 | async function addRss(scraped) { 46 | if (scraped.allFeeds) { 47 | scraped.feeds = []; 48 | for (let feed of scraped.allFeeds) { 49 | scraped.feeds.push(await getFeed(feed, true)); 50 | } 51 | log.info("Scraped feeds:", scraped.feeds.length, "bytes:", JSON.stringify(scraped.feeds).length); 52 | } 53 | if (scraped.speculativeFeedLinks) { 54 | let found = 0; 55 | for (let feed of scraped.speculativeFeedLinks) { 56 | let fetched = await getFeed(feed, false); 57 | if (fetched) { 58 | found++; 59 | scraped.feeds.push(fetched); 60 | } else { 61 | feed.shouldDelete = true; 62 | } 63 | } 64 | log.info("Scraped feed links:", found, "of potential", scraped.speculativeFeedLinks.length); 65 | scraped.speculativeFeedLinks = 
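// Speculative links are only URL-pattern guesses; any whose response did not carry a
// recognized feed content type were marked shouldDelete above, so they are filtered
// out here (and the whole list is dropped if nothing survives).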
scraped.speculativeFeedLinks.filter(f => !f.shouldDelete); 66 | if (!scraped.speculativeFeedLinks.length) { 67 | delete scraped.speculativeFeedLinks; 68 | } 69 | } 70 | } 71 | 72 | async function getFeed(feed, ignoreContentType) { 73 | let start = Date.now(); 74 | let result = { 75 | url: feed.href, 76 | fetchStart: start, 77 | }; 78 | try { 79 | let resp = await fetch(feed.href); 80 | if (!resp.ok) { 81 | result.error = "Response error"; 82 | result.status = resp.status; 83 | result.statusCode = resp.statusCode; 84 | } else { 85 | result.body = await resp.text(); 86 | result.contentType = resp.headers.get("Content-Type").split(";")[0]; 87 | if (!ignoreContentType && !rssContentTypes.includes(result.contentType)) { 88 | return null; 89 | } 90 | result.lastModified = (new Date(resp.headers.get("Last-Modified"))).getTime(); 91 | } 92 | result.fetchTime = Date.now() - start; 93 | if (resp.url !== feed.href) { 94 | result.redirectUrl = resp.url; 95 | } 96 | return result; 97 | } catch (e) { 98 | log.error("Got error fetching feed", feed, e); 99 | result.fetchTime = Date.now() - start; 100 | result.error = String(e); 101 | result.errorStack = e.stack; 102 | return result; 103 | } 104 | } 105 | 106 | async function waitForStableTab(tabId, attempts = 3) { 107 | let originalUrl; 108 | let tab = await browser.tabs.get(tabId); 109 | originalUrl = tab.url; 110 | await waitForIdle(tabId); 111 | if (!attempts) { 112 | return tab.url; 113 | } 114 | await util.sleep(buildSettings.idleWaitTime); 115 | tab = await browser.tabs.get(tabId); 116 | if (tab.url !== originalUrl) { 117 | return waitForStableTab(tabId, attempts - 1); 118 | } 119 | return tab.url; 120 | } 121 | 122 | function waitForIdle(tabId) { 123 | return browser.tabs.executeScript(tabId, { 124 | code: "null", 125 | runAt: "document_start" 126 | }); 127 | } 128 | 129 | function installCsp() { 130 | let options = ["blocking", "responseHeaders"]; 131 | let filter = { 132 | types: ["main_frame"], 133 | urls: ["http://*/*", "https://*/*"], 134 | }; 135 | browser.webRequest.onHeadersReceived.addListener( 136 | cspHeaderRewriter, 137 | filter, 138 | options, 139 | ); 140 | return () => { 141 | browser.webRequest.onHeadersReceived.removeListener( 142 | cspHeaderRewriter, 143 | filter, 144 | options, 145 | ); 146 | }; 147 | } 148 | 149 | function cspHeaderRewriter(info) { 150 | let headers = info.responseHeaders; 151 | for (let i = 0; i < headers.length; i++) { 152 | let name = headers[i].name.toLowerCase(); 153 | if (name === "content-security-policy" || name === "content-security-policy-report-only") { 154 | headers.splice(i, 1); 155 | i--; 156 | } 157 | } 158 | headers.push({ 159 | name: "Content-Security-Policy", 160 | value: restrictiveCsp, 161 | }); 162 | return {"responseHeaders": headers}; 163 | } 164 | 165 | if (buildSettings.cspRestrict) { 166 | installCsp(); 167 | log.info("Installed CSP adder for all requests"); 168 | } 169 | 170 | return scrapeTab; 171 | })(); 172 | -------------------------------------------------------------------------------- /extension/util.js: -------------------------------------------------------------------------------- 1 | this.util = (function() { 2 | let exports = {}; 3 | 4 | exports.sleep = function(time) { 5 | return new Promise((resolve) => { 6 | setTimeout(resolve, time); 7 | }); 8 | }; 9 | 10 | exports.makeUuid = function() { // eslint-disable-line no-unused-vars 11 | // get sixteen unsigned 8 bit random values 12 | let randomValues = window 13 | .crypto 14 | .getRandomValues(new Uint8Array(36)); 15 | 16 | 
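// Each "x"/"y" placeholder is replaced using the random byte at the same string offset
// (the second-to-last argument passed to the replace callback); the literal "4" in the
// template supplies the UUID version, and "y" is masked to 8-b for the variant bits.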
return "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, function(c) { 17 | let i = Array.prototype.slice.call(arguments).slice(-2)[0]; // grab the `offset` parameter 18 | let r = randomValues[i] % 16|0, v = c === "x" ? r : (r & 0x3 | 0x8); 19 | return v.toString(16); 20 | }); 21 | }; 22 | 23 | return exports; 24 | })(); 25 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python3 -m venv .venv 4 | ./.venv/bin/pip install --upgrade pip 5 | ./.venv/bin/pip install -e . 6 | ./.venv/bin/pip install -r ./dev-requirements.txt 7 | if [[ ! -e blab ]] ; then 8 | ln -s ./.venv/bin/blab . 9 | fi 10 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "personal-history-archive", 3 | "description": "A server and browser extension for saving a personal archive", 4 | "version": "0.1.0", 5 | "author": "Ian Bicking ", 6 | "bugs": { 7 | "url": "https://github.com/ianb/personal-history-archive/issues" 8 | }, 9 | "dependencies": { 10 | "react": "^16.3.1", 11 | "react-dom": "^16.3.1", 12 | "readability": "git+https://github.com/mozilla/readability.git", 13 | "webextension-polyfill": "^0.2.1" 14 | }, 15 | "devDependencies": { 16 | "babel-cli": "^6.26.0", 17 | "babel-preset-env": "^1.6.1", 18 | "babel-preset-react": "^6.24.1", 19 | "cookie-parser": "^1.4.3", 20 | "eslint-plugin-mozilla": "^0.4.10", 21 | "eslint-plugin-no-unsanitized": "^2.0.2", 22 | "eslint-plugin-promise": "^3.6.0", 23 | "eslint-plugin-react": "^7.12.4", 24 | "firefox-profile": "^1.2.0", 25 | "geckodriver": "^1.14.1", 26 | "mocha": "^5.0.4", 27 | "node-feedparser": "^1.0.1", 28 | "random-seed": "^0.3.0", 29 | "selenium-webdriver": "^4.0.0-alpha.1", 30 | "web-ext": "^2.9.3" 31 | }, 32 | "homepage": "https://github.com/ianb/personal-history-archive", 33 | "license": "MPL-2.0", 34 | "repository": { 35 | "type": "git", 36 | "url": "git+https://github.com/ianb/personal-history-archive.git" 37 | }, 38 | "scripts": { 39 | "start": "npm run build:dev && web-ext run -f \"${FIREFOX:-nightly}\" -s extension/ --keep-profile-changes --firefox-profile dev-data/Profile --browser-console", 40 | "build:dev": "mkdir -p dev-data && npm run build:deps && LOG_LEVEL=debug SERVER_LOG_LEVEL=debug TESTING_BROWSER=1 DEST=dev-data npm run build:ext && mkdir -p dev-data/Profile", 41 | "lint:js": "eslint .", 42 | "build:zip": "web-ext build --source-dir ${DEST:-build}/extension/ --ignore-files '**/README.md' --ignore-files '**/*.template' --overwrite-dest && mv web-ext-artifacts/personal_history_saver*.zip ${DEST:-build}/extension.zip && rmdir web-ext-artifacts || true", 43 | "build:ext": "mkdir -p extension/build && .venv/bin/python -m browsinglab.subenvvars < extension/buildSettings.js.tmpl > extension/build/buildSettings.js", 44 | "build:deps": "mkdir -p extension/build/ && cp node_modules/react/umd/react.production.min.js node_modules/react-dom/umd/react-dom.production.min.js node_modules/readability/Readability.js extension/build/ && babel --retain-lines extension/controls/popup.jsx > extension/build/popup.js", 45 | "test": "npm run test:build-data && npm run test:build-ext && npm run test:selenium", 46 | "test:build-data": "rm -rf test/test-data/ && mkdir -p test/test-data/", 47 | "test:build-ext": "NATIVE_SCRIPT=pha.saver.test IDLE_WAIT_TIME=0 HISTORY_PAUSE=0 
LOG_LEVEL=debug SERVER_LOG_LEVEL=debug TESTING_BROWSER=1 DEST=test/build/ npm run build:ext && DEST=test/build/ npm run build:zip && .venv/bin/python -m pha.saver --script-location test/test-data/pha-saver-script.py --native-name pha.saver.test test/test-data/", 48 | "test:selenium": "PATH=$PATH:/Applications/FirefoxNightly.app/Contents/MacOS/ mocha test/test.js $TEST_ARGS" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Python Library 2 | 3 | To install: 4 | 5 | ```sh 6 | $ pip install -e python/ 7 | # Optional requirements: 8 | $ pip install -r python/requirements.txt 9 | ``` 10 | 11 | ## Usage 12 | 13 | You'll probably want to get an instance of Archive: 14 | 15 | ``` 16 | from pha import Archive 17 | archive = Archive.default_location() 18 | ``` 19 | 20 | Or `Archive(path)`, but normal installation always puts the data into the `data/` directory. 21 | 22 | The key objects are all implemented in [`__init__.py`](./pha/__init__.py): `Archive`, `Activity`, and `Page`. 23 | 24 | * `Activity` is one visit in the browser. This includes any changes to the location hash. This represents both old activity fetched from browser history (from [`HistoryItem`](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/HistoryItem) and [`VisitItem`](https://developer.mozilla.org/en-US/Add-ons/WebExtensions/API/history/VisitItem)), as well as new activity (with more complete information available). 25 | * `Page` is a fetched page. By default only one version a page will be created for a given URL (though the code/database allows for multiple pages fetched over time). A page is both stored in the database, as well as in a JSON file in `data/pages/` (the library tries to be resilient when the two sources don't match). 26 | 27 | Note that URLs *do* include the fragment/hash, so `http://example.com/` and `http://example.com/#header` are treated as different. 28 | 29 | Typically you'll call: 30 | 31 | * `archive.get_activity(url)`: get a list of activities for the URL 32 | * `archive.activity()`: get a list of ALL activities 33 | * `archive.activity_with_page()`: get a list of all activity that also have a fetched page 34 | * `archive.sample_activity_with_page(number, unique_url=True, unique_domain=False)`: fetch a random sample of pages. Because there tend to be *lots* of pages from some domains (e.g., gmail.com) this tries to get a sampling of "unique" pages. If you ask for `unique_url` then it will look at the entire URL, normalize segments of the URL, and treat number and non-number segments differently. So it would include a homepage and an article page, but probably not multiple article pages from the same site. `unique_domain` gets only one page per domain. 35 | * `archive.get_activity_by_source(activity.id)`: get every activity that came from the given activity (typically through navigation). 36 | 37 | ### Pages 38 | 39 | You might spend most of your time with the Page objects, at least if you are interested in content parsing and interpretation. 40 | 41 | A few highlights: 42 | 43 | * `page.html`: returns a viewable HTML representation of the page. 44 | * `page.lxml`: returns the page, having been parsed with [lxml.html](http://lxml.de/lxmlhtml.html). 45 | * `page.full_text`: tries to get the text of page. 
46 | * `page.readable_text`: if the page was parseable with [Readability](https://github.com/mozilla/readability) then this will contain the text extracted as part of the article view (excluding navigation, etc). 47 | * `page.readable_html`: an HTML view of the readable portion of the page. 48 | * `page.display_page()`: run in a Jupyter Notebook, this will show the page in an iframe (see also `notebooktools`). 49 | 50 | ## Helpers 51 | 52 | There's several helper modules: 53 | 54 | * [`glovehelper`](./pha/glovehelper.py): helps with calling [GloVe](https://nlp.stanford.edu/projects/glove/). You must install and build the code from that site. The helper lets you pass in a sequence of strings and get vectors back. See [the analyze_classnames notebook](./analyze_classnames.ipynb) for an example. 55 | * [`htmltools`](./pha/htmltools.py): this includes various little functions to help you work with the HTML. Look at [analyze_classnames](./analyze_classnames.ipynb) for examples. 56 | * [`notebooktools`](./pha/notebooktools.py): other tools for working in Jupyter Notebooks. It's used to show inline HTML. 57 | * [`search`](./pha/search.py): creates a search index of your pages. You need the SQLite [FTS5](https://sqlite.org/fts5.html) extension installed. See [the search_example notebook](./search_example.ipynb) for more. 58 | * [`summarytools`](./pha/summarytools.py): some small helpers for doing document summarization. See [the document_summary notebook](./document_summary.ipynb) for more. 59 | 60 | ## Notebooks 61 | 62 | I'm collecting notebooks in this directory as examples, and hopefully they'll grow into simultaneously documentation and interesting data interpretation. It would be cool to have more! 63 | -------------------------------------------------------------------------------- /python/analyze_classnames.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 3, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "%autoreload\n", 27 | "import pha\n", 28 | "import pha.htmltools\n", 29 | "archive = pha.Archive.default_location()\n", 30 | "print(archive)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "14995\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "histories = archive.histories_with_page()\n", 48 | "print(len(histories))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 6, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "" 60 | ] 61 | }, 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "histories[0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 7, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from collections import Counter\n", 78 | "\n", 79 | "def count_classes(doc):\n", 80 | " counter = Counter()\n", 81 | " for el in doc.cssselect(\"*[class]\"):\n", 82 | " for phrase in pha.htmltools.normalize_classes(el):\n", 83 | " counter[phrase] += 1\n", 84 | " return counter" 85 | ] 86 | }, 87 | { 
88 | "cell_type": "code", 89 | "execution_count": 8, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "base_counter = Counter()\n", 94 | "by_doc = Counter()\n", 95 | "for history in histories:\n", 96 | " c = count_classes(history.page.lxml)\n", 97 | " base_counter.update(c)\n", 98 | " by_doc.update(c.keys())" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 9, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "(280151, 280151)" 110 | ] 111 | }, 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "len(base_counter), len(by_doc)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "Total counts: [('blank-may', 115345), ('noncollaps', 122856), ('reportform', 127379), ('thing', 127395), ('child', 127401), ('entri', 127438), ('parent', 127660), ('flat-list', 128492), ('taglin', 128495), ('-gb', 143091), ('button', 163053), ('bylink', 169683), ('arrow', 205311), ('scope-style', 215843), ('ctrl-f-no', 253178), ('clearleft', 254758), ('score', 299354), ('unvot', 328883), ('access-requir', 503082), ('login-requir', 602254)]\n", 131 | "By document: [('clear', 1513), ('js', 1548), ('hidden', 1553), ('undefin', 1604), ('comment', 1707), ('md', 1751), ('col', 1752), ('link', 1784), ('activ', 1858), ('titl', 1948), ('author', 2014), ('dropdown', 2113), ('footer', 2136), ('button', 2155), ('select', 2354), ('fit-shrink-to', 2396), ('btn', 2534), ('contain', 2539), ('icon', 2632), ('content', 3173)]\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "print(\"Total counts:\", sorted(base_counter.items(), key=lambda x: x[1])[-20:])\n", 137 | "print(\"By document:\", sorted(by_doc.items(), key=lambda x: x[1])[-20:])" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## Prepare classes to be vectorized\n", 145 | "\n", 146 | "This creates one long file that has all the concatenated stemmed class names for all documents. 
This is reasonable for training different embedding vectors (mapping class names to vectors of floats):" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 52, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%autoreload\n", 156 | "import pha.glovehelper\n", 157 | "pha.glovehelper.set_glove_path(\"/Users/ianbicking/src/personal-history-archive/tmp/GloVe\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 53, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "#: 23005752 Mb: 232\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "%autoreload\n", 175 | "import pha.htmltools\n", 176 | "import random\n", 177 | "shuffled_histories = list(histories)\n", 178 | "random.shuffle(shuffled_histories)\n", 179 | "all_classes = []\n", 180 | "for history in shuffled_histories:\n", 181 | " for el in history.page.lxml.iter():\n", 182 | " classes = pha.htmltools.normalize_classes(el, shuffle=True)\n", 183 | " if classes:\n", 184 | " all_classes.extend(classes)\n", 185 | " else:\n", 186 | " all_classes.append(\"no-class\")\n", 187 | "print(\"#:\", len(all_classes), \"Mb:\", len(\" \".join(all_classes)) // 1000000)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 54, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "['dtlwc-report-t', 'bqe-id-t', 'nj', 'drjof-id-t', '--c-waypoint-waypoint-xsk', 'wi', 'amphtml-i-interfac-video', 'navig-target', 'i', 'aafa-sx']\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "from pha.glovehelper import vectorize\n", 205 | "class_vectors = vectorize(\n", 206 | " all_classes, 50)\n", 207 | "print(list(class_vectors.keys())[:10])" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 55, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "#: 15403930 Mb: 60\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "tag_shuffled_histories = list(histories)\n", 225 | "random.shuffle(tag_shuffled_histories)\n", 226 | "all_tags = []\n", 227 | "for history in shuffled_histories:\n", 228 | " for el in history.page.lxml.iter():\n", 229 | " all_tags.append(el.tag)\n", 230 | "print(\"#:\", len(all_tags), \"Mb:\", len(\" \".join(all_tags)) // 1000000)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 56, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Number of tags: 588\n", 243 | "Removed 223 tags, with: 365 left\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "from collections import Counter\n", 249 | "tag_counter = Counter(all_tags)\n", 250 | "print(\"Number of tags:\", len(tag_counter))\n", 251 | "removed = 0\n", 252 | "for tag, count in tag_counter.most_common():\n", 253 | " if count <= 5:\n", 254 | " removed += 1\n", 255 | " all_tags.remove(tag)\n", 256 | " del tag_counter[tag]\n", 257 | "print(\"Removed\", removed, \"tags, with:\", len(tag_counter), \"left\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 57, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "tag_vectors = vectorize(\n", 267 | " all_tags, 20)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 58, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "import json\n", 277 | 
"json.dump({\"classes\": class_vectors, \"tags\": tag_vectors}, open(\"html-vectors.json\", \"w\"))" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "Python 3", 291 | "language": "python", 292 | "name": "python3" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.5.1" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /python/pha/__main__.py: -------------------------------------------------------------------------------- 1 | from . import Archive 2 | 3 | if __name__ == "__main__": 4 | import sys 5 | archive = Archive.default_location() 6 | print("Archive:", archive) 7 | if sys.argv[1:]: 8 | history = archive.get_history(sys.argv[1]) 9 | page = history.page 10 | print("History:", history, history.visits) 11 | print("Page:", page) 12 | print("HTML:\n", page.html) 13 | -------------------------------------------------------------------------------- /python/pha/glovehelper.py: -------------------------------------------------------------------------------- 1 | """Simple wrapper for GloVe: https://nlp.stanford.edu/projects/glove/ 2 | 3 | Runs the scripts and produces vector output""" 4 | 5 | import tempfile 6 | import os 7 | import subprocess 8 | 9 | default_glove_path = None 10 | 11 | 12 | def set_glove_path(value): 13 | """ 14 | Sets the path where we can find GloVe installed, for all future calls to vectorize. 15 | """ 16 | global default_glove_path 17 | default_glove_path = value 18 | 19 | 20 | def vectorize( 21 | corpus, 22 | vector_size=50, 23 | *, 24 | glove_path=None, 25 | debug_print=False, 26 | vocab_min_count=5, 27 | window_size=15): 28 | """ 29 | Takes a corpus (list of words, or one big string with spaces separating words) and creates a mapping from words to vectors. 30 | 31 | This calls the scripts in GloVe and processes the results, it doesn't implement any vectorization itself. 
32 | """ 33 | glove_path = glove_path or default_glove_path 34 | if not os.path.exists(glove_path): 35 | raise OSError("No such directory: %s" % glove_path) 36 | if os.path.exists(os.path.join(glove_path, "build")): 37 | glove_path = os.path.join(glove_path, "build") 38 | if not isinstance(corpus, (str, bytes)): 39 | corpus = " ".join(corpus) 40 | if isinstance(corpus, str): 41 | corpus = corpus.encode("UTF-8") 42 | with tempfile.TemporaryDirectory() as dirname: 43 | if debug_print: 44 | print("Temporary directory:", dirname) 45 | vocab_file = os.path.join(dirname, "vocab.txt") 46 | with open(vocab_file, "wb") as fp: 47 | proc = _exec([ 48 | os.path.join(glove_path, "vocab_count"), 49 | "-min-count", str(vocab_min_count), 50 | "-verbose", "2"], 51 | input=corpus, 52 | debug_print=debug_print) 53 | fp.write(proc.stdout) 54 | proc = _exec([ 55 | os.path.join(glove_path, "cooccur"), 56 | "-memory", "4.0", 57 | "-vocab-file", vocab_file, 58 | "-window-size", str(window_size)], 59 | input=corpus, 60 | debug_print=debug_print) 61 | cooccur_data = proc.stdout 62 | cooccur_file = os.path.join(dirname, "coocur.txt") 63 | with open(cooccur_file, "wb") as fp: 64 | proc = _exec([ 65 | os.path.join(glove_path, "shuffle"), 66 | "-memory", "4.0"], 67 | input=cooccur_data, 68 | debug_print=debug_print) 69 | fp.write(proc.stdout) 70 | save_file = os.path.join(dirname, "vectors.txt") 71 | proc = _exec([ 72 | os.path.join(glove_path, "glove"), 73 | "-save-file", os.path.splitext(save_file)[0], 74 | "-threads", "8", 75 | "-input-file", cooccur_file, 76 | "-x-max", "10", 77 | "-iter", "15", 78 | "-vector-size", str(vector_size), 79 | "-binary", "2", 80 | "-vocab-file", vocab_file], 81 | debug_print=debug_print) 82 | result = {} 83 | with open(save_file, "r", encoding="UTF-8") as fp: 84 | for line in fp.readlines(): 85 | line = line.strip().split() 86 | name = line[0] 87 | result[name] = [float(n) for n in line[1:]] 88 | return result 89 | 90 | 91 | def _exec(command, input=None, debug_print=False): 92 | if isinstance(input, str): 93 | input = input.encode("UTF-8") 94 | proc = subprocess.run(command, check=True, input=input, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 95 | if debug_print: 96 | print(" ".join(command)) 97 | if input: 98 | print("Input: %s bytes" % len(input)) 99 | print("Output: %s bytes" % len(proc.stdout)) 100 | if proc.stderr: 101 | print(proc.stderr.decode("UTF-8").rstrip()) 102 | print("---------------------------------------------") 103 | return proc 104 | -------------------------------------------------------------------------------- /python/pha/htmltools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some helpers for use with HTML. 
3 | 4 | Mostly normalize_classes() 5 | """ 6 | import re 7 | import random 8 | from nltk.stem import PorterStemmer 9 | import lxml 10 | from urllib.parse import urlparse, parse_qsl 11 | 12 | mixed_regex = re.compile(r'([a-z])([A-Z])') 13 | non_char_regex = re.compile(r'[^a-z\-]', re.I) 14 | stemmer = PorterStemmer() 15 | 16 | 17 | def wordify_class(c): 18 | """Changes a class into a set of words""" 19 | c = mixed_regex.sub(r"\1-\2", c) 20 | c = c.replace("_", "-") 21 | c = non_char_regex.sub("", c) 22 | c = c.strip("-") 23 | return "-".join(c.lower().split("-")) 24 | 25 | 26 | def stem_words(c): 27 | return "-".join([stemmer.stem(w) for w in c.split("-")]) 28 | 29 | 30 | def sort_words(c): 31 | return "-".join(sorted(c.split("-"))) 32 | 33 | 34 | def normalize_classes(c, shuffle=False): 35 | """Takes an HTML class attribute (or element) and returns a normalized form of the classes: 36 | 37 | * Each class name is split into "words", either based on dashes or mixed case 38 | * Numbers are removed 39 | * Each word is stemmed 40 | * The words are sorted 41 | * They are combined back using dashes. 42 | 43 | If `shuffle` is true, then (if there is more than one class), the classes will be randomly shuffled. 44 | """ 45 | if isinstance(c, lxml.etree.ElementBase): 46 | c = c.get("class") 47 | if not c: 48 | return [] 49 | if isinstance(c, str): 50 | c = c.split() 51 | result = list(filter(None, [sort_words(stem_words(wordify_class(a_class))) for a_class in c])) 52 | if shuffle and len(result) > 1: 53 | random.shuffle(result) 54 | return result 55 | 56 | 57 | www_regex = re.compile(r'^www[0-9]*\.') 58 | number_regex = re.compile(r'^[0-9]+$') 59 | hex_only = re.compile(r'^[a-f0-9]+$', re.I) 60 | 61 | 62 | def _url_ignore_word(w): 63 | return w.strip() and number_regex.search(w) or (len(w) > 10 and hex_only.search(w)) 64 | 65 | 66 | def url_words(url): 67 | """ 68 | Tries to reduce a URL to a set of "words" that define the URL. This leaves out numbers, 69 | things that look like hex tokens, and the TLD. 70 | 71 | Typically used for searchable full text indexing of the URL. 
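    For example, with a made-up URL:

        url_words("https://www.example.com/2019/some-article?tag=python")
        # keeps words like "example", "some-article", "tag" and "python",
        # while the "www." prefix, the numeric year, and the ".com" TLD are dropped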
72 | """ 73 | result = [] 74 | parsed = urlparse(url) 75 | hostname = parsed.hostname 76 | hostname = www_regex.sub("", hostname) 77 | hostname_parts = hostname.split(".") 78 | if len(hostname_parts) > 1: 79 | # Strip the TLD 80 | hostname_parts = hostname_parts[:-1] 81 | result.extend(hostname_parts) 82 | path = parsed.path.split("/") 83 | path = [p for p in path if not _url_ignore_word(p)] 84 | result.extend(path) 85 | if not _url_ignore_word(parsed.fragment or ""): 86 | result.append(parsed.fragment) 87 | query = parse_qsl(parsed.query) 88 | for name, value in query: 89 | if not _url_ignore_word(value): 90 | result.extend([name, value]) 91 | return result 92 | 93 | 94 | DEFAULT_DISPLAY = { 95 | "a": "inline", 96 | "applet": "inline", 97 | "article": "block", 98 | "area": "none", 99 | "audio": "none", 100 | "base": "none", 101 | "basefont": "none", 102 | "bgsound": "inline", 103 | "blockquote": "block", 104 | "body": "flex", 105 | "br": "inline", 106 | "button": "inline-block", 107 | "canvas": "inline", 108 | "col": "table-column", 109 | "colgroup": "table-column-group", 110 | "del": "inline", 111 | "details": "block", 112 | "dir": "block", 113 | "div": "block", 114 | "dl": "block", 115 | "embed": "inline", 116 | "fieldset": "block", 117 | "footer": "block", 118 | "font": "inline", 119 | "form": "block", 120 | "frame": "inline", 121 | "frameset": "block", 122 | "h1": "block", 123 | "h2": "block", 124 | "h3": "block", 125 | "h4": "block", 126 | "h5": "block", 127 | "h6": "block", 128 | "head": "none", 129 | "hr": "block", 130 | "iframe": "inline", 131 | "img": "inline", 132 | "input": "inline", 133 | "ins": "inline", 134 | "isindex": "inline", 135 | "label": "inline", 136 | "li": "list-item", 137 | "link": "none", 138 | "nav": "block", 139 | "map": "inline", 140 | "marquee": "inline-block", 141 | "menu": "block", 142 | "meta": "none", 143 | "meter": "inline-block", 144 | "object": "inline", 145 | "ol": "block", 146 | "optgroup": "block", 147 | "option": "block", 148 | "output": "inline", 149 | "p": "block", 150 | "param": "none", 151 | "pre": "block", 152 | "progress": "inline-block", 153 | "q": "inline", 154 | "script": "none", 155 | "select": "inline-block", 156 | "source": "inline", 157 | "span": "inline", 158 | "style": "none", 159 | "table": "table", 160 | "tbody": "table-row-group", 161 | "td": "table-cell", 162 | "textarea": "inline", 163 | "tfoot": "table-footer-group", 164 | "title": "none", 165 | "th": "table-cell", 166 | "thead": "table-header-group", 167 | "time": "inline", 168 | "tr": "table-row", 169 | "track": "inline", 170 | "ul": "block", 171 | "video": "inline" 172 | } 173 | 174 | blockish_display_values = ["block", "table-cell", "table", "flex", "list-item"] 175 | 176 | 177 | def _make_blockish_selector(): 178 | blockish_elements = set() 179 | for tagname, display_value in DEFAULT_DISPLAY.items(): 180 | if display_value in blockish_display_values: 181 | blockish_elements.add(tagname) 182 | blockish_selectors = ', '.join( 183 | '%s:not([data-display])' % tagname for tagname in sorted(blockish_elements)) 184 | extra_selectors = ', '.join( 185 | "*[data-display='%s']" % display for display in sorted(blockish_display_values)) 186 | return "%s, %s" % (blockish_selectors, extra_selectors) 187 | 188 | 189 | blockish_selector = _make_blockish_selector() 190 | 191 | 192 | def iter_block_level_elements(el): 193 | return el.cssselect(blockish_selector) 194 | 195 | 196 | def iter_block_level_text(el): 197 | """ 198 | Goes through the document, returning `[(text, element), ...]` for 
block-level elements. 199 | When block-level elements are nested, the text of the outer element only includes text that 200 | isn't in an inner element. Elements that have no text or only whitespace text are omitted. 201 | """ 202 | for child in el.iter(): 203 | if not is_blockish(child): 204 | continue 205 | text_chunks = get_unblockish_text(child) 206 | text_chunks = [s.strip() for s in text_chunks if s and s.strip()] 207 | if text_chunks: 208 | yield (' '.join(text_chunks), child) 209 | 210 | 211 | def is_blockish(el): 212 | display = el.get("data-display") or DEFAULT_DISPLAY.get(el.tag, "block") 213 | return display in blockish_display_values 214 | 215 | 216 | def get_unblockish_text(el): 217 | chunks = [el.text] 218 | for child in el: 219 | if not is_blockish(child): 220 | chunks.extend(get_unblockish_text(child)) 221 | chunks.append(child.tail) 222 | return chunks 223 | 224 | 225 | def element_to_css(el): 226 | """ 227 | Create a CSS selector that will select the given element 228 | """ 229 | singleton_elements = ["body", "head"] 230 | parts = [] 231 | context = el 232 | while True: 233 | if context.tag in singleton_elements: 234 | parts.insert(0, context.tag) 235 | break 236 | if context.get("id"): 237 | parts.insert(0, "#" + context.get("id")) 238 | break 239 | parent = context.getparent() 240 | position = parent.index(context) 241 | parts.insert(0, "*:nth-child(%s)" % (position + 1)) 242 | context = parent 243 | return " > ".join(parts) 244 | -------------------------------------------------------------------------------- /python/pha/notebooktools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for use in Jupyter Notebooks, especially display_html() 3 | """ 4 | import base64 5 | from IPython.core.display import display, HTML 6 | from cgi import escape as html_escape 7 | import lxml.etree 8 | import time 9 | import os 10 | import shutil 11 | from urllib.request import urlopen 12 | 13 | 14 | def make_data_url(content_type, content): 15 | encoded = base64.b64encode(content.encode('UTF-8')).decode('ASCII') 16 | return 'data:%s;base64,%s' % (content_type, encoded.replace('\n', '')) 17 | 18 | 19 | def display_html(html_page, header='', footer='', height="12em", title=None, link=None, link_title=None): 20 | """ 21 | Display an HTML page inline in a Jupyter notebook. 22 | 23 | The page will go in an iframe. The header and footer are optional extra HTML. The title, link, and link_title are all used as part of a header. 24 | """ 25 | if isinstance(html_page, lxml.etree.ElementBase): 26 | html_page = lxml.html.tostring(html_page) 27 | if isinstance(html_page, bytes): 28 | html_page = html_page.decode("UTF-8") 29 | literal_data = make_data_url("text/html", html_page) 30 | if title: 31 | if link and not link_title: 32 | title = '<strong>%s</strong> <a href="%s" target="_blank">link</a>' % ( 33 | html_escape(title), html_escape(link)) 34 | elif link: 35 | title = '<strong>%s</strong> <a href="%s" target="_blank">%s</a>' % ( 36 | html_escape(title), html_escape(link), html_escape(link_title)) 37 | else: 38 | title = '<strong>%s</strong>' % html_escape(title) 39 | header = title + "\n" + header 40 | if header: 41 | header = '
<div>%s</div>' % header 42 | if footer: 43 | footer = '<div>%s</div>' % footer 44 | html = ''' 45 | <div> 46 | %s 47 | <iframe style="width: 100%%; height: %s; border: none" src="%s"></iframe> 48 | %s 49 | </div>
50 | ''' % (header, html_escape(height), literal_data, footer) 51 | display(HTML(html)) 52 | 53 | 54 | def tag(t, c=None, **attrs): 55 | content = c or None 56 | for key, value in list(attrs.items()): 57 | if value is None: 58 | continue 59 | if key.startswith("style_"): 60 | name = key[len("style_"):] 61 | name = name.replace("_", "-") 62 | existing = attrs.get("style") 63 | if existing: 64 | attrs["style"] = "%s; %s: %s" % (existing, name, value) 65 | else: 66 | attrs["style"] = "%s: %s" % (name, value) 67 | del attrs[key] 68 | attrs = [ 69 | ' %s="%s"' % (html_escape(key), html_escape(str(value))) 70 | for key, value in sorted(attrs.items()) 71 | if value is not None 72 | ] 73 | start = '<%s%s' % ( 74 | t, 75 | "".join(attrs), 76 | ) 77 | if content: 78 | if isinstance(content, (list, tuple)): 79 | content = "".join(content) 80 | return "%s>%s" % (start, content, t) 81 | else: 82 | return "%s />" % (start) 83 | 84 | 85 | class Image: 86 | 87 | def __init__(self, src_or_metadata, max_height='100px'): 88 | if isinstance(src_or_metadata, str): 89 | self.metadata = {"href": src_or_metadata} 90 | else: 91 | self.metadata = src_or_metadata 92 | self.max_height = max_height 93 | 94 | def _repr_html_(self): 95 | src = self.metadata.get("src") or self.metadata.get("href") 96 | return tag( 97 | "img", 98 | src=src, 99 | alt=src, 100 | style_max_height=self.max_height, 101 | style_width="auto", 102 | width=self.metadata.get("width"), 103 | height=self.metadata.get("height"), 104 | ) 105 | 106 | 107 | class Link: 108 | 109 | def __init__(self, url, title=None, domain=False): 110 | if domain is True: 111 | from . import domain 112 | if title: 113 | title = "%s (%s)" % (title, domain(url)) 114 | else: 115 | title = domain(url) 116 | if not title: 117 | title = url 118 | self.url = url 119 | self.title = title 120 | 121 | def _repr_html_(self): 122 | return tag("a", href=self.url, target="_blank", c=html_escape(self.title)) 123 | 124 | 125 | class Table: 126 | 127 | def __init__(self, rows, header=None, max_height=None): 128 | self.rows = list(rows) 129 | self.header = header 130 | self.max_height = max_height 131 | if rows and self.header is None: 132 | first_row = rows[0] 133 | if isinstance(first_row, dict): 134 | self.header = sorted(first_row.keys()) 135 | 136 | def _repr_html_(self): 137 | if not self.rows: 138 | return '(No records)' 139 | rows = [] 140 | if self.header: 141 | rows.append(tag("tr", [ 142 | tag("th", c=h) for h in self.header 143 | ])) 144 | for row in self.rows: 145 | if isinstance(row, dict): 146 | row = [row[h] for h in self.header] 147 | values = [ 148 | c._repr_html_() if hasattr(c, "_repr_html_") else html_escape(str(c)) 149 | for c in row 150 | ] 151 | rows.append(tag("tr", [ 152 | tag("td", v) for v in values 153 | ])) 154 | table = tag("table", style_overflow="scroll-y", style_max_height=self.max_height, c=rows) 155 | if self.max_height: 156 | return tag( 157 | "div", 158 | table, 159 | style_overflow="scroll", 160 | style_max_height=self.max_height, 161 | style_border="box-shadow: 5px 10px 18px #888888", 162 | style_border_radius="3px", 163 | ) 164 | return table 165 | 166 | 167 | chooser_id = int(time.time()) 168 | 169 | 170 | def display_chooser(links, height="12em"): 171 | display(HTML(display_chooser_html(links, height=height))) 172 | 173 | 174 | def display_chooser_html(links, height="12em"): 175 | global chooser_id 176 | if not links: 177 | return '
<div>Nothing to choose from</div>' 178 | chooser_id, my_id = chooser_id + 1, "chooser-%s" % chooser_id 179 | links_html = [] 180 | for link in links: 181 | if isinstance(link, str): 182 | link = {"src": link} 183 | if not link.get("title"): 184 | link["title"] = link["src"] 185 | links_html.append(''' 186 | <button onclick="document.getElementById('%s').src = %s">%s</button> 187 | ''' % ( 188 | my_id, 189 | html_escape(repr(link["src"])), 190 | html_escape(link["title"]), 191 | )) 192 | return '''\ 193 | <div> 194 | %(links)s 195 | <iframe id="%(id)s" style="width: 100%%; height: %(height)s; border: none"></iframe> 196 | </div>
197 | ''' % dict( 198 | id=my_id, 199 | links=' '.join(links_html), 200 | height=height, 201 | ) 202 | 203 | 204 | def lazyget(url, filename): 205 | if os.path.exists(filename): 206 | if os.path.getsize(filename): 207 | print("File", filename, "already exists") 208 | return 209 | else: 210 | print("File", filename, "is empty; overwriting") 211 | dirname = os.path.dirname(filename) 212 | if not os.path.exists(dirname): 213 | print("Creating directory %s/" % dirname) 214 | os.makedirs(dirname) 215 | with urlopen(url) as resp: 216 | try: 217 | length = int(resp.getheader("Content-Length")) // 1000 218 | length = "%skb" % length 219 | except ValueError: 220 | length = "unknown size" 221 | print("Reading %s into %s..." % (length, filename), end="") 222 | with open(filename, "wb") as fp: 223 | shutil.copyfileobj(resp, fp) 224 | print(" done.") 225 | -------------------------------------------------------------------------------- /python/pha/saver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements saving information into the database/files 3 | """ 4 | 5 | import os 6 | import re 7 | import stat 8 | import json 9 | import sys 10 | import struct 11 | import time 12 | import pprint 13 | import traceback 14 | import uuid 15 | from . import Page 16 | 17 | message_handlers = {} 18 | 19 | 20 | def addon(func): 21 | message_handlers[func.__name__] = func 22 | return func 23 | 24 | 25 | @addon 26 | def add_history_list(archive, *, browserId, sessionId, historyItems): 27 | visits_to_ids = {} 28 | for history in historyItems.values(): 29 | for visitId, visit in history["visits"].items(): 30 | visits_to_ids[visitId] = visit["activity_id"] = str(uuid.uuid1()) 31 | for historyId, history in historyItems.items(): 32 | c = archive.conn.cursor() 33 | for visitId, visit in history["visits"].items(): 34 | c.execute(""" 35 | DELETE FROM activity WHERE browserVisitId = ? 36 | """, (visitId,)) 37 | sourceId = None 38 | if visit.get("referringVisitId"): 39 | sourceId = visits_to_ids.get(visit["referringVisitId"]) 40 | if not sourceId: 41 | c.execute(""" 42 | SELECT id FROM activity WHERE browserVisitId = ? 43 | """, (visit["referringVisitId"],)) 44 | row = c.fetchone() 45 | if row: 46 | sourceId = row.id 47 | c.execute(""" 48 | INSERT INTO activity ( 49 | id, 50 | title, 51 | browserId, 52 | sessionId, 53 | url, 54 | browserHistoryId, 55 | browserVisitId, 56 | loadTime, 57 | transitionType, 58 | browserReferringVisitId, 59 | sourceId 60 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 61 | """, ( 62 | visit["activity_id"], 63 | history["title"], 64 | browserId, 65 | sessionId, 66 | history["url"], 67 | historyId, 68 | visitId, 69 | visit["visitTime"], 70 | visit["transition"], 71 | visit["referringVisitId"], 72 | sourceId)) 73 | archive.conn.commit() 74 | c = archive.conn.cursor() 75 | c.execute(""" 76 | UPDATE browser 77 | SET 78 | newestHistory = (SELECT MAX(loadTime) 79 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL), 80 | oldestHistory = (SELECT MIN(loadTime) 81 | FROM activity WHERE browserId = ? 
AND browserHistoryId IS NOT NULL) 82 | """, (browserId, browserId)) 83 | archive.conn.commit() 84 | 85 | 86 | @addon 87 | def add_activity_list(archive, *, browserId, activityItems): 88 | for activity in activityItems: 89 | c = archive.conn.cursor() 90 | columns = """ 91 | id 92 | browserId 93 | sessionId 94 | url 95 | title 96 | ogTitle 97 | loadTime 98 | unloadTime 99 | transitionType 100 | sourceClickText 101 | sourceClickHref 102 | client_redirect 103 | server_redirect 104 | forward_back 105 | from_address_bar 106 | sourceId 107 | initialLoadId 108 | newTab 109 | activeCount 110 | activeTime 111 | closedReason 112 | method 113 | statusCode 114 | contentType 115 | hasSetCookie 116 | hasCookie 117 | copyEvents 118 | formControlInteraction 119 | formTextInteraction 120 | isHashChange 121 | maxScroll 122 | documentHeight 123 | hashPointsToElement 124 | zoomLevel 125 | canonicalUrl 126 | mainFeedUrl 127 | allFeeds 128 | """.strip().split() 129 | for null_default in "sourceId transitionType".split(): 130 | activity.setdefault(null_default, None) 131 | marks = ["?"] * len(columns) 132 | activity["browserId"] = browserId 133 | linkInformation = activity["linkInformation"] 134 | del activity["linkInformation"] 135 | if activity["copyEvents"]: 136 | activity["copyEvents"] = json.dumps(activity["copyEvents"]) 137 | else: 138 | activity["copyEvents"] = None 139 | if activity["allFeeds"]: 140 | activity["allFeeds"] = json.dumps(activity["allFeeds"]) 141 | else: 142 | activity["allFeeds"] = None 143 | log(archive, activity) 144 | values = [activity[column] for column in columns] 145 | unused = set(activity).difference(columns) 146 | if unused: 147 | raise Exception("Unused keys in activity submission: {}".format(unused)) 148 | c.execute(""" 149 | INSERT OR REPLACE INTO activity ( 150 | %s 151 | ) VALUES (%s) 152 | """ % (", ".join(columns), ", ".join(marks)), values) 153 | c.execute(""" 154 | DELETE FROM activity_link WHERE activity_id = ? 155 | """, (activity["id"],)) 156 | for link in linkInformation or []: 157 | c.execute(""" 158 | INSERT INTO activity_link ( 159 | url, 160 | text, 161 | rel, 162 | target, 163 | elementId 164 | ) VALUES (?, ?, ?, ?, ?) 165 | """, (link["url"], link["text"], link.get("rel"), link.get("target"), link.get("elementId"))) 166 | archive.conn.commit() 167 | 168 | 169 | @addon 170 | def register_browser(archive, *, browserId, userAgent, testing=False, autofetch=False, devicePixelRatio=1): 171 | c = archive.conn.cursor() 172 | c.execute(""" 173 | INSERT OR REPLACE INTO browser (id, userAgent, testing, autofetch, devicePixelRatio) 174 | VALUES (?, ?, ?, ?, ?) 175 | """, (browserId, userAgent, testing, autofetch, devicePixelRatio)) 176 | c.execute(""" 177 | UPDATE browser 178 | SET 179 | newestHistory = (SELECT MAX(loadTime) 180 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL), 181 | oldestHistory = (SELECT MIN(loadTime) 182 | FROM activity WHERE browserId = ? AND browserHistoryId IS NOT NULL) 183 | """, (browserId, browserId)) 184 | archive.conn.commit() 185 | 186 | 187 | @addon 188 | def register_session(archive, sessionId, browserId, timezoneOffset): 189 | c = archive.conn.cursor() 190 | c.execute(""" 191 | INSERT OR REPLACE INTO browser_session (id, browserId, startTime, timezoneOffset) 192 | VALUES (?, ?, CURRENT_TIMESTAMP, ?) 
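        -- startTime is filled in by SQLite itself via CURRENT_TIMESTAMP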
193 | """, (sessionId, browserId, timezoneOffset)) 194 | archive.conn.commit() 195 | 196 | 197 | @addon 198 | def get_needed_pages(archive, limit=100): 199 | c = archive.conn.cursor() 200 | rows = c.execute(""" 201 | SELECT history.url, fetch_error.errorMessage FROM history 202 | LEFT JOIN page 203 | ON page.url = history.url 204 | LEFT JOIN fetch_error 205 | ON fetch_error.url = history.url 206 | WHERE page.url IS NULL 207 | ORDER BY fetch_error.url IS NULL DESC, lastVisitTime DESC 208 | LIMIT ? 209 | """, (limit,)) 210 | return [{"url": row["url"], "lastError": row["errorMessage"]} for row in rows] 211 | 212 | 213 | @addon 214 | def check_page_needed(archive, url): 215 | c = archive.conn.cursor() 216 | c.execute(""" 217 | SELECT COUNT(*) AS counter FROM page WHERE page.url = ? 218 | """, (url,)) 219 | return not c.fetchone()[0] 220 | 221 | 222 | @addon 223 | def add_fetched_page(archive, id, url, page): 224 | redirectUrl = page["url"].split("#")[0] 225 | origUrl = url.split("#")[0] 226 | page["originalUrl"] = url 227 | if redirectUrl == origUrl: 228 | redirectUrl = None 229 | else: 230 | redirectUrl = page["url"] 231 | if redirectUrl: 232 | # Removes the YouTube start time we add 233 | redirectUrl = redirectUrl.replace("&start=86400", "") 234 | c = archive.conn.cursor() 235 | c.execute(""" 236 | INSERT OR REPLACE INTO page (id, url, activityId, fetched, redirectUrl, timeToFetch) 237 | VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?) 238 | """, (id, url, page.get("activityId"), redirectUrl, page["timeToFetch"])) 239 | c.execute(""" 240 | DELETE FROM fetch_error 241 | WHERE url = ? 242 | """, (url,)) 243 | archive.conn.commit() 244 | write_page(archive, url, page) 245 | 246 | 247 | @addon 248 | def add_fetch_failure(archive, url, errorMessage): 249 | c = archive.conn.cursor() 250 | c.execute(""" 251 | INSERT OR REPLACE INTO fetch_error (url, errorMessage) 252 | VALUES (?, ?) 253 | """, (url, errorMessage)) 254 | archive.conn.commit() 255 | 256 | 257 | @addon 258 | def status(archive, browserId): 259 | c = archive.conn.cursor() 260 | c.execute(""" 261 | SELECT 262 | (SELECT COUNT(*) FROM activity) AS activity_count, 263 | (SELECT newestHistory FROM browser WHERE id = ?) AS latest, 264 | (SELECT oldestHistory FROM browser WHERE id = ?) AS oldest, 265 | (SELECT COUNT(*) FROM page) AS fetched_count 266 | """, (browserId, browserId)) 267 | row = c.fetchone() 268 | return dict(row) 269 | 270 | 271 | @addon 272 | def log(archive, *args, level='log', stack=None): 273 | filename = os.path.join(archive.path, "addon.log") 274 | with open(filename, "a") as fp: 275 | if stack: 276 | log_location = stack.splitlines()[0] 277 | log_location = re.sub(r'moz-extension://[a-f0-9-]+/', '/', log_location) 278 | else: 279 | log_location = "" 280 | print("Log/{: <5} {} {}".format(level, int(time.time() * 1000), log_location), file=fp) 281 | if len(str(args)) < 70 and len(args) > 1: 282 | args = (args,) 283 | for arg in args: 284 | if isinstance(arg, str): 285 | s = arg 286 | else: 287 | s = pprint.pformat(arg, compact=True) 288 | if isinstance(arg, tuple): 289 | s = s[1:-1] 290 | s = s.splitlines() 291 | for line in s: 292 | print(" ", line, file=fp) 293 | if not args: 294 | print(" (no arguments)", file=fp) 295 | print(file=fp) 296 | 297 | 298 | def write_page(archive, url, data): 299 | filename = Page.json_filename(archive, url) 300 | with open(filename, "wb") as fp: 301 | fp.write(json.dumps(data).encode("UTF-8")) 302 | 303 | 304 | def run_saver(storage_directory=None): 305 | from . 
import Archive 306 | if not storage_directory: 307 | archive = Archive.default_location() 308 | else: 309 | archive = Archive(storage_directory) 310 | while True: 311 | m_name = "(unknown)" 312 | try: 313 | message = get_message() 314 | m_name = "%(name)s(%(args)s%(kwargs)s)" % dict( 315 | name=message["name"], 316 | args=", ".join(json.dumps(s) for s in message.get("args", [])), 317 | kwargs=", ".join("%s=%s" % (name, json.dumps(value)) for name, value in message.get("kwargs", {}).items()), 318 | ) 319 | if len(m_name) > 100: 320 | m_name = m_name[:60] + " ... " + m_name[-10:] 321 | print("Message:", m_name, file=sys.stderr) 322 | handler = message_handlers.get(message["name"]) 323 | if not handler: 324 | print("Error: got unexpected message name: %r" % message["name"], file=sys.stderr) 325 | continue 326 | result = handler(archive, *message.get("args", ()), **message.get("kwargs", {})) 327 | send_message({"id": message["id"], "result": result}) 328 | except Exception as e: 329 | tb = traceback.format_exc() 330 | log(archive, "Error processing message %s(): %s" % (m_name, e), tb, level='s_err') 331 | send_message({"id": message["id"], "error": str(e), "traceback": tb}) 332 | 333 | 334 | def get_message(): 335 | length = sys.stdin.buffer.read(4) 336 | if len(length) == 0: 337 | sys.exit(0) 338 | length = struct.unpack('@I', length)[0] 339 | message = sys.stdin.buffer.read(length).decode('utf-8') 340 | message = json.loads(message) 341 | return message 342 | 343 | 344 | def encode_message(message): 345 | content = json.dumps(message).encode('utf-8') 346 | length = struct.pack('@I', len(content)) 347 | return length + content 348 | 349 | 350 | def send_message(message): 351 | sys.stdout.buffer.write(encode_message(message)) 352 | sys.stdout.buffer.flush() 353 | 354 | 355 | def install_json_command(): 356 | import argparse 357 | default_location = os.path.abspath(os.path.join(os.path.abspath(__file__), "../../../data")) 358 | script_location = os.path.join(default_location, ".pha-starter.py") 359 | parser = argparse.ArgumentParser() 360 | parser.add_argument("storage_directory", help="Location for storing the database and files", default=default_location) 361 | parser.add_argument("--script-location", "-s", help="Location to keep the connection script", default=script_location) 362 | parser.add_argument("--native-name", help="Name this will be registered for", default="pha.saver") 363 | args = parser.parse_args() 364 | print("Using the storage directory", args.storage_directory) 365 | print("Writing a connector script to", args.script_location) 366 | install_json_file(args.storage_directory, args.script_location, args.native_name) 367 | 368 | 369 | def install_json_file(storage_directory, script_location, native_name): 370 | # FIXME: support Windows 371 | manifest_path = os.path.abspath(os.path.join(__file__, "../../../extension/manifest.json")) 372 | script_location = os.path.abspath(script_location) 373 | with open(manifest_path) as fp: 374 | manifest = json.load(fp) 375 | manifest_id = manifest["applications"]["gecko"]["id"] 376 | with open(script_location, "w") as fp: 377 | # This script should support a Windows .BAT file 378 | fp.write("""\ 379 | #!%s 380 | storage_directory = %r 381 | from pha.saver import run_saver 382 | run_saver(storage_directory) 383 | """ % (sys.executable, os.path.abspath(storage_directory))) 384 | st = os.stat(script_location) 385 | os.chmod(script_location, st.st_mode | stat.S_IEXEC) 386 | native_manifest = { 387 | "name": native_name, 388 | "description": 
"Saves information from the personal-history-archive extension", 389 | "path": script_location, 390 | "type": "stdio", 391 | "allowed_extensions": [manifest_id] 392 | } 393 | if sys.platform == "darwin": 394 | filename = os.path.expanduser("~/Library/Application Support/Mozilla/NativeMessagingHosts/%s.json" % native_name) 395 | elif sys.platform.startswith("linux"): 396 | filename = os.path.expanduser("~/.mozilla/native-messaging-hosts/%s.json" % native_name) 397 | else: 398 | raise Exception("Not a supported platform") 399 | dir = os.path.dirname(filename) 400 | if not os.path.exists(dir): 401 | os.makedirs(dir) 402 | with open(filename, "wb") as fp: 403 | fp.write(json.dumps(native_manifest, indent=2).encode("UTF-8")) 404 | 405 | 406 | if __name__ == "__main__": 407 | install_json_command() 408 | -------------------------------------------------------------------------------- /python/pha/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS browser ( 2 | id TEXT PRIMARY KEY, 3 | created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 4 | oldestHistory INT, 5 | newestHistory INT, 6 | userAgent TEXT, 7 | testing BOOLEAN, 8 | autofetch BOOLEAN, 9 | devicePixelRatio FLOAT 10 | ); 11 | 12 | CREATE TABLE IF NOT EXISTS browser_session ( 13 | id TEXT PRIMARY KEY, 14 | browserId TEXT REFERENCES browser (id) ON DELETE CASCADE, 15 | startTime INT, 16 | endTime INT, 17 | timezoneOffset INT 18 | ); 19 | 20 | CREATE TABLE IF NOT EXISTS page ( 21 | id TEXT PRIMARY KEY, 22 | url TEXT NOT NULL, 23 | fetched TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 24 | activityId TEXT REFERENCES activity (id) ON DELETE SET NULL, 25 | timeToFetch INT, 26 | redirectUrl TEXT, 27 | redirectOk BOOLEAN DEFAULT FALSE 28 | ); 29 | 30 | CREATE TABLE IF NOT EXISTS fetch_error ( 31 | url TEXT PRIMARY KEY, 32 | attempted TIMESTAMP DEFAULT CURRENT_TIMESTAMP, 33 | errorMessage TEXT 34 | ); 35 | 36 | CREATE TABLE IF NOT EXISTS activity ( 37 | id TEXT PRIMARY KEY, 38 | browserId TEXT REFERENCES browser (id) ON DELETE CASCADE, 39 | sessionId TEXT REFERENCES browser_session (id) ON DELETE CASCADE, 40 | url TEXT NOT NULL, 41 | title TEXT, 42 | ogTitle TEXT, 43 | browserHistoryId TEXT, 44 | browserVisitId TEXT, 45 | loadTime INT, 46 | unloadTime INT, 47 | transitionType TEXT, 48 | sourceClickText TEXT, 49 | sourceClickHref TEXT, 50 | client_redirect BOOLEAN DEFAULT FALSE, 51 | server_redirect BOOLEAN DEFAULT FALSE, 52 | forward_back BOOLEAN DEFAULT FALSE, 53 | from_address_bar BOOLEAN DEFAULT FALSE, 54 | sourceId TEXT REFERENCES activity (id) ON DELETE SET NULL, 55 | browserReferringVisitId TEXT, 56 | initialLoadId TEXT REFERENCES activity (id) ON DELETE SET NULL, 57 | newTab BOOLEAN, -- was opened in new tab? 
58 | activeCount INT, -- Count of times it was "activated" 59 | activeTime INT, -- Millisecond active time 60 | closedReason TEXT, 61 | method TEXT, -- HTTP request method 62 | statusCode INT, -- HTTP status code 63 | contentType TEXT, -- HTTP Content-Type 64 | hasSetCookie BOOLEAN, -- has Set-Cookie response header 65 | hasCookie BOOLEAN, -- has Cookie request header 66 | copyEvents TEXT, -- Actually JSON 67 | formControlInteraction INT, -- count of form interactions 68 | formTextInteraction INT, -- count of form interactions 69 | isHashChange BOOLEAN, 70 | maxScroll INT, -- pixel Y location 71 | documentHeight INT, -- pixel height 72 | hashPointsToElement BOOLEAN, 73 | zoomLevel FLOAT, -- 1.0 means 100% zoom 74 | canonicalUrl TEXT, -- URL 75 | mainFeedUrl TEXT, -- URL 76 | allFeeds TEXT -- JSON 77 | ); 78 | 79 | CREATE TABLE IF NOT EXISTS activity_link ( 80 | activity_id TEXT REFERENCES activity (id) ON DELETE CASCADE, 81 | url TEXT NOT NULL, 82 | text TEXT NOT NULL, 83 | rel TEXT, 84 | target TEXT, 85 | elementId TEXT 86 | ); 87 | -------------------------------------------------------------------------------- /python/pha/search.py: -------------------------------------------------------------------------------- 1 | """ 2 | See this for SQLite FTS5/full text installation instructions: https://sqlite.org/fts5.html 3 | 4 | Or: brew upgrade sqlite3 --with-fts5 5 | 6 | Use: `python -m pha.search` to create a fresh index. 7 | 8 | Use: `python -m pha.search entities` to create an entity index 9 | """ 10 | import re 11 | from urllib.parse import quote as url_quote 12 | from . import htmltools 13 | from . import domain 14 | from collections.abc import Sequence 15 | import time 16 | import random 17 | 18 | 19 | def create_index(archive, purge=True): 20 | """ 21 | Creates an index of all pages, in a SQLite table. 22 | 23 | If `purge` is true, then throw away any past index. 24 | """ 25 | c = archive.conn.cursor() 26 | c.execute(""" 27 | CREATE VIRTUAL TABLE IF NOT EXISTS search_index 28 | USING FTS5 ( 29 | url UNINDEXED, 30 | url_words, 31 | title, 32 | readable, 33 | readable_byline, 34 | readable_excerpt, 35 | meta_description, 36 | full_text 37 | ) 38 | """) 39 | existing = set() 40 | if purge: 41 | c.execute(""" 42 | DELETE FROM search_index; 43 | """) 44 | else: 45 | rows = c.execute(""" 46 | SELECT url FROM search_index 47 | """) 48 | for (url,) in rows: 49 | existing.add(url) 50 | count = 0 51 | for history in archive.histories_with_page(): 52 | if history.url in existing: 53 | continue 54 | count += 1 55 | page = history.page 56 | url_words = " ".join(htmltools.url_words(page.url)) 57 | title = page.title 58 | readable = page.readable_text 59 | full_text = page.full_text 60 | r = page.data.get("readable") or {} 61 | readable_byline = r.get("byline") 62 | readable_excerpt = r.get("excerpt") 63 | meta_description = "" # FIXME: do this 64 | c.execute(""" 65 | INSERT INTO search_index 66 | (url, url_words, title, readable, readable_byline, readable_excerpt, meta_description, full_text) 67 | VALUES 68 | (?, ?, ?, ?, ?, ?, ?, ?) 69 | """, (page.url, url_words, title, readable, readable_byline, readable_excerpt, meta_description, full_text)) 70 | c.close() 71 | archive.conn.commit() 72 | return count 73 | 74 | 75 | def search(archive, query): 76 | """ 77 | Searches pages from an archive. Returns a list-like object. 78 | """ 79 | c = archive.conn.cursor() 80 | rows = c.execute(""" 81 | SELECT url FROM search_index WHERE search_index MATCH ? 
ORDER BY rank 82 | """, (query,)) 83 | urls = [row[0] for row in rows] 84 | return SearchResult(archive, query, urls) 85 | 86 | 87 | class SearchResult(Sequence): 88 | 89 | def __init__(self, archive, query, urls): 90 | self.archive = archive 91 | self.query = query 92 | self.urls = urls 93 | self.fetched_histories = {} 94 | 95 | def __repr__(self): 96 | return '<SearchResult %r: %s results>' % (self.query, len(self.urls)) 97 | 98 | def __getitem__(self, i): 99 | url = self.urls[i] 100 | history = self.fetched_histories.get(url) 101 | if history is None: 102 | history = self.fetched_histories[url] = self.archive.get_history(url) 103 | return history 104 | 105 | def __len__(self): 106 | return len(self.urls) 107 | 108 | 109 | def create_entity_index(archive, purge=True, verbose=False): 110 | from .summarytools import find_entities 111 | c = archive.conn.cursor() 112 | c.execute(""" 113 | CREATE TABLE IF NOT EXISTS entity_index ( 114 | entity TEXT, 115 | entity_label TEXT, 116 | url TEXT REFERENCES page (url) ON DELETE CASCADE, 117 | selector TEXT 118 | ) 119 | """) 120 | if verbose: 121 | print("Created table") 122 | existing = set() 123 | if purge: 124 | c.execute(""" 125 | DELETE FROM entity_index; 126 | """) 127 | if verbose: 128 | print("Removed any previous entries") 129 | else: 130 | rows = c.execute(""" 131 | SELECT DISTINCT url FROM entity_index; 132 | """) 133 | for (url,) in rows: 134 | existing.add(url) 135 | if verbose: 136 | print("Left", len(existing), "existing entries") 137 | c.close() 138 | archive.conn.commit() 139 | histories = [h for h in archive.histories_with_page() if h.url not in existing] 140 | loop_start = time.time() 141 | for count, history in enumerate(histories): 142 | start = time.time() 143 | c = archive.conn.cursor() 144 | page = history.page 145 | body = page.lxml.find("body") 146 | entities = list(find_entities(body)) 147 | if not entities: 148 | entities = [("no-entity", None, body)] 149 | for entity, entity_label, element in entities: 150 | selector = htmltools.element_to_css(element) 151 | c.execute(""" 152 | INSERT INTO entity_index (entity, entity_label, url, selector) 153 | VALUES (?, ?, ?, ?)
154 | """, (entity, entity_label, page.url, selector)) 155 | if verbose: 156 | print("Indexed %6i/%6i %s" % (count + 1, len(histories), page.url)) 157 | print(" entities: %i in %i elements" % (len(entities), len(set(el for ent, ent_label, el in entities)))) 158 | print(" time %is; total %s; eta %s" % ( 159 | time.time() - start, 160 | format_time(time.time() - loop_start), 161 | format_time((time.time() - loop_start) * len(histories) / (count + 1)), 162 | )) 163 | random.shuffle(entities) 164 | entities_string = ", ".join(["%r:%s" % (ent, ent_label) for ent, ent_label, el in entities]) 165 | print(" entities: %s" % entities_string[:145]) 166 | print() 167 | c.close() 168 | archive.conn.commit() 169 | if verbose: 170 | print("Inserted a total of", count, "pages") 171 | return count + 1 172 | 173 | 174 | def format_time(seconds): 175 | if seconds < 60: 176 | return '%is' % seconds 177 | minutes = seconds / 60 178 | if minutes < 60: 179 | return '%im' % minutes 180 | hours, minutes = minutes // 60, minutes % 60 181 | return '%ih%im' % (hours, minutes) 182 | 183 | 184 | def summarize_entities(archive, most_common=0): 185 | c = archive.conn.cursor() 186 | c.execute(""" 187 | SELECT 188 | (SELECT COUNT(DISTINCT entity) FROM entity_index) AS distinct_entities, 189 | (SELECT COUNT(*) FROM entity_index) AS total_entities, 190 | (SELECT COUNT(DISTINCT url) from entity_index) AS distinct_urls, 191 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'PER') AS total_label_per, 192 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'LOC') AS total_label_loc, 193 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'ORG') AS total_label_org, 194 | (SELECT COUNT(*) FROM entity_index WHERE entity_label = 'MISC') AS total_label_misc, 195 | (SELECT COUNT(*) FROM entity_index WHERE entity_label IS NULL OR entity_label NOT IN ('PER', 'LOC', 'ORG', 'MISC')) AS total_label_unknown 196 | """) 197 | row = c.fetchone() 198 | result = { 199 | "distinct_entities": row[0], 200 | "total_entities": row[1], 201 | "distinct_urls": row[2], 202 | "total_labels": { 203 | "per": row[3], 204 | "loc": row[4], 205 | "org": row[5], 206 | "misc": row[6], 207 | "unknown": row[7], 208 | } 209 | } 210 | if most_common: 211 | c.execute(""" 212 | SELECT entity, COUNT(url) 213 | FROM entity_index 214 | GROUP BY entity 215 | ORDER BY COUNT(url) DESC 216 | LIMIT ? 217 | """, (most_common,)) 218 | result["most_common_entities"] = m = [] 219 | for row in c: 220 | m.append((row[0], row[1])) 221 | return result 222 | 223 | 224 | def search_entities(archive, entity, entity_label=None, wildcard=False): 225 | c = archive.conn.cursor() 226 | entity_arg = (entity,) 227 | entity_query = 'entity = ?' 228 | if wildcard: 229 | entity_query = 'LOWER(entity) LIKE ?' 230 | entity_arg = ('%' + entity.lower() + '%',) 231 | if entity_label: 232 | entity_query += " AND entity_label = ?" 
233 | entity_arg += (entity_label,) 234 | rows = c.execute(""" 235 | SELECT entity, entity_label, url, selector 236 | FROM entity_index 237 | WHERE %s 238 | """ % entity_query, entity_arg) 239 | rows = [(row[0], row[1], row[2], row[3]) for row in rows] 240 | return EntitySearchResult(archive, entity, rows, wildcard=wildcard) 241 | 242 | 243 | class EntitySearchResult(Sequence): 244 | def __init__(self, archive, entity, rows, wildcard=False): 245 | self.archive = archive 246 | self.entity = entity 247 | self.wildcard = wildcard 248 | self.rows = rows 249 | self.fetched_results = {} 250 | 251 | def __repr__(self): 252 | return '<EntitySearchResult %s%r: %s results>' % ('like ' if self.wildcard else '', self.entity, len(self.rows)) 253 | 254 | def __getitem__(self, i): 255 | if isinstance(i, slice): 256 | return self.__class__(self.archive, self.entity, self.rows[i], wildcard=self.wildcard) 257 | row = self.rows[i] 258 | result = self.fetched_results.get(row) 259 | if result is None: 260 | result = self.fetched_results[row] = EntityResult(self.archive, *row) 261 | return result 262 | 263 | def __len__(self): 264 | return len(self.rows) 265 | 266 | 267 | class EntityResult: 268 | def __init__(self, archive, entity, entity_label, url, selector): 269 | self.archive = archive 270 | self.entity = entity 271 | self.entity_label = entity_label 272 | self.url = url 273 | self.selector = selector 274 | 275 | def __repr__(self): 276 | return '<EntityResult %s %s %r (%s)>' % (self.url, self.selector, self.entity, self.entity_label) 277 | 278 | @property 279 | def page(self): 280 | if not hasattr(self, "_page"): 281 | self._page = self.archive.get_history(self.url).page 282 | return self._page 283 | 284 | @property 285 | def data_url(self): 286 | from .notebooktools import make_data_url 287 | url = make_data_url("text/html", self.page.html) 288 | if re.search(r"^#[^:]+$", self.selector): 289 | url += "#" + self.selector[1:] 290 | else: 291 | url += "#css=" + url_quote(self.selector) 292 | return url 293 | 294 | @property 295 | def domain(self): 296 | return domain(self.url) 297 | 298 | 299 | if __name__ == "__main__": 300 | import sys 301 | arg = sys.argv[1] if sys.argv[1:] else None 302 | import pha 303 | archive = pha.Archive.default_location() 304 | try: 305 | if arg == "entities": 306 | print(create_entity_index(archive, verbose=True, purge=False), "pages entity indexed") 307 | else: 308 | print(create_index(archive), "pages full text indexed") 309 | except KeyboardInterrupt: 310 | print(" aborted") 311 | -------------------------------------------------------------------------------- /python/pha/searchquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for finding search/query-related pages in history 3 | """ 4 | 5 | 6 | def find_queries(archive): 7 | activities = archive.get_activity_by_url(like='%google.com%') 8 | actual = [] 9 | for a in activities: 10 | q = a.query.get('q') 11 | if not q: 12 | continue 13 | q = q[0] 14 | actual.append((q, a)) 15 | archive.set_all_activity_from_sources([a for q, a in actual]) 16 | return actual 17 | -------------------------------------------------------------------------------- /python/pha/summarytools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for summarization, using either textteaser or sumy 3 | """ 4 | import re 5 | 6 | text_teaser_instance = None 7 | 8 | 9 | def textteaser_summary(page, *, try_readable=True): 10 | """Uses TextTeaser 
(https://github.com/IndigoResearch/textteaser/tree/master/textteaser) to summarize 11 | the page into a list of sentences 12 | """ 13 | global text_teaser_instance 14 | if text_teaser_instance is None: 15 | from textteaser import TextTeaser 16 | text_teaser_instance = TextTeaser() 17 | text = (try_readable and page.readable_text) or page.full_text 18 | return text_teaser_instance.summarize(page.title, text) 19 | 20 | 21 | def normalize_sentences(sentences, sep=" "): 22 | sentences = [normalize_sentence(s) for s in sentences] 23 | return sep.join(sentences) 24 | 25 | 26 | def normalize_sentence(sentence): 27 | return re.sub(r'\s+', ' ', str(sentence).replace("\n", " ")) 28 | 29 | 30 | def sumy_summary(page, sentence_count=5, *, language="english"): 31 | from sumy.parsers.html import HtmlParser 32 | from sumy.nlp.tokenizers import Tokenizer 33 | from sumy.summarizers.lsa import LsaSummarizer as Summarizer 34 | from sumy.nlp.stemmers import Stemmer 35 | from sumy.utils import get_stop_words 36 | parser = HtmlParser.from_string(page.html, page.url, Tokenizer(language)) 37 | stemmer = Stemmer(language) 38 | summarizer = Summarizer(stemmer) 39 | summarizer.stop_words = get_stop_words(language) 40 | return summarizer(parser.document, sentence_count) 41 | 42 | 43 | _has_letter_re = re.compile(r"[a-zA-Z]") 44 | 45 | 46 | def is_good_entity(e): 47 | """ 48 | Is this a plausible entity? For some reason scapy select entities like '-' or '\\n ' 49 | """ 50 | return _has_letter_re.search(e) 51 | 52 | 53 | _whitespace_re = re.compile(r"\s\s+", re.S) 54 | 55 | 56 | def find_entities(page_element): 57 | """ 58 | Uses SpaCy to find entities in the page element. Returns `[(entity_text, entity_label, element), ...]` 59 | """ 60 | import xx_ent_wiki_sm 61 | from .htmltools import iter_block_level_text 62 | nlp = xx_ent_wiki_sm.load() 63 | for text, element in iter_block_level_text(page_element): 64 | text = _whitespace_re.sub(" ", text) 65 | doc = nlp(text) 66 | seen = set() 67 | for entity in doc.ents: 68 | if entity.text in seen: 69 | continue 70 | seen.add(entity.text) 71 | if not is_good_entity(entity.text): 72 | continue 73 | yield entity.text, entity.label_, element 74 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | # These are not strict requirements, but useful libraries that can be used with pha, and are used in the notebooks: 2 | jupyter 3 | jupyterlab 4 | 5 | # Some JupyterLab extensions: 6 | jupyterlab_iframe 7 | jupyterlab_templates 8 | 9 | 10 | # Some general machine learning libraries... 
11 | numpy 12 | keras 13 | tensorflow 14 | pandas 15 | 16 | # This is a fork of a simple NLP library to support Python 3 17 | -e git+https://www.github.com/ianb/textteaser.git#egg=TextTeaser 18 | 19 | # Used for entity search and NLP: 20 | spacy 21 | # This is the english entity database for SpaCy: 22 | https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-2.0.0/xx_ent_wiki_sm-2.0.0.tar.gz#egg=xx_ent_wiki_sm 23 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import setup, find_packages 7 | 8 | requirements = [ 9 | "lxml", 10 | "cssselect", 11 | "nltk", 12 | "sumy", 13 | "feedparser", 14 | ] 15 | 16 | setup_requirements = [ 17 | # 'pytest-runner', 18 | # TODO(ianb): put setup requirements (distutils extensions, etc.) here 19 | ] 20 | 21 | test_requirements = [ 22 | # 'pytest', 23 | # TODO: put package test requirements here 24 | ] 25 | 26 | setup( 27 | name='pha', 28 | version='0.1.0', 29 | description="Library to access the Personal History Archive", 30 | # long_description=readme + '\n\n' + history, 31 | author="Ian Bicking", 32 | author_email='ian@ianbicking.org', 33 | url='https://github.com/ianb/personal-history-archive', 34 | packages=find_packages(include=['pha']), 35 | include_package_data=True, 36 | install_requires=requirements, 37 | license="MIT license", 38 | zip_safe=True, 39 | # keywords='', 40 | classifiers=[ 41 | 'Development Status :: 2 - Pre-Alpha', 42 | 'Intended Audience :: Developers', 43 | 'License :: OSI Approved :: MIT License', 44 | 'Natural Language :: English', 45 | 'Programming Language :: Python :: 3', 46 | 'Programming Language :: Python :: 3.3', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Programming Language :: Python :: 3.5', 49 | ], 50 | # test_suite='tests', 51 | # tests_require=test_requirements, 52 | setup_requires=setup_requirements, 53 | ) 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import setup, find_packages 7 | 8 | requirements = [ 9 | "lxml", 10 | "cssselect", 11 | "nltk", 12 | "sumy", 13 | "feedparser", 14 | "click", 15 | "sqlobject", 16 | "colorama", 17 | "yarl", 18 | ] 19 | 20 | setup( 21 | name='browsinglab', 22 | version='0.1.0', 23 | description="Generate and access data about browsing history", 24 | # long_description=readme + '\n\n' + history, 25 | author="Ian Bicking", 26 | author_email='ian@ianbicking.org', 27 | url='https://github.com/ianb/personal-history-archive', 28 | packages=find_packages(include=['browsinglab']), 29 | include_package_data=True, 30 | install_requires=requirements, 31 | license="MIT license", 32 | zip_safe=True, 33 | # keywords='', 34 | classifiers=[ 35 | 'Development Status :: 2 - Pre-Alpha', 36 | 'Intended Audience :: Developers', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Natural Language :: English', 39 | 'Programming Language :: Python :: 3', 40 | 'Programming Language :: Python :: 3.3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | ], 44 | entry_points=''' 45 | [console_scripts] 46 | blab=browsinglab.cli:cli 47 | 
browser-connector=browsinglab.connector:connect 48 | ''', 49 | # test_suite='tests', 50 | # tests_require=test_requirements, 51 | # setup_requires=setup_requirements, 52 | ) 53 | -------------------------------------------------------------------------------- /test/.eslintrc.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | module.exports = { 4 | "rules": { 5 | "no-console": "off" 6 | } 7 | }; 8 | 9 | -------------------------------------------------------------------------------- /test/commands.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const fs = require("fs"); 3 | const crypto = require("crypto"); 4 | const feedparser = require("node-feedparser"); 5 | const { By, until, Key } = require("selenium-webdriver"); 6 | const { promiseTimeout, eitherPromise } = require("./test-utils"); 7 | 8 | const LOAD_TIMEOUT = 20000; 9 | 10 | exports.fetchPage = async function(driver, url, base) { 11 | let timer = setTimeout(() => { 12 | console.log(" Sending ESCAPE key"); 13 | driver.findElement(By.tagName("body")).sendKeys(Key.ESCAPE); 14 | }, LOAD_TIMEOUT); 15 | await driver.get(url); 16 | clearTimeout(timer); 17 | let result = await eitherPromise( 18 | driver.wait(until.elementLocated(By.css("#pha-completed-freeze"))).then(() => true), 19 | promiseTimeout(30000).then(() => false) 20 | ); 21 | url = await driver.getCurrentUrl(); 22 | if (!result) { 23 | console.log("Freezing page timed out"); 24 | return null; 25 | } 26 | await promiseTimeout(500); 27 | let filename = filenameForUrl(base, url); 28 | let json = await readJson(filename, null); 29 | if (json && json.feeds) { 30 | json.parsedFeeds = []; 31 | for (let feed of json.feeds) { 32 | let parsed = await parseFeed(feed.body); 33 | if (parsed) { 34 | json.parsedFeeds.push(parsed); 35 | } 36 | } 37 | } 38 | return json; 39 | }; 40 | 41 | exports.pageExists = function(url, base) { 42 | let filename = filenameForUrl(base, url); 43 | return new Promise((resolve, reject) => { 44 | fs.access(filename, (error) => { 45 | resolve(!error); 46 | }); 47 | }); 48 | }; 49 | 50 | function filenameForUrl(base, url) { 51 | let name = encodeURIComponent(url); 52 | if (name.length > 200) { 53 | let sha1 = crypto.createHash("sha1"); 54 | let hash = sha1.digest(url).toString("hex"); 55 | name = `${name.substr(0, 100)}-${hash}-trunc`; 56 | } 57 | return path.join(base, "pages", name + "-page.json"); 58 | } 59 | 60 | function readJson(filename, defaultValue) { 61 | return new Promise((resolve, reject) => { 62 | fs.readFile(filename, {encoding: "UTF-8"}, (error, data) => { 63 | if (error && error.code === "ENOENT") { 64 | resolve(defaultValue); 65 | return; 66 | } else if (error) { 67 | reject(error); 68 | return; 69 | } 70 | let json; 71 | try { 72 | json = JSON.parse(data); 73 | } catch (e) { 74 | console.error("Error parsing JSON from", filename, ":", e); 75 | console.error(e.stack); 76 | console.error("text:", JSON.stringify(data)); 77 | reject(e); 78 | return; 79 | } 80 | resolve(json); 81 | }); 82 | }); 83 | } 84 | 85 | function parseFeed(feedBody) { 86 | return new Promise((resolve, reject) => { 87 | feedparser(feedBody, (error, result) => { 88 | if (error) { 89 | console.log("Got a bad field:", error); 90 | console.log(error.stack); 91 | resolve(null); 92 | } else { 93 | resolve(result); 94 | } 95 | }); 96 | }); 97 | } 98 | -------------------------------------------------------------------------------- /test/driver-setup.js: 
-------------------------------------------------------------------------------- 1 | const firefox = require("selenium-webdriver/firefox"); 2 | const webdriver = require("selenium-webdriver"); 3 | 4 | exports.getDriver = function(addonFileLocation) { 5 | const channel = process.env.FIREFOX_CHANNEL || "NIGHTLY"; 6 | if (!(channel in firefox.Channel)) { 7 | throw new Error(`Unknown channel: "${channel}"`); 8 | } 9 | 10 | const options = new firefox.Options() 11 | .setBinary(firefox.Channel[channel]) 12 | // Let our unsigned add-on be installed: 13 | .setPreference("xpinstall.signatures.required", false) 14 | // Try to keep audio from playing (doesn't work): 15 | .setPreference("dom.webaudio.enabled", false) 16 | // Try to keep videos from auto-playing (doesn't work that well): 17 | .setPreference("media.autoplay.enabled", false) 18 | // Don't let pages do something before unloading: 19 | .setPreference("dom.disable_beforeunload", true) 20 | // Automatically deny all these permission prompts: 21 | .setPreference("permissions.default.camera", 2) 22 | .setPreference("permissions.default.desktop-notification", 2) 23 | .setPreference("permissions.default.geo", 2) 24 | .setPreference("permissions.default.microphone", 2) 25 | .setPreference("permissions.default.shortcuts", 2) 26 | // Don't let pages make popups: 27 | .setPreference("capability.policy.default.Window.alert", "noAccess") 28 | .setPreference("capability.policy.default.Window.confirm", "noAccess") 29 | .setPreference("capability.policy.default.Window.prompt", "noAccess") 30 | // Tracking protection blocks some nice thing to block: 31 | .setPreference("privacy.trackingprotection.enabled", true) 32 | .setPreference("privacy.trackingprotection.introCount", 20) 33 | // Time out requests after 20 seconds: 34 | .setPreference("network.http.response.timeout", 20) 35 | .setPreference("network.http.connection-timeout", 20); 36 | 37 | const driver = new webdriver.Builder() 38 | .withCapabilities({"moz:webdriverClick": true}) 39 | .forBrowser("firefox") 40 | .setFirefoxOptions(options) 41 | .build(); 42 | 43 | driver.installAddon(addonFileLocation); 44 | 45 | return driver; 46 | }; 47 | 48 | exports.closeBrowser = async function(driver) { 49 | // This works around some geckodriver bugs in driver.quit() 50 | let handles = await driver.getAllWindowHandles(); 51 | for (let handle of handles) { 52 | await driver.switchTo().window(handle); 53 | await driver.close(); 54 | } 55 | try { 56 | driver.quit(); 57 | } catch (error) { 58 | // Ignore it (probably the browser is closed by now) 59 | } 60 | }; 61 | -------------------------------------------------------------------------------- /test/random-walk.js: -------------------------------------------------------------------------------- 1 | const { getDriver, closeBrowser } = require("./driver-setup"); 2 | const { By, until } = require("selenium-webdriver"); 3 | const { promiseTimeout, eitherPromise } = require("./test-utils"); 4 | const fs = require("fs"); 5 | const path = require("path"); 6 | const RandomGenerator = require("random-seed"); 7 | 8 | let seed = process.env.SEED || Date.now(); 9 | 10 | const randomGenerator = RandomGenerator.create(seed); 11 | const random = randomGenerator.random.bind(randomGenerator); 12 | 13 | const addonFileLocation = path.join(process.cwd(), "test", "build-walk", "extension.zip"); 14 | 15 | function choose(options) { 16 | return options[Math.floor(options.length * random())]; 17 | } 18 | 19 | function weightedChoice(options) { 20 | let sum = 0; 21 | for (let pair of 
options) { 22 | sum += pair[1]; 23 | } 24 | let choice = sum * random(); 25 | let pos = 0; 26 | for (let pair of options) { 27 | pos += pair[1]; 28 | if (pos >= choice) { 29 | return pair[0]; 30 | } 31 | } 32 | throw new Error("Weight choice returned nothing, how?"); 33 | } 34 | 35 | function chooseDestination(destinations, seenUrls) { 36 | let chooseOptions = destinations.filter(u => !seenUrls.has(u)); 37 | if (!chooseOptions.length) { 38 | chooseOptions = destinations; 39 | } 40 | return choose(chooseOptions); 41 | } 42 | 43 | function chooseQuery(queries, url) { 44 | let choices = []; 45 | for (let prefix in queries) { 46 | if (!url.startsWith(prefix)) { 47 | continue; 48 | } 49 | for (let selector in queries[prefix]) { 50 | choices.push([selector, queries[prefix][selector]]); 51 | } 52 | } 53 | if (!choices.length) { 54 | return null; 55 | } 56 | return weightedChoice(choices); 57 | } 58 | 59 | function chooseSearchTerm(terms) { 60 | let wordCount = choose([1, 2, 3]); 61 | let words = []; 62 | while (words.length < wordCount) { 63 | let w = choose(terms); 64 | if (!words.includes(w)) { 65 | words.push(w); 66 | } 67 | } 68 | return words.join(" "); 69 | } 70 | 71 | let driver; 72 | 73 | async function walk(config) { 74 | console.log(""); 75 | console.log(""); 76 | console.log("======================== RANDOM WALK ========================"); 77 | console.log(""); 78 | driver = await getDriver(addonFileLocation); 79 | // Give the add-on a moment to load: 80 | await promiseTimeout(1000); 81 | let seenUrls = new Set(); 82 | let steps = 0; 83 | let lastWasSearch = false; 84 | for (;;) { 85 | await promiseTimeout(500); 86 | steps++; 87 | let url = await driver.getCurrentUrl(); 88 | seenUrls.add(url); 89 | console.log("---Running step", steps, "url:", url); 90 | if (url.startsWith("http")) { 91 | let result = await eitherPromise( 92 | driver.wait(until.elementLocated(By.css("#pha-completed-freeze"))).then(() => true), 93 | promiseTimeout(30000).then(() => false) 94 | ); 95 | if (!result) { 96 | console.log("Freezing page timed out"); 97 | } 98 | } else { 99 | console.log("Unfreezable page"); 100 | } 101 | let queryElement = chooseQuery(config.queries, url); 102 | if (queryElement && !lastWasSearch) { 103 | let term = chooseSearchTerm(config.searchTerms); 104 | console.log("Doing search on", queryElement, "term:", term); 105 | await driver.findElement(By.css(queryElement)).sendKeys(term + "\n"); 106 | await promiseTimeout(100); 107 | lastWasSearch = true; 108 | continue; 109 | } 110 | lastWasSearch = false; 111 | if (url === "about:blank" || random() < config.destinations.frequency) { 112 | let dest = chooseDestination(config.destinations.urls, seenUrls); 113 | // Just in case a redirect happens and this exact URL isn't added: 114 | seenUrls.add(dest); 115 | await driver.get(dest); 116 | await promiseTimeout(100); 117 | continue; 118 | } 119 | let anchors = await driver.findElements(By.css("a")); 120 | let anchor = choose(anchors); 121 | if (!anchor) { 122 | console.log("Warning: no anchor found in page", url); 123 | continue; 124 | } 125 | let anchorUrl = await anchor.getAttribute("href"); 126 | if (!anchorUrl || anchorUrl.startsWith("mailto:")) { 127 | console.log("Chose bad anchor:", anchorUrl); 128 | continue; 129 | } 130 | if (!anchor) { 131 | console.log("Got no anchor"); 132 | continue; 133 | } 134 | try { 135 | await anchor.click(); 136 | } catch (e) { 137 | if (e.name === "ElementNotInteractableError") { 138 | console.log("Could not interact with anchor", anchorUrl); 139 | } else if 
(e.name === "ElementClickInterceptedError") { 140 | console.log("Could not interact with anchor due to cover", anchorUrl); 141 | } else { 142 | console.log("Error interacting with anchor:", anchorUrl, e); 143 | } 144 | continue; 145 | } 146 | await promiseTimeout(100); 147 | } 148 | } 149 | 150 | async function fetchPages(pages) { 151 | const { fetchPage, pageExists } = require("./commands"); 152 | console.log(""); 153 | console.log(""); 154 | console.log("======================== FETCHER ========================"); 155 | console.log(""); 156 | driver = await getDriver(addonFileLocation); 157 | // Give the add-on a moment to load: 158 | await promiseTimeout(1000); 159 | let base = process.env.PHA_DATA || path.join(__dirname, "../walk-data"); 160 | let seenUrls = new Set(); 161 | for (let page of pages) { 162 | console.log("-----------------------", page); 163 | if (await pageExists(page, base)) { 164 | console.log(" ...already exists."); 165 | continue; 166 | } 167 | let result = await fetchPage(driver, page, base); 168 | if (!result) { 169 | console.log(" ...loaded but not fetched."); 170 | continue; 171 | } 172 | console.log(" ...fetched."); 173 | for (let feed of (result.parsedFeeds || [])) { 174 | for (let item of feed.items) { 175 | if (!seenUrls.has(item.link)) { 176 | seenUrls.add(item.link); 177 | console.log(" fetching", item.link); 178 | let feedResult = await fetchPage(driver, item.link, base); 179 | if (feedResult) { 180 | console.log(" ...fetched."); 181 | } else { 182 | console.log(" ...loaded but not fetched."); 183 | } 184 | } else { 185 | console.log(" skipping", item.link); 186 | } 187 | } 188 | } 189 | } 190 | } 191 | 192 | async function main() { 193 | let names = ["default.json"]; 194 | if (process.env.CONFIG) { 195 | names.push(process.env.CONFIG); 196 | } 197 | let config = await loadConfig(names); 198 | console.log("config:", config); 199 | try { 200 | await walk(config); 201 | } catch (e) { 202 | console.log("Error:", e); 203 | console.log(e.stack); 204 | } 205 | console.log("---- closing"); 206 | await closeBrowser(driver); 207 | } 208 | 209 | async function mainFetchPages() { 210 | let names = ["default.json"]; 211 | if (process.env.CONFIG) { 212 | names.push(process.env.CONFIG); 213 | } 214 | let config = await loadConfig(names); 215 | console.log("config:", config); 216 | try { 217 | await fetchPages(config.destinations.urls); 218 | } catch (e) { 219 | console.log("Error:", e); 220 | console.log(e.stack); 221 | } 222 | // await closeBrowser(driver); 223 | } 224 | 225 | async function loadConfig(names) { 226 | let configs = []; 227 | for (let name of names) { 228 | if (!name.endsWith(".json")) { 229 | name += ".json"; 230 | } 231 | if (!fs.existsSync(name)) { 232 | name = path.join(__dirname, "walk-configs", name); 233 | } 234 | let data = fs.readFileSync(name, {encoding: "UTF-8"}); 235 | data = JSON.parse(data); 236 | if (typeof data.searchTerms === "string") { 237 | data.searchTerms = data.searchTerms.trim().split(/[\s\n]+/g); 238 | } 239 | configs.push(data); 240 | } 241 | let result = { 242 | destinations: { 243 | urls: [], 244 | frequency: 0.05 245 | }, 246 | queries: {}, 247 | searchTerms: [] 248 | }; 249 | for (let config of configs) { 250 | let newUrls = config.destinations && config.destinations.urls; 251 | if (!newUrls) { 252 | newUrls = result.destinations.urls; 253 | } else if (newUrls.includes("*")) { 254 | newUrls = result.destinations.urls.concat(newUrls.filter(u => u !== "*")); 255 | } 256 | let newSearchTerms = config.searchTerms; 257 | if 
(!newSearchTerms) { 258 | newSearchTerms = result.searchTerms; 259 | } else if (newSearchTerms.includes("*")) { 260 | newSearchTerms = result.searchTerms.concat(newSearchTerms.filter(u => u !== "*")); 261 | } 262 | Object.assign(result, config); 263 | result.destinations.urls = newUrls; 264 | result.searchTerms = newSearchTerms; 265 | } 266 | return result; 267 | } 268 | 269 | if (require.main === module) { 270 | if (process.argv[2] === "fetch") { 271 | mainFetchPages(); 272 | } else { 273 | main(); 274 | } 275 | } 276 | -------------------------------------------------------------------------------- /test/static/blank.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /test/static/debug.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug page 6 | 7 | 8 |

[debug.html body: markup lost in extraction; the visible text is a note that this page is used in testing to send information to and from the add-on, a "Status" section, and a "Controls" section with a "Time:" field]
-------------------------------------------------------------------------------- /test/static/search-destination.html: --------------------------------------------------------------------------------
[search-destination.html: markup lost in extraction; the visible text is the title and heading "Pretend destination", "Look at this table of contents!" with the entries "first place" and "second place", then "First paragraph" and "Second paragraph"]
-------------------------------------------------------------------------------- /test/static/search-results.html: --------------------------------------------------------------------------------
[search-results.html: markup lost in extraction; the visible text is the title and heading "Search results"]
-------------------------------------------------------------------------------- /test/static/search.html: --------------------------------------------------------------------------------
[search.html: markup lost in extraction; the visible text is the title "Pretend Search", the heading "Search", and a "Search" button]
16 | 17 | 18 | -------------------------------------------------------------------------------- /test/static/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: sans-serif; 3 | } 4 | -------------------------------------------------------------------------------- /test/test-utils.js: -------------------------------------------------------------------------------- 1 | exports.promiseTimeout = function(time) { 2 | return new Promise((resolve) => { 3 | setTimeout(resolve, time); 4 | }); 5 | }; 6 | 7 | exports.eitherPromise = function(...promises) { 8 | return new Promise((resolve, reject) => { 9 | function sendResolve(value) { 10 | if (resolve) { 11 | resolve(value); 12 | resolve = null; 13 | } 14 | } 15 | function sendReject(error) { 16 | if (reject) { 17 | reject(error); 18 | reject = null; 19 | } 20 | } 21 | for (let promise of promises) { 22 | promise.then(sendResolve).catch(sendReject); 23 | } 24 | }); 25 | }; 26 | -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | /* globals describe, it, before, after */ 2 | 3 | /* Environmental variables that help control this test: 4 | 5 | FIREFOX_CHANNEL = empty (default NIGHTLY) 6 | NIGHTLY 7 | AURORA (often Developer Edition) 8 | BETA 9 | RELEASE 10 | 11 | NO_CLOSE = if not empty then when the test is finished, the browser will not be closed 12 | 13 | */ 14 | 15 | const assert = require("assert"); 16 | const webdriver = require("selenium-webdriver"); 17 | const express = require("express"); 18 | const cookieParser = require("cookie-parser"); 19 | const http = require("http"); 20 | const { By, until, Key } = webdriver; 21 | const path = require("path"); 22 | const fs = require("fs"); 23 | const { getDriver, closeBrowser } = require("./driver-setup"); 24 | const { promiseTimeout } = require("./test-utils"); 25 | 26 | const PORT = 11180; 27 | const SERVER = `http://localhost:${PORT}`; 28 | const SERVER_STATIC = `${SERVER}/test-static`; 29 | const COMMAND_MOD = process.platform === "darwin" ? 
Key.COMMAND : Key.CONTROL; 30 | const addonFileLocation = path.join(process.cwd(), "test", "build", "extension.zip"); 31 | 32 | let server; 33 | 34 | function startServer() { 35 | if (server) { 36 | server.close(); 37 | } 38 | const app = express(); 39 | app.use(cookieParser()); 40 | app.get("/cookie", (req, res) => { 41 | if (req.query.remove) { 42 | res.cookie("testCookie", "", {maxAge: 0}); 43 | } else { 44 | res.cookie("testCookie", "test value", {maxAge: 3600000}); 45 | } 46 | res.send("OK"); 47 | }); 48 | app.use("/test-static", express.static(path.join(__dirname, "static"), { 49 | index: ["index.html"], 50 | maxAge: null 51 | })); 52 | server = http.createServer(app); 53 | server.listen(PORT); 54 | } 55 | 56 | function stopServer() { 57 | server.close(); 58 | server = null; 59 | } 60 | 61 | function filenameForUrl(url) { 62 | // FIXME: this won't work for long pages 63 | return path.join(__dirname, "test-data", "pages", encodeURIComponent(url) + "-page.json"); 64 | } 65 | 66 | async function collectInformation(driver) { 67 | await driver.get(`${SERVER}/test-static/debug.html`); 68 | await driver.wait(until.elementLocated(By.css("#status"))); 69 | let result = await driver.findElement(By.css("#status")).getAttribute("value"); 70 | result = JSON.parse(result); 71 | await driver.findElement(By.css("#flush")).click(); 72 | let status = await driver.findElement(By.css("#flush-status")); 73 | await driver.wait(until.elementTextContains(status, "finished")); 74 | return result; 75 | } 76 | 77 | describe("Test history collection", function() { 78 | this.timeout(120000); 79 | let driver; 80 | 81 | before(async function() { 82 | startServer(); 83 | driver = await getDriver(addonFileLocation); 84 | // Give the add-on a moment to load: 85 | await promiseTimeout(1000); 86 | }); 87 | 88 | after(async function() { 89 | stopServer(); 90 | if (!process.env.NO_CLOSE) { 91 | closeBrowser(driver); 92 | return null; 93 | } 94 | console.info("Note: leaving browser open"); 95 | return null; 96 | }); 97 | 98 | it("will browse about", async function() { 99 | this.timeout(15000); 100 | await driver.get(`${SERVER_STATIC}/search.html`); 101 | await driver.findElement(By.name("q")).sendKeys("test query\n"); 102 | await driver.findElement(By.css("button")).click(); 103 | await driver.wait(until.titleIs("Search results")); 104 | await driver.wait(until.elementLocated(By.css("a.result"))); 105 | await driver.findElement(By.css("a.result")).click(); 106 | await driver.wait(async () => { 107 | let url = await driver.getCurrentUrl(); 108 | return !url.includes("search-results.html"); 109 | }); 110 | await driver.wait(until.elementLocated(By.css("#first-link"))); 111 | await driver.findElement(By.css("#first-link")).click(); 112 | await driver.navigate().back(); 113 | await driver.navigate().back(); 114 | await driver.wait(until.elementLocated(By.css("a.result"))); 115 | let selectLinkOpeninNewTab = Key.chord(COMMAND_MOD, Key.RETURN); 116 | await driver.findElement(By.css("a.result")).sendKeys(selectLinkOpeninNewTab); 117 | // We want to be sure the Cmd+click opens a tab before we do the next step: 118 | await promiseTimeout(1000); 119 | 120 | /** ********************* 121 | * fetch the results */ 122 | let result = await collectInformation(driver); 123 | 124 | /** ********************** 125 | * analyze the results */ 126 | let pages = result.currentPages.concat(result.pendingPages); 127 | pages.sort((a, b) => a.loadTime > b.loadTime ? 
1 : -1); 128 | if (pages[0].url === "about:blank") { 129 | // Sometimes about:blank shows up in the history, and sometimes it doesn't (presumably related 130 | // to load time), so we remove it if it is the first 131 | pages.shift(); 132 | } 133 | function idToIndex(id) { 134 | return pages.map(p => p.id).indexOf(id); 135 | } 136 | function property(name) { 137 | return pages.map(p => p[name]); 138 | } 139 | let urls = pages.map(p => p.url); 140 | let expectedUrls = [ 141 | `${SERVER_STATIC}/search.html`, 142 | `${SERVER_STATIC}/search-results.html?q=test+query`, 143 | `${SERVER_STATIC}/search-destination.html`, 144 | `${SERVER_STATIC}/search-destination.html#first`, 145 | `${SERVER_STATIC}/search-destination.html`, 146 | `${SERVER_STATIC}/search-results.html?q=test+query`, 147 | `${SERVER_STATIC}/search-destination.html`, 148 | `${SERVER}/test-static/debug.html`, 149 | ]; 150 | assert.deepEqual(urls, expectedUrls); 151 | // Apparently driver.get() doesn't act like from_address_bar 152 | assert.deepEqual(property("from_address_bar"), [ 153 | false, false, false, false, false, false, false, false 154 | ], "from_address_bar"); 155 | // We went "back" to the 4th item (the google search) 156 | assert.deepEqual(property("forward_back"), [ 157 | false, false, false, false, true, true, false, false 158 | ], "forward_back"); 159 | assert.deepEqual(property("transitionType"), [ 160 | "link", 161 | "form_submit", // search result 162 | "link", // clicked on search result 163 | "link", // clicked on anchor link 164 | "link", // clicked on back...? 165 | "link", // clicked on back again 166 | undefined, // apparently open in new window is misunderstood 167 | "link", // driver.get looks like link? 168 | ], "transitionType"); 169 | assert.deepEqual(pages.map(p => idToIndex(p.sourceId)), [ 170 | -1, // Didn't come from anywhere, about:blank 171 | 0, // search page 172 | 1, // search result 173 | 2, // click on link 174 | 3, // went "back" to this page... FIXME: is this right? 175 | 4, // came from previous search result, 176 | 5, // something else... 
177 | 5, // mysterious extra copy of a page 178 | ]); 179 | assert.deepEqual(property("newTab"), [ 180 | false, false, false, false, false, false, true, false, 181 | ], "newTab"); 182 | assert.deepEqual(property("sourceClickText"), [ 183 | null, 184 | null, 185 | "A pretend destination", 186 | "first place", 187 | null, 188 | null, 189 | null, 190 | "A pretend destination", 191 | ], "sourceClickText"); 192 | assert.deepEqual(pages.map(p => !!p.unloadTime), [ 193 | true, true, true, true, true, true, 194 | false, false, // only the last two pages are still loaded 195 | ], "is unloaded"); 196 | assert.deepEqual(pages.map(p => typeof p.activeTime), [ 197 | "number", "number", "number", "number", "number", "number", "number", "number", 198 | ]); 199 | assert.deepEqual(property("closedReason"), [ 200 | "navigation", 201 | "navigation", 202 | "navigation", 203 | "navigation", 204 | "navigation", 205 | "navigation", 206 | null, 207 | null, // Only the last two pages haven't been redirected away 208 | ], "closedReason"); 209 | assert.deepEqual(property("title"), [ 210 | "Pretend Search", 211 | "Search results", 212 | "Pretend destination", 213 | null, 214 | null, 215 | "Search results", 216 | "Pretend destination", 217 | null, 218 | ], "captured title"); 219 | let searchResultLinks = [{ 220 | text: "A pretend destination", 221 | url: "http://localhost:11180/test-static/search-destination.html", 222 | }]; 223 | let destinationLinks = [ 224 | { 225 | elementId: "first-link", 226 | text: "first place", 227 | url: "#first", 228 | }, 229 | { 230 | elementId: "second-link", 231 | text: "second place", 232 | url: "#second", 233 | } 234 | ]; 235 | assert.deepEqual(property("linkInformation"), [ 236 | [], 237 | searchResultLinks, 238 | destinationLinks, 239 | null, // I'm not sure why these are null, probably because there isn't time to get the information? 240 | null, // that's not a good reason for null values, might be fragile in the future 241 | searchResultLinks, 242 | null, 243 | null, 244 | ]); 245 | return true; 246 | }); 247 | 248 | it("Will detect 404s", async function() { 249 | this.timeout(10000); 250 | let url = `${SERVER_STATIC}/does-not-exist.html`; 251 | await driver.get(url); 252 | await promiseTimeout(5000); 253 | let result = await collectInformation(driver); 254 | let page = result.pendingPages.filter(p => p.url.endsWith("does-not-exist.html"))[0]; 255 | assert.equal(page.statusCode, 404, `Status code not 404: ${page.statusCode}`); 256 | assert(page.contentType.startsWith("text/html"), `contentType: ${page.contentType}`); 257 | let filename = filenameForUrl(url); 258 | let pageData = JSON.parse(fs.readFileSync(filename, {encoding: "UTF-8"})); 259 | assert.equal(pageData.statusCode, 404); 260 | return true; 261 | }); 262 | 263 | it("Will detect cookies", async function() { 264 | this.timeout(10000); 265 | let url = `${SERVER}/cookie`; 266 | await driver.get(url); 267 | await promiseTimeout(500); 268 | await driver.get(url + "?remove=1"); 269 | await promiseTimeout(500); 270 | let result = await collectInformation(driver); 271 | let pages = result.currentPages.concat(result.pendingPages); 272 | pages.sort((a, b) => a.loadTime > b.loadTime ? 
1 : -1); 273 | // Depending on previous tests, there might be other pages before the one we care about 274 | for (let i = 0; i < pages.length; i++) { 275 | if (pages[i].url.endsWith("cookie")) { 276 | // The page we want to start with 277 | pages.splice(0, i); 278 | break; 279 | } 280 | } 281 | assert.deepEqual(pages.map(p => [p.hasCookie, p.hasSetCookie]), [ 282 | [false, true], // has no cookie, but did set one 283 | [true, true], // has no cookie, but did set the deleting cookie 284 | [false, false], // the debug page, sets no cookie, and cookie has been deleted 285 | ]); 286 | }); 287 | 288 | }); 289 | -------------------------------------------------------------------------------- /test/walk-configs/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "destinations": { 3 | "urls": [ 4 | "https://www.google.com", 5 | "https://news.ycombinator.com", 6 | "https://news.google.com", 7 | "https://www.reddit.com" 8 | ], 9 | "frequency": 0.05 10 | }, 11 | "queries": { 12 | "https://www.google.com": { 13 | "input[name='q']": 1.0 14 | }, 15 | "https://www.reddit.com": { 16 | "input[name='q']": 0.3 17 | }, 18 | "https://news.google.com": { 19 | "input[aria-label='Search']": 0.3 20 | } 21 | }, 22 | "searchTerms": "tornado watch associated press united states winter storm dallas cowboys real fake news oklahoma yodeling atlanta braves tennis los angeles cleveland interstate michigan missouri new york mets lake dancing minnesota" 23 | } 24 | -------------------------------------------------------------------------------- /test/walk-configs/news.json: -------------------------------------------------------------------------------- 1 | { 2 | "destinations": { 3 | "urls": [ 4 | "https://www.huffingtonpost.com/", 5 | "https://www.washingtonpost.com/", 6 | "https://www.wired.com/about/rss_feeds/", 7 | "https://www.telegraph.co.uk/", 8 | "https://www.bloomberg.com/", 9 | "https://www.reuters.com/", 10 | "http://www.dailymail.co.uk/", 11 | "https://www.usatoday.com/", 12 | "http://time.com/", 13 | "http://www.latimes.com/", 14 | "http://www.bbc.com/", 15 | "http://www.businessinsider.com/", 16 | "https://www.wired.com/", 17 | "https://www.nationalgeographic.com/", 18 | "https://www.buzzfeed.com/", 19 | "https://www.theatlantic.com/", 20 | "https://www.cbsnews.com/", 21 | "http://www.foxnews.com/", 22 | "https://www.nature.com/", 23 | "https://techcrunch.com/", 24 | "https://mashable.com/", 25 | "http://www.cbc.ca/", 26 | "https://www.ft.com/", 27 | "https://www.usnews.com/", 28 | "http://www.chicagotribune.com/", 29 | "https://www.sfgate.com/", 30 | "https://www.newyorker.com/", 31 | "http://www.abc.net.au/", 32 | "http://cbslocal.com/", 33 | "https://slate.com/", 34 | "https://gizmodo.com/", 35 | "https://www.engadget.com/", 36 | "http://www.nydailynews.com/", 37 | "https://www.vice.com/", 38 | "http://fortune.com/", 39 | "https://www.theverge.com/", 40 | "https://www.sciencedaily.com/", 41 | "https://www.bloomberg.com/businessweek", 42 | "https://www.psychologytoday.com/", 43 | "https://nypost.com/", 44 | "https://www.indiatimes.com/", 45 | "https://www.marketwatch.com/", 46 | "https://www.fastcompany.com/", 47 | "https://www.scientificamerican.com/", 48 | "https://www.thetimes.co.uk/", 49 | "https://www.theglobeandmail.com/", 50 | "https://www.entrepreneur.com/", 51 | "http://www.ox.ac.uk/", 52 | "https://hbr.org/", 53 | "https://www.inc.com/", 54 | "http://www.sciencemag.org/", 55 | "http://www.newsweek.com/", 56 | "https://www.rollingstone.com/", 57 
| "http://nymag.com/", 58 | "https://www.politico.com/", 59 | "https://www.hollywoodreporter.com/", 60 | "https://academic.oup.com/journals/", 61 | "https://www.ap.org/", 62 | "https://www.vox.com/", 63 | "http://www.adweek.com/", 64 | "http://thehill.com/", 65 | "https://venturebeat.com/", 66 | "https://www.today.com/", 67 | "http://www.mtv.com/", 68 | 69 | "https://www.cnn.com/", 70 | "http://www.msnbc.com/", 71 | "https://www.aol.com/news/", 72 | "https://www.nytimes.com/", 73 | "http://www.drudgereport.com/", 74 | "http://abcnews.go.com/", 75 | "https://www.wsj.com/", 76 | "http://www.mcclatchydc.com/", 77 | "http://talkingpointsmemo.com", 78 | 79 | "https://www.npr.org/", 80 | "https://www.nbcnews.com/", 81 | "http://www.breitbart.com/", 82 | "https://www.denverpost.com/", 83 | "http://newyork.cbslocal.com/", 84 | "http://losangeles.cbslocal.com/", 85 | "http://chicago.cbslocal.com/", 86 | "http://dfw.cbslocal.com/", 87 | "https://www.wusa9.com/", 88 | "http://washington.cbslocal.com/", 89 | "http://boston.cbslocal.com/", 90 | "http://philadelphia.cbslocal.com/", 91 | "http://minnesota.cbslocal.com/", 92 | "http://sanfrancisco.cbslocal.com/", 93 | "http://detroit.cbslocal.com/", 94 | "http://tampa.cbslocal.com/", 95 | "https://www.newsmax.com/", 96 | "https://www.washingtontimes.com/", 97 | "https://www.boston.com/", 98 | "https://www.mercurynews.com/", 99 | "http://www.philly.com/", 100 | "https://www.seattletimes.com/", 101 | "http://www.miamiherald.com/", 102 | "http://observer.com/", 103 | "http://www.stltoday.com/", 104 | "http://gothamist.com/", 105 | "http://ktla.com/", 106 | "https://www.seattlepi.com/", 107 | "https://www.newsday.com/", 108 | "https://chicago.suntimes.com/", 109 | "http://www.laweekly.com/", 110 | "http://abc13.com/", 111 | "https://wtop.com/", 112 | "http://www.bostonherald.com/", 113 | "https://www.nbcnewyork.com/", 114 | "http://wgntv.com/", 115 | "http://abc7news.com/", 116 | "http://www.autonews.com/", 117 | "http://kdvr.com/", 118 | "http://www.miaminewtimes.com/", 119 | "https://www.twincities.com/", 120 | "http://www.kxan.com/", 121 | "https://www.nbcchicago.com/", 122 | "https://www.nbcwashington.com/", 123 | "http://fox2now.com/", 124 | "http://longisland.news12.com/", 125 | "https://www.nbclosangeles.com/", 126 | "http://pix11.com/", 127 | "http://www.phillyvoice.com/", 128 | "https://www.villagevoice.com/", 129 | "http://www.westword.com/", 130 | "http://www.houstonpress.com/", 131 | "http://www.kron4.com/", 132 | "https://www.nbcphiladelphia.com/", 133 | "http://www.dailyherald.com/", 134 | "https://www.nbcsandiego.com/", 135 | "https://www.nbcdfw.com/", 136 | "http://www.phoenixnewtimes.com/", 137 | "http://arlington.wickedlocal.com/", 138 | "https://www.amny.com/", 139 | "http://www.chicagobusiness.com/", 140 | "https://www.pe.com/", 141 | "https://www.wxyz.com/", 142 | "https://whdh.com/", 143 | "http://www.wfla.com/", 144 | "http://fox5sandiego.com/", 145 | "https://www.nbcmiami.com/", 146 | "https://wsvn.com/", 147 | "https://www.riverfronttimes.com/", 148 | "https://www.abcactionnews.com/", 149 | "http://www.wivb.com/", 150 | "https://www.chicagoreader.com/", 151 | "https://www.minnpost.com/", 152 | "http://www.news10.com/", 153 | "https://www.metrotimes.com/", 154 | "https://www.texasobserver.org/", 155 | "https://billypenn.com/", 156 | "https://timesofsandiego.com/", 157 | "https://www.nysun.com/", 158 | "http://www.laobserved.com/", 159 | "https://citylimits.org/", 160 | "http://www.miamitodaynews.com/", 161 | "http://kplr11.com/", 162 | 
"https://atlantaintownpaper.com/", 163 | "http://heartlandnewsfeed.com/", 164 | "http://laindependent.com/", 165 | 166 | "http://www.startribune.com/", 167 | "https://weather.com/", 168 | "https://www.forbes.com/", 169 | "https://www.cnbc.com/", 170 | "https://www.theguardian.com/", 171 | "https://www.salon.com/", 172 | "http://dailycaller.com/", 173 | "https://www.theblaze.com/", 174 | 175 | "https://news.google.com", 176 | "https://www.yahoo.com/news/", 177 | "https://medium.com/", 178 | 179 | "https://www.businessinsider.in/rss_feeds.cms", 180 | "http://www.bbc.com/news/10628494", 181 | "http://www.latimes.com/la-rssinfopage-htmlstory.html", 182 | "https://www.huffingtonpost.com/syndication", 183 | "https://talkingpointsmemo.com/feeds", 184 | "https://archive.nytimes.com/www.nytimes.com/services/xml/rss/index.html?8dpc", 185 | "https://blog.feedspot.com/nytimes_rss_feeds/", 186 | "https://www.reuters.com/tools/rss", 187 | "https://www.huffingtonpost.com/syndication", 188 | "https://www.washingtontimes.com/feeds/", 189 | "http://www.dailymail.co.uk/home/article-2684527/RSS-Feeds.html", 190 | "https://www.usatoday.com/rss/", 191 | "http://content.time.com/time/rss/", 192 | "https://www.buzzfeed.com/rss", 193 | "http://dailycaller.com/rss-feeds/", 194 | "https://www.theguardian.com/help/feeds", 195 | "https://www.cnbc.com/rss-feeds/", 196 | "https://developer.yahoo.com/finance/?guccounter=1", 197 | "https://developer.yahoo.com/rss/", 198 | "http://www.startribune.com/rss-index/112994779/", 199 | "http://www.startribune.com/rss-index/112994779/", 200 | "http://www.miamiherald.com/site-services/rss/", 201 | "http://www.laobserved.com/pages/feeds.php", 202 | "https://www.nysun.com/rss.php", 203 | "https://www.metrotimes.com/detroit/Syndication", 204 | "https://www.minnpost.com/rss-feed-list", 205 | "https://www.chicagoreader.com/chicago/Syndication/Page", 206 | "http://www.dailyherald.com/rss/", 207 | "http://www.phillyvoice.com/rss-feeds/", 208 | "http://www.autonews.com/section/syndication", 209 | "http://www.laweekly.com/arts/rss-feeds-page-2143509", 210 | "http://www.thesuntimes.com/section/feed", 211 | "https://www.seattlepi.com/rss/", 212 | "http://www.stltoday.com/rss/", 213 | "http://observer.com/rss-feeds/", 214 | "https://www.seattletimes.com/rss-feeds/#all-content", 215 | "https://www.seattletimes.com/rss-feeds/#local-news", 216 | "https://www.seattletimes.com/rss-feeds/#nation-and-world", 217 | "https://www.seattletimes.com/rss-feeds/#business", 218 | "https://www.seattletimes.com/rss-feeds/#sports", 219 | "https://www.seattletimes.com/rss-feeds/#entertainment", 220 | "https://www.seattletimes.com/rss-feeds/#life", 221 | "https://www.seattletimes.com/rss-feeds/#opinion", 222 | "https://www.seattletimes.com/rss-feeds/#photo-and-video", 223 | "http://www.philly.com/philly/about/rss_index/", 224 | "https://www.boston.com/rss-feeds", 225 | "https://www.washingtontimes.com/feeds/", 226 | "https://www.newsmax.com/rss/", 227 | "https://www.denverpost.com/web-feeds/", 228 | "http://www.mcclatchydc.com/customer-service/rss/", 229 | "http://www.wsj.com/public/page/rss_news_and_feeds_podcast.html", 230 | "http://www.wsj.com/public/page/rss_news_and_feeds_videos.html", 231 | "http://www.wsj.com/public/page/rss_news_and_feeds_blogs.html", 232 | "http://www.wsj.com/public/page/rss_news_and_feeds.html", 233 | "http://abcnews.go.com/Site/page/rss--3520115", 234 | "https://archive.nytimes.com/www.nytimes.com/services/xml/rss/index.html?8dpc", 235 | "http://www.cnn.com/services/rss/", 236 | 
"http://thehill.com/resources/rss-feeds", 237 | "https://www.politico.com/rss", 238 | "https://www.rollingstone.com/services/rss", 239 | "http://www.newsweek.com/rss", 240 | "http://www.sciencemag.org/about/email-alerts-and-rss-feeds", 241 | "https://hbphelp.zendesk.com/hc/en-us/articles/215259487-RSS-Feed", 242 | "http://www.rssmix.com/uk-news-feeds", 243 | "https://www.marketwatch.com/rss/", 244 | "https://timesofindia.indiatimes.com/rss.cms", 245 | "https://nypost.com/rssfeeds/", 246 | "https://www.sciencedaily.com/newsfeeds.htm", 247 | "http://www.nydailynews.com/services/feeds", 248 | "http://minnesota.cbslocal.com/rss-feeds/", 249 | "http://abcnews.go.com/Site/page/rss--3520115", 250 | "https://www.newyorker.com/about/feeds", 251 | "https://www.sfgate.com/rss/", 252 | "http://www.chicagotribune.com/cs-rssfeeds-htmlstory.html", 253 | "https://www.usnews.com/info/features/rss-feeds", 254 | "http://www.cbc.ca/rss/", 255 | "https://www.nature.com/webfeeds/index.html", 256 | "http://www.foxnews.com/about/rss/", 257 | "https://www.cbsnews.com/rss/", 258 | "https://www.theatlantic.com/follow-the-atlantic/" 259 | ], 260 | "frequency": 0.10 261 | }, 262 | "queries": { 263 | "https://news.google.com": { 264 | "input[aria-label='Search']": 0.3 265 | }, 266 | "https://www.yahoo.com/news": { 267 | "input[aria-label='Search']": 0.3 268 | } 269 | }, 270 | "searchTerms": "tornado watch associated press united states winter storm dallas cowboys real fake news oklahoma yodeling atlanta braves tennis los angeles cleveland interstate michigan missouri new york mets lake dancing minnesota" 271 | } 272 | --------------------------------------------------------------------------------
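A note on the walk configs above: loadConfig() in test/random-walk.js always reads walk-configs/default.json first and then merges the config named by the CONFIG environment variable over it, with a literal "*" entry in destinations.urls or searchTerms splicing in the values accumulated so far instead of replacing them. The sketch below illustrates that merge; the my-news.json name and its contents are hypothetical, not part of the repository:

    // Illustrative sketch only: how a hypothetical walk-configs/my-news.json
    // would be layered over default.json by loadConfig() in test/random-walk.js.
    const defaults = require("./walk-configs/default.json");

    const myNews = {
      destinations: {
        // "*" keeps the URLs merged so far (here: default.json's) and appends new ones:
        urls: ["*", "https://example.com/"],
        frequency: 0.2,
      },
      // A string value is split on whitespace into individual search terms:
      searchTerms: "solar eclipse transit strike",
    };

    // The merged result, following loadConfig()'s rules:
    const merged = {
      destinations: {
        urls: defaults.destinations.urls.concat(
          myNews.destinations.urls.filter(u => u !== "*")),
        frequency: myNews.destinations.frequency,
      },
      queries: defaults.queries, // not overridden by my-news.json, so the defaults remain
      searchTerms: myNews.searchTerms.trim().split(/[\s\n]+/g),
    };

The walk and fetcher entry points are driven by environment variables rather than flags: SEED fixes the random seed, CONFIG names the extra walk config (resolved against test/walk-configs/ when it is not an existing path, with ".json" appended if missing), and PHA_DATA points the fetcher at a data directory; `node test/random-walk.js` starts a walk, while `node test/random-walk.js fetch` fetches the configured destination URLs. FIREFOX_CHANNEL and NO_CLOSE play the analogous role for test/test.js.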