├── .circleci
    └── config.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── Makefile
├── README.md
├── cli
    ├── .flake8
    ├── Makefile
    ├── README.rst
    ├── dev-requirements.txt
    ├── doc-building-requirements.txt
    ├── fathom_web
    │   ├── __init__.py
    │   ├── accuracy.py
    │   ├── commands
    │   │   ├── __init__.py
    │   │   ├── extract.py
    │   │   ├── fox.py
    │   │   ├── histogram.py
    │   │   ├── label.py
    │   │   ├── list.py
    │   │   ├── pick.py
    │   │   ├── serve.py
    │   │   ├── test.py
    │   │   └── train.py
    │   ├── test
    │   │   ├── __init__.py
    │   │   ├── resources
    │   │   │   └── train
    │   │   │   │   ├── vectorize_ruleset.js
    │   │   │   │   ├── vectorize_sample_1.html
    │   │   │   │   └── vectorize_sample_2.html
    │   │   ├── test_extract.py
    │   │   ├── test_label.py
    │   │   ├── test_list.py
    │   │   ├── test_pick.py
    │   │   ├── test_test.py
    │   │   ├── test_train.py
    │   │   └── test_utils.py
    │   ├── utils.py
    │   └── vectorizer.py
    ├── setup.cfg
    └── setup.py
├── docs
    ├── Makefile
    ├── clustering.rst
    ├── commands
    │   ├── extract.rst
    │   ├── fox.rst
    │   ├── histogram.rst
    │   ├── label.rst
    │   ├── list.rst
    │   ├── pick.rst
    │   ├── serve.rst
    │   ├── test.rst
    │   └── train.rst
    ├── conf.py
    ├── debugging.rst
    ├── deploy-docs
    ├── development.rst
    ├── example.rst
    ├── exceptions.rst
    ├── fnodes.rst
    ├── glossary.rst
    ├── img
    │   ├── histogram.png
    │   └── price_tracker_screenshot.png
    ├── index.rst
    ├── installing.rst
    ├── integrating.rst
    ├── intro.rst
    ├── maintaining.rst
    ├── rules.rst
    ├── ruleset.rst
    ├── samples.rst
    ├── theme
    │   ├── static
    │   │   └── tweaks.css
    │   └── theme.conf
    ├── training.rst
    ├── utilities.rst
    ├── versions.rst
    ├── zoo.rst
    └── zoo
    │   ├── login.rst
    │   ├── new_password.rst
    │   ├── price_tracker.rst
    │   ├── smoot_articles.rst
    │   └── smoot_shopping.rst
├── fathom
    ├── .babelrc
    ├── .eslintignore
    ├── .eslintrc.yml
    ├── .npmignore
    ├── Makefile
    ├── clusters.mjs
    ├── exceptions.mjs
    ├── fnode.mjs
    ├── index.mjs
    ├── lhs.mjs
    ├── package-lock.json
    ├── package.json
    ├── rhs.mjs
    ├── rollup.config.js
    ├── rule.mjs
    ├── ruleset.mjs
    ├── side.mjs
    ├── test
    │   ├── browser
    │   │   ├── http_server.js
    │   │   ├── isVisible.html
    │   │   └── isVisible.js
    │   ├── clusters_tests.mjs
    │   ├── demos.mjs
    │   ├── lhs_tests.mjs
    │   ├── rhs_tests.mjs
    │   ├── rule_tests.mjs
    │   ├── ruleset_tests.mjs
    │   ├── side_tests.mjs
    │   └── utils_tests.mjs
    ├── utils.mjs
    ├── utilsForBackend.mjs
    └── utilsForFrontend.mjs
├── fathom_fox
    ├── .eslintignore
    ├── .eslintrc.json
    ├── LICENSE
    ├── README.md
    ├── Tagged Head.afdesign
    ├── addon
    │   ├── actionMenu.js
    │   ├── background.js
    │   ├── corpus.js
    │   ├── devtoolsOpener.js
    │   ├── devtoolsPanel.js
    │   ├── download.js
    │   ├── icons
    │   │   └── icon.svg
    │   ├── manifest.json
    │   ├── measureWindowSize.js
    │   ├── pages
    │   │   ├── actionMenu.html
    │   │   ├── blank.html
    │   │   ├── corpus.html
    │   │   ├── devtoolsOpener.html
    │   │   ├── devtoolsPanel.html
    │   │   ├── evaluate.html
    │   │   └── vector.html
    │   ├── utils.js
    │   ├── vector.js
    │   └── visit.js
    ├── package.json
    ├── rollup.config.js
    ├── src
    │   ├── contentScript.js
    │   ├── evaluate.js
    │   ├── rollup-plugin-webpack-postcss
    │   │   ├── LICENSE.md
    │   │   ├── README.md
    │   │   └── rollup-plugin-webpack-postcss.js
    │   └── rulesets.js
    └── yarn.lock
└── smoo


/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2.1
 2 | 
 3 | orbs:
 4 |   browser-tools: circleci/browser-tools@1.1.1
 5 | 
 6 | jobs:
 7 |   test_js:
 8 |     docker:
 9 |       - image: cimg/node:15.3.0-browsers
10 |     environment:
11 |       MOZ_HEADLESS: 1
12 |     steps:
13 |       - browser-tools/install-firefox:
14 |           version: 86.0.1
15 |       - checkout
16 |       - run: make -C fathom lint test
17 |       # Upload new coveralls stats only on master, which is the only place
18 |       # COVERALLS_REPO_TOKEN is defined:
19 |       - run:
20 |           name: Publish code coverage (if on master)
21 |           command: |
22 |            if [ ! -z "$COVERALLS_REPO_TOKEN" ]
23 |            then
24 |                make -C fathom coveralls
25 |            fi
26 |   test_python:
27 |     docker:
28 |       - image: cimg/python:3.7.9-node
29 |     environment:
30 |       MOZ_HEADLESS: 1
31 |     steps:
32 |       - browser-tools/install-firefox:
33 |           version: 86.0.1
34 |       - checkout
35 |       - restore_cache:
36 |           keys:
37 |             - venv-v1-{{ arch }}-{{ checksum "cli/dev-requirements.txt" }}-{{ checksum "cli/doc-building-requirements.txt" }}-{{ checksum "cli/setup.py" }}
38 |       - run: make -C cli lint test
39 |       - run: make docs
40 |       - save_cache:
41 |           key: venv-v1-{{ arch }}-{{ checksum "cli/dev-requirements.txt" }}-{{ checksum "cli/doc-building-requirements.txt" }}-{{ checksum "cli/setup.py" }}
42 |           paths:
43 |             - cli/venv
44 |       # Upload new docs only on master, which is the only place GH_TOKEN is
45 |       # defined. This saves time over doing it in a separate job.
46 |       - run:
47 |           name: Publish docs (if on master)
48 |           command: |
49 |            if [ ! -z "$GH_TOKEN" ]
50 |            then
51 |                docs/deploy-docs
52 |            fi
53 | 
# A single workflow that runs the JS job and the Python/docs job. The nested
# workflows "version" key is omitted: it is only needed by version 2 configs,
# and this file declares version 2.1 at the top.
workflows:
  js_python_and_docs:
    jobs:
      - test_js
      - test_python
60 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /cli/build
 2 | /cli/dist
 3 | /cli/*.egg-info
 4 | /cli/runs
 5 | /cli/venv
 6 | /cli/fathom_web/fathom.zip
 7 | /docs/_build
 8 | /docs/venv
 9 | /fathom/.npm_installed
10 | /fathom/node_modules
11 | /fathom/*.log
12 | /fathom/.nyc_output
13 | /fathom/coverage
14 | /fathom/dist
15 | /fathom/LICENSE
16 | /fathom/README.md
17 | /fathom/**/*.js
18 | !/fathom/rollup.config.js
19 | !/fathom/test/browser/*
20 | /fathom_fox/node_modules
21 | /fathom_fox/addon/contentScript.js
22 | /fathom_fox/addon/web-ext-artifacts
23 | /fathom_fox/addon/evaluate.js
24 | /fathom_fox/addon/simmer.js
25 | /fathom_fox/addon/rulesets.js
26 | **/__pycache__
27 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Community Participation Guidelines
 2 | 
 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 
 4 | For more details, please read the
 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 
 6 | 
 7 | ## How to Report
 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page.
 9 | 
10 | <!--
11 | ## Project Specific Etiquette
12 | 
13 | In some cases, there will be additional project etiquette i.e.: (https://bugzilla.mozilla.org/page.cgi?id=etiquette.html).
14 | Please update for your project.
15 | -->
16 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Convenience targets for executing common actions from the root of the repo
 2 | 
 3 | all: docs
 4 | 	$(MAKE) -C fathom
 5 | 	$(MAKE) -C cli
 6 | 
 7 | docs:
 8 | 	$(MAKE) -C docs clean html
 9 | 
10 | lint:
11 | 	$(MAKE) -C cli lint
12 | 	$(MAKE) -C fathom lint
13 | 
14 | test:
15 | 	$(MAKE) -C cli test
16 | 	$(MAKE) -C fathom test
17 | 
18 | clean:
19 | 	$(MAKE) -C cli clean
20 | 	$(MAKE) -C docs clean
21 | 	$(MAKE) -C fathom clean
22 | 
23 | 
24 | .PHONY: clean docs lint test
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fathom
2 | 
3 | Fathom is a supervised-learning system for recognizing parts of web pages—pop-ups, address forms, slideshows—or for classifying a page as a whole. A DOM flows in one side, and DOM nodes flow out the other, tagged with types and probabilities that those types are correct. A Prolog-like language makes it straightforward to specify the “smells” that suggest each type, and a neural-net-based trainer determines the optimal contribution of each smell. Finally, the FathomFox web extension lets you collect and label a corpus of web pages for training.
4 | 
5 | Continue reading at <https://mozilla.github.io/fathom/intro.html#why>.
6 | 
7 | __[Documentation](https://mozilla.github.io/fathom)__
8 | 


--------------------------------------------------------------------------------
/cli/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E129,E501,E741,F841,W292,W391,W503,W504
3 | exclude = build


--------------------------------------------------------------------------------
/cli/Makefile:
--------------------------------------------------------------------------------
 1 | # We avoid $(CURDIR) because it spits out /cygdrive/c/... on Windows Cygwin
 2 | # installs and leads to things that don't work.
 3 | VIRTUAL_ENV = ./venv
 4 | PYTHON3 ?= python3
 5 | # PATH seems to be exported even without "export", but I kept it to be explicit.
 6 | export PATH := $(VIRTUAL_ENV)/bin:$(VIRTUAL_ENV)/Scripts:$(PATH)
 7 | 
 8 | all: venv fathom.zip
 9 | 
10 | release: venv fathom.zip
11 | 	PATH="$(PATH)" python setup.py sdist bdist_wheel
12 | 
13 | lint: venv npm_installed
14 | 	@PATH="$(PATH)" flake8 --exclude $(VIRTUAL_ENV) .
15 | 	@cd ../fathom && node_modules/.bin/eslint -c .eslintrc.yml ../cli/fathom_web/test/resources
16 | 
17 | test: venv fathom.zip
18 | 	@PATH="$(PATH)" pytest fathom_web/test
19 | 
20 | # I'm open to ideas on how to fire this off only when necessary. But it's
21 | # pretty fast, at least.
22 | fathom.zip:
23 | 	cd .. && git archive --format zip --output cli/fathom_web/fathom.zip HEAD -9 fathom fathom_fox
24 | 
25 | clean:
26 | 	rm -rf $(VIRTUAL_ENV) fathom.zip
27 | 
28 | venv: $(VIRTUAL_ENV)/pyvenv.cfg
29 | 
30 | 
31 | # Private targets:
32 | 
33 | # Make a virtualenv at $VIRTUAL_ENV if there isn't one or if requirements have
34 | # changed. Install the dev requirements and the actual requirements.
35 | #
36 | # If the prereqs for this target change, UPDATE THE CACHE KEYS in the CircleCI
37 | # config as well!
38 | $(VIRTUAL_ENV)/pyvenv.cfg: dev-requirements.txt doc-building-requirements.txt setup.py
39 | 	$(PYTHON3) -m venv $(VIRTUAL_ENV)
40 | 	# We don't path-qualify pip3 because python -m venv on Travis creates a
41 | 	# venv with no pip executable in it.
42 | 	PATH="$(PATH)" pip3 install -r dev-requirements.txt -r doc-building-requirements.txt
43 | 	PATH="$(PATH)" pip3 install -e .
44 | 
45 | npm_installed:
46 | 	@$(MAKE) -C ../fathom .npm_installed
47 | 
48 | .PHONY: release lint test clean venv npm_installed
49 | 


--------------------------------------------------------------------------------
/cli/README.rst:
--------------------------------------------------------------------------------
 1 | ========================
 2 | Fathom Commandline Tools
 3 | ========================
 4 | 
This is the commandline trainer and other tools for `Fathom <https://mozilla.github.io/fathom/>`_, which itself is a supervised-learning system for recognizing parts of web pages. See `docs for the trainer <https://mozilla.github.io/fathom/training.html#running-the-trainer>`_ and `reference docs for the other tools <https://mozilla.github.io/fathom/index.html#command-reference>`_ in the Fathom docs.
 6 | 
 7 | Version History
 8 | ===============
 9 | 
10 | See the `version history <https://mozilla.github.io/fathom/versions.html>`_ in the main Fathom docs, under the "CLI tools" headers.
11 | 


--------------------------------------------------------------------------------
/cli/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | # Requirements needed for running tests and such. Broken off into a separate
2 | # file so Make can notice if they change and re-run pip install.
3 | flake8==3.8.4
4 | flake8-quotes==3.2.0
5 | pytest==5.1.2
6 | wheel==0.34.2
7 | 


--------------------------------------------------------------------------------
/cli/doc-building-requirements.txt:
--------------------------------------------------------------------------------
 1 | alabaster==0.7.12
 2 | Babel==2.8.0
 3 | certifi==2020.6.20
 4 | chardet==3.0.4
 5 | docutils==0.16
 6 | idna==2.10
 7 | imagesize==1.2.0
 8 | Jinja2==2.11.2
 9 | MarkupSafe==1.1.1
10 | packaging==20.4
11 | parsimonious==0.7.0
12 | pbr==5.4.5
13 | Pygments==2.6.1
14 | pyparsing==2.4.7
15 | pytz==2020.1
16 | requests==2.24.0
17 | six==1.15.0
18 | snowballstemmer==2.0.0
19 | Sphinx==3.1.2
20 | sphinx-click==2.3.2
21 | sphinx-js==3.0
22 | sphinx-rtd-theme==0.5.0
23 | sphinxcontrib-applehelp==1.0.2
24 | sphinxcontrib-devhelp==1.0.2
25 | sphinxcontrib-htmlhelp==1.0.3
26 | sphinxcontrib-jsmath==1.0.1
27 | sphinxcontrib-qthelp==1.0.3
28 | sphinxcontrib-serializinghtml==1.1.4
29 | urllib3==1.25.9
30 | 


--------------------------------------------------------------------------------
/cli/fathom_web/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/cli/fathom_web/__init__.py


--------------------------------------------------------------------------------
/cli/fathom_web/commands/__init__.py:
--------------------------------------------------------------------------------
 1 | from .extract import extract
 2 | from .fox import fox
 3 | from .histogram import histogram
 4 | from .label import label
 5 | from .list import list
 6 | from .pick import pick
 7 | from .serve import serve
 8 | from .test import test
 9 | from .train import train
10 | 
11 | from click import group
12 | 
13 | 
@group()
def fathom():
    """Pass fathom COMMAND --help to learn more about an individual command."""


# Attach every subcommand imported above to the top-level ``fathom`` group,
# in the same order as before.
for _subcommand in (extract, fox, histogram, label, list, pick, serve, test, train):
    fathom.add_command(_subcommand)
# Don't leave the loop variable behind as a module attribute.
del _subcommand
28 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/fox.py:
--------------------------------------------------------------------------------
 1 | from contextlib import contextmanager
 2 | from zipfile import ZipFile
 3 | 
 4 | import click
 5 | from click import command, option, pause
 6 | 
 7 | from ..utils import path_or_none
 8 | from ..vectorizer import fathom_fox_addon, fathom_zip, running_firefox
 9 | 
10 | 
@command()
@option('--ruleset', '-r',
        type=click.Path(exists=True, dir_okay=False, resolve_path=True),
        callback=path_or_none,
        help='The rulesets.js file containing your rules. The file must have no imports except from fathom-web, so pre-bundle if necessary. [default: the demo ruleset included with FathomFox]')
def fox(ruleset):
    """
    Launch Firefox with FathomFox installed.

    This launches a fresh instance of Firefox with a blank profile as a
    suitably clean environment for labeling samples.

    """
    # Each context manager feeds the next: the open ruleset file goes into the
    # addon builder, which yields an (addon path, geckodriver path) pair used
    # to start Firefox. Everything unwinds once the user presses a key.
    with ruleset_or_default(ruleset) as ruleset_file, \
            fathom_fox_addon(ruleset_file) as (addon_path, geckodriver_path), \
            running_firefox(addon_path, True, geckodriver_path):
        pause(info='Press any key to quit.')
29 | 
30 | 
@contextmanager
def ruleset_or_default(ruleset_path_or_none):
    """Yield the ruleset file-like object to use, opened in binary mode.

    This allows us to conditionally call various needed context managers.

    :arg ruleset_path_or_none: An object with an ``open()`` method (such as a
        ``pathlib.Path``) pointing at the user's ruleset, or a falsy value to
        fall back to the demo ruleset stored in the Fathom zip archive.

    The yielded file is closed when the ``with`` block that consumes this
    context manager exits.

    """
    if ruleset_path_or_none:
        with ruleset_path_or_none.open('rb') as ruleset_file:
            yield ruleset_file
    else:
        # Go get the default demo ruleset:
        with fathom_zip() as zip_file:
            # Use ZipFile as a context manager so the archive is closed
            # deterministically, and avoid shadowing the ``zip`` builtin.
            with ZipFile(zip_file) as archive:
                # Opens in binary mode:
                with archive.open('fathom_fox/src/rulesets.js') as default_ruleset:
                    yield default_ruleset
48 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/histogram.py:
--------------------------------------------------------------------------------
  1 | from math import ceil
  2 | from pathlib import Path
  3 | 
  4 | import click
  5 | from click import argument, BadOptionUsage, command, get_terminal_size, option, style
  6 | from more_itertools import pairwise
  7 | import numpy
  8 | 
  9 | from ..utils import path_or_none, tensors_from
 10 | from ..vectorizer import make_or_find_vectors
 11 | 
 12 | 
 13 | @command()
 14 | @argument('training_set',
 15 |           type=click.Path(exists=True, resolve_path=True),
 16 |           metavar='TRAINING_SET_FOLDER')
 17 | @option('--ruleset', '-r',
 18 |         type=click.Path(exists=True, dir_okay=False, resolve_path=True),
 19 |         callback=path_or_none,
 20 |         help='The rulesets.js file containing your rules. The file must have no imports except from fathom-web, so pre-bundle if necessary.')
 21 | @option('--trainee',
 22 |         type=str,
 23 |         metavar='ID',
 24 |         help='The trainee ID of the ruleset you are testing. Usually, this is the same as the type you are testing.')
 25 | @option('--training-cache',
 26 |         type=click.Path(dir_okay=False, resolve_path=True),
 27 |         callback=path_or_none,
 28 |         help='Where to cache training vectors to speed future testing runs. Any existing file will be overwritten. [default: vectors/training_yourTraineeId.json next to your ruleset]')
 29 | @option('--delay',
 30 |         default=5,
 31 |         type=int,
 32 |         show_default=True,
 33 |         help='Number of seconds to wait for a page to load before vectorizing it')
 34 | @option('--tabs',
 35 |         default=16,
 36 |         type=int,
 37 |         show_default=True,
 38 |         help='Number of concurrent browser tabs to use while vectorizing')
 39 | @option('--show-browser',
 40 |         default=False,
 41 |         is_flag=True,
 42 |         help='Show browser window while vectorizing. (Browser runs in headless mode by default.)')
 43 | @option('--buckets', '-b',
 44 |         default=10,
 45 |         type=int,
 46 |         show_default=True,
 47 |         help='Number of histogram buckets to use for non-boolean features')
 48 | @option('rules', '--rule',
 49 |         type=str,
 50 |         multiple=True,
 51 |         help='The rule to graph. Can be repeated. Omitting this graphs all rules.')
 52 | def histogram(training_set, ruleset, trainee, training_cache, delay, tabs, show_browser, buckets, rules):
 53 |     """Show a histogram of rule scores.
 54 | 
 55 |     We also break down what proportion of each bucket comprised positive or
 56 |     negative samples. Altogether, this gives you an idea whether a rule is
 57 |     broadly applicable, discriminatory, and spitting out what you expect.
 58 | 
 59 |     """
 60 |     training_set = Path(training_set)
 61 |     if training_set.is_dir():
 62 |         if not ruleset:
 63 |             raise BadOptionUsage('ruleset', 'A --ruleset file must be specified when TRAINING_SET_FOLDER is passed a directory.')
 64 |         if not trainee:
 65 |             raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TRAINING_SET_FOLDER is passed a directory.')
 66 | 
 67 |     training_data = make_or_find_vectors(
 68 |         ruleset,
 69 |         trainee,
 70 |         training_set,
 71 |         training_cache,
 72 |         show_browser,
 73 |         'training',
 74 |         delay,
 75 |         tabs)
 76 |     training_pages = training_data['pages']
 77 |     x, y, num_yes, _ = tensors_from(training_pages)
 78 |     feature_names = training_data['header']['featureNames']
 79 |     print_feature_report(feature_metrics(feature_names, x, y, buckets, rules or feature_names))
 80 | 
 81 | 
 82 | def feature_metrics(feature_names, x, y, buckets, enabled_rules):
 83 |     x_t = x.T  # [[...feature0 values across all pages...], [...feature1 values...], ...].
 84 |     for name, values in zip(feature_names, x_t):
 85 |         if name not in enabled_rules:
 86 |             continue
 87 |         is_boolean = is_boolean_feature(values)
 88 |         _, boundaries = numpy.histogram(values.numpy(),
 89 |                                         bins=2 if is_boolean else buckets)
 90 |         highest_boundary = boundaries[-1]
 91 |         bars = []
 92 |         for boundary, (low_bound, high_bound) in zip(boundaries, pairwise(boundaries)):
 93 |             is_last_time = high_bound == highest_boundary
 94 | 
 95 |             # Whether each feature value is a member of this bucket. Last
 96 |             # interval is inclusive on the right.
 97 |             x_is_for_this_bar = ((values >= low_bound) &
 98 |                                  ((values <= high_bound) if is_last_time else
 99 |                                   (values < high_bound)))
100 | 
101 |             y_for_this_bar = y.T[0].masked_select(x_is_for_this_bar)
102 |             positives = (y_for_this_bar.numpy() == 1).sum()
103 |             negatives = len(y_for_this_bar) - positives
104 |             label = str(ceil(boundary)) if is_boolean else f'{boundary:.1f}'
105 |             bars.append((label, positives, negatives))
106 |         yield name, bars
107 | 
108 | 
def print_feature_report(metrics):
    """Print a terminal bar chart for each feature.

    :arg metrics: An iterable of ``(feature name, bars)`` pairs, where
        ``bars`` is a list of ``(label, positives, negatives)`` tuples, as
        yielded by ``feature_metrics()``

    Each bucket is drawn as one line: its label, a bar segment for the
    positive samples, a bar segment for the negative samples, and the total.
    Bars are scaled so the fullest bucket roughly fills the terminal width.

    """
    def bar(length, label):
        """Return a bar of about the given length with the given label printed
        on it.

        We may cheat and expand a bar a bit to fit the label.

        """
        if not label:
            # Don't expand a bar just to print a 0. The bar's absence serves.
            label = ''
        return ('{label: ^%i}' % length).format(label=label)

    term_width = get_terminal_size()[0]
    pos_style = style('', fg='black', bg='bright_green', bold=True, reset=False)
    neg_style = style('', fg='bright_white', bg='bright_black', bold=True, reset=False)
    style_reset = style('', reset=True)
    print(f'{pos_style} {style_reset} Positive Samples   {neg_style} {style_reset} Negative Samples')
    for feature, bars in metrics:
        longest_bar = max((positives + negatives) for _, positives, negatives in bars)
        print('\n', style(feature, bold=True), sep='')
        longest_label = max(len(label) for label, _, _ in bars)
        longest_total = max(len(str(n + p)) for _, p, n in bars)
        # Columns left for the bars after the indent, label, total, and
        # spacing. Clamp to at least 1 so a very narrow terminal can't produce
        # a zero or negative scale (and thus negative bar lengths).
        chars_for_bars = max(term_width - longest_label - longest_total - 4, 1)
        # Clamp the numerator as well: if no bucket holds any samples, every
        # bar is simply empty rather than triggering a division by zero.
        # This could still be slightly short if bar() has to cheat any bar lengths:
        samples_per_char = max(longest_bar, 1) / chars_for_bars
        for label, positives, negatives in bars:
            pos_length = int(round(positives / samples_per_char))
            neg_length = int(round(negatives / samples_per_char))
            padded_label = ('{label: >%i}' % longest_label).format(label=label)
            pos_bar = bar(pos_length, positives)
            neg_bar = bar(neg_length, negatives)
            print(f'  {padded_label} {pos_style}{pos_bar}{style_reset}{neg_style}{neg_bar}{style_reset}{" " if (positives + negatives) else ""}{positives + negatives}')
141 | 
142 | 
def is_boolean_feature(t):
    """Return whether a feature looks like a yes/no feature.

    ``t`` is a 1-D Tensor holding one feature's value for each sample. The
    feature counts as boolean when every value is exactly 0 or 1.

    """
    is_zero_or_one = (t == 0) | (t == 1)
    # The minimum of a boolean tensor is True only if every element is True.
    return is_zero_or_one.min().item()
147 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/label.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | from html.parser import HTMLParser
  3 | import multiprocessing
  4 | import os
  5 | import pathlib
  6 | import shutil
  7 | 
  8 | from click import argument, command, option, Path, progressbar, STRING
  9 | 
 10 | 
 11 | @command()
 12 | @option('--preserve-originals/--no-preserve-originals',
 13 |         default=True,
 14 |         help='Save original HTML files in a newly created `originals`'
 15 |              ' directory in IN_DIRECTORY (default: True)')
 16 | @option('--number-of-workers',
 17 |         default=multiprocessing.cpu_count(),
 18 |         help='Use the specified number of workers to speed up the labeling'
 19 |              ' process (default: the number of logical cores the machine has)')
 20 | @argument('in_directory', type=Path(exists=True, file_okay=False))
 21 | @argument('in_type', type=STRING)
 22 | def label(in_directory, in_type, preserve_originals, number_of_workers):
 23 |     """
 24 |     Apply a whole-page label to each page in a directory.
 25 | 
 26 |     Add the ``data-fathom`` attribute with a value of IN_TYPE to the
 27 |     opening tag of any ``<html>`` elements in the HTML pages in
 28 |     IN_DIRECTORY. This tool is used to label an entire webpage (e.g.
 29 |     IN_TYPE could be "article" for article webpages).
 30 | 
 31 |     """
 32 |     if preserve_originals:
 33 |         originals_dir = pathlib.Path(in_directory) / 'originals'
 34 |         try:
 35 |             originals_dir.mkdir(parents=True)
 36 |         except FileExistsError:
 37 |             raise RuntimeError(f'Tried to make directory {originals_dir.as_posix()}, but it already exists. To protect'
 38 |                                f' against unwanted data loss, please move or remove the existing directory.')
 39 |     else:
 40 |         originals_dir = None
 41 | 
 42 |     list_of_items = os.listdir(in_directory)
 43 | 
 44 |     print_statements = []  # Capture any print statements to log at the end.
 45 | 
 46 |     # Make a pool of workers. Each worker is in its own process. We use a scaling factor to account for the overhead of
 47 |     # setting up all of the processes.
 48 |     pool = multiprocessing.Pool(number_of_workers)
 49 |     # Curry ``task``, so we can pass more than one argument into pool.imap_unordered.
 50 |     task = partial(label_task, in_directory, in_type, originals_dir, preserve_originals)
 51 | 
 52 |     with progressbar(pool.imap_unordered(task, list_of_items),
 53 |                      label='Labeling pages',
 54 |                      length=len(list_of_items)) as bar:
 55 |         for result in bar:
 56 |             if result is not None:
 57 |                 print_statements.append(result)
 58 | 
 59 |     for statement in print_statements:
 60 |         print(statement)
 61 | 
 62 | 
 63 | def label_task(in_directory, in_type, originals_dir, preserve_originals, filename):
 64 |     file = pathlib.Path(in_directory) / filename
 65 |     if file == originals_dir:
 66 |         return
 67 |     if file.is_dir():
 68 |         return f'Skipped directory {file.name}/'
 69 |     if file.suffix != '.html':
 70 |         return f'Skipped {file.name}; not an HTML file'
 71 | 
 72 |     with file.open(encoding='utf-8') as fp:
 73 |         html = fp.read()
 74 | 
 75 |     new_html = label_html_tags_in_html_string(html, in_type)
 76 | 
 77 |     if preserve_originals:
 78 |         shutil.move(file, originals_dir / file.name)
 79 | 
 80 |     with file.open('w', encoding='utf-8') as fp:
 81 |         fp.write(new_html)
 82 | 
 83 | 
 84 | def label_html_tags_in_html_string(html: str, in_type: str) -> str:
 85 |     """
 86 |     Finds all opening ``html`` tags in the HTML string and adds a
 87 |     ``' data-fathom="${in_type}"'`` substring to each one.
 88 | 
 89 |     We do this by building a new HTML string with the inserted substring(s).
 90 | 
 91 |     The ``html`` tags are found using the HTMLParser class in Python's
 92 |     built-in html.parser library.
 93 |     """
 94 |     parser = HTMLParserSubclass(in_type)
 95 |     parser.feed(html)
 96 | 
 97 |     new_html = html
 98 | 
 99 |     for (original_html_tag, new_html_tag) in parser.html_tags_list:
100 |         new_html = new_html.replace(original_html_tag, new_html_tag, 1)
101 | 
102 |     return new_html
103 | 
104 | 
class HTMLParserSubclass(HTMLParser):
    """Parser that records every opening ``html`` tag it encounters.

    After ``feed()``, ``html_tags_list`` holds one
    ``(original tag text, labeled tag text)`` pair per opening ``html`` tag,
    where the labeled text has a ``data-fathom="<in_type>"`` attribute
    inserted directly after the tag name.

    """
    def __init__(self, in_type, **kwargs):
        self.in_type = in_type
        self.html_tags_list = []
        super().__init__(**kwargs)

    def handle_starttag(self, tag, attrs):
        if tag != 'html':
            return
        original_html_tag = self.get_starttag_text()
        # ``tag`` arrives lowercased, but the raw text keeps the author's
        # casing (e.g. ``<HTML lang="en">``), so searching it for 'html' could
        # miss the tag name or hit an attribute value instead. The raw text
        # always starts with '<' followed by the 4-character tag name, so
        # insert the attribute by position.
        name_end = len('<html')
        new_html_tag = (original_html_tag[:name_end] +
                        f' data-fathom="{self.in_type}"' +
                        original_html_tag[name_end:])
        self.html_tags_list.append((original_html_tag, new_html_tag))
117 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/list.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from click import argument, command, File, option, Path
 4 | 
 5 | from ..utils import samples_from_dir
 6 | 
 7 | 
 8 | @command()
 9 | @argument('in_directory', type=Path(exists=True, file_okay=False))
10 | @option('--base-dir', '-b', type=Path(exists=True, file_okay=False),
11 |         help='The directory to create relative paths from.')
12 | @option('--out-file', '-o', type=File(mode='w'), default=None,
13 |         help='A file for saving the printed filenames for easy future reference.')
14 | @option('--show-urls', '-u', default=False, is_flag=True,
15 |         help='Also show the original URL of each sample.')
16 | def list(in_directory, base_dir, out_file, show_urls):
17 |     """
18 |     List URL paths to samples.
19 | 
20 |     Recursively list paths of HTML files in IN_DIRECTORY relative to
21 |     <base_dir>, one path per line. If <base_dir> is not specified,
22 |     paths are relative to IN_DIRECTORY. Optionally saves output to
23 |     <out_file>.
24 | 
25 |     This is useful for vectorizing samples using FathomFox. FathomFox expects
26 |     input filenames copied into a text box with one filename per line and
27 |     relative to some path you are serving files from using ``fathom serve``.
28 | 
29 |     """
30 |     if base_dir is None:
31 |         base_dir = in_directory
32 | 
33 |     if out_file is not None:
34 |         filenames_to_save = []
35 | 
36 |     there_were_no_files = True
37 |     for file in samples_from_dir(in_directory):
38 |         there_were_no_files = False
39 |         relative_path = file.relative_to(base_dir)
40 |         if show_urls:
41 |             with file.open() as open_file:
42 |                 print(relative_path, original_url(open_file))
43 |         else:
44 |             print(relative_path)
45 | 
46 |         if out_file is not None:
47 |             filenames_to_save.append(relative_path.as_posix() + '\n')
48 | 
49 |     if out_file is not None:
50 |         if there_were_no_files:
51 |             print(f'No .html files found in {in_directory}. Did not create {out_file.name}.')
52 |         else:
53 |             out_file.writelines(filenames_to_save)
54 | 
55 | 
def original_url(open_file):
    """Return the original URL that FathomFox embedded in a given sample.

    :arg open_file: A readable text file object positioned at the start of a
        sample
    :return: The ``href`` of the sample's ``<link rel="original">`` tag, or
        the empty string if the sample has no such tag

    """
    # Reading each file whole is deliberate: a smarter loop that read only as
    # much as necessary was considered, but slurping 67 entire unextracted
    # samples took only 1.2s on the original author's laptop.
    contents = open_file.read()
    found = re.search('<link rel="original" href="([^"]+)">', contents)
    return found.group(1) if found else ''
65 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/pick.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | from random import sample
 3 | from shutil import move
 4 | 
 5 | from click import argument, command, Path, UsageError
 6 | 
 7 | 
 8 | @command()
 9 | @argument('from_dir',
10 |           type=Path(exists=True, file_okay=False, writable=True, dir_okay=True))
11 | @argument('to_dir',
12 |           type=Path(exists=True, file_okay=False, writable=True, dir_okay=True))
13 | @argument('number', type=int)
14 | def pick(from_dir, to_dir, number):
15 |     """
16 |     Randomly move samples to a training, validation, or test set.
17 | 
18 |     Move a random selection of HTML files and their extracted resources, if
19 |     any, from one directory to another. Ignore hidden files.
20 | 
21 |     """
22 |     # Make these strings into ``Path``s so they are easier to work with
23 |     from_dir = pathlib.Path(from_dir)
24 |     to_dir = pathlib.Path(to_dir)
25 | 
26 |     for file in sample(list(from_dir.glob('*.html')), number):
27 |         # If the file has resources, we must move those as well:
28 |         if (from_dir / 'resources' / file.stem).exists():
29 |             # Make sure we don't overwrite an existing resources directory
30 |             if (to_dir / 'resources' / file.stem).exists():
31 |                 raise UsageError(f'Tried to make directory {(to_dir / "resources" / file.stem).as_posix()}, but it'
32 |                                  f' already exists. To protect against unwanted data loss, please move or remove the'
33 |                                  f' existing directory.')
34 |             move(from_dir / 'resources' / file.stem, to_dir / 'resources' / file.stem)
35 |         move(file.as_posix(), to_dir)
36 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/serve.py:
--------------------------------------------------------------------------------
 1 | from functools import partial
 2 | from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
 3 | import os
 4 | 
 5 | from click import command, option, Path
 6 | 
 7 | 
 8 | @command()
 9 | @option('--port', '-p', type=int, default=8000,
10 |         help='The port to use (default: 8000)')
11 | @option('--directory', '-d', type=Path(exists=True, file_okay=False), default=os.getcwd(),
12 |         help='The directory to serve files from (default: current working directory)')
13 | def serve(directory, port):
14 |     """
15 |     Serve samples locally over HTTP.
16 | 
17 |     Serve the files in <directory> at http://localhost:<port>. This is useful
18 |     for vectorizing samples using FathomFox. FathomFox expects you to provide,
19 |     in the vectorizer page, an address to an HTTP server that is serving your
20 |     samples.
21 | 
22 |     """
23 |     server = ThreadingHTTPServer(('localhost', port), partial(SimpleHTTPRequestHandler, directory=directory))
24 |     print(f'Serving {directory} over http://localhost:{port}.')
25 |     print('Press Ctrl+C to stop.')
26 |     server.serve_forever()
27 | 


--------------------------------------------------------------------------------
/cli/fathom_web/commands/test.py:
--------------------------------------------------------------------------------
  1 | from json import JSONDecodeError, loads
  2 | from pathlib import Path
  3 | 
  4 | import click
  5 | from click import argument, BadOptionUsage, BadParameter, command, option
  6 | 
  7 | from ..accuracy import accuracy_per_tag, per_tag_metrics, pretty_accuracy, print_per_tag_report
  8 | from ..utils import classifier, path_or_none, speed_readout, tensor, tensors_from
  9 | from ..vectorizer import make_or_find_vectors
 10 | 
 11 | 
 12 | def decode_weights(ctx, param, value):
 13 |     """Validate a click option, making sure it's a valid JSON object with
 14 |     properly formatted "coeff" and "bias" keys."""
 15 |     try:
 16 |         decoded_weights = loads(value)
 17 |     except JSONDecodeError:
 18 |         raise BadParameter('Weights must be a valid JSON object.')
 19 | 
 20 |     if 'coeffs' not in decoded_weights or 'bias' not in decoded_weights:
 21 |         raise BadParameter('Weights must contain "coeffs" and "bias" keys.')
 22 |     if not isinstance(decoded_weights['bias'], float):
 23 |         raise BadParameter('Bias must be a float.')
 24 |     if not (isinstance(decoded_weights['coeffs'], list) and
 25 |             all((len(pair) == 2 and
 26 |                  isinstance(pair[0], str) and
 27 |                  isinstance(pair[1], float))
 28 |                 for pair in decoded_weights['coeffs'])):
 29 |         raise BadParameter('Coeffs must be a list of 2-element lists: [["ruleName", numericCoefficient], ...].')
 30 |     return decoded_weights
 31 | 
 32 | 
 33 | def model_from_json(weights, num_outputs, feature_names):
 34 |     """Return a linear model with the the passed in coeffs and biases.
 35 | 
 36 |     :arg weights: A dict with coeff and bias keys, as the program takes from
 37 |         the commandline
 38 |     :arg num_outputs: The number of output nodes of the network, typically 1
 39 |     :arg feature_names: The ordered list of feature names so we can get the
 40 |         coeffs lined up with the feature order used by the vectors
 41 | 
 42 |     """
 43 |     model = classifier(len(weights['coeffs']), num_outputs)
 44 |     coeffs = dict(weights['coeffs'])
 45 |     model.load_state_dict({'0.weight': tensor([[coeffs[f] for f in feature_names]]),
 46 |                            '0.bias': tensor([weights['bias']])})
 47 |     return model
 48 | 
 49 | 
 50 | @command()
 51 | @argument('testing_set',
 52 |           type=click.Path(exists=True, resolve_path=True),
 53 |           metavar='TESTING_SET_FOLDER')
 54 | @argument('weights', callback=decode_weights)
 55 | @option('--confidence-threshold', '-t',
 56 |         default=0.5,
 57 |         show_default=True,
 58 |         help='Threshold at which a sample is considered positive. Higher values decrease false positives and increase false negatives.')
 59 | @option('--ruleset', '-r',
 60 |         type=click.Path(exists=True, dir_okay=False, resolve_path=True),
 61 |         callback=path_or_none,
 62 |         help='The rulesets.js file containing your rules. The file must have no imports except from fathom-web, so pre-bundle if necessary.')
 63 | @option('--trainee',
 64 |         type=str,
 65 |         metavar='ID',
 66 |         help='The trainee ID of the ruleset you are testing. Usually, this is the same as the type you are testing.')
 67 | @option('--testing-cache',
 68 |         type=click.Path(dir_okay=False, resolve_path=True),
 69 |         callback=path_or_none,
 70 |         help='Where to cache testing vectors to speed future testing runs. Any existing file will be overwritten. [default: vectors/testing_yourTraineeId.json next to your ruleset]')
 71 | @option('--delay',
 72 |         default=5,
 73 |         type=int,
 74 |         show_default=True,
 75 |         help='Number of seconds to wait for a page to load before vectorizing it')
 76 | @option('--tabs',
 77 |         default=16,
 78 |         type=int,
 79 |         show_default=True,
 80 |         help='Number of concurrent browser tabs to use while vectorizing')
 81 | @option('--show-browser',
 82 |         default=False,
 83 |         is_flag=True,
 84 |         help='Show browser window while vectorizing. (Browser runs in headless mode by default.)')
 85 | @option('--verbose', '-v',
 86 |         default=False,
 87 |         is_flag=True,
 88 |         help='Show per-tag diagnostics, even though that could ruin blinding for the test set.')
 89 | def test(testing_set, weights, confidence_threshold, ruleset, trainee, testing_cache, delay, tabs, show_browser, verbose):
 90 |     """
 91 |     Evaluate how well a trained ruleset does.
 92 | 
 93 |     TESTING_SET_FOLDER is a directory of labeled testing pages. It can also be,
 94 |     for backward compatibility, a JSON file of vectors from FathomFox's
 95 |     Vectorizer.
 96 | 
 97 |     WEIGHTS should be a JSON-formatted object, as follows. You can paste it
 98 |     directly from the output of trainer.
 99 | 
100 |     \b
101 |         {"coeffs": [["nextAnchorIsJavaScript", 1.1627885103225708],
102 |                     ["nextButtonTypeSubmit", 4.613410949707031],
103 |                     ["nextInputTypeSubmit", 4.374269008636475]],
104 |     \b
105 |          "bias": -8.645608901977539}
106 | 
107 |     """
108 |     testing_set = Path(testing_set)
109 |     if testing_set.is_dir():
110 |         if not ruleset:
111 |             raise BadOptionUsage('ruleset', 'A --ruleset file must be specified when TESTING_SET_FOLDER is passed a directory.')
112 |         if not trainee:
113 |             raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TESTING_SET_FOLDER is passed a directory.')
114 | 
115 |     testing_data = make_or_find_vectors(ruleset,
116 |                                         trainee,
117 |                                         testing_set,
118 |                                         testing_cache,
119 |                                         show_browser,
120 |                                         'testing',
121 |                                         delay,
122 |                                         tabs)
123 |     testing_pages = testing_data['pages']
124 |     x, y, num_yes, num_prunes = tensors_from(testing_pages)
125 |     model = model_from_json(weights, len(y[0]), testing_data['header']['featureNames'])
126 | 
127 |     accuracy, false_positives, false_negatives = accuracy_per_tag(y, model(x), confidence_threshold, num_prunes)
128 |     print(pretty_accuracy('Testing', accuracy, len(x), false_positives, false_negatives, num_yes + num_prunes))
129 | 
130 |     if testing_pages and 'time' in testing_pages[0]:
131 |         print(speed_readout(testing_pages))
132 | 
133 |     if verbose:
134 |         print('\nTesting per-tag results:')
135 |         print_per_tag_report([per_tag_metrics(page, model, confidence_threshold) for page in testing_pages])
136 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/cli/fathom_web/test/__init__.py


--------------------------------------------------------------------------------
/cli/fathom_web/test/resources/train/vectorize_ruleset.js:
--------------------------------------------------------------------------------
 1 | // TODO: Address node rules evaluating against browser only files
 2 | /* eslint-disable node/no-unsupported-features/es-syntax */
 3 | // eslint-disable-next-line import/extensions, node/no-missing-import
 4 | import {dom, out, rule, ruleset, score, type} from 'fathom-web';
 5 | 
// Per-feature lists of [ruleName, coefficient] pairs, keyed by feature name.
// The only rule name here matches the `name` given to the scoring rule in
// makeRuleset().
let coefficients = {
    'secret': [
        ['hasSecretParagraph', 0.0]
    ]
};

// List of [name, bias] pairs, passed as the last argument to ruleset() by
// makeRuleset().
let biases = [
    ['secret', 0.0]
];
15 | 
/**
 * Return whether `needle` occurs in `haystack`, comparing the lowercased
 * forms of both strings.
 */
function caselessIncludes(haystack, needle) {
    const loweredHaystack = haystack.toLowerCase();
    const loweredNeedle = needle.toLowerCase();
    return loweredHaystack.includes(loweredNeedle);
}
19 | 
/**
 * Scoring callback: report whether the inner text of the fnode's element
 * contains the word "secret", ignoring case.
 */
function hasSecretParagraph(fnode) {
    const {innerText} = fnode.element;
    return caselessIncludes(innerText, 'secret');
}
23 | 
/**
 * Build the ruleset under test: type the <html> element as "secret", score
 * it with hasSecretParagraph, and emit it under the "secret" out key.
 *
 * `coeffs` and `biases` are passed straight through to ruleset().
 */
function makeRuleset(coeffs, biases) {
    const rules = [
        rule(dom('html'), type('secret')),
        rule(type('secret'), score(hasSecretParagraph.bind(this)), {name: 'hasSecretParagraph'}),
        rule(type('secret'), out('secret'))
    ];
    return ruleset(rules, coeffs, biases);
}
35 | 
// The module's default export: maps each feature name to its trainee
// definition.
const trainees = new Map();
// Stored under every trainee's `viewportSize` key.
const VIEWPORT_SIZE = {width: 1680, height: 950};

// One trainee is registered per entry in this list.
const FEATURES = ['secret'];
for (const feature of FEATURES) {
    const ruleset = {
        coeffs: new Map(coefficients[feature]),
        viewportSize: VIEWPORT_SIZE,
        vectorType: feature,
        // NOTE(review): this always spreads coefficients.secret rather than
        // coefficients[feature]; the two agree only while FEATURES is
        // ['secret'].
        rulesetMaker: () => makeRuleset(
            [
                ...coefficients.secret,
            ],
            biases
        ),
    };
    trainees.set(feature, ruleset);
}

export default trainees;
56 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/resources/train/vectorize_sample_1.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <title>Sample 1</title>
 6 | </head>
 7 | <body>
 8 |     <p>Contains the secret word.</p>
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/resources/train/vectorize_sample_2.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <title>Sample 2</title>
 6 | </head>
 7 | <body>
 8 |     <p>Contains a boring sentence.</p>
 9 | </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_extract.py:
--------------------------------------------------------------------------------
 1 | from ..commands.extract import BASE64_DATA_PATTERN, decode
 2 | 
 3 | 
 4 | def test_common_example():
 5 |     """Confirm we handle the well-behaving case"""
 6 |     mime_type = 'image/png'
 7 |     base64_string = 'aorienstar/tar/ararnsoine98daQAAAIST+++/rstienf='
 8 |     test_string = f'data:{mime_type};base64,{base64_string}'
 9 |     matches = get_base64_regex_matches(test_string)
10 |     assert len(matches) == 1
11 |     assert matches[0].group('mime') == mime_type
12 |     assert matches[0].group('string') == base64_string
13 | 
14 | 
def get_base64_regex_matches(from_string):
    """Helper returning every BASE64_DATA_PATTERN match in the given string.

    finditer() is used rather than findall() because it yields Match objects,
    which is what ``fathom extract`` works with.
    """
    return [match for match in BASE64_DATA_PATTERN.finditer(from_string)]
22 | 
23 | 
def test_empty_string():
    """A data URL with no MIME type and no payload should not be matched"""
    found = get_base64_regex_matches('data:;base64,')
    assert len(found) == 0
29 | 
30 | 
def test_presence_of_charset():
    """A character set specification before ``;base64`` should not prevent a match"""
    found = get_base64_regex_matches('data:image/png; charset=utf-8;base64,iVBORw0K')
    assert len(found) == 1
36 | 
37 | 
def test_string_with_multiple_base64_strings():
    """Two data URLs separated by a space should be matched separately"""
    first_url = 'data:image/png;base64,rsoitenaofi2345wf/+ste'
    second_url = 'data:image/png;base64,arsti390/'
    found = get_base64_regex_matches(f'{first_url} {second_url}')
    assert len(found) == 2
42 | 
43 | 
def test_string_with_percent_encoded_equals_signs_is_found():
    """Padding characters (=) are sometimes percent-encoded as %3D. The
    regex should capture them as part of the base64 string.
    """
    payload = 'R0lGODlhAQABAID/AMDAwAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw%3D%3D'
    found = get_base64_regex_matches(f'url(&quot;data:image/gif;base64,{payload}&quot;)')
    assert len(found) == 1
    assert found[0].group('string') == payload
53 | 
54 | 
def test_string_with_percent_encoded_equals_signs_is_decoded():
    """Padding characters (=) are sometimes percent-encoded as %3D. Decoding
    such a string should not raise.

    The decoded bytes are not inspected; for now this only checks that no
    error is raised.
    """
    payload_with_encoded_padding = 'R0lGODlhAQABAID/AMDAwAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw%3D%3D'
    decode(payload_with_encoded_padding)
64 | 
65 | 
def test_unpadded_string_is_decoded():
    """Some base64 strings carry no padding characters, whereas Python's
    base64.b64decode() expects a length that is a multiple of four.
    Decoding such a string should not raise.

    The decoded bytes are not inspected; for now this only checks that no
    error is raised.
    """
    unpadded_payload = 'R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs'
    decode(unpadded_payload)
76 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_label.py:
--------------------------------------------------------------------------------
 1 | from ..commands.label import label_html_tags_in_html_string
 2 | 
 3 | 
# The type passed to label_html_tags_in_html_string() by every test below;
# each test expects it back as the value of the data-fathom attribute.
IN_TYPE = 'test'
 5 | 
 6 | 
 7 | def test_opening_html_tag_has_no_attributes():
 8 |     """Some HTML tags may not have any attributes"""
 9 |     input_string = '<html>'
10 |     expected_string = f'<html data-fathom="{IN_TYPE}">'
11 |     assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string
12 | 
13 | 
def test_opening_html_tag_has_attributes():
    """The label goes in front of the attributes an HTML tag already has"""
    labeled = label_html_tags_in_html_string('<html lang="en-us">', IN_TYPE)
    assert labeled == f'<html data-fathom="{IN_TYPE}" lang="en-us">'
19 | 
20 | 
def test_html_string_has_multiple_opening_html_tags():
    """Some HTML strings contain several opening HTML tags; each one is labeled"""
    labeled_tag = f'<html data-fathom="{IN_TYPE}">'
    labeled = label_html_tags_in_html_string('<html><div></div><html>', IN_TYPE)
    assert labeled == f'{labeled_tag}<div></div>{labeled_tag}'
26 | 
27 | 
def test_html_string_has_right_angle_bracket_as_attribute_value():
    """A right angle bracket inside an attribute value must not confuse the labeling"""
    labeled = label_html_tags_in_html_string('<html data-bracket=">" class="foo">', IN_TYPE)
    assert labeled == f'<html data-fathom="{IN_TYPE}" data-bracket=">" class="foo">'
33 | 
34 | 
def test_html_string_is_multiline():
    """An HTML tag spread over several lines is labeled on its first line"""
    remaining_lines = ['class="foo"', 'id="bar"', '>']
    input_string = '\n'.join(['<html'] + remaining_lines)
    expected_string = '\n'.join([f'<html data-fathom="{IN_TYPE}"'] + remaining_lines)
    assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string
46 | 
47 | 
def test_html_string_has_extra_spaces():
    """
    Extra spaces before the closing bracket of an HTML tag are preserved. Note
    that a space between the '<' and the tag name (e.g. 'html') is not valid
    HTML, so that case is not covered here.
    """
    labeled = label_html_tags_in_html_string('<html   >', IN_TYPE)
    assert labeled == f'<html data-fathom="{IN_TYPE}"   >'
56 | 
57 | 
def test_html_string_has_comments():
    """
    HTML comments around and inside the document are left untouched. Note that
    comments cannot occur within a tag.
    """
    leading_comment = '<!-- this is a comment -->'
    remainder = '\n<!-- this is another comment --></html><!-- this is yet another comment -->'
    input_string = f'{leading_comment}<html lang="en">{remainder}'
    expected_string = f'{leading_comment}<html data-fathom="{IN_TYPE}" lang="en">{remainder}'
    assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string
68 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_list.py:
--------------------------------------------------------------------------------
  1 | from click.testing import CliRunner
  2 | 
  3 | from ..commands.list import list as list_main
  4 | 
  5 | 
  6 | def test_end_to_end(tmp_path):
  7 |     """Test expected outcome when using all of the optional parameters"""
  8 |     # Make temporary in_directory and base_dir directories
  9 |     base_dir, in_directory = make_directories(tmp_path)
 10 | 
 11 |     # Make HTML files in in_directory in two separate subdirectories
 12 |     # so we can exercise the recursive option.
 13 |     a1, a2, b1, b2 = make_html_files(in_directory)
 14 | 
 15 |     # Make the out_file we will save the output to
 16 |     out_file = (base_dir / 'out_file.txt')
 17 | 
 18 |     # Run fathom list
 19 |     result = CliRunner().invoke(
 20 |         list_main,
 21 |         [
 22 |             in_directory.as_posix(),
 23 |             '-b',
 24 |             f'{base_dir.as_posix()}',
 25 |             '-o',
 26 |             f'{out_file.as_posix()}',
 27 |         ],
 28 |     )
 29 |     assert result.exit_code == 0
 30 | 
 31 |     expected_file_contents = {
 32 |         a1.relative_to(base_dir).as_posix(),
 33 |         a2.relative_to(base_dir).as_posix(),
 34 |         b1.relative_to(base_dir).as_posix(),
 35 |         b2.relative_to(base_dir).as_posix(),
 36 |     }
 37 |     actual_file_contents = set(out_file.read_text().splitlines())
 38 |     assert expected_file_contents == actual_file_contents
 39 | 
 40 | 
 41 | def make_directories(tmp_path):
 42 |     """Makes the directories used as base_dir and in_directory in our fathom list calls"""
 43 |     base_dir = tmp_path / 'base_dir'
 44 |     base_dir.mkdir()
 45 |     in_directory = base_dir / 'in_directory'
 46 |     in_directory.mkdir()
 47 |     return base_dir, in_directory
 48 | 
 49 | 
 50 | def make_html_files(in_directory):
 51 |     """Makes four HTML files in a common directory structure for using in our fathom list calls"""
 52 |     (in_directory / 'source_a').mkdir()
 53 |     a1 = (in_directory / 'source_a' / '1.html')
 54 |     a1.touch()
 55 |     a2 = (in_directory / 'source_a' / '2.html')
 56 |     a2.touch()
 57 |     (in_directory / 'source_b').mkdir()
 58 |     b1 = (in_directory / 'source_b' / '1.html')
 59 |     b1.touch()
 60 |     b2 = (in_directory / 'source_b' / '2.html')
 61 |     b2.touch()
 62 |     return a1, a2, b1, b2
 63 | 
 64 | 
 65 | def test_no_files_to_list(tmp_path):
 66 |     """Test an empty in_directory using all of the optional parameters"""
 67 |     # Make temporary in_directory and base_dir directories
 68 |     base_dir, in_directory = make_directories(tmp_path)
 69 | 
 70 |     # Make the out_file we will save the output to
 71 |     out_file = (in_directory / 'out_file.txt')
 72 | 
 73 |     # Run fathom list
 74 |     result = CliRunner().invoke(
 75 |         list_main,
 76 |         [
 77 |             in_directory.as_posix(),
 78 |             '-o',
 79 |             f'{out_file.as_posix()}',
 80 |         ],
 81 |     )
 82 |     assert result.exit_code == 0
 83 | 
 84 |     assert 'No .html files found' in result.output
 85 | 
 86 | 
 87 | def test_without_base_dir(tmp_path):
 88 |     """Test omission of base-dir parameter"""
 89 |     # Make temporary in_directory and base_dir directories
 90 |     base_dir, in_directory = make_directories(tmp_path)
 91 | 
 92 |     # Make HTML files in in_directory in two separate subdirectories
 93 |     # so we can exercise the recursive option.
 94 |     a1, a2, b1, b2 = make_html_files(in_directory)
 95 | 
 96 |     # Make the out_file we will save the output to
 97 |     out_file = (base_dir / 'out_file.txt')
 98 | 
 99 |     # Run fathom list
100 |     result = CliRunner().invoke(
101 |         list_main,
102 |         [
103 |             in_directory.as_posix(),
104 |             '-o',
105 |             f'{out_file.as_posix()}',
106 |         ],
107 |     )
108 |     assert result.exit_code == 0
109 | 
110 |     expected_file_contents = {
111 |         a1.relative_to(in_directory).as_posix(),
112 |         a2.relative_to(in_directory).as_posix(),
113 |         b1.relative_to(in_directory).as_posix(),
114 |         b2.relative_to(in_directory).as_posix(),
115 |     }
116 |     actual_file_contents = set(out_file.read_text().splitlines())
117 |     assert expected_file_contents == actual_file_contents
118 | 
119 | 
def test_in_directory_does_not_exist():
    """Test that a nonexistent in_directory is reported as a usage error"""
    result = CliRunner().invoke(list_main, ['fake_in_dir'])

    # The program should exit with an error message about in_directory not existing
    assert result.exit_code == 2
    # Different versions of click use different quotes:
    assert any(f'{quote}fake_in_dir{quote} does not exist.' in result.output
               for quote in ('"', "'"))
134 | 
135 | 
def test_base_dir_does_not_exist(tmp_path):
    """Test that a nonexistent --base-dir is reported as a usage error"""
    _, in_directory = make_directories(tmp_path)

    result = CliRunner().invoke(list_main, [in_directory.as_posix(), '-b', 'fake_base_dir'])

    # The program should exit with an error message about base_dir not existing
    assert result.exit_code == 2
    # Different versions of click use different quotes:
    assert any(f'{quote}fake_base_dir{quote} does not exist.' in result.output
               for quote in ('"', "'"))
153 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_pick.py:
--------------------------------------------------------------------------------
 1 | from click.testing import CliRunner
 2 | 
 3 | from ..commands.pick import pick
 4 | 
 5 | 
 6 | def test_end_to_end(tmp_path):
 7 |     """
 8 |     Given a directory of three files, use ``fathom pick`` to move two files, and
 9 |     check that the files and their potential resources directories have moved.
10 |     """
11 |     # Make temporary source and destination directories
12 |     source = tmp_path / 'source'
13 |     source.mkdir()
14 |     destination = tmp_path / 'destination'
15 |     destination.mkdir()
16 | 
17 |     # Add files to the source directory
18 |     (source / '1.html').touch()
19 |     (source / '2.html').touch()
20 |     (source / '3.html').touch()
21 | 
22 |     # Add resource directories for files 1 and 2
23 |     (source / 'resources' / '1').mkdir(parents=True)
24 |     (source / 'resources' / '1' / '1.png').touch()
25 |     (source / 'resources' / '1' / '2.css').touch()
26 |     (source / 'resources' / '2').mkdir(parents=True)
27 |     (source / 'resources' / '2' / '1.png').touch()
28 |     (source / 'resources' / '2' / '2.css').touch()
29 | 
30 |     # Run fathom pick to move 2 files from source to destination
31 |     runner = CliRunner()
32 |     # Arguments to invoke() must be passed as strings (this isn't documented!!!)
33 |     result = runner.invoke(pick, [source.as_posix(), destination.as_posix(), '2'])
34 |     assert result.exit_code == 0
35 | 
36 |     # Check the correct number of files have moved
37 |     files_in_source = list(source.glob('*.html'))
38 |     assert len(files_in_source) == 1
39 |     files_in_destination = list(destination.glob('*.html'))
40 |     assert len(files_in_destination) == 2
41 | 
42 |     # Check any resource directories have moved
43 |     if (destination / '1.html').exists():
44 |         assert (destination / 'resources' / '1' / '1.png').exists()
45 |         assert (destination / 'resources' / '1' / '2.css').exists()
46 |     if (destination / '2.html').exists():
47 |         assert (destination / 'resources' / '2' / '1.png').exists()
48 |         assert (destination / 'resources' / '2' / '2.css').exists()
49 | 
50 |     # Make sure we didn't lose any files
51 |     files_in_directories = {file.name for file in files_in_source} | {file.name for file in files_in_destination}
52 |     assert {'1.html', '2.html', '3.html'} == files_in_directories
53 | 
54 | 
def test_resource_directory_path_collision(tmp_path):
    """
    Ensure a usage error is reported, and nothing is moved, when a sample's
    resource directory already exists in the destination directory.
    """
    # Make temporary source and destination directories
    source = tmp_path / 'source'
    destination = tmp_path / 'destination'
    for directory in (source, destination):
        directory.mkdir()

    # One sample with a populated resource directory
    (source / '1.html').touch()
    source_resources = source / 'resources' / '1'
    source_resources.mkdir(parents=True)
    for resource_name in ('1.png', '2.css'):
        (source_resources / resource_name).touch()

    # A resource directory of the same name already sits in the destination
    destination_resources = destination / 'resources' / '1'
    destination_resources.mkdir(parents=True)

    # Run fathom pick to move the only file from source to destination.
    # Arguments to invoke() must be passed as strings (this isn't documented!!!)
    result = CliRunner().invoke(pick, [source.as_posix(), destination.as_posix(), '1'])

    # The program should exit with a UsageError and show our error message
    assert result.exit_code == 2
    assert 'Error: Tried to make directory' in result.output

    # Nothing should have left the source directory...
    assert len(list(source.glob('*.html'))) == 1
    for resource_name in ('1.png', '2.css'):
        assert (source_resources / resource_name).exists()
    # ...or arrived in the destination directory.
    assert len(list(destination.glob('*.html'))) == 0
    assert destination_resources.exists()
94 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_test.py:
--------------------------------------------------------------------------------
 1 | from click import BadParameter
 2 | from pytest import raises
 3 | 
 4 | from ..commands.test import decode_weights
 5 | 
 6 | 
 7 | def test_expected_input_format():
 8 |     """Test that an example of good input decodes as expected"""
 9 |     json_string = '{"coeffs": [["rule1", 0.1], ["rule2", 0.2]], "bias": 0.5}'
10 |     expected_dict = {
11 |         'coeffs': [
12 |             ['rule1', 0.1],
13 |             ['rule2', 0.2],
14 |         ],
15 |         'bias': 0.5,
16 |     }
17 |     decoded_weights = decode_weights(None, None, json_string)
18 |     assert decoded_weights == expected_dict
19 | 
20 | 
21 | def test_not_json():
22 |     run_invalid_json('not_json', r'.*valid.*')
23 | 
24 | 
25 | def run_invalid_json(json_string, assertion_match_regex):
26 |     """Helper: assert `decode_weights()` raises a BadParameter matching the regex"""
27 |     with raises(BadParameter, match=assertion_match_regex):
28 |         decode_weights(None, None, json_string)
29 | 
30 | 
31 | def test_no_coeffs():
32 |     run_invalid_json('{"bias": 0.5}', r'.*contain.*coeffs.*')
33 | 
34 | 
35 | def test_no_bias():
36 |     run_invalid_json('{"coeffs": [["rule", 0.5]]}', r'.*contain.*bias.*')
37 | 
38 | 
39 | def test_coeffs_not_list():  # "coeffs" is a JSON object here rather than a list.
40 |     run_invalid_json('{"coeffs": {"not": "a_list"}, "bias": 0.5}', r'Coeffs must be a list of 2-element lists.*')
41 | 
42 | 
43 | def test_coeffs_not_pairs():
44 |     run_invalid_json(
45 |         '{"coeffs": [["rule1"], ["rule2", 0.2]], "bias": 0.5}',
46 |         r'Coeffs must be a list of 2-element lists.*'
47 |     )
48 | 
49 | 
50 | def test_rulename_not_string():
51 |     run_invalid_json(
52 |         '{"coeffs": [[0.2, 0.2], ["rule2", 0.2]], "bias": 0.5}',
53 |         r'Coeffs must be a list of 2-element lists.*'
54 |     )
55 | 
56 | 
57 | def test_coeff_value_not_float():
58 |     run_invalid_json(
59 |         '{"coeffs": [["rule1", "rule1"], ["rule2", 0.2]], "bias": 0.5}',
60 |         r'Coeffs must be a list of 2-element lists.*'
61 |     )
62 | 


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import operator
  3 | 
  4 | from click.testing import CliRunner
  5 | 
  6 | from ..commands.train import exclude_indices, train, find_optimal_cutoff, single_cutoff, possible_cutoffs, accuracy_per_tag
  7 | from ..utils import tensor
  8 | 
  9 | 
 10 | def test_exclude_indices():
 11 |     assert exclude_indices([0, 2, 3], ['a', 'b', 'c', 'd', 'e', 'f']) == ['b', 'e', 'f']  # omit first, last, and some consecutive
 12 |     assert exclude_indices([1], ['a', 'b', 'c', 'd']) == ['a', 'c', 'd']  # leave ends alone
 13 |     assert exclude_indices([], ['a', 'b', 'c']) == ['a', 'b', 'c']  # do nothing
 14 |     assert exclude_indices([0], ['a']) == []  # omit everything
 15 | 
 16 | 
 17 | def test_auto_vectorization_smoke(tmp_path):
 18 |     """Make sure we get through auto-vectorization of at least the training
 19 |     set."""
 20 |     test_dir = os.path.dirname(os.path.abspath(__file__))  # Folder holding this test module
 21 | 
 22 |     runner = CliRunner()  # Invokes the click command in-process
 23 |     result = runner.invoke(
 24 |         train,
 25 |         [
 26 |             f'{test_dir}/resources/train/',
 27 |             '--ruleset',
 28 |             f'{test_dir}/resources/train/vectorize_ruleset.js',
 29 |             '--trainee',
 30 |             'secret',
 31 |             '--training-cache',
 32 |             f'{tmp_path.as_posix()}/training_vectors.json',
 33 |         ]
 34 |     )
 35 |     assert result.exit_code == 0
 36 |     assert (tmp_path / 'training_vectors.json').exists()  # The cache file named above was written.
 37 | 
 38 | 
 39 | def test_possible_cutoffs():
 40 |     # A single prediction yields a single cutoff.
 41 |     y_pred = tensor([1.2512])
 42 |     expected = [0.78]
 43 |     possibles = possible_cutoffs(y_pred)
 44 |     assert possibles == expected
 45 | 
 46 |     # Two predictions reduce to a single cutoff since the midpoint is used.
 47 |     y_pred = tensor([1.2512, 1.2516])
 48 |     expected = [0.78]
 49 |     possibles = possible_cutoffs(y_pred)
 50 |     assert possibles == expected
 51 | 
 52 |     # Reduces to a single cutoff (due to rounding) from 2 cutoffs (due to midpoint)
 53 |     y_pred = tensor([1.2512, 1.2516, 1.255])
 54 |     expected = [0.78]
 55 |     possibles = possible_cutoffs(y_pred)
 56 |     assert possibles == expected
 57 | 
 58 |     # Partial reduction in the number of cutoffs: 10 predictions give 6 cutoffs.
 59 |     y_pred = tensor([-2.1605, -0.5696, 0.4886, 0.8633, -1.3479,
 60 |                      -0.5813, -0.5696, 0.5696, -0.5950, -0.5696])
 61 |     expected = [0.15, 0.28, 0.36, 0.49, 0.63, 0.67]
 62 |     possibles = possible_cutoffs(y_pred)
 63 |     assert possibles == expected
 64 | 
 65 |     # No reduction in number of cutoffs (since midpoints are used, 3 cutoffs are calculated pre-rounding).
 66 |     y_pred = tensor([-2, -2.25, -1.95, 1.251])
 67 |     expected = [0.11, 0.12, 0.45]
 68 |     possibles = possible_cutoffs(y_pred)
 69 |     assert possibles == expected
 70 | 
 71 | 
 72 | def test_find_optimal_cutoff_single_cutoff_with_highest_accuracy():
 73 |     # This test is doing the steps completed by find_optimal_cutoff separately to
 74 |     # determine the expected value.  The functions used are covered by other tests.
 75 |     y_pred = tensor([-2.1605, -0.5696, 0.4886, 0.8633, -1.3479, -0.5813, -0.5696, 0.5696, -0.5950, -0.5696])
 76 |     y = tensor([0., 0., 1., 1., 0., 0., 0., 1., 0., 0.])
 77 | 
 78 |     # Determining the expected_cutoff
 79 |     expected_cutoffs = determine_expected_cutoffs(y, y_pred)
 80 | 
 81 |     # Expecting a single cutoff for the best accuracy
 82 |     assert len(expected_cutoffs) == 1
 83 |     expected_cutoff = expected_cutoffs[0]
 84 | 
 85 |     # Now that we have the expected expected_cutoff check the value returned from
 86 |     # find_optimal_cutoff against it (this is the real test)
 87 |     optimal_cutoff = find_optimal_cutoff(y, y_pred, num_prunes=0)
 88 |     assert optimal_cutoff == expected_cutoff
 89 |     # and a final double check
 90 |     assert optimal_cutoff == 0.49
 91 | 
 92 | 
 93 | def test_find_optimal_cutoff_multiple_cutoffs_with_highest_accuracy():
 94 |     # This test is doing the steps completed by find_optimal_cutoff separately to
 95 |     # determine the expected value.  The functions used are covered by other tests.
 96 |     y_pred = tensor([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.25, 2, 2.5])
 97 |     y = tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1.])
 98 | 
 99 |     # Determining the expected_cutoff
100 |     expected_cutoffs = determine_expected_cutoffs(y, y_pred)
101 | 
102 |     # Verifying that there are more than 1 cutoff with the best accuracy (the basis for this test case)
103 |     assert len(expected_cutoffs) > 1
104 | 
105 |     # From the list of best cutoffs get the single value (single_cutoff is tested elsewhere)
106 |     expected_cutoff = single_cutoff(expected_cutoffs)
107 | 
108 |     # Now that we have the expected cutoff check the value returned from
109 |     # find_optimal_cutoff against it (this is the real test)
110 |     optimal_cutoff = find_optimal_cutoff(y, y_pred, num_prunes=0)
111 |     assert optimal_cutoff == expected_cutoff
112 |     # and a final double check
113 |     assert optimal_cutoff == 0.56
114 | 
115 | 
116 | def test_single_cutoff():
117 |     # A list holding a single cutoff gives back that cutoff
118 |     cutoffs = [0]
119 |     assert single_cutoff(cutoffs) == 0
120 | 
121 |     # Cases where the last element is expected
122 |     cutoffs = [0, 1]
123 |     assert single_cutoff(cutoffs) == 1
124 | 
125 |     cutoffs = [0, 1, 10]
126 |     assert single_cutoff(cutoffs) == 10
127 | 
128 |     # Cases where a middle element is expected
129 |     cutoffs = [0, 1, 2]
130 |     assert single_cutoff(cutoffs) == 1
131 | 
132 |     cutoffs = [0, 1, 2, 3]
133 |     assert single_cutoff(cutoffs) == 2
134 | 
135 |     cutoffs = [0, 1, 2, 3, 4]
136 |     assert single_cutoff(cutoffs) == 2
137 | 
138 |     cutoffs = [0, 1, 2, 3, 4, 5]
139 |     assert single_cutoff(cutoffs) == 3
140 | 
141 | 
142 | def determine_expected_cutoffs(y, y_pred):
143 |     # This list will contain the optimal cutoff
144 |     possibles = possible_cutoffs(y_pred)
145 | 
146 |     # Get the accuracy for each possible cutoff
147 |     # Note this is a different method of tracking the best cutoff than in find_optimal_cutoff.
148 |     cutoff_accuracy = {}
149 |     for possible in possibles:
150 |         accuracy, _, _ = accuracy_per_tag(y, y_pred, possible, num_prunes=0)
151 |         cutoff_accuracy[possible] = accuracy
152 | 
153 |     # Get the cutoffs with the max accuracy, there could be more than 1 cutoff
154 |     max_accuracy = max(cutoff_accuracy.items(), key=operator.itemgetter(1))[1]
155 |     optimal_cutoffs = [cutoff for cutoff, accuracy in cutoff_accuracy.items() if max_accuracy == accuracy]
156 | 
157 |     return optimal_cutoffs


--------------------------------------------------------------------------------
/cli/fathom_web/test/test_utils.py:
--------------------------------------------------------------------------------
 1 | from ..utils import fit_unicode
 2 | 
 3 | 
 4 | def test_fit_unicode():
 5 |     assert fit_unicode('abc', 3) == 'abc'
 6 |     assert fit_unicode('abc', 2) == 'ab'
 7 |     assert fit_unicode('a母', 2) == 'a '
 8 |     assert fit_unicode('a母', 3) == 'a母'
 9 |     assert fit_unicode('a母母母s', 7) == 'a母母母'
10 |     assert fit_unicode('a母母母s', 6) == 'a母母 '
11 |     assert fit_unicode('a母母母s', 5) == 'a母母'
12 |     assert fit_unicode('a母母', 4) == 'a母 '
13 |     assert fit_unicode('a母', 6) == 'a母   '
14 | 


--------------------------------------------------------------------------------
/cli/fathom_web/utils.py:
--------------------------------------------------------------------------------
  1 | """Additional factored-up routines for which no clear pattern of organization
  2 | has yet emerged"""
  3 | 
  4 | import io
  5 | from os import walk
  6 | from pathlib import Path
  7 | from random import sample
  8 | from unicodedata import east_asian_width
  9 | 
 10 | from more_itertools import ilen, pairwise
 11 | from numpy import array, histogram
 12 | from sklearn.preprocessing import minmax_scale
 13 | import torch
 14 | from torch.nn import Sequential, Linear, ReLU
 15 | 
 16 | 
 17 | def tensor(some_list):
 18 |     """Cast a list to a tensor of the proper type for our problem."""
 19 |     return torch.tensor(some_list, dtype=torch.float)
 20 | 
 21 | 
 22 | def tensors_from(pages, shuffle=False):
 23 |     """Return (inputs, correct outputs, number of tags that are recognition
 24 |     targets, number of tags that were prematurely pruned) tuple.
 25 | 
 26 |     Can also shuffle to improve training performance.
 27 | 
 28 |     """
 29 |     xs = []
 30 |     ys = []
 31 |     num_targets = num_prunes = 0
 32 |     maybe_shuffled_pages = sample(pages, len(pages)) if shuffle else pages
 33 |     for page in maybe_shuffled_pages:
 34 |         for tag in page['nodes']:
 35 |             if tag.get('pruned'):
 36 |                 num_prunes += 1
 37 |             else:
 38 |                 xs.append(tag['features'])
 39 |                 ys.append([1 if tag['isTarget'] else 0])  # Tried 0.1 and 0.9 instead. Was much worse.
 40 |             if tag['isTarget']:
 41 |                 num_targets += 1
 42 |     return tensor(xs), tensor(ys), num_targets, num_prunes
 43 | 
 44 | 
 45 | def classifier(num_inputs, num_outputs, hidden_layer_sizes=None):
 46 |     """Return a new model of the type Fathom uses.
 47 | 
 48 |     At present, this is a linear binary classifier modeled as a perceptron.
 49 | 
 50 |     :arg num_inputs: The number of input nodes (layer 0 of the net)
 51 |     :arg num_outputs: The number of outputs. So far, always 1 since it's a
 52 |         binary classifier. We may expand to multiclass someday, however.
 53 |     :arg hidden_layer_sizes: For each hidden layer, the number of nodes in it.
 54 |         Fully-connectedness is assumed.
 55 | 
 56 |     """
 57 |     if hidden_layer_sizes is None:
 58 |         hidden_layer_sizes = []
 59 |     sizes = [num_inputs] + hidden_layer_sizes
 60 | 
 61 |     layers = []
 62 |     for i, o in pairwise(sizes):
 63 |         layers.append(Linear(i, o, bias=True))
 64 |         layers.append(ReLU())  # Sigmoid does worse, Tanh about the same.
 65 |     layers.append(Linear(sizes[-1], num_outputs, bias=True))
 66 | 
 67 |     return Sequential(*layers)
 68 | 
 69 | 
 70 | def mini_histogram(data):
 71 |     """Return a histogram of a list of numbers with min and max numbers
 72 |     labeled."""
 73 |     chars = ' ▁▂▃▄▅▆▇█'
 74 |     data_array = array(data)
 75 |     counts, _ = histogram(data_array, bins=10)
 76 |     indices = minmax_scale(counts, feature_range=(0, 8)).round()
 77 |     chart = ''.join(chars[int(i)] for i in indices)
 78 |     return '{min} |{chart}| {max}'.format(min=data_array.min(),
 79 |                                           chart=chart,
 80 |                                           max=data_array.max())
 81 | 
 82 | 
 83 | def speed_readout(pages):
 84 |     """Return human-readable metrics on ruleset-running speed based on
 85 |     benchmarks taken by the Vectorizer."""
 86 |     num_unpruned_nodes = sum(ilen(n for n in p['nodes'] if not n.get('pruned')) for p in pages)
 87 |     average = sum(p['time'] for p in pages) / num_unpruned_nodes
 88 |     histogram = mini_histogram([p['time'] for p in pages])
 89 |     return f'\nTime per page (ms): {histogram}    Average per tag: {average:.0f}'
 90 | 
 91 | 
 92 | def fit_unicode(string, width):
 93 |     """Truncate or pad a string to width, taking into account that some unicode
 94 |     chars are double-width."""
 95 |     width_so_far = 0
 96 |     for num_chars, char in enumerate(string, start=1):
 97 |         width_so_far += 2 if east_asian_width(char) == 'W' else 1
 98 |         if width_so_far == width:
 99 |             break
100 |         elif width_so_far > width:
101 |             num_chars -= 1
102 |             width_so_far -= 2
103 |             break
104 |     return string[:num_chars] + (' ' * (width - width_so_far))
105 | 
106 | 
107 | def samples_from_dir(in_dir):
108 |     """Return an iterable of Paths to samples found in ``in_dir``,
109 |     recursively."""
110 |     for dir_path, dirs, files in walk(in_dir):
111 |         try:
112 |             # Skip resources/ folders. Sometimes they contain .html files, and
113 |             # those aren't samples.
114 |             dirs.remove('resources')
115 |         except ValueError:
116 |             pass
117 |         yield from (Path(dir_path) / file for file in files
118 |                     if file.endswith('.html'))
119 | 
120 | 
121 | def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE):
122 |     """Yield pieces of data from a file-like object until EOF."""
123 |     while True:
124 |         chunk = file.read(size)
125 |         if not chunk:
126 |             break
127 |         yield chunk
128 | 
129 | 
130 | def path_or_none(ctx, param, value):
131 |     return None if value is None else Path(value)
132 | 


--------------------------------------------------------------------------------
/cli/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1


--------------------------------------------------------------------------------
/cli/setup.py:
--------------------------------------------------------------------------------
 1 | from io import open
 2 | from setuptools import setup, find_packages
 3 | 
 4 | 
 5 | setup(
 6 |     name='fathom-web',
 7 |     version='3.7.3',
 8 |     description='Commandline tools for training Fathom rulesets',
 9 |     long_description=open('README.rst', 'r', encoding='utf8').read(),
10 |     author='Erik Rose',
11 |     author_email='erik@mozilla.com',
12 |     license='MPL',
13 |     packages=find_packages(exclude=['*.test']),
14 |     url='https://mozilla.github.io/fathom/',
15 |     install_requires=[
16 |         'click>=7.0,<8.0',
17 |         'more-itertools>=8.2,<9.0',
18 |         'numpy>=1.18.1,<2.0',
19 |         'filelock>=3.0.12',
20 |         'scikit-learn>=0.22.2',
21 |         'selenium>=3.141.0',
22 |         'tensorboardX>=1.6,<2.0',
23 |         'torch>=1.0,<2.0',
24 |         'protobuf <= 3.20.1',
25 |     ],
26 |     dependency_links=[
27 |         'https://download.pytorch.org/whl/cu110/torch_stable.html'
28 |     ],
29 |     entry_points={'console_scripts': [
30 |         'fathom = fathom_web.commands:fathom',
31 |     ]},
32 |     package_data={'': ['fathom.zip']},
33 |     classifiers=[
34 |         'Intended Audience :: Developers',
35 |         'Natural Language :: English',
36 |         'Development Status :: 5 - Production/Stable',
37 |         'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
38 |         'Programming Language :: Python :: 3'
39 |     ],
40 |     keywords=['machine learning', 'ml', 'semantic extraction'],
41 | )
42 | 


--------------------------------------------------------------------------------
/docs/clustering.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Clustering
 3 | ==========
 4 | 
 5 | Fathom provides a flexible clustering algorithm, useful for finding nodes that are bunched together spatially or according to some other metric. By default, it groups nodes based on their proximity and ancestry. It is documented here as top-level functions but is also available directly within rulesets as :func:`bestCluster`, which has the advantage of letting you direct its results to further rules.
 6 | 
 7 | The clustering routines hang off a ``clusters`` object in the top-level Fathom module. To import them, do something like this:
 8 | 
 9 | .. code-block:: js
10 | 
11 |    const {
12 |      clusters: { distance },
13 |    } = require('fathom-web');
14 | 
15 | This will result in a top-level ``distance`` symbol.
16 | 
17 | .. note::
18 | 
19 |    Clustering is computationally expensive (at least O(n^2)). It is powerful, but it should be used only when more efficient alternatives are exhausted.
20 | 
21 | .. autofunction:: clusters
22 | 
23 |    Example:
24 | 
25 |    .. code-block:: js
26 | 
27 |       const {clusters} = require('fathom-web/clusters');
28 |       theClusters = clusters(anArrayOfNodes, 4);
29 | 
30 |    In the above, 4 is the distance beyond which Fathom will decide nodes belong in separate clusters. Turn it up to more aggressively invite nearby nodes into a cluster. Turn it down to keep clusters smaller. The output looks like a list of lists, with each list representing a cluster:
31 | 
32 |    .. code-block:: js
33 | 
34 |       [[nodeA, nodeB, nodeC],
35 |        [nodeD]]
36 | 
37 |    Various factors influence the measured distance between nodes. The first is the obvious one: topological distance, the number of steps along the DOM tree from one node to another.
38 | 
39 |    The second is structural similarity. In the following, the divs ``a`` and ``b`` are farther apart…
40 | 
41 |    .. code-block:: html
42 | 
43 |       <center>
44 |           <div id="a">
45 |           </div>
46 |       </center>
47 |       <div>
48 |           <div id="b">
49 |           </div>
50 |       </div>
51 | 
52 |    …than they would be if the ``center`` tag were a ``div`` as well:
53 | 
54 |    .. code-block:: html
55 | 
56 |       <div>
57 |           <div id="a">
58 |           </div>
59 |       </div>
60 |       <div>
61 |           <div id="b">
62 |           </div>
63 |       </div>
64 | 
65 |    Third is depth disparity. Nodes are considered farther from each other if they are not the same distance from the root.
66 | 
67 |    Finally, there is the presence of "stride" nodes, which are siblings or siblings-of-ancestors that lie
68 |    between 2 nodes. (These are the nodes that would appear between the 2 nodes in a straightforward rendering of the page.) Each stride node makes it less likely that the 2 nodes will be together in a cluster.
69 | 
70 |    The costs for each factor can be customized by wrapping :func:`distance` in an arrow function and passing it as the third param.
71 | 
72 |    .. note::
73 | 
74 |         ``clusters()`` can actually cluster anything, not just DOM nodes. All you need to do is pass in a suitable distance function as the ``getDistance`` param.
75 | 
76 | .. autofunction:: distance(fnodeA, fnodeB, {differentDepthCost = 2, differentTagCost = 2, sameTagCost = 1, strideCost = 1, additionalCost = (fnodeA, fnodeB) => 0})
77 | 
78 | .. autofunction:: euclidean
79 | 


--------------------------------------------------------------------------------
/docs/commands/extract.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.extract:extract
2 |    :prog: fathom extract
3 | 


--------------------------------------------------------------------------------
/docs/commands/fox.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.fox:fox
2 |    :prog: fathom fox
3 | 


--------------------------------------------------------------------------------
/docs/commands/histogram.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.histogram:histogram
2 |    :prog: fathom histogram
3 | 
4 | ----
5 | 
6 | .. image:: ../img/histogram.png
7 | 


--------------------------------------------------------------------------------
/docs/commands/label.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.label:label
2 |    :prog: fathom label
3 | 


--------------------------------------------------------------------------------
/docs/commands/list.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | 
3 |    This command is rarely needed anymore. As of Fathom 3.4, vectorization happens automatically when you run a command that needs it.
4 | 
5 | .. click:: fathom_web.commands.list:list
6 |    :prog: fathom list
7 | 


--------------------------------------------------------------------------------
/docs/commands/pick.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.pick:pick
2 |    :prog: fathom pick
3 | 


--------------------------------------------------------------------------------
/docs/commands/serve.rst:
--------------------------------------------------------------------------------
1 | .. note::
2 | 
3 |    This command is rarely needed anymore. As of Fathom 3.4, vectorization happens automatically when you run a command that needs it.
4 | 
5 | .. click:: fathom_web.commands.serve:serve
6 |    :prog: fathom serve
7 | 


--------------------------------------------------------------------------------
/docs/commands/test.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.test:test
2 |    :prog: fathom test
3 | 


--------------------------------------------------------------------------------
/docs/commands/train.rst:
--------------------------------------------------------------------------------
1 | .. click:: fathom_web.commands.train:train
2 |    :prog: fathom train
3 | 


--------------------------------------------------------------------------------
/docs/debugging.rst:
--------------------------------------------------------------------------------
 1 | =========
 2 | Debugging
 3 | =========
 4 | 
 5 | Setting Breakpoints
 6 | ===================
 7 | 
 8 | If the :doc:`trainer<training>` reports JavaScript errors, you've probably got a bug in your ruleset code. If you can't find it by examination and need to place a breakpoint, the tool of choice is the FathomFox Evaluator.
 9 | 
10 | #. Run :doc:`fathom fox<commands/fox>`, and pass it your ruleset::
11 | 
12 |     fathom fox -r rulesets.js
13 | 
14 | #. Use the instance of Firefox that comes up to open a page that you think will reproduce the problem.
15 | #. Show the dev tools, and navigate to the Debugger panel.
16 | #. In the disclosure tree to the left, disclose FathomFox, and select `rulesets.js`.
17 | #. Scroll to the bottom, past the minified mess, and you’ll see your ruleset code. Place a breakpoint as you like, probably in one of your scoring callbacks.
18 | #. Invoke the Evaluator from the Fathom toolbar menu.
19 | #. Click Evaluate to run the ruleset over the loaded tabs.
20 | 
21 | You’ll end up in the debugger, paused at your breakpoint.
22 | 
23 | Identifying Misrecognized Elements
24 | ==================================
25 | 
26 | .. note::
27 |    Make sure you have the latest trained coefficients and biases pasted into your ruleset before you do this, or recognition won't work well.
28 | 
29 | FathomFox's Evaluator can point out misrecognized elements, in case the tag excerpts emitted by the trainer are insufficient to identify them. To use the Evaluator:
30 | 
31 | #. Open an instance of Firefox with FathomFox and your ruleset loaded (``fathom fox -r rulesets.js`` makes this simple).
32 | #. Open all of the samples you want to diagnose as separate tabs.
33 | #. Open the Evaluator page using FathomFox's browser action button.
34 | #. In the Trainee dropdown, select the trainee you want to diagnose.
35 | #. Click the Evaluate button.
36 | #. Click any red box to navigate to a page with misrecognized nodes.
37 | #. On that tab, open the dev tools panel (ctrl-shift-N) and switch to the Fathom panel. Unfortunately, there aren't yet web extension APIs to do this part automatically.
38 | #. At this point, you’ll see a quick and dirty representation of the “bad” element: a new label called “BAD [the trainee]”. Be sure to delete this if you choose to re-save the page for some reason. Also note that the BAD label is created only when the bad cell is clicked, for speed; if you navigate to the bad page manually, the label won’t be there, or there might be an old label from a previous iteration.
39 | #. Return to the Evaluator tab and click any other red boxes you want to explore.
40 | 
41 | Histograms
42 | ==========
43 | 
44 | Finally, a great way to examine the scores your rules are emitting is :doc:`fathom histogram<commands/histogram>`. It can show you how useful a discriminator a rule is and help you notice when the distribution of output values is not what you expect.
45 | 
46 | .. image:: img/histogram.png
47 | 


--------------------------------------------------------------------------------
/docs/deploy-docs:
--------------------------------------------------------------------------------
 1 | #!/bin/sh -e
 2 | # Upload Sphinx docs to gh-pages branch. Expects the built HTML in docs/_build/html.
 3 | 
 4 | cd docs/_build/html
 5 | touch .nojekyll  # Tells GitHub Pages not to run the site through Jekyll.
 6 | REV=$(git rev-parse HEAD)  # Read before `git init` below; presumably the enclosing checkout's HEAD.
 7 | git init  # A new repository made in the HTML output folder itself.
 8 | git config user.name "Fathom Documenter"
 9 | git config user.email "erik+fathomdoc@mozilla.commm"
10 | git checkout -b gh-pages
11 | git add .
12 | git commit -m "Update docs to ${REV}. [skip ci]"
13 | git remote add mozilla "https://$GH_TOKEN@github.com/mozilla/fathom.git"
14 | # Eat output so it doesn't spit out the sensitive GH_TOKEN if something goes wrong:
15 | git push -q -f mozilla gh-pages > /dev/null 2>&1
16 | 


--------------------------------------------------------------------------------
/docs/development.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Development
 3 | ===========
 4 | 
 5 | Source
 6 | ======
 7 | 
 8 | It's on `GitHub <https://github.com/mozilla/fathom>`_.
 9 | 
10 | Tests and Examples
11 | ==================
12 | 
13 | To run the tests, run... ::
14 | 
15 |     make lint test
16 | 
17 | This will also run the linter and analyze test coverage. To render the coverage report human-legibly, run ``make coverage``. You can then find the coverage report in the ``coverage`` directory.
18 | 
19 | You can also run the linter or tests for just one subproject at a time. For example, to test the CLI tools... ::
20 | 
21 |     cd cli
22 |     make lint test
23 | 
24 | If you want to drop into the debugger in the middle of a JS test, add a ``debugger;`` statement at your desired breakpoint, then run ``make debugtest`` in the ``fathom`` subproject::
25 | 
26 |     cd fathom
27 |     make debugtest
28 | 
29 | Docs
30 | ====
31 | 
32 | To build the docs... ::
33 | 
34 |     make docs
35 | 
36 | Gotchas
37 | =======
38 | 
39 | If you are developing the CLI tools and your changes to their embedded copy of the Fathom JS framework don't seem to be taking effect, commit first. The make target that builds ``fathom.zip`` uses ``git archive`` to pull from ``HEAD``. In this scenario, we tend to use a single local commit we amend with ``git commit --amend --no-edit`` when we want to test our changes.
40 | 
41 | Windows Considerations
42 | ======================
43 | 
44 | Fathom uses `makefiles <https://www.gnu.org/software/make/manual/make.html>`_ to do its builds and run its tests. These makefiles rely on Unix commands. Therefore, if you are developing on Windows, you need access to these Unix commands through something like `Cygwin <https://www.cygwin.com/>`_. You can build and test Fathom using `Windows Subsystem for Linux <https://docs.microsoft.com/en-us/windows/wsl/>`_, but just know that you are technically building and testing Fathom in Linux when you do.
45 | 
46 | Future Roadmap
47 | ==============
48 | 
49 | Fathom 3.x: the incremental gains
50 | ---------------------------------
51 | 
52 | * Regularization. Overfitting doesn't generally happen much, if you keep your eyes on the Tensorboard graphs to dodge wrong LRs, but sometimes you can still add signal to the model and get worse accuracy as a result. That should never happen. Regularization might help with that.
53 | * Automatic normalization. Right now, it's the ruleset author's responsibility to keep scoring callback outputs between 0 and 1. There are helpers to scale things linearly and sigmoidally, but it would be great to do this intelligently and automatically in the trainer, informed by the corpus rather than having the dev make guesses or painstaking calculations about the distribution.
54 | * Shuffle every iteration. Might help avoid overfitting. We shuffle once now.
55 | * Learn cutoff values. Sometimes there are values that, <7, should be treated one way and >7 another. We've had to model these by hand so far, but this should be automatic. We could use bucketing or deeper NNs, but we probably need much bigger corpora to support deeper NNs. The trainer already supports deeper NNs, but the client code needs support, and that'll be a breaking change because the format of the coefficients and biases will have to expand. The math itself, of course, is trivial.
56 | * Make corpus collection cheaper. Another theme for the future, related to the above, is making training data much cheaper to collect, because that would let us trade skilled labor of rule creation for unskilled corpus collection.
57 | * Text signal. So far, we mostly pay attention to markup. Any body-text stuff has to be implemented by the ruleset author. There's no reason we can't integrate a Bayesian (or other) text model on body text or even tokenized CSS classes and IDs. Or URL segments. Or other attribute values. A Bayesian classifier could happily live as a scoring callback, though the trainer would have to be special-cased to go do a separate pass to collect bag-of-words numbers, then in the main pass hand that to the Bayesian scoring callbacks and let the NNs balance the outputs of them as usual. But at this point, I prefer putting effort toward Fathom 4 than this fairly expensive effort with much overlap.
58 | * Visualization. It would be great to have a visualization tool that would show, on sample pages, what's getting classified right and wrong. Just haven't got around to it. Not hard.
59 | 
60 | Fathom 4: the great beyond
61 | --------------------------
62 | 
63 | We had perf problems using Fathom for the FF Companion: running it on every page or several times per page. I've never done much optimization, though profiling shows that 80% of time is spent on DOM calls. DOM calls are both slow and block the main thread, and the DOM cannot be moved off the main thread to do recognition concurrently. So I took a few afternoons and said "What if we dispense with all the DOM calls, then?" Reader Mode just throws the markup across thread boundaries. Let's see what we can get out of that. Sure, we lose heights and widths and visibilities and positions on the page, but there's still lots of signal in that thar text, and Fathom 1 started out there, as a node app running against a stub DOM implementation without access to a renderer. To make a long story short, I built a whole-page categorizer using logistic regression on TFIDF'd bags of words, with all markup stripped out, and...
64 | 
65 | * It gives 85% testing accuracy, very comparable with Smoot Shopping's 90% *validation* accuracy.
66 | * It took a month or more to write the Shopping ruleset. This one I didn't have to write at all; it was trained in 5 seconds.
67 | * I didn't engineer a single feature for this. Not so much as a price regex. It's a general classifier. It did similarly well against our hand-rolled Smoot Article recognizer, which is especially interesting since Articles have wider subject matter than shopping pages.
68 | * There's tons of signal still left on the floor:
69 |     * Stemming. Tried it but didn't have an obvious impact. Odd. Try again.
70 |     * All the markup. I stripped out everything but body text. Teach it to use tag names, CSS classes, IDs, and URL segments.
71 | 
72 | A more open question is whether this can be adapted from whole-page categorization to element recognition, like Fathoms 1-3, which is the more important case.
73 | 
74 | * Continue with this bag-of-words approach on a pruned down set of candidate tags, statistically informed? Either algorithmically come up with a minimal querySelector arg, or use a compressed model to predict which tags we ought to examine, like an attention system in computer vision.
75 | * Perhaps add some hand-rolled but still generic signals, like innertext length, markup bits, or consideration of surrounding elements (parents, grandparents, siblings, etc.).
76 | 
77 | If this could work, it would be a game-changer. Just as Fathoms 1-3 let us do something we couldn't do before at all, Fathom 4 would let you do it in a couple afternoons of low-skilled work rather than a couple weeks to months of skilled.


--------------------------------------------------------------------------------
/docs/example.rst:
--------------------------------------------------------------------------------
 1 | ===============
 2 | Example Ruleset
 3 | ===============
 4 | 
 5 | This is the simple example ruleset that ships with FathomFox; it is made available for experimentation when you run :doc:`commands/fox` without passing in your own ruleset. In its comments, it documents the structure of the ``trainees`` object, which is what :doc:`the trainer<commands/train>` needs to do its job.
 6 | 
 7 | .. literalinclude:: ../fathom_fox/src/rulesets.js
 8 |    :language: js
 9 |    :linenos:
10 | 


--------------------------------------------------------------------------------
/docs/exceptions.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Exceptions
 3 | ==========
 4 | 
 5 | Fathom's exceptions hang off an ``exceptions`` object in the top-level Fathom module. To import them, do something like this:
 6 | 
 7 | .. code-block:: js
 8 | 
 9 |    const {
10 |      exceptions: { NoWindowError },
11 |    } = require('fathom-web');
12 | 
13 | This will result in a top-level ``NoWindowError`` symbol.
14 | 
15 | .. autoclass:: CycleError
16 | .. autoclass:: NoWindowError
17 | 


--------------------------------------------------------------------------------
/docs/fnodes.rst:
--------------------------------------------------------------------------------
1 | ======
2 | Fnodes
3 | ======
4 | 
5 | Fnodes are typically returned from methods on Fathom :doc:`rulesets<ruleset>`.
6 | 
7 | .. autoclass:: Fnode
8 |    :members: element, hasNoteFor, hasType, noteFor, scoreFor
9 | 


--------------------------------------------------------------------------------
/docs/glossary.rst:
--------------------------------------------------------------------------------
 1 | ========
 2 | Glossary
 3 | ========
 4 | 
 5 | .. glossary::
 6 | 
 7 |    candidate
 8 |        Any node (:term:`target` or not) brought into the ruleset by a :func:`dom` or :func:`element` call for consideration
 9 | 
10 |    fnode
11 |        A wrapper around a DOM node, holding :term:`scores<score>`, :term:`notes<note>`, and :term:`types<type>` pertaining to it. See :doc:`fnodes`.
12 | 
13 |    note
14 |        An arbitrary, opaque-to-Fathom piece of data attached to a given :term:`type` on a :term:`fnode`. Notes can be consulted by scoring callbacks and are a good place to park expensive-to-recompute information. They are the main way of passing data between rules.
15 | 
16 |    ruleset
17 |        The unordered collection of rules that forms a Fathom program. See :doc:`rules` for more on the relationships between top-level constructs.
18 | 
19 |    score
20 |        The fuzzy-edged part of :term:`fnode` state. A floating-point number, typically between 0 and 1, attached to a certain :term:`type` on a :term:`fnode`. They represent the confidence with which a node belongs to a type.
21 | 
22 |    subscore
23 |        A single rule's contribution to a node's score for some type. In Fathom's current incarnation as a series of (single-layer) perceptrons, each rule's subscore is multiplied by a coefficient, which is derived from training. The weighted subscores are then added together and fed through a sigmoid function to get the final score for a node for a type.
24 | 
25 |    target
26 |        A "right answer" DOM node, one that should be recognized as belonging to some type
27 | 
28 |    type
29 |        A string-typed category assigned to a :term:`fnode`. Types are the boolean, hard-edged, enumerated parts of fnode state. They also largely determine inter-rule dependencies and thus which rules get run in response to a query.
30 | 
31 |    vectorize
32 |        To turn a collection of sample HTML pages into vectors of numbers which the trainer then imbibes.
33 | 


--------------------------------------------------------------------------------
/docs/img/histogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/docs/img/histogram.png


--------------------------------------------------------------------------------
/docs/img/price_tracker_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/docs/img/price_tracker_screenshot.png


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | ======
 2 | Fathom
 3 | ======
 4 | 
 5 | .. image:: https://circleci.com/gh/mozilla/fathom.svg?style=svg
 6 |    :alt: Build Status
 7 |    :target: https://circleci.com/gh/mozilla/fathom
 8 | 
 9 | .. image:: https://coveralls.io/repos/github/mozilla/fathom/badge.svg?branch=master
10 |    :alt: Coverage Status
11 |    :target: https://coveralls.io/github/mozilla/fathom?branch=master
12 | 
13 | Find meaning in the web.
14 | 
15 | .. toctree::
16 |    :caption: Documentation
17 |    :maxdepth: 2
18 | 
19 |    intro
20 |    installing
21 |    samples
22 |    rules
23 |    training
24 |    debugging
25 |    integrating
26 |    maintaining
27 |    zoo
28 |    development
29 | 
30 | .. toctree::
31 |    :caption: API Reference
32 |    :maxdepth: 2
33 | 
34 |    clustering
35 |    exceptions
36 |    fnodes
37 |    ruleset
38 |    utilities
39 | 
40 | .. _command-reference:
41 | 
42 | .. toctree::
43 |    :caption: Command Reference
44 |    :titlesonly:
45 |    :glob:
46 | 
47 |    commands/*
48 | 
49 | Support
50 | =======
51 | 
52 | You can find us on...
53 | 
54 | * `Our Matrix chat room <https://chat.mozilla.org/#/room/#fathom:mozilla.org>`_
55 | * `GitHub <https://github.com/mozilla/fathom>`_
56 | * `The mailing list <https://mail.mozilla.org/listinfo/fathom>`_
57 | 
58 | .. toctree::
59 |    :caption: Back Matter
60 |    :titlesonly:
61 | 
62 |    example
63 |    versions
64 |    glossary
65 | 
66 | * :ref:`genindex`
67 | 
68 | .. toctree::
69 |    :hidden:
70 | 
71 |    zoo/new_password
72 |    zoo/login
73 |    zoo/smoot_articles
74 |    zoo/smoot_shopping
75 |    zoo/price_tracker


--------------------------------------------------------------------------------
/docs/installing.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Installing
 3 | ==========
 4 | 
 5 | Fathom consists of 3 parts. Here's how to install each one.
 6 | 
 7 | .. _fathomfox-installation:
 8 | 
 9 | Commandline Tools
10 | =================
11 | 
12 | Fathom's commandline tools take your labeled pages as input and train the machine-learning model. They also contain an embedded copy of FathomFox (see below), the simplest way to collect pages. If you don't already have Python 3.7 or better, download it from https://www.python.org/downloads/. Then, install the tools by running... ::
13 | 
14 |     pip3 install fathom-web
15 | 
16 | It's possible your Python package manager is called simply "pip" rather than "pip3". Give that a try if the above fails.
17 | 
18 | You will also need to install `Node.js <https://nodejs.org/en/>`_ to use many of the commandline tools.
19 | 
20 | FathomFox
21 | =========
22 | 
23 | FathomFox is a browser extension used to label web pages. The best way to get it is to first install the commandline tools and then run… ::
24 | 
25 |     fathom fox
26 | 
27 | This will launch a built-in copy of FathomFox in a fresh Firefox profile so ad blockers and other customizations don't interfere with the clean capture of labeled pages. (Some ad blockers will make changes to the DOM, like adding style attributes to ad iframes to hide them.) Using the commandline launcher also lets you pass in your own rulesets for debugging with the FathomFox Evaluator. See the ``-r`` option on the :doc:`fathom fox reference page<commands/fox>`.
28 | 
29 | For more casual use, you can instead `install FathomFox through the web <https://addons.mozilla.org/en-US/firefox/addon/fathomfox/>`_, in which case it will be your responsibility to avoid addons that might mutate the DOM.
30 | 
31 | Fathom
32 | ======
33 | 
34 | Fathom proper is a JS library which runs trained rulesets to do the actual recognition. You don't need to worry about installing it until your rulesets are performing satisfactorily and you're ready to integrate them with your application.
35 | 
36 | If your application runs server-side under `Node.js <https://nodejs.org/en/>`_, you can install `the Fathom node package <https://www.npmjs.com/package/fathom-web>`_ like any other dependency::
37 | 
38 |     npm install fathom-web
39 | 
40 | If, instead, you're working on a Firefox feature, you can use the copy of Fathom already in Firefox by saying something like this at the top of the file containing your ruleset::
41 | 
42 |     ChromeUtils.defineModuleGetter(
43 |       this,
44 |       "fathom",
45 |       "resource://gre/modules/third_party/fathom/fathom.jsm"
46 |     );
47 | 
48 |     const {
49 |       dom,
50 |       element,
51 |       out,
52 |       rule,
53 |       ruleset,
54 |       score,
55 |       type,
56 |       utils: { identity, isVisible, min },
57 |       clusters: { euclidean },
58 |     } = fathom;
59 | 
60 | Finally, if you need a self-contained bundle of Fathom in a context that can't use node packages, check out our `source <https://github.com/mozilla/fathom>`_ and run ``make -C fathom bundle``. This creates the bundle at ``fathom/dist/fathom.js``.
61 | 


--------------------------------------------------------------------------------
/docs/integrating.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Integrating
 3 | ===========
 4 | 
 5 | Once your ruleset is written and trained, your application can run a DOM tree through it:
 6 | 
 7 | .. code-block:: js
 8 | 
 9 |    // Tell the ruleset which DOM to run against, yielding a factbase about the
10 |    // document:
11 |    const facts = rules.against(document);
12 | 
13 | Then you can pull answers out of the factbase. In the case of the :doc:`example`, we want the node representing the highest-scoring overlay, which the ruleset conveniently stores under the "overlay" output key:
14 | 
15 | .. code-block:: js
16 | 
17 |    const bestOverlayFnode = facts.get('overlay');
18 | 
19 | If you're using a third-party ruleset that doesn't anticipate the output you want, you can ask for it more explicitly by passing a query, in the form of a full :ref:`LHS <lhs>`, to :func:`~BoundRuleset.get`. For example, if you simply want all the overlay-typed things so you can do further computation on them...
20 | 
21 | .. code-block:: js
22 | 
23 |    const allOverlayFnodes = facts.get(type('overlay'));
24 | 
25 | Or if you have a reference to a DOM element from elsewhere in your program, you can look up the scores, types, and notes Fathom attached to it:
26 | 
27 | .. code-block:: js
28 | 
29 |    const fnode = facts.get(dom.getElementById('someOverlay'));
30 | 
31 | Remember, once you have a :class:`~Fnode`, you can access the wrapped element from its :attr:`~Fnode.element` property.
32 | 


--------------------------------------------------------------------------------
/docs/intro.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Introduction
 3 | ============
 4 | 
 5 | Fathom is a supervised-learning system for recognizing parts of web pages—pop-ups, address forms, slideshows—or for classifying a page as a whole. A DOM flows in one side, and DOM nodes flow out the other, tagged with types and probabilities that those types are correct. A Prolog-like language makes it straightforward to specify the hints that suggest each type, and a neural-net-based trainer determines the optimal contribution of each. Finally, the `FathomFox <https://addons.mozilla.org/en-US/firefox/addon/fathomfox/>`_ web extension and a rich assortment of commandline tools help you collect, label, and use a corpus of web pages to train a recognizer.
 6 | 
 7 | Why?
 8 | ====
 9 | 
10 | A study of existing projects like Readability and Distiller suggests that purely imperative approaches to semantic extraction get bogged down in the mechanics of DOM traversal and state accumulation, obscuring the operative parts of the extractors and making new ones long and tedious to write. They involve a lot of human guessing of numerical weights. And they are brittle due to the promiscuous profusion of state. Fathom makes extractors easier to write by providing a declarative language, corpus capture, and neural-net-based training. With these, Fathom handles tree-walking, execution order, weight determination, and annotation bookkeeping, letting you concentrate on your application.
11 | 
12 | Specific Areas We Address
13 | =========================
14 | 
15 | * Browser-native DOM nodes are mostly immutable, and ``HTMLElement.dataset`` is string-typed, so storing arbitrary intermediate data on nodes is clumsy. Fathom addresses this by providing the Fathom node (or :term:`fnode`, pronounced fuh-NODE), a proxy around each DOM node which we can scribble on.
16 | * With imperative extractors, any experiments or site-specific customizations must be hard-coded in. On the other hand, Fathom's :term:`rulesets<ruleset>` (the programs you write in Fathom) are unordered and thereby decoupled, stitched together only by the :term:`types<type>` they consume and emit. External rules can thus be plugged into existing rulesets, making it easy to experiment without maintaining a fork—or to provide dedicated rules for particularly intractable web sites.
17 | * Types provide an easy way to categorize DOM nodes. They are also Fathom's black-box units of abstraction, as functions are in other programming languages.
18 | * The type system also makes explicit the division between a ruleset's public and private APIs: the types are public, and the imperative activity that goes on inside callback functions is private. This provides the freedom to extend existing rulesets without editing them directly, so multiple third-party refinements can be mixed together.
19 | * Persistent state is cordoned off in typed :term:`notes<note>` on fnodes. Thus, when a rule declares that it takes such-and-such a type as input, it can rightly assume (if rules are written consistently) there will be a note of that type on the fnodes that are passed in.
20 | * A :doc:`neural-network-powered trainer<training>` quickly adjusts the weights of your rules to maximize accuracy.
21 | 
22 | Bonus Features
23 | --------------
24 | 
25 | * Efficient execution, driven by a query planner that understands inter-rule dependencies
26 | * Lazy execution, so you can have arbitrarily large rulesets with impunity
27 | * Caching to keep from re-deriving intermediate results between queries
28 | * Clustering based on a notion of DOM node distance influenced by structural similarity
29 | * Many handy utils from which to compose scoring callbacks
30 | 
31 | Where It Works
32 | ==============
33 | 
34 | Fathom is a JavaScript framework that works against the DOM API, so you can use it server-side with ``jsdom`` or any other implementation, or you can embed it in a browser and pass it a native DOM.
35 | 


--------------------------------------------------------------------------------
/docs/maintaining.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Maintaining
 3 | ===========
 4 | 
 5 | A successful production ruleset will need to be improved from time to time.
 6 | 
 7 | Reviewing a Change
 8 | ==================
 9 | 
10 | Points to consider when reviewing a model change:
11 | 
12 | * Make sure the metrics are better. If the change involved adding samples, do a :doc:`fathom test<commands/test>` run with the old coefficients (and the new samples) as a baseline. This should result in worse metrics than the production ruleset, since you made it harder by introducing failing samples. Then compare those metrics to a new :doc:`fathom train<commands/train>` run with the new samples and any ruleset code changes. If the second metrics are better, you should adopt the new model. See :ref:`Evaluating Metrics <evaluating-metrics>` for how to compare them.
13 | 
14 |   Ideally you can collect several samples representative of the problem you're trying to solve and distribute them across the training/validation/test sets. If you can find only one, you'll have to settle for putting it in training so the coefficients can be informed by it.
15 | * Make sure the "before" and "after" metrics, with commandline flags, are in the commit message to justify the change.
16 | * Review ruleset code changes as in a normal code review, for correctness and comprehensibility.
17 | 
18 | If Adding Samples
19 | -----------------
20 | 
21 | If you added samples to the corpus, do these as well:
22 | 
23 | * Make sure the names of the samples conform to the convention documented in ``samples/rubric.txt``.
24 | * Check that the samples have been :doc:`extracted<commands/extract>` and render properly in Firefox. Use :doc:`fathom serve<commands/serve>` to make sure cross-origin policies (which are picky for ``file://`` URLs) aren't preventing the loading of subresources. Improper rendering can cause improper training.
25 | 


--------------------------------------------------------------------------------
/docs/ruleset.rst:
--------------------------------------------------------------------------------
  1 | ==================
  2 | Rules and Rulesets
  3 | ==================
  4 | 
  5 | Most everything on this page is a top-level object in the Fathom library, importable like this, for instance:
  6 | 
  7 | .. code-block:: js
  8 | 
  9 |    const {
 10 |       dom,
 11 |       element,
 12 |       out,
 13 |       rule,
 14 |       ruleset
 15 |    } = require('fathom-web');
 16 | 
 17 | Rulesets
 18 | ========
 19 | 
 20 | The most important Fathom object is the ruleset, an unordered collection of rules. The plain old :class:`Ruleset` is what you typically construct, via the ``ruleset`` convenience function:
 21 | 
 22 | .. autofunction:: ruleset
 23 | 
 24 | .. autoclass:: Ruleset
 25 |    :members: against, rules
 26 | 
 27 | Then you call :func:`Ruleset.against` to get back a :class:`BoundRuleset`, which is specific to a given DOM tree. From that, you pull answers.
 28 | 
 29 | .. autoclass:: BoundRuleset
 30 |    :members: get, setCoeffsAndBiases
 31 | 
 32 | Rules
 33 | =====
 34 | 
 35 | These are the control structures which govern the flow of scores, types, and notes through a ruleset. You construct a rule by calling :func:`rule` and passing it a left-hand side and a right-hand side:
 36 | 
 37 | .. autofunction:: rule
 38 | 
 39 | .. _lhs:
 40 | 
 41 | Left-hand Sides
 42 | ---------------
 43 | 
 44 | Left-hand sides are currently a few special forms which select nodes to be fed to right-hand sides.
 45 | 
 46 | .. autofunction:: dom
 47 | 
 48 | .. autofunction:: lhs.element
 49 |    :short-name:
 50 | 
 51 | .. function:: type(theType)
 52 | 
 53 |    Take nodes that have the given type. Example: ``type('titley')``
 54 | 
 55 |    .. autofunction:: TypeLhs#max
 56 |       :short-name:
 57 | 
 58 |    .. autofunction:: TypeLhs#bestCluster
 59 |       :short-name:
 60 | 
 61 | .. autofunction:: and(typeCall[, typeCall, ...])
 62 | 
 63 | .. autofunction:: nearest(typeCallA, typeCallB[, distance=euclidean])
 64 | 
 65 | .. autofunction:: when(predicate)
 66 | 
 67 | 
 68 | Right-hand Sides
 69 | ----------------
 70 | 
 71 | A right-hand side takes the nodes chosen by the left-hand side and mutates them. Spelling-wise, a RHS is a strung-together series of calls like this::
 72 | 
 73 |     type('smoo').props(someCallback).type('whee').score(2)
 74 | 
 75 | To facilitate factoring up repetition in right-hand sides, calls layer together like sheets of transparent acetate: if there are repeats, as with ``type`` in the above example, the rightmost takes precedence and the left becomes useless. Similarly, if :func:`props`, which can return multiple properties of a fact (element, note, score, and type), is missing any of these properties, we continue searching to the left for anything that provides them (excepting other :func:`props` calls—if you want that, write a combinator, and use it to combine the 2 functions you want). To prevent this, return all properties explicitly from your props callback, even if they are no-ops (like ``{score: 1, note: undefined, type: undefined}``). Aside from this layering precedence, the order of calls does not matter.
 76 | 
 77 | A good practice is to use more declarative calls—:func:`score`, :func:`note`, and :func:`type`—as much as possible and save :func:`props` for when you need it. The query planner can get more out of the more specialized calls without you having to tack on verbose hints like :func:`atMost` or :func:`typeIn`.
 78 | 
 79 | .. autofunction:: InwardRhs#atMost
 80 |    :short-name:
 81 | 
 82 | .. autofunction:: InwardRhs#props
 83 |    :short-name:
 84 | 
 85 |    For example...
 86 | 
 87 |    .. code-block:: js
 88 | 
 89 |       function callback(fnode) {
 90 |           return [{score: 3,
 91 |                    element: fnode.element,  // unnecessary, since this is the default
 92 |                    type: 'texty',
 93 |                    note: {suspicious: true}}];
 94 |       }
 95 | 
 96 |    If you use ``props``, Fathom cannot look inside your callback to see what type you are emitting, so you must declare your output types with :func:`typeIn` or set a single static type with ``type``. Fathom will complain if you don't. (You can still opt not to return any type if the node turns out not to be a good match, even if you declare a :func:`typeIn`.)
 97 | 
 98 | .. autofunction:: InwardRhs#note
 99 |    :short-name:
100 | 
101 |    Since every node can have multiple, independent notes (one for each type), this applies to the type explicitly set by the RHS or, if none, to the type named by the ``type`` call on the LHS. If the LHS has none because it's a ``dom(...)`` LHS, an error is raised.
102 | 
103 |    When you query for fnodes of a certain type, you can expect to find notes of any form you specified on any RHS with that type. If no note is specified, it will be undefined. However, if two RHSs emit a given type, one adding a note and the other not adding one (or adding an undefined one), the meaningful note overrides the undefined one. This allows elaboration on a RHS's score (for example) without needing to repeat note logic.
104 | 
105 |    Indeed, ``undefined`` is not considered a note. So, though notes cannot in general be overwritten, a note that is ``undefined`` can. Symmetrically, an ``undefined`` returned from a :func:`note` or :func:`props` or the like will quietly decline to overwrite an existing defined note, where any other value would cause an error. Rationale: letting ``undefined`` be a valid note value would mean you couldn't shadow a leftward note in a RHS without introducing a new singleton value to serve as a "no value" flag. It's not worth the complexity and the potential differences between the (internal) fact and fnode note value semantics.
106 | 
107 |    Best practice: any rule adding a type should apply the same note. If only one rule of several type-foo-emitting ones did, it should be made to emit a different type instead so downstream rules can explicitly state that they require the note to be there. Otherwise, there is nothing to guarantee the note-adding rule will run before the note-needing one.
108 | 
109 | .. autofunction:: out
110 | 
111 |    If you are not using ``through()`` or ``allThrough()``, you can omit the call to ``out()`` and simply specify the key as the RHS of the rule. For example: ``rule(type('titley').max(), out('title'))`` can be written as ``rule(type('titley').max(), 'title')``.
112 | 
113 |    .. autofunction:: OutwardRhs#through
114 |       :short-name:
115 | 
116 |    .. autofunction:: OutwardRhs#allThrough
117 |       :short-name:
118 | 
119 | .. autofunction:: InwardRhs#score
120 |    :short-name:
121 | 
122 | .. autofunction:: InwardRhs#type
123 |    :short-name:
124 | 
125 | .. autofunction:: InwardRhs#typeIn(type[, type, ...])
126 |    :short-name:
127 | 


--------------------------------------------------------------------------------
/docs/theme/static/tweaks.css:
--------------------------------------------------------------------------------
 1 | @import url(css/theme.css);
 2 | 
 3 | /* Don't make all the code literals distractingly red and bold. Bold now indicates a link. */
 4 | 
 5 | code, .rst-content tt, .rst-content code, .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
 6 |     color: #000;
 7 |     font-weight: normal;
 8 | }
 9 | 
10 | .rst-content dl:not(.docutils) tt, .rst-content dl:not(.docutils) tt, .rst-content dl:not(.docutils) code {
11 |     font-weight: normal;
12 | }
13 | 
14 | a code, .rst-content a tt, .rst-content a code, .rst-content a tt.literal, .rst-content a tt.literal, .rst-content a code.literal {
15 |     font-weight: bold;
16 | }
17 | 
18 | .rst-content .section ol li, .rst-content .section ul li, .rst-content ol.arabic li, .rst-content ul li, article ul li, article ol li {
19 |     font-size: 16px;
20 | }
21 | 


--------------------------------------------------------------------------------
/docs/theme/theme.conf:
--------------------------------------------------------------------------------
1 | [theme]
2 | inherit = sphinx_rtd_theme
3 | stylesheet = tweaks.css
4 | 


--------------------------------------------------------------------------------
/docs/utilities.rst:
--------------------------------------------------------------------------------
 1 | =================
 2 | Utility Functions
 3 | =================
 4 | 
 5 | In addition to components intrinsically tied to rulesets, Fathom comes with a variety of utility procedures for building scoring and other callback functions or just for improving the imperative shell around your ruleset.
 6 | 
 7 | The utilities hang off a ``utils`` object in the top-level Fathom module. To import them, do something like this:
 8 | 
 9 | .. code-block:: js
10 | 
11 |    const {
12 |      utils: { isBlock, isVisible },
13 |    } = require('fathom-web');
14 | 
15 | This will result in top-level ``isBlock`` and ``isVisible`` symbols.
16 | 
17 | .. autofunction:: ancestors
18 | .. autofunction:: attributesMatch
19 | .. autofunction:: best
20 | .. autofunction:: collapseWhitespace
21 | .. autofunction:: domSort
22 | .. autofunction:: first
23 | .. autofunction:: getDefault
24 | .. autofunction:: identity
25 | .. autofunction:: inlineTextLength
26 | .. autofunction:: inlineTexts
27 | .. autofunction:: isBlock
28 | .. autofunction:: isVisible
29 | .. autofunction:: isWhitespace
30 | .. autofunction:: length
31 | .. autofunction:: linearScale
32 | .. autofunction:: linkDensity
33 | .. autofunction:: utilsForFrontend.max
34 |    :short-name:
35 | .. autofunction:: maxes
36 | .. autofunction:: min
37 | .. autoclass:: NiceSet
38 |    :members:
39 | .. autofunction:: numberOfMatches
40 | .. autofunction:: page
41 | .. autofunction:: reversed
42 | .. autofunction:: rgbaFromString
43 | .. autofunction:: rootElement
44 | .. autofunction:: saturation
45 | .. autofunction:: setDefault
46 | .. autofunction:: sigmoid
47 | .. autofunction:: sum
48 | .. autofunction:: toDomElement
49 | .. autofunction:: toposort
50 | .. autofunction:: walk
51 | 


--------------------------------------------------------------------------------
/docs/zoo.rst:
--------------------------------------------------------------------------------
 1 | ===========
 2 | Ruleset Zoo
 3 | ===========
 4 | 
 5 | Welcome to the Fathom Ruleset Zoo, a bestiary of Fathom real-world examples. Each gives an overview and links to a repository with full source code.
 6 | 
 7 | .. note::
 8 |    Some repos are private because they contain copyrighted training samples. While we believe this is fair use, we don't wish to provoke cease-and-desist bots. If you work for Mozilla, just ask, and we’ll grant you access. Otherwise, we've pasted the ruleset source code into the docs, so you can at least see that. Enjoy!
 9 | 
10 | New-Password Forms
11 | ==================
12 | 
13 | Firefox's password manager needed a way to identify new-password fields so it could suggest (and memorize) high-entropy passwords for them. There is standardized markup for this, but only 2-4% of sites use it. Fathom thus stepped in to backstop the other 97%. On a corpus of 508 pages, we trained to a testing precision of 99.2% and recall of 92.1%. (We used ``fathom train --pos-weight`` to slant the results in favor of fewer false positives, sacrificing some recall for it.) Independent QA work showed an accuracy and false-negative rate better than that of Google Chrome—and a false-positive rate only 1% worse—and all of that with a purely client-side model. It shipped in Firefox 76.
14 | 
15 | :doc:`Ruleset source<zoo/new_password>`
16 | 
17 | `Full repo <https://github.com/mozilla-services/fathom-login-forms/blob/master/new-password/rulesets.js>`_
18 | 
19 | Login Forms
20 | ===========
21 | 
22 | As a proof-of-concept next-generation autofiller for `Firefox Lockwise <https://www.mozilla.org/en-US/firefox/lockwise/>`_, we built recognizers for login forms’ username fields and Log In buttons.
23 | 
24 | This is a clean, simple example of a Fathom 3 ruleset. It was designed for Fathom 3 from the beginning, solves the problem concisely, and has respectable accuracy.
25 | 
26 | Recognizers
27 | -----------
28 | 
29 | * **Username field.** This is the username or (as is increasingly the case) email field of the login form. The ruleset finds the precise ``<input>`` element for form fill. Validation precision and recall: both 96.6%, on 162 candidate tags across 64 pages, including ones with no login forms or with adversarial constructs like password-change, credit-card, and shipping forms.
30 | * **Next button.** The Log In button or, for multi-page login flows, whatever you click to advance to the next step. This was the more challenging recognizer, since there is a wider diversity of both markup and text for these constructs. Validation precision: 100%. Validation recall: 72.9%. This is across 490 candidate tags on 64 pages. There is plenty of signal left on the table, so more invested time should give us another percentage point or two. (The whole project was timeboxed to about 3 weeks.)
31 | 
32 | :doc:`Ruleset source<zoo/login>`
33 | 
34 | `Full repo <https://github.com/mozilla-services/fathom-login-forms/blob/master/lockwise-proof-of-concept/trainees.js>`_
35 | 
36 | Smoot: Page Classification
37 | ==========================
38 | 
39 | An upcoming Firefox metrics effort, Project Smoot will use a set of whole-page classifiers to characterize user tasks in a privacy-preserving way.
40 | 
41 | Recognizers
42 | -----------
43 | * **Shopping.** A page is a shopping page iff a user would seek it out in the process of choosing or buying things. This is a very challenging rubric, as it almost demands the model reach inside the head of the user to determine intent. A page about Amazon's affiliate program is not a shopping page, even though it appears on a shopping-focused domain. A forum thread on Reddit discussing the merits of competing products is a shopping page, even though it’s not near any actual Buy buttons.
44 | 
45 |   Despite the difficulty of the task, our model, still under development, scores over 90% in validation on a corpus of 100 pages.
46 | * **Article.** A page whose main attraction is prose to read. Though still under development, this model scores 90% in validation on a corpus of 60 pages.
47 | * **“Techie” Article.** An article aimed at a computer-savvy audience. This is intended for audience segmentation. Unlike the two recognizers above, it’s too early for numbers here.
48 | 
49 | :doc:`Articles ruleset source<zoo/smoot_articles>`
50 | 
51 | :doc:`Shopping ruleset source<zoo/smoot_shopping>`
52 | 
53 | `Full repo <https://github.com/mozilla-services/fathom-smoot>`_
54 | 
55 | Price Tracker
56 | =============
57 | 
58 | Originally designed for Fathom 2.0 but ported to 3.0 as a team familiarization exercise, Firefox Price Tracker is a now-retired web extension that periodically polled the prices of a wishlist of products and notified the user of price drops. Fathom provided the recognition of products for sale: their names, images, and prices. Out of an abundance of caution, Price Tracker underutilized Fathom’s ability to generalize, artificially limiting itself to the 5 top commerce sites in the U.S. However, its compact example is easy to digest in a sitting, and it’s a fine instance of Fathom increasing the agency of thousands of users when wrapped in a quality, lightweight UI.
59 | 
60 | .. image:: img/price_tracker_screenshot.png
61 | 
62 | Recognizers
63 | -----------
64 | 
65 | * **Image.** The “hero” image showing the product. Validation accuracy: 99.34%. Testing accuracy: 75%.
66 | * **Title.** The name of the product. Validation accuracy: 100%. Testing accuracy: 83.38%.
67 | * **Price.** The price charged for the product. Validation accuracy: 99.27%. Testing accuracy: 99.46%.
68 | 
69 | Price Tracker’s accuracy numbers are unusually noisy, partly due to the rules being written with an earlier version of Fathom in mind and partly due to its small, homogeneous sample corpus. Pages came from only 5 sites, and testing and validation corpora were each only 20 pages. The 95% confidence interval for accuracy numbers thus spans as much as 30%. If we were to ship a Fathom-3.0-powered Price Tracker, we would refine until we had only a few percentage points of spread.
70 | 
71 | More metrics are available on `the pull request that merged the Fathom 3 upgrade <https://github.com/mozilla/price-tracker/pull/317>`_, but they mostly serve as a warning that a more diverse corpus is necessary for confident measurement. Take Price Tracker as an example of coding practices and product-market fit, not corpus design.
72 | 
73 | :doc:`Ruleset source<zoo/price_tracker>`
74 | 
75 | `Full repo <https://github.com/mozilla/price-tracker/blob/master/src/extraction/fathom/ruleset_factory.js>`_
76 | 
77 | Pop-up Detector
78 | ===============
79 | 
80 | Pop-up “windows” on the web have migrated from actual windows to in-page elements, largely due to browsers’ success at blocking the old kind. We mentored a student project to recognize in-page pop-ups using the older Fathom 2.
81 | 
82 | Results were encouraging, hovering around 85% on a blind testing corpus. Revamped for a modern Fathom, it might give higher numbers with little effort. In the meantime, it serves as a good example of perceptive rules. But don't lean overmuch on the ranges of numbers returned from scoring callbacks; that all changed in Fathom 3.
83 | 
84 | `Pop-up Detector source <https://github.com/capstone-2018873/fathom-trainees/tree/master/src/models>`_
85 | 


--------------------------------------------------------------------------------
/fathom/.babelrc:
--------------------------------------------------------------------------------
1 | {
2 |   "plugins": [
3 |     "transform-es2015-modules-commonjs", "dynamic-import-node", "@babel/plugin-proposal-export-namespace-from", "@babel/plugin-transform-exponentiation-operator"
4 |   ]
5 | }
6 | 


--------------------------------------------------------------------------------
/fathom/.eslintignore:
--------------------------------------------------------------------------------
1 | # eslint will trip over the export statements herein until https://github.com/eslint/eslint/issues/12629 is fixed:
2 | index.mjs
3 | 


--------------------------------------------------------------------------------
/fathom/.eslintrc.yml:
--------------------------------------------------------------------------------
 1 | env:
 2 |   es6: true
 3 |   node: true
 4 |   mocha: true
 5 | 
 6 | parserOptions:
 7 |   sourceType: module
 8 |   ecmaVersion: 8
 9 | 
10 | extends:
11 |   - eslint:recommended
12 |   - plugin:node/recommended
13 | 
14 | plugins:
15 |   - import
16 |   - node
17 | 
18 | root: true
19 | 
20 | rules:
21 |   array-bracket-spacing: [error, never]
22 |   eqeqeq: error
23 |   generator-star-spacing: [warn, {before: true, after: false}]
24 |   guard-for-in: warn  # There's nothing wrong with for..in if you know what you're doing. This is here just to keep me from accidentally saying "for..in" when I mean "for..of". Delete this and come up with a better solution if we ever need to use "for..in".
25 |   indent: [error, 4, {ObjectExpression: first, ArrayExpression: first, CallExpression: {arguments: first}, FunctionDeclaration: {parameters: first}}]
26 |   max-len: [off, {code: 100, ignoreComments: true, ignoreStrings: true, ignoreTemplateLiterals: true}]
27 |   node/exports-style: [error, module.exports]
28 |   node/no-missing-import: [error, {tryExtensions: [".js", ".mjs"]}]
29 |   node/no-unpublished-require: off
30 |   no-console: off
31 |   no-dupe-class-members: error
32 |   no-loop-func: error
33 |   no-new-func: error  # equivalent to eval()
34 |   no-prototype-builtins: off
35 |   no-restricted-globals: [error, getComputedStyle]
36 |   no-throw-literal: error
37 |   no-trailing-spaces: error
38 |   no-underscore-dangle: off
39 |   no-unused-vars: [warn, {vars: all, args: none}]
40 |   no-use-before-define: [error, {functions: false, classes: false}]
41 |   no-useless-escape: error
42 |   no-var: warn
43 |   no-warning-comments: [warn, {terms: [xxx, fixme, hack], location: start}]
44 |   object-curly-spacing: [error, never]
45 |   object-shorthand: [error, properties]
46 |   prefer-const: off
47 |   quotes: [error, single, {avoidEscape: true, allowTemplateLiterals: true}]
48 |   semi: [error, always]
49 |   space-before-blocks: [error, always]
50 |   space-before-function-paren: [error, {anonymous: always, named: never}]
51 |   import/extensions: [error, always, {js: never, mjs: never}]
52 | 
53 | settings:
54 |   import/resolver:
55 |     node:
56 |       extensions: ['.js', '.mjs']
57 | 


--------------------------------------------------------------------------------
/fathom/.npmignore:
--------------------------------------------------------------------------------
 1 | /.npm_installed
 2 | /.babelrc
 3 | /.eslintignore
 4 | /.eslintrc.yml
 5 | /.nyc_output
 6 | /Makefile
 7 | /rollup.config.js
 8 | /test
 9 | /coverage
10 | /*.log
11 | /dist
12 | /venv
13 | 


--------------------------------------------------------------------------------
/fathom/Makefile:
--------------------------------------------------------------------------------
 1 | PATH := ./node_modules/.bin:$(PATH)
 2 | 
 3 | JS  := $(shell find . -name '*.mjs' | grep -v '^./node_modules/.*' | sed 's/\.mjs/\.js/')
 4 | MJS := $(shell find . -name '*.mjs' | grep -v '^./node_modules/.*')
 5 | 
 6 | # It's faster to invoke Babel once and compile everything than to invoke it
 7 | # separately on even 2 individual files that changed.
 8 | %.js: %.mjs .npm_installed .babelrc; @node_modules/.bin/babel *.mjs **/*.mjs --out-dir . --relative
 9 | 
10 | js: $(JS)
11 | 
12 | lint: .npm_installed
13 | 	@node_modules/.bin/eslint --ext mjs .
14 | 	@node_modules/.bin/eslint test/browser
15 | 
16 | test: $(JS) .npm_installed
17 | 	@node_modules/.bin/nyc --reporter=text-summary node_modules/mocha/bin/_mocha --recursive
18 | 
19 | coverage: .npm_installed test
20 | 	@node_modules/.bin/nyc report --reporter=html
21 | 
22 | coveralls: .npm_installed
23 | 	node_modules/.bin/nyc report --reporter=text-lcov | coveralls
24 | 
25 | debugtest: $(JS) .npm_installed
26 | 	# This is known to work on node 7.6.0.
27 | 	@node_modules/.bin/mocha --inspect-brk
28 | 
29 | publish: $(JS)
30 | 	cp ../LICENSE ./
31 | 	cp ../README.md ./
32 | 	npm publish
33 | 
34 | bundle: dist/fathom.js
35 | 
36 | # .npm_installed is an empty file we touch whenever we run npm install. This
37 | # target redoes the install if package.json is newer than that file:
38 | .npm_installed: package.json
39 | 	npm install
40 | 	touch $@
41 | 
42 | clean:
43 | 	rm -rf $(JS) node_modules .npm_installed LICENSE README.md
44 | 
45 | 
46 | # Private targets:
47 | 
48 | dist/fathom.js: rollup.config.js .npm_installed $(MJS)
49 | 	@node_modules/.bin/rollup -c
50 | 
51 | 
52 | .PHONY: js lint test coveralls debugtest publish bundle clean
53 | 


--------------------------------------------------------------------------------
/fathom/exceptions.mjs:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * A :func:`rule` depends on another rule which itself depends on the first
 3 |  * rule again, either directly or indirectly.
 4 |  */
 5 | export class CycleError extends Error {
 6 | }
 7 | 
 8 | /**
 9 |   * An examined element was not contained in a browser ``window`` object, but
10 |   * something needed it to be.
11 |   */
12 | export class NoWindowError extends Error {
13 | }
14 | 


--------------------------------------------------------------------------------
/fathom/fnode.mjs:
--------------------------------------------------------------------------------
  1 | import {type} from './side';
  2 | import {getDefault, setDefault, sigmoid} from './utilsForFrontend';
  3 | 
  4 | 
/**
 * A wrapper around a DOM node, storing :term:`types<type>`,
 * :term:`scores<score>`, and :term:`notes<note>` that apply to it
 */
export class Fnode {
    /**
     * @arg element The DOM element described by the fnode.
     * @arg ruleset The ruleset which created the fnode.
     * @throws {Error} if ``element`` is ``undefined``
     */
    constructor(element, ruleset) {
        if (element === undefined) {
            throw new Error("Someone tried to make a fnode without specifying the element they're talking about.");
        }
        /**
         * The raw DOM element this fnode describes
         */
        this.element = element;
        this._ruleset = ruleset;

        // A map of type => {score: Map, note: anything}. `score` is always
        // present: a Map of rule name => the number that rule contributed
        // (see addScoreFor()). A note is set iff `note` is present and not
        // undefined.
        this._types = new Map();

        // Note: conserveScore() is temporarily absent in 3.0.
        //
        // By default, a fnode has an independent score for each of its types.
        // However, a RHS can opt to conserve the score of an upstream type,
        // carrying it forward into another type. To avoid runaway scores in
        // the case that multiple rules choose to do this, we limit the
        // contribution of an upstream type's score to being multiplied in a
        // single time. In this set, we keep track of which upstream types'
        // scores have already been multiplied into each type. LHS fnode => Set
        // of types whose score for that node have been multiplied into this
        // node's score.
        this._conservedScores = new Map();
    }

    /**
     * Return whether the given type is one of the ones attached to the wrapped
     * HTML node.
     */
    hasType(type) {
        // Run type(theType) against the ruleset to make sure this doesn't
        // return false just because we haven't lazily run certain rules yet.
        this._computeType(type);
        return this._types.has(type);
    }

    /**
     * Return the confidence, in the range (0, 1), that the fnode belongs to the
     * given type.
     *
     * The value is ``sigmoid`` applied to the ruleset's ``weightedScore`` of
     * the per-rule scores recorded for the type, plus the ruleset's bias for
     * that type (0 is supplied as the fallback when it has none).
     *
     * NOTE(review): this comment used to promise "0 by default". Whether that
     * holds depends on ``sigmoid`` and ``weightedScore``, neither of which is
     * defined in this file; confirm before relying on it.
     */
    scoreFor(type) {
        this._computeType(type);
        return sigmoid(this._ruleset.weightedScore(this.scoresSoFarFor(type)) +
                       getDefault(this._ruleset.biases, type, () => 0));
    }

    /**
     * Return the fnode's note for the given type, ``undefined`` if none.
     */
    noteFor(type) {
        this._computeType(type);
        return this._noteSoFarFor(type);
    }

    /**
     * Return whether this fnode has a note for the given type.
     *
     * ``undefined`` is not considered a note and may be overwritten with
     * impunity.
     */
    hasNoteFor(type) {
        this._computeType(type);
        return this._hasNoteSoFarFor(type);
    }

    // -------- Methods below this point are private to the framework. --------

    /**
     * Return an iterable of the types tagged onto me by rules that have
     * already executed.
     */
    typesSoFar() {
        return this._types.keys();
    }

    /**
     * Return the note recorded so far for a type, without running any rules.
     */
    _noteSoFarFor(type) {
        return this._typeRecordForGetting(type).note;
    }

    /**
     * Return whether a non-``undefined`` note has been recorded so far for a
     * type, without running any rules.
     */
    _hasNoteSoFarFor(type) {
        return this._noteSoFarFor(type) !== undefined;
    }

    /**
     * Return the scores thus far recorded on me for a certain type, as a Map
     * of rule name => score. Doesn't implicitly run any rules. If nothing has
     * been recorded for the type, the Map comes from a throwaway record and
     * is empty (this relies on getDefault() returning the fallback it is
     * given; that helper lives in utilsForFrontend).
     */
    scoresSoFarFor(type) {
        return this._typeRecordForGetting(type).score;
    }

    /**
     * Record the score a named rule contributed to one of our per-type
     * scores, replacing any value stored earlier under the same rule name.
     * Implicitly assign us the given type. Keep track of which rule it
     * resulted from so we can later mess with the coeffs.
     */
    addScoreFor(type, score, ruleName) {
        this._typeRecordForSetting(type).score.set(ruleName, score);
    }

    /**
     * Set the note attached to one of our types. Implicitly assign us that
     * type if we don't have it already.
     *
     * @throws {Error} if a note already exists for the type and the incoming
     *     note is not ``undefined``
     */
    setNoteFor(type, note) {
        if (this._hasNoteSoFarFor(type)) {
            if (note !== undefined) {
                throw new Error(`Someone (likely the right-hand side of a rule) tried to add a note of type ${type} to an element, but one of that type already exists. Overwriting notes is not allowed, since it would make the order of rules matter.`);
            }
            // else the incoming note is undefined and we already have the
            // type, so it's a no-op
        } else {
            // Apply either a type and note or just a type (which means a note
            // that is undefined):
            this._typeRecordForSetting(type).note = note;
        }
    }

    /**
     * Return a score/note record for a type, creating it if it doesn't exist.
     */
    _typeRecordForSetting(type) {
        return setDefault(this._types, type, () => ({score: new Map()}));
    }

    /**
     * Manifest a temporary type record for reading, working around the lack of
     * a ?. operator in JS. Unlike _typeRecordForSetting(), this goes through
     * getDefault() rather than setDefault(), so reading is not meant to tag
     * the fnode with the type.
     */
    _typeRecordForGetting(type) {
        return getDefault(this._types, type, () => ({score: new Map()}));
    }

    /**
     * Make sure any scores, notes, and type-tagging for the given type are
     * computed for my element.
     */
    _computeType(theType) {
        if (!this._types.has(theType)) {  // Prevent infinite recursion when an A->A rule looks at A's note in a callback.
            this._ruleset.get(type(theType));
        }
    }
}
162 | 


--------------------------------------------------------------------------------
/fathom/index.mjs:
--------------------------------------------------------------------------------
 1 | /* This Source Code Form is subject to the terms of the Mozilla Public
 2 |  * License, v. 2.0. If a copy of the MPL was not distributed with this
 3 |  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 4 | 
// The library version exposed to callers. package.json carries the same
// string in its "version" field; presumably the two must be bumped together.
const version = '3.7.3';
import {rule} from './rule';
import {ruleset} from './ruleset';
import {dom, element} from './lhs';
import {out} from './rhs';
import {and, atMost, nearest, note, props, score, type, typeIn} from './side';

// Whole modules re-exported as namespaces:
export * as clusters from './clusters';
export * as utils from './utilsForFrontend';
export * as exceptions from './exceptions';
// Individual names that make up the top-level API:
export {
    and,
    atMost,
    dom,
    element,
    nearest,
    note,
    out,
    props,
    rule,
    ruleset,
    score,
    type,
    typeIn,
    version
};
31 | 


--------------------------------------------------------------------------------
/fathom/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "fathom-web",
 3 |   "description": "Find meaning in the web.",
 4 |   "version": "3.7.3",
 5 |   "author": "Erik Rose <erik@mozilla.com> (https://www.grinchcentral.com/)",
 6 |   "bugs": {
 7 |     "url": "https://github.com/mozilla/fathom/issues"
 8 |   },
 9 |   "dependencies": {
10 |     "jsdom": "^11.12.0"
11 |   },
12 |   "devDependencies": {
13 |     "@babel/cli": "^7.7.4",
14 |     "@babel/core": "^7.7.4",
15 |     "@babel/plugin-proposal-export-namespace-from": "^7.7.4",
16 |     "@babel/plugin-transform-exponentiation-operator": "^7.7.4",
17 |     "acorn": "^7.1.0",
18 |     "babel-eslint": "^8.2.6",
19 |     "babel-plugin-dynamic-import-node": "^2.3.0",
20 |     "babel-plugin-transform-es2015-modules-commonjs": "^6.26.2",
21 |     "chai": "^4.2.0",
22 |     "coveralls": "^3.1.0",
23 |     "eslint": "^6.7.1",
24 |     "eslint-plugin-import": "^2.18.2",
25 |     "eslint-plugin-node": "^10.0.0",
26 |     "geckodriver": "^3.0.1",
27 |     "jsdoc": "^3.5.4",
28 |     "mocha": "^6.2.2",
29 |     "nyc": "^14.1.1",
30 |     "rollup": "^1.27.5",
31 |     "selenium-webdriver": "^4.1.1"
32 |   },
33 |   "engines": {
34 |     "node": ">= 7.6.0"
35 |   },
36 |   "homepage": "https://github.com/mozilla/fathom",
37 |   "keywords": [
38 |     "semantic extraction",
39 |     "scoring",
40 |     "ranking",
41 |     "clustering"
42 |   ],
43 |   "license": "MPL-2.0",
44 |   "repository": {
45 |     "type": "git",
46 |     "url": "https://github.com/mozilla/fathom.git"
47 |   },
48 |   "main": "index"
49 | }
50 | 


--------------------------------------------------------------------------------
/fathom/rollup.config.js:
--------------------------------------------------------------------------------
// Bundle all of Fathom into a single file for use inside web extensions or
// other applications. If possible, use ES6-style import statements in your
// code instead, and let rollup pull in just what Fathom code is necessary. See
// /fathom_fox/rollup.config.js for an example.
//
// The bundle starts from index.mjs and is written to dist/fathom.js in UMD
// format under the name `fathom`.
export default {
  input: 'index.mjs',
  output: {
    file: 'dist/fathom.js',
    format: 'umd',
    name: 'fathom',
  }
};
13 | 


--------------------------------------------------------------------------------
/fathom/side.mjs:
--------------------------------------------------------------------------------
  1 | import {euclidean} from './clusters';
  2 | import {Lhs} from './lhs';
  3 | import {InwardRhs} from './rhs';
  4 | 
  5 | 
  6 | export function props(callback) {
  7 |     return new Side({method: 'props', args: [callback]});
  8 | }
  9 | 
 10 | /** Constrain to an input type on the LHS, or apply a type on the RHS. */
 11 | export function type(theType) {
 12 |     return new Side({method: 'type', args: [theType]});
 13 | }
 14 | 
 15 | export function note(callback) {
 16 |     return new Side({method: 'note', args: [callback]});
 17 | }
 18 | 
 19 | export function score(scoreOrCallback) {
 20 |     return new Side({method: 'score', args: [scoreOrCallback]});
 21 | }
 22 | 
 23 | export function atMost(score) {
 24 |     return new Side({method: 'atMost', args: [score]});
 25 | }
 26 | 
 27 | export function typeIn(...types) {
 28 |     return new Side({method: 'typeIn', args: types});
 29 | }
 30 | 
 31 | /**
 32 |  * Pull nodes that conform to multiple conditions at once.
 33 |  *
 34 |  * For example: ``and(type('title'), type('english'))``
 35 |  *
 36 |  * Caveats: ``and`` supports only simple ``type`` calls as arguments for now,
 37 |  * and it may fire off more rules as prerequisites than strictly necessary.
 38 |  * ``not`` and ``or`` don't exist yet, but you can express ``or`` the long way
 39 |  * around by having 2 rules with identical RHSs.
 40 |  */
 41 | export function and(...lhss) {
 42 |     return new Side({method: 'and', args: lhss});
 43 | }
 44 | 
 45 | /**
 46 |  * Experimental. For each :term:`fnode` from ``typeCallA``, find the closest
 47 |  * node from ``typeCallB``, and attach it as a note. The note is attached to
 48 |  * the type specified by the RHS, defaulting to the type of ``typeCallA``. If
 49 |  * no nodes are emitted from ``typeCallB``, do nothing.
 50 |  *
 51 |  * For example... ::
 52 |  *
 53 |  *     nearest(type('image'), type('price'))
 54 |  *
 55 |  * The score of the ``typeCallA`` can be added to the new type's score by using
 56 |  * :func:`conserveScore` (though this routine has since been removed)::
 57 |  *
 58 |  *     rule(nearest(type('image'), type('price')),
 59 |  *          type('imageWithPrice').score(2).conserveScore())
 60 |  *
 61 |  * Caveats: ``nearest`` supports only simple ``type`` calls as arguments ``a``
 62 |  * and ``b`` for now.
 63 |  *
 64 |  * @arg distance {function} A function that takes 2 fnodes and returns a
 65 |  *     numerical distance between them. Included options are :func:`distance`,
 66 |  *     which is a weighted topological distance, and :func:`euclidean`, which
 67 |  *     is a spatial distance.
 68 |  */
 69 | export function nearest(typeCallA, typeCallB, distance = euclidean) {
 70 |     return new Side({method: 'nearest', args: [typeCallA, typeCallB, distance]});
 71 | }
 72 | 
 73 | /**
 74 |  * A chain of calls that can be compiled into a Rhs or Lhs, depending on its
 75 |  * position in a Rule. This lets us use type() as a leading call for both RHSs
 76 |  * and LHSs. I would prefer to do this dynamically, but that wouldn't compile
 77 |  * down to old versions of ES.
 78 |  */
 79 | class Side {
 80 |     constructor(...calls) {
 81 |         // A "call" is like {method: 'dom', args: ['p.smoo']}.
 82 |         this._calls = calls;
 83 |     }
 84 | 
 85 |     max() {
 86 |         return this._and('max');
 87 |     }
 88 | 
 89 |     bestCluster(options) {
 90 |         return this._and('bestCluster', options);
 91 |     }
 92 | 
 93 |     props(callback) {
 94 |         return this._and('props', callback);
 95 |     }
 96 | 
 97 |     type(...types) {
 98 |         return this._and('type', ...types);
 99 |     }
100 | 
101 |     note(callback) {
102 |         return this._and('note', callback);
103 |     }
104 | 
105 |     score(scoreOrCallback) {
106 |         return this._and('score', scoreOrCallback);
107 |     }
108 | 
109 |     atMost(score) {
110 |         return this._and('atMost', score);
111 |     }
112 | 
113 |     typeIn(...types) {
114 |         return this._and('typeIn', ...types);
115 |     }
116 | 
117 |     and(...lhss) {
118 |         return this._and('and', lhss);
119 |     }
120 | 
121 |     _and(method, ...args) {
122 |         return new this.constructor(...this._calls.concat({method, args}));
123 |     }
124 | 
125 |     asLhs() {
126 |         return this._asSide(Lhs.fromFirstCall(this._calls[0]), this._calls.slice(1));
127 |     }
128 | 
129 |     asRhs() {
130 |         return this._asSide(new InwardRhs(), this._calls);
131 |     }
132 | 
133 |     _asSide(side, calls) {
134 |         for (let call of calls) {
135 |             side = side[call.method](...call.args);
136 |         }
137 |         return side;
138 |     }
139 | 
140 |     when(pred) {
141 |         return this._and('when', pred);
142 |     }
143 | }
144 | 


--------------------------------------------------------------------------------
/fathom/test/browser/http_server.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Before any test in the entire project starts, spin up a server to serve test
 3 |  * pages to the Selenium-driven headless Firefox we use in some tests.
 4 |  */
 5 | const http = require('http');
 6 | const fs = require('fs');
 7 | const url = require('url');
 8 | 
 9 | 
// Port the test pages are served on; the browser tests hard-code the same
// number in their page URLs.
const PORT = 8000;

/**
 * Serve files from this script's directory: the request's path is appended to
 * __dirname and the file is returned as text/html. Unreadable paths are
 * answered with an error status so the browser never waits on a response that
 * will not come.
 */
const server = http.createServer((request, response) => {
    // TODO: Replace url.parse with url.URL.
    // eslint-disable-next-line node/no-deprecated-api
    const path = url.parse(request.url).pathname;
    fs.readFile(__dirname + path, 'utf8', (error, data) => {
        if (error) {
            console.error(`There was a ${error.code} error fetching the resource at ${path}.`);
            // Finish the exchange. Logging alone would leave the request
            // open until the client gave up.
            const status = error.code === 'ENOENT' ? 404 : 500;
            response.writeHead(status, {'Content-Type': 'text/plain'});
            response.end();
            return;
        }
        response.writeHead(200, {'Content-Type': 'text/html'});
        response.write(data);
        response.end();
    });
});

// Root-level mocha hooks: bring the server up before the first test anywhere
// in the run...
before(
    function start_server() {
        server.listen(PORT);
        console.log(`Serving from ${__dirname} at http://localhost:${PORT}...`);
    }
);

// ...and stop accepting connections once the last test has finished.
after(
    function stop_server() {
        server.close();
    }
);
38 | 


--------------------------------------------------------------------------------
/fathom/test/browser/isVisible.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en" dir="ltr">
 3 |   <head>
 4 |     <meta charset="utf-8">
 5 |     <!-- Empty favicon to avoid HTTP GET error when Firefox requests it -->
 6 |     <link rel="icon" href="data:,">
 7 |     <title>isVisible functional test</title>
 8 |     <style type="text/css">
 9 |       :root {
10 |         --size: 100px;
11 |       }
12 |       .off-screen {
13 |         width: var(--size);
14 |         height: var(--size);
15 |         border: 1px solid black;
16 |         position: absolute;
17 |       }
18 |     </style>
19 |   </head>
20 |   <body>
21 |     <h1>isVisible functional test</h1>
22 |     <div id="not-visible-1" style="width: 0px; overflow: hidden;"></div>
23 |     <div id="not-visible-2" style="height: 0px; overflow: hidden;"></div>
24 |     <div id="not-visible-3" style="display: none;"></div>
25 |     <!-- ``display: none`` ancestor -->
26 |     <div id="ancestor-not-visible-4" style="display: none;">
27 |       <div id="not-visible-4"></div>
28 |     </div>
29 |     <div id="not-visible-5" style="opacity: 0;"></div>
30 |     <!-- ``opacity: 0`` ancestor -->
31 |     <div id="ancestor-not-visible-6" style="opacity: 0;">
32 |       <div id="not-visible-6"></div>
33 |     </div>
34 |     <!-- ``overflow: hidden`` with zero width ancestor -->
35 |     <div id="ancestor-not-visible-7" style="width: 0;">
36 |       <div id="not-visible-7" style="overflow: hidden;"></div>
37 |     </div>
38 |     <!-- ``overflow: hidden`` with zero height ancestor -->
39 |     <div id="ancestor-not-visible-8" style="height: 0;">
40 |       <div id="not-visible-8" style="overflow: hidden;"></div>
41 |     </div>
42 |     <!-- off-screen to the left of viewport -->
43 |     <div id="not-visible-9" class="off-screen" style="left: calc(-2 * var(--size));"></div>
44 |     <!-- off-screen above viewport -->
45 |     <div id="not-visible-10" class="off-screen" style="top: calc(-2 * var(--size));"></div>
46 |     <!-- off-screen: a ``right`` offset wider than the viewport pushes the box past the left edge -->
47 |     <div id="not-visible-11" class="off-screen" style="right: calc(100vw + 2 * var(--size));"></div>
48 |     <!-- off-screen: a ``bottom`` offset taller than the viewport pushes the box past the top edge -->
49 |     <div id="not-visible-12" class="off-screen" style="bottom: calc(100vh + 2 * var(--size));"></div>
50 |     <div id="not-visible-13" style="display: contents"></div>
51 |     <!-- ``display: contents`` ancestor -->
52 |     <div id="ancestor-visible-1" style="display: contents">
53 |       <div id="visible-1"></div>
54 |     </div>
55 |   </body>
56 | </html>
57 | 


--------------------------------------------------------------------------------
/fathom/test/browser/isVisible.js:
--------------------------------------------------------------------------------
 1 | const {assert} = require('chai');
 2 | const firefox = require('selenium-webdriver/firefox');
 3 | const {Builder, until, By} = require('selenium-webdriver');
 4 | const {ancestors, isDomElement, isVisible, toDomElement, windowForElement} = require('../../utilsForFrontend'); // eslint-disable-line node/no-missing-require
 5 | 
 6 | const WAIT_MS = 10000;
 7 | const TEST_PAGE_URL = 'http://localhost:8000/isVisible.html';
 8 | 
 9 | describe('isVisible', () => {
10 |     const options = new firefox.Options();
11 |     options.headless();
12 | 
13 |     const driver = new Builder()
14 |         .forBrowser('firefox')
15 |         .setFirefoxOptions(options)
16 |         .build();
17 | 
18 |     async function checkElementVisibility(id, expected) {
19 |         await driver.wait(until.elementLocated(By.id(id)), WAIT_MS);
20 |         const isElementVisible = await driver.executeScript(`
21 |             ${ancestors}
22 |             ${isDomElement}
23 |             ${toDomElement}
24 |             ${windowForElement}
25 |             return ${isVisible}(document.getElementById('${id}'));
26 |         `);
27 |         assert.equal(
28 |             isElementVisible,
29 |             expected,
30 |             `isVisible should return ${expected} for element with id '${id}'.`
31 |         );
32 |     }
33 | 
34 |     async function checkElementsVisibility(idStub, isVisible) {
35 |         const elementIds = await driver.executeScript(`
36 |             return Array.prototype.map.call(document.querySelectorAll('[id^="${idStub}"]'), (element) => element.id);
37 |         `);
38 | 
39 |         await driver.get(TEST_PAGE_URL);
40 | 
41 |         for (const id of elementIds) {
42 |             await checkElementVisibility(id, isVisible);
43 |         }
44 |     }
45 | 
46 |     it('should return false when an element is hidden', async function () {
47 |         this.timeout(WAIT_MS);
48 |         await checkElementsVisibility('not-visible-', false);
49 |     });
50 | 
51 |     it('should return true when an element is visible', async function () {
52 |         this.timeout(WAIT_MS);
53 |         await checkElementsVisibility('visible-', true);
54 |     });
55 | 
56 |     after(async function () {
57 |         this.timeout(WAIT_MS);
58 |         return driver.quit();
59 |     });
60 | });
61 | 


--------------------------------------------------------------------------------
/fathom/test/demos.mjs:
--------------------------------------------------------------------------------
 1 | import {assert} from 'chai';
 2 | 
 3 | import {dom, rule, ruleset, type} from '../index';
 4 | import {sigmoid, staticDom} from '../utils';
 5 | 
 6 | 
 7 | describe('Design-driving demos', function () {
 8 |     it('handles a simple series of short-circuiting rules', function () {
 9 |         // TODO: Short-circuiting isn't implemented yet. This test exists to
10 |         // motivate an engine smart enough to run the highest-possible-scoring
11 |         // type-chain of rules first and, if it succeeds, omit the others. For
12 |         // now, it checks only that the 40-point og:title rule wins max().
13 |         const doc = staticDom(`
14 |             <meta name="hdl" content="HDL">
15 |             <meta property="og:title" content="OpenGraph">
16 |             <meta property="twitter:title" content="Twitter">
17 |             <title>Title</title>
18 |         `);
19 |         const typeAndNote = type('titley').note(fnode => fnode.element.getAttribute('content'));
20 |         const rules = ruleset([
21 |             rule(dom('meta[property="og:title"]'),
22 |                  typeAndNote.score(40)),
23 |             rule(dom('meta[property="twitter:title"]'),
24 |                  typeAndNote.score(30)),
25 |             rule(dom('meta[name="hdl"]'),
26 |                  typeAndNote.score(20)),
27 |             rule(dom('title'),
28 |                  typeAndNote.score(10).note(fnode => fnode.element.text)),
29 |             rule(type('titley').max(), 'bestTitle')
30 |         ]);
31 |         const facts = rules.against(doc);
32 |         const node = facts.get('bestTitle')[0];
33 |         assert.equal(node.scoreFor('titley'), sigmoid(40));
34 |         assert.equal(node.noteFor('titley'), 'OpenGraph');
35 |     });
36 | });
37 | 
38 | // Right now, I'm writing features and using optimization algos to find their coefficients. Someday, we can stop writing features and have deep learning come up with them. TODO: Grok unsupervised learning, and apply it to OpenCrawl.
39 | 


--------------------------------------------------------------------------------
/fathom/test/lhs_tests.mjs:
--------------------------------------------------------------------------------
 1 | import {assert} from 'chai';
 2 | 
 3 | import {dom, rule, ruleset, type} from '../index';
 4 | import {staticDom} from '../utils';
 5 | 
 6 | 
 7 | describe('LHS', function () {
 8 |     it('makes a dom() LHS that rule() tolerates', function () {
 9 |         const lhs = dom('smoo');
10 |         const rhs = type('bar');
11 |         rule(lhs, rhs);
12 |     });
13 | 
14 |     it('finds max-scoring nodes of a type', function () {
15 |         const doc = staticDom(`
16 |             <p></p>
17 |             <div></div>
18 |             <div></div>
19 |         `);
20 |         const rules = ruleset([
21 |             rule(dom('p'), type('smoo').score(2)),
22 |             rule(dom('div'), type('smoo').score(5)),
23 |             rule(type('smoo').max(), 'best')
24 |         ]);
25 |         const facts = rules.against(doc);
26 |         const best = facts.get('best');
27 |         assert.equal(best.length, 2);
28 |         assert.equal(best[0].element.nodeName, 'DIV');
29 |         assert.equal(best[1].element.nodeName, 'DIV');
30 |     });
31 | 
32 |     it('returns [] for a top-totaling cluster of 0 nodes', function () {
33 |         const doc = staticDom(`
34 |             <p></p>
35 |         `);
36 |         const rules = ruleset([
37 |             rule(dom('div'), type('smoo')),
38 |             rule(type('smoo').bestCluster(), 'cluster')
39 |         ]);
40 |         const facts = rules.against(doc);
41 |         assert.deepEqual(facts.get('cluster'), []);
42 |     });
43 | 
44 |     it('can have its type overridden', function () {
45 |         const doc = staticDom('<p></p>');
46 |         const rules = ruleset([
47 |             rule(dom('p'), type('bar')),
48 |             rule(type('foo').type('bar'), 'best')
49 |         ]);
50 |         const facts = rules.against(doc);
51 |         const best = facts.get('best');
52 |         assert.equal(best.length, 1);
53 |     });
54 | 
55 |     it('filters using when() on type()', function () {
56 |         const doc = staticDom('<p id="fat"></p><p id="bat"></p>');
57 |         const rules = ruleset([
58 |             rule(dom('p'), type('bar')),
59 |             rule(type('bar').when(fnode => fnode.element.id === 'fat'), type('when')),
60 |             rule(type('when'), 'best')
61 |         ]);
62 |         const facts = rules.against(doc);
63 |         const best = facts.get('best');
64 |         assert.equal(best.length, 1);
65 |         assert.equal(best[0].element.id, 'fat');
66 |     });
67 | 
68 |     it('filters using when() on dom()', function () {
69 |         const doc = staticDom('<p id="fat"></p><p id="bat"></p>');
70 |         const rules = ruleset([
71 |             rule(dom('p').when(fnode => fnode.element.id === 'bat'), type('when')),
72 |             rule(type('when'), 'best')
73 |         ]);
74 |         const facts = rules.against(doc);
75 |         const best = facts.get('best');
76 |         assert.equal(best.length, 1);
77 |         assert.equal(best[0].element.id, 'bat');
78 |     });
79 | });
80 | 


--------------------------------------------------------------------------------
/fathom/test/rhs_tests.mjs:
--------------------------------------------------------------------------------
  1 | import {assert} from 'chai';
  2 | 
  3 | import {atMost, dom, note, out, props, rule, ruleset, score, type, typeIn} from '../index';
  4 | import {sigmoid, staticDom} from '../utils';
  5 | 
  6 | 
  7 | describe('RHS', function () {
  8 |     it('combines different calls piecewise, with rightmost repeated subfacts shadowing', function () {
  9 |         const rhs = type('foo').score(5).props(node => ({score: 6})).asRhs();
 10 |         assert.deepEqual(rhs.fact('dummy'), {type: 'foo', score: 6});
 11 |     });
 12 | 
 13 |     it('has same-named calls shadow, with rightmost winning', function () {
 14 |         const rhs = props(node => ({score: 1})).props(node => ({note: 'foo'})).asRhs();
 15 |         assert.deepEqual(rhs.fact('dummy'), {note: 'foo'});
 16 |     });
 17 | 
 18 |     it('runs callbacks only once', function () {
 19 |         let count = 0;
 20 |         function addOne() {
 21 |             count++;
 22 |             return {};
 23 |         }
 24 |         const rhs = props(addOne).asRhs();
 25 |         assert.deepEqual(rhs.fact('dummy'), {});
 26 |         assert.equal(count, 1);
 27 |     });
 28 | 
 29 |     it('ignores unexpected subfacts returned from props() callbacks', function () {
 30 |         const rhs = props(node => ({booga: true, score: 3})).asRhs();
 31 |         assert.deepEqual(rhs.fact('dummy'), {score: 3});
 32 |     });
 33 | 
 34 |     it('enforces atMost()', function () {
 35 |         const doc = staticDom('<p></p>');
 36 |         const rules = ruleset([
 37 |             rule(dom('p'), score(8).type('para').atMost(3))
 38 |         ]);
 39 |         const facts = rules.against(doc);
 40 |         assert.throws(() => facts.get(type('para')),
 41 |                       'Score of 8 exceeds the declared atMost(3).');
 42 |     });
 43 | 
 44 |     it('works fine when atMost() is satisfied', function () {
 45 |         const doc = staticDom('<p></p>');
 46 |         const rules = ruleset([
 47 |             rule(dom('p'), atMost(3).score(2).type('para'))
 48 |         ]);
 49 |         const facts = rules.against(doc);
 50 |         assert.equal(facts.get(type('para'))[0].scoreFor('para'), sigmoid(2));
 51 |     });
 52 | 
 53 |     it('enforces typeIn() for explicit types', function () {
 54 |         const doc = staticDom('<p></p>');
 55 |         const rules = ruleset([
 56 |             rule(dom('p'), typeIn('nope').type('para'))
 57 |         ]);
 58 |         const facts = rules.against(doc);
 59 |         assert.throws(() => facts.get(type('para')),
 60 |                       'A right-hand side claimed, via typeIn(...) to emit one of the types {nope} but actually emitted para.');
 61 |     });
 62 | 
 63 |     it('enforces typeIn() for inherited types', function () {
 64 |         const doc = staticDom('<p></p>');
 65 |         const rules = ruleset([
 66 |             rule(dom('p'), type('para')),
 67 |             rule(type('para'), props(n => ({})).typeIn('nope'))
 68 |         ]);
 69 |         const facts = rules.against(doc);
 70 |         assert.throws(() => facts.get(type('nope')),
 71 |                       'A right-hand side claimed, via typeIn(...) to emit one of the types {nope} but actually inherited para from the left-hand side.');
 72 |     });
 73 | 
 74 |     it('works fine when typeIn() is satisfied', function () {
 75 |         const doc = staticDom('<p></p>');
 76 |         const rules = ruleset([
 77 |             rule(dom('p'), typeIn('para').type('para'))
 78 |         ]);
 79 |         const facts = rules.against(doc);
 80 |         assert.equal(facts.get(type('para')).length, 1);
 81 |     });
 82 | 
 83 |     it('runs out().through() callbacks', function () {
 84 |         const doc = staticDom('<p></p>');
 85 |         const rules = ruleset([
 86 |             rule(dom('p'), out('para').through(fnode => fnode.element.tagName))
 87 |         ]);
 88 |         const facts = rules.against(doc);
 89 |         assert.equal(facts.get('para')[0], 'P');
 90 |     });
 91 | 
 92 |     it('paves over undefined notes', function () {
 93 |         // We shouldn't re-run any rules. Run order shouldn't matter, because
 94 |         // we forbid notes from overwriting, score contribution is
 95 |         // commutative, and type assignment is idempotent and immutable.
 96 |         const doc = staticDom('<p></p>');
 97 |         const rules = ruleset([
 98 |             rule(dom('p'), type('para')),
 99 |             rule(type('para'), note(fnode => undefined)),
100 |             rule(type('para'), note(fnode => 'foo'))
101 |         ]);
102 |         const facts = rules.against(doc);
103 |         assert.equal(facts.get(type('para'))[0].noteFor('para'), 'foo');
104 |     });
105 | 
106 |     it('runs scoring callbacks', function () {
107 |         const doc = staticDom('<p></p>');
108 |         const rules = ruleset([
109 |             rule(dom('p'), type('p').score(fnode => 5))
110 |         ]);
111 |         const facts = rules.against(doc);
112 |         assert.equal(facts.get(type('p'))[0].scoreFor('p'), sigmoid(5));
113 |     });
114 | });
115 | 


--------------------------------------------------------------------------------
/fathom/test/rule_tests.mjs:
--------------------------------------------------------------------------------
 1 | import {assert} from 'chai';
 2 | 
 3 | import {dom, rule, ruleset, score, type, typeIn} from '../index';
 4 | import {staticDom} from '../utils';
 5 | 
 6 | 
 7 | describe('Rule', function () {
 8 |     it('knows what it can add and emit', function () {
 9 |         const a = rule(dom('p'), type('para'));
10 |         assert.sameMembers(Array.from(a.typesItCouldEmit()), ['para']);
11 |         assert.sameMembers(Array.from(a.typesItCouldAdd()), ['para']);
12 | 
13 |         const b = rule(type('r'), typeIn('q').props('dummy').typeIn('r', 's'));
14 |         assert.sameMembers(Array.from(b.typesItCouldEmit()), ['r', 's']);
15 |         assert.sameMembers(Array.from(b.typesItCouldAdd()), ['s']);
16 | 
17 |         const c = rule(type('a'), score(2));
18 |         assert.sameMembers(Array.from(c.typesItCouldEmit()), ['a']);
19 |     });
20 | 
21 |     it('identifies prerequisite rules', function () {
22 |         const domRule = rule(dom('p'), type('a'));
23 |         const maxRule = rule(type('a').max(), type('b'));
24 |         const maintainRule = rule(type('b'), score(2));
25 |         const addRule = rule(type('b'), type('c'));
26 |         const rules = ruleset([domRule, maxRule, maintainRule, addRule]);
27 |         const facts = rules.against(staticDom(''));
28 |         assert.sameMembers(Array.from(domRule.prerequisites(facts)), []);
29 |         assert.sameMembers(Array.from(maxRule.prerequisites(facts)), [domRule]);
30 |         assert.sameMembers(Array.from(maintainRule.prerequisites(facts)), [maxRule]);
31 |         assert.sameMembers(Array.from(addRule.prerequisites(facts)), [maxRule, maintainRule]);
32 | 
33 |         const prereqs = facts._prerequisitesTo(addRule);
34 |         // TODO: Replace with deepEqual when chai >= 4.0 supports Maps and Sets.
35 |         assert.equal(prereqs.size, 3);
36 |         assert.deepEqual(prereqs.get(maintainRule), [addRule]);
37 |         assert.deepEqual(prereqs.get(domRule), [maxRule]);
38 |         assert.deepEqual(prereqs.get(maxRule), [addRule, maintainRule]);
39 |     });
40 | });
41 | 


--------------------------------------------------------------------------------
/fathom/test/side_tests.mjs:
--------------------------------------------------------------------------------
 1 | // Tests for fathom/side.js
 2 | 
 3 | import {assert} from 'chai';
 4 | 
 5 | import {type} from '../index';
 6 | 
 7 | 
 8 | describe('Side', function () {
 9 |     it('makes a LHS out of a type()', function () {
10 |         const side = type('smoo');
11 |         assert(side.asLhs);  // It appears to be a Side.
12 |         const lhs = side.asLhs();
13 |         assert.notStrictEqual(lhs.max);  // It appears to be a TypeLhs.
14 |     });
15 | 
16 |     it('is immutable and so can be factored up', function () {
17 |         const defaults = type('smoo');
18 |         const another = defaults.atMost(1);
19 |         assert.equal(defaults._calls.length, 1);
20 |         assert.equal(another._calls.length, 2);
21 |     });
22 | });
23 | 


--------------------------------------------------------------------------------
/fathom/test/utils_tests.mjs:
--------------------------------------------------------------------------------
  1 | import {assert} from 'chai';
  2 | import {NoWindowError} from '../exceptions';
  3 | import {dom, rule, ruleset, score, type} from '../index';
  4 | import {NiceSet, toposort, staticDom, attributesMatch, windowForElement} from '../utils';
  5 | 
  6 | 
  7 | describe('Utils', function () {
  8 |     describe('NiceSet', function () {
  9 |         it('pops', function () {
 10 |             const s = new NiceSet([1, 2]);
 11 |             assert.equal(s.pop(), 1);
 12 |             assert.equal(s.pop(), 2);
 13 |             assert.throws(() => s.pop(),
 14 |                           'Tried to pop from an empty NiceSet.');
 15 |         });
 16 |     });
 17 | 
 18 |     describe('toposort', function () {
 19 |         it('sorts', function () {
 20 |             // Return answers that express the graph...
 21 |             // 4 <- 5 <- 6   <-  7
 22 |             //           |       |
 23 |             //           v       v
 24 |             //          5.1  <- 6.1
 25 |             // ...where -> means "needs".
 26 |             function nodesThatNeed(node) {
 27 |                 return node === 5.1 ? [6, 6.1] : (node === 7 ? [] : [Math.floor(node) + 1]);
 28 |             }
 29 |             assert.deepEqual(toposort([4, 5, 5.1, 6, 6.1, 7], nodesThatNeed),
 30 |                              [7, 6, 5, 4, 6.1, 5.1]);
 31 |         });
 32 |         it('detects cycles', function () {
 33 |             // Express a graph of 3 nodes pointing in a circle.
 34 |             function nodesThatNeed(node) {
 35 |                 return [(node + 1) % 3];
 36 |             }
 37 |             assert.throws(() => toposort([0, 1, 2], nodesThatNeed),
 38 |                           'The graph has a cycle.');
 39 |         });
 40 |     });
 41 | 
 42 |     describe('attributesMatch', function () {
 43 |         it('searches all attributes', function () {
 44 |             const doc = staticDom(`
 45 |                 <img id="foo" alt="boo"></img><img id="fat" src= "bat"></img>
 46 |             `);
 47 |             const rules = ruleset([
 48 |                 rule(dom('img'), type('attr')),
 49 |                 rule(type('attr'), score(scoreFunc)),
 50 |                 rule(type('attr').max(), 'best')
 51 |             ]);
 52 | 
 53 |             function scoreFunc(fnode) {
 54 |                 return attributesMatch(fnode.element, attr => attr.includes('oo')) ? 5 : 1;
 55 |             }
 56 | 
 57 |             const facts = rules.against(doc);
 58 |             const best = facts.get('best');
 59 |             assert.equal(best.length, 1);
 60 |             assert.equal(best[0].element.id, 'foo');
 61 |         });
 62 | 
 63 |         it('searches specified attributes', function () {
 64 |             const doc = staticDom(`
 65 |                 <img id="foo" alt="bat"></img><img id="sat" src="bat"></img>
 66 |             `);
 67 |             const rules = ruleset([
 68 |                 rule(dom('img'), type('attr')),
 69 |                 rule(type('attr'), score(scoreFunc)),
 70 |                 rule(type('attr').max(), 'best')
 71 |             ]);
 72 | 
 73 |             function scoreFunc(fnode) {
 74 |                 return attributesMatch(fnode.element, attr => attr.includes('at'), ['id']) ? 5 : 1;
 75 |             }
 76 | 
 77 |             const facts = rules.against(doc);
 78 |             const best = facts.get('best');
 79 |             assert.equal(best.length, 1);
 80 |             assert.equal(best[0].element.id, 'sat');
 81 |         });
 82 | 
 83 |         it('searches attributes which are arrays', function () {
 84 |             const doc = staticDom(`
 85 |                 <img id="fat" class="fat bat sat" ></img><img id="foo" class="foo bar boo"></img>
 86 |             `);
 87 |             const rules = ruleset([
 88 |                 rule(dom('img'), type('attr')),
 89 |                 rule(type('attr'), score(scoreFunc)),
 90 |                 rule(type('attr').max(), 'best')
 91 |             ]);
 92 | 
 93 |             function scoreFunc(fnode) {
 94 |                 return attributesMatch(fnode.element, attr => attr.includes('at')) ? 5 : 1;
 95 |             }
 96 | 
 97 |             const facts = rules.against(doc);
 98 |             const best = facts.get('best');
 99 |             assert.equal(best.length, 1);
100 |             assert.equal(best[0].element.id, 'fat');
101 |         });
102 | 
103 |         it('returns false for elements that lack the requested attributes', function () {
104 |             // The first element has the alt attribute, and the second one doesn't, so it shouldn't get included in the results
105 |             const doc = staticDom(`
106 |                 <img id="foo" alt="bat"></img><img id="bar"></img>
107 |             `);
108 |             const rules = ruleset([
109 |                 rule(dom('img'), type('attr')),
110 |                 rule(type('attr'), score(scoreFunc)),
111 |                 rule(type('attr').max(), 'best')
112 |             ]);
113 | 
114 |             function scoreFunc(fnode) {
115 |                 return attributesMatch(fnode.element, attr => attr.includes('at'), ['alt']) ? 5 : 1;
116 |             }
117 | 
118 |             const facts = rules.against(doc);
119 |             const best = facts.get('best');
120 |             assert.equal(best.length, 1);
121 |             assert.equal(best[0].element.id, 'foo');
122 |         });
123 | 
124 |         it("doesn't touch nodes that don't match", function () {
125 |             const doc = staticDom(`
126 |                 <img id="foo"></img><img id="bar"></img>
127 |             `);
128 |             const rules = ruleset([
129 |                 rule(dom('img'), type('attr')),
130 |                 rule(type('attr'), score(scoreFunc)),
131 |                 rule(type('attr').max(), 'best')
132 |             ]);
133 | 
134 |             function scoreFunc(fnode) {
135 |                 return attributesMatch(fnode.element, attr => attr.includes('z')) ? 5 : 1;
136 |             }
137 | 
138 |             const facts = rules.against(doc);
139 |             const best = facts.get('best');
140 |             assert.equal(best.length, 2);
141 |         });
142 | 
143 |         it('searches multiple explicitly specified attributes', function () {
144 |             const doc = staticDom(`
145 |                 <img id="foo" alt="bat"></img><img id="cat"></img><img ignored="fat"></img>
146 |             `);
147 |             const rules = ruleset([
148 |                 rule(dom('img'), type('attr')),
149 |                 rule(type('attr'), score(scoreFunc)),
150 |                 rule(type('attr').max(), 'best')
151 |             ]);
152 | 
153 |             function scoreFunc(fnode) {
154 |                 return attributesMatch(fnode.element, attr => attr.includes('at'), ['alt', 'id']) ? 5 : 1;
155 |             }
156 | 
157 |             const facts = rules.against(doc);
158 |             const best = facts.get('best');
159 |             assert.equal(best.length, 2);
160 |             assert.equal(best[0].element.id, 'foo');
161 |             assert.equal(best[1].element.id, 'cat');
162 |         });
163 |     });
164 | 
165 |     describe('windowForElement', function () {
166 |         it('raises NoWindowError when run outside a window', function () {
167 |             // We mock out the element because jsdom actually provides a window
168 |             // object:
169 |             const element = {ownerDocument: {defaultView: null}};
170 |             assert.throws(() => windowForElement(element),
171 |                           NoWindowError);
172 |         });
173 |     });
174 | });
175 | 


--------------------------------------------------------------------------------
/fathom/utils.mjs:
--------------------------------------------------------------------------------
1 | export * from './utilsForBackend';
2 | export * from './utilsForFrontend';
3 | 


--------------------------------------------------------------------------------
/fathom/utilsForBackend.mjs:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Things that work only on a command-line node.js environment
 3 |  */
 4 | 
 5 | import {jsdom} from 'jsdom/lib/old-api';
 6 | 
 7 | 
 8 | /**
 9 |  * Parse an HTML doc, and return a DOM-compliant interface to it. Do not
10 |  * execute any of its inline scripts.
11 |  */
12 | export function staticDom(html) {
13 |     return jsdom(html, {features: {ProcessExternalResources: false,
14 |                                    FetchExternalResources: false}});
15 | }
16 | 


--------------------------------------------------------------------------------
/fathom_fox/.eslintignore:
--------------------------------------------------------------------------------
1 | addon/contentScript.js
2 | addon/evaluate.js
3 | addon/simmer.js
4 | 


--------------------------------------------------------------------------------
/fathom_fox/.eslintrc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "env": {
 3 |     "browser": true,
 4 |     "es6": true,
 5 |     "amd": true,
 6 |     "webextensions": true
 7 |   },
 8 |   "parserOptions": {
 9 |     "ecmaVersion": 9
10 |   },
11 |   "rules": {
12 |     "object-curly-spacing": "error",
13 |     "keyword-spacing": "error"
14 |   }
15 | }
16 | 


--------------------------------------------------------------------------------
/fathom_fox/README.md:
--------------------------------------------------------------------------------
 1 | # FathomFox
 2 | 
 3 | A suite of tools for developing [Fathom](https://mozilla.github.io/fathom/) rulesets within Firefox:
 4 | 
 5 | * [Corpus collection and labeling tools](https://mozilla.github.io/fathom/samples.html) (which are likely all you will need)
 6 | * An Evaluator which can help you [drop into the JS debugger](https://mozilla.github.io/fathom/training.html#setting-breakpoints) inside your ruleset
 7 | * A Vectorizer, which you can ignore. (It persists, for now, as an optional manual alternative to simply letting `fathom train` and other tools take care of vectorization automatically.)
 8 | 
 9 | For most use cases, it's better to run FathomFox from the command line rather than installing it through the web. See [Fathom's installation page](https://mozilla.github.io/fathom/installing.html) for instructions.
10 | 
11 | ## Full Documentation
12 | 
13 | See [the Fathom docs](https://mozilla.github.io/fathom/versions.html).
14 | 
15 | ## Running FathomFox from a Source Checkout
16 | 
17 | This is necessary only if you are developing FathomFox itself.
18 | 
19 | 1. Clone the [Fathom repository](https://github.com/mozilla/fathom/).
20 | 2. From within the checkout, inside the `fathom_fox` folder, install dependencies: `yarn run build`.
21 | 3. Run a clean copy of Firefox with FathomFox installed: `yarn run browser`.
22 | 4. Run `yarn run watch` in a separate terminal. This will keep your running copy of FathomFox up to date as you edit your ruleset.
23 | 
24 | ## Thanks
25 | 
26 | Thanks to Treora for his excellent freeze-dry library!
27 | 


--------------------------------------------------------------------------------
/fathom_fox/Tagged Head.afdesign:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/fathom_fox/Tagged Head.afdesign


--------------------------------------------------------------------------------
/fathom_fox/addon/actionMenu.js:
--------------------------------------------------------------------------------
1 | function openTab(url) {
2 |     browser.tabs.create({url, active: true});
3 |     window.close();
4 | }
5 | 
6 | document.getElementById('collectCorpus').addEventListener('click', () => openTab('/pages/corpus.html'));
7 | document.getElementById('evaluate').addEventListener('click', () => openTab('/pages/evaluate.html'));
8 | document.getElementById('vectorize').addEventListener('click', () => openTab('/pages/vector.html'));
9 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/background.js:
--------------------------------------------------------------------------------
 1 | /** Dispatch messages sent to the background script. */
 2 | function handleBackgroundScriptMessage(request, sender, sendResponse) {
 3 |     if (request.type === 'rulesetSucceededOnTabs') {
 4 |         // Run a given ruleset on a given set of tabs, and return an array of
 5 |         // responses saying whether they got the right answer on each. It's
 6 |         // necessary to do this in the background script so we have permission
 7 |         // to call the APIs we need.
 8 |         Promise.all(request.tabIds.map(
 9 |                         tabId => browser.tabs.sendMessage(
10 |                             tabId,
11 |                             {type: 'rulesetSucceeded',
12 |                              traineeId: request.traineeId,
13 |                              coeffs: request.coeffs})))
14 |                .then(sendResponse);
15 |         return true;  // so sendResponse hangs around after we return
16 |     } else if (request.type === 'refresh') {
17 |         // Bridge between content and the devtools panel.
18 |         browser.runtime.sendMessage({type: 'refresh'}).catch(() => {});
19 |     }
20 | }
21 | browser.runtime.onMessage.addListener(handleBackgroundScriptMessage);
22 | 
23 | /**
24 |  * Connect a dev panel, at its request, to the content script in its inspected
25 |  * tab.
26 |  */
27 | function connectADevPanel(port) {
28 |     // Open a port to our content script on the tab that's being inspected.
29 |     port.onMessage.addListener(handleMessage);
30 | 
31 |     /**
32 |      * Handle any of the various messages that can come flying at the
33 |      * background script from various sources.
34 |      */
35 |     async function handleMessage(request) {
36 |         if (request.type === 'freeze') {
37 |             // Send 'freeze' request to content-script to fetch frozen html
38 |             browser.tabs.sendMessage(request.tabId, request)
39 |                 .then((html) => {
40 |                     // Show save file dialog.  When the dialog is closed send a 'refresh'
41 |                     // message to the devpanel so it can hide the spinner.
42 |                     download(html, {saveAs: true})
43 |                         .then(() => {
44 |                             browser.runtime.sendMessage({type: 'refresh'});
45 |                         })
46 |                         .catch(() => {
47 |                             browser.runtime.sendMessage({type: 'refresh'});
48 |                         });
49 |                 });
50 |         } else {
51 |             // Most requests are passed unmodified to the content script.
52 |             await browser.tabs.sendMessage(request.tabId, request);
53 |         }
54 |     }
55 | }
56 | browser.runtime.onConnect.addListener(connectADevPanel);
57 | 
58 | // Update devtools panel when tab navigates to new page.
59 | browser.tabs.onUpdated.addListener((tabId, changeInfo, tabInfo) => {
60 |     if (changeInfo.status === 'complete') {
61 |         browser.runtime.sendMessage({type: 'init'})
62 |             .catch((error) => {
63 |                 console.error(error)
64 |             });
65 |     }
66 | });
67 | 
68 | async function freeze_tab(tab) {
69 |     const html = await browser.tabs.sendMessage(
70 |         tab.id,
71 |         {
72 |             type: 'freeze',
73 |             options: {
74 |                 wait: 0,
75 |                 shouldScroll: false
76 |             }
77 |         }
78 |     );
79 |     await download(html, {saveAs: true});
80 | }
81 | 
82 | browser.commands.onCommand.addListener((command) => {
83 |     if (command === 'freeze-page') {
84 |         browser.tabs.query({currentWindow: true, active: true})
85 |             .then((tabs) => {
86 |                 return tabs[0];
87 |             })
88 |             .then((tab) => {
89 |                 return freeze_tab(tab);
90 |             })
91 |             .catch((error) => {
92 |                 console.log(error);
93 |             });
94 |     }
95 | });
96 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/corpus.js:
--------------------------------------------------------------------------------
  1 | class CorpusCollector extends PageVisitor {
  2 |     formOptions() {
  3 |         if (!(this.doc.getElementById('wait').validity.valid &&
  4 |               this.doc.getElementById('timeout').validity.valid)) {
  5 |             return undefined;
  6 |         }
  7 | 
  8 |         const options = {};
  9 | 
 10 |         // Initialize options from the form.
 11 |         options.otherOptions = {
 12 |             wait: parseFloat(this.doc.getElementById('wait').value),
 13 |             shouldScroll: this.doc.getElementById('shouldScroll').checked,
 14 |         };
 15 | 
 16 |         // Note we extend the timeout by the freeze delay.
 17 |         options.timeout = parseFloat(this.doc.getElementById('timeout').value) + options.otherOptions.wait;
 18 | 
 19 |         // Load each url line-by-line from the textarea.
 20 |         // If a line contains a space, the first word will be used as the filename.
 21 |         options.urls = this.doc
 22 |             .getElementById('pages')
 23 |             .value
 24 |             .split('\n')
 25 |             .map(line => line.trim())
 26 |             .filter(line => line.length > 0)
 27 |             .map(line => {
 28 |                 // Split into filename and url.
 29 |                 const parts = line.split(/\s+/, 2);
 30 |                 let obj;
 31 |                 if (parts.length === 1) {
 32 |                     obj = {filename: undefined, url: parts[0]};
 33 |                 } else {
 34 |                     obj = {filename: parts[0] + '.html', url: parts[1]};
 35 |                 }
 36 |                 // Prepend protocol if missing.
 37 |                 if (!obj.url.match(/^https?:\/\//)) {
 38 |                     obj.url = 'http://' + obj.url;
 39 |                 }
 40 |                 // Name the file from the host if not specified.
 41 |                 if (!obj.filename) {
 42 |                     obj.filename = obj.url
 43 |                         .replace(/^https?:\/\//, '')  // Remove protocol.
 44 |                         .replace(/^([^\/]+)\/.*$/, '$1')  // Delete everything after first /
 45 |                         + '.html';
 46 |                 }
 47 |                 return obj;
 48 |             });
 49 |         // We need at least one url.
 50 |         if (options.urls.length === 0) {
 51 |             return undefined;
 52 |         }
 53 | 
 54 |         // Not customizeable just because nobody asked for it yet:
 55 |         options.maxTabs = 16;
 56 | 
 57 |         return options;
 58 |     }
 59 | 
 60 |     getViewportHeightAndWidth() {
 61 |         return {
 62 |             height: parseInt(this.doc.getElementById('viewportHeight').value),
 63 |             width: parseInt(this.doc.getElementById('viewportWidth').value)
 64 |         }
 65 |     }
 66 | 
 67 |     async processWithinTimeout(tab, windowId) {
 68 |         this.setCurrentStatus({message: 'freezing', index: tab.id});
 69 |         // Inject dispatcher to listen to the message we then send. Can't get a
 70 |         // return value directly out of the content script because webpack
 71 |         // wraps our top-level stuff in a function. Instead, we use messaging.
 72 |         await browser.tabs.executeScript(
 73 |             tab.id,
 74 |             {file: '/contentScript.js'}
 75 |         );
 76 | 
 77 |         // Call freeze-dry to fetch html.
 78 |         const html = await browser.tabs.sendMessage(
 79 |             tab.id,
 80 |             {type: 'freeze', options: {wait: this.otherOptions.wait,
 81 |                                        shouldScroll: this.otherOptions.shouldScroll}}
 82 |         );
 83 |         return html;
 84 |     }
 85 | 
 86 |     async processWithoutTimeout(html, tabId) {
 87 |         // Save html to disk.
 88 |         const filename = this.urls[this.tabIdToUrlsIndex.get(tabId)].filename;
 89 |         const download_filename = await download(html, {filename});
 90 | 
 91 |         this.setCurrentStatus({
 92 |           message: 'downloaded as ' + download_filename,
 93 |           index: tabId,
 94 |           isFinal: true
 95 |         });
 96 |     }
 97 | }
 98 | 
 99 | const collector = new CorpusCollector(document);
100 | collector.addEventListeners();
101 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/devtoolsOpener.js:
--------------------------------------------------------------------------------
 1 | let backgroundPort = browser.runtime.connect();
 2 | 
 3 | function createPanel() {
 4 |     browser.devtools.panels.create(
 5 |         'Fathom',
 6 |         '/icons/icon.svg',
 7 |         '/pages/devtoolsPanel.html'
 8 |     ).then((extensionPanel) => {
 9 |         extensionPanel.onShown.addListener(panelShown);
10 |         extensionPanel.onHidden.addListener(panelHidden);
11 |     });
12 | }
13 | 
14 | function panelShown() {
15 |     inspectedElementSelector()
16 |         .then((selector) => {
17 |             backgroundPort.postMessage({
18 |                 type: 'showHighlight',
19 |                 tabId: browser.devtools.inspectedWindow.tabId,
20 |                 selector: selector,
21 |             });
22 |             browser.runtime.sendMessage({type: 'refresh'});
23 |         })
24 |         .catch((error) => {
25 |             console.error(error);
26 |         });
27 | }
28 | 
29 | function panelHidden() {
30 |     backgroundPort.postMessage({
31 |         type: 'hideHighlight',
32 |         tabId: browser.devtools.inspectedWindow.tabId,
33 |     });
34 | }
35 | 
36 | createPanel();
37 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/download.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Save the given HTML to the user's downloads folder.
 3 |  */
 4 | async function download(html, options = {}) {
 5 |     const blob = new Blob([html], {type: 'text/html'});
 6 |     const url = URL.createObjectURL(blob);
 7 | 
 8 |     // Save html using the specified filename as a template.
 9 |     let downloadId = await browser.downloads.download({
10 |         url,
11 |         filename: options.filename || 'Untitled.html',
12 |         saveAs: options.saveAs || false,
13 |     });
14 | 
15 |     // Give it 10 seconds; FF can be a bit slow.
16 |     window.setTimeout(() => URL.revokeObjectURL(url), 1000 * 10);
17 | 
18 |     // Return the basename of the chosen filename.
19 |     let filename = (await browser.downloads.search({id: downloadId}))[0].filename;
20 |     return filename.replace(/^.*\//, '');
21 | }
22 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/icons/icon.svg:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg width="77" height="77" viewBox="0 0 77 75" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:1.5;"><path id="path3763" d="M61.714,69.52c0.01,-7.556 2.015,-14.084 6.58,-21.429c13.477,-21.679 -1.435,-47.889 -27.219,-47.841c-14.307,0.027 -25.43,4.786 -30.159,16.246c-1.34,3.248 -0.586,4.916 -3.084,9.04c-2.461,4.06 -5.962,9.471 -6.064,12.996c-0.054,1.853 1.527,2.596 3.655,3.696c3.284,1.699 5.935,2.256 5.988,7.08c0.004,0.322 -0.05,0.666 0.005,1.021c1.935,12.289 3.944,13.455 11.406,13.163c5.138,-0.202 5.358,3.948 5.358,7.649l0,3.859l16.763,0l16.764,0l0.007,-5.48l0,0Z" style="fill-rule:nonzero;"/><path d="M29.632,20.457l-9.671,14.1l9.671,14.1" style="fill:none;stroke:#fff;stroke-width:7.5px;"/><path d="M50.993,20.457l9.671,14.1l-9.671,14.1" style="fill:none;stroke:#fff;stroke-width:7.5px;"/></svg>


--------------------------------------------------------------------------------
/fathom_fox/addon/manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "manifest_version": 2,
 3 |     "name": "FathomFox",
 4 |     "version": "3.7.3",
 5 |     "description": "Tools for developing Fathom rulesets",
 6 |     "applications": {
 7 |         "gecko": {
 8 |             "id": "{954efd86-8f62-49e7-8a65-80016051e382}"
 9 |         }
10 |     },
11 |     "icons": {
12 |         "48": "icons/icon.svg",
13 |         "96": "icons/icon.svg"
14 |     },
15 |     "browser_action": {
16 |         "default_icon": "icons/icon.svg",
17 |         "default_title": "FathomFox",
18 |         "default_popup": "pages/actionMenu.html",
19 |         "browser_style": true
20 |     },
21 |     "background": {
22 |         "scripts": ["download.js", "background.js"]
23 |     },
24 |     "content_scripts": [{
25 |         "matches": ["<all_urls>"],
26 |         "js": ["rulesets.js", "utils.js", "contentScript.js", "simmer.js"]
27 |     }],
28 |     "web_accessible_resources": [
29 |         "simmer.js"
30 |     ],
31 |     "permissions": [
32 |         "<all_urls>",
33 |         "downloads",
34 |         "tabs"
35 |     ],
36 |     "devtools_page": "pages/devtoolsOpener.html",
37 |     "commands": {
38 |         "freeze-page": {
39 |             "suggested_key": {
40 |                 "default": "Ctrl+Shift+O"
41 |             },
42 |             "description": "Download page in current active tab"
43 |         }
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/measureWindowSize.js:
--------------------------------------------------------------------------------
1 | ({outerWidth: window.outerWidth,
2 |   innerWidth: window.innerWidth,
3 |   outerHeight: window.outerHeight,
4 |   innerHeight: window.innerHeight});
5 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/actionMenu.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="UTF-8">
 5 |   </head>
 6 |   <body>
 7 |     <div class="panel-section panel-section-list">
 8 |       <div id="collectCorpus" class="panel-list-item">
 9 |         <div class="text">Corpus Collector</div>
10 |       </div>
11 |       <div id="evaluate" class="panel-list-item">
12 |         <div class="text">Evaluator</div>
13 |       </div>
14 |       <div id="vectorize" class="panel-list-item">
15 |         <div class="text">Vectorizer</div>
16 |       </div>
17 |     </div>
18 |     <script src="/actionMenu.js"></script>
19 |   </body>
20 | </html>
21 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/blank.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html>
3 |   <head>
4 |     <meta charset="UTF-8"> 
5 |   </head>
6 |   <body>
7 |   </body>
8 | </html>
9 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/corpus.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="UTF-8">
 5 |     <link rel="stylesheet" type="text/css" href="chrome://browser/content/extension.css">
 6 |     <style>
 7 |       body {
 8 |         padding: 0 20px;
 9 |       }
10 |       #options {
11 |         float: right;
12 |         padding-right: 10px;
13 |       }
14 |       #options div {
15 |         text-align: right;
16 |       }
17 |       #options input[type="text"] {
18 |         text-align: right;
19 |       }
20 |       #status li.error {
21 |         color: darkred;
22 |       }
23 |     </style>
24 |   </head>
25 |   <body class="browser-style">
26 |     <h1>Corpus Collector</h1>
27 |     <p>
28 |       This “freezes” a series of pages for use as a training corpus. External resources like images and CSS are inlined to create a convenient single-file package for each page. Scripts are removed or deactivated, and external network resources that can't be inlined are blocked. The downloaded pages land in your usual downloads folder.
29 |     </p>
30 |     <p>
31 |       Optionally, you may prefix each line with a filename to save the page to, followed by whitespace.
32 |     </p>
33 |     <label for="pages">Enter the pages to download, one per line:</label>
34 |     <textarea id="pages" placeholder="https://example.com/
35 | https://another.com/page" rows=10 class="browser-style"></textarea>
36 |     <form id="freezeForm">
37 |       <button id="freeze" disabled>Download</button>
38 |       <div id="options">
39 |         <div>
40 |           <input type="checkbox" id="shouldScroll">
41 |           <label for="shouldScroll" title="Scroll to the bottom of the page before freezing.">Scroll to Bottom</label>
42 |         </div>
43 |         <div>
44 |           <label for="wait" title="Wait this long before freezing the page.">Delay:</label>
45 |           <input type="text" required pattern="[0-9]+" min="0" size="2" id="wait" value="1"> sec
46 |         </div>
47 |         <div>
48 |           <label for="timeout" title="Give up if page load and freeze takes longer than this.">Timeout:</label>
49 |           <input type="text" required pattern="[0-9]+" size="2" id="timeout" value="60"> sec
50 |         </div>
51 |         <div>
52 |           <label title="Resize the window so the viewport is this big.">
53 |             Viewport Size:
54 |             <input type="text" required pattern="[0-9]+" size="4" id="viewportWidth" value="1024">
55 |             ×
56 |             <input type="text" required pattern="[0-9]+" size="4" id="viewportHeight" value="768">
57 |           </label>
58 |         </div>
59 |       </div>
60 |     </form>
61 |     <ul id="status"></ul>
62 |     <script src="../download.js"></script>
63 |     <script src="../utils.js"></script>
64 |     <script src="../visit.js"></script>
65 |     <script src="../corpus.js"></script>
66 |   </body>
67 | </html>
68 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/devtoolsOpener.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="UTF-8">
 5 |   </head>
 6 |   <body>
 7 |     <script src="../utils.js"></script>
 8 |     <script src="../devtoolsOpener.js"></script>
 9 |   </body>
10 | </html>
11 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/devtoolsPanel.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <meta charset="UTF-8">
  5 |     <link rel="stylesheet" type="text/css" href="chrome://browser/content/extension.css">
  6 |     <style>
  7 |       .hidden {
  8 |         display: none !important;
  9 |       }
 10 | 
 11 |       body {
 12 |         padding: 10px;
 13 |       }
 14 | 
 15 |       h1 {
 16 |         margin: 0 0 10px 0;
 17 |         font-size: 100%;
 18 |       }
 19 | 
 20 |       .box {
 21 |         border: 1px solid silver;
 22 |         padding: 10px;
 23 |         margin-bottom: 10px;
 24 |       }
 25 | 
 26 |       #no-selection, #no-labels {
 27 |         font-style: italic;
 28 |       }
 29 | 
 30 |       #labels table {
 31 |         border-spacing: 0;
 32 |         padding: 0;
 33 |       }
 34 | 
 35 |       table {
 36 |         table-layout: fixed;
 37 |         width: 100%;
 38 |       }
 39 | 
 40 |       #labels .path, #labels .preview {
 41 |         min-width: 40%;
 42 |       }
 43 | 
 44 |       #labels .action {
 45 |         width: 2.5em;
 46 |       }
 47 | 
 48 |       /* Make the input take up the whole cell so it's close to the delete button. */
 49 |       #labels .label, #labels .label input {
 50 |         width: 25ex
 51 |       }
 52 | 
 53 |       #labels td.preview {
 54 |         font-family: monospace;
 55 |         font-size: 90%;
 56 |         max-height: 1em;
 57 |       }
 58 | 
 59 |       #labels tr.hover:hover {
 60 |         background-color: #eee;
 61 |       }
 62 | 
 63 |       #labels .action-button {
 64 |         color: #000;
 65 |         text-decoration: none;
 66 |         width: 1.5em;
 67 |         display: inline-block;
 68 |         text-align: center;
 69 |       }
 70 | 
 71 |       #labels .action-button:hover {
 72 |         color: rebeccapurple;
 73 |         outline: 1px solid #444;
 74 |       }
 75 | 
 76 |       #labels th, #labels td {
 77 |         padding: 5px 5px;
 78 |       }
 79 | 
 80 |       #spinner-container {
 81 |         display: inline-block;
 82 |         color: #888;
 83 |       }
 84 | 
 85 |       #spinner {
 86 |         display: inline-block;
 87 |         width: 1em;
 88 |         height: 1em;
 89 |         margin: 0 8px;
 90 |       }
 91 | 
 92 |       #spinner:after {
 93 |         content: " ";
 94 |         display: block;
 95 |         width: 0.5em;
 96 |         height: 0.5em;
 97 |         margin: 1px;
 98 |         border-radius: 50%;
 99 |         border: 5px solid #888;
100 |         border-color: #888 transparent #888 transparent;
101 |         animation: spinner 1.2s linear infinite;
102 |       }
103 | 
104 |       @keyframes spinner {
105 |         0% {
106 |           transform: rotate(0deg);
107 |         }
108 |         100% {
109 |           transform: rotate(360deg);
110 |         }
111 |       }
112 | 
113 |     </style>
114 |   </head>
115 | 
116 |   <body class="browser-style">
117 |     <div id="current-labels" class="box">
118 |       <div id="no-selection" class="hidden">
119 |         No currently selected element&mdash;use the DevTools <b>Inspector</b> to select an element.
120 |       </div>
121 |       <div id="no-labels" class="hidden">
122 |         No labeled elements found.
123 |       </div>
124 |       <div id="labels" class="hidden"></div>
125 |     </div>
126 | 
127 |     <div id="freeze-page-container">
128 |       <button id="freeze-button">Save Page...</button>
129 |       <div id="spinner-container" class="hidden">
130 |         <div id="spinner"></div>
131 |         Working...
132 |       </div>
133 |     </div>
134 | 
135 |     <script src="../utils.js"></script>
136 |     <script src="../devtoolsPanel.js"></script>
137 |   </body>
138 | </html>
139 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/evaluate.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |     <meta charset="UTF-8">
  5 |     <link rel="stylesheet" type="text/css" href="chrome://browser/content/extension.css">
  6 |     <style type="text/css">
  7 |       .uiGrid {
  8 |           display: grid;
  9 |           grid-template-columns: 10em 30em;
 10 |           grid-gap: 6px;
 11 |       }
 12 | 
 13 |       .uiGrid .label,
 14 |       .uiGrid p {
 15 |         text-align: right;
 16 |       }
 17 | 
 18 |       .uiGrid select,
 19 |       .uiGrid meter {
 20 |         width: 100%;
 21 |       }
 22 | 
 23 |       .uiGrid div.bottommost {
 24 |         margin-bottom: 2em;
 25 |       }
 26 | 
 27 |       .hidden {
 28 |         display: none;
 29 |       }
 30 | 
 31 |       #caveats {
 32 |         color: #666;
 33 |         font-size: 9pt;
 34 |         text-align: left;
 35 |       }
 36 | 
 37 |       p.warning {
 38 |         background-color: #FFFFBE;
 39 |         border: 1px solid #FFE446;
 40 |         padding: 1em;
 41 |       }
 42 | 
 43 |       #goodBad {
 44 |         display: grid;
 45 |         grid-gap: 2px;
 46 |         grid-template-columns: repeat(8, 1fr);
 47 |       }
 48 | 
 49 |       #goodBad > div {
 50 |         text-align: center;
 51 |         padding: 3px;
 52 |       }
 53 | 
 54 |       #goodBad > .good {
 55 |         background-color: rgb(173, 245, 175);
 56 |       }
 57 | 
 58 |       #goodBad > .bad {
 59 |         background-color: rgb(255, 174, 186);
 60 |       }
 61 | 
 62 |       #goodBad > .bad:hover {
 63 |         background-color: rgb(225, 144, 156);
 64 |         box-shadow: inset 0px 0px 3px 1px red;
 65 |       }
 66 | 
 67 |       #goodBad > .good:hover {
 68 |         background-color: rgb(143, 215, 145);
 69 |         box-shadow: inset 0px 0px 3px 1px green;
 70 |       }
 71 | 
 72 |       /* Button styles based on https://github.com/FirefoxUX/photon-extension-kit/blob/master/extension.css */
 73 |       button.browser-style {
 74 |         background-color: #fbfbfb;
 75 |         border: 1px solid #b1b1b1;
 76 |         box-shadow: 0 0 0 0 transparent;
 77 |         font: caption;
 78 |         height: 24px;
 79 |         outline: 0 !important;
 80 |         padding: 0 8px 0;
 81 |         transition-duration: 250ms;
 82 |         transition-property: box-shadow, border;
 83 |       }
 84 | 
 85 |       button.browser-style::-moz-focus-inner {
 86 |         border: 0;
 87 |         outline: 0;
 88 |       }
 89 | 
 90 |       button.browser-style:hover {
 91 |         background-color: #ebebeb;
 92 |         border: 1px solid #b1b1b1;
 93 |       }
 94 | 
 95 |       button.browser-style:active {
 96 |         background-color: #d4d4d4;
 97 |         border: 1px solid #858585;
 98 |       }
 99 | 
100 |       button.browser-style:disabled {
101 |         color: #999;
102 |         opacity: .5;
103 |       }
104 | 
105 |       button.browser-style:focus {
106 |         border-color: #fff;
107 |         box-shadow: 0 0 0 2px rgba(97, 181, 255, 0.75);
108 |       }
109 | 
110 |       button.browser-style:hover:not(:active):not(:disabled):not(:focus) {
111 |         background-color: #ebebeb;
112 |         border: 1px solid #b1b1b1;
113 |       }
114 | 
115 |       button.browser-style:hover:active:not(:hover):not(:disabled):not(:focus) {
116 |         background-color: #d4d4d4;
117 |         border: 1px solid #858585;
118 |       }
119 | 
120 |       button.browser-style:focus:not(:disabled) {
121 |         border-color: #fff !important;
122 |         box-shadow: 0 0 0 2px rgba(97, 181, 255, 0.75);
123 |       }
124 | 
125 |       button#evaluate {
126 |         float: right;
127 |       }
128 |     </style>
129 |   </head>
130 |   <body class="browser-style" style="padding: 0 20px">
131 |     <h1>Evaluator</h1>
132 |     <p id="please-install" class="warning hidden">
133 |       No rulesets found. Please download a source checkout of FathomFox, and define one in rulesets.js.
134 |     </p>
135 |     <p>
136 |       Evaluate your ruleset, for the selected trainee, against the samples you’ve loaded as tabs in this window.
137 |     </p>
138 |     <div class="uiGrid">
139 |       <div class="label">
140 |         <label for="trainee">Trainee:</label>
141 |       </div>
142 |       <div>
143 |         <select id="trainee"></select>
144 |       </div>
145 |       <div>
146 |       </div>
147 |       <div>
148 |         <button type="button" id="evaluate" class="browser-style">Evaluate</button>
149 |       </div>
150 |       <div>
151 |       </div>
152 |       <div>
153 |       </div>
154 |       <div id="output" class="hidden uiGrid">
155 |           <div class="label">
156 |             Coefficients:
157 |           </div>
158 |           <div id="coeffs"></div>
159 |           <div class="label">
160 |             Accuracy:
161 |           </div>
162 |           <div id="accuracy"></div>
163 |           <div class="label">
164 |             Accuracy (95% CI):
165 |           </div>
166 |           <div id="ci"></div>
167 |           <div class="label">
168 |             Cost:
169 |           </div>
170 |           <div id="cost"></div>
171 |           <div class="label">
172 |             Good/Bad:
173 |           </div>
174 |           <div id="goodBad"></div>
175 |           <div></div>
176 |           <div id="caveats" class="bottommost">
177 |             Click a Bad cell to temporarily label any wrongly chosen element. Then, examine it using the Fathom devtools panel.
178 |           </div>
179 |       </div>
180 |     </div>
181 |     <script src="../rulesets.js"></script>
182 |     <script src="../utils.js"></script>
183 |     <script src="../evaluate.js"></script>
184 |   </body>
185 | </html>
186 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/pages/vector.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   <head>
 4 |     <meta charset="UTF-8">
 5 |     <link rel="stylesheet" type="text/css" href="chrome://browser/content/extension.css">
 6 |     <style>
 7 |       body {
 8 |         padding: 0 20px;
 9 |       }
10 |       #options {
11 |         float: right;
12 |         padding-right: 10px;
13 |       }
14 |       #options div {
15 |         text-align: right;
16 |       }
17 |       #options input[class="number"] {
18 |         text-align: right;
19 |       }
20 |       #status li.error {
21 |         color: darkred;
22 |       }
23 |       p.warning {
24 |         background-color: #FFFFBE;
25 |         border: 1px solid #FFE446;
26 |         padding: 1em;
27 |       }
28 |       .hidden {
29 |         display: none;
30 |       }
31 |     </style>
32 |   </head>
33 |   <body class="browser-style">
34 |     <h1>Vectorizer</h1>
35 |     <p id="please-install" class="warning hidden">
36 |       No rulesets found. Please download a source checkout of FathomFox, and define one in rulesets.js.
37 |     </p>
38 |     <p>
39 |       This turns a series of frozen, labeled pages into feature vectors for use with <code>fathom train</code>. The feature vectors land in a file in your usual downloads folder. Because web extensions aren't allowed to load <code>file://</code> URLs, you might find it necessary to run a local web server like <code>fathom serve</code> in the directory where your local samples live. <code>fathom list</code> can help you get a list of filenames to paste here.
40 |     </p>
41 |     <div class="label">
42 |       <label for="trainee">Trainee:</label>
43 |     </div>
44 |     <div>
45 |       <select id="trainee"></select>
46 |     </div>
47 |     <label for="pages">Enter the pages to vectorize, one per line:</label>
48 |     <textarea id="pages" placeholder="1.html
49 | 2.html" rows=10 class="browser-style"></textarea>
50 |     <form id="vectorizeForm">
51 |       <button id="freeze" disabled>Vectorize</button>
52 |       <div id="options">
53 |         <div>
54 |           <label for="wait" title="Wait this long before vectorizing the page. Sometimes this provides the necessary time for stylesheets to apply, for instance.">Delay:</label>
55 |           <input class="number" type="text" required pattern="[0-9]+" min="0" size="2" id="wait" value="5"> sec
56 |         </div>
57 |         <div>
58 |           <label for="baseUrl" title="This gets prepended to each page title to make a URL.">Base URL:</label>
59 |           <input type="text" size="30" id="baseUrl" value="http://localhost:8000/">
60 |         </div>
61 |         <div>
62 |           <label for="maxTabs" title="Vectorize this many tabs at a time.">Concurrency:</label>
63 |           <input class="number" type="text" required pattern="[0-9]+" min="1" size="2" id="maxTabs" value="16"> tabs
64 |         </div>
65 |       </div>
66 |     </form>
67 |     <ul id="status"></ul>
68 |     <script src="../download.js"></script>
69 |     <script src="../rulesets.js"></script>
70 |     <script src="../utils.js"></script>
71 |     <script src="../visit.js"></script>
72 |     <script src="../vector.js"></script>
73 |   </body>
74 | </html>
75 | 


--------------------------------------------------------------------------------
/fathom_fox/addon/utils.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Return the result of a browser.devtools.inspectedWindow.eval call. Throw the
 3 |  * error object on failure.
 4 |  */
 5 | async function resultOfEval(codeString) {
 6 |     let [result, error] = await browser.devtools.inspectedWindow.eval(codeString);
 7 |     if (error !== undefined) {
 8 |         throw error;
 9 |     }
10 |     return result;
11 | }
12 | 
13 | /**
14 |  * Return a backward iterator over an Array.
15 |  */
16 | function *reversed(array) {
17 |     for (let i = array.length - 1; i >= 0; i--) {
18 |         yield array[i];
19 |     }
20 | }
21 | 
22 | /**
23 |  * Deletes all children of the specified element.
24 |  */
25 | function emptyElement(element) {
26 |     while (element.firstChild) {
27 |         element.removeChild(element.firstChild);
28 |     }
29 | }
30 | 
31 | // Requires simmer.js injected into current page.
32 | // simmer.js is injected when the devtools panel is initialised when first opened.
33 | async function inspectedElementSelector() {
34 |     return resultOfEval(`Simmer.configure({depth: 25})($0)`);
35 | }
36 | 
37 | /**
38 |  * Set the current window's size such that the content area is the size you
39 |  * pass in.
40 |  *
41 |  * @arg tab {tabs.Tab} A tab in the window we're adjusting that we can inject
42 |  *     the window-measuring script into
43 |  *
44 |  * @return a Promise that is resolved when the window size has been changed
45 |  */
46 | async function setViewportSize(tab, width, height) {
47 |     // Because window.outerHeight and friends are undefined from background
48 |     // scripts, we have to collect the info by injecting a content script into
49 |     // (arbitrarily) the active tab. However, we have to ensure that tab is not
50 |     // showing about:blank, because webexts aren't allowed to inject scripts
51 |     // there. So we open a page of our own first.
52 |     const windowSizes = (await browser.tabs.executeScript(tab.id, {file: '/measureWindowSize.js'}))[0];
53 |     return browser.windows.update(
54 |         tab.windowId,
55 |         {width: windowSizes.outerWidth - windowSizes.innerWidth + width,
56 |          height: windowSizes.outerHeight - windowSizes.innerHeight + height});
57 | }
58 | 
59 | /**
60 |  * Given a URL as a string, return the last segment, minus any ".html"
61 |  * extension.
62 |  */
63 | function urlFilename(url) {
64 |     return url.substring(url.lastIndexOf('/') + 1, url.endsWith('.html') ? url.length - 5 : url.length)
65 | }
66 | 
67 | function sleep(ms) {
68 |     return new Promise(resolve => setTimeout(resolve, ms));
69 | }
70 | 
71 | async function initTraineeMenu(goButton) {
72 |     // Draw Ruleset menu:
73 |     let traineeKeys;
74 |     traineeKeys = Array.from(trainees.keys());
75 |     const menu = document.getElementById('trainee');
76 |     if (traineeKeys.length) {
77 |         for (const traineeKey of traineeKeys) {
78 |             const option = document.createElement('option');
79 |             option.text = option.value = traineeKey;
80 |             menu.add(option);
81 |         }
82 |     } else {
83 |         goButton.disabled = true;
84 |         menu.disabled = true;
85 |         document.getElementById('please-install').classList.remove('hidden');
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/fathom_fox/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "fathom-fox",
 3 |   "version": "3.7.3",
 4 |   "description": "Tools for collecting a Fathom training corpus and developing rulesets",
 5 |   "scripts": {
 6 |     "build": "yarn install --ignore-engines && rollup -c",
 7 |     "watch": "rollup -c -w",
 8 |     "browser": "web-ext run -s addon/",
 9 |     "release": "cd addon && web-ext build"
10 |   },
11 |   "license": "MPL-2.0",
12 |   "devDependencies": {
13 |     "fathom-web": "file:../fathom",
14 |     "freeze-dry": "^0.2.4",
15 |     "geckodriver": "^3.0.1",
16 |     "rollup": "^1.17.0",
17 |     "rollup-plugin-commonjs": "^10.0.1",
18 |     "rollup-plugin-json": "^4.0.0",
19 |     "rollup-plugin-node-builtins": "^2.1.2",
20 |     "rollup-plugin-node-globals": "^1.4.0",
21 |     "rollup-plugin-node-resolve": "^5.2.0",
22 |     "rollup-plugin-copy": "^3.0.0",
23 |     "simmerjs": "^0.5.6",
24 |     "web-ext": "^3.1.0",
25 |     "webpack": "^4.36.1"
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/fathom_fox/rollup.config.js:
--------------------------------------------------------------------------------
 1 | import commonjs from 'rollup-plugin-commonjs';
 2 | import resolve from 'rollup-plugin-node-resolve';
 3 | import json from 'rollup-plugin-json';
 4 | import builtins from 'rollup-plugin-node-builtins';
 5 | import globals from 'rollup-plugin-node-globals';
 6 | import copy from 'rollup-plugin-copy';
 7 | const webpackPostcss = require('./src/rollup-plugin-webpack-postcss/rollup-plugin-webpack-postcss');
 8 | 
 9 | /**
10 |  * Return typical rollup settings for a file of a given name.
11 |  */
12 | function mindlesslyFactoredOutSettings(name, globalVarName) {
13 |     return {
14 |         input: 'src/' + name + '.js',
15 |         output: {
16 |             file: 'addon/' + name + '.js',
17 |             format: 'iife',
18 |             name: globalVarName || name  // Convention: name the var the same thing.
19 |         },
20 |         plugins: [
21 |             resolve({preferBuiltins: true}),
22 |             webpackPostcss(),
23 |             commonjs({
24 |                 namedExports: {
25 |                     'wu': ['forEach', 'map', 'flatten']
26 |                 }
27 |             }),
28 |             json(),
29 |             globals(),
30 |             builtins(),
31 |             copy({
32 |                 targets: [
33 |                     { src: 'node_modules/simmerjs/dist/simmer.js', dest: 'addon' },
34 |                 ]
35 |             }),
36 |         ],
37 |         watch: {
38 |             chokidar: false
39 |         }
40 |     }
41 | }
42 | 
43 | export default [
44 |     mindlesslyFactoredOutSettings('contentScript'),
45 |     mindlesslyFactoredOutSettings('evaluate'),
46 |     mindlesslyFactoredOutSettings('rulesets', 'trainees'),
47 | ];
48 | 


--------------------------------------------------------------------------------
/fathom_fox/src/evaluate.js:
--------------------------------------------------------------------------------
  1 | let gCoeffsDiv, gAccuracyDiv, gCiDiv, gCostDiv, gGoodBadDiv = false;
  2 | 
  3 | class Evaluator {
  4 |     constructor(tabs, traineeId) {
  5 |         this.tabs = tabs;
  6 |         this.traineeId = traineeId;
  7 |     }
  8 | 
  9 |     async evaluate() {
 10 |         const coeffs = trainees.get(this.traineeId).coeffs;
 11 |         const successReport = await this.verboseSuccessReports(coeffs);
 12 |         const cost = successReport.reduce((accum, value) => accum + value.cost, 0);
 13 |         updateOutputs(coeffs, cost, successReport);
 14 |     }
 15 | 
 16 |     /**
 17 |      * Try the ruleset on each tab, and return a bigger blob of info that
 18 |      * allows us to show the user which element it found, for debugging.
 19 |      */
 20 |     async verboseSuccessReports(coeffs) {
 21 |         const results = await this.resultsForPages(coeffs);
 22 |         return results.map((result, i) => ({
 23 |             didSucceed: result.didSucceed,
 24 |             cost: result.cost,
 25 |             filename: urlFilename(this.tabs[i].url),
 26 |             tabId: this.tabs[i].id}));
 27 |     }
 28 | 
 29 |     /**
 30 |      * Send a message to all the pages in the corpus, telling them "Run ruleset
 31 |      * ID X, and tell me how its default query (the one with the same out() key
 32 |      * as its ID) did."
 33 |      *
 34 |      * @return an Array of {didSucceed: bool, cost: number} objects, one per
 35 |      *     page
 36 |      */
 37 |     async resultsForPages(coeffs) {
 38 |         return browser.runtime.sendMessage(
 39 |             {
 40 |                 type: 'rulesetSucceededOnTabs',
 41 |                 tabIds: this.tabs.map(tab => tab.id),
 42 |                 traineeId: this.traineeId,
 43 |                 coeffs: Array.from(coeffs.entries())
 44 |             }
 45 |         );
 46 |     }
 47 | }
 48 | 
 49 | async function evaluateTabs() {
 50 |     // Grey out Evaluate button:
 51 |     const evaluateButton = document.getElementById('evaluate');
 52 |     evaluateButton.disabled = true;
 53 | 
 54 |     // Show output.
 55 |     document.getElementById('output').classList.remove('hidden');
 56 | 
 57 |     try {
 58 |         // TODO: Using "active" here rather than a tab ID presents a race condition
 59 |         // if you quickly switch away from the tab after clicking the Evaluate button.
 60 |         let tabs = await browser.tabs.query({currentWindow: true, active: false});
 61 |         // We don't have permission to mess with about: tabs, so they crash.
 62 |         // Filter them out:
 63 |         tabs = tabs.filter(tab => !tab.url.startsWith('about:'));
 64 |         const rulesetName = document.getElementById('trainee').value;
 65 |         const viewportSize = trainees.get(rulesetName).viewportSize || {width: 1024, height: 768};
 66 |         await setViewportSize(tabs[0], viewportSize.width, viewportSize.height);  // for consistent element sizing in samples due to text wrap, etc.
 67 |         const evaluator = new Evaluator(tabs, rulesetName);
 68 |         await evaluator.evaluate();
 69 |     } finally {
 70 |         // Restore UI state, leaving output visible.
 71 |         evaluateButton.disabled = false;
 72 |     }
 73 | }
 74 | 
 75 | function empty(element) {
 76 |     while (element.firstChild) {
 77 |         element.removeChild(element.firstChild);
 78 |     }
 79 | }
 80 | 
 81 | /**
 82 |  * Return [low end, high end] of 95% CI for accuracy using binomial proportion
 83 |  * confidence interval formula.
 84 |  */
 85 | function confidenceInterval(successRatio, numberOfSamples) {
 86 |     const z_for_95_percent = 1.96;
 87 |     const addend = z_for_95_percent * Math.sqrt(successRatio * (1 - successRatio) / numberOfSamples);
 88 |     return [successRatio - addend, Math.min(1, successRatio + addend)];
 89 | }
 90 | 
 91 | /**
 92 |  * Format a ratio as a rounded-off percentage.
 93 |  */
 94 | function percentify(ratio) {
 95 |     return `${(ratio * 100).toFixed(1)}%`;
 96 | }
 97 | 
 98 | function updateOutputs(coeffs, cost, successesOrFailures) {
 99 |     // Update best coeffs and accuracy.
100 |     const coeffStrings = [];
101 |     for (const [key, val] of coeffs.entries()) {
102 |         coeffStrings.push(`${key}: ${val}`);
103 |     }
104 |     gCoeffsDiv.firstChild.textContent = `[${coeffStrings.join(', ')}]`;
105 |     gCostDiv.firstChild.textContent = Math.trunc(cost);
106 | 
107 |     if (successesOrFailures) {
108 |         // Compute and show accuracy:
109 |         const accuracy = successesOrFailures.reduce((accum, sf) => accum + sf.didSucceed, 0) / successesOrFailures.length;
110 |         gAccuracyDiv.firstChild.textContent = percentify(accuracy);
111 | 
112 |         // Draw CI readout:
113 |         const [ciLow, ciHigh] = confidenceInterval(accuracy, successesOrFailures.length);
114 |         gCiDiv.firstChild.textContent = `${percentify(ciLow)} - ${percentify(ciHigh)}`;
115 | 
116 |         // Draw good/bad chart:
117 |         if (gGoodBadDiv.childElementCount !== successesOrFailures.length) {
118 |             empty(gGoodBadDiv);
119 |             for (const _ of successesOrFailures) {
120 |                 const div = document.createElement('div');
121 |                 div.appendChild(document.createTextNode(''));
122 |                 gGoodBadDiv.appendChild(div);
123 |             }
124 |         }
125 |         let div = gGoodBadDiv.firstElementChild;
126 |         const traineeId = document.getElementById('trainee').value;
127 |         for (let sf of successesOrFailures) {
128 |             div.firstChild.textContent = sf.filename;
129 |             div.addEventListener('click', function focusTab() {
130 |                 // Label the bad element if bad, clear it if good:
131 |                 browser.tabs.sendMessage(
132 |                     sf.tabId,
133 |                     {type: 'labelBadElement',
134 |                      traineeId,
135 |                      coeffs});
136 |                 browser.tabs.update(sf.tabId, {active: true});
137 |                 // Update the Fathom dev tools panel if it's open:
138 |                 browser.runtime.sendMessage({type: 'refresh'});
139 |             });
140 |             div.setAttribute('class', sf.didSucceed ? 'good' : 'bad');
141 |             div = div.nextElementSibling;
142 |         }
143 |     }
144 | }
145 | 
/**
 * Set up the Evaluator page: cache its output elements in the module-level
 * variables, reset them to a blank state, hook up the Evaluate button, and
 * fill in the trainee menu.
 */
async function initPage(document) {
    // Look the output elements up once and keep them around.
    gCoeffsDiv = document.getElementById('coeffs');
    gAccuracyDiv = document.getElementById('accuracy');
    gCiDiv = document.getElementById('ci');
    gCostDiv = document.getElementById('cost');
    gGoodBadDiv = document.getElementById('goodBad');

    // Start all of them from a known, childless state.
    const textReadouts = [gCoeffsDiv, gAccuracyDiv, gCiDiv, gCostDiv];
    for (const element of [...textReadouts, gGoodBadDiv]) {
        empty(element);
    }

    // Give each textual readout a single text node now, so updates need only
    // set its textContent rather than create a node each time.
    for (const readout of textReadouts) {
        readout.appendChild(document.createTextNode(''));
    }

    const evaluateButton = document.getElementById('evaluate');
    evaluateButton.onclick = evaluateTabs;

    initTraineeMenu(evaluateButton);
}
174 | 
175 | initPage(document);
176 | 


--------------------------------------------------------------------------------
/fathom_fox/src/rollup-plugin-webpack-postcss/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 2019 Patrick D. Cavit
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/fathom_fox/src/rollup-plugin-webpack-postcss/README.md:
--------------------------------------------------------------------------------
 1 | # Rollup Plugin Webpack PostCSS
 2 | 
 3 | An attempt to use webpack within rollup to allow for bundling postcss, since the [dependency cycles in postcss mean rollup can't bundle it](https://github.com/postcss/postcss/issues/1030).
 4 | 
 5 | This is a very silly-seeming idea, but it also seems like it might be working?
 6 | 
 7 | ## Rollup Plugins
 8 | 
 9 | - `rollup-plugin-node-resolve`
10 | - **This plugin**
11 | - `rollup-plugin-commonjs`
12 | - `rollup-plugin-node-globals`
13 | - `rollup-plugin-node-builtins`
14 | 
15 | Other orderings/lists might work, but that's what I'm using atm.
16 | 
17 | ---
18 | 
19 | ⚡💀🔥 **USE AT YOUR OWN RISK** 


--------------------------------------------------------------------------------
/fathom_fox/src/rollup-plugin-webpack-postcss/rollup-plugin-webpack-postcss.js:
--------------------------------------------------------------------------------
 1 | "use strict";
 2 | 
 3 | const path = require("path");
 4 | 
 5 | const webpack = require("webpack");
 6 | const MemoryFS = require("memory-fs");
 7 | 
 8 | const postcss = require.resolve("postcss");
 9 | 
10 | module.exports = () => ({
11 |     name : "rollup-plugin-postcss",
12 | 
13 |     load(id) {
14 |         if(id !== postcss) {
15 |             return null;
16 |         }
17 | 
18 |         const memfs = new MemoryFS();
19 |         const compiler = webpack({
20 |             entry : "postcss",
21 | 
22 |             output : {
23 |                 path          : __dirname,
24 |                 filename      : "postcss.js",
25 |                 library       : "postcss",
26 |                 libraryTarget : "commonjs2",
27 |             },
28 |         });
29 | 
30 |         // Write files to memory, not disk
31 |         compiler.outputFileSystem = memfs;
32 | 
33 |         return new Promise((resolve, reject) => {
34 |             compiler.run((err, stats) => {
35 |                 if(err || stats.hasErrors()) {
36 |                     if(err) {
37 |                         return reject(err);
38 |                     }
39 | 
40 |                     const info = stats.toJson();
41 | 
42 |                     return reject(info.errors);
43 |                 }
44 | 
45 |                 return resolve({
46 |                     code : memfs.readFileSync(path.join(__dirname, "./postcss.js"), "utf8"),
47 | 
48 |                     // TODO: figure out source map
49 |                 });
50 |             });
51 |         });
52 |     }
53 | });
54 | 


--------------------------------------------------------------------------------
/fathom_fox/src/rulesets.js:
--------------------------------------------------------------------------------
  1 | import {ruleset, rule, dom, type, score, out, utils} from 'fathom-web';
  2 | const {ancestors, isVisible, linearScale, rgbaFromString, saturation} = utils;
  3 | 
  4 | 
  5 | /**
  6 |  * Rulesets to vectorize or debug (and metadata about them)
  7 |  *
  8 |  * More mechanically, a map of names to {coeffs, rulesetMaker, ...} objects,
  9 |  * which we call "trainees". The rulesets you specify here are available to the
 10 |  * trainer and also show up in the FathomFox UI, from which you can debug a
 11 |  * ruleset. Most often, all the entries here point to the same ruleset but have
 12 |  * different values of `vectorType` for separately training each type of thing
 13 |  * the ruleset recognizes.
 14 |  */
 15 | const trainees = new Map();
 16 | 
 17 | /**
 18 |  * An example ruleset. Replace it with your own.
 19 |  *
 20 |  * This one finds the full-screen, content-blocking overlays that often go
 21 |  * behind modal popups. It's not the most well-honed thing, but it's simple and
 22 |  * short.
 23 |  */
 24 | trainees.set(
 25 |     // The ID for this trainee, which must be the same as the Fathom type you
 26 |     // are evaluating, if you are using the FathomFox Evaluator:
 27 |     'overlay',
 28 | 
 29 |     // Here we paste in coefficients from ``fathom train``. This lets us use
 30 |     // the Evaluator to see what Fathom is getting wrong. Otherwise, these
 31 |     // numbers do nothing until you deploy your application, so there's no need
 32 |     // to maintain them until then.
 33 |     {coeffs: new Map([  // [rule name, coefficient]
 34 |         ['big', 50.4946],
 35 |         ['nearlyOpaque', 48.6396],
 36 |         ['monochrome', 42.8406],
 37 |         ['classOrId', 0.5005],
 38 |         ['visible', 55.8750]]),
 39 |      // Bias is -139.3106 for this example, though that isn't needed until
 40 |      // production.
 41 | 
 42 |      // The content-area size to use while training. Defaults to 1024x768.
 43 |      viewportSize: {width: 1024, height: 768},
 44 | 
 45 |      // The type of node to extract features from when using the Vectorizer.
 46 |      // Defaults to the trainee ID.
 47 |      //
 48 |      // vectorType: 'overlay',
 49 | 
 50 |      rulesetMaker:
 51 |         function () {
 52 |             /**
 53 |              * Return whether the passed-in div is the size of the whole viewport/document
 54 |              * or nearly so.
 55 |              */
 56 |             function big(fnode) {
 57 |                 // Compare the size of the fnode to the size of the viewport. So far, spot-
 58 |                 // checking shows the overlay is never the size of the whole document, just
 59 |                 // the viewport.
 60 |                 const rect = fnode.element.getBoundingClientRect();
 61 |                 const hDifference = Math.abs(rect.height - window.innerHeight);
 62 |                 const wDifference = Math.abs(rect.width - window.innerWidth);
 63 |                 return linearScale(hDifference + wDifference, 250, 0);  // 250px is getting into "too tall to just be nav or something" territory.
 64 |             }
 65 | 
 66 |             /**
 67 |              * Return whether the fnode is almost but not entirely opaque.
 68 |              */
 69 |             function nearlyOpaque(fnode) {
 70 |                 const style = getComputedStyle(fnode.element);
 71 |                 const opacity = parseFloat(style.getPropertyValue('opacity'));
 72 |                 let bgColorAlpha = rgbaFromString(style.getPropertyValue('background-color'))[3];
 73 |                 if (bgColorAlpha === undefined) {
 74 |                     bgColorAlpha = 1;
 75 |                 }
 76 |                 const totalOpacity = opacity * bgColorAlpha;
 77 |                 let ret;
 78 |                 if (totalOpacity === 1) {  // seems to work even though a float
 79 |                     ret = 0;
 80 |                 } else {
 81 |                     ret = linearScale(totalOpacity, .4, .6);
 82 |                 }
 83 |                 return ret;
 84 |             }
 85 | 
 86 |             /**
 87 |              * Return whether the fnode's bgcolor is nearly black or white.
 88 |              */
 89 |             function monochrome(fnode) {
 90 |                 const rgba = rgbaFromString(getComputedStyle(fnode.element).getPropertyValue('background-color'));
 91 |                 return linearScale(1 - saturation(...rgba), .96, 1);
 92 |             }
 93 | 
 94 |             function suspiciousClassOrId(fnode) {
 95 |                 const element = fnode.element;
 96 |                 const attributeNames = ['class', 'id'];
 97 |                 let numOccurences = 0;
 98 |                 function numberOfSuspiciousSubstrings(value) {
 99 |                     return value.includes('popup') + value.includes('modal') + value.includes('overlay') + value.includes('underlay') + value.includes('backdrop');
100 |                 }
101 | 
102 |                 for (const name of attributeNames) {
103 |                     let values = element.getAttribute(name);
104 |                     if (values) {
105 |                         if (!Array.isArray(values)) {
106 |                             values = [values];
107 |                         }
108 |                         for (const value of values) {
109 |                             numOccurences += numberOfSuspiciousSubstrings(value);
110 |                         }
111 |                     }
112 |                 }
113 | 
114 |                 // 1 occurrence gets us to about 75% certainty; 2, 92%. It bottoms
115 |                 // out at 0 and tops out at 1.
116 |                 // TODO: Figure out how to derive the magic number .1685 from
117 |                 // 0 and 1.
118 |                 return (-(.3 ** (numOccurences + .1685)) + 1);
119 |             }
120 | 
121 |             /* The actual ruleset */
122 | 
123 |             const rules = ruleset([
124 |                 // Consider all <div> tags as candidate overlays:
125 |                 rule(dom('div'), type('overlay')),
126 | 
127 |                 // Contribute the "bigness" of the node to its overlay score:
128 |                 rule(type('overlay'), score(big), {name: 'big'}),
129 |         
130 |                 // Contribute the opacity of the node to its overlay score:
131 |                 rule(type('overlay'), score(nearlyOpaque), {name: 'nearlyOpaque'}),
132 |         
133 |                 // Contribute some other signals as well:
134 |                 rule(type('overlay'), score(monochrome), {name: 'monochrome'}),
135 |                 rule(type('overlay'), score(suspiciousClassOrId), {name: 'classOrId'}),
136 |                 rule(type('overlay'), score(isVisible), {name: 'visible'}),
137 | 
138 |                 // Offer the max-scoring overlay-typed node under the output key
139 |                 // "overlay". The score on that node will represent the probability,
140 |                 // informed by a corpus of training samples, that the node is, indeed,
141 |                 // a pop-up overlay.
142 |                 rule(type('overlay').max(), out('overlay'))
143 |             ]);
144 |             return rules;
145 |         }
146 | 
147 |      // isTarget is an optional function which returns whether the Vectorizer
148 |      // should consider a fnode a target. The default is to consider it a
149 |      // target iff its ``data-fathom`` attribute === the trainee ID.
150 |      //
151 |      // isTarget: fnode => fnode.element.dataset.fathom === 'foo'
152 |     }
153 | );
154 | 
155 | export default trainees;
156 | 


--------------------------------------------------------------------------------
/smoo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/smoo


--------------------------------------------------------------------------------