├── .circleci └── config.yml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── Makefile ├── README.md ├── cli ├── .flake8 ├── Makefile ├── README.rst ├── dev-requirements.txt ├── doc-building-requirements.txt ├── fathom_web │ ├── __init__.py │ ├── accuracy.py │ ├── commands │ │ ├── __init__.py │ │ ├── extract.py │ │ ├── fox.py │ │ ├── histogram.py │ │ ├── label.py │ │ ├── list.py │ │ ├── pick.py │ │ ├── serve.py │ │ ├── test.py │ │ └── train.py │ ├── test │ │ ├── __init__.py │ │ ├── resources │ │ │ └── train │ │ │ │ ├── vectorize_ruleset.js │ │ │ │ ├── vectorize_sample_1.html │ │ │ │ └── vectorize_sample_2.html │ │ ├── test_extract.py │ │ ├── test_label.py │ │ ├── test_list.py │ │ ├── test_pick.py │ │ ├── test_test.py │ │ ├── test_train.py │ │ └── test_utils.py │ ├── utils.py │ └── vectorizer.py ├── setup.cfg └── setup.py ├── docs ├── Makefile ├── clustering.rst ├── commands │ ├── extract.rst │ ├── fox.rst │ ├── histogram.rst │ ├── label.rst │ ├── list.rst │ ├── pick.rst │ ├── serve.rst │ ├── test.rst │ └── train.rst ├── conf.py ├── debugging.rst ├── deploy-docs ├── development.rst ├── example.rst ├── exceptions.rst ├── fnodes.rst ├── glossary.rst ├── img │ ├── histogram.png │ └── price_tracker_screenshot.png ├── index.rst ├── installing.rst ├── integrating.rst ├── intro.rst ├── maintaining.rst ├── rules.rst ├── ruleset.rst ├── samples.rst ├── theme │ ├── static │ │ └── tweaks.css │ └── theme.conf ├── training.rst ├── utilities.rst ├── versions.rst ├── zoo.rst └── zoo │ ├── login.rst │ ├── new_password.rst │ ├── price_tracker.rst │ ├── smoot_articles.rst │ └── smoot_shopping.rst ├── fathom ├── .babelrc ├── .eslintignore ├── .eslintrc.yml ├── .npmignore ├── Makefile ├── clusters.mjs ├── exceptions.mjs ├── fnode.mjs ├── index.mjs ├── lhs.mjs ├── package-lock.json ├── package.json ├── rhs.mjs ├── rollup.config.js ├── rule.mjs ├── ruleset.mjs ├── side.mjs ├── test │ ├── browser │ │ ├── http_server.js │ │ ├── isVisible.html │ │ └── isVisible.js │ ├── 
clusters_tests.mjs │ ├── demos.mjs │ ├── lhs_tests.mjs │ ├── rhs_tests.mjs │ ├── rule_tests.mjs │ ├── ruleset_tests.mjs │ ├── side_tests.mjs │ └── utils_tests.mjs ├── utils.mjs ├── utilsForBackend.mjs └── utilsForFrontend.mjs ├── fathom_fox ├── .eslintignore ├── .eslintrc.json ├── LICENSE ├── README.md ├── Tagged Head.afdesign ├── addon │ ├── actionMenu.js │ ├── background.js │ ├── corpus.js │ ├── devtoolsOpener.js │ ├── devtoolsPanel.js │ ├── download.js │ ├── icons │ │ └── icon.svg │ ├── manifest.json │ ├── measureWindowSize.js │ ├── pages │ │ ├── actionMenu.html │ │ ├── blank.html │ │ ├── corpus.html │ │ ├── devtoolsOpener.html │ │ ├── devtoolsPanel.html │ │ ├── evaluate.html │ │ └── vector.html │ ├── utils.js │ ├── vector.js │ └── visit.js ├── package.json ├── rollup.config.js ├── src │ ├── contentScript.js │ ├── evaluate.js │ ├── rollup-plugin-webpack-postcss │ │ ├── LICENSE.md │ │ ├── README.md │ │ └── rollup-plugin-webpack-postcss.js │ └── rulesets.js └── yarn.lock └── smoo /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | browser-tools: circleci/browser-tools@1.1.1 5 | 6 | jobs: 7 | test_js: 8 | docker: 9 | - image: cimg/node:15.3.0-browsers 10 | environment: 11 | MOZ_HEADLESS: 1 12 | steps: 13 | - browser-tools/install-firefox: 14 | version: 86.0.1 15 | - checkout 16 | - run: make -C fathom lint test 17 | # Upload new coveralls stats only on master, which is the only place 18 | # COVERALLS_REPO_TOKEN is defined: 19 | - run: 20 | name: Publish code coverage (if on master) 21 | command: | 22 | if [ ! 
-z "$COVERALLS_REPO_TOKEN" ] 23 | then 24 | make -C fathom coveralls 25 | fi 26 | test_python: 27 | docker: 28 | - image: cimg/python:3.7.9-node 29 | environment: 30 | MOZ_HEADLESS: 1 31 | steps: 32 | - browser-tools/install-firefox: 33 | version: 86.0.1 34 | - checkout 35 | - restore_cache: 36 | keys: 37 | - venv-v1-{{ arch }}-{{ checksum "cli/dev-requirements.txt" }}-{{ checksum "cli/doc-building-requirements.txt" }}-{{ checksum "cli/setup.py" }} 38 | - run: make -C cli lint test 39 | - run: make docs 40 | - save_cache: 41 | key: venv-v1-{{ arch }}-{{ checksum "cli/dev-requirements.txt" }}-{{ checksum "cli/doc-building-requirements.txt" }}-{{ checksum "cli/setup.py" }} 42 | paths: 43 | - cli/venv 44 | # Upload new docs only on master, which is the only place GH_TOKEN is 45 | # defined. This saves time over doing it in a separate job. 46 | - run: 47 | name: Publish docs (if on master) 48 | command: | 49 | if [ ! -z "$GH_TOKEN" ] 50 | then 51 | docs/deploy-docs 52 | fi 53 | 54 | workflows: 55 | version: 2 56 | js_python_and_docs: 57 | jobs: 58 | - test_js 59 | - test_python 60 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /cli/build 2 | /cli/dist 3 | /cli/*.egg-info 4 | /cli/runs 5 | /cli/venv 6 | /cli/fathom_web/fathom.zip 7 | /docs/_build 8 | /docs/venv 9 | /fathom/.npm_installed 10 | /fathom/node_modules 11 | /fathom/*.log 12 | /fathom/.nyc_output 13 | /fathom/coverage 14 | /fathom/dist 15 | /fathom/LICENSE 16 | /fathom/README.md 17 | /fathom/**/*.js 18 | !/fathom/rollup.config.js 19 | !/fathom/test/browser/* 20 | /fathom_fox/node_modules 21 | /fathom_fox/addon/contentScript.js 22 | /fathom_fox/addon/web-ext-artifacts 23 | /fathom_fox/addon/evaluate.js 24 | /fathom_fox/addon/simmer.js 25 | /fathom_fox/addon/rulesets.js 26 | **/__pycache__ 27 | -------------------------------------------------------------------------------- 
/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Convenience targets for executing common actions from the root of the repo 2 | 3 | all: docs 4 | $(MAKE) -C fathom 5 | $(MAKE) -C cli 6 | 7 | docs: 8 | $(MAKE) -C docs clean html 9 | 10 | lint: 11 | $(MAKE) -C cli lint 12 | $(MAKE) -C fathom lint 13 | 14 | test: 15 | $(MAKE) -C cli test 16 | $(MAKE) -C fathom test 17 | 18 | clean: 19 | $(MAKE) -C cli clean 20 | $(MAKE) -C docs clean 21 | $(MAKE) -C fathom clean 22 | 23 | 24 | .PHONY: clean docs lint test 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fathom 2 | 3 | Fathom is a supervised-learning system for recognizing parts of web pages—pop-ups, address forms, slideshows—or for classifying a page as a whole. A DOM flows in one side, and DOM nodes flow out the other, tagged with types and probabilities that those types are correct. A Prolog-like language makes it straightforward to specify the “smells” that suggest each type, and a neural-net-based trainer determines the optimal contribution of each smell. 
# Remove everything this Makefile builds: the virtualenv and the Fathom source
# archive. The "fathom.zip" rule writes its output to fathom_web/fathom.zip
# (see the --output argument of its git-archive recipe), so that is the path
# that must be deleted. A stray ./fathom.zip is still removed for backward
# compatibility with older checkouts.
clean:
	rm -rf $(VIRTUAL_ENV) fathom.zip fathom_web/fathom.zip
38 | $(VIRTUAL_ENV)/pyvenv.cfg: dev-requirements.txt doc-building-requirements.txt setup.py 39 | $(PYTHON3) -m venv $(VIRTUAL_ENV) 40 | # We don't path-qualify pip3 because python -m venv on Travis creates a 41 | # venv with no pip executable in it. 42 | PATH="$(PATH)" pip3 install -r dev-requirements.txt -r doc-building-requirements.txt 43 | PATH="$(PATH)" pip3 install -e . 44 | 45 | npm_installed: 46 | @$(MAKE) -C ../fathom .npm_installed 47 | 48 | .PHONY: release lint test clean venv npm_installed 49 | -------------------------------------------------------------------------------- /cli/README.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Fathom Commandline Tools 3 | ======================== 4 | 5 | This is the commandline trainer and other tools for `Fathom `_, which itself is a supervised-learning system for recognizing parts of web pages. See `docs for the trainer `_ and `reference docs for the other tools `_ in the Fathom docs. 6 | 7 | Version History 8 | =============== 9 | 10 | See the `version history `_ in the main Fathom docs, under the "CLI tools" headers. 11 | -------------------------------------------------------------------------------- /cli/dev-requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements needed for running tests and such. Broken off into a separate 2 | # file so Make can notice if they change and re-run pip install. 
3 | flake8==3.8.4 4 | flake8-quotes==3.2.0 5 | pytest==5.1.2 6 | wheel==0.34.2 7 | -------------------------------------------------------------------------------- /cli/doc-building-requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==0.7.12 2 | Babel==2.8.0 3 | certifi==2020.6.20 4 | chardet==3.0.4 5 | docutils==0.16 6 | idna==2.10 7 | imagesize==1.2.0 8 | Jinja2==2.11.2 9 | MarkupSafe==1.1.1 10 | packaging==20.4 11 | parsimonious==0.7.0 12 | pbr==5.4.5 13 | Pygments==2.6.1 14 | pyparsing==2.4.7 15 | pytz==2020.1 16 | requests==2.24.0 17 | six==1.15.0 18 | snowballstemmer==2.0.0 19 | Sphinx==3.1.2 20 | sphinx-click==2.3.2 21 | sphinx-js==3.0 22 | sphinx-rtd-theme==0.5.0 23 | sphinxcontrib-applehelp==1.0.2 24 | sphinxcontrib-devhelp==1.0.2 25 | sphinxcontrib-htmlhelp==1.0.3 26 | sphinxcontrib-jsmath==1.0.1 27 | sphinxcontrib-qthelp==1.0.3 28 | sphinxcontrib-serializinghtml==1.1.4 29 | urllib3==1.25.9 30 | -------------------------------------------------------------------------------- /cli/fathom_web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/cli/fathom_web/__init__.py -------------------------------------------------------------------------------- /cli/fathom_web/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from .extract import extract 2 | from .fox import fox 3 | from .histogram import histogram 4 | from .label import label 5 | from .list import list 6 | from .pick import pick 7 | from .serve import serve 8 | from .test import test 9 | from .train import train 10 | 11 | from click import group 12 | 13 | 14 | @group() 15 | def fathom(): 16 | """Pass fathom COMMAND --help to learn more about an individual command.""" 17 | 18 | 19 | fathom.add_command(extract) 20 | fathom.add_command(fox) 21 | 
@contextmanager
def ruleset_or_default(ruleset_path_or_none):
    """Yield the ruleset file-like object to use.

    This allows us to conditionally call various needed context managers.

    :arg ruleset_path_or_none: Either a falsy value, meaning "use the demo
        ruleset that ships with FathomFox", or an object with a
        ``pathlib.Path``-style ``open()`` method pointing at the user's
        ruleset. (Presumably a ``Path`` produced by the ``path_or_none``
        option callback -- confirm against ``..utils``.)

    The yielded file is opened in binary mode in both cases and is closed when
    the ``with`` block that entered this context manager exits.

    """
    if ruleset_path_or_none:
        with ruleset_path_or_none.open('rb') as ruleset_file:
            yield ruleset_file
    else:
        # Go get the default demo ruleset:
        with fathom_zip() as zip_file:
            # Close the archive when we are done with it rather than leaving
            # that to the garbage collector. The local is not called ``zip``
            # so the builtin stays usable.
            with ZipFile(zip_file) as archive:
                # Opens in binary mode:
                with archive.open('fathom_fox/src/rulesets.js') as default_ruleset:
                    yield default_ruleset
[default: vectors/training_yourTraineeId.json next to your ruleset]') 29 | @option('--delay', 30 | default=5, 31 | type=int, 32 | show_default=True, 33 | help='Number of seconds to wait for a page to load before vectorizing it') 34 | @option('--tabs', 35 | default=16, 36 | type=int, 37 | show_default=True, 38 | help='Number of concurrent browser tabs to use while vectorizing') 39 | @option('--show-browser', 40 | default=False, 41 | is_flag=True, 42 | help='Show browser window while vectorizing. (Browser runs in headless mode by default.)') 43 | @option('--buckets', '-b', 44 | default=10, 45 | type=int, 46 | show_default=True, 47 | help='Number of histogram buckets to use for non-boolean features') 48 | @option('rules', '--rule', 49 | type=str, 50 | multiple=True, 51 | help='The rule to graph. Can be repeated. Omitting this graphs all rules.') 52 | def histogram(training_set, ruleset, trainee, training_cache, delay, tabs, show_browser, buckets, rules): 53 | """Show a histogram of rule scores. 54 | 55 | We also break down what proportion of each bucket comprised positive or 56 | negative samples. Altogether, this gives you an idea whether a rule is 57 | broadly applicable, discriminatory, and spitting out what you expect. 
58 | 59 | """ 60 | training_set = Path(training_set) 61 | if training_set.is_dir(): 62 | if not ruleset: 63 | raise BadOptionUsage('ruleset', 'A --ruleset file must be specified when TRAINING_SET_FOLDER is passed a directory.') 64 | if not trainee: 65 | raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TRAINING_SET_FOLDER is passed a directory.') 66 | 67 | training_data = make_or_find_vectors( 68 | ruleset, 69 | trainee, 70 | training_set, 71 | training_cache, 72 | show_browser, 73 | 'training', 74 | delay, 75 | tabs) 76 | training_pages = training_data['pages'] 77 | x, y, num_yes, _ = tensors_from(training_pages) 78 | feature_names = training_data['header']['featureNames'] 79 | print_feature_report(feature_metrics(feature_names, x, y, buckets, rules or feature_names)) 80 | 81 | 82 | def feature_metrics(feature_names, x, y, buckets, enabled_rules): 83 | x_t = x.T # [[...feature0 values across all pages...], [...feature1 values...], ...]. 84 | for name, values in zip(feature_names, x_t): 85 | if name not in enabled_rules: 86 | continue 87 | is_boolean = is_boolean_feature(values) 88 | _, boundaries = numpy.histogram(values.numpy(), 89 | bins=2 if is_boolean else buckets) 90 | highest_boundary = boundaries[-1] 91 | bars = [] 92 | for boundary, (low_bound, high_bound) in zip(boundaries, pairwise(boundaries)): 93 | is_last_time = high_bound == highest_boundary 94 | 95 | # Whether each feature value is a member of this bucket. Last 96 | # interval is inclusive on the right. 
97 | x_is_for_this_bar = ((values >= low_bound) & 98 | ((values <= high_bound) if is_last_time else 99 | (values < high_bound))) 100 | 101 | y_for_this_bar = y.T[0].masked_select(x_is_for_this_bar) 102 | positives = (y_for_this_bar.numpy() == 1).sum() 103 | negatives = len(y_for_this_bar) - positives 104 | label = str(ceil(boundary)) if is_boolean else f'{boundary:.1f}' 105 | bars.append((label, positives, negatives)) 106 | yield name, bars 107 | 108 | 109 | def print_feature_report(metrics): 110 | def bar(length, label): 111 | """Return a bar of about the given length with the given label printed 112 | on it. 113 | 114 | We may cheat and expand a bar a bit to fit the label. 115 | 116 | """ 117 | if not label: 118 | # Don't expand a bar just to print a 0. The bar's absence serves. 119 | label = '' 120 | return ('{label: ^%i}' % length).format(label=label) 121 | 122 | term_width = get_terminal_size()[0] 123 | pos_style = style('', fg='black', bg='bright_green', bold=True, reset=False) 124 | neg_style = style('', fg='bright_white', bg='bright_black', bold=True, reset=False) 125 | style_reset = style('', reset=True) 126 | print(f'{pos_style} {style_reset} Positive Samples {neg_style} {style_reset} Negative Samples') 127 | for feature, bars in metrics: 128 | longest_bar = max((positives + negatives) for _, positives, negatives in bars) 129 | print('\n', style(feature, bold=True), sep='') 130 | longest_label = max(len(label) for label, _, _ in bars) 131 | longest_total = max(len(str(n + p)) for _, p, n in bars) 132 | # This could still be slightly short if bar() has to cheat any bar lengths: 133 | samples_per_char = longest_bar / (term_width - longest_label - longest_total - 4) 134 | for label, positives, negatives in bars: 135 | pos_length = int(round(positives / samples_per_char)) 136 | neg_length = int(round(negatives / samples_per_char)) 137 | padded_label = ('{label: >%i}' % longest_label).format(label=label) 138 | pos_bar = bar(pos_length, positives) 139 | neg_bar 
def is_boolean_feature(t):
    """Return whether a feature looks like a yes/no feature.

    :arg t: A 1-D Tensor holding a single feature's value across many samples

    The feature counts as boolean when every value is exactly 0 or exactly 1.

    """
    is_zero = t == 0
    is_one = t == 1
    # The minimum of a tensor of booleans is True only if all of them are.
    every_value_is_binary = (is_zero | is_one).min()
    return every_value_is_binary.item()
def label_html_tags_in_html_string(html: str, in_type: str) -> str:
    """
    Find all opening ``html`` tags in the HTML string and add a
    ``' data-fathom="${in_type}"'`` substring to each one.

    We do this by building a new HTML string with the inserted substring(s).

    The ``html`` tags are found using the HTMLParser class in Python's
    built-in html.parser library, so tags are recognized regardless of the
    case they are written in (``<html>``, ``<HTML>``, ...).

    :arg html: The markup of a whole page
    :arg in_type: The label to store in the ``data-fathom`` attribute. It is
        inserted verbatim, so it should not contain double quotes.
    :return: The markup with every opening ``html`` tag labeled. If there are
        no such tags, the input is returned unchanged.
    """
    parser = HTMLParserSubclass(in_type)
    parser.feed(html)

    new_html = html

    # Each original tag is swapped for its labeled version, one occurrence at
    # a time, in document order.
    for original_html_tag, new_html_tag in parser.html_tags_list:
        new_html = new_html.replace(original_html_tag, new_html_tag, 1)

    return new_html


class HTMLParserSubclass(HTMLParser):
    """Parser that collects every opening ``html`` tag along with a labeled
    replacement for it.

    After ``feed()``, ``html_tags_list`` holds ``(original_tag_text,
    labeled_tag_text)`` pairs in the order the tags were encountered.
    """

    def __init__(self, in_type, **kwargs):
        self.in_type = in_type
        self.html_tags_list = []
        super().__init__(**kwargs)

    def handle_starttag(self, tag, attrs):
        # HTMLParser lowercases ``tag``, but get_starttag_text() returns the
        # tag exactly as written in the document. Searching that text for the
        # literal 'html' therefore missed ``<HTML>`` and left such pages
        # unlabeled. Instead, insert the attribute right after the tag name,
        # which always starts just past the '<'.
        if tag == 'html':
            original_html_tag = self.get_starttag_text()
            end_of_name = 1 + len(tag)
            new_html_tag = (original_html_tag[:end_of_name] +
                            f' data-fathom="{self.in_type}"' +
                            original_html_tag[end_of_name:])
            self.html_tags_list.append((original_html_tag, new_html_tag))
URL paths to samples. 19 | 20 | Recursively list paths of HTML files in IN_DIRECTORY relative to 21 | , one path per line. If is not specified, 22 | paths are relative to IN_DIRECTORY. Optionally saves output to 23 | . 24 | 25 | This is useful for vectorizing samples using FathomFox. FathomFox expects 26 | input filenames copied into a text box with one filename per line and 27 | relative to some path you are serving files from using ``fathom serve``. 28 | 29 | """ 30 | if base_dir is None: 31 | base_dir = in_directory 32 | 33 | if out_file is not None: 34 | filenames_to_save = [] 35 | 36 | there_were_no_files = True 37 | for file in samples_from_dir(in_directory): 38 | there_were_no_files = False 39 | relative_path = file.relative_to(base_dir) 40 | if show_urls: 41 | with file.open() as open_file: 42 | print(relative_path, original_url(open_file)) 43 | else: 44 | print(relative_path) 45 | 46 | if out_file is not None: 47 | filenames_to_save.append(relative_path.as_posix() + '\n') 48 | 49 | if out_file is not None: 50 | if there_were_no_files: 51 | print(f'No .html files found in {in_directory}. Did not create {out_file.name}.') 52 | else: 53 | out_file.writelines(filenames_to_save) 54 | 55 | 56 | def original_url(open_file): 57 | """Return the original URL that FathomFox embedded in a given sample.""" 58 | # I started to write a clever loop to read only as much from each file as 59 | # we needed, but it turns out reading 67 entire unextracted samples takes 60 | # only 1.2s on my laptop. 
@command()
@argument('from_dir',
          type=Path(exists=True, file_okay=False, writable=True, dir_okay=True))
@argument('to_dir',
          type=Path(exists=True, file_okay=False, writable=True, dir_okay=True))
@argument('number', type=int)
def pick(from_dir, to_dir, number):
    """
    Randomly move samples to a training, validation, or test set.

    Move a random selection of HTML files and their extracted resources, if
    any, from one directory to another. Ignore hidden files.

    """
    # Make these strings into ``Path``s so they are easier to work with
    from_dir = pathlib.Path(from_dir)
    to_dir = pathlib.Path(to_dir)

    # Unlike the glob module, Path.glob() matches dotfiles, so hidden files
    # have to be filtered out explicitly to honor the promise made above.
    candidates = [file for file in from_dir.glob('*.html')
                  if not file.name.startswith('.')]

    # random.sample() raises a bare ValueError for these; give the user a
    # proper usage message instead of a traceback.
    if number < 0:
        raise UsageError('NUMBER must not be negative.')
    if number > len(candidates):
        raise UsageError(f'Tried to pick {number} samples, but {from_dir.as_posix()} contains only'
                         f' {len(candidates)}.')

    picked = sample(candidates, number)

    # Check every destination before moving anything so a conflict cannot
    # leave the samples half-moved.
    for file in picked:
        if ((from_dir / 'resources' / file.stem).exists() and
                (to_dir / 'resources' / file.stem).exists()):
            raise UsageError(f'Tried to make directory {(to_dir / "resources" / file.stem).as_posix()}, but it'
                             f' already exists. To protect against unwanted data loss, please move or remove the'
                             f' existing directory.')

    for file in picked:
        resources = from_dir / 'resources' / file.stem
        # If the file has resources, we must move those as well:
        if resources.exists():
            (to_dir / 'resources').mkdir(parents=True, exist_ok=True)
            # Pass plain strings: older versions of shutil.move() do not
            # handle Path objects in every code path.
            move(resources.as_posix(), (to_dir / 'resources' / file.stem).as_posix())
        move(file.as_posix(), to_dir.as_posix())
def decode_weights(ctx, param, value):
    """Validate a click option, making sure it's a valid JSON object with
    properly formatted "coeffs" and "bias" keys.

    :arg ctx: The click context (unused)
    :arg param: The click parameter being validated (unused)
    :arg value: The raw JSON string from the commandline
    :return: The decoded dict. "bias" and every coefficient are converted to
        floats so whole numbers in the JSON, like ``"bias": 0``, are accepted
        and behave like any other weight downstream.
    :raises BadParameter: if ``value`` is not JSON or is not shaped like
        ``{"coeffs": [["ruleName", number], ...], "bias": number}``

    """
    def is_number(candidate):
        # JSON integers decode to int, which is a perfectly good weight. bool
        # is a subclass of int, but true/false are not numbers in JSON.
        return isinstance(candidate, (int, float)) and not isinstance(candidate, bool)

    try:
        decoded_weights = loads(value)
    except JSONDecodeError:
        raise BadParameter('Weights must be a valid JSON object.')

    # Valid JSON that isn't an object (a bare number, say) would otherwise
    # blow up with a TypeError in the key checks below.
    if not isinstance(decoded_weights, dict):
        raise BadParameter('Weights must be a valid JSON object.')
    if 'coeffs' not in decoded_weights or 'bias' not in decoded_weights:
        raise BadParameter('Weights must contain "coeffs" and "bias" keys.')
    if not is_number(decoded_weights['bias']):
        raise BadParameter('Bias must be a float.')
    coeffs = decoded_weights['coeffs']
    if not (isinstance(coeffs, list) and
            all((isinstance(pair, list) and
                 len(pair) == 2 and
                 isinstance(pair[0], str) and
                 is_number(pair[1]))
                for pair in coeffs)):
        raise BadParameter('Coeffs must be a list of 2-element lists: [["ruleName", numericCoefficient], ...].')

    decoded_weights['bias'] = float(decoded_weights['bias'])
    decoded_weights['coeffs'] = [[name, float(coeff)] for name, coeff in coeffs]
    return decoded_weights
35 | 36 | :arg weights: A dict with coeff and bias keys, as the program takes from 37 | the commandline 38 | :arg num_outputs: The number of output nodes of the network, typically 1 39 | :arg feature_names: The ordered list of feature names so we can get the 40 | coeffs lined up with the feature order used by the vectors 41 | 42 | """ 43 | model = classifier(len(weights['coeffs']), num_outputs) 44 | coeffs = dict(weights['coeffs']) 45 | model.load_state_dict({'0.weight': tensor([[coeffs[f] for f in feature_names]]), 46 | '0.bias': tensor([weights['bias']])}) 47 | return model 48 | 49 | 50 | @command() 51 | @argument('testing_set', 52 | type=click.Path(exists=True, resolve_path=True), 53 | metavar='TESTING_SET_FOLDER') 54 | @argument('weights', callback=decode_weights) 55 | @option('--confidence-threshold', '-t', 56 | default=0.5, 57 | show_default=True, 58 | help='Threshold at which a sample is considered positive. Higher values decrease false positives and increase false negatives.') 59 | @option('--ruleset', '-r', 60 | type=click.Path(exists=True, dir_okay=False, resolve_path=True), 61 | callback=path_or_none, 62 | help='The rulesets.js file containing your rules. The file must have no imports except from fathom-web, so pre-bundle if necessary.') 63 | @option('--trainee', 64 | type=str, 65 | metavar='ID', 66 | help='The trainee ID of the ruleset you are testing. Usually, this is the same as the type you are testing.') 67 | @option('--testing-cache', 68 | type=click.Path(dir_okay=False, resolve_path=True), 69 | callback=path_or_none, 70 | help='Where to cache testing vectors to speed future testing runs. Any existing file will be overwritten. 
[default: vectors/testing_yourTraineeId.json next to your ruleset]') 71 | @option('--delay', 72 | default=5, 73 | type=int, 74 | show_default=True, 75 | help='Number of seconds to wait for a page to load before vectorizing it') 76 | @option('--tabs', 77 | default=16, 78 | type=int, 79 | show_default=True, 80 | help='Number of concurrent browser tabs to use while vectorizing') 81 | @option('--show-browser', 82 | default=False, 83 | is_flag=True, 84 | help='Show browser window while vectorizing. (Browser runs in headless mode by default.)') 85 | @option('--verbose', '-v', 86 | default=False, 87 | is_flag=True, 88 | help='Show per-tag diagnostics, even though that could ruin blinding for the test set.') 89 | def test(testing_set, weights, confidence_threshold, ruleset, trainee, testing_cache, delay, tabs, show_browser, verbose): 90 | """ 91 | Evaluate how well a trained ruleset does. 92 | 93 | TESTING_SET_FOLDER is a directory of labeled testing pages. It can also be, 94 | for backward compatibility, a JSON file of vectors from FathomFox's 95 | Vectorizer. 96 | 97 | WEIGHTS should be a JSON-formatted object, as follows. You can paste it 98 | directly from the output of trainer. 
99 | 100 | \b 101 | {"coeffs": [["nextAnchorIsJavaScript", 1.1627885103225708], 102 | ["nextButtonTypeSubmit", 4.613410949707031], 103 | ["nextInputTypeSubmit", 4.374269008636475]], 104 | \b 105 | "bias": -8.645608901977539} 106 | 107 | """ 108 | testing_set = Path(testing_set) 109 | if testing_set.is_dir(): 110 | if not ruleset: 111 | raise BadOptionUsage('ruleset', 'A --ruleset file must be specified when TESTING_SET_FOLDER is passed a directory.') 112 | if not trainee: 113 | raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TESTING_SET_FOLDER is passed a directory.') 114 | 115 | testing_data = make_or_find_vectors(ruleset, 116 | trainee, 117 | testing_set, 118 | testing_cache, 119 | show_browser, 120 | 'testing', 121 | delay, 122 | tabs) 123 | testing_pages = testing_data['pages'] 124 | x, y, num_yes, num_prunes = tensors_from(testing_pages) 125 | model = model_from_json(weights, len(y[0]), testing_data['header']['featureNames']) 126 | 127 | accuracy, false_positives, false_negatives = accuracy_per_tag(y, model(x), confidence_threshold, num_prunes) 128 | print(pretty_accuracy('Testing', accuracy, len(x), false_positives, false_negatives, num_yes + num_prunes)) 129 | 130 | if testing_pages and 'time' in testing_pages[0]: 131 | print(speed_readout(testing_pages)) 132 | 133 | if verbose: 134 | print('\nTesting per-tag results:') 135 | print_per_tag_report([per_tag_metrics(page, model, confidence_threshold) for page in testing_pages]) 136 | -------------------------------------------------------------------------------- /cli/fathom_web/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/cli/fathom_web/test/__init__.py -------------------------------------------------------------------------------- /cli/fathom_web/test/resources/train/vectorize_ruleset.js: 
/**
 * Coefficients for each trainee, keyed by trainee ID. Each value is a list of
 * [ruleName, coefficient] pairs.
 */
const coefficients = {
    secret: [
        ['hasSecretParagraph', 0.0],
    ],
};

/** Biases, as [type, bias] pairs. */
const biases = [
    ['secret', 0.0],
];

/**
 * Return whether `haystack` contains `needle`, ignoring case.
 *
 * @param {string} haystack - The text to search
 * @param {string} needle - The text to look for
 * @returns {boolean}
 */
function caselessIncludes(haystack, needle) {
    return haystack.toLowerCase().includes(needle.toLowerCase());
}

/**
 * Scoring callback: return whether the fnode's element has the word "secret"
 * in its inner text.
 *
 * @param {object} fnode - Anything exposing `element.innerText`
 * @returns {boolean}
 */
function hasSecretParagraph(fnode) {
    return caselessIncludes(fnode.element.innerText, 'secret');
}

/**
 * Build the ruleset that recognizes "secret" pages.
 *
 * @param {Array} coeffs - [ruleName, coefficient] pairs
 * @param {Array} biases - [type, bias] pairs
 * @returns The value returned by fathom-web's `ruleset()`
 */
function makeRuleset(coeffs, biases) {
    return ruleset(
        [
            rule(dom('html'), type('secret')),
            // hasSecretParagraph never reads `this`, so it is passed unbound.
            rule(type('secret'), score(hasSecretParagraph), {name: 'hasSecretParagraph'}),
            rule(type('secret'), out('secret')),
        ],
        coeffs,
        biases
    );
}

const trainees = new Map();
const VIEWPORT_SIZE = {width: 1680, height: 950};

const FEATURES = ['secret'];
for (const feature of FEATURES) {
    // Built inline rather than in a local named `ruleset`, which would shadow
    // the fathom-web `ruleset` function.
    trainees.set(feature, {
        coeffs: new Map(coefficients[feature]),
        viewportSize: VIEWPORT_SIZE,
        vectorType: feature,
        // Use this trainee's own coefficients so that adding an entry to
        // FEATURES does not silently reuse the "secret" ones.
        rulesetMaker: () => makeRuleset([...coefficients[feature]], biases),
    });
}

export default trainees;

Contains the secret word.

9 | 10 | 11 | -------------------------------------------------------------------------------- /cli/fathom_web/test/resources/train/vectorize_sample_2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Sample 2 6 | 7 | 8 |

Contains a boring sentence.

9 | 10 | 11 | -------------------------------------------------------------------------------- /cli/fathom_web/test/test_extract.py: -------------------------------------------------------------------------------- 1 | from ..commands.extract import BASE64_DATA_PATTERN, decode 2 | 3 | 4 | def test_common_example(): 5 | """Confirm we handle the well-behaving case""" 6 | mime_type = 'image/png' 7 | base64_string = 'aorienstar/tar/ararnsoine98daQAAAIST+++/rstienf=' 8 | test_string = f'data:{mime_type};base64,{base64_string}' 9 | matches = get_base64_regex_matches(test_string) 10 | assert len(matches) == 1 11 | assert matches[0].group('mime') == mime_type 12 | assert matches[0].group('string') == base64_string 13 | 14 | 15 | def get_base64_regex_matches(from_string): 16 | """Helper method to get the list of matches from the given string. 17 | 18 | We need to use finditer() here because it returns Match objects while 19 | findall() does not, and we use Match objects in ``fathom extract``. 20 | """ 21 | return list(BASE64_DATA_PATTERN.finditer(from_string)) 22 | 23 | 24 | def test_empty_string(): 25 | """Some base64 strings are actually empty""" 26 | test_string = 'data:;base64,' 27 | matches = get_base64_regex_matches(test_string) 28 | assert len(matches) == 0 29 | 30 | 31 | def test_presence_of_charset(): 32 | """Some base64 strings contain a character set specification""" 33 | test_string = 'data:image/png; charset=utf-8;base64,iVBORw0K' 34 | matches = get_base64_regex_matches(test_string) 35 | assert len(matches) == 1 36 | 37 | 38 | def test_string_with_multiple_base64_strings(): 39 | test_string = ' ' 40 | matches = get_base64_regex_matches(test_string) 41 | assert len(matches) == 2 42 | 43 | 44 | def test_string_with_percent_encoded_equals_signs_is_found(): 45 | """Some base64 strings have their padding characters (=) percent 46 | encoded so they appear as %3D. Our regex should capture them. 
47 | """ 48 | base64_string = 'R0lGODlhAQABAID/AMDAwAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw%3D%3D' 49 | test_string = f'url("data:image/gif;base64,{base64_string}")' 50 | matches = get_base64_regex_matches(test_string) 51 | assert len(matches) == 1 52 | assert matches[0].group('string') == base64_string 53 | 54 | 55 | def test_string_with_percent_encoded_equals_signs_is_decoded(): 56 | """Some base64 strings have their padding characters (=) percent 57 | encoded so they appear as %3D. We should be able to decode them. 58 | 59 | At the moment, we will trust the decoding is correct, we just want 60 | to make sure no errors are raised. 61 | """ 62 | base64_string = 'R0lGODlhAQABAID/AMDAwAAAACH5BAEAAAAALAAAAAABAAEAAAICRAEAOw%3D%3D' 63 | decode(base64_string) 64 | 65 | 66 | def test_unpadded_string_is_decoded(): 67 | """Some base64 strings do not have padding characters. Python's 68 | base64.b64decode() expects the string to be padded to a number of 69 | characters that is a multiple of four. 70 | 71 | At the moment, we will trust the decoding is correct, we just want 72 | to make sure no errors are raised. 
73 | """ 74 | base64_string = 'R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs' 75 | decode(base64_string) 76 | -------------------------------------------------------------------------------- /cli/fathom_web/test/test_label.py: -------------------------------------------------------------------------------- 1 | from ..commands.label import label_html_tags_in_html_string 2 | 3 | 4 | IN_TYPE = 'test' 5 | 6 | 7 | def test_opening_html_tag_has_no_attributes(): 8 | """Some HTML tags may not have any attributes""" 9 | input_string = '' 10 | expected_string = f'' 11 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 12 | 13 | 14 | def test_opening_html_tag_has_attributes(): 15 | """Most HTML tags have at least one attribute""" 16 | input_string = '' 17 | expected_string = f'' 18 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 19 | 20 | 21 | def test_html_string_has_multiple_opening_html_tags(): 22 | """Some HTML tags may have multiple HTML tags""" 23 | input_string = '
' 24 | expected_string = f'
' 25 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 26 | 27 | 28 | def test_html_string_has_right_angle_bracket_as_attribute_value(): 29 | """Some HTML tags may contain a right angle bracket in an unexpected location.""" 30 | input_string = '' 31 | expected_string = f'' 32 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 33 | 34 | 35 | def test_html_string_is_multiline(): 36 | """Some HTML tags may span multiple lines""" 37 | input_string = '' 41 | expected_string = f'' 45 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 46 | 47 | 48 | def test_html_string_has_extra_spaces(): 49 | """ 50 | Some HTML tags may have extra spaces inside the HTML tag. Note that having a space 51 | between the '<' and the tag name (e.g. 'html') is not valid HTML. 52 | """ 53 | input_string = '' 54 | expected_string = f'' 55 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 56 | 57 | 58 | def test_html_string_has_comments(): 59 | """ 60 | Some HTML tags may have HTML comments throughout. Note that comments cannot 61 | occur within a tag. 62 | """ 63 | input_string = '\n' + \ 64 | '' 65 | expected_string = f'\n' + \ 66 | '' 67 | assert label_html_tags_in_html_string(input_string, IN_TYPE) == expected_string 68 | -------------------------------------------------------------------------------- /cli/fathom_web/test/test_list.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | from ..commands.list import list as list_main 4 | 5 | 6 | def test_end_to_end(tmp_path): 7 | """Test expected outcome when using all of the optional parameters""" 8 | # Make temporary in_directory and base_dir directories 9 | base_dir, in_directory = make_directories(tmp_path) 10 | 11 | # Make HTML files in in_directory in two separate subdirectories 12 | # so we can exercise the recursive option. 
def make_html_files(in_directory):
    """Create ``source_a/`` and ``source_b/`` under ``in_directory``, each
    holding empty ``1.html`` and ``2.html`` files, for our fathom list calls.

    Return the four file paths in the order (a1, a2, b1, b2).

    """
    html_files = []
    for folder_name in ('source_a', 'source_b'):
        folder = in_directory / folder_name
        folder.mkdir()
        for file_name in ('1.html', '2.html'):
            html_file = folder / file_name
            html_file.touch()
            html_files.append(html_file)
    return tuple(html_files)
77 | in_directory.as_posix(), 78 | '-o', 79 | f'{out_file.as_posix()}', 80 | ], 81 | ) 82 | assert result.exit_code == 0 83 | 84 | assert 'No .html files found' in result.output 85 | 86 | 87 | def test_without_base_dir(tmp_path): 88 | """Test omission of base-dir parameter""" 89 | # Make temporary in_directory and base_dir directories 90 | base_dir, in_directory = make_directories(tmp_path) 91 | 92 | # Make HTML files in in_directory in two separate subdirectories 93 | # so we can exercise the recursive option. 94 | a1, a2, b1, b2 = make_html_files(in_directory) 95 | 96 | # Make the out_file we will save the output to 97 | out_file = (base_dir / 'out_file.txt') 98 | 99 | # Run fathom list 100 | result = CliRunner().invoke( 101 | list_main, 102 | [ 103 | in_directory.as_posix(), 104 | '-o', 105 | f'{out_file.as_posix()}', 106 | ], 107 | ) 108 | assert result.exit_code == 0 109 | 110 | expected_file_contents = { 111 | a1.relative_to(in_directory).as_posix(), 112 | a2.relative_to(in_directory).as_posix(), 113 | b1.relative_to(in_directory).as_posix(), 114 | b2.relative_to(in_directory).as_posix(), 115 | } 116 | actual_file_contents = set(out_file.read_text().splitlines()) 117 | assert expected_file_contents == actual_file_contents 118 | 119 | 120 | def test_in_directory_does_not_exist(): 121 | """Test giving an invalid path for in_directory causes an error""" 122 | # Run fathom list 123 | result = CliRunner().invoke( 124 | list_main, 125 | [ 126 | 'fake_in_dir', 127 | ], 128 | ) 129 | # Assert the program exited with an error message about in directory not existing 130 | assert result.exit_code == 2 131 | # Different versions of click use different quotes: 132 | assert ('"fake_in_dir" does not exist.' in result.output or 133 | "'fake_in_dir' does not exist." 
in result.output) 134 | 135 | 136 | def test_base_dir_does_not_exist(tmp_path): 137 | """Test giving an invalid path for base-dir causes an error""" 138 | _, in_directory = make_directories(tmp_path) 139 | 140 | # Run fathom list 141 | result = CliRunner().invoke( 142 | list_main, 143 | [ 144 | in_directory.as_posix(), 145 | '-b', 146 | 'fake_base_dir', 147 | ], 148 | ) 149 | # Assert the program exited with an error message about base_dir not existing 150 | assert result.exit_code == 2 151 | assert ('"fake_base_dir" does not exist.' in result.output or 152 | "'fake_base_dir' does not exist." in result.output) 153 | -------------------------------------------------------------------------------- /cli/fathom_web/test/test_pick.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | from ..commands.pick import pick 4 | 5 | 6 | def test_end_to_end(tmp_path): 7 | """ 8 | Given a directory of three files, use ``fathom pick`` to move two files, and 9 | check that the files and their potential resources directories have moved. 10 | """ 11 | # Make temporary source and destination directories 12 | source = tmp_path / 'source' 13 | source.mkdir() 14 | destination = tmp_path / 'destination' 15 | destination.mkdir() 16 | 17 | # Add files to the source directory 18 | (source / '1.html').touch() 19 | (source / '2.html').touch() 20 | (source / '3.html').touch() 21 | 22 | # Add resource directories for files 1 and 2 23 | (source / 'resources' / '1').mkdir(parents=True) 24 | (source / 'resources' / '1' / '1.png').touch() 25 | (source / 'resources' / '1' / '2.css').touch() 26 | (source / 'resources' / '2').mkdir(parents=True) 27 | (source / 'resources' / '2' / '1.png').touch() 28 | (source / 'resources' / '2' / '2.css').touch() 29 | 30 | # Run fathom pick to move 2 files from source to destination 31 | runner = CliRunner() 32 | # Arguments to invoke() must be passed as strings (this isn't documented!!!) 
33 | result = runner.invoke(pick, [source.as_posix(), destination.as_posix(), '2']) 34 | assert result.exit_code == 0 35 | 36 | # Check the correct number of files have moved 37 | files_in_source = list(source.glob('*.html')) 38 | assert len(files_in_source) == 1 39 | files_in_destination = list(destination.glob('*.html')) 40 | assert len(files_in_destination) == 2 41 | 42 | # Check any resource directories have moved 43 | if (destination / '1.html').exists(): 44 | assert (destination / 'resources' / '1' / '1.png').exists() 45 | assert (destination / 'resources' / '1' / '2.css').exists() 46 | if (destination / '2.html').exists(): 47 | assert (destination / 'resources' / '2' / '1.png').exists() 48 | assert (destination / 'resources' / '2' / '2.css').exists() 49 | 50 | # Make sure we didn't lose any files 51 | files_in_directories = {file.name for file in files_in_source} | {file.name for file in files_in_destination} 52 | assert {'1.html', '2.html', '3.html'} == files_in_directories 53 | 54 | 55 | def test_resource_directory_path_collision(tmp_path): 56 | """ 57 | Ensure an exception is raised when moving a resource directory 58 | if that directory already exists in the destination directory. 
59 | """ 60 | # Make temporary source and destination directories 61 | source = tmp_path / 'source' 62 | source.mkdir() 63 | destination = tmp_path / 'destination' 64 | destination.mkdir() 65 | 66 | # Add the file to the source directory 67 | (source / '1.html').touch() 68 | 69 | # Add the resource directory for our file 70 | (source / 'resources' / '1').mkdir(parents=True) 71 | (source / 'resources' / '1' / '1.png').touch() 72 | (source / 'resources' / '1' / '2.css').touch() 73 | 74 | # Add a resource directory for the same file in the destination directory 75 | (destination / 'resources' / '1').mkdir(parents=True) 76 | 77 | # Run fathom pick to move the only file from source to destination 78 | runner = CliRunner() 79 | # Arguments to invoke() must be passed as strings (this isn't documented!!!) 80 | result = runner.invoke(pick, [source.as_posix(), destination.as_posix(), '1']) 81 | 82 | # Assert the program exited with a UsageError and our error message is in the program output 83 | assert result.exit_code == 2 84 | assert 'Error: Tried to make directory' in result.output 85 | 86 | # Check that our files haven't moved 87 | files_in_source = list(source.glob('*.html')) 88 | assert len(files_in_source) == 1 89 | assert (source / 'resources' / '1' / '1.png').exists() 90 | assert (source / 'resources' / '1' / '2.css').exists() 91 | files_in_destination = list(destination.glob('*.html')) 92 | assert len(files_in_destination) == 0 93 | assert (destination / 'resources' / '1').exists() 94 | -------------------------------------------------------------------------------- /cli/fathom_web/test/test_test.py: -------------------------------------------------------------------------------- 1 | from click import BadParameter 2 | from pytest import raises 3 | 4 | from ..commands.test import decode_weights 5 | 6 | 7 | def test_expected_input_format(): 8 | """Test that an example of good input decodes as expected""" 9 | json_string = '{"coeffs": [["rule1", 0.1], ["rule2", 0.2]], 
"bias": 0.5}' 10 | expected_dict = { 11 | 'coeffs': [ 12 | ['rule1', 0.1], 13 | ['rule2', 0.2], 14 | ], 15 | 'bias': 0.5, 16 | } 17 | decoded_weights = decode_weights(None, None, json_string) 18 | assert decoded_weights == expected_dict 19 | 20 | 21 | def test_not_json(): 22 | run_invalid_json('not_json', r'.*valid.*') 23 | 24 | 25 | def run_invalid_json(json_string, assertion_match_regex): 26 | """Helper method to run `decode_weights()` with invalid input""" 27 | with raises(BadParameter, match=assertion_match_regex): 28 | decode_weights(None, None, json_string) 29 | 30 | 31 | def test_no_coeffs(): 32 | run_invalid_json('{"bias": 0.5}', r'.*contain.*coeffs.*') 33 | 34 | 35 | def test_no_bias(): 36 | run_invalid_json('{"coeffs": [["rule", 0.5]]}', r'.*contain.*bias.*') 37 | 38 | 39 | def test_coeffs_not_list(): 40 | run_invalid_json('{"coeffs": {"not": "a_list"}, "bias": 0.5}', r'Coeffs must be a list of 2-element lists.*') 41 | 42 | 43 | def test_coeffs_not_pairs(): 44 | run_invalid_json( 45 | '{"coeffs": [["rule1"], ["rule2", 0.2]], "bias": 0.5}', 46 | r'Coeffs must be a list of 2-element lists.*' 47 | ) 48 | 49 | 50 | def test_rulename_not_string(): 51 | run_invalid_json( 52 | '{"coeffs": [[0.2, 0.2], ["rule2", 0.2]], "bias": 0.5}', 53 | r'Coeffs must be a list of 2-element lists.*' 54 | ) 55 | 56 | 57 | def test_coeff_value_not_float(): 58 | run_invalid_json( 59 | '{"coeffs": [["rule1", "rule1"], ["rule2", 0.2]], "bias": 0.5}', 60 | r'Coeffs must be a list of 2-element lists.*' 61 | ) 62 | -------------------------------------------------------------------------------- /cli/fathom_web/test/test_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import operator 3 | 4 | from click.testing import CliRunner 5 | 6 | from ..commands.train import exclude_indices, train, find_optimal_cutoff, single_cutoff, possible_cutoffs, accuracy_per_tag 7 | from ..utils import tensor 8 | 9 | 10 | def test_exclude_indices(): 
11 | assert exclude_indices([0, 2, 3], ['a', 'b', 'c', 'd', 'e', 'f']) == ['b', 'e', 'f'] # omit first, last, and some consecutive 12 | assert exclude_indices([1], ['a', 'b', 'c', 'd']) == ['a', 'c', 'd'] # leave ends alone 13 | assert exclude_indices([], ['a', 'b', 'c']) == ['a', 'b', 'c'] # do nothing 14 | assert exclude_indices([0], ['a']) == [] # omit everything 15 | 16 | 17 | def test_auto_vectorization_smoke(tmp_path): 18 | """Make sure we get through auto-vectorization of at least the training 19 | set.""" 20 | test_dir = os.path.dirname(os.path.abspath(__file__)) 21 | 22 | runner = CliRunner() 23 | result = runner.invoke( 24 | train, 25 | [ 26 | f'{test_dir}/resources/train/', 27 | '--ruleset', 28 | f'{test_dir}/resources/train/vectorize_ruleset.js', 29 | '--trainee', 30 | 'secret', 31 | '--training-cache', 32 | f'{tmp_path.as_posix()}/training_vectors.json', 33 | ] 34 | ) 35 | assert result.exit_code == 0 36 | assert (tmp_path / 'training_vectors.json').exists() 37 | 38 | 39 | def test_possible_cutoffs(): 40 | # single cutoff 41 | y_pred = tensor([1.2512]) 42 | expected = [0.78] 43 | possibles = possible_cutoffs(y_pred) 44 | assert possibles == expected 45 | 46 | # Reduces to single cutoff since the midpoint is used. 
47 | y_pred = tensor([1.2512, 1.2516]) 48 | expected = [0.78] 49 | possibles = possible_cutoffs(y_pred) 50 | assert possibles == expected 51 | 52 | # Reduces to a single cutoff (due to rounding) from 2 cutoffs (due to midpoint) 53 | y_pred = tensor([1.2512, 1.2516, 1.255]) 54 | expected = [0.78] 55 | possibles = possible_cutoffs(y_pred) 56 | assert possibles == expected 57 | 58 | # Partial reduction in number of cutoffs 59 | y_pred = tensor([-2.1605, -0.5696, 0.4886, 0.8633, -1.3479, 60 | -0.5813, -0.5696, 0.5696, -0.5950, -0.5696]) 61 | expected = [0.15, 0.28, 0.36, 0.49, 0.63, 0.67] 62 | possibles = possible_cutoffs(y_pred) 63 | assert possibles == expected 64 | 65 | # No reduction in number of cutoffs (since midpoints are used, 3 cutoffs are calculated pre rounding). 66 | y_pred = tensor([-2, -2.25, -1.95, 1.251]) 67 | expected = [0.11, 0.12, 0.45] 68 | possibles = possible_cutoffs(y_pred) 69 | assert possibles == expected 70 | 71 | 72 | def test_find_optimal_cutoff_single_cutoff_with_highest_accuracy(): 73 | # This test is doing the steps completed by find_optimal_cutoff separately to 74 | # determine the expected value. The functions used are covered by other tests. 
75 | y_pred = tensor([-2.1605, -0.5696, 0.4886, 0.8633, -1.3479, -0.5813, -0.5696, 0.5696, -0.5950, -0.5696]) 76 | y = tensor([0., 0., 1., 1., 0., 0., 0., 1., 0., 0.]) 77 | 78 | # Determining the expected_cutoff 79 | expected_cutoffs = determine_expected_cutoffs(y, y_pred) 80 | 81 | # Expecting a single cutoff for the best accuracy 82 | assert len(expected_cutoffs) == 1 83 | expected_cutoff = expected_cutoffs[0] 84 | 85 | # Now that we have the expected expected_cutoff check the value returned from 86 | # find_optimal_cutoff against it (this is the real test) 87 | optimal_cutoff = find_optimal_cutoff(y, y_pred, num_prunes=0) 88 | assert optimal_cutoff == expected_cutoff 89 | # and a final double check 90 | assert optimal_cutoff == 0.49 91 | 92 | 93 | def test_find_optimal_cutoff_multiple_cutoffs_with_highest_accuracy(): 94 | # This test is doing the steps completed by find_optimal_cutoff separately to 95 | # determine the expected value. The functions used are covered by other tests. 96 | y_pred = tensor([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.25, 2, 2.5]) 97 | y = tensor([0., 1., 0., 1., 0., 1., 0., 1., 0., 1.]) 98 | 99 | # Determining the expected_cutoff 100 | expected_cutoffs = determine_expected_cutoffs(y, y_pred) 101 | 102 | # Verifying that there are more than 1 cutoff with the best accuracy (the basis for this test case) 103 | assert len(expected_cutoffs) > 1 104 | 105 | # From the list of best cutoffs get the single value (single_cutoff is tested elsewhere) 106 | expected_cutoff = single_cutoff(expected_cutoffs) 107 | 108 | # Now that we have the expected cutoff check the value returned from 109 | # find_optimal_cutoff against it (this is the real test) 110 | optimal_cutoff = find_optimal_cutoff(y, y_pred, num_prunes=0) 111 | assert optimal_cutoff == expected_cutoff 112 | # and a final double check 113 | assert optimal_cutoff == 0.56 114 | 115 | 116 | def test_single_cutoff(): 117 | # single 118 | cutoffs = [0] 119 | assert single_cutoff(cutoffs) == 0 120 | 121 | 
def determine_expected_cutoffs(y, y_pred):
    """Return every candidate cutoff that ties for the best accuracy.

    This deliberately tracks the best cutoff differently from
    find_optimal_cutoff so the two can be checked against each other: score
    each cutoff from possible_cutoffs() with accuracy_per_tag() (no prunes),
    then keep the ones whose accuracy equals the maximum.

    """
    accuracy_by_cutoff = {}
    for cutoff in possible_cutoffs(y_pred):
        accuracy, _false_positives, _false_negatives = accuracy_per_tag(y, y_pred, cutoff, num_prunes=0)
        accuracy_by_cutoff[cutoff] = accuracy

    # There could be more than 1 cutoff sharing the top accuracy.
    best_accuracy = max(accuracy_by_cutoff.values())
    return [cutoff
            for cutoff, accuracy in accuracy_by_cutoff.items()
            if accuracy == best_accuracy]
def tensors_from(pages, shuffle=False):
    """Turn vectorized pages into training/testing data.

    :arg pages: A list of page dicts, each with a ``nodes`` list of tag dicts
    :arg shuffle: Whether to visit the pages in random order, which can
        improve training performance
    :returns: A tuple of (inputs, correct outputs, number of tags that are
        recognition targets, number of tags that were prematurely pruned)

    """
    ordered_pages = sample(pages, len(pages)) if shuffle else pages
    features = []
    labels = []
    target_count = 0
    pruned_count = 0
    for page in ordered_pages:
        for tag in page['nodes']:
            if tag.get('pruned'):
                # Pruned tags carry no features; just tally them.
                pruned_count += 1
                continue
            features.append(tag['features'])
            # Hard 0/1 labels: 0.1 and 0.9 were tried instead and did much
            # worse.
            if tag['isTarget']:
                labels.append([1])
                target_count += 1
            else:
                labels.append([0])
    return tensor(features), tensor(labels), target_count, pruned_count
def fit_unicode(string, width):
    """Truncate or pad a string to exactly ``width`` terminal columns, taking
    into account that some unicode chars are double-width.

    If a double-width char would straddle the cutoff, it is dropped and a
    space fills the leftover column.

    :arg string: The text to fit. May be empty.
    :arg width: The number of columns to fill. Anything below 1 yields ''.
    :returns: A string that occupies ``width`` columns

    """
    num_chars = 0  # how many leading chars of ``string`` we keep
    width_so_far = 0  # how many columns those chars occupy
    for char in string:
        # Both Wide and Fullwidth chars take up 2 columns.
        char_width = 2 if east_asian_width(char) in ('W', 'F') else 1
        if width_so_far + char_width > width:
            break
        num_chars += 1
        width_so_far += char_width
    return string[:num_chars] + (' ' * (width - width_so_far))
def samples_from_dir(in_dir): 108 | """Return an iterable of Paths to samples found in ``in_dir``, 109 | recursively.""" 110 | for dir_path, dirs, files in walk(in_dir): 111 | try: 112 | # Skip resources/ folders. Sometimes they contain .html files, and 113 | # those aren't samples. 114 | dirs.remove('resources') 115 | except ValueError: 116 | pass 117 | yield from (Path(dir_path) / file for file in files 118 | if file.endswith('.html')) 119 | 120 | 121 | def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE): 122 | """Yield pieces of data from a file-like object until EOF.""" 123 | while True: 124 | chunk = file.read(size) 125 | if not chunk: 126 | break 127 | yield chunk 128 | 129 | 130 | def path_or_none(ctx, param, value): 131 | return None if value is None else Path(value) 132 | -------------------------------------------------------------------------------- /cli/setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 -------------------------------------------------------------------------------- /cli/setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import setup, find_packages 3 | 4 | 5 | setup( 6 | name='fathom-web', 7 | version='3.7.3', 8 | description='Commandline tools for training Fathom rulesets', 9 | long_description=open('README.rst', 'r', encoding='utf8').read(), 10 | author='Erik Rose', 11 | author_email='erik@mozilla.com', 12 | license='MPL', 13 | packages=find_packages(exclude=['*.test']), 14 | url='https://mozilla.github.io/fathom/', 15 | install_requires=[ 16 | 'click>=7.0,<8.0', 17 | 'more-itertools>=8.2,<9.0', 18 | 'numpy>=1.18.1,<2.0', 19 | 'filelock>=3.0.12', 20 | 'scikit-learn>=0.22.2', 21 | 'selenium>=3.141.0', 22 | 'tensorboardX>=1.6,<2.0', 23 | 'torch>=1.0,<2.0', 24 | 'protobuf <= 3.20.1', 25 | ], 26 | dependency_links=[ 27 | 'https://download.pytorch.org/whl/cu110/torch_stable.html' 28 
| ], 29 | entry_points={'console_scripts': [ 30 | 'fathom = fathom_web.commands:fathom', 31 | ]}, 32 | package_data={'': ['fathom.zip']}, 33 | classifiers=[ 34 | 'Intended Audience :: Developers', 35 | 'Natural Language :: English', 36 | 'Development Status :: 5 - Production/Stable', 37 | 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)', 38 | 'Programming Language :: Python :: 3' 39 | ], 40 | keywords=['machine learning', 'ml', 'semantic extraction'], 41 | ) 42 | -------------------------------------------------------------------------------- /docs/clustering.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Clustering 3 | ========== 4 | 5 | Fathom provides a flexible clustering algorithm, useful for finding nodes that are bunched together spatially or according to some other metric. By default, it groups nodes based on their proximity and ancestry. It is documented here as top-level functions but is also available directly within rulesets as :func:`bestCluster`, which has the advantage of letting you direct its results to further rules. 6 | 7 | The clustering routines hang off a ``clusters`` object in the top-level Fathom module. To import them, do something like this: 8 | 9 | .. code-block:: js 10 | 11 | const { 12 | clusters: { distance }, 13 | } = require('fathom-web'); 14 | 15 | This will result in a top-level ``distance`` symbol. 16 | 17 | .. note:: 18 | 19 | Clustering is computationally expensive (at least O(n^2)). It is powerful, but it should be used only when more efficient alternatives are exhausted. 20 | 21 | .. autofunction:: clusters 22 | 23 | Example: 24 | 25 | .. code-block:: js 26 | 27 | const {clusters} = require('fathom-web/clusters'); 28 | theClusters = clusters(anArrayOfNodes, 4); 29 | 30 | In the above, 4 is the distance beyond which Fathom will decide nodes belong in separate clusters. Turn it up to more aggressively invite nearby nodes into a cluster. 
Turn it down to keep clusters smaller. The output looks like a list of lists, with each list representing a cluster: 31 | 32 | .. code-block:: js 33 | 34 | [[nodeA, nodeB, nodeC], 35 | [nodeD]] 36 | 37 | Various factors influence the measured distance between nodes. The first is the obvious one: topological distance, the number of steps along the DOM tree from one node to another. 38 | 39 | The second is structural similarity. In the following, the divs ``a`` and ``b`` are farther apart… 40 | 41 | .. code-block:: html 42 | 43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | 52 | …than they would be if the ``center`` tag were a ``div`` as well: 53 | 54 | .. code-block:: html 55 | 56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 | 65 | Third is depth disparity. Nodes are considered farther from each other if they are not the same distance from the root. 66 | 67 | Finally is the presence of "stride" nodes, which are siblings or siblings-of-ancestors that lie 68 | between 2 nodes. (These are the nodes that would appear between the 2 nodes in a straightforward rendering of the page.) Each stride node makes it less likely that the 2 nodes will be together in a cluster. 69 | 70 | The costs for each factor can be customized by wrapping :func:`distance` in an arrow function and passing it as the third param. 71 | 72 | .. note:: 73 | 74 | ``clusters()`` can actually cluster anything, not just DOM nodes. All you need to do is pass in a suitable distance function as the ``getDistance`` param. 75 | 76 | .. autofunction:: distance(fnodeA, fnodeB, {differentDepthCost = 2, differentTagCost = 2, sameTagCost = 1, strideCost = 1, additionalCost = (fnodeA, fnodeB) => 0}) 77 | 78 | .. autofunction:: euclidean 79 | -------------------------------------------------------------------------------- /docs/commands/extract.rst: -------------------------------------------------------------------------------- 1 | .. click:: fathom_web.commands.extract:extract 2 | :prog: fathom extract 3 | -------------------------------------------------------------------------------- /docs/commands/fox.rst: -------------------------------------------------------------------------------- 1 | .. click:: fathom_web.commands.fox:fox 2 | :prog: fathom fox 3 | -------------------------------------------------------------------------------- /docs/commands/histogram.rst: -------------------------------------------------------------------------------- 1 | .. click:: fathom_web.commands.histogram:histogram 2 | :prog: fathom histogram 3 | 4 | ---- 5 | 6 | .. 
image:: ../img/histogram.png 7 | -------------------------------------------------------------------------------- /docs/commands/label.rst: -------------------------------------------------------------------------------- 1 | .. click:: fathom_web.commands.label:label 2 | :prog: fathom label 3 | -------------------------------------------------------------------------------- /docs/commands/list.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | 3 | This command is rarely needed anymore. As of Fathom 3.4, vectorization happens automatically when you run a command that needs it. 4 | 5 | .. click:: fathom_web.commands.list:list 6 | :prog: fathom list 7 | -------------------------------------------------------------------------------- /docs/commands/pick.rst: -------------------------------------------------------------------------------- 1 | .. click:: fathom_web.commands.pick:pick 2 | :prog: fathom pick 3 | -------------------------------------------------------------------------------- /docs/commands/serve.rst: -------------------------------------------------------------------------------- 1 | .. note:: 2 | 3 | This command is rarely needed anymore. As of Fathom 3.4, vectorization happens automatically when you run a command that needs it. 4 | 5 | .. click:: fathom_web.commands.serve:serve 6 | :prog: fathom serve 7 | -------------------------------------------------------------------------------- /docs/commands/test.rst: -------------------------------------------------------------------------------- 1 | .. click:: fathom_web.commands.test:test 2 | :prog: fathom test 3 | -------------------------------------------------------------------------------- /docs/commands/train.rst: -------------------------------------------------------------------------------- 1 | .. 
click:: fathom_web.commands.train:train 2 | :prog: fathom train 3 | -------------------------------------------------------------------------------- /docs/debugging.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Debugging 3 | ========= 4 | 5 | Setting Breakpoints 6 | =================== 7 | 8 | If the :doc:`trainer` reports JavaScript errors, you've probably got a bug in your ruleset code. If you can't find it by examination and need to place a breakpoint, the tool of choice is the FathomFox Evaluator. 9 | 10 | #. Run :doc:`fathom fox`, and pass it your ruleset:: 11 | 12 | fathom fox -r rulesets.js 13 | 14 | #. Use the instance of Firefox that comes up to open a page that you think will reproduce the problem. 15 | #. Show the dev tools, and navigate to the Debugger panel. 16 | #. In the disclosure tree to the left, disclose FathomFox, and select ``rulesets.js``. 17 | #. Scroll to the bottom, past the minified mess, and you’ll see your ruleset code. Place a breakpoint as you like, probably in one of your scoring callbacks. 18 | #. Invoke the Evaluator from the Fathom toolbar menu. 19 | #. Click Evaluate to run the ruleset over the loaded tabs. 20 | 21 | You’ll end up in the debugger, paused at your breakpoint. 22 | 23 | Identifying Misrecognized Elements 24 | ================================== 25 | 26 | .. note:: 27 | Make sure you have the latest trained coefficients and biases pasted into your ruleset before you do this, or recognition won't work well. 28 | 29 | FathomFox's Evaluator can point out misrecognized elements, in case the tag excerpts emitted by the trainer are insufficient to identify them. To use the Evaluator: 30 | 31 | #. Open an instance of Firefox with FathomFox and your ruleset loaded (``fathom fox -r rulesets.js`` makes this simple). 32 | #. Open all of the samples you want to diagnose as separate tabs. 33 | #. Open the Evaluator page using FathomFox's browser action button. 34 | #. 
In the Trainee dropdown, select the trainee you want to diagnose. 35 | #. Click the Evaluate button. 36 | #. Click any red box to navigate to a page with misrecognized nodes. 37 | #. On that tab, open the dev tools panel (ctrl-shift-N) and switch to the Fathom panel. Unfortunately, there aren't yet web extension APIs to do this part automatically. 38 | #. At this point, you’ll see a quick and dirty representation of the “bad” element: a new label called “BAD [the trainee]”. Be sure to delete this if you choose to re-save the page for some reason. Also note that the BAD label is created only when the bad cell is clicked, for speed; if you navigate to the bad page manually, the label won’t be there, or there might be an old label from a previous iteration. 39 | #. Return to the Evaluator tab and click any other red boxes you want to explore. 40 | 41 | Histograms 42 | ========== 43 | 44 | Finally, a great way to examine the scores your rules are emitting is :doc:`fathom histogram`. It can show you how useful a discriminator a rule is and help you notice when the distribution of output values is not what you expect. 45 | 46 | .. image:: img/histogram.png 47 | -------------------------------------------------------------------------------- /docs/deploy-docs: -------------------------------------------------------------------------------- 1 | #!/bin/sh -e 2 | # Upload Sphinx docs to gh-pages branch. 3 | 4 | cd docs/_build/html 5 | touch .nojekyll 6 | REV=$(git rev-parse HEAD) 7 | git init 8 | git config user.name "Fathom Documenter" 9 | git config user.email "erik+fathomdoc@mozilla.commm" 10 | git checkout -b gh-pages 11 | git add . 12 | git commit -m "Update docs to ${REV}. 
[skip ci]" 13 | git remote add mozilla "https://$GH_TOKEN@github.com/mozilla/fathom.git" 14 | # Eat output so it doesn't spit out the sensitive GH_TOKEN if something goes wrong: 15 | git push -q -f mozilla gh-pages > /dev/null 2>&1 16 | -------------------------------------------------------------------------------- /docs/development.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Development 3 | =========== 4 | 5 | Source 6 | ====== 7 | 8 | It's on `GitHub `_. 9 | 10 | Tests and Examples 11 | ================== 12 | 13 | To run the tests, run... :: 14 | 15 | make lint test 16 | 17 | This will also run the linter and analyze test coverage. To render the coverage report human-legibly, run ``make coverage``. You can then find the coverage report in the ``coverage`` directory. 18 | 19 | You can also run the linter or tests for just one subproject at a time. For example, to test the CLI tools... :: 20 | 21 | cd cli 22 | make lint test 23 | 24 | If you want to drop into the debugger in the middle of a JS test, add a ``debugger;`` statement at your desired breakpoint, then run ``make debugtest`` in the ``fathom`` subproject:: 25 | 26 | cd fathom 27 | make debugtest 28 | 29 | Docs 30 | ==== 31 | 32 | To build the docs... :: 33 | 34 | make docs 35 | 36 | Gotchas 37 | ======= 38 | 39 | If you are developing the CLI tools and your changes to their embedded copy of the Fathom JS framework don't seem to be taking effect, commit first. The make target that builds ``fathom.zip`` uses ``git archive`` to pull from ``HEAD``. In this scenario, we tend to use a single local commit we amend with ``git commit --amend --no-edit`` when we want to test our changes. 40 | 41 | Windows Considerations 42 | ====================== 43 | 44 | Fathom uses `makefiles `_ to do its builds and run its tests. These makefiles rely on Unix commands. 
Therefore, if you are developing on Windows, you need access to these Unix commands through something like `Cygwin `_. You can build and test Fathom using `Windows Subsystem for Linux `_, but just know that you are technically building and testing Fathom in Linux when you do. 45 | 46 | Future Roadmap 47 | ============== 48 | 49 | Fathom 3.x: the incremental gains 50 | --------------------------------- 51 | 52 | * Regularization. Overfitting doesn't generally happen much, if you keep your eyes on the Tensorboard graphs to dodge wrong LRs, but sometimes you can still add signal to the model and get worse accuracy as a result. That should never happen. Regularization might help with that. 53 | * Automatic normalization. Right now, it's the ruleset author's responsibility to keep scoring callback outputs between 0 and 1. There are helpers to scale things linearly and sigmoidally, but it would be great to do this intelligently and automatically in the trainer, informed by the corpus rather than having the dev make guesses or painstaking calculations about the distribution. 54 | * Shuffle every iteration. Might help avoid overfitting. We shuffle once now. 55 | * Learn cutoff values. Sometimes there are values that, <7, should be treated one way and >7 another. We've had to model these by hand so far, but this should be automatic. We could use bucketing or deeper NNs, but we probably need much bigger corpora to support deeper NNs. The trainer already supports deeper NNs, but the client code needs support, and that'll be a breaking change because the format of the coefficients and biases will have to expand. The math itself, of course, is trivial. 56 | * Make corpus collection cheaper. Another theme for the future, related to the above, is making training data much cheaper to collect, because that would let us trade skilled labor of rule creation for unskilled corpus collection. 57 | * Text signal. So far, we mostly pay attention to markup. 
Any body-text stuff has to be implemented by the ruleset author. There's no reason we can't integrate a Bayesian (or other) text model on body text or even tokenized CSS classes and IDs. Or URL segments. Or other attribute values. A Bayesian classifier could happily live as a scoring callback, though the trainer would have to be special-cased to go do a separate pass to collect bag-of-words numbers, then in the main pass hand that to the Bayesian scoring callbacks and let the NNs balance the outputs of them as usual. But at this point, I prefer putting effort toward Fathom 4 rather than this fairly expensive effort with much overlap. 58 | * Visualization. It would be great to have a visualization tool that would show, on sample pages, what's getting classified right and wrong. Just haven't got around to it. Not hard. 59 | 60 | Fathom 4: the great beyond 61 | -------------------------- 62 | 63 | We had perf problems using Fathom for the FF Companion: running it on every page or several times per page. I've never done much optimization, though profiling shows that 80% of time is spent on DOM calls. DOM calls are both slow and block the main thread, and the DOM cannot be moved off the main thread to do recognition concurrently. So I took a few afternoons and said "What if we dispense with all the DOM calls, then?" Reader Mode just throws the markup across thread boundaries. Let's see what we can get out of that. Sure, we lose heights and widths and visibilities and positions on the page, but there's still lots of signal in that thar text, and Fathom 1 started out there, as a node app running against a stub DOM implementation without access to a renderer. To make a long story short, I built a whole-page categorizer using logistic regression on TFIDF'd bags of words, with all markup stripped out, and... 64 | 65 | * It gives 85% testing accuracy, very comparable with Smoot Shopping's 90% *validation* accuracy. 66 | * It took a month or more to write the Shopping ruleset. 
This one I didn't have to write at all; it was trained in 5 seconds. 67 | * I didn't engineer a single feature for this. Not so much as a price regex. It's a general classifier. It did similarly well against our hand-rolled Smoot Article recognizer, which is especially interesting since Articles have wider subject matter than shopping pages. 68 | * There's tons of signal still left on the floor: 69 | * Stemming. Tried it but didn't have an obvious impact. Odd. Try again. 70 | * All the markup. I stripped out everything but body text. Teach it to use tag names, CSS classes, IDs, and URL segments. 71 | 72 | What's a more open question is whether this can be adapted from whole-page categorization to element recognition, like Fathoms 1-3, which is the more major case. 73 | 74 | * Continue with this bag-of-words approach on a pruned down set of candidate tags, statistically informed? Either algorithmically come up with a minimal querySelector arg, or use a compressed model to predict which tags we ought to examine, like an attention system in computer vision. 75 | * Perhaps add some hand-rolled but still generic signals, like innertext length, markup bits, or consideration of surrounding elements (parents, grandparents, siblings, etc.). 76 | 77 | If this could work, it would be a game-changer. Just as Fathoms 1-3 let us do something we couldn't do before at all, Fathom 4 would let you do it in a couple afternoons of low-skilled work rather than a couple weeks to months of skilled. -------------------------------------------------------------------------------- /docs/example.rst: -------------------------------------------------------------------------------- 1 | =============== 2 | Example Ruleset 3 | =============== 4 | 5 | This is the simple example ruleset that ships with FathomFox; it is made available for experimentation when you run :doc:`commands/fox` without passing in your own ruleset. 
In its comments, it documents the structure of the ``trainees`` object, which is what :doc:`the trainer` needs to do its job. 6 | 7 | .. literalinclude:: ../fathom_fox/src/rulesets.js 8 | :language: js 9 | :linenos: 10 | -------------------------------------------------------------------------------- /docs/exceptions.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Exceptions 3 | ========== 4 | 5 | Fathom's exceptions hang off an ``exceptions`` object in the top-level Fathom module. To import them, do something like this: 6 | 7 | .. code-block:: js 8 | 9 | const { 10 | exceptions: { NoWindowError }, 11 | } = require('fathom-web'); 12 | 13 | This will result in a top-level ``NoWindowError`` symbol. 14 | 15 | .. autoclass:: CycleError 16 | .. autoclass:: NoWindowError 17 | -------------------------------------------------------------------------------- /docs/fnodes.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Fnodes 3 | ====== 4 | 5 | Fnodes are typically returned from methods on Fathom :doc:`rulesets`. 6 | 7 | .. autoclass:: Fnode 8 | :members: element, hasNoteFor, hasType, noteFor, scoreFor 9 | -------------------------------------------------------------------------------- /docs/glossary.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Glossary 3 | ======== 4 | 5 | .. glossary:: 6 | 7 | candidate 8 | Any node (:term:`target` or not) brought into the ruleset by a :func:`dom` or :func:`element` call for consideration 9 | 10 | fnode 11 | A wrapper around a DOM node, holding :term:`scores`, :term:`notes`, and :term:`types` pertaining to it. See :doc:`fnodes`. 12 | 13 | note 14 | An arbitrary, opaque-to-Fathom piece of data attached to a given :term:`type` on a :term:`fnode`. Notes can be consulted by scoring callbacks and are a good place to park expensive-to-recompute information. 
They are the main way of passing data between rules. 15 | 16 | ruleset 17 | The unordered collection of rules that forms a Fathom program. See :doc:`rules` for more on the relationships between top-level constructs. 18 | 19 | score 20 | The fuzzy-edged part of :term:`fnode` state. A floating-point number, typically between 0 and 1, attached to a certain :term:`type` on a :term:`fnode`. They represent the confidence with which a node belongs to a type. 21 | 22 | subscore 23 | A single rule's contribution to a node's score for some type. In Fathom's current incarnation as a series of (single-layer) perceptrons, each rule's subscore is multiplied by a coefficient, which is derived from training. The weighted subscores are then added together and fed through a sigmoid function to get the final score for a node for a type. 24 | 25 | target 26 | A "right answer" DOM node, one that should be recognized as belonging to some type 27 | 28 | type 29 | A string-typed category assigned to a :term:`fnode`. Types are the boolean, hard-edged, enumerated parts of fnode state. They also largely determine inter-rule dependencies and thus which rules get run in response to a query. 30 | 31 | vectorize 32 | To turn a collection of sample HTML pages into vectors of numbers which the trainer then imbibes. 
33 | -------------------------------------------------------------------------------- /docs/img/histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/docs/img/histogram.png -------------------------------------------------------------------------------- /docs/img/price_tracker_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/docs/img/price_tracker_screenshot.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Fathom 3 | ====== 4 | 5 | .. image:: https://circleci.com/gh/mozilla/fathom.svg?style=svg 6 | :alt: Build Status 7 | :target: https://circleci.com/gh/mozilla/fathom 8 | 9 | .. image:: https://coveralls.io/repos/github/mozilla/fathom/badge.svg?branch=master 10 | :alt: Coverage Status 11 | :target: https://coveralls.io/github/mozilla/fathom?branch=master 12 | 13 | Find meaning in the web. 14 | 15 | .. toctree:: 16 | :caption: Documentation 17 | :maxdepth: 2 18 | 19 | intro 20 | installing 21 | samples 22 | rules 23 | training 24 | debugging 25 | integrating 26 | maintaining 27 | zoo 28 | development 29 | 30 | .. toctree:: 31 | :caption: API Reference 32 | :maxdepth: 2 33 | 34 | clustering 35 | exceptions 36 | fnodes 37 | ruleset 38 | utilities 39 | 40 | .. _command-reference: 41 | 42 | .. toctree:: 43 | :caption: Command Reference 44 | :titlesonly: 45 | :glob: 46 | 47 | commands/* 48 | 49 | Support 50 | ======= 51 | 52 | You can find us on... 53 | 54 | * `Our Matrix chat room `_ 55 | * `GitHub `_ 56 | * `The mailing list `_ 57 | 58 | .. 
toctree:: 59 | :caption: Back Matter 60 | :titlesonly: 61 | 62 | example 63 | versions 64 | glossary 65 | 66 | * :ref:`genindex` 67 | 68 | .. toctree:: 69 | :hidden: 70 | 71 | zoo/new_password 72 | zoo/login 73 | zoo/smoot_articles 74 | zoo/smoot_shopping 75 | zoo/price_tracker -------------------------------------------------------------------------------- /docs/installing.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Installing 3 | ========== 4 | 5 | Fathom consists of 3 parts. Here's how to install each one. 6 | 7 | .. _fathomfox-installation: 8 | 9 | Commandline Tools 10 | ================= 11 | 12 | Fathom's commandline tools take your labeled pages as input and train the machine-learning model. They also contain an embedded copy of FathomFox (see below), the simplest way to collect pages. If you don't already have Python 3.7 or better, download it from https://www.python.org/downloads/. Then, install the tools by running... :: 13 | 14 | pip3 install fathom-web 15 | 16 | It's possible your Python package manager is called simply "pip" rather than "pip3". Give that a try if the above fails. 17 | 18 | You will also need to install `Node.js `_ to use many of the commandline tools. 19 | 20 | FathomFox 21 | ========= 22 | 23 | FathomFox is a browser extension used to label web pages. The best way to get it is to first install the commandline tools and then run… :: 24 | 25 | fathom fox 26 | 27 | This will launch a built-in copy of FathomFox in a fresh Firefox profile so ad blockers and other customizations don't interfere with the clean capture of labeled pages. (Some ad blockers will make changes to the DOM, like adding style attributes to ad iframes to hide them.) Using the commandline launcher also lets you pass in your own rulesets for debugging with the FathomFox Evaluator. See the ``-r`` option on the :doc:`fathom fox reference page`. 
28 | 29 | For more casual use, you can instead `install FathomFox through the web `_, in which case it will be your responsibility to avoid addons that might mutate the DOM. 30 | 31 | Fathom 32 | ====== 33 | 34 | Fathom proper is a JS library which runs trained rulesets to do the actual recognition. You don't need to worry about installing it until your rulesets are performing satisfactorily and you're ready to integrate them with your application. 35 | 36 | If your application runs server-side under `Node.js `_, you can install `the Fathom node package `_ like any other dependency:: 37 | 38 | npm install fathom-web 39 | 40 | If, instead, you're working on a Firefox feature, you can use the copy of Fathom already in Firefox by saying something like this at the top of the file containing your ruleset:: 41 | 42 | ChromeUtils.defineModuleGetter( 43 | this, 44 | "fathom", 45 | "resource://gre/modules/third_party/fathom/fathom.jsm" 46 | ); 47 | 48 | const { 49 | dom, 50 | element, 51 | out, 52 | rule, 53 | ruleset, 54 | score, 55 | type, 56 | utils: { identity, isVisible, min }, 57 | clusters: { euclidean }, 58 | } = fathom; 59 | 60 | Finally, if you need a self-contained bundle of Fathom in a context that can't use node packages, check out our `source `_ and run ``make -C fathom bundle``. This creates the bundle at ``fathom/dist/fathom.js``. 61 | -------------------------------------------------------------------------------- /docs/integrating.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Integrating 3 | =========== 4 | 5 | Once your ruleset is written and trained, your application can run a DOM tree through it: 6 | 7 | .. code-block:: js 8 | 9 | // Tell the ruleset which DOM to run against, yielding a factbase about the 10 | // document: 11 | const facts = rules.against(document); 12 | 13 | Then you can pull answers out of the factbase. 
In the case of the :doc:`example`, we want the node representing the highest-scoring overlay, which the ruleset conveniently stores under the "overlay" output key: 14 | 15 | .. code-block:: js 16 | 17 | const bestOverlayFnode = facts.get('overlay'); 18 | 19 | If you're using a third-party ruleset that doesn't anticipate the output you want, you can ask for it more explicitly by passing a query, in the form of a full :ref:`LHS `, to :func:`~BoundRuleset.get`. For example, if you simply want all the overlay-typed things so you can do further computation on them... 20 | 21 | .. code-block:: js 22 | 23 | const allOverlayFnodes = facts.get(type('overlay')); 24 | 25 | Or if you have a reference to a DOM element from elsewhere in your program, you can look up the scores, types, and notes Fathom attached to it: 26 | 27 | .. code-block:: js 28 | 29 | const fnode = facts.get(dom.getElementById('someOverlay')); 30 | 31 | Remember, once you have a :class:`~Fnode`, you can access the wrapped element from its :attr:`~Fnode.element` property. 32 | -------------------------------------------------------------------------------- /docs/intro.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Introduction 3 | ============ 4 | 5 | Fathom is a supervised-learning system for recognizing parts of web pages—pop-ups, address forms, slideshows—or for classifying a page as a whole. A DOM flows in one side, and DOM nodes flow out the other, tagged with types and probabilities that those types are correct. A Prolog-like language makes it straightforward to specify the hints that suggest each type, and a neural-net-based trainer determines the optimal contribution of each. Finally, the `FathomFox `_ web extension and a rich assortment of commandline tools help you collect, label, and use a corpus of web pages to train a recognizer. 6 | 7 | Why? 
8 | ==== 9 | 10 | A study of existing projects like Readability and Distiller suggests that purely imperative approaches to semantic extraction get bogged down in the mechanics of DOM traversal and state accumulation, obscuring the operative parts of the extractors and making new ones long and tedious to write. They involve a lot of human guessing of numerical weights. And they are brittle due to the promiscuous profusion of state. Fathom makes extractors easier to write by providing a declarative language, corpus capture, and neural-net-based training. With these, Fathom handles tree-walking, execution order, weight determination, and annotation bookkeeping, letting you concentrate on your application. 11 | 12 | Specific Areas We Address 13 | ========================= 14 | 15 | * Browser-native DOM nodes are mostly immutable, and ``HTMLElement.dataset`` is string-typed, so storing arbitrary intermediate data on nodes is clumsy. Fathom addresses this by providing the Fathom node (or :term:`fnode`, pronounced fuh-NODE), a proxy around each DOM node which we can scribble on. 16 | * With imperative extractors, any experiments or site-specific customizations must be hard-coded in. On the other hand, Fathom's :term:`rulesets` (the programs you write in Fathom) are unordered and thereby decoupled, stitched together only by the :term:`types` they consume and emit. External rules can thus be plugged into existing rulesets, making it easy to experiment without maintaining a fork—or to provide dedicated rules for particularly intractable web sites. 17 | * Types provide an easy way to categorize DOM nodes. They are also Fathom's black-box units of abstraction, as functions are in other programming languages. 18 | * The type system also makes explicit the division between a ruleset's public and private APIs: the types are public, and the imperative activity that goes on inside callback functions is private. 
This provides the freedom to extend existing rulesets without editing them directly, so multiple third-party refinements can be mixed together. 19 | * Persistent state is cordoned off in typed :term:`notes` on fnodes. Thus, when a rule declares that it takes such-and-such a type as input, it can rightly assume (if rules are written consistently) there will be a note of that type on the fnodes that are passed in. 20 | * A :doc:`neural-network-powered trainer` quickly adjusts the weights of your rules to maximize accuracy. 21 | 22 | Bonus Features 23 | -------------- 24 | 25 | * Efficient execution, driven by a query planner that understands inter-rule dependencies 26 | * Lazy execution, so you can have arbitrarily large rulesets with impunity 27 | * Caching to keep from re-deriving intermediate results between queries 28 | * Clustering based on a notion of DOM node distance influenced by structural similarity 29 | * Many handy utils from which to compose scoring callbacks 30 | 31 | Where It Works 32 | ============== 33 | 34 | Fathom is a JavaScript framework that works against the DOM API, so you can use it server-side with ``jsdom`` or any other implementation, or you can embed it in a browser and pass it a native DOM. 35 | -------------------------------------------------------------------------------- /docs/maintaining.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Maintaining 3 | =========== 4 | 5 | A successful production ruleset will need to be improved from time to time. 6 | 7 | Reviewing a Change 8 | ================== 9 | 10 | Points to consider when reviewing a model change: 11 | 12 | * Make sure the metrics are better. If the change involved adding samples, do a :doc:`fathom test` run with the old coefficients (and the new samples) as a baseline. This should result in worse metrics than the production ruleset, since you made it harder by introducing failing samples. 
Then compare those metrics to a new :doc:`fathom train` run with the new samples and any ruleset code changes. If the second metrics are better, you should adopt the new model. See :ref:`Evaluating Metrics ` for how to compare them. 13 | 14 | Ideally you can collect several samples representative of the problem you're trying to solve and distribute them across the training/validation/test sets. If you can find only one, you'll have to settle for putting it in training so the coefficients can be informed by it. 15 | * Make sure the "before" and "after" metrics, with commandline flags, are in the commit message to justify the change. 16 | * Review ruleset code changes as in a normal code review, for correctness and comprehensibility. 17 | 18 | If Adding Samples 19 | ----------------- 20 | 21 | If you added samples to the corpus, do these as well: 22 | 23 | * Make sure the names of the samples conform to the convention documented in ``samples/rubric.txt``. 24 | * Check that the samples have been :doc:`extracted` and render properly in Firefox. Use :doc:`fathom serve` to make sure cross-origin policies (which are picky for ``file://`` URLs) aren't preventing the loading of subresources. Improper rendering can cause improper training. 25 | -------------------------------------------------------------------------------- /docs/ruleset.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Rules and Rulesets 3 | ================== 4 | 5 | Most everything on this page is a top-level object in the Fathom library, importable like this, for instance: 6 | 7 | .. code-block:: js 8 | 9 | const { 10 | dom, 11 | element, 12 | out, 13 | rule, 14 | ruleset 15 | } = require('fathom-web'); 16 | 17 | Rulesets 18 | ======== 19 | 20 | The most important Fathom object is the ruleset, an unordered collection of rules. 
The plain old :class:`Ruleset` is what you typically construct, via the ``ruleset`` convenience function: 21 | 22 | .. autofunction:: ruleset 23 | 24 | .. autoclass:: Ruleset 25 | :members: against, rules 26 | 27 | Then you call :func:`Ruleset.against` to get back a :class:`BoundRuleset`, which is specific to a given DOM tree. From that, you pull answers. 28 | 29 | .. autoclass:: BoundRuleset 30 | :members: get, setCoeffsAndBiases 31 | 32 | Rules 33 | ===== 34 | 35 | These are the control structures which govern the flow of scores, types, and notes through a ruleset. You construct a rule by calling :func:`rule` and passing it a left-hand side and a right-hand side: 36 | 37 | .. autofunction:: rule 38 | 39 | .. _lhs: 40 | 41 | Left-hand Sides 42 | --------------- 43 | 44 | Left-hand sides are currently a few special forms which select nodes to be fed to right-hand sides. 45 | 46 | .. autofunction:: dom 47 | 48 | .. autofunction:: lhs.element 49 | :short-name: 50 | 51 | .. function:: type(theType) 52 | 53 | Take nodes that have the given type. Example: ``type('titley')`` 54 | 55 | .. autofunction:: TypeLhs#max 56 | :short-name: 57 | 58 | .. autofunction:: TypeLhs#bestCluster 59 | :short-name: 60 | 61 | .. autofunction:: and(typeCall[, typeCall, ...]) 62 | 63 | .. autofunction:: nearest(typeCallA, typeCallB[, distance=euclidean]) 64 | 65 | .. autofunction:: when(predicate) 66 | 67 | 68 | Right-hand Sides 69 | ---------------- 70 | 71 | A right-hand side takes the nodes chosen by the left-hand side and mutates them. Spelling-wise, a RHS is a strung-together series of calls like this:: 72 | 73 | type('smoo').props(someCallback).type('whee').score(2) 74 | 75 | To facilitate factoring up repetition in right-hand sides, calls layer together like sheets of transparent acetate: if there are repeats, as with ``type`` in the above example, the rightmost takes precedence and the left becomes useless. 
Similarly, if :func:`props`, which can return multiple properties of a fact (element, note, score, and type), is missing any of these properties, we continue searching to the left for anything that provides them (excepting other :func:`props` calls—if you want that, write a combinator, and use it to combine the 2 functions you want). To prevent this, return all properties explicitly from your props callback, even if they are no-ops (like ``{score: 1, note: undefined, type: undefined}``). Aside from this layering precedence, the order of calls does not matter. 76 | 77 | A good practice is to use more declarative calls—:func:`score`, :func:`note`, and :func:`type`—as much as possible and save :func:`props` for when you need it. The query planner can get more out of the more specialized calls without you having to tack on verbose hints like :func:`atMost` or :func:`typeIn`. 78 | 79 | .. autofunction:: InwardRhs#atMost 80 | :short-name: 81 | 82 | .. autofunction:: InwardRhs#props 83 | :short-name: 84 | 85 | For example... 86 | 87 | .. code-block:: js 88 | 89 | function callback(fnode) { 90 | return [{score: 3, 91 | element: fnode.element, // unnecessary, since this is the default 92 | type: 'texty', 93 | note: {suspicious: true}}]; 94 | } 95 | 96 | If you use ``props``, Fathom cannot look inside your callback to see what type you are emitting, so you must declare your output types with :func:`typeIn` or set a single static type with ``type``. Fathom will complain if you don't. (You can still opt not to return any type if the node turns out not to be a good match, even if you declare a :func:`typeIn`.) 97 | 98 | .. autofunction:: InwardRhs#note 99 | :short-name: 100 | 101 | Since every node can have multiple, independent notes (one for each type), this applies to the type explicitly set by the RHS or, if none, to the type named by the `type` call on the LHS. If the LHS has none because it's a `dom(...)` LHS, an error is raised.
102 | 103 | When you query for fnodes of a certain type, you can expect to find notes of any form you specified on any RHS with that type. If no note is specified, it will be undefined. However, if two RHSs emit a given type, one adding a note and the other not adding one (or adding an undefined one), the meaningful note overrides the undefined one. This allows elaboration on a RHS's score (for example) without needing to repeat note logic. 104 | 105 | Indeed, ``undefined`` is not considered a note. So, though notes cannot in general be overwritten, a note that is ``undefined`` can. Symmetrically, an ``undefined`` returned from a :func:`note` or :func:`props` or the like will quietly decline to overwrite an existing defined note, where any other value would cause an error. Rationale: letting ``undefined`` be a valid note value would mean you couldn't shadow a leftward note in a RHS without introducing a new singleton value to serve as a "no value" flag. It's not worth the complexity and the potential differences between the (internal) fact and fnode note value semantics. 106 | 107 | Best practice: any rule adding a type should apply the same note. If only one rule of several type-foo-emitting ones did, it should be made to emit a different type instead so downstream rules can explicitly state that they require the note to be there. Otherwise, there is nothing to guarantee the note-adding rule will run before the note-needing one. 108 | 109 | .. autofunction:: out 110 | 111 | If you are not using ``through()`` or ``allThrough()``, you can omit the call to ``out()`` and simply specify the key as the RHS of the rule. For example: ``rule(type('titley').max(), out('title'))`` can be written as ``rule(type('titley').max(), 'title')``. 112 | 113 | .. autofunction:: OutwardRhs#through 114 | :short-name: 115 | 116 | .. autofunction:: OutwardRhs#allThrough 117 | :short-name: 118 | 119 | .. autofunction:: InwardRhs#score 120 | :short-name: 121 | 122 | ..
autofunction:: InwardRhs#type 123 | :short-name: 124 | 125 | .. autofunction:: InwardRhs#typeIn(type[, type, ...]) 126 | :short-name: 127 | -------------------------------------------------------------------------------- /docs/theme/static/tweaks.css: -------------------------------------------------------------------------------- 1 | @import url(css/theme.css); 2 | 3 | /* Don't make all the code literals distractingly red and bold. Bold now indicates a link. */ 4 | 5 | code, .rst-content tt, .rst-content code, .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal { 6 | color: #000; 7 | font-weight: normal; 8 | } 9 | 10 | .rst-content dl:not(.docutils) tt, .rst-content dl:not(.docutils) tt, .rst-content dl:not(.docutils) code { 11 | font-weight: normal; 12 | } 13 | 14 | a code, .rst-content a tt, .rst-content a code, .rst-content a tt.literal, .rst-content a tt.literal, .rst-content a code.literal { 15 | font-weight: bold; 16 | } 17 | 18 | .rst-content .section ol li, .rst-content .section ul li, .rst-content ol.arabic li, .rst-content ul li, article ul li, article ol li { 19 | font-size: 16px; 20 | } 21 | -------------------------------------------------------------------------------- /docs/theme/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = sphinx_rtd_theme 3 | stylesheet = tweaks.css 4 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Utility Functions 3 | ================= 4 | 5 | In addition to components intrinsically tied to rulesets, Fathom comes with a variety of utility procedures for building scoring and other callback functions or just for improving the imperative shell around your ruleset. 6 | 7 | The utilities hang off a ``utils`` object in the top-level Fathom module. 
To import them, do something like this: 8 | 9 | .. code-block:: js 10 | 11 | const { 12 | utils: { isBlock, isVisible }, 13 | } = require('fathom-web'); 14 | 15 | This will result in top-level ``isBlock`` and ``isVisible`` symbols. 16 | 17 | .. autofunction:: ancestors 18 | .. autofunction:: attributesMatch 19 | .. autofunction:: best 20 | .. autofunction:: collapseWhitespace 21 | .. autofunction:: domSort 22 | .. autofunction:: first 23 | .. autofunction:: getDefault 24 | .. autofunction:: identity 25 | .. autofunction:: inlineTextLength 26 | .. autofunction:: inlineTexts 27 | .. autofunction:: isBlock 28 | .. autofunction:: isVisible 29 | .. autofunction:: isWhitespace 30 | .. autofunction:: length 31 | .. autofunction:: linearScale 32 | .. autofunction:: linkDensity 33 | .. autofunction:: utilsForFrontend.max 34 | :short-name: 35 | .. autofunction:: maxes 36 | .. autofunction:: min 37 | .. autoclass:: NiceSet 38 | :members: 39 | .. autofunction:: numberOfMatches 40 | .. autofunction:: page 41 | .. autofunction:: reversed 42 | .. autofunction:: rgbaFromString 43 | .. autofunction:: rootElement 44 | .. autofunction:: saturation 45 | .. autofunction:: setDefault 46 | .. autofunction:: sigmoid 47 | .. autofunction:: sum 48 | .. autofunction:: toDomElement 49 | .. autofunction:: toposort 50 | .. autofunction:: walk 51 | -------------------------------------------------------------------------------- /docs/zoo.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | Ruleset Zoo 3 | =========== 4 | 5 | Welcome to the Fathom Ruleset Zoo, a bestiary of Fathom real-world examples. Each gives an overview and links to a repository with full source code. 6 | 7 | .. note:: 8 | Some repos are private because they contain copyrighted training samples. While we believe this is fair use, we don't wish to provoke cease-and-desist bots. If you work for Mozilla, just ask, and we’ll grant you access. 
Otherwise, we've pasted the ruleset source code into the docs, so you can at least see that. Enjoy! 9 | 10 | New-Password Forms 11 | ================== 12 | 13 | Firefox's password manager needed a way to identify new-password fields so it could suggest (and memorize) high-entropy passwords for them. There is standardized markup for this, but only 2-4% of sites use it. Fathom thus stepped in to backstop the other 97%. On a corpus of 508 pages, we trained to a testing precision of 99.2% and recall of 92.1%. (We used ``fathom train --pos-weight`` to slant the results in favor of fewer false positives, sacrificing some recall for it.) Independent QA work showed an accuracy and false-negative rate better than that of Google Chrome—and a false-positive rate only 1% worse—and all of that with a purely client-side model. It shipped in Firefox 76. 14 | 15 | :doc:`Ruleset source <zoo/new_password>` 16 | 17 | `Full repo `_ 18 | 19 | Login Forms 20 | =========== 21 | 22 | As a proof-of-concept next-generation autofiller for `Firefox Lockwise `_, we built recognizers for login forms’ username fields and Log In buttons. 23 | 24 | This is a clean, simple example of a Fathom 3 ruleset. It was designed for Fathom 3 from the beginning, solves the problem concisely, and has respectable accuracy. 25 | 26 | Recognizers 27 | ----------- 28 | 29 | * **Username field.** This is the username or (as is increasingly the case) email field of the login form. The ruleset finds the precise ``<input>`` element for form fill. Validation precision and recall: both 96.6%, on 162 candidate tags across 64 pages, including ones with no login forms or with adversarial constructs like password-change, credit-card, and shipping forms. 30 | * **Next button.** The Log In button or, for multi-page login flows, whatever you click to advance to the next step. This was the more challenging recognizer, since there is a wider diversity of both markup and text for these constructs. Validation precision: 100%. Validation recall: 72.9%.
This is across 490 candidate tags on 64 pages. There is plenty of signal left on the table, so more invested time should give us another percentage point or two. (The whole project was timeboxed to about 3 weeks.) 31 | 32 | :doc:`Ruleset source` 33 | 34 | `Full repo `_ 35 | 36 | Smoot: Page Classification 37 | ========================== 38 | 39 | An upcoming Firefox metrics effort, Project Smoot will use a set of whole-page classifiers to characterize user tasks in a privacy-preserving way. 40 | 41 | Recognizers 42 | ----------- 43 | * **Shopping.** A page is a shopping page iff a user would seek it out in the process of choosing or buying things. This is a very challenging rubric, as it almost demands the model reach inside the head of the user to determine intent. A page about Amazon's affiliate program is not a shopping page, even though it appears on a shopping-focused domain. A forum thread on Reddit discussing the merits of competing products is a shopping page, even though it’s not near any actual Buy buttons. 44 | 45 | Despite the difficulty of the task, our model, still under development, scores over 90% in validation on a corpus of 100 pages. 46 | * **Article.** A page whose main attraction is prose to read. Though still under development, this model scores 90% in validation on a corpus of 60 pages. 47 | * **“Techie” Article.** An article aimed at a computer-savvy audience. This is intended for audience segmentation. It’s too early for numbers here as well. 48 | 49 | :doc:`Articles ruleset source` 50 | 51 | :doc:`Shopping ruleset source` 52 | 53 | `Full repo `_ 54 | 55 | Price Tracker 56 | ============= 57 | 58 | Originally designed for Fathom 2.0 but ported to 3.0 as a team familiarization exercise, Firefox Price Tracker is a now-retired web extension that periodically polled the prices of a wishlist of products and notified the user of price drops. Fathom provided the recognition of products for sale: their names, images, and prices. 
Out of an abundance of caution, Price Tracker underutilized Fathom’s ability to generalize, artificially limiting itself to the 5 top commerce sites in the U.S. However, its compact example is easy to digest in a sitting, and it’s a fine instance of Fathom increasing the agency of thousands of users when wrapped in a quality, lightweight UI. 59 | 60 | .. image:: img/price_tracker_screenshot.png 61 | 62 | Recognizers 63 | ----------- 64 | 65 | * **Image.** The “hero” image showing the product. Validation accuracy: 99.34%. Testing accuracy: 75%. 66 | * **Title.** The name of the product. Validation accuracy: 100%. Testing accuracy: 83.38%. 67 | * **Price.** The price charged for the product. Validation accuracy: 99.27%. Testing accuracy: 99.46%. 68 | 69 | Price Tracker’s accuracy numbers are unusually noisy, partly due to the rules being written with an earlier version of Fathom in mind and partly due to its small, homogeneous sample corpus. Pages came from only 5 sites, and testing and validation corpora were each only 20 pages. The 95% confidence interval for accuracy numbers thus spans as much as 30%. If we were to ship a Fathom-3.0-powered Price Tracker, we would refine until we had only a few percentage points of spread. 70 | 71 | More metrics are available on `the pull request that merged the Fathom 3 upgrade `_, but they mostly serve as a warning that a more diverse corpus is necessary for confident measurement. Take Price Tracker as an example of coding practices and product-market fit, not corpus design. 72 | 73 | :doc:`Ruleset source` 74 | 75 | `Full repo `_ 76 | 77 | Pop-up Detector 78 | =============== 79 | 80 | Pop-up “windows” on the web have migrated from actual windows to in-page elements, largely due to browsers’ success at blocking the old kind. We mentored a student project to recognize in-page pop-ups using the older Fathom 2. 81 | 82 | Results were encouraging, hovering around 85% on a blind testing corpus. 
Revamped for a modern Fathom, it might give higher numbers with little effort. In the meantime, it serves as a good example of perceptive rules. But don't lean overmuch on the ranges of numbers returned from scoring callbacks; that all changed in Fathom 3. 83 | 84 | `Pop-up Detector source `_ 85 | -------------------------------------------------------------------------------- /fathom/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | "transform-es2015-modules-commonjs", "dynamic-import-node", "@babel/plugin-proposal-export-namespace-from", "@babel/plugin-transform-exponentiation-operator" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /fathom/.eslintignore: -------------------------------------------------------------------------------- 1 | # eslint will trip over the export statements herein until https://github.com/eslint/eslint/issues/12629 is fixed: 2 | index.mjs 3 | -------------------------------------------------------------------------------- /fathom/.eslintrc.yml: -------------------------------------------------------------------------------- 1 | env: 2 | es6: true 3 | node: true 4 | mocha: true 5 | 6 | parserOptions: 7 | sourceType: module 8 | ecmaVersion: 8 9 | 10 | extends: 11 | - eslint:recommended 12 | - plugin:node/recommended 13 | 14 | plugins: 15 | - import 16 | - node 17 | 18 | root: true 19 | 20 | rules: 21 | array-bracket-spacing: [error, never] 22 | eqeqeq: error 23 | generator-star-spacing: [warn, {before: true, after: false}] 24 | guard-for-in: warn # There's nothing wrong with for..in if you know what you're doing. This is here just to keep me from accidentally saying "for..in" when I mean "for..of". Delete this and come up with a better solution if we ever need to use "for..in". 
25 | indent: [error, 4, {ObjectExpression: first, ArrayExpression: first, CallExpression: {arguments: first}, FunctionDeclaration: {parameters: first}}] 26 | max-len: [off, {code: 100, ignoreComments: true, ignoreStrings: true, ignoreTemplateLiterals: true}] 27 | node/exports-style: [error, module.exports] 28 | node/no-missing-import: [error, {tryExtensions: [".js", ".mjs"]}] 29 | node/no-unpublished-require: off 30 | no-console: off 31 | no-dupe-class-members: error 32 | no-loop-func: error 33 | no-new-func: error # equivalent to eval() 34 | no-prototype-builtins: off 35 | no-restricted-globals: [error, getComputedStyle] 36 | no-throw-literal: error 37 | no-trailing-spaces: error 38 | no-underscore-dangle: off 39 | no-unused-vars: [warn, {vars: all, args: none}] 40 | no-use-before-define: [error, {functions: false, classes: false}] 41 | no-useless-escape: error 42 | no-var: warn 43 | no-warning-comments: [warn, {terms: [xxx, fixme, hack], location: start}] 44 | object-curly-spacing: [error, never] 45 | object-shorthand: [error, properties] 46 | prefer-const: off 47 | quotes: [error, single, {avoidEscape: true, allowTemplateLiterals: true}] 48 | semi: [error, always] 49 | space-before-blocks: [error, always] 50 | space-before-function-paren: [error, {anonymous: always, named: never}] 51 | import/extensions: [error, always, {js: never, mjs: never}] 52 | 53 | settings: 54 | import/resolver: 55 | node: 56 | extensions: ['.js', '.mjs'] 57 | -------------------------------------------------------------------------------- /fathom/.npmignore: -------------------------------------------------------------------------------- 1 | /.npm_installed 2 | /.babelrc 3 | /.eslintignore 4 | /.eslintrc.yml 5 | /.nyc_output 6 | /Makefile 7 | /rollup.config.js 8 | /test 9 | /coverage 10 | /*.log 11 | /dist 12 | /venv 13 | -------------------------------------------------------------------------------- /fathom/Makefile: 
-------------------------------------------------------------------------------- 1 | PATH := ./node_modules/.bin:$(PATH) 2 | 3 | JS := $(shell find . -name '*.mjs' | grep -v '^./node_modules/.*' | sed 's/\.mjs/\.js/') 4 | MJS := $(shell find . -name '*.mjs' | grep -v '^./node_modules/.*') 5 | 6 | # It's faster to invoke Babel once and compile everything than to invoke it 7 | # separately on even 2 individual files that changed. 8 | %.js: %.mjs .npm_installed .babelrc; @node_modules/.bin/babel *.mjs **/*.mjs --out-dir . --relative 9 | 10 | js: $(JS) 11 | 12 | lint: .npm_installed 13 | @node_modules/.bin/eslint --ext mjs . 14 | @node_modules/.bin/eslint test/browser 15 | 16 | test: $(JS) .npm_installed 17 | @node_modules/.bin/nyc --reporter=text-summary node_modules/mocha/bin/_mocha --recursive 18 | 19 | coverage: .npm_installed test 20 | @node_modules/.bin/nyc report --reporter=html 21 | 22 | coveralls: .npm_installed 23 | node_modules/.bin/nyc report --reporter=text-lcov | coveralls 24 | 25 | debugtest: $(JS) .npm_installed 26 | # This is known to work on node 7.6.0. 27 | @node_modules/.bin/mocha --inspect-brk 28 | 29 | publish: $(JS) 30 | cp ../LICENSE ./ 31 | cp ../README.md ./ 32 | npm publish 33 | 34 | bundle: dist/fathom.js 35 | 36 | # .npm_installed is an empty file we touch whenever we run npm install. 
/**
 * A wrapper around a DOM node, storing the :term:`types`, :term:`scores`,
 * and :term:`notes` that rules have attached to it
 */
export class Fnode {
    /**
     * @arg element The DOM element described by the fnode. Must not be
     *     ``undefined``.
     * @arg ruleset The ruleset which created the fnode. It is consulted
     *     lazily whenever a type, score, or note is asked for.
     */
    constructor(element, ruleset) {
        if (element === undefined) {
            throw new Error("Someone tried to make a fnode without specifying the element they're talking about.");
        }
        /**
         * The raw DOM element this fnode describes
         */
        this.element = element;
        this._ruleset = ruleset;

        // type => {score: Map of rule name => number, note: anything}.
        // The `score` Map exists on every record. A note counts as set iff
        // `note` is present and not undefined.
        this._types = new Map();

        // Bookkeeping reserved for conserveScore(), which is temporarily
        // absent in 3.0.
        //
        // Each type on a fnode normally has its own independent score, but
        // a RHS may elect to carry an upstream type's score forward into
        // another type. So that several rules doing this can't make scores
        // run away, an upstream type's score may be multiplied in only
        // once. This remembers what has been multiplied in already: LHS
        // fnode => Set of types whose score for that node has gone into
        // this node's score.
        this._conservedScores = new Map();
    }

    /**
     * Return whether the given type is among those attached to the wrapped
     * HTML node.
     */
    hasType(type) {
        // Ask the ruleset for the type first; otherwise we might answer
        // false merely because the relevant rules haven't lazily run yet.
        this._computeType(type);
        return this._types.has(type);
    }

    /**
     * Return the confidence, in the range (0, 1), that the fnode belongs to
     * the given type. It is the sigmoid of the ruleset's weighted
     * combination of the per-rule scores recorded for the type plus the
     * ruleset's bias for the type, where a missing bias counts as 0.
     */
    scoreFor(type) {
        this._computeType(type);
        const weighted = this._ruleset.weightedScore(this.scoresSoFarFor(type));
        const bias = getDefault(this._ruleset.biases, type, () => 0);
        return sigmoid(weighted + bias);
    }

    /**
     * Return the fnode's note for the given type, ``undefined`` if none.
     */
    noteFor(type) {
        this._computeType(type);
        return this._noteSoFarFor(type);
    }

    /**
     * Return whether this fnode has a note for the given type.
     *
     * ``undefined`` is not considered a note and may be overwritten with
     * impunity.
     */
    hasNoteFor(type) {
        this._computeType(type);
        return this._hasNoteSoFarFor(type);
    }

    // -------- Methods below this point are private to the framework. --------

    /**
     * Return an iterable of the types that already-executed rules have
     * tagged onto me. Runs no rules itself.
     */
    typesSoFar() {
        return this._types.keys();
    }

    /**
     * Return the note recorded so far for a type, without running any
     * rules; ``undefined`` if there is none.
     */
    _noteSoFarFor(type) {
        const record = this._typeRecordForGetting(type);
        return record.note;
    }

    /**
     * Return whether a defined note has been recorded so far for a type,
     * without running any rules.
     */
    _hasNoteSoFarFor(type) {
        const noteSoFar = this._noteSoFarFor(type);
        return noteSoFar !== undefined;
    }

    /**
     * Return the Map of rule name => score recorded on me so far for a
     * certain type. Doesn't implicitly run any rules. If nothing has been
     * recorded for the type yet, the Map comes from a temporary record and
     * is empty.
     */
    scoresSoFarFor(type) {
        const record = this._typeRecordForGetting(type);
        return record.score;
    }

    /**
     * Record the score a given rule contributes to one of our types,
     * implicitly assigning us that type. Scores are filed under the name of
     * the rule they came from so the coeffs can be messed with later.
     */
    addScoreFor(type, score, ruleName) {
        const scoresByRule = this._typeRecordForSetting(type).score;
        scoresByRule.set(ruleName, score);
    }

    /**
     * Set the note attached to one of our types, implicitly assigning us
     * that type if we don't have it already.
     *
     * An existing defined note is never replaced: a defined incoming note
     * raises an error, and an ``undefined`` one is quietly ignored.
     */
    setNoteFor(type, note) {
        if (!this._hasNoteSoFarFor(type)) {
            // Nothing defined to protect yet, so apply either a type and
            // note or just a type (which means a note that is undefined).
            this._typeRecordForSetting(type).note = note;
            return;
        }
        if (note !== undefined) {
            throw new Error(`Someone (likely the right-hand side of a rule) tried to add a note of type ${type} to an element, but one of that type already exists. Overwriting notes is not allowed, since it would make the order of rules matter.`);
        }
        // Otherwise the incoming note is undefined and the existing one
        // stands: a no-op.
    }

    /**
     * Return the score/note record for a type, creating and storing an
     * empty one if it doesn't exist.
     */
    _typeRecordForSetting(type) {
        const freshRecord = () => {
            return {score: new Map()};
        };
        return setDefault(this._types, type, freshRecord);
    }

    /**
     * Return the score/note record for a type for reading only. When there
     * is none, a temporary empty record stands in, sparing callers a
     * missing-record check.
     */
    _typeRecordForGetting(type) {
        const freshRecord = () => {
            return {score: new Map()};
        };
        return getDefault(this._types, type, freshRecord);
    }

    /**
     * Make sure any scores, notes, and type-tagging for the given type are
     * computed for my element.
     */
    _computeType(theType) {
        if (this._types.has(theType)) {
            // Already tagged. Returning here also prevents infinite
            // recursion when an A->A rule looks at A's note in a callback.
            return;
        }
        this._ruleset.get(type(theType));
    }
}
*/ 4 | 5 | const version = '3.7.3'; 6 | import {rule} from './rule'; 7 | import {ruleset} from './ruleset'; 8 | import {dom, element} from './lhs'; 9 | import {out} from './rhs'; 10 | import {and, atMost, nearest, note, props, score, type, typeIn} from './side'; 11 | 12 | export * as clusters from './clusters'; 13 | export * as utils from './utilsForFrontend'; 14 | export * as exceptions from './exceptions'; 15 | export { 16 | and, 17 | atMost, 18 | dom, 19 | element, 20 | nearest, 21 | note, 22 | out, 23 | props, 24 | rule, 25 | ruleset, 26 | score, 27 | type, 28 | typeIn, 29 | version 30 | }; 31 | -------------------------------------------------------------------------------- /fathom/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "fathom-web", 3 | "description": "Find meaning in the web.", 4 | "version": "3.7.3", 5 | "author": "Erik Rose (https://www.grinchcentral.com/)", 6 | "bugs": { 7 | "url": "https://github.com/mozilla/fathom/issues" 8 | }, 9 | "dependencies": { 10 | "jsdom": "^11.12.0" 11 | }, 12 | "devDependencies": { 13 | "@babel/cli": "^7.7.4", 14 | "@babel/core": "^7.7.4", 15 | "@babel/plugin-proposal-export-namespace-from": "^7.7.4", 16 | "@babel/plugin-transform-exponentiation-operator": "^7.7.4", 17 | "acorn": "^7.1.0", 18 | "babel-eslint": "^8.2.6", 19 | "babel-plugin-dynamic-import-node": "^2.3.0", 20 | "babel-plugin-transform-es2015-modules-commonjs": "^6.26.2", 21 | "chai": "^4.2.0", 22 | "coveralls": "^3.1.0", 23 | "eslint": "^6.7.1", 24 | "eslint-plugin-import": "^2.18.2", 25 | "eslint-plugin-node": "^10.0.0", 26 | "geckodriver": "^3.0.1", 27 | "jsdoc": "^3.5.4", 28 | "mocha": "^6.2.2", 29 | "nyc": "^14.1.1", 30 | "rollup": "^1.27.5", 31 | "selenium-webdriver": "^4.1.1" 32 | }, 33 | "engines": { 34 | "node": ">= 7.6.0" 35 | }, 36 | "homepage": "https://github.com/mozilla/fathom", 37 | "keywords": [ 38 | "semantic extraction", 39 | "scoring", 40 | "ranking", 41 | "clustering" 42 
| ], 43 | "license": "MPL-2.0", 44 | "repository": { 45 | "type": "git", 46 | "url": "https://github.com/mozilla/fathom.git" 47 | }, 48 | "main": "index" 49 | } 50 | -------------------------------------------------------------------------------- /fathom/rollup.config.js: -------------------------------------------------------------------------------- 1 | // Bundle all of Fathom into a single file for use inside web extensions or 2 | // other applications. If possible, use ES6-style import statements in your 3 | // code instead, and let rollup pull in just what Fathom code is necessary. See 4 | // /fathom_fox/rollup.config.js for an example. 5 | export default { 6 | input: 'index.mjs', 7 | output: { 8 | file: 'dist/fathom.js', 9 | format: 'umd', 10 | name: 'fathom', 11 | } 12 | }; 13 | -------------------------------------------------------------------------------- /fathom/side.mjs: -------------------------------------------------------------------------------- 1 | import {euclidean} from './clusters'; 2 | import {Lhs} from './lhs'; 3 | import {InwardRhs} from './rhs'; 4 | 5 | 6 | /** Start a Side whose first recorded call is ``props(callback)``. The RHS tests show the callback returning an object of subfacts such as ``score`` or ``note``, with unrecognized keys ignored. */ export function props(callback) { 7 | return new Side({method: 'props', args: [callback]}); 8 | } 9 | 10 | /** Constrain to an input type on the LHS, or apply a type on the RHS. */ 11 | export function type(theType) { 12 | return new Side({method: 'type', args: [theType]}); 13 | } 14 | 15 | /** Start a Side whose first recorded call is ``note(callback)``; the tests pass callbacks that take an fnode and return the note to attach. */ export function note(callback) { 16 | return new Side({method: 'note', args: [callback]}); 17 | } 18 | 19 | /** Start a Side whose first recorded call is ``score(scoreOrCallback)``: either a number or, as in the tests, a callback that takes an fnode and returns one. */ export function score(scoreOrCallback) { 20 | return new Side({method: 'score', args: [scoreOrCallback]}); 21 | } 22 | 23 | /** Start a Side whose first recorded call is ``atMost(score)``, a declared upper bound on the score; the tests expect an error when a larger score is assigned. */ export function atMost(score) { 24 | return new Side({method: 'atMost', args: [score]}); 25 | } 26 | 27 | /** Start a Side whose first recorded call is ``typeIn(...types)``, declaring the types a RHS may emit; the tests expect an error when a different type is emitted or inherited. */ export function typeIn(...types) { 28 | return new Side({method: 'typeIn', args: types}); 29 | } 30 | 31 | /** 32 | * Pull nodes that conform to multiple conditions at once.
33 | * 34 | * For example: ``and(type('title'), type('english'))`` 35 | * 36 | * Caveats: ``and`` supports only simple ``type`` calls as arguments for now, 37 | * and it may fire off more rules as prerequisites than strictly necessary. 38 | * ``not`` and ``or`` don't exist yet, but you can express ``or`` the long way 39 | * around by having 2 rules with identical RHSs. * * @arg lhss The ``type`` calls to conjoin; they are recorded, in order, as the ``args`` of the ``and`` call. 40 | */ 41 | export function and(...lhss) { 42 | return new Side({method: 'and', args: lhss}); 43 | } 44 | 45 | /** 46 | * Experimental. For each :term:`fnode` from ``typeCallA``, find the closest 47 | * node from ``typeCallB``, and attach it as a note. The note is attached to 48 | * the type specified by the RHS, defaulting to the type of ``typeCallA``. If 49 | * no nodes are emitted from ``typeCallB``, do nothing. 50 | * 51 | * For example... :: 52 | * 53 | * nearest(type('image'), type('price')) 54 | * 55 | * The score of the ``typeCallA`` can be added to the new type's score by using 56 | * :func:`conserveScore` (though this routine has since been removed):: 57 | * 58 | * rule(nearest(type('image'), type('price')), 59 | * type('imageWithPrice').score(2).conserveScore()) 60 | * 61 | * Caveats: ``nearest`` supports only simple ``type`` calls as arguments ``a`` 62 | * and ``b`` for now. 63 | * * @arg typeCallA A ``type`` call; each fnode it yields gets its nearest counterpart attached as a note. * @arg typeCallB A ``type`` call yielding the candidate nodes to search among. 64 | * @arg distance {function} A function that takes 2 fnodes and returns a 65 | * numerical distance between them. Included options are :func:`distance`, 66 | * which is a weighted topological distance, and :func:`euclidean`, which 67 | * is a spatial distance. Defaults to :func:`euclidean`. 68 | */ 69 | export function nearest(typeCallA, typeCallB, distance = euclidean) { 70 | return new Side({method: 'nearest', args: [typeCallA, typeCallB, distance]}); 71 | } 72 | 73 | /** 74 | * A chain of calls that can be compiled into a Rhs or Lhs, depending on its 75 | * position in a Rule. This lets us use type() as a leading call for both RHSs 76 | * and LHSs.
// I would prefer to do this dynamically, but that wouldn't compile 77 | * down to old versions of ES. 78 | */
class Side {
    /**
     * @arg calls Call records, in chaining order. A "call" is like
     *     {method: 'dom', args: ['p.smoo']}.
     */
    constructor(...calls) {
        this._calls = calls;
    }

    /** Record a ``max()`` call. */
    max() {
        return this._and('max');
    }

    /** Record a ``bestCluster(options)`` call. */
    bestCluster(options) {
        return this._and('bestCluster', options);
    }

    /** Record a ``props(callback)`` call. */
    props(callback) {
        return this._and('props', callback);
    }

    /** Record a ``type(...)`` call. */
    type(...types) {
        return this._and('type', ...types);
    }

    /** Record a ``note(callback)`` call. */
    note(callback) {
        return this._and('note', callback);
    }

    /** Record a ``score(scoreOrCallback)`` call. */
    score(scoreOrCallback) {
        return this._and('score', scoreOrCallback);
    }

    /** Record an ``atMost(score)`` call. */
    atMost(score) {
        return this._and('atMost', score);
    }

    /** Record a ``typeIn(...types)`` call. */
    typeIn(...types) {
        return this._and('typeIn', ...types);
    }

    /** Record an ``and(...lhss)`` call. */
    and(...lhss) {
        // Spread, like type() and typeIn() do, so the recorded args are the
        // flat list [lhs1, lhs2, ...] that the and() factory function also
        // records, rather than a single nested array.
        return this._and('and', ...lhss);
    }

    /** Record a ``when(pred)`` call. */
    when(pred) {
        return this._and('when', pred);
    }

    /**
     * Return a new instance of the same class with one more call appended.
     * ``this`` is left untouched, so a partially built chain can be shared
     * and extended in different directions.
     */
    _and(method, ...args) {
        return new this.constructor(...this._calls.concat({method, args}));
    }

    /**
     * Compile to a left-hand side: the first call picks the Lhs via
     * ``Lhs.fromFirstCall``, and the remaining calls are replayed on it.
     */
    asLhs() {
        return this._asSide(Lhs.fromFirstCall(this._calls[0]), this._calls.slice(1));
    }

    /** Compile to a right-hand side by replaying every call on a fresh InwardRhs. */
    asRhs() {
        return this._asSide(new InwardRhs(), this._calls);
    }

    /**
     * Replay ``calls`` in order, each one invoked on the value returned by
     * the previous, starting from ``side``. Return the final value.
     */
    _asSide(side, calls) {
        for (const call of calls) {
            side = side[call.method](...call.args);
        }
        return side;
    }
}
// 144 | -------------------------------------------------------------------------------- /fathom/test/browser/http_server.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Before any test in the entire project starts, spin up a server to serve test 3 | * pages to the Selenium-driven headless Firefox we use in some tests.
// 4 | */
const http = require('http');
const fs = require('fs');
const url = require('url');


const PORT = 8000;

/**
 * Serve files that sit next to this script as text/html. A file that cannot
 * be read is answered with an error status rather than silence, so the
 * browser under test fails fast instead of waiting on a response that never
 * arrives.
 */
const server = http.createServer((request, response) => {
    // TODO: Replace url.parse with url.URL.
    // eslint-disable-next-line node/no-deprecated-api
    const path = url.parse(request.url).pathname;
    fs.readFile(__dirname + path, 'utf8', (error, data) => {
        if (error) {
            console.error(`There was a ${error.code} error fetching the resource at ${path}.`);
            // Always finish the response; otherwise the request hangs until
            // the client gives up.
            const status = error.code === 'ENOENT' ? 404 : 500;
            response.writeHead(status, {'Content-Type': 'text/plain'});
            response.end(`${status}: could not read ${path}\n`);
            return;
        }
        response.writeHead(200, {'Content-Type': 'text/html'});
        response.write(data);
        response.end();
    });
});

// Mocha root-level hook: start listening before any test runs.
before(
    function start_server() {
        server.listen(PORT);
        console.log(`Serving from ${__dirname} at http://localhost:${PORT}...`);
    }
);

// Mocha root-level hook: stop accepting connections once all tests are done.
after(
    function stop_server() {
        server.close();
    }
);
// 38 | -------------------------------------------------------------------------------- /fathom/test/browser/isVisible.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | isVisible functional test 8 | 19 | 20 | 21 |

isVisible functional test

22 |
23 |
24 | 25 | 26 | 29 |
30 | 31 |
32 |
33 |
34 | 35 |
36 |
37 |
38 | 39 |
40 |
41 |
42 | 43 |
44 | 45 |
46 | 47 |
48 | 49 |
50 |
51 | 52 |
53 |
54 |
55 | 56 | 57 | -------------------------------------------------------------------------------- /fathom/test/browser/isVisible.js: -------------------------------------------------------------------------------- 1 | const {assert} = require('chai'); 2 | const firefox = require('selenium-webdriver/firefox'); 3 | const {Builder, until, By} = require('selenium-webdriver'); 4 | const {ancestors, isDomElement, isVisible, toDomElement, windowForElement} = require('../../utilsForFrontend'); // eslint-disable-line node/no-missing-require 5 | 6 | const WAIT_MS = 10000; 7 | const TEST_PAGE_URL = 'http://localhost:8000/isVisible.html'; 8 | 9 | describe('isVisible', () => { 10 | const options = new firefox.Options(); 11 | options.headless(); 12 | 13 | const driver = new Builder() 14 | .forBrowser('firefox') 15 | .setFirefoxOptions(options) 16 | .build(); 17 | 18 | async function checkElementVisibility(id, expected) { 19 | await driver.wait(until.elementLocated(By.id(id)), WAIT_MS); 20 | const isElementVisible = await driver.executeScript(` 21 | ${ancestors} 22 | ${isDomElement} 23 | ${toDomElement} 24 | ${windowForElement} 25 | return ${isVisible}(document.getElementById('${id}')); 26 | `); 27 | assert.equal( 28 | isElementVisible, 29 | expected, 30 | `isVisible should return ${expected} for element with id '${id}'.` 31 | ); 32 | } 33 | 34 | async function checkElementsVisibility(idStub, isVisible) { 35 | const elementIds = await driver.executeScript(` 36 | return Array.prototype.map.call(document.querySelectorAll('[id^="${idStub}"]'), (element) => element.id); 37 | `); 38 | 39 | await driver.get(TEST_PAGE_URL); 40 | 41 | for (const id of elementIds) { 42 | await checkElementVisibility(id, isVisible); 43 | } 44 | } 45 | 46 | it('should return false when an element is hidden', async function () { 47 | this.timeout(WAIT_MS); 48 | await checkElementsVisibility('not-visible-', false); 49 | }); 50 | 51 | it('should return true when an element is visible', async function () { 52 | 
this.timeout(WAIT_MS); 53 | await checkElementsVisibility('visible-', true); 54 | }); 55 | 56 | after(async function () { 57 | this.timeout(WAIT_MS); 58 | return driver.quit(); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /fathom/test/demos.mjs: -------------------------------------------------------------------------------- 1 | import {assert} from 'chai'; 2 | 3 | import {dom, rule, ruleset, type} from '../index'; 4 | import {sigmoid, staticDom} from '../utils'; 5 | 6 | 7 | describe('Design-driving demos', function () { 8 | it('handles a simple series of short-circuiting rules', function () { 9 | // TODO: Short-circuiting isn't implemented yet. The motivation of this 10 | // test is to inspire engine so it's smart enough to run the highest- 11 | // possible-scoring type-chain of rules first and, if it succeeds, 12 | // omit the others. 13 | const doc = staticDom(` 14 | 15 | 16 | 17 | Title 18 | `); 19 | const typeAndNote = type('titley').note(fnode => fnode.element.getAttribute('content')); 20 | const rules = ruleset([ 21 | rule(dom('meta[property="og:title"]'), 22 | typeAndNote.score(40)), 23 | rule(dom('meta[property="twitter:title"]'), 24 | typeAndNote.score(30)), 25 | rule(dom('meta[name="hdl"]'), 26 | typeAndNote.score(20)), 27 | rule(dom('title'), 28 | typeAndNote.score(10).note(fnode => fnode.element.text)), 29 | rule(type('titley').max(), 'bestTitle') 30 | ]); 31 | const facts = rules.against(doc); 32 | const node = facts.get('bestTitle')[0]; 33 | assert.equal(node.scoreFor('titley'), sigmoid(40)); 34 | assert.equal(node.noteFor('titley'), 'OpenGraph'); 35 | }); 36 | }); 37 | 38 | // Right now, I'm writing features and using optimization algos to find their coefficients. Someday, we can stop writing features and have deep learning come up with them. TODO: Grok unsupervised learning, and apply it to OpenCrawl. 
39 | -------------------------------------------------------------------------------- /fathom/test/lhs_tests.mjs: -------------------------------------------------------------------------------- 1 | import {assert} from 'chai'; 2 | 3 | import {dom, rule, ruleset, type} from '../index'; 4 | import {staticDom} from '../utils'; 5 | 6 | 7 | describe('LHS', function () { 8 | it('makes a dom() LHS that rule() tolerates', function () { 9 | const lhs = dom('smoo'); 10 | const rhs = type('bar'); 11 | rule(lhs, rhs); 12 | }); 13 | 14 | it('finds max-scoring nodes of a type', function () { 15 | const doc = staticDom(` 16 |

17 |
18 |
19 | `); 20 | const rules = ruleset([ 21 | rule(dom('p'), type('smoo').score(2)), 22 | rule(dom('div'), type('smoo').score(5)), 23 | rule(type('smoo').max(), 'best') 24 | ]); 25 | const facts = rules.against(doc); 26 | const best = facts.get('best'); 27 | assert.equal(best.length, 2); 28 | assert.equal(best[0].element.nodeName, 'DIV'); 29 | assert.equal(best[1].element.nodeName, 'DIV'); 30 | }); 31 | 32 | it('returns [] for a top-totaling cluster of 0 nodes', function () { 33 | const doc = staticDom(` 34 |

35 | `); 36 | const rules = ruleset([ 37 | rule(dom('div'), type('smoo')), 38 | rule(type('smoo').bestCluster(), 'cluster') 39 | ]); 40 | const facts = rules.against(doc); 41 | assert.deepEqual(facts.get('cluster'), []); 42 | }); 43 | 44 | it('can have its type overridden', function () { 45 | const doc = staticDom('

'); 46 | const rules = ruleset([ 47 | rule(dom('p'), type('bar')), 48 | rule(type('foo').type('bar'), 'best') 49 | ]); 50 | const facts = rules.against(doc); 51 | const best = facts.get('best'); 52 | assert.equal(best.length, 1); 53 | }); 54 | 55 | it('filters using when() on type()', function () { 56 | const doc = staticDom('

'); 57 | const rules = ruleset([ 58 | rule(dom('p'), type('bar')), 59 | rule(type('bar').when(fnode => fnode.element.id === 'fat'), type('when')), 60 | rule(type('when'), 'best') 61 | ]); 62 | const facts = rules.against(doc); 63 | const best = facts.get('best'); 64 | assert.equal(best.length, 1); 65 | assert.equal(best[0].element.id, 'fat'); 66 | }); 67 | 68 | it('filters using when() on dom()', function () { 69 | const doc = staticDom('

'); 70 | const rules = ruleset([ 71 | rule(dom('p').when(fnode => fnode.element.id === 'bat'), type('when')), 72 | rule(type('when'), 'best') 73 | ]); 74 | const facts = rules.against(doc); 75 | const best = facts.get('best'); 76 | assert.equal(best.length, 1); 77 | assert.equal(best[0].element.id, 'bat'); 78 | }); 79 | }); 80 | -------------------------------------------------------------------------------- /fathom/test/rhs_tests.mjs: -------------------------------------------------------------------------------- 1 | import {assert} from 'chai'; 2 | 3 | import {atMost, dom, note, out, props, rule, ruleset, score, type, typeIn} from '../index'; 4 | import {sigmoid, staticDom} from '../utils'; 5 | 6 | 7 | describe('RHS', function () { 8 | it('combines different calls piecewise, with rightmost repeated subfacts shadowing', function () { 9 | const rhs = type('foo').score(5).props(node => ({score: 6})).asRhs(); 10 | assert.deepEqual(rhs.fact('dummy'), {type: 'foo', score: 6}); 11 | }); 12 | 13 | it('has same-named calls shadow, with rightmost winning', function () { 14 | const rhs = props(node => ({score: 1})).props(node => ({note: 'foo'})).asRhs(); 15 | assert.deepEqual(rhs.fact('dummy'), {note: 'foo'}); 16 | }); 17 | 18 | it('runs callbacks only once', function () { 19 | let count = 0; 20 | function addOne() { 21 | count++; 22 | return {}; 23 | } 24 | const rhs = props(addOne).asRhs(); 25 | assert.deepEqual(rhs.fact('dummy'), {}); 26 | assert.equal(count, 1); 27 | }); 28 | 29 | it('ignores unexpected subfacts returned from props() callbacks', function () { 30 | const rhs = props(node => ({booga: true, score: 3})).asRhs(); 31 | assert.deepEqual(rhs.fact('dummy'), {score: 3}); 32 | }); 33 | 34 | it('enforces atMost()', function () { 35 | const doc = staticDom('

'); 36 | const rules = ruleset([ 37 | rule(dom('p'), score(8).type('para').atMost(3)) 38 | ]); 39 | const facts = rules.against(doc); 40 | assert.throws(() => facts.get(type('para')), 41 | 'Score of 8 exceeds the declared atMost(3).'); 42 | }); 43 | 44 | it('works fine when atMost() is satisfied', function () { 45 | const doc = staticDom('

'); 46 | const rules = ruleset([ 47 | rule(dom('p'), atMost(3).score(2).type('para')) 48 | ]); 49 | const facts = rules.against(doc); 50 | assert.equal(facts.get(type('para'))[0].scoreFor('para'), sigmoid(2)); 51 | }); 52 | 53 | it('enforces typeIn() for explicit types', function () { 54 | const doc = staticDom('

'); 55 | const rules = ruleset([ 56 | rule(dom('p'), typeIn('nope').type('para')) 57 | ]); 58 | const facts = rules.against(doc); 59 | assert.throws(() => facts.get(type('para')), 60 | 'A right-hand side claimed, via typeIn(...) to emit one of the types {nope} but actually emitted para.'); 61 | }); 62 | 63 | it('enforces typeIn() for inherited types', function () { 64 | const doc = staticDom('

'); 65 | const rules = ruleset([ 66 | rule(dom('p'), type('para')), 67 | rule(type('para'), props(n => ({})).typeIn('nope')) 68 | ]); 69 | const facts = rules.against(doc); 70 | assert.throws(() => facts.get(type('nope')), 71 | 'A right-hand side claimed, via typeIn(...) to emit one of the types {nope} but actually inherited para from the left-hand side.'); 72 | }); 73 | 74 | it('works fine when typeIn() is satisfied', function () { 75 | const doc = staticDom('

'); 76 | const rules = ruleset([ 77 | rule(dom('p'), typeIn('para').type('para')) 78 | ]); 79 | const facts = rules.against(doc); 80 | assert.equal(facts.get(type('para')).length, 1); 81 | }); 82 | 83 | it('runs out().through() callbacks', function () { 84 | const doc = staticDom('

'); 85 | const rules = ruleset([ 86 | rule(dom('p'), out('para').through(fnode => fnode.element.tagName)) 87 | ]); 88 | const facts = rules.against(doc); 89 | assert.equal(facts.get('para')[0], 'P'); 90 | }); 91 | 92 | it('paves over undefined notes', function () { 93 | // We shouldn't re-run any rules. Run order shouldn't matter, because 94 | // we forbid notes from overwriting, score contribution is 95 | // commutative, and type assignment is idempotent and immutable. 96 | const doc = staticDom('

'); 97 | const rules = ruleset([ 98 | rule(dom('p'), type('para')), 99 | rule(type('para'), note(fnode => undefined)), 100 | rule(type('para'), note(fnode => 'foo')) 101 | ]); 102 | const facts = rules.against(doc); 103 | assert.equal(facts.get(type('para'))[0].noteFor('para'), 'foo'); 104 | }); 105 | 106 | it('runs scoring callbacks', function () { 107 | const doc = staticDom('

'); 108 | const rules = ruleset([ 109 | rule(dom('p'), type('p').score(fnode => 5)) 110 | ]); 111 | const facts = rules.against(doc); 112 | assert.equal(facts.get(type('p'))[0].scoreFor('p'), sigmoid(5)); 113 | }); 114 | }); 115 | -------------------------------------------------------------------------------- /fathom/test/rule_tests.mjs: -------------------------------------------------------------------------------- 1 | import {assert} from 'chai'; 2 | 3 | import {dom, rule, ruleset, score, type, typeIn} from '../index'; 4 | import {staticDom} from '../utils'; 5 | 6 | 7 | describe('Rule', function () { 8 | it('knows what it can add and emit', function () { 9 | const a = rule(dom('p'), type('para')); 10 | assert.sameMembers(Array.from(a.typesItCouldEmit()), ['para']); 11 | assert.sameMembers(Array.from(a.typesItCouldAdd()), ['para']); 12 | 13 | const b = rule(type('r'), typeIn('q').props('dummy').typeIn('r', 's')); 14 | assert.sameMembers(Array.from(b.typesItCouldEmit()), ['r', 's']); 15 | assert.sameMembers(Array.from(b.typesItCouldAdd()), ['s']); 16 | 17 | const c = rule(type('a'), score(2)); 18 | assert.sameMembers(Array.from(c.typesItCouldEmit()), ['a']); 19 | }); 20 | 21 | it('identifies prerequisite rules', function () { 22 | const domRule = rule(dom('p'), type('a')); 23 | const maxRule = rule(type('a').max(), type('b')); 24 | const maintainRule = rule(type('b'), score(2)); 25 | const addRule = rule(type('b'), type('c')); 26 | const rules = ruleset([domRule, maxRule, maintainRule, addRule]); 27 | const facts = rules.against(staticDom('')); 28 | assert.sameMembers(Array.from(domRule.prerequisites(facts)), []); 29 | assert.sameMembers(Array.from(maxRule.prerequisites(facts)), [domRule]); 30 | assert.sameMembers(Array.from(maintainRule.prerequisites(facts)), [maxRule]); 31 | assert.sameMembers(Array.from(addRule.prerequisites(facts)), [maxRule, maintainRule]); 32 | 33 | const prereqs = facts._prerequisitesTo(addRule); 34 | // TODO: Replace with deepEqual when 
chai >= 4.0 supports Maps and Sets. 35 | assert.equal(prereqs.size, 3); 36 | assert.deepEqual(prereqs.get(maintainRule), [addRule]); 37 | assert.deepEqual(prereqs.get(domRule), [maxRule]); 38 | assert.deepEqual(prereqs.get(maxRule), [addRule, maintainRule]); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /fathom/test/side_tests.mjs: -------------------------------------------------------------------------------- 1 | // Tests for fathom/side.js 2 | 3 | import {assert} from 'chai'; 4 | 5 | import {type} from '../index'; 6 | 7 | 8 | describe('Side', function () { 9 | it('makes a LHS out of a type()', function () { 10 | const side = type('smoo'); 11 | assert(side.asLhs); // It appears to be a Side. 12 | const lhs = side.asLhs(); 13 | assert.notStrictEqual(lhs.max); // It appears to be a TypeLhs. 14 | }); 15 | 16 | it('is immutable and so can be factored up', function () { 17 | const defaults = type('smoo'); 18 | const another = defaults.atMost(1); 19 | assert.equal(defaults._calls.length, 1); 20 | assert.equal(another._calls.length, 2); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /fathom/test/utils_tests.mjs: -------------------------------------------------------------------------------- 1 | import {assert} from 'chai'; 2 | import {NoWindowError} from '../exceptions'; 3 | import {dom, rule, ruleset, score, type} from '../index'; 4 | import {NiceSet, toposort, staticDom, attributesMatch, windowForElement} from '../utils'; 5 | 6 | 7 | describe('Utils', function () { 8 | describe('NiceSet', function () { 9 | it('pops', function () { 10 | const s = new NiceSet([1, 2]); 11 | assert.equal(s.pop(), 1); 12 | assert.equal(s.pop(), 2); 13 | assert.throws(() => s.pop(), 14 | 'Tried to pop from an empty NiceSet.'); 15 | }); 16 | }); 17 | 18 | describe('toposort', function () { 19 | it('sorts', function () { 20 | // Return answers that express the graph... 
21 | // 4 <- 5 <- 6 <- 7 22 | // | | 23 | // v v 24 | // 5.1 <- 6.1 25 | // ...where -> means "needs". 26 | function nodesThatNeed(node) { 27 | return node === 5.1 ? [6, 6.1] : (node === 7 ? [] : [Math.floor(node) + 1]); 28 | } 29 | assert.deepEqual(toposort([4, 5, 5.1, 6, 6.1, 7], nodesThatNeed), 30 | [7, 6, 5, 4, 6.1, 5.1]); 31 | }); 32 | it('detects cycles', function () { 33 | // Express a graph of 3 nodes pointing in a circle. 34 | function nodesThatNeed(node) { 35 | return [(node + 1) % 3]; 36 | } 37 | assert.throws(() => toposort([0, 1, 2], nodesThatNeed), 38 | 'The graph has a cycle.'); 39 | }); 40 | }); 41 | 42 | describe('attributesMatch', function () { 43 | it('searches all attributes', function () { 44 | const doc = staticDom(` 45 | boo 46 | `); 47 | const rules = ruleset([ 48 | rule(dom('img'), type('attr')), 49 | rule(type('attr'), score(scoreFunc)), 50 | rule(type('attr').max(), 'best') 51 | ]); 52 | 53 | function scoreFunc(fnode) { 54 | return attributesMatch(fnode.element, attr => attr.includes('oo')) ? 5 : 1; 55 | } 56 | 57 | const facts = rules.against(doc); 58 | const best = facts.get('best'); 59 | assert.equal(best.length, 1); 60 | assert.equal(best[0].element.id, 'foo'); 61 | }); 62 | 63 | it('searches specified attributes', function () { 64 | const doc = staticDom(` 65 | bat 66 | `); 67 | const rules = ruleset([ 68 | rule(dom('img'), type('attr')), 69 | rule(type('attr'), score(scoreFunc)), 70 | rule(type('attr').max(), 'best') 71 | ]); 72 | 73 | function scoreFunc(fnode) { 74 | return attributesMatch(fnode.element, attr => attr.includes('at'), ['id']) ? 
5 : 1; 75 | } 76 | 77 | const facts = rules.against(doc); 78 | const best = facts.get('best'); 79 | assert.equal(best.length, 1); 80 | assert.equal(best[0].element.id, 'sat'); 81 | }); 82 | 83 | it('searches attributes which are arrays', function () { 84 | const doc = staticDom(` 85 | 86 | `); 87 | const rules = ruleset([ 88 | rule(dom('img'), type('attr')), 89 | rule(type('attr'), score(scoreFunc)), 90 | rule(type('attr').max(), 'best') 91 | ]); 92 | 93 | function scoreFunc(fnode) { 94 | return attributesMatch(fnode.element, attr => attr.includes('at')) ? 5 : 1; 95 | } 96 | 97 | const facts = rules.against(doc); 98 | const best = facts.get('best'); 99 | assert.equal(best.length, 1); 100 | assert.equal(best[0].element.id, 'fat'); 101 | }); 102 | 103 | it('returns false for elements that lack the requested attributes', function () { 104 | // The first element has the alt attribute, and the second one doesn't, so it shouldn't get included in the results 105 | const doc = staticDom(` 106 | bat 107 | `); 108 | const rules = ruleset([ 109 | rule(dom('img'), type('attr')), 110 | rule(type('attr'), score(scoreFunc)), 111 | rule(type('attr').max(), 'best') 112 | ]); 113 | 114 | function scoreFunc(fnode) { 115 | return attributesMatch(fnode.element, attr => attr.includes('at'), ['alt']) ? 5 : 1; 116 | } 117 | 118 | const facts = rules.against(doc); 119 | const best = facts.get('best'); 120 | assert.equal(best.length, 1); 121 | assert.equal(best[0].element.id, 'foo'); 122 | }); 123 | 124 | it("doesn't touch nodes that don't match", function () { 125 | const doc = staticDom(` 126 | 127 | `); 128 | const rules = ruleset([ 129 | rule(dom('img'), type('attr')), 130 | rule(type('attr'), score(scoreFunc)), 131 | rule(type('attr').max(), 'best') 132 | ]); 133 | 134 | function scoreFunc(fnode) { 135 | return attributesMatch(fnode.element, attr => attr.includes('z')) ? 
5 : 1; 136 | } 137 | 138 | const facts = rules.against(doc); 139 | const best = facts.get('best'); 140 | assert.equal(best.length, 2); 141 | }); 142 | 143 | it('searches multiple explicitly specified attributes', function () { 144 | const doc = staticDom(` 145 | bat 146 | `); 147 | const rules = ruleset([ 148 | rule(dom('img'), type('attr')), 149 | rule(type('attr'), score(scoreFunc)), 150 | rule(type('attr').max(), 'best') 151 | ]); 152 | 153 | function scoreFunc(fnode) { 154 | return attributesMatch(fnode.element, attr => attr.includes('at'), ['alt', 'id']) ? 5 : 1; 155 | } 156 | 157 | const facts = rules.against(doc); 158 | const best = facts.get('best'); 159 | assert.equal(best.length, 2); 160 | assert.equal(best[0].element.id, 'foo'); 161 | assert.equal(best[1].element.id, 'cat'); 162 | }); 163 | }); 164 | 165 | describe('windowForElement', function () { 166 | it('raises NoWindowError when run outside a window', function () { 167 | // We mock out the element because jsdom actually provides a window 168 | // object: 169 | const element = {ownerDocument: {defaultView: null}}; 170 | assert.throws(() => windowForElement(element), 171 | NoWindowError); 172 | }); 173 | }); 174 | }); 175 | -------------------------------------------------------------------------------- /fathom/utils.mjs: -------------------------------------------------------------------------------- 1 | export * from './utilsForBackend'; 2 | export * from './utilsForFrontend'; 3 | -------------------------------------------------------------------------------- /fathom/utilsForBackend.mjs: -------------------------------------------------------------------------------- 1 | /** 2 | * Things that work only on a command-line node.js environment 3 | */ 4 | 5 | import {jsdom} from 'jsdom/lib/old-api'; 6 | 7 | 8 | /** 9 | * Parse an HTML doc, and return a DOM-compliant interface to it. Do not 10 | * execute any of its inline scripts. 
11 | */ 12 | export function staticDom(html) { 13 | return jsdom(html, {features: {ProcessExternalResources: false, 14 | FetchExternalResources: false}}); 15 | } 16 | -------------------------------------------------------------------------------- /fathom_fox/.eslintignore: -------------------------------------------------------------------------------- 1 | addon/contentScript.js 2 | addon/evaluate.js 3 | addon/simmer.js 4 | -------------------------------------------------------------------------------- /fathom_fox/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "browser": true, 4 | "es6": true, 5 | "amd": true, 6 | "webextensions": true 7 | }, 8 | "parserOptions": { 9 | "ecmaVersion": 9 10 | }, 11 | "rules": { 12 | "object-curly-spacing": "error", 13 | "keyword-spacing": "error" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /fathom_fox/README.md: -------------------------------------------------------------------------------- 1 | # FathomFox 2 | 3 | A suite of tools for developing [Fathom](https://mozilla.github.io/fathom/) rulesets within Firefox: 4 | 5 | * [Corpus collection and labeling tools](https://mozilla.github.io/fathom/samples.html) (which are likely all you will need) 6 | * An Evaluator which can help you [drop into the JS debugger](https://mozilla.github.io/fathom/training.html#setting-breakpoints) inside your ruleset 7 | * A Vectorizer, which you can ignore. (It persists, for now, as an optional manual alternative to simply letting `fathom train` and other tools take care of vectorization automatically.) 8 | 9 | For most use cases, it's better to run FathomFox from the commandline rather than installing it through the web. See [Fathom's installation page](https://mozilla.github.io/fathom/installing.html) for instructions. 
10 | 11 | ## Full Documentation 12 | 13 | See [the Fathom docs](https://mozilla.github.io/fathom/versions.html). 14 | 15 | ## Running FathomFox from a Source Checkout 16 | 17 | This is necessary only if you are developing FathomFox itself. 18 | 19 | 1. Clone the [Fathom repository](https://github.com/mozilla/fathom/). 20 | 2. From within the checkout, inside the `fathom_fox` folder, install dependencies: `yarn run build`. 21 | 3. Run a clean copy of Firefox with FathomFox installed: `yarn run browser`. 22 | 4. Run `yarn run watch` in a separate terminal. This will keep your running copy of FathomFox up to date as you edit your ruleset. 23 | 24 | ## Thanks 25 | 26 | Thanks to Treora for his excellent freeze-dry library! 27 | -------------------------------------------------------------------------------- /fathom_fox/Tagged Head.afdesign: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/fathom_fox/Tagged Head.afdesign -------------------------------------------------------------------------------- /fathom_fox/addon/actionMenu.js: -------------------------------------------------------------------------------- 1 | function openTab(url) { 2 | browser.tabs.create({url, active: true}); 3 | window.close(); 4 | } 5 | 6 | document.getElementById('collectCorpus').addEventListener('click', () => openTab('/pages/corpus.html')); 7 | document.getElementById('evaluate').addEventListener('click', () => openTab('/pages/evaluate.html')); 8 | document.getElementById('vectorize').addEventListener('click', () => openTab('/pages/vector.html')); 9 | -------------------------------------------------------------------------------- /fathom_fox/addon/background.js: -------------------------------------------------------------------------------- 1 | /** Dispatch messages sent to the background script. 
*/ 2 | function handleBackgroundScriptMessage(request, sender, sendResponse) { 3 | if (request.type === 'rulesetSucceededOnTabs') { 4 | // Run a given ruleset on a given set of tabs, and return an array of 5 | // responses saying whether they got the right answer on each. It's 6 | // necessary to do this in the background script so we have permission 7 | // to call the APIs we need. 8 | Promise.all(request.tabIds.map( 9 | tabId => browser.tabs.sendMessage( 10 | tabId, 11 | {type: 'rulesetSucceeded', 12 | traineeId: request.traineeId, 13 | coeffs: request.coeffs}))) 14 | .then(sendResponse); 15 | return true; // so sendResponse hangs around after we return 16 | } else if (request.type === 'refresh') { 17 | // Bridge between content and the devtools panel. 18 | browser.runtime.sendMessage({type: 'refresh'}).catch(() => {}); 19 | } 20 | } 21 | browser.runtime.onMessage.addListener(handleBackgroundScriptMessage); 22 | 23 | /** 24 | * Connect a dev panel, at its request, to the content script in its inspected 25 | * tab. 26 | */ 27 | function connectADevPanel(port) { 28 | // Open a port to our content script on the tab that's being inspected. 29 | port.onMessage.addListener(handleMessage); 30 | 31 | /** 32 | * Handle any of the various messages that can come flying at the 33 | * background script from various sources. 34 | */ 35 | async function handleMessage(request) { 36 | if (request.type === 'freeze') { 37 | // Send 'freeze' request to content-script to fetch frozen html 38 | browser.tabs.sendMessage(request.tabId, request) 39 | .then((html) => { 40 | // Show save file dialog. When the dialog is closed send a 'refresh' 41 | // message to the devpanel so it can hide the spinner. 42 | download(html, {saveAs: true}) 43 | .then(() => { 44 | browser.runtime.sendMessage({type: 'refresh'}); 45 | }) 46 | .catch(() => { 47 | browser.runtime.sendMessage({type: 'refresh'}); 48 | }); 49 | }); 50 | } else { 51 | // Most requests are passed unmodified to the content script. 
/**
 * Saves a frozen, single-file copy of each page in a list of URLs to the
 * downloads folder. This class supplies the form parsing and the per-page
 * freeze and download steps; everything else is inherited from PageVisitor.
 */
class CorpusCollector extends PageVisitor {
    /**
     * Turn one non-empty, trimmed line of the "pages" textarea into a
     * {filename, url} pair.
     *
     * A line is either "url" or "filename url". "http://" is prepended to
     * URLs that lack an http(s) protocol. When no filename is given, the file
     * is named after whatever sits between the protocol and the first slash
     * of the URL. ".html" is appended to the filename in both cases.
     */
    static urlAndFilenameFromLine(line) {
        const [first, second] = line.split(/\s+/, 2);
        const hasFilename = second !== undefined;

        let url = hasFilename ? second : first;
        // Prepend protocol if missing.
        if (!/^https?:\/\//.test(url)) {
            url = 'http://' + url;
        }

        const filename = hasFilename
            ? first + '.html'
            : url
                .replace(/^https?:\/\//, '')  // Remove protocol.
                .replace(/^([^\/]+)\/.*$/, '$1')  // Delete everything after first /
                + '.html';
        return {filename, url};
    }

    /**
     * Read the collector's settings out of the form.
     *
     * @return {object|undefined} An object with keys otherOptions ({wait,
     *     shouldScroll}), timeout, urls (an array of {filename, url}) and
     *     maxTabs; or undefined if the wait or timeout field fails
     *     validation or no URLs were entered
     */
    formOptions() {
        const waitField = this.doc.getElementById('wait');
        const timeoutField = this.doc.getElementById('timeout');
        if (!(waitField.validity.valid && timeoutField.validity.valid)) {
            return undefined;
        }

        const wait = parseFloat(waitField.value);
        const options = {
            otherOptions: {
                wait,
                shouldScroll: this.doc.getElementById('shouldScroll').checked,
            },
            // Note we extend the timeout by the freeze delay.
            timeout: parseFloat(timeoutField.value) + wait,
            // Load each url line-by-line from the textarea. If a line
            // contains a space, the first word will be used as the filename.
            urls: this.doc
                .getElementById('pages')
                .value
                .split('\n')
                .map(line => line.trim())
                .filter(line => line.length > 0)
                .map(line => CorpusCollector.urlAndFilenameFromLine(line)),
        };

        // We need at least one url.
        if (options.urls.length === 0) {
            return undefined;
        }

        // Not customizable just because nobody asked for it yet:
        options.maxTabs = 16;

        return options;
    }

    /**
     * Return the viewport size entered in the form as {height, width}.
     */
    getViewportHeightAndWidth() {
        // Always parse as decimal; never let a prefix pick the base.
        return {
            height: parseInt(this.doc.getElementById('viewportHeight').value, 10),
            width: parseInt(this.doc.getElementById('viewportWidth').value, 10),
        };
    }

    /**
     * Freeze the page loaded in the given tab.
     *
     * @arg tab {tabs.Tab} The tab to freeze
     * @arg windowId Unused here
     * @return the response to the 'freeze' message, which is later handed to
     *     download() as the page's HTML
     */
    async processWithinTimeout(tab, windowId) {
        this.setCurrentStatus({message: 'freezing', index: tab.id});
        // Inject dispatcher to listen to the message we then send. Can't get a
        // return value directly out of the content script because webpack
        // wraps our top-level stuff in a function. Instead, we use messaging.
        await browser.tabs.executeScript(
            tab.id,
            {file: '/contentScript.js'}
        );

        // Call freeze-dry to fetch html.
        const {wait, shouldScroll} = this.otherOptions;
        return browser.tabs.sendMessage(
            tab.id,
            {type: 'freeze', options: {wait, shouldScroll}}
        );
    }

    /**
     * Save the frozen HTML to disk under the filename chosen for the tab's
     * URL, and report the name the download ended up with.
     */
    async processWithoutTimeout(html, tabId) {
        // Save html to disk.
        const {filename} = this.urls[this.tabIdToUrlsIndex.get(tabId)];
        const downloadFilename = await download(html, {filename});

        this.setCurrentStatus({
            message: 'downloaded as ' + downloadFilename,
            index: tabId,
            isFinal: true
        });
    }
}
/**
 * Save the given HTML to the user's downloads folder.
 *
 * @arg html {string} The markup to save
 * @arg options {object} Optional settings:
 *     filename: the name to save under, used as a template (default
 *         'Untitled.html')
 *     saveAs: passed through to browser.downloads.download (default false)
 * @return {Promise<string>} The basename of the file that was written
 */
async function download(html, options = {}) {
    const blob = new Blob([html], {type: 'text/html'});
    const url = URL.createObjectURL(blob);

    // Save html using the specified filename as a template.
    let downloadId;
    try {
        downloadId = await browser.downloads.download({
            url,
            filename: options.filename || 'Untitled.html',
            saveAs: options.saveAs || false,
        });
    } catch (error) {
        // The download never started, so nothing will read the blob; release
        // it now rather than leaking it.
        URL.revokeObjectURL(url);
        throw error;
    }

    // Give it 10 seconds; FF can be a bit slow.
    window.setTimeout(() => URL.revokeObjectURL(url), 1000 * 10);

    // Return the basename of the chosen filename. The reported filename is a
    // local path, so strip through the last separator of either kind: "/" or
    // the "\" used on Windows.
    const [item] = await browser.downloads.search({id: downloadId});
    return item.filename.replace(/^.*[\\/]/, '');
}
"devtools_page": "pages/devtoolsOpener.html", 37 | "commands": { 38 | "freeze-page": { 39 | "suggested_key": { 40 | "default": "Ctrl+Shift+O" 41 | }, 42 | "description": "Download page in current active tab" 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /fathom_fox/addon/measureWindowSize.js: -------------------------------------------------------------------------------- 1 | ({outerWidth: window.outerWidth, 2 | innerWidth: window.innerWidth, 3 | outerHeight: window.outerHeight, 4 | innerHeight: window.innerHeight}); 5 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/actionMenu.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
8 |
9 |
Corpus Collector
10 |
11 |
12 |
Evaluator
13 |
14 |
15 |
Vectorizer
16 |
17 |
18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/blank.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/corpus.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 24 | 25 | 26 |

Corpus Collector

27 |

28 | This “freezes” a series of pages for use as a training corpus. External resources like images and CSS are inlined to create a convenient single-file package for each page. Scripts are removed or deactivated, and external network resources that can't be inlined are blocked. The downloaded pages land in your usual downloads folder. 29 |

30 |

31 | Optionally, you may prefix each line with a filename to save the page to, followed by whitespace. 32 |

33 | 34 | 36 |
37 | 38 |
39 |
40 | 41 | 42 |
43 |
44 | 45 | sec 46 |
47 |
48 | 49 | sec 50 |
51 |
52 | 58 |
59 |
60 |
61 |
    62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/devtoolsOpener.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/devtoolsPanel.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 114 | 115 | 116 | 117 |
    118 | 121 | 124 | 125 |
    126 | 127 |
    128 | 129 | 133 |
    134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/evaluate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 129 | 130 | 131 |

    Evaluator

    132 | 135 |

    136 | Evaluate your ruleset, for the selected trainee, against the samples you’ve loaded as tabs in this window. 137 |

    138 |
    139 |
    140 | 141 |
    142 |
    143 | 144 |
    145 |
    146 |
    147 |
    148 | 149 |
    150 |
    151 |
    152 |
    153 |
    154 | 180 |
    181 | 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /fathom_fox/addon/pages/vector.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 32 | 33 | 34 |

    Vectorizer

    35 | 38 |

    39 | This turns a series of frozen, labeled pages into feature vectors for use with fathom train. The feature vectors land in a file in your usual downloads folder. Because web extensions aren't allowed to load file:// URLs, you might find it necessary to run a local web server like fathom serve in the directory where your local samples live. fathom list can help you get a list of filenames to paste here. 40 |

    41 |
    42 | 43 |
    44 |
    45 | 46 |
    47 | 48 | 50 |
    51 | 52 |
    53 |
    54 | 55 | sec 56 |
    57 |
    58 | 59 | 60 |
    61 |
    62 | 63 | tabs 64 |
    65 |
    66 |
    67 |
      68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /fathom_fox/addon/utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Return the result of a browser.devtools.inspectedWindow.eval call. Throw the 3 | * error object on failure. 4 | */ 5 | async function resultOfEval(codeString) { 6 | let [result, error] = await browser.devtools.inspectedWindow.eval(codeString); 7 | if (error !== undefined) { 8 | throw error; 9 | } 10 | return result; 11 | } 12 | 13 | /** 14 | * Return a backward iterator over an Array. 15 | */ 16 | function *reversed(array) { 17 | for (let i = array.length - 1; i >= 0; i--) { 18 | yield array[i]; 19 | } 20 | } 21 | 22 | /** 23 | * Deletes all children of the specified element. 24 | */ 25 | function emptyElement(element) { 26 | while (element.firstChild) { 27 | element.removeChild(element.firstChild); 28 | } 29 | } 30 | 31 | // Requires simmer.js injected into current page. 32 | // simmer.js is injected when the devtools panel is initialised when first opened. 33 | async function inspectedElementSelector() { 34 | return resultOfEval(`Simmer.configure({depth: 25})($0)`); 35 | } 36 | 37 | /** 38 | * Set the current window's size such that the content area is the size you 39 | * pass in. 40 | * 41 | * @arg tab {tabs.Tab} A tab in the window we're adjusting that we can inject 42 | * the window-measuring script into 43 | * 44 | * @return a Promise that is resolved when the window size has been changed 45 | */ 46 | async function setViewportSize(tab, width, height) { 47 | // Because window.outerHeight and friends are undefined from background 48 | // scripts, we have to collect the info by injecting a content script into 49 | // (arbitrarily) the active tab. However, we have to ensure that tab is not 50 | // showing about:blank, because webexts aren't allowed to inject scripts 51 | // there. So we open a page of our own first. 
/**
 * Given a URL as a string, return its last slash-delimited segment with any
 * trailing ".html" removed.
 */
function urlFilename(url) {
    const start = url.lastIndexOf('/') + 1;
    const extension = '.html';
    const end = url.endsWith(extension) ? url.length - extension.length : url.length;
    return url.substring(start, end);
}
| "rollup-plugin-json": "^4.0.0", 19 | "rollup-plugin-node-builtins": "^2.1.2", 20 | "rollup-plugin-node-globals": "^1.4.0", 21 | "rollup-plugin-node-resolve": "^5.2.0", 22 | "rollup-plugin-copy": "^3.0.0", 23 | "simmerjs": "^0.5.6", 24 | "web-ext": "^3.1.0", 25 | "webpack": "^4.36.1" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /fathom_fox/rollup.config.js: -------------------------------------------------------------------------------- 1 | import commonjs from 'rollup-plugin-commonjs'; 2 | import resolve from 'rollup-plugin-node-resolve'; 3 | import json from 'rollup-plugin-json'; 4 | import builtins from 'rollup-plugin-node-builtins'; 5 | import globals from 'rollup-plugin-node-globals'; 6 | import copy from 'rollup-plugin-copy'; 7 | const webpackPostcss = require('./src/rollup-plugin-webpack-postcss/rollup-plugin-webpack-postcss'); 8 | 9 | /** 10 | * Return typical rollup settings for a file of a given name. 11 | */ 12 | function mindlesslyFactoredOutSettings(name, globalVarName) { 13 | return { 14 | input: 'src/' + name + '.js', 15 | output: { 16 | file: 'addon/' + name + '.js', 17 | format: 'iife', 18 | name: globalVarName || name // Convention: name the var the same thing. 
/**
 * Runs one trainee's ruleset against a set of tabs and shows how it did.
 */
class Evaluator {
    /**
     * @arg tabs {tabs.Tab[]} The tabs holding the samples to evaluate
     * @arg traineeId {string} Key into the ``trainees`` map
     */
    constructor(tabs, traineeId) {
        this.tabs = tabs;
        this.traineeId = traineeId;
    }

    /**
     * Evaluate the trainee with its stored coefficients on every tab, total
     * the per-page costs, and hand everything to updateOutputs().
     */
    async evaluate() {
        const {coeffs} = trainees.get(this.traineeId);
        const reports = await this.verboseSuccessReports(coeffs);
        let totalCost = 0;
        for (const report of reports) {
            totalCost += report.cost;
        }
        updateOutputs(coeffs, totalCost, reports);
    }

    /**
     * Try the ruleset on each tab, and return a bigger blob of info that
     * allows us to show the user which element it found, for debugging.
     *
     * @return an Array of {didSucceed, cost, filename, tabId} objects, one
     *     per tab, in the same order as this.tabs
     */
    async verboseSuccessReports(coeffs) {
        const results = await this.resultsForPages(coeffs);
        return results.map(({didSucceed, cost}, index) => {
            const tab = this.tabs[index];
            return {
                didSucceed,
                cost,
                filename: urlFilename(tab.url),
                tabId: tab.id,
            };
        });
    }

    /**
     * Send a message to all the pages in the corpus, telling them "Run ruleset
     * ID X, and tell me how its default query (the one with the same out() key
     * as its ID) did." The request is sent as a single
     * 'rulesetSucceededOnTabs' message over browser.runtime.
     *
     * @return an Array of {didSucceed: bool, cost: number} objects, one per
     *     page
     */
    async resultsForPages(coeffs) {
        const tabIds = this.tabs.map(tab => tab.id);
        const message = {
            type: 'rulesetSucceededOnTabs',
            tabIds,
            traineeId: this.traineeId,
            // Sent as an array of [name, coefficient] pairs instead of the Map.
            coeffs: Array.from(coeffs.entries()),
        };
        return browser.runtime.sendMessage(message);
    }
}
/**
 * Return [low end, high end] of 95% CI for accuracy using binomial proportion
 * confidence interval formula (the normal approximation:
 * ratio +/- 1.96 * sqrt(ratio * (1 - ratio) / n)).
 *
 * Both ends are clamped to the range of a proportion, [0, 1]. With small
 * samples the raw formula strays outside it, and a negative lower bound used
 * to be displayed as a negative percentage.
 *
 * @arg successRatio {number} Fraction of samples that succeeded, 0..1
 * @arg numberOfSamples {number} How many samples the ratio was measured over
 */
function confidenceInterval(successRatio, numberOfSamples) {
    const zFor95Percent = 1.96;
    const addend = zFor95Percent * Math.sqrt(successRatio * (1 - successRatio) / numberOfSamples);
    return [Math.max(0, successRatio - addend), Math.min(1, successRatio + addend)];
}

/**
 * Format a ratio as a rounded-off percentage with one decimal place.
 */
function percentify(ratio) {
    return `${(ratio * 100).toFixed(1)}%`;
}

/**
 * Redraw the Evaluator's readouts.
 *
 * @arg coeffs {Map} Rule name -> coefficient, shown verbatim
 * @arg cost {number} Total cost; shown truncated to an integer
 * @arg successesOrFailures {Array} Optional per-sample reports, each
 *     {didSucceed, filename, tabId}. When given, the accuracy, its confidence
 *     interval, and the clickable good/bad chart are redrawn as well.
 */
function updateOutputs(coeffs, cost, successesOrFailures) {
    // Update best coeffs and accuracy.
    const coeffStrings = [];
    for (const [key, val] of coeffs.entries()) {
        coeffStrings.push(`${key}: ${val}`);
    }
    gCoeffsDiv.firstChild.textContent = `[${coeffStrings.join(', ')}]`;
    gCostDiv.firstChild.textContent = Math.trunc(cost);

    if (!successesOrFailures) {
        return;
    }

    // Compute and show accuracy:
    const accuracy = successesOrFailures.reduce((accum, sf) => accum + sf.didSucceed, 0) / successesOrFailures.length;
    gAccuracyDiv.firstChild.textContent = percentify(accuracy);

    // Draw CI readout:
    const [ciLow, ciHigh] = confidenceInterval(accuracy, successesOrFailures.length);
    gCiDiv.firstChild.textContent = `${percentify(ciLow)} - ${percentify(ciHigh)}`;

    // Draw good/bad chart, reusing the existing cells when there are already
    // the right number of them:
    if (gGoodBadDiv.childElementCount !== successesOrFailures.length) {
        empty(gGoodBadDiv);
        for (const _ of successesOrFailures) {
            const cell = document.createElement('div');
            cell.appendChild(document.createTextNode(''));
            gGoodBadDiv.appendChild(cell);
        }
    }
    let div = gGoodBadDiv.firstElementChild;
    const traineeId = document.getElementById('trainee').value;
    for (const sf of successesOrFailures) {
        div.firstChild.textContent = sf.filename;
        // Assign onclick rather than calling addEventListener(): cells are
        // reused between evaluations, and adding a fresh listener on every
        // run made each click fire once per evaluation so far.
        div.onclick = function focusTab() {
            // Label the bad element if bad, clear it if good:
            browser.tabs.sendMessage(
                sf.tabId,
                {type: 'labelBadElement',
                 traineeId,
                 coeffs});
            browser.tabs.update(sf.tabId, {active: true});
            // Update the Fathom dev tools panel if it's open:
            browser.runtime.sendMessage({type: 'refresh'});
        };
        div.setAttribute('class', sf.didSucceed ? 'good' : 'bad');
        div = div.nextElementSibling;
    }
}
Cavit 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /fathom_fox/src/rollup-plugin-webpack-postcss/README.md: -------------------------------------------------------------------------------- 1 | # Rollup Plugin Webpack PostCSS 2 | 3 | An attempt to use webpack within rollup to allow for bundling postcss, since the [dependency cycles in postcss mean rollup can't bundle it](https://github.com/postcss/postcss/issues/1030). 4 | 5 | This is a very silly-seeming idea, but it also seems like it might be working? 6 | 7 | ## Rollup Plugins 8 | 9 | - `rollup-plugin-node-resolve` 10 | - **This plugin** 11 | - `rollup-plugin-commonjs` 12 | - `rollup-plugin-node-globals` 13 | - `rollup-plugin-node-builtins` 14 | 15 | Other orderings/lists might work, but that's what I'm using atm. 
16 | 17 | --- 18 | 19 | ⚡💀🔥 **USE AT YOUR OWN RISK** -------------------------------------------------------------------------------- /fathom_fox/src/rollup-plugin-webpack-postcss/rollup-plugin-webpack-postcss.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | const path = require("path"); 4 | 5 | const webpack = require("webpack"); 6 | const MemoryFS = require("memory-fs"); 7 | 8 | const postcss = require.resolve("postcss"); 9 | 10 | module.exports = () => ({ 11 | name : "rollup-plugin-postcss", 12 | 13 | load(id) { 14 | if(id !== postcss) { 15 | return null; 16 | } 17 | 18 | const memfs = new MemoryFS(); 19 | const compiler = webpack({ 20 | entry : "postcss", 21 | 22 | output : { 23 | path : __dirname, 24 | filename : "postcss.js", 25 | library : "postcss", 26 | libraryTarget : "commonjs2", 27 | }, 28 | }); 29 | 30 | // Write files to memory, not disk 31 | compiler.outputFileSystem = memfs; 32 | 33 | return new Promise((resolve, reject) => { 34 | compiler.run((err, stats) => { 35 | if(err || stats.hasErrors()) { 36 | if(err) { 37 | return reject(err); 38 | } 39 | 40 | const info = stats.toJson(); 41 | 42 | return reject(info.errors); 43 | } 44 | 45 | return resolve({ 46 | code : memfs.readFileSync(path.join(__dirname, "./postcss.js"), "utf8"), 47 | 48 | // TODO: figure out source map 49 | }); 50 | }); 51 | }); 52 | } 53 | }); 54 | -------------------------------------------------------------------------------- /fathom_fox/src/rulesets.js: -------------------------------------------------------------------------------- 1 | import {ruleset, rule, dom, type, score, out, utils} from 'fathom-web'; 2 | const {ancestors, isVisible, linearScale, rgbaFromString, saturation} = utils; 3 | 4 | 5 | /** 6 | * Rulesets to vectorize or debug (and metadata about them) 7 | * 8 | * More mechanically, a map of names to {coeffs, rulesetMaker, ...} objects, 9 | * which we call "trainees". 
The rulesets you specify here are available to the 10 | * trainer and also show up in the FathomFox UI, from which you can debug a 11 | * ruleset. Most often, all the entries here point to the same ruleset but have 12 | * different values of `vectorType` for separately training each type of thing 13 | * the ruleset recognizes. 14 | */ 15 | const trainees = new Map(); 16 | 17 | /** 18 | * An example ruleset. Replace it with your own. 19 | * 20 | * This one finds the full-screen, content-blocking overlays that often go 21 | * behind modal popups. It's not the most well-honed thing, but it's simple and 22 | * short. 23 | */ 24 | trainees.set( 25 | // The ID for this trainee, which must be the same as the Fathom type you 26 | // are evaluating, if you are using the FathomFox Evaluator: 27 | 'overlay', 28 | 29 | // Here we paste in coefficients from ``fathom train``. This lets us use 30 | // the Evaluator to see what Fathom is getting wrong. Otherwise, these 31 | // numbers do nothing until you deploy your application, so there's no need 32 | // to maintain them until then. 33 | {coeffs: new Map([ // [rule name, coefficient] 34 | ['big', 50.4946], 35 | ['nearlyOpaque', 48.6396], 36 | ['monochrome', 42.8406], 37 | ['classOrId', 0.5005], 38 | ['visible', 55.8750]]), 39 | // Bias is -139.3106 for this example, though that isn't needed until 40 | // production. 41 | 42 | // The content-area size to use while training. Defaults to 1024x768. 43 | viewportSize: {width: 1024, height: 768}, 44 | 45 | // The type of node to extract features from when using the Vectorizer. 46 | // Defaults to the trainee ID. 47 | // 48 | // vectorType: 'overlay', 49 | 50 | rulesetMaker: 51 | function () { 52 | /** 53 | * Return whether the passed-in div is the size of the whole viewport/document 54 | * or nearly so. 55 | */ 56 | function big(fnode) { 57 | // Compare the size of the fnode to the size of the viewport. 
/**
 * Return whether the fnode is almost but not entirely opaque.
 *
 * The element's total opacity is its computed ``opacity`` multiplied by the
 * alpha of its computed background color (taken as 1 when the parsed color
 * has no alpha component). Exactly 1 scores 0; anything else is passed
 * through linearScale() with the bounds .4 and .6.
 */
function nearlyOpaque(fnode) {
    const computedStyle = getComputedStyle(fnode.element);
    const elementOpacity = parseFloat(computedStyle.getPropertyValue('opacity'));
    const parsedAlpha = rgbaFromString(computedStyle.getPropertyValue('background-color'))[3];
    const backgroundAlpha = parsedAlpha === undefined ? 1 : parsedAlpha;
    const totalOpacity = elementOpacity * backgroundAlpha;

    // Fully opaque is not what we're looking for. (The exact comparison
    // seems to work even though a float.)
    if (totalOpacity === 1) {
        return 0;
    }
    return linearScale(totalOpacity, .4, .6);
}
/**
 * Score how strongly the element's ``class`` and ``id`` attributes suggest
 * an overlay.
 *
 * For each of the two attributes, count how many of the words "popup",
 * "modal", "overlay", "underlay" and "backdrop" occur in it as a substring
 * (each word at most once per attribute). The total n is mapped to
 * 1 - .3 ** (n + .1685).
 */
function suspiciousClassOrId(fnode) {
    const suspiciousSubstrings = ['popup', 'modal', 'overlay', 'underlay', 'backdrop'];
    const attributeNames = ['class', 'id'];

    let numOccurrences = 0;
    for (const name of attributeNames) {
        // getAttribute() gives us one string (or null if the attribute is
        // absent), never an array, so the whole value is searched at once.
        const value = fnode.element.getAttribute(name);
        if (value) {
            numOccurrences += suspiciousSubstrings.filter(substring => value.includes(substring)).length;
        }
    }

    // 1 occurrence gets us to about 75% certainty; 2, 92%. With no
    // occurrences the result is about .18, and it approaches 1 as the
    // count grows.
    // TODO: Figure out how to derive the magic number .1685 from
    // 0 and 1.
    return 1 - .3 ** (numOccurrences + .1685);
}
      tags as candidate overlays: 125 | rule(dom('div'), type('overlay')), 126 | 127 | // Contribute the "bigness" of the node to its overlay score: 128 | rule(type('overlay'), score(big), {name: 'big'}), 129 | 130 | // Contribute the opacity of the node to its overlay score: 131 | rule(type('overlay'), score(nearlyOpaque), {name: 'nearlyOpaque'}), 132 | 133 | // Contribute some other signals as well: 134 | rule(type('overlay'), score(monochrome), {name: 'monochrome'}), 135 | rule(type('overlay'), score(suspiciousClassOrId), {name: 'classOrId'}), 136 | rule(type('overlay'), score(isVisible), {name: 'visible'}), 137 | 138 | // Offer the max-scoring overlay-typed node under the output key 139 | // "overlay". The score on that node will represent the probability, 140 | // informed by a corpus of training samples, that the node is, indeed, 141 | // a pop-up overlay. 142 | rule(type('overlay').max(), out('overlay')) 143 | ]); 144 | return rules; 145 | } 146 | 147 | // isTarget is an optional function which returns whether the Vectorizer 148 | // should consider a fnode a target. The default is to consider it a 149 | // target iff its ``data-fathom`` attribute === the trainee ID. 150 | // 151 | // isTarget: fnode => fnode.element.dataset.fathom === 'foo' 152 | } 153 | ); 154 | 155 | export default trainees; 156 | -------------------------------------------------------------------------------- /smoo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla/fathom/2b2c84eace185b4cc6fa4f75d00d028728a30f8a/smoo --------------------------------------------------------------------------------